Commit 9e8682f
Duplicate from aadnk/whisper-webui
Co-authored-by: Kristian Stangeland <aadnk@users.noreply.huggingface.co>
- .gitattributes +31 -0
- .gitignore +5 -0
- README.md +136 -0
- app-local.py +3 -0
- app-network.py +3 -0
- app-shared.py +3 -0
- app.py +339 -0
- cli.py +121 -0
- dockerfile +20 -0
- docs/options.md +78 -0
- requirements.txt +6 -0
- src/__init__.py +0 -0
- src/download.py +72 -0
- src/modelCache.py +17 -0
- src/segments.py +55 -0
- src/utils.py +115 -0
- src/vad.py +527 -0
- src/vadParallel.py +251 -0
- src/whisperContainer.py +106 -0
- tests/segments_test.py +48 -0
- tests/vad_test.py +66 -0
.gitattributes
ADDED
@@ -0,0 +1,31 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,5 @@
# Byte-compiled / optimized / DLL files
__pycache__/
flagged/
*.py[cod]
*$py.class
README.md
ADDED
@@ -0,0 +1,136 @@
---
title: Whisper Webui
emoji: ⚡
colorFrom: pink
colorTo: purple
sdk: gradio
sdk_version: 3.3.1
app_file: app.py
pinned: false
license: apache-2.0
duplicated_from: aadnk/whisper-webui
---

Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

# Running Locally

To run this program locally, first install Python 3.9+ and Git. Then install PyTorch 1.10.1+ and all the other dependencies:
```
pip install -r requirements.txt
```

Finally, run the full version (no audio length restrictions) of the app with parallel CPU/GPU support enabled:
```
python app.py --input_audio_max_duration -1 --server_name 127.0.0.1 --auto_parallel True
```

You can also run the CLI interface, which is similar to Whisper's own CLI but also supports the following additional arguments:
```
python cli.py \
[--vad {none,silero-vad,silero-vad-skip-gaps,silero-vad-expand-into-gaps,periodic-vad}] \
[--vad_merge_window VAD_MERGE_WINDOW] \
[--vad_max_merge_size VAD_MAX_MERGE_SIZE] \
[--vad_padding VAD_PADDING] \
[--vad_prompt_window VAD_PROMPT_WINDOW] \
[--vad_cpu_cores NUMBER_OF_CORES] \
[--vad_parallel_devices COMMA_DELIMITED_DEVICES] \
[--auto_parallel BOOLEAN]
```
You may also use URLs instead of file paths as input:
```
python cli.py --model large --vad silero-vad --language Japanese "https://www.youtube.com/watch?v=4cICErqqRSM"
```

## Parallel Execution

You can run both the Web UI and the CLI on multiple GPUs in parallel, using the `vad_parallel_devices` option. This takes a comma-delimited list of
device IDs (0, 1, etc.) that Whisper should be distributed to and run on concurrently:
```
python cli.py --model large --vad silero-vad --language Japanese \
--vad_parallel_devices 0,1 "https://www.youtube.com/watch?v=4cICErqqRSM"
```

Note that this requires a VAD to function properly; otherwise only the first GPU will be used. You can use `periodic-vad` to avoid the overhead
of running Silero VAD, at a slight cost to accuracy.

This is achieved by creating N child processes (where N is the number of selected devices), in which Whisper runs concurrently. In `app.py`, you can also
set the `vad_process_timeout` option. This configures the number of seconds until a process is killed due to inactivity, freeing RAM and video memory.
The default value is 30 minutes.

```
python app.py --input_audio_max_duration -1 --vad_parallel_devices 0,1 --vad_process_timeout 3600
```

To execute the Silero VAD itself in parallel, use the `vad_cpu_cores` option:
```
python app.py --input_audio_max_duration -1 --vad_parallel_devices 0,1 --vad_process_timeout 3600 --vad_cpu_cores 4
```

You may also use `vad_process_timeout` with a single device (`--vad_parallel_devices 0`), if you prefer to always free video memory after a period of inactivity.

### Auto Parallel

You can also set `auto_parallel` to `True`. This will set `vad_parallel_devices` to use all the GPU devices on the system, and `vad_cpu_cores` to be equal to the number of
cores (up to 8):
```
python app.py --input_audio_max_duration -1 --auto_parallel True
```

# Docker

To run it in Docker, first install Docker and, optionally, the NVIDIA Container Toolkit in order to use the GPU.
Then either use the GitLab-hosted container below, or check out this repository and build an image:
```
sudo docker build -t whisper-webui:1 .
```

You can then start the WebUI with GPU support like so:
```
sudo docker run -d --gpus=all -p 7860:7860 whisper-webui:1
```

Leave out "--gpus=all" if you don't have access to a GPU with enough memory, and are fine with running it on the CPU only:
```
sudo docker run -d -p 7860:7860 whisper-webui:1
```

# GitLab Docker Registry

This Docker container is also hosted on GitLab:

```
sudo docker run -d --gpus=all -p 7860:7860 registry.gitlab.com/aadnk/whisper-webui:latest
```

## Custom Arguments

You can also pass custom arguments to `app.py` in the Docker container, for instance to use all the GPUs in parallel:
```
sudo docker run -d --gpus all -p 7860:7860 \
--mount type=bind,source=/home/administrator/.cache/whisper,target=/root/.cache/whisper \
--restart=on-failure:15 registry.gitlab.com/aadnk/whisper-webui:latest \
app.py --input_audio_max_duration -1 --server_name 0.0.0.0 --vad_parallel_devices 0,1 \
--default_vad silero-vad --default_model_name large
```

You can also call `cli.py` the same way:
```
sudo docker run --gpus all \
--mount type=bind,source=/home/administrator/.cache/whisper,target=/root/.cache/whisper \
--mount type=bind,source=${PWD},target=/app/data \
registry.gitlab.com/aadnk/whisper-webui:latest \
cli.py --model large --vad_parallel_devices 0,1 --vad silero-vad \
--output_dir /app/data /app/data/YOUR-FILE-HERE.mp4
```

## Caching

Note that the models themselves are currently not included in the Docker images, and will be downloaded on demand.
To avoid this, bind the directory /root/.cache/whisper to some directory on the host (for instance /home/administrator/.cache/whisper), where you can (optionally)
prepopulate the directory with the different Whisper models.
```
sudo docker run -d --gpus=all -p 7860:7860 \
--mount type=bind,source=/home/administrator/.cache/whisper,target=/root/.cache/whisper \
registry.gitlab.com/aadnk/whisper-webui:latest
```
app-local.py
ADDED
@@ -0,0 +1,3 @@
# Run the app with no audio file restrictions
from app import create_ui
create_ui(-1)
app-network.py
ADDED
@@ -0,0 +1,3 @@
# Run the app with no audio file restrictions, and make it available on the network
from app import create_ui
create_ui(-1, server_name="0.0.0.0")
app-shared.py
ADDED
@@ -0,0 +1,3 @@
# Run the app with no audio file restrictions, and create a public share link
from app import create_ui
create_ui(-1, share=True)
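
The three wrapper scripts above only differ in the arguments they pass to `create_ui`. As a hedged sketch (the values below are examples, not project defaults), the same entry point also accepts the model, VAD and parallel-processing defaults defined in `app.py` below:

```python
# Sketch: create_ui also accepts default model/VAD and parallel options
# (see the create_ui signature in app.py below). The values here are examples.
from app import create_ui

create_ui(-1, server_name="0.0.0.0", server_port=7860,
          default_model_name="large", default_vad="silero-vad",
          vad_parallel_devices="0,1", vad_process_timeout=3600)
```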
app.py
ADDED
@@ -0,0 +1,339 @@
import math
from typing import Iterator
import argparse

from io import StringIO
import os
import pathlib
import tempfile

import torch
from src.modelCache import ModelCache
from src.vadParallel import ParallelContext, ParallelTranscription

# External programs
import ffmpeg

# UI
import gradio as gr

from src.download import ExceededMaximumDuration, download_url
from src.utils import slugify, write_srt, write_vtt
from src.vad import AbstractTranscription, NonSpeechStrategy, PeriodicTranscriptionConfig, TranscriptionConfig, VadPeriodicTranscription, VadSileroTranscription
from src.whisperContainer import WhisperContainer

# Limitations (set to -1 to disable)
DEFAULT_INPUT_AUDIO_MAX_DURATION = 600 # seconds

# Whether or not to automatically delete all uploaded files, to save disk space
DELETE_UPLOADED_FILES = True

# Gradio seems to truncate files without keeping the extension, so we need to truncate the file prefix ourselves
MAX_FILE_PREFIX_LENGTH = 17

# Limit auto_parallel to a certain number of CPUs (specify vad_cpu_cores to get a higher number)
MAX_AUTO_CPU_CORES = 8

LANGUAGES = [
    "English", "Chinese", "German", "Spanish", "Russian", "Korean",
    "French", "Japanese", "Portuguese", "Turkish", "Polish", "Catalan",
    "Dutch", "Arabic", "Swedish", "Italian", "Indonesian", "Hindi",
    "Finnish", "Vietnamese", "Hebrew", "Ukrainian", "Greek", "Malay",
    "Czech", "Romanian", "Danish", "Hungarian", "Tamil", "Norwegian",
    "Thai", "Urdu", "Croatian", "Bulgarian", "Lithuanian", "Latin",
    "Maori", "Malayalam", "Welsh", "Slovak", "Telugu", "Persian",
    "Latvian", "Bengali", "Serbian", "Azerbaijani", "Slovenian",
    "Kannada", "Estonian", "Macedonian", "Breton", "Basque", "Icelandic",
    "Armenian", "Nepali", "Mongolian", "Bosnian", "Kazakh", "Albanian",
    "Swahili", "Galician", "Marathi", "Punjabi", "Sinhala", "Khmer",
    "Shona", "Yoruba", "Somali", "Afrikaans", "Occitan", "Georgian",
    "Belarusian", "Tajik", "Sindhi", "Gujarati", "Amharic", "Yiddish",
    "Lao", "Uzbek", "Faroese", "Haitian Creole", "Pashto", "Turkmen",
    "Nynorsk", "Maltese", "Sanskrit", "Luxembourgish", "Myanmar", "Tibetan",
    "Tagalog", "Malagasy", "Assamese", "Tatar", "Hawaiian", "Lingala",
    "Hausa", "Bashkir", "Javanese", "Sundanese"
]

class WhisperTranscriber:
    def __init__(self, input_audio_max_duration: float = DEFAULT_INPUT_AUDIO_MAX_DURATION, vad_process_timeout: float = None, vad_cpu_cores: int = 1, delete_uploaded_files: bool = DELETE_UPLOADED_FILES):
        self.model_cache = ModelCache()
        self.parallel_device_list = None
        self.gpu_parallel_context = None
        self.cpu_parallel_context = None
        self.vad_process_timeout = vad_process_timeout
        self.vad_cpu_cores = vad_cpu_cores

        self.vad_model = None
        self.inputAudioMaxDuration = input_audio_max_duration
        self.deleteUploadedFiles = delete_uploaded_files

    def set_parallel_devices(self, vad_parallel_devices: str):
        self.parallel_device_list = [ device.strip() for device in vad_parallel_devices.split(",") ] if vad_parallel_devices else None

    def set_auto_parallel(self, auto_parallel: bool):
        if auto_parallel:
            if torch.cuda.is_available():
                self.parallel_device_list = [ str(gpu_id) for gpu_id in range(torch.cuda.device_count())]

            self.vad_cpu_cores = min(os.cpu_count(), MAX_AUTO_CPU_CORES)
            print("[Auto parallel] Using GPU devices " + str(self.parallel_device_list) + " and " + str(self.vad_cpu_cores) + " CPU cores for VAD/transcription.")

    def transcribe_webui(self, modelName, languageName, urlData, uploadFile, microphoneData, task, vad, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow):
        try:
            source, sourceName = self.__get_source(urlData, uploadFile, microphoneData)

            try:
                selectedLanguage = languageName.lower() if len(languageName) > 0 else None
                selectedModel = modelName if modelName is not None else "base"

                model = WhisperContainer(model_name=selectedModel, cache=self.model_cache)

                # Execute whisper
                result = self.transcribe_file(model, source, selectedLanguage, task, vad, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow)

                # Write result
                downloadDirectory = tempfile.mkdtemp()

                filePrefix = slugify(sourceName, allow_unicode=True)
                download, text, vtt = self.write_result(result, filePrefix, downloadDirectory)

                return download, text, vtt

            finally:
                # Cleanup source
                if self.deleteUploadedFiles:
                    print("Deleting source file " + source)
                    os.remove(source)

        except ExceededMaximumDuration as e:
            return [], ("[ERROR]: Maximum remote video length is " + str(e.maxDuration) + "s, file was " + str(e.videoDuration) + "s"), "[ERROR]"

    def transcribe_file(self, model: WhisperContainer, audio_path: str, language: str, task: str = None, vad: str = None,
                        vadMergeWindow: float = 5, vadMaxMergeSize: float = 150, vadPadding: float = 1, vadPromptWindow: float = 1, **decodeOptions: dict):

        initial_prompt = decodeOptions.pop('initial_prompt', None)

        if ('task' in decodeOptions):
            task = decodeOptions.pop('task')

        # Callable for processing an audio file
        whisperCallable = model.create_callback(language, task, initial_prompt, **decodeOptions)

        # The results
        if (vad == 'silero-vad'):
            # Silero VAD where non-speech gaps are transcribed
            process_gaps = self._create_silero_config(NonSpeechStrategy.CREATE_SEGMENT, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow)
            result = self.process_vad(audio_path, whisperCallable, self.vad_model, process_gaps)
        elif (vad == 'silero-vad-skip-gaps'):
            # Silero VAD where non-speech gaps are simply ignored
            skip_gaps = self._create_silero_config(NonSpeechStrategy.SKIP, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow)
            result = self.process_vad(audio_path, whisperCallable, self.vad_model, skip_gaps)
        elif (vad == 'silero-vad-expand-into-gaps'):
            # Use Silero VAD where speech-segments are expanded into non-speech gaps
            expand_gaps = self._create_silero_config(NonSpeechStrategy.EXPAND_SEGMENT, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow)
            result = self.process_vad(audio_path, whisperCallable, self.vad_model, expand_gaps)
        elif (vad == 'periodic-vad'):
            # Very simple VAD - mark every 5 minutes as speech. This makes it less likely that Whisper enters an infinite loop, but
            # it may create a break in the middle of a sentence, causing some artifacts.
            periodic_vad = VadPeriodicTranscription()
            period_config = PeriodicTranscriptionConfig(periodic_duration=vadMaxMergeSize, max_prompt_window=vadPromptWindow)
            result = self.process_vad(audio_path, whisperCallable, periodic_vad, period_config)

        else:
            if (self._has_parallel_devices()):
                # Use a simple period transcription instead, as we need to use the parallel context
                periodic_vad = VadPeriodicTranscription()
                period_config = PeriodicTranscriptionConfig(periodic_duration=math.inf, max_prompt_window=1)

                result = self.process_vad(audio_path, whisperCallable, periodic_vad, period_config)
            else:
                # Default VAD
                result = whisperCallable(audio_path, 0, None, None)

        return result

    def process_vad(self, audio_path, whisperCallable, vadModel: AbstractTranscription, vadConfig: TranscriptionConfig):
        if (not self._has_parallel_devices()):
            # No parallel devices, so just run the VAD and Whisper in sequence
            return vadModel.transcribe(audio_path, whisperCallable, vadConfig)

        gpu_devices = self.parallel_device_list

        if (gpu_devices is None or len(gpu_devices) == 0):
            # No GPU devices specified, pass the current environment variable to the first GPU process. This may be NULL.
            gpu_devices = [os.environ.get("CUDA_VISIBLE_DEVICES", None)]

        # Create parallel context if needed
        if (self.gpu_parallel_context is None):
            # Create a context with processes and automatically clear the pool after 1 hour of inactivity
            self.gpu_parallel_context = ParallelContext(num_processes=len(gpu_devices), auto_cleanup_timeout_seconds=self.vad_process_timeout)
        # We also need a CPU context for the VAD
        if (self.cpu_parallel_context is None):
            self.cpu_parallel_context = ParallelContext(num_processes=self.vad_cpu_cores, auto_cleanup_timeout_seconds=self.vad_process_timeout)

        parallel_vad = ParallelTranscription()
        return parallel_vad.transcribe_parallel(transcription=vadModel, audio=audio_path, whisperCallable=whisperCallable,
                                                config=vadConfig, cpu_device_count=self.vad_cpu_cores, gpu_devices=gpu_devices,
                                                cpu_parallel_context=self.cpu_parallel_context, gpu_parallel_context=self.gpu_parallel_context)

    def _has_parallel_devices(self):
        return (self.parallel_device_list is not None and len(self.parallel_device_list) > 0) or self.vad_cpu_cores > 1

    def _concat_prompt(self, prompt1, prompt2):
        if (prompt1 is None):
            return prompt2
        elif (prompt2 is None):
            return prompt1
        else:
            return prompt1 + " " + prompt2

    def _create_silero_config(self, non_speech_strategy: NonSpeechStrategy, vadMergeWindow: float = 5, vadMaxMergeSize: float = 150, vadPadding: float = 1, vadPromptWindow: float = 1):
        # Use Silero VAD
        if (self.vad_model is None):
            self.vad_model = VadSileroTranscription()

        config = TranscriptionConfig(non_speech_strategy = non_speech_strategy,
                max_silent_period=vadMergeWindow, max_merge_size=vadMaxMergeSize,
                segment_padding_left=vadPadding, segment_padding_right=vadPadding,
                max_prompt_window=vadPromptWindow)

        return config

    def write_result(self, result: dict, source_name: str, output_dir: str):
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        text = result["text"]
        language = result["language"]
        languageMaxLineWidth = self.__get_max_line_width(language)

        print("Max line width " + str(languageMaxLineWidth))
        vtt = self.__get_subs(result["segments"], "vtt", languageMaxLineWidth)
        srt = self.__get_subs(result["segments"], "srt", languageMaxLineWidth)

        output_files = []
        output_files.append(self.__create_file(srt, output_dir, source_name + "-subs.srt"));
        output_files.append(self.__create_file(vtt, output_dir, source_name + "-subs.vtt"));
        output_files.append(self.__create_file(text, output_dir, source_name + "-transcript.txt"));

        return output_files, text, vtt

    def clear_cache(self):
        self.model_cache.clear()
        self.vad_model = None

    def __get_source(self, urlData, uploadFile, microphoneData):
        if urlData:
            # Download from YouTube
            source = download_url(urlData, self.inputAudioMaxDuration)[0]
        else:
            # File input
            source = uploadFile if uploadFile is not None else microphoneData

            if self.inputAudioMaxDuration > 0:
                # Calculate audio length
                audioDuration = ffmpeg.probe(source)["format"]["duration"]

                if float(audioDuration) > self.inputAudioMaxDuration:
                    raise ExceededMaximumDuration(videoDuration=audioDuration, maxDuration=self.inputAudioMaxDuration, message="Video is too long")

        file_path = pathlib.Path(source)
        sourceName = file_path.stem[:MAX_FILE_PREFIX_LENGTH] + file_path.suffix

        return source, sourceName

    def __get_max_line_width(self, language: str) -> int:
        if (language and language.lower() in ["japanese", "ja", "chinese", "zh"]):
            # Chinese characters and kana are wider, so limit line length to 40 characters
            return 40
        else:
            # TODO: Add more languages
            # 80 latin characters should fit on a 1080p/720p screen
            return 80

    def __get_subs(self, segments: Iterator[dict], format: str, maxLineWidth: int) -> str:
        segmentStream = StringIO()

        if format == 'vtt':
            write_vtt(segments, file=segmentStream, maxLineWidth=maxLineWidth)
        elif format == 'srt':
            write_srt(segments, file=segmentStream, maxLineWidth=maxLineWidth)
        else:
            raise Exception("Unknown format " + format)

        segmentStream.seek(0)
        return segmentStream.read()

    def __create_file(self, text: str, directory: str, fileName: str) -> str:
        # Write the text to a file
        with open(os.path.join(directory, fileName), 'w+', encoding="utf-8") as file:
            file.write(text)

        return file.name

    def close(self):
        self.clear_cache()

        if (self.gpu_parallel_context is not None):
            self.gpu_parallel_context.close()
        if (self.cpu_parallel_context is not None):
            self.cpu_parallel_context.close()


def create_ui(input_audio_max_duration, share=False, server_name: str = None, server_port: int = 7860,
              default_model_name: str = "medium", default_vad: str = None, vad_parallel_devices: str = None, vad_process_timeout: float = None, vad_cpu_cores: int = 1, auto_parallel: bool = False):
    ui = WhisperTranscriber(input_audio_max_duration, vad_process_timeout, vad_cpu_cores)

    # Specify a list of devices to use for parallel processing
    ui.set_parallel_devices(vad_parallel_devices)
    ui.set_auto_parallel(auto_parallel)

    ui_description = "Whisper is a general-purpose speech recognition model. It is trained on a large dataset of diverse "
    ui_description += " audio and is also a multi-task model that can perform multilingual speech recognition "
    ui_description += " as well as speech translation and language identification. "

    ui_description += "\n\n\n\nFor longer audio files (>10 minutes) not in English, it is recommended that you select Silero VAD (Voice Activity Detector) in the VAD option."

    if input_audio_max_duration > 0:
        ui_description += "\n\n" + "Max audio file length: " + str(input_audio_max_duration) + " s"

    ui_article = "Read the [documentation here](https://huggingface.co/spaces/aadnk/whisper-webui/blob/main/docs/options.md)"

    demo = gr.Interface(fn=ui.transcribe_webui, description=ui_description, article=ui_article, inputs=[
        gr.Dropdown(choices=["tiny", "base", "small", "medium", "large"], value=default_model_name, label="Model"),
        gr.Dropdown(choices=sorted(LANGUAGES), label="Language"),
        gr.Text(label="URL (YouTube, etc.)"),
        gr.Audio(source="upload", type="filepath", label="Upload Audio"),
        gr.Audio(source="microphone", type="filepath", label="Microphone Input"),
        gr.Dropdown(choices=["transcribe", "translate"], label="Task"),
        gr.Dropdown(choices=["none", "silero-vad", "silero-vad-skip-gaps", "silero-vad-expand-into-gaps", "periodic-vad"], value=default_vad, label="VAD"),
        gr.Number(label="VAD - Merge Window (s)", precision=0, value=5),
        gr.Number(label="VAD - Max Merge Size (s)", precision=0, value=30),
        gr.Number(label="VAD - Padding (s)", precision=None, value=1),
        gr.Number(label="VAD - Prompt Window (s)", precision=None, value=3)
    ], outputs=[
        gr.File(label="Download"),
        gr.Text(label="Transcription"),
        gr.Text(label="Segments")
    ])

    demo.launch(share=share, server_name=server_name, server_port=server_port)

    # Clean up
    ui.close()

if __name__ == '__main__':
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--input_audio_max_duration", type=int, default=DEFAULT_INPUT_AUDIO_MAX_DURATION, help="Maximum audio file length in seconds, or -1 for no limit.")
    parser.add_argument("--share", type=bool, default=False, help="True to share the app on HuggingFace.")
    parser.add_argument("--server_name", type=str, default=None, help="The host or IP to bind to. If None, bind to localhost.")
    parser.add_argument("--server_port", type=int, default=7860, help="The port to bind to.")
    parser.add_argument("--default_model_name", type=str, default="medium", help="The default model name.")
    parser.add_argument("--default_vad", type=str, default="silero-vad", help="The default VAD.")
    parser.add_argument("--vad_parallel_devices", type=str, default="", help="A comma delimited list of CUDA devices to use for parallel processing. If None, disable parallel processing.")
    parser.add_argument("--vad_cpu_cores", type=int, default=1, help="The number of CPU cores to use for VAD pre-processing.")
    parser.add_argument("--vad_process_timeout", type=float, default="1800", help="The number of seconds before inactive processes are terminated. Use 0 to close processes immediately, or None for no timeout.")
    parser.add_argument("--auto_parallel", type=bool, default=False, help="True to use all available GPUs and CPU cores for processing. Use vad_cpu_cores/vad_parallel_devices to specify the number of CPU cores/GPUs to use.")

    args = parser.parse_args().__dict__
    create_ui(**args)
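
For reference, the classes above can also be used programmatically without launching the Gradio UI. The following is a minimal, hedged sketch based on the code in this commit; the audio path, language and output directory are placeholders.

```python
# Minimal sketch: transcribe one file with Silero VAD, without the Gradio UI.
# "audio.mp3" and "output" are placeholder paths.
from app import WhisperTranscriber
from src.whisperContainer import WhisperContainer

transcriber = WhisperTranscriber(input_audio_max_duration=-1, delete_uploaded_files=False)
model = WhisperContainer(model_name="medium", cache=transcriber.model_cache)

# transcribe_file runs Whisper on the speech sections detected by Silero VAD
result = transcriber.transcribe_file(model, "audio.mp3", language="japanese",
                                     task="transcribe", vad="silero-vad")

# Writes audio-subs.srt, audio-subs.vtt and audio-transcript.txt to ./output
transcriber.write_result(result, "audio", "output")
transcriber.close()
```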
cli.py
ADDED
@@ -0,0 +1,121 @@
import argparse
import os
import pathlib
from urllib.parse import urlparse
import warnings
import numpy as np

import whisper

import torch
from app import LANGUAGES, WhisperTranscriber
from src.download import download_url

from src.utils import optional_float, optional_int, str2bool
from src.whisperContainer import WhisperContainer


def cli():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("audio", nargs="+", type=str, help="audio file(s) to transcribe")
    parser.add_argument("--model", default="small", choices=["tiny", "base", "small", "medium", "large"], help="name of the Whisper model to use")
    parser.add_argument("--model_dir", type=str, default=None, help="the path to save model files; uses ~/.cache/whisper by default")
    parser.add_argument("--device", default="cuda" if torch.cuda.is_available() else "cpu", help="device to use for PyTorch inference")
    parser.add_argument("--output_dir", "-o", type=str, default=".", help="directory to save the outputs")
    parser.add_argument("--verbose", type=str2bool, default=True, help="whether to print out the progress and debug messages")

    parser.add_argument("--task", type=str, default="transcribe", choices=["transcribe", "translate"], help="whether to perform X->X speech recognition ('transcribe') or X->English translation ('translate')")
    parser.add_argument("--language", type=str, default=None, choices=sorted(LANGUAGES), help="language spoken in the audio, specify None to perform language detection")

    parser.add_argument("--vad", type=str, default="none", choices=["none", "silero-vad", "silero-vad-skip-gaps", "silero-vad-expand-into-gaps", "periodic-vad"], help="The voice activity detection algorithm to use")
    parser.add_argument("--vad_merge_window", type=optional_float, default=5, help="The window size (in seconds) to merge voice segments")
    parser.add_argument("--vad_max_merge_size", type=optional_float, default=30, help="The maximum size (in seconds) of a voice segment")
    parser.add_argument("--vad_padding", type=optional_float, default=1, help="The padding (in seconds) to add to each voice segment")
    parser.add_argument("--vad_prompt_window", type=optional_float, default=3, help="The window size of the prompt to pass to Whisper")
    parser.add_argument("--vad_cpu_cores", type=int, default=1, help="The number of CPU cores to use for VAD pre-processing.")
    parser.add_argument("--vad_parallel_devices", type=str, default="", help="A comma delimited list of CUDA devices to use for parallel processing. If None, disable parallel processing.")
    parser.add_argument("--auto_parallel", type=bool, default=False, help="True to use all available GPUs and CPU cores for processing. Use vad_cpu_cores/vad_parallel_devices to specify the number of CPU cores/GPUs to use.")

    parser.add_argument("--temperature", type=float, default=0, help="temperature to use for sampling")
    parser.add_argument("--best_of", type=optional_int, default=5, help="number of candidates when sampling with non-zero temperature")
    parser.add_argument("--beam_size", type=optional_int, default=5, help="number of beams in beam search, only applicable when temperature is zero")
    parser.add_argument("--patience", type=float, default=None, help="optional patience value to use in beam decoding, as in https://arxiv.org/abs/2204.05424, the default (1.0) is equivalent to conventional beam search")
    parser.add_argument("--length_penalty", type=float, default=None, help="optional token length penalty coefficient (alpha) as in https://arxiv.org/abs/1609.08144, uses simple length normalization by default")

    parser.add_argument("--suppress_tokens", type=str, default="-1", help="comma-separated list of token ids to suppress during sampling; '-1' will suppress most special characters except common punctuations")
    parser.add_argument("--initial_prompt", type=str, default=None, help="optional text to provide as a prompt for the first window.")
    parser.add_argument("--condition_on_previous_text", type=str2bool, default=True, help="if True, provide the previous output of the model as a prompt for the next window; disabling may make the text inconsistent across windows, but the model becomes less prone to getting stuck in a failure loop")
    parser.add_argument("--fp16", type=str2bool, default=True, help="whether to perform inference in fp16; True by default")

    parser.add_argument("--temperature_increment_on_fallback", type=optional_float, default=0.2, help="temperature to increase when falling back when the decoding fails to meet either of the thresholds below")
    parser.add_argument("--compression_ratio_threshold", type=optional_float, default=2.4, help="if the gzip compression ratio is higher than this value, treat the decoding as failed")
    parser.add_argument("--logprob_threshold", type=optional_float, default=-1.0, help="if the average log probability is lower than this value, treat the decoding as failed")
    parser.add_argument("--no_speech_threshold", type=optional_float, default=0.6, help="if the probability of the <|nospeech|> token is higher than this value AND the decoding has failed due to `logprob_threshold`, consider the segment as silence")

    args = parser.parse_args().__dict__
    model_name: str = args.pop("model")
    model_dir: str = args.pop("model_dir")
    output_dir: str = args.pop("output_dir")
    device: str = args.pop("device")
    os.makedirs(output_dir, exist_ok=True)

    if model_name.endswith(".en") and args["language"] not in {"en", "English"}:
        warnings.warn(f"{model_name} is an English-only model but received '{args['language']}'; using English instead.")
        args["language"] = "en"

    temperature = args.pop("temperature")
    temperature_increment_on_fallback = args.pop("temperature_increment_on_fallback")
    if temperature_increment_on_fallback is not None:
        temperature = tuple(np.arange(temperature, 1.0 + 1e-6, temperature_increment_on_fallback))
    else:
        temperature = [temperature]

    vad = args.pop("vad")
    vad_merge_window = args.pop("vad_merge_window")
    vad_max_merge_size = args.pop("vad_max_merge_size")
    vad_padding = args.pop("vad_padding")
    vad_prompt_window = args.pop("vad_prompt_window")
    vad_cpu_cores = args.pop("vad_cpu_cores")
    auto_parallel = args.pop("auto_parallel")

    model = WhisperContainer(model_name, device=device, download_root=model_dir)
    transcriber = WhisperTranscriber(delete_uploaded_files=False, vad_cpu_cores=vad_cpu_cores)
    transcriber.set_parallel_devices(args.pop("vad_parallel_devices"))
    transcriber.set_auto_parallel(auto_parallel)

    if (transcriber._has_parallel_devices()):
        print("Using parallel devices:", transcriber.parallel_device_list)

    for audio_path in args.pop("audio"):
        sources = []

        # Detect URL and download the audio
        if (uri_validator(audio_path)):
            # Download from YouTube/URL directly
            for source_path in download_url(audio_path, maxDuration=-1, destinationDirectory=output_dir, playlistItems=None):
                source_name = os.path.basename(source_path)
                sources.append({ "path": source_path, "name": source_name })
        else:
            sources.append({ "path": audio_path, "name": os.path.basename(audio_path) })

        for source in sources:
            source_path = source["path"]
            source_name = source["name"]

            result = transcriber.transcribe_file(model, source_path, temperature=temperature,
                                                 vad=vad, vadMergeWindow=vad_merge_window, vadMaxMergeSize=vad_max_merge_size,
                                                 vadPadding=vad_padding, vadPromptWindow=vad_prompt_window, **args)

            transcriber.write_result(result, source_name, output_dir)

    transcriber.close()

def uri_validator(x):
    try:
        result = urlparse(x)
        return all([result.scheme, result.netloc])
    except:
        return False

if __name__ == '__main__':
    cli()
dockerfile
ADDED
@@ -0,0 +1,20 @@
FROM huggingface/transformers-pytorch-gpu
EXPOSE 7860

ADD . /opt/whisper-webui/

# Latest version of transformers-pytorch-gpu seems to lack tk.
# Further, pip install fails, so we must upgrade pip first.
RUN apt-get -y install python3-tk
RUN python3 -m pip install --upgrade pip &&\
    python3 -m pip install -r /opt/whisper-webui/requirements.txt

# Note: Models will be downloaded on demand to the directory /root/.cache/whisper.
# You can also bind this directory in the container to somewhere on the host.

# To be able to see logs in real time
ENV PYTHONUNBUFFERED=1

WORKDIR /opt/whisper-webui/
ENTRYPOINT ["python3"]
CMD ["app.py", "--input_audio_max_duration", "-1", "--server_name", "0.0.0.0", "--auto_parallel", "True"]
docs/options.md
ADDED
@@ -0,0 +1,78 @@
# Options
To transcribe or translate an audio file, you can either paste a URL from a website (all [websites](https://github.com/yt-dlp/yt-dlp/blob/master/supportedsites.md)
supported by yt-dlp will work, including YouTube), upload an audio file (choose "All Files (*.*)"
in the file selector to select any file type, including video files), or use the microphone.

For longer audio files (>10 minutes), it is recommended that you select Silero VAD (Voice Activity Detector) in the VAD option.

## Model
Select the model that Whisper will use to transcribe the audio:

| Size   | Parameters | English-only model | Multilingual model | Required VRAM | Relative speed |
|--------|------------|--------------------|--------------------|---------------|----------------|
| tiny   | 39 M       | tiny.en            | tiny               | ~1 GB         | ~32x           |
| base   | 74 M       | base.en            | base               | ~1 GB         | ~16x           |
| small  | 244 M      | small.en           | small              | ~2 GB         | ~6x            |
| medium | 769 M      | medium.en          | medium             | ~5 GB         | ~2x            |
| large  | 1550 M     | N/A                | large              | ~10 GB        | 1x             |

## Language

Select the language, or leave it empty for Whisper to detect it automatically.

Note that if the selected language and the language in the audio differ, Whisper may start to translate the audio to the selected
language. For instance, if the audio is in English but you select Japanese, the model may translate the audio to Japanese.

## Inputs
The options "URL (YouTube, etc.)", "Upload Audio" and "Microphone Input" allow you to send an audio input to the model.

Note that the UI will only process the first valid input - i.e. if you both enter a URL and upload an audio file, it will only process
the URL.

## Task
Select the task - either "transcribe" to transcribe the audio to text, or "translate" to translate it to English.

## VAD
Using a VAD will improve the timing accuracy of each transcribed line, as well as prevent Whisper from getting into an infinite
loop detecting the same sentence over and over again. The downside is that this may come at a cost to text accuracy, especially
with regards to unique words or names that appear in the audio. You can compensate for this by increasing the prompt window.

Note that English is handled very well by Whisper, and it is less susceptible to issues surrounding bad timings and infinite loops.
So you may only need to use a VAD for other languages, such as Japanese, or when the audio is very long.

* none
  * Run Whisper on the entire audio input.
* silero-vad
  * Use Silero VAD to detect sections that contain speech, and run Whisper independently on each section. Whisper is also run
  on the gaps between each speech section, by either expanding the section up to the max merge size, or running Whisper independently
  on the non-speech section.
* silero-vad-expand-into-gaps
  * Use Silero VAD to detect sections that contain speech, and run Whisper independently on each section. Each speech section will be expanded
  such that it covers any adjacent non-speech sections. For instance, if an audio file of one minute contains the speech sections
  00:00 - 00:10 (A) and 00:30 - 00:40 (B), the first section (A) will be expanded to 00:00 - 00:30, and (B) will be expanded to 00:30 - 01:00.
* silero-vad-skip-gaps
  * As above, but sections that don't contain speech according to Silero will be skipped. This will be slightly faster, but
  may cause dialogue to be skipped.
* periodic-vad
  * Create sections of speech every 'VAD - Max Merge Size' seconds. This is very fast and simple, but will potentially break
  a sentence or word in two.

## VAD - Merge Window
If set, any adjacent speech sections that are at most this number of seconds apart will be automatically merged.

## VAD - Max Merge Size (s)
Adjacent speech sections will no longer be merged once the merged section exceeds this number of seconds.

## VAD - Padding (s)
The number of seconds (floating point) to add to the beginning and end of each speech section. Setting this to a number
larger than zero ensures that Whisper is more likely to correctly transcribe a sentence at the beginning of
a speech section. However, this also increases the probability of Whisper assigning the wrong timestamp
to each transcribed line. The default value is 1 second.

## VAD - Prompt Window (s)
The text of a detected line will be included as a prompt to the next speech section, if the speech section starts at most this
number of seconds after the line has finished. For instance, if a line ends at 10:00, and the next speech section starts at
10:04, the line's text will be included if the prompt window is 4 seconds or more (10:04 - 10:00 = 4 seconds).

Note that detected lines in gaps between speech sections will not be included in the prompt
(if silero-vad or silero-vad-expand-into-gaps is used).
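
For readers following the code, the four VAD sliders above map directly onto the `TranscriptionConfig` built in `WhisperTranscriber._create_silero_config` (see `app.py` earlier in this commit). A small sketch using the UI default values:

```python
# How the VAD options map onto the transcription config used in app.py.
# The values are the defaults shown in the UI (5, 30, 1 and 3 seconds).
from src.vad import NonSpeechStrategy, TranscriptionConfig

config = TranscriptionConfig(
    non_speech_strategy=NonSpeechStrategy.CREATE_SEGMENT,  # the "silero-vad" mode
    max_silent_period=5,      # VAD - Merge Window (s)
    max_merge_size=30,        # VAD - Max Merge Size (s)
    segment_padding_left=1,   # VAD - Padding (s)
    segment_padding_right=1,
    max_prompt_window=3,      # VAD - Prompt Window (s)
)
```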
requirements.txt
ADDED
@@ -0,0 +1,6 @@
git+https://github.com/openai/whisper.git
transformers
ffmpeg-python==0.2.0
gradio
yt-dlp
torchaudio
src/__init__.py
ADDED
File without changes
src/download.py
ADDED
@@ -0,0 +1,72 @@
from tempfile import mkdtemp
from typing import List
from yt_dlp import YoutubeDL

import yt_dlp
from yt_dlp.postprocessor import PostProcessor

class FilenameCollectorPP(PostProcessor):
    def __init__(self):
        super(FilenameCollectorPP, self).__init__(None)
        self.filenames = []

    def run(self, information):
        self.filenames.append(information["filepath"])
        return [], information

def download_url(url: str, maxDuration: int = None, destinationDirectory: str = None, playlistItems: str = "1") -> List[str]:
    try:
        return _perform_download(url, maxDuration=maxDuration, outputTemplate=None, destinationDirectory=destinationDirectory, playlistItems=playlistItems)
    except yt_dlp.utils.DownloadError as e:
        # In case of an OS error, try again with a different output template
        if e.msg and e.msg.find("[Errno 36] File name too long") >= 0:
            return _perform_download(url, maxDuration=maxDuration, outputTemplate="%(title).10s %(id)s.%(ext)s")
        pass

def _perform_download(url: str, maxDuration: int = None, outputTemplate: str = None, destinationDirectory: str = None, playlistItems: str = "1"):
    # Create a temporary directory to store the downloaded files
    if destinationDirectory is None:
        destinationDirectory = mkdtemp()

    ydl_opts = {
        "format": "bestaudio/best",
        'paths': {
            'home': destinationDirectory
        }
    }
    if (playlistItems):
        ydl_opts['playlist_items'] = playlistItems

    # Add output template if specified
    if outputTemplate:
        ydl_opts['outtmpl'] = outputTemplate

    filename_collector = FilenameCollectorPP()

    with YoutubeDL(ydl_opts) as ydl:
        if maxDuration and maxDuration > 0:
            info = ydl.extract_info(url, download=False)
            duration = info['duration']

            if duration >= maxDuration:
                raise ExceededMaximumDuration(videoDuration=duration, maxDuration=maxDuration, message="Video is too long")

        ydl.add_post_processor(filename_collector)
        ydl.download([url])

    if len(filename_collector.filenames) <= 0:
        raise Exception("Cannot download " + url)

    result = []

    for filename in filename_collector.filenames:
        result.append(filename)
        print("Downloaded " + filename)

    return result

class ExceededMaximumDuration(Exception):
    def __init__(self, videoDuration, maxDuration, message):
        self.videoDuration = videoDuration
        self.maxDuration = maxDuration
        super().__init__(message)
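
A minimal usage sketch for `download_url`; the URL is the same example as in the README, and the destination directory and duration limit are placeholders:

```python
# Download the best audio stream of the first playlist item into ./downloads,
# enforcing a one-hour duration limit. Paths and limit are placeholders.
from src.download import ExceededMaximumDuration, download_url

try:
    files = download_url("https://www.youtube.com/watch?v=4cICErqqRSM",
                         maxDuration=3600, destinationDirectory="./downloads")
    print(files)
except ExceededMaximumDuration as e:
    print(f"Video is {e.videoDuration}s long, but the limit is {e.maxDuration}s")
```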
src/modelCache.py
ADDED
@@ -0,0 +1,17 @@
class ModelCache:
    def __init__(self):
        self._cache = dict()

    def get(self, model_key: str, model_factory):
        result = self._cache.get(model_key)

        if result is None:
            result = model_factory()
            self._cache[model_key] = result
        return result

    def clear(self):
        self._cache.clear()

# A global cache of models. This is mainly used by the daemon processes to avoid loading the same model multiple times.
GLOBAL_MODEL_CACHE = ModelCache()
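
A short usage sketch: the factory callable is only invoked on the first lookup for a given key, so later lookups reuse the already-loaded model. The Whisper model name here is just an example:

```python
import whisper
from src.modelCache import ModelCache

cache = ModelCache()
model = cache.get("whisper-base", lambda: whisper.load_model("base"))       # loads the model
same_model = cache.get("whisper-base", lambda: whisper.load_model("base"))  # returns the cached instance
assert model is same_model
```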
src/segments.py
ADDED
@@ -0,0 +1,55 @@
from typing import Any, Dict, List

import copy

def merge_timestamps(timestamps: List[Dict[str, Any]], merge_window: float = 5, max_merge_size: float = 30, padding_left: float = 1, padding_right: float = 1):
    result = []

    if len(timestamps) == 0:
        return result
    if max_merge_size is None:
        return timestamps

    if padding_left is None:
        padding_left = 0
    if padding_right is None:
        padding_right = 0

    processed_time = 0
    current_segment = None

    for i in range(len(timestamps)):
        next_segment = timestamps[i]

        delta = next_segment['start'] - processed_time

        # Note that segments can still be longer than the max merge size, they just won't be merged in that case
        if current_segment is None or (merge_window is not None and delta > merge_window) \
            or next_segment['end'] - current_segment['start'] > max_merge_size:
            # Finish the current segment
            if current_segment is not None:
                # Add right padding
                finish_padding = min(padding_right, delta / 2) if delta < padding_left + padding_right else padding_right
                current_segment['end'] += finish_padding
                delta -= finish_padding

                result.append(current_segment)

            # Start a new segment
            current_segment = copy.deepcopy(next_segment)

            # Pad the segment
            current_segment['start'] = current_segment['start'] - min(padding_left, delta)
            processed_time = current_segment['end']

        else:
            # Merge the segment
            current_segment['end'] = next_segment['end']
            processed_time = current_segment['end']

    # Add the last segment
    if current_segment is not None:
        current_segment['end'] += padding_right
        result.append(current_segment)

    return result
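
A worked example of `merge_timestamps` with made-up timestamps, using the default merge window (5 s), max merge size (30 s) and 1 s of padding on each side:

```python
from src.segments import merge_timestamps

speech_sections = [
    {'start': 10.0, 'end': 12.0},
    {'start': 14.0, 'end': 16.0},  # 2 s gap: merged into the previous section
    {'start': 30.0, 'end': 35.0},  # 14 s gap: starts a new section
]

merged = merge_timestamps(speech_sections, merge_window=5, max_merge_size=30,
                          padding_left=1, padding_right=1)
print(merged)
# [{'start': 9.0, 'end': 17.0}, {'start': 29.0, 'end': 36.0}]
```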
src/utils.py
ADDED
@@ -0,0 +1,115 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import textwrap
import unicodedata
import re

import zlib
from typing import Iterator, TextIO


def exact_div(x, y):
    assert x % y == 0
    return x // y


def str2bool(string):
    str2val = {"True": True, "False": False}
    if string in str2val:
        return str2val[string]
    else:
        raise ValueError(f"Expected one of {set(str2val.keys())}, got {string}")


def optional_int(string):
    return None if string == "None" else int(string)


def optional_float(string):
    return None if string == "None" else float(string)


def compression_ratio(text) -> float:
    return len(text) / len(zlib.compress(text.encode("utf-8")))


def format_timestamp(seconds: float, always_include_hours: bool = False, fractionalSeperator: str = '.'):
    assert seconds >= 0, "non-negative timestamp expected"
    milliseconds = round(seconds * 1000.0)

    hours = milliseconds // 3_600_000
    milliseconds -= hours * 3_600_000

    minutes = milliseconds // 60_000
    milliseconds -= minutes * 60_000

    seconds = milliseconds // 1_000
    milliseconds -= seconds * 1_000

    hours_marker = f"{hours:02d}:" if always_include_hours or hours > 0 else ""
    return f"{hours_marker}{minutes:02d}:{seconds:02d}{fractionalSeperator}{milliseconds:03d}"


def write_txt(transcript: Iterator[dict], file: TextIO):
    for segment in transcript:
        print(segment['text'].strip(), file=file, flush=True)


def write_vtt(transcript: Iterator[dict], file: TextIO, maxLineWidth=None):
    print("WEBVTT\n", file=file)
    for segment in transcript:
        text = process_text(segment['text'], maxLineWidth).replace('-->', '->')

        print(
            f"{format_timestamp(segment['start'])} --> {format_timestamp(segment['end'])}\n"
            f"{text}\n",
            file=file,
            flush=True,
        )


def write_srt(transcript: Iterator[dict], file: TextIO, maxLineWidth=None):
    """
    Write a transcript to a file in SRT format.
    Example usage:
        from pathlib import Path
        from whisper.utils import write_srt
        result = transcribe(model, audio_path, temperature=temperature, **args)
        # save SRT
        audio_basename = Path(audio_path).stem
        with open(Path(output_dir) / (audio_basename + ".srt"), "w", encoding="utf-8") as srt:
            write_srt(result["segments"], file=srt)
    """
    for i, segment in enumerate(transcript, start=1):
        text = process_text(segment['text'].strip(), maxLineWidth).replace('-->', '->')

        # write srt lines
        print(
            f"{i}\n"
            f"{format_timestamp(segment['start'], always_include_hours=True, fractionalSeperator=',')} --> "
            f"{format_timestamp(segment['end'], always_include_hours=True, fractionalSeperator=',')}\n"
            f"{text}\n",
            file=file,
            flush=True,
        )

def process_text(text: str, maxLineWidth=None):
    if (maxLineWidth is None or maxLineWidth < 0):
        return text

    lines = textwrap.wrap(text, width=maxLineWidth, tabsize=4)
    return '\n'.join(lines)

def slugify(value, allow_unicode=False):
    """
    Taken from https://github.com/django/django/blob/master/django/utils/text.py
    Convert to ASCII if 'allow_unicode' is False. Convert spaces or repeated
    dashes to single dashes. Remove characters that aren't alphanumerics,
    underscores, or hyphens. Convert to lowercase. Also strip leading and
    trailing whitespace, dashes, and underscores.
    """
    value = str(value)
    if allow_unicode:
        value = unicodedata.normalize('NFKC', value)
    else:
        value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode('ascii')
    value = re.sub(r'[^\w\s-]', '', value.lower())
    return re.sub(r'[-\s]+', '-', value).strip('-_')
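As a quick reference, here is a small usage sketch for the helpers above. It is illustrative only: the segment dicts and output filename are made up, and `src.utils` is assumed to be importable from the repository root.

```python
from src.utils import format_timestamp, slugify, write_srt

# Fake segments in the same shape Whisper produces ('start'/'end' in seconds).
segments = [
    {'start': 0.0, 'end': 4.5, 'text': 'Hello world.'},
    {'start': 4.5, 'end': 3671.25, 'text': 'A much longer second segment that will be wrapped.'},
]

# SRT-style timestamp with a comma as the fractional separator.
print(format_timestamp(3671.25, always_include_hours=True, fractionalSeperator=','))  # 01:01:11,250

# Write an SRT file whose name is derived with slugify(), wrapping lines at 80 characters.
with open(slugify("My Episode #1") + ".srt", "w", encoding="utf-8") as srt:
    write_srt(segments, file=srt, maxLineWidth=80)
```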
src/vad.py
ADDED
@@ -0,0 +1,527 @@
from abc import ABC, abstractmethod
from collections import Counter, deque
import time

from typing import Any, Deque, Iterator, List, Dict

from pprint import pprint
from src.modelCache import GLOBAL_MODEL_CACHE, ModelCache

from src.segments import merge_timestamps
from src.whisperContainer import WhisperCallback

# Workaround for https://github.com/tensorflow/tensorflow/issues/48797
try:
    import tensorflow as tf
except ModuleNotFoundError:
    # Error handling
    pass

import torch

import ffmpeg
import numpy as np

from src.utils import format_timestamp
from enum import Enum

class NonSpeechStrategy(Enum):
    """
    Ignore non-speech segments.
    """
    SKIP = 1
    """
    Just treat non-speech segments as speech.
    """
    CREATE_SEGMENT = 2
    """
    Expand speech segments into subsequent non-speech segments.
    """
    EXPAND_SEGMENT = 3

# Defaults for Silero
SPEECH_TRESHOLD = 0.3

# Minimum size of segments to process
MIN_SEGMENT_DURATION = 1

# The maximum time for texts from old segments to be used in the next segment
MAX_PROMPT_WINDOW = 0  # seconds (0 = disabled)
PROMPT_NO_SPEECH_PROB = 0.1  # Do not pass the text from segments with a no speech probability higher than this

VAD_MAX_PROCESSING_CHUNK = 60 * 60  # 60 minutes of audio

class TranscriptionConfig(ABC):
    def __init__(self, non_speech_strategy: NonSpeechStrategy = NonSpeechStrategy.SKIP,
                 segment_padding_left: float = None, segment_padding_right = None, max_silent_period: float = None,
                 max_merge_size: float = None, max_prompt_window: float = None, initial_segment_index = -1):
        self.non_speech_strategy = non_speech_strategy
        self.segment_padding_left = segment_padding_left
        self.segment_padding_right = segment_padding_right
        self.max_silent_period = max_silent_period
        self.max_merge_size = max_merge_size
        self.max_prompt_window = max_prompt_window
        self.initial_segment_index = initial_segment_index

class PeriodicTranscriptionConfig(TranscriptionConfig):
    def __init__(self, periodic_duration: float, non_speech_strategy: NonSpeechStrategy = NonSpeechStrategy.SKIP,
                 segment_padding_left: float = None, segment_padding_right = None, max_silent_period: float = None,
                 max_merge_size: float = None, max_prompt_window: float = None, initial_segment_index = -1):
        super().__init__(non_speech_strategy, segment_padding_left, segment_padding_right, max_silent_period, max_merge_size, max_prompt_window, initial_segment_index)
        self.periodic_duration = periodic_duration

class AbstractTranscription(ABC):
    def __init__(self, sampling_rate: int = 16000):
        self.sampling_rate = sampling_rate

    def get_audio_segment(self, str, start_time: str = None, duration: str = None):
        return load_audio(str, self.sampling_rate, start_time, duration)

    @abstractmethod
    def get_transcribe_timestamps(self, audio: str, config: TranscriptionConfig, start_time: float, end_time: float):
        """
        Get the start and end timestamps of the sections that should be transcribed by this VAD method.

        Parameters
        ----------
        audio: str
            The audio file.
        config: TranscriptionConfig
            The transcription configuration.

        Returns
        -------
        A list of start and end timestamps, in fractional seconds.
        """
        return

    def get_merged_timestamps(self, timestamps: List[Dict[str, Any]], config: TranscriptionConfig, total_duration: float):
        """
        Get the start and end timestamps of the sections that should be transcribed by this VAD method,
        after merging the given segments using the specified configuration.

        Parameters
        ----------
        timestamps: List[Dict[str, Any]]
            The timestamps returned by get_transcribe_timestamps.
        config: TranscriptionConfig
            The transcription configuration.

        Returns
        -------
        A list of start and end timestamps, in fractional seconds.
        """
        merged = merge_timestamps(timestamps, config.max_silent_period, config.max_merge_size,
                                  config.segment_padding_left, config.segment_padding_right)

        if config.non_speech_strategy != NonSpeechStrategy.SKIP:
            # Expand segments to include the gaps between them
            if (config.non_speech_strategy == NonSpeechStrategy.CREATE_SEGMENT):
                # When we have a prompt window, we create speech segments between each segment if we exceed the merge size
                merged = self.fill_gaps(merged, total_duration=total_duration, max_expand_size=config.max_merge_size)
            elif config.non_speech_strategy == NonSpeechStrategy.EXPAND_SEGMENT:
                # With no prompt window, it is better to just expand the segments (this effectively passes the prompt to the next segment)
                merged = self.expand_gaps(merged, total_duration=total_duration)
            else:
                raise Exception("Unknown non-speech strategy: " + str(config.non_speech_strategy))

            print("Transcribing non-speech:")
            pprint(merged)
        return merged

    def transcribe(self, audio: str, whisperCallable: WhisperCallback, config: TranscriptionConfig):
        """
        Transcribe the given audio file.

        Parameters
        ----------
        audio: str
            The audio file.
        whisperCallable: WhisperCallback
            A callback object to call to transcribe each segment.
        config: TranscriptionConfig
            The transcription configuration.

        Returns
        -------
        A dictionary with the transcribed text, the list of segments, and the detected language.
        """

        max_audio_duration = get_audio_duration(audio)
        timestamp_segments = self.get_transcribe_timestamps(audio, config, 0, max_audio_duration)

        # Get speech timestamps from full audio file
        merged = self.get_merged_timestamps(timestamp_segments, config, max_audio_duration)

        # A deque of transcribed segments that is passed to the next segment as a prompt
        prompt_window = deque()

        print("Processing timestamps:")
        pprint(merged)

        result = {
            'text': "",
            'segments': [],
            'language': ""
        }
        languageCounter = Counter()
        detected_language = None

        segment_index = config.initial_segment_index

        # For each time segment, run whisper
        for segment in merged:
            segment_index += 1
            segment_start = segment['start']
            segment_end = segment['end']
            segment_expand_amount = segment.get('expand_amount', 0)
            segment_gap = segment.get('gap', False)

            segment_duration = segment_end - segment_start

            if segment_duration < MIN_SEGMENT_DURATION:
                continue

            # Audio to run on Whisper
            segment_audio = self.get_audio_segment(audio, start_time = str(segment_start), duration = str(segment_duration))
            # Previous segments to use as a prompt
            segment_prompt = ' '.join([segment['text'] for segment in prompt_window]) if len(prompt_window) > 0 else None

            # Detected language
            detected_language = languageCounter.most_common(1)[0][0] if len(languageCounter) > 0 else None

            print("Running whisper from ", format_timestamp(segment_start), " to ", format_timestamp(segment_end), ", duration: ",
                  segment_duration, "expanded: ", segment_expand_amount, "prompt: ", segment_prompt, "language: ", detected_language)
            segment_result = whisperCallable.invoke(segment_audio, segment_index, segment_prompt, detected_language)

            adjusted_segments = self.adjust_timestamp(segment_result["segments"], adjust_seconds=segment_start, max_source_time=segment_duration)

            # Propagate expand amount to the segments
            if (segment_expand_amount > 0):
                segment_without_expansion = segment_duration - segment_expand_amount

                for adjusted_segment in adjusted_segments:
                    adjusted_segment_end = adjusted_segment['end']

                    # Add expand amount if the segment got expanded
                    if (adjusted_segment_end > segment_without_expansion):
                        adjusted_segment["expand_amount"] = adjusted_segment_end - segment_without_expansion

            # Append to output
            result['text'] += segment_result['text']
            result['segments'].extend(adjusted_segments)

            # Increment detected language
            if not segment_gap:
                languageCounter[segment_result['language']] += 1

            # Update prompt window
            self.__update_prompt_window(prompt_window, adjusted_segments, segment_end, segment_gap, config)

        if detected_language is not None:
            result['language'] = detected_language

        return result

    def __update_prompt_window(self, prompt_window: Deque, adjusted_segments: List, segment_end: float, segment_gap: bool, config: TranscriptionConfig):
        if (config.max_prompt_window is not None and config.max_prompt_window > 0):
            # Add segments to the current prompt window (unless it is a speech gap)
            if not segment_gap:
                for segment in adjusted_segments:
                    if segment.get('no_speech_prob', 0) <= PROMPT_NO_SPEECH_PROB:
                        prompt_window.append(segment)

            while (len(prompt_window) > 0):
                first_end_time = prompt_window[0].get('end', 0)
                # Time expanded in the segments should be discounted from the prompt window
                first_expand_time = prompt_window[0].get('expand_amount', 0)

                if (first_end_time - first_expand_time < segment_end - config.max_prompt_window):
                    prompt_window.popleft()
                else:
                    break

    def include_gaps(self, segments: Iterator[dict], min_gap_length: float, total_duration: float):
        result = []
        last_end_time = 0

        for segment in segments:
            segment_start = float(segment['start'])
            segment_end = float(segment['end'])

            if (last_end_time != segment_start):
                delta = segment_start - last_end_time

                if (min_gap_length is None or delta >= min_gap_length):
                    result.append( { 'start': last_end_time, 'end': segment_start, 'gap': True } )

            last_end_time = segment_end
            result.append(segment)

        # Also include total duration if specified
        if (total_duration is not None and last_end_time < total_duration):
            delta = total_duration - segment_start

            if (min_gap_length is None or delta >= min_gap_length):
                result.append( { 'start': last_end_time, 'end': total_duration, 'gap': True } )

        return result

    # Expand the end time of each segment to the start of the next segment
    def expand_gaps(self, segments: List[Dict[str, Any]], total_duration: float):
        result = []

        if len(segments) == 0:
            return result

        # Add gap at the beginning if needed
        if (segments[0]['start'] > 0):
            result.append({ 'start': 0, 'end': segments[0]['start'], 'gap': True } )

        for i in range(len(segments) - 1):
            current_segment = segments[i]
            next_segment = segments[i + 1]

            delta = next_segment['start'] - current_segment['end']

            # Expand if the gap actually exists
            if (delta >= 0):
                current_segment = current_segment.copy()
                current_segment['expand_amount'] = delta
                current_segment['end'] = next_segment['start']

            result.append(current_segment)

        # Add last segment
        last_segment = segments[-1]
        result.append(last_segment)

        # Also include total duration if specified
        if (total_duration is not None):
            last_segment = result[-1]

            if (last_segment['end'] < total_duration):
                last_segment = last_segment.copy()
                last_segment['end'] = total_duration
                result[-1] = last_segment

        return result

    def fill_gaps(self, segments: List[Dict[str, Any]], total_duration: float, max_expand_size: float = None):
        result = []

        if len(segments) == 0:
            return result

        # Add gap at the beginning if needed
        if (segments[0]['start'] > 0):
            result.append({ 'start': 0, 'end': segments[0]['start'], 'gap': True } )

        for i in range(len(segments) - 1):
            expanded = False
            current_segment = segments[i]
            next_segment = segments[i + 1]

            delta = next_segment['start'] - current_segment['end']

            if (max_expand_size is not None and delta <= max_expand_size):
                # Just expand the current segment
                current_segment = current_segment.copy()
                current_segment['expand_amount'] = delta
                current_segment['end'] = next_segment['start']
                expanded = True

            result.append(current_segment)

            # Add a gap to the next segment if needed
            if (delta >= 0 and not expanded):
                result.append({ 'start': current_segment['end'], 'end': next_segment['start'], 'gap': True } )

        # Add last segment
        last_segment = segments[-1]
        result.append(last_segment)

        # Also include total duration if specified
        if (total_duration is not None):
            last_segment = result[-1]

            delta = total_duration - last_segment['end']

            if (delta > 0):
                if (max_expand_size is not None and delta <= max_expand_size):
                    # Expand the last segment
                    last_segment = last_segment.copy()
                    last_segment['expand_amount'] = delta
                    last_segment['end'] = total_duration
                    result[-1] = last_segment
                else:
                    result.append({ 'start': last_segment['end'], 'end': total_duration, 'gap': True } )

        return result

    def adjust_timestamp(self, segments: Iterator[dict], adjust_seconds: float, max_source_time: float = None):
        result = []

        for segment in segments:
            segment_start = float(segment['start'])
            segment_end = float(segment['end'])

            # Filter segments?
            if (max_source_time is not None):
                if (segment_start > max_source_time):
                    continue
                segment_end = min(max_source_time, segment_end)

            new_segment = segment.copy()

            # Add to start and end
            new_segment['start'] = segment_start + adjust_seconds
            new_segment['end'] = segment_end + adjust_seconds
            result.append(new_segment)
        return result

    def multiply_timestamps(self, timestamps: List[Dict[str, Any]], factor: float):
        result = []

        for entry in timestamps:
            start = entry['start']
            end = entry['end']

            result.append({
                'start': start * factor,
                'end': end * factor
            })
        return result


class VadSileroTranscription(AbstractTranscription):
    def __init__(self, sampling_rate: int = 16000, cache: ModelCache = None):
        super().__init__(sampling_rate=sampling_rate)
        self.model = None
        self.cache = cache
        self._initialize_model()

    def _initialize_model(self):
        if (self.cache is not None):
            model_key = "VadSileroTranscription"
            self.model, self.get_speech_timestamps = self.cache.get(model_key, self._create_model)
            print("Loaded Silero model from cache.")
        else:
            self.model, self.get_speech_timestamps = self._create_model()
            print("Created Silero model")

    def _create_model(self):
        model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad', model='silero_vad')

        # Silero does not benefit from multi-threading
        torch.set_num_threads(1)  # JIT
        (get_speech_timestamps, _, _, _, _) = utils

        return model, get_speech_timestamps

    def get_transcribe_timestamps(self, audio: str, config: TranscriptionConfig, start_time: float, end_time: float):
        result = []

        print("Getting timestamps from audio file: {}, start: {}, duration: {}".format(audio, start_time, end_time))
        perf_start_time = time.perf_counter()

        # Divide processing of audio into chunks
        chunk_start = start_time

        while (chunk_start < end_time):
            chunk_duration = min(end_time - chunk_start, VAD_MAX_PROCESSING_CHUNK)

            print("Processing VAD in chunk from {} to {}".format(format_timestamp(chunk_start), format_timestamp(chunk_start + chunk_duration)))
            wav = self.get_audio_segment(audio, str(chunk_start), str(chunk_duration))

            sample_timestamps = self.get_speech_timestamps(wav, self.model, sampling_rate=self.sampling_rate, threshold=SPEECH_TRESHOLD)
            seconds_timestamps = self.multiply_timestamps(sample_timestamps, factor=1 / self.sampling_rate)
            adjusted = self.adjust_timestamp(seconds_timestamps, adjust_seconds=chunk_start, max_source_time=chunk_start + chunk_duration)

            #pprint(adjusted)

            result.extend(adjusted)
            chunk_start += chunk_duration

        perf_end_time = time.perf_counter()
        print("VAD processing took {} seconds".format(perf_end_time - perf_start_time))

        return result

    def __getstate__(self):
        # We only need the sampling rate
        return { 'sampling_rate': self.sampling_rate }

    def __setstate__(self, state):
        self.sampling_rate = state['sampling_rate']
        self.model = None
        # Use the global cache
        self.cache = GLOBAL_MODEL_CACHE
        self._initialize_model()

# A very simple VAD that just marks every N seconds as speech
class VadPeriodicTranscription(AbstractTranscription):
    def __init__(self, sampling_rate: int = 16000):
        super().__init__(sampling_rate=sampling_rate)

    def get_transcribe_timestamps(self, audio: str, config: PeriodicTranscriptionConfig, start_time: float, end_time: float):
        result = []

        # Generate a timestamp every N seconds
        start_timestamp = start_time

        while (start_timestamp < end_time):
            end_timestamp = min(start_timestamp + config.periodic_duration, end_time)
            segment_duration = end_timestamp - start_timestamp

            # Minimum duration is 1 second
            if (segment_duration >= 1):
                result.append( { 'start': start_timestamp, 'end': end_timestamp } )

            start_timestamp = end_timestamp

        return result

def get_audio_duration(file: str):
    return float(ffmpeg.probe(file)["format"]["duration"])

def load_audio(file: str, sample_rate: int = 16000,
               start_time: str = None, duration: str = None):
    """
    Open an audio file and read as mono waveform, resampling as necessary.

    Parameters
    ----------
    file: str
        The audio file to open

    sample_rate: int
        The sample rate to resample the audio to, if necessary

    start_time: str
        The start time, using the standard FFMPEG time duration syntax, or None to disable.

    duration: str
        The duration, using the standard FFMPEG time duration syntax, or None to disable.

    Returns
    -------
    A NumPy array containing the audio waveform, in float32 dtype.
    """
    try:
        inputArgs = {'threads': 0}

        if (start_time is not None):
            inputArgs['ss'] = start_time
        if (duration is not None):
            inputArgs['t'] = duration

        # This launches a subprocess to decode audio while down-mixing and resampling as necessary.
        # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.
        out, _ = (
            ffmpeg.input(file, **inputArgs)
            .output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=sample_rate)
            .run(cmd="ffmpeg", capture_stdout=True, capture_stderr=True)
        )
    except ffmpeg.Error as e:
        raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}")

    return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0
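To make the flow above concrete, here is a minimal sketch using the periodic VAD. It is illustrative only: the file name is a placeholder (the periodic strategy never opens it here), and the project's dependencies are assumed to be installed so that `src.vad` imports cleanly.

```python
from src.vad import PeriodicTranscriptionConfig, VadPeriodicTranscription

config = PeriodicTranscriptionConfig(periodic_duration=30, max_silent_period=1, max_merge_size=60,
                                     segment_padding_left=1, segment_padding_right=1)
vad = VadPeriodicTranscription()

# Cut a (pretend) 125-second file into 30-second windows...
chunks = vad.get_transcribe_timestamps("dummy.wav", config, start_time=0, end_time=125)
# -> [{'start': 0, 'end': 30}, {'start': 30, 'end': 60}, ..., {'start': 120, 'end': 125}]

# ...then merge and pad them the same way transcribe() would before invoking Whisper.
merged = vad.get_merged_timestamps(chunks, config, total_duration=125)
print(merged)
```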
src/vadParallel.py
ADDED
@@ -0,0 +1,251 @@
import multiprocessing
import threading
import time
from src.vad import AbstractTranscription, TranscriptionConfig, get_audio_duration
from src.whisperContainer import WhisperCallback

from multiprocessing import Pool

from typing import Any, Dict, List
import os


class ParallelContext:
    def __init__(self, num_processes: int = None, auto_cleanup_timeout_seconds: float = None):
        self.num_processes = num_processes
        self.auto_cleanup_timeout_seconds = auto_cleanup_timeout_seconds
        self.lock = threading.Lock()

        self.ref_count = 0
        self.pool = None
        self.cleanup_timer = None

    def get_pool(self):
        # Initialize pool lazily
        if (self.pool is None):
            context = multiprocessing.get_context('spawn')
            self.pool = context.Pool(self.num_processes)

        self.ref_count = self.ref_count + 1

        if (self.auto_cleanup_timeout_seconds is not None):
            self._stop_auto_cleanup()

        return self.pool

    def return_pool(self, pool):
        if (self.pool == pool and self.ref_count > 0):
            self.ref_count = self.ref_count - 1

            if (self.ref_count == 0):
                if (self.auto_cleanup_timeout_seconds is not None):
                    self._start_auto_cleanup()

    def _start_auto_cleanup(self):
        if (self.cleanup_timer is not None):
            self.cleanup_timer.cancel()
        self.cleanup_timer = threading.Timer(self.auto_cleanup_timeout_seconds, self._execute_cleanup)
        self.cleanup_timer.start()

        print("Started auto cleanup of pool in " + str(self.auto_cleanup_timeout_seconds) + " seconds")

    def _stop_auto_cleanup(self):
        if (self.cleanup_timer is not None):
            self.cleanup_timer.cancel()
            self.cleanup_timer = None

            print("Stopped auto cleanup of pool")

    def _execute_cleanup(self):
        print("Executing cleanup of pool")

        if (self.ref_count == 0):
            self.close()

    def close(self):
        self._stop_auto_cleanup()

        if (self.pool is not None):
            print("Closing pool of " + str(self.num_processes) + " processes")
            self.pool.close()
            self.pool.join()
        self.pool = None

class ParallelTranscriptionConfig(TranscriptionConfig):
    def __init__(self, device_id: str, override_timestamps, initial_segment_index, copy: TranscriptionConfig = None):
        super().__init__(copy.non_speech_strategy, copy.segment_padding_left, copy.segment_padding_right, copy.max_silent_period, copy.max_merge_size, copy.max_prompt_window, initial_segment_index)
        self.device_id = device_id
        self.override_timestamps = override_timestamps

class ParallelTranscription(AbstractTranscription):
    # Silero VAD typically takes about 3 seconds per minute, so there's no need to split the chunks
    # into smaller segments than 2 minutes (min 6 seconds per CPU core)
    MIN_CPU_CHUNK_SIZE_SECONDS = 2 * 60

    def __init__(self, sampling_rate: int = 16000):
        super().__init__(sampling_rate=sampling_rate)

    def transcribe_parallel(self, transcription: AbstractTranscription, audio: str, whisperCallable: WhisperCallback, config: TranscriptionConfig,
                            cpu_device_count: int, gpu_devices: List[str], cpu_parallel_context: ParallelContext = None, gpu_parallel_context: ParallelContext = None):
        total_duration = get_audio_duration(audio)

        # First, get the timestamps for the original audio
        if (cpu_device_count > 1):
            merged = self._get_merged_timestamps_parallel(transcription, audio, config, total_duration, cpu_device_count, cpu_parallel_context)
        else:
            timestamp_segments = transcription.get_transcribe_timestamps(audio, config, 0, total_duration)
            merged = transcription.get_merged_timestamps(timestamp_segments, config, total_duration)

        # Split into a list for each device
        # TODO: Split by time instead of by number of chunks
        merged_split = list(self._split(merged, len(gpu_devices)))

        # Parameters that will be passed to the transcribe function
        parameters = []
        segment_index = config.initial_segment_index

        for i in range(len(merged_split)):
            device_segment_list = list(merged_split[i])
            device_id = gpu_devices[i]

            if (len(device_segment_list) <= 0):
                continue

            print("Device " + str(device_id) + " (index " + str(i) + ") has " + str(len(device_segment_list)) + " segments")

            # Create a new config with the given device ID
            device_config = ParallelTranscriptionConfig(device_id, device_segment_list, segment_index, config)
            segment_index += len(device_segment_list)

            parameters.append([audio, whisperCallable, device_config])

        merged = {
            'text': '',
            'segments': [],
            'language': None
        }

        created_context = False

        perf_start_gpu = time.perf_counter()

        # Spawn a separate process for each device
        try:
            if (gpu_parallel_context is None):
                gpu_parallel_context = ParallelContext(len(gpu_devices))
                created_context = True

            # Get a pool of processes
            pool = gpu_parallel_context.get_pool()

            # Run the transcription in parallel
            results = pool.starmap(self.transcribe, parameters)

            for result in results:
                # Merge the results
                if (result['text'] is not None):
                    merged['text'] += result['text']
                if (result['segments'] is not None):
                    merged['segments'].extend(result['segments'])
                if (result['language'] is not None):
                    merged['language'] = result['language']

        finally:
            # Return the pool to the context
            if (gpu_parallel_context is not None):
                gpu_parallel_context.return_pool(pool)
            # Always close the context if we created it
            if (created_context):
                gpu_parallel_context.close()

        perf_end_gpu = time.perf_counter()
        print("Parallel transcription took " + str(perf_end_gpu - perf_start_gpu) + " seconds")

        return merged

    def _get_merged_timestamps_parallel(self, transcription: AbstractTranscription, audio: str, config: TranscriptionConfig, total_duration: float,
                                        cpu_device_count: int, cpu_parallel_context: ParallelContext = None):
        parameters = []

        chunk_size = max(total_duration / cpu_device_count, self.MIN_CPU_CHUNK_SIZE_SECONDS)
        chunk_start = 0
        cpu_device_id = 0

        perf_start_time = time.perf_counter()

        # Create chunks that will be processed on the CPU
        while (chunk_start < total_duration):
            chunk_end = min(chunk_start + chunk_size, total_duration)

            if (chunk_end - chunk_start < 1):
                # No need to process chunks that are less than 1 second
                break

            print("Parallel VAD: Executing chunk from " + str(chunk_start) + " to " +
                  str(chunk_end) + " on CPU device " + str(cpu_device_id))
            parameters.append([audio, config, chunk_start, chunk_end])

            cpu_device_id += 1
            chunk_start = chunk_end

        created_context = False

        # Spawn a separate process for each device
        try:
            if (cpu_parallel_context is None):
                cpu_parallel_context = ParallelContext(cpu_device_count)
                created_context = True

            # Get a pool of processes
            pool = cpu_parallel_context.get_pool()

            # Run the transcription in parallel. Note that transcription must be picklable.
            results = pool.starmap(transcription.get_transcribe_timestamps, parameters)

            timestamps = []

            # Flatten the results
            for result in results:
                timestamps.extend(result)

            merged = transcription.get_merged_timestamps(timestamps, config, total_duration)

            perf_end_time = time.perf_counter()
            print("Parallel VAD processing took {} seconds".format(perf_end_time - perf_start_time))
            return merged

        finally:
            # Return the pool to the context
            if (cpu_parallel_context is not None):
                cpu_parallel_context.return_pool(pool)
            # Always close the context if we created it
            if (created_context):
                cpu_parallel_context.close()

    def get_transcribe_timestamps(self, audio: str, config: ParallelTranscriptionConfig, start_time: float, duration: float):
        return []

    def get_merged_timestamps(self, timestamps: List[Dict[str, Any]], config: ParallelTranscriptionConfig, total_duration: float):
        # Override timestamps that will be processed
        if (config.override_timestamps is not None):
            print("Using override timestamps of size " + str(len(config.override_timestamps)))
            return config.override_timestamps
        return super().get_merged_timestamps(timestamps, config, total_duration)

    def transcribe(self, audio: str, whisperCallable: WhisperCallback, config: ParallelTranscriptionConfig):
        # Override device ID the first time
        if (os.environ.get("INITIALIZED", None) is None):
            os.environ["INITIALIZED"] = "1"

            # Note that this may be None if the user didn't specify a device. In that case, Whisper will
            # just use the default GPU device.
            if (config.device_id is not None):
                print("Using device " + config.device_id)
                os.environ["CUDA_VISIBLE_DEVICES"] = config.device_id

        return super().transcribe(audio, whisperCallable, config)

    def _split(self, a, n):
        """Split a list into n approximately equal parts."""
        k, m = divmod(len(a), n)
        return (a[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n))
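A rough sketch of how the parallel path is intended to be driven (the device IDs, model name and audio path are placeholders; in the Space itself this wiring is done by app.py). Because the pools use the `spawn` start method, the entry point must be guarded:

```python
from src.vad import TranscriptionConfig, VadSileroTranscription
from src.vadParallel import ParallelContext, ParallelTranscription
from src.whisperContainer import WhisperContainer

if __name__ == '__main__':
    whisper_container = WhisperContainer("medium")
    callback = whisper_container.create_callback(language=None, task="transcribe")

    vad = VadSileroTranscription()
    config = TranscriptionConfig(max_silent_period=30, max_merge_size=30,
                                 segment_padding_left=1, segment_padding_right=1)

    parallel = ParallelTranscription()
    result = parallel.transcribe_parallel(
        transcription=vad, audio="episode.mp3", whisperCallable=callback, config=config,
        cpu_device_count=2, gpu_devices=["0", "1"],
        cpu_parallel_context=ParallelContext(num_processes=2),
        gpu_parallel_context=ParallelContext(num_processes=2))

    print(result['text'])
```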
src/whisperContainer.py
ADDED
@@ -0,0 +1,106 @@
# External programs
import whisper

from src.modelCache import GLOBAL_MODEL_CACHE, ModelCache

class WhisperContainer:
    def __init__(self, model_name: str, device: str = None, download_root: str = None, cache: ModelCache = None):
        self.model_name = model_name
        self.device = device
        self.download_root = download_root
        self.cache = cache

        # Will be created on demand
        self.model = None

    def get_model(self):
        if self.model is None:

            if (self.cache is None):
                self.model = self._create_model()
            else:
                model_key = "WhisperContainer." + self.model_name + ":" + (self.device if self.device else '')
                self.model = self.cache.get(model_key, self._create_model)
        return self.model

    def _create_model(self):
        print("Loading whisper model " + self.model_name)
        return whisper.load_model(self.model_name, device=self.device, download_root=self.download_root)

    def create_callback(self, language: str = None, task: str = None, initial_prompt: str = None, **decodeOptions: dict):
        """
        Create a WhisperCallback object that can be used to transcribe audio files.

        Parameters
        ----------
        language: str
            The target language of the transcription. If not specified, the language will be inferred from the audio content.
        task: str
            The task - either translate or transcribe.
        initial_prompt: str
            The initial prompt to use for the transcription.
        decodeOptions: dict
            Additional options to pass to the decoder. Must be pickleable.

        Returns
        -------
        A WhisperCallback object.
        """
        return WhisperCallback(self, language=language, task=task, initial_prompt=initial_prompt, **decodeOptions)

    # This is required for multiprocessing
    def __getstate__(self):
        return { "model_name": self.model_name, "device": self.device, "download_root": self.download_root }

    def __setstate__(self, state):
        self.model_name = state["model_name"]
        self.device = state["device"]
        self.download_root = state["download_root"]
        self.model = None
        # Depickled objects must use the global cache
        self.cache = GLOBAL_MODEL_CACHE


class WhisperCallback:
    def __init__(self, model_container: WhisperContainer, language: str = None, task: str = None, initial_prompt: str = None, **decodeOptions: dict):
        self.model_container = model_container
        self.language = language
        self.task = task
        self.initial_prompt = initial_prompt
        self.decodeOptions = decodeOptions

    def invoke(self, audio, segment_index: int, prompt: str, detected_language: str):
        """
        Perform the transcription of the given audio file or data.

        Parameters
        ----------
        audio: Union[str, np.ndarray, torch.Tensor]
            The audio file to transcribe, or the audio data as a numpy array or torch tensor.
        segment_index: int
            The zero-based index of the segment being transcribed.
        prompt: str
            The prompt to use for the transcription.
        detected_language: str
            The detected language of the audio file.

        Returns
        -------
        The result of the Whisper call.
        """
        model = self.model_container.get_model()

        return model.transcribe(audio,
            language=self.language if self.language else detected_language, task=self.task,
            initial_prompt=self._concat_prompt(self.initial_prompt, prompt) if segment_index == 0 else prompt,
            **self.decodeOptions)

    def _concat_prompt(self, prompt1, prompt2):
        if (prompt1 is None):
            return prompt2
        elif (prompt2 is None):
            return prompt1
        else:
            return prompt1 + " " + prompt2
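For the single-process case, the container and callback are used roughly like this (a sketch: the model name and audio path are placeholders, and the shared `GLOBAL_MODEL_CACHE` is reused so the loaded model survives across calls):

```python
from src.modelCache import GLOBAL_MODEL_CACHE
from src.whisperContainer import WhisperContainer

container = WhisperContainer("tiny", cache=GLOBAL_MODEL_CACHE)
callback = container.create_callback(language="en", task="transcribe", initial_prompt=None)

# The Whisper model itself is only loaded on the first invoke().
# segment_index == 0 means the initial prompt (if any) is prepended to the per-segment prompt.
result = callback.invoke("audio.mp3", segment_index=0, prompt=None, detected_language=None)
print(result["text"])
```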
tests/segments_test.py
ADDED
@@ -0,0 +1,48 @@
import sys
import unittest

sys.path.append('../whisper-webui')

from src.segments import merge_timestamps

class TestSegments(unittest.TestCase):
    def __init__(self, *args, **kwargs):
        super(TestSegments, self).__init__(*args, **kwargs)

    def test_merge_segments(self):
        segments = [
            {'start': 10.0, 'end': 20.0},
            {'start': 22.0, 'end': 27.0},
            {'start': 31.0, 'end': 35.0},
            {'start': 45.0, 'end': 60.0},
            {'start': 61.0, 'end': 65.0},
            {'start': 68.0, 'end': 98.0},
            {'start': 100.0, 'end': 102.0},
            {'start': 110.0, 'end': 112.0}
        ]

        result = merge_timestamps(segments, merge_window=5, max_merge_size=30, padding_left=1, padding_right=1)

        self.assertListEqual(result, [
            {'start': 9.0, 'end': 36.0},
            {'start': 44.0, 'end': 66.0},
            {'start': 67.0, 'end': 99.0},
            {'start': 99.0, 'end': 103.0},
            {'start': 109.0, 'end': 113.0}
        ])

    def test_overlap_next(self):
        segments = [
            {'start': 5.0, 'end': 39.182},
            {'start': 39.986, 'end': 40.814}
        ]

        result = merge_timestamps(segments, merge_window=5, max_merge_size=30, padding_left=1, padding_right=1)

        self.assertListEqual(result, [
            {'start': 4.0, 'end': 39.584},
            {'start': 39.584, 'end': 41.814}
        ])

if __name__ == '__main__':
    unittest.main()
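The first assertion can also be checked interactively; this snippet just replays the head of that test case:

```python
from src.segments import merge_timestamps

segments = [{'start': 10.0, 'end': 20.0}, {'start': 22.0, 'end': 27.0}, {'start': 31.0, 'end': 35.0}]

# Gaps of 2s and 4s are below the 5s merge window, so everything collapses into one padded segment.
print(merge_timestamps(segments, merge_window=5, max_merge_size=30, padding_left=1, padding_right=1))
# -> [{'start': 9.0, 'end': 36.0}], matching the first merged entry asserted above.
```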
tests/vad_test.py
ADDED
@@ -0,0 +1,66 @@
import pprint
import unittest
import numpy as np
import sys

sys.path.append('../whisper-webui')

from src.vad import AbstractTranscription, TranscriptionConfig, VadSileroTranscription

class TestVad(unittest.TestCase):
    def __init__(self, *args, **kwargs):
        super(TestVad, self).__init__(*args, **kwargs)
        self.transcribe_calls = []

    def test_transcript(self):
        mock = MockVadTranscription()

        self.transcribe_calls.clear()
        result = mock.transcribe("mock", lambda segment : self.transcribe_segments(segment))

        self.assertListEqual(self.transcribe_calls, [
            [30, 30],
            [100, 100]
        ])

        self.assertListEqual(result['segments'],
            [{'end': 50.0, 'start': 40.0, 'text': 'Hello world '},
             {'end': 120.0, 'start': 110.0, 'text': 'Hello world '}]
        )

    def transcribe_segments(self, segment):
        self.transcribe_calls.append(segment.tolist())

        # Dummy text
        return {
            'text': "Hello world ",
            'segments': [
                {
                    "start": 10.0,
                    "end": 20.0,
                    "text": "Hello world "
                }
            ],
            'language': ""
        }

class MockVadTranscription(AbstractTranscription):
    def __init__(self):
        super().__init__()

    def get_audio_segment(self, str, start_time: str = None, duration: str = None):
        start_time_seconds = float(start_time.removesuffix("s"))
        duration_seconds = float(duration.removesuffix("s"))

        # For mocking, this just returns a simple numpy array
        return np.array([start_time_seconds, duration_seconds], dtype=np.float64)

    def get_transcribe_timestamps(self, audio: str, config: TranscriptionConfig, start_time: float, duration: float):
        result = []

        result.append( { 'start': 30, 'end': 60 } )
        result.append( { 'start': 100, 'end': 200 } )
        return result

if __name__ == '__main__':
    unittest.main()
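Both test modules can be run with the standard unittest runner; a minimal sketch, assuming the repository root is the working directory (note the tests also append '../whisper-webui' to sys.path, so the checkout directory name matters):

```python
import unittest

# Discover tests/segments_test.py and tests/vad_test.py and run them.
suite = unittest.defaultTestLoader.discover("tests", pattern="*_test.py")
unittest.TextTestRunner(verbosity=2).run(suite)
```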