aadnk committed
Commit 295de00
Parent: c90f138

Adding support for faster_whisper

This is a re-implementation of Whisper in CTranslate2 that can be 4x faster
and use much less memory than OpenAI's Whisper.
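As a quick orientation before the diff: the implementation is selected through the new whisper_implementation setting and the factory added in src/whisper/whisperFactory.py. A minimal sketch, assuming the dependencies from requirements-fastWhisper.txt are installed and the default config defines a model named "medium":

from src.config import ApplicationConfig
from src.whisper.whisperFactory import create_whisper_container

app_config = ApplicationConfig.create_default()

# Build a container backed by faster-whisper instead of openai/whisper
model = create_whisper_container(whisper_implementation="faster-whisper",
                                 model_name="medium", models=app_config.models)

# The callback is what the VAD/transcription pipeline actually invokes
callback = model.create_callback(language="english", task="transcribe")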

app.py CHANGED
@@ -11,8 +11,11 @@ import zipfile
 import numpy as np
 
 import torch
+
 from src.config import ApplicationConfig
-from src.hooks.whisperProgressHook import ProgressListener, SubTaskProgressListener, create_progress_listener_handle
+from src.hooks.progressListener import ProgressListener
+from src.hooks.subTaskProgressListener import SubTaskProgressListener
+from src.hooks.whisperProgressHook import create_progress_listener_handle
 from src.modelCache import ModelCache
 from src.source import get_audio_source_collection
 from src.vadParallel import ParallelContext, ParallelTranscription
@@ -26,7 +29,8 @@ import gradio as gr
 from src.download import ExceededMaximumDuration, download_url
 from src.utils import slugify, write_srt, write_vtt
 from src.vad import AbstractTranscription, NonSpeechStrategy, PeriodicTranscriptionConfig, TranscriptionConfig, VadPeriodicTranscription, VadSileroTranscription
-from src.whisperContainer import WhisperContainer
+from src.whisper.abstractWhisperContainer import AbstractWhisperContainer
+from src.whisper.whisperFactory import create_whisper_container
 
 # Configure more application defaults in config.json5
 
@@ -121,7 +125,8 @@ class WhisperTranscriber:
         selectedLanguage = languageName.lower() if len(languageName) > 0 else None
         selectedModel = modelName if modelName is not None else "base"
 
-        model = WhisperContainer(model_name=selectedModel, cache=self.model_cache, models=self.app_config.models)
+        model = create_whisper_container(whisper_implementation=self.app_config.whisper_implementation,
+                                         model_name=selectedModel, cache=self.model_cache, models=self.app_config.models)
 
         # Result
         download = []
@@ -223,7 +228,7 @@ class WhisperTranscriber:
         except ExceededMaximumDuration as e:
             return [], ("[ERROR]: Maximum remote video length is " + str(e.maxDuration) + "s, file was " + str(e.videoDuration) + "s"), "[ERROR]"
 
-    def transcribe_file(self, model: WhisperContainer, audio_path: str, language: str, task: str = None, vad: str = None,
+    def transcribe_file(self, model: AbstractWhisperContainer, audio_path: str, language: str, task: str = None, vad: str = None,
                         vadMergeWindow: float = 5, vadMaxMergeSize: float = 150, vadPadding: float = 1, vadPromptWindow: float = 1,
                         progressListener: ProgressListener = None, **decodeOptions: dict):
 
@@ -507,7 +512,9 @@ if __name__ == '__main__':
    parser.add_argument("--auto_parallel", type=bool, default=app_config.auto_parallel, \
                        help="True to use all available GPUs and CPU cores for processing. Use vad_cpu_cores/vad_parallel_devices to specify the number of CPU cores/GPUs to use.") # False
    parser.add_argument("--output_dir", "-o", type=str, default=app_config.output_dir, \
-                       help="directory to save the outputs") # None
+                       help="directory to save the outputs")
+    parser.add_argument("--whisper_implementation", type=str, default=app_config.whisper_implementation, choices=["whisper", "faster-whisper"], \
+                        help="the Whisper implementation to use")
 
    args = parser.parse_args().__dict__
cli.py CHANGED
@@ -11,7 +11,7 @@ from src.config import ApplicationConfig
 from src.download import download_url
 
 from src.utils import optional_float, optional_int, str2bool
-from src.whisperContainer import WhisperContainer
+from src.whisper.whisperFactory import create_whisper_container
 
 def cli():
     app_config = ApplicationConfig.create_default()
@@ -32,8 +32,10 @@ def cli():
     parser.add_argument("--output_dir", "-o", type=str, default=output_dir, \
                         help="directory to save the outputs")
     parser.add_argument("--verbose", type=str2bool, default=app_config.verbose, \
                         help="whether to print out the progress and debug messages")
+    parser.add_argument("--whisper_implementation", type=str, default=app_config.whisper_implementation, choices=["whisper", "faster-whisper"], \
+                        help="the Whisper implementation to use")
 
     parser.add_argument("--task", type=str, default=app_config.task, choices=["transcribe", "translate"], \
                         help="whether to perform X->X speech recognition ('transcribe') or X->English translation ('translate')")
     parser.add_argument("--language", type=str, default=app_config.language, choices=sorted(LANGUAGES), \
@@ -92,6 +94,8 @@ def cli():
     device: str = args.pop("device")
     os.makedirs(output_dir, exist_ok=True)
 
+    whisper_implementation = args.pop("whisper_implementation")
+
     if model_name.endswith(".en") and args["language"] not in {"en", "English"}:
         warnings.warn(f"{model_name} is an English-only model but received '{args['language']}'; using English instead.")
         args["language"] = "en"
@@ -115,7 +119,8 @@ def cli():
     transcriber.set_parallel_devices(args.pop("vad_parallel_devices"))
     transcriber.set_auto_parallel(auto_parallel)
 
-    model = WhisperContainer(model_name, device=device, download_root=model_dir, models=app_config.models)
+    model = create_whisper_container(whisper_implementation=whisper_implementation, model_name=model_name,
+                                     device=device, download_root=model_dir, models=app_config.models)
 
     if (transcriber._has_parallel_devices()):
         print("Using parallel devices:", transcriber.parallel_device_list)
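Note the flow above: cli.py parses everything into a dict and pops each option it consumes, so whatever remains is forwarded to the backend as **decodeOptions. A simplified sketch of that pattern (the --beam_size option here is only an illustrative decode option, not part of this diff):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--whisper_implementation", type=str, default="whisper")
parser.add_argument("--model", type=str, default="medium")
parser.add_argument("--beam_size", type=int, default=5)

args = parser.parse_args(["--whisper_implementation", "faster-whisper"]).__dict__

# Options consumed here are not forwarded to the decoder
whisper_implementation = args.pop("whisper_implementation")
model_name = args.pop("model")

# Everything left over would be passed through as **decodeOptions
print(whisper_implementation, model_name, args)  # faster-whisper medium {'beam_size': 5}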
config.json5 CHANGED
@@ -62,6 +62,9 @@
 
     // * General options *
 
+    // The default implementation to use for Whisper. Can be "whisper" or "faster-whisper".
+    "whisper_implementation": "whisper",
+
     // The default model name.
     "default_model_name": "medium",
     // The default VAD.
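This key is mirrored by the new whisper_implementation parameter on ApplicationConfig (see src/config.py below), so the JSON5 default, the constructor default, and the --whisper_implementation flag all resolve to the same setting. A small sketch:

from src.config import ApplicationConfig

# create_default() loads config.json5, so this prints "whisper" unless the file is edited
config = ApplicationConfig.create_default()
print(config.whisper_implementation)

# The constructor accepts an explicit override as well
override = ApplicationConfig(whisper_implementation="faster-whisper")
print(override.whisper_implementation)  # faster-whisper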
requirements-fastWhisper.txt ADDED
@@ -0,0 +1,8 @@
+ctranslate2
+faster-whisper
+ffmpeg-python==0.2.0
+gradio==3.23.0
+yt-dlp
+json5
+torch
+torchaudio
src/config.py CHANGED
@@ -8,8 +8,6 @@ import torch
 
 from tqdm import tqdm
 
-from src.conversion.hf_converter import convert_hf_whisper
-
 class ModelConfig:
     def __init__(self, name: str, url: str, path: str = None, type: str = "whisper"):
         """
@@ -25,86 +23,11 @@ class ModelConfig:
         self.path = path
         self.type = type
 
-    def download_url(self, root_dir: str):
-        import whisper
-
-        # See if path is already set
-        if self.path is not None:
-            return self.path
-
-        if root_dir is None:
-            root_dir = os.path.join(os.path.expanduser("~"), ".cache", "whisper")
-
-        model_type = self.type.lower() if self.type is not None else "whisper"
-
-        if model_type in ["huggingface", "hf"]:
-            self.path = self.url
-            destination_target = os.path.join(root_dir, self.name + ".pt")
-
-            # Convert from HuggingFace format to Whisper format
-            if os.path.exists(destination_target):
-                print(f"File {destination_target} already exists, skipping conversion")
-            else:
-                print("Saving HuggingFace model in Whisper format to " + destination_target)
-                convert_hf_whisper(self.url, destination_target)
-
-            self.path = destination_target
-
-        elif model_type in ["whisper", "w"]:
-            self.path = self.url
-
-            # See if URL is just a file
-            if self.url in whisper._MODELS:
-                # No need to download anything - Whisper will handle it
-                self.path = self.url
-            elif self.url.startswith("file://"):
-                # Get file path
-                self.path = urlparse(self.url).path
-            # See if it is an URL
-            elif self.url.startswith("http://") or self.url.startswith("https://"):
-                # Extension (or file name)
-                extension = os.path.splitext(self.url)[-1]
-                download_target = os.path.join(root_dir, self.name + extension)
-
-                if os.path.exists(download_target) and not os.path.isfile(download_target):
-                    raise RuntimeError(f"{download_target} exists and is not a regular file")
-
-                if not os.path.isfile(download_target):
-                    self._download_file(self.url, download_target)
-                else:
-                    print(f"File {download_target} already exists, skipping download")
-
-                self.path = download_target
-            # Must be a local file
-            else:
-                self.path = self.url
-
-        else:
-            raise ValueError(f"Unknown model type {model_type}")
-
-        return self.path
-
-    def _download_file(self, url: str, destination: str):
-        with urllib.request.urlopen(url) as source, open(destination, "wb") as output:
-            with tqdm(
-                total=int(source.info().get("Content-Length")),
-                ncols=80,
-                unit="iB",
-                unit_scale=True,
-                unit_divisor=1024,
-            ) as loop:
-                while True:
-                    buffer = source.read(8192)
-                    if not buffer:
-                        break
-
-                    output.write(buffer)
-                    loop.update(len(buffer))
-
 class ApplicationConfig:
     def __init__(self, models: List[ModelConfig] = [], input_audio_max_duration: int = 600,
                  share: bool = False, server_name: str = None, server_port: int = 7860,
                  queue_concurrency_count: int = 1, delete_uploaded_files: bool = True,
+                 whisper_implementation: str = "whisper",
                  default_model_name: str = "medium", default_vad: str = "silero-vad",
                  vad_parallel_devices: str = "", vad_cpu_cores: int = 1, vad_process_timeout: int = 1800,
                  auto_parallel: bool = False, output_dir: str = None,
@@ -132,6 +55,7 @@ class ApplicationConfig:
         self.queue_concurrency_count = queue_concurrency_count
         self.delete_uploaded_files = delete_uploaded_files
 
+        self.whisper_implementation = whisper_implementation
         self.default_model_name = default_model_name
         self.default_vad = default_vad
         self.vad_parallel_devices = vad_parallel_devices
src/conversion/hf_converter.py CHANGED
@@ -2,7 +2,6 @@
 
 from copy import deepcopy
 import torch
-from transformers import WhisperForConditionalGeneration
 
 WHISPER_MAPPING = {
     "layers": "blocks",
@@ -43,7 +42,9 @@ def rename_keys(s_dict):
     return s_dict
 
 
 def convert_hf_whisper(hf_model_name_or_path: str, whisper_state_path: str):
+    # Import transformers lazily, so it is only required when actually converting a model
+    from transformers import WhisperForConditionalGeneration
     transformer_model = WhisperForConditionalGeneration.from_pretrained(hf_model_name_or_path)
     config = transformer_model.config
 
src/hooks/progressListener.py ADDED
@@ -0,0 +1,8 @@
+from typing import Union
+
+class ProgressListener:
+    def on_progress(self, current: Union[int, float], total: Union[int, float]):
+        self.total = total
+
+    def on_finished(self):
+        pass
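The base class is deliberately minimal: on_progress receives the current and total amounts, and on_finished fires once at the end, so subclasses only override what they need. A sketch of a custom listener (ConsoleProgressListener is illustrative, not part of this commit):

from typing import Union

from src.hooks.progressListener import ProgressListener

class ConsoleProgressListener(ProgressListener):
    def on_progress(self, current: Union[int, float], total: Union[int, float]):
        print(f"Progress: {current}/{total} ({100.0 * current / total:.1f}%)")

    def on_finished(self):
        print("Done.")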
src/hooks/subTaskProgressListener.py ADDED
@@ -0,0 +1,37 @@
+from src.hooks.progressListener import ProgressListener
+
+from typing import Union
+
+class SubTaskProgressListener(ProgressListener):
+    """
+    A sub task listener that reports the progress of a sub task to a base task listener
+    Parameters
+    ----------
+    base_task_listener : ProgressListener
+        The base progress listener to accumulate overall progress in.
+    base_task_total : float
+        The maximum total progress that will be reported to the base progress listener.
+    sub_task_start : float
+        The starting progress of a sub task, in respect to the base progress listener.
+    sub_task_total : float
+        The total amount of progress a sub task will report to the base progress listener.
+    """
+    def __init__(
+        self,
+        base_task_listener: ProgressListener,
+        base_task_total: float,
+        sub_task_start: float,
+        sub_task_total: float,
+    ):
+        self.base_task_listener = base_task_listener
+        self.base_task_total = base_task_total
+        self.sub_task_start = sub_task_start
+        self.sub_task_total = sub_task_total
+
+    def on_progress(self, current: Union[int, float], total: Union[int, float]):
+        sub_task_progress_frac = current / total
+        sub_task_progress = self.sub_task_start + self.sub_task_total * sub_task_progress_frac
+        self.base_task_listener.on_progress(sub_task_progress, self.base_task_total)
+
+    def on_finished(self):
+        self.base_task_listener.on_progress(self.sub_task_start + self.sub_task_total, self.base_task_total)
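The mapping is linear: a sub task that is current/total complete reports sub_task_start + sub_task_total * (current / total) to the base listener. A worked example (the printing listener is illustrative):

from typing import Union

from src.hooks.progressListener import ProgressListener
from src.hooks.subTaskProgressListener import SubTaskProgressListener

class _PrintListener(ProgressListener):
    def on_progress(self, current: Union[int, float], total: Union[int, float]):
        print(f"{current}/{total}")

base = _PrintListener()

# This sub task occupies the 50-75 range of a 0-100 base task
sub = SubTaskProgressListener(base, base_task_total=100.0,
                              sub_task_start=50.0, sub_task_total=25.0)

sub.on_progress(1, 2)  # prints 62.5/100.0, i.e. 50 + 25 * 0.5
sub.on_finished()      # prints 75.0/100.0, the end of this sub task's range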
src/hooks/whisperProgressHook.py CHANGED
@@ -3,12 +3,7 @@ import threading
 from typing import List, Union
 import tqdm
 
-class ProgressListener:
-    def on_progress(self, current: Union[int, float], total: Union[int, float]):
-        self.total = total
-
-    def on_finished(self):
-        pass
+from src.hooks.progressListener import ProgressListener
 
 class ProgressListenerHandle:
     def __init__(self, listener: ProgressListener):
@@ -23,41 +18,6 @@ class ProgressListenerHandle:
         if exc_type is None:
             self.listener.on_finished()
 
-class SubTaskProgressListener(ProgressListener):
-    """
-    A sub task listener that reports the progress of a sub task to a base task listener
-
-    Parameters
-    ----------
-    base_task_listener : ProgressListener
-        The base progress listener to accumulate overall progress in.
-    base_task_total : float
-        The maximum total progress that will be reported to the base progress listener.
-    sub_task_start : float
-        The starting progress of a sub task, in respect to the base progress listener.
-    sub_task_total : float
-        The total amount of progress a sub task will report to the base progress listener.
-    """
-    def __init__(
-        self,
-        base_task_listener: ProgressListener,
-        base_task_total: float,
-        sub_task_start: float,
-        sub_task_total: float,
-    ):
-        self.base_task_listener = base_task_listener
-        self.base_task_total = base_task_total
-        self.sub_task_start = sub_task_start
-        self.sub_task_total = sub_task_total
-
-    def on_progress(self, current: Union[int, float], total: Union[int, float]):
-        sub_task_progress_frac = current / total
-        sub_task_progress = self.sub_task_start + self.sub_task_total * sub_task_progress_frac
-        self.base_task_listener.on_progress(sub_task_progress, self.base_task_total)
-
-    def on_finished(self):
-        self.base_task_listener.on_progress(self.sub_task_start + self.sub_task_total, self.base_task_total)
-
 class _CustomProgressBar(tqdm.tqdm):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
src/utils.py CHANGED
@@ -4,6 +4,9 @@ import re
 
 import zlib
 from typing import Iterator, TextIO
+from tqdm import tqdm
+
+import urllib.request
 
 
 def exact_div(x, y):
@@ -112,4 +115,21 @@ def slugify(value, allow_unicode=False):
     else:
         value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode('ascii')
     value = re.sub(r'[^\w\s-]', '', value.lower())
-    return re.sub(r'[-\s]+', '-', value).strip('-_')
+    return re.sub(r'[-\s]+', '-', value).strip('-_')
+
+def download_file(url: str, destination: str):
+    with urllib.request.urlopen(url) as source, open(destination, "wb") as output:
+        with tqdm(
+            total=int(source.info().get("Content-Length")),
+            ncols=80,
+            unit="iB",
+            unit_scale=True,
+            unit_divisor=1024,
+        ) as loop:
+            while True:
+                buffer = source.read(8192)
+                if not buffer:
+                    break
+
+                output.write(buffer)
+                loop.update(len(buffer))
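download_file is the generic home for what used to be ModelConfig._download_file: it streams the response in 8 KiB chunks and sizes the tqdm bar from the Content-Length header. A usage sketch (the URL and paths are placeholders):

from src.utils import download_file

# Streams the remote file to disk while showing a progress bar
download_file("https://example.com/models/my-model.pt", "/tmp/my-model.pt")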
src/vad.py CHANGED
@@ -5,11 +5,13 @@ import time
 from typing import Any, Deque, Iterator, List, Dict
 
 from pprint import pprint
-from src.hooks.whisperProgressHook import ProgressListener, SubTaskProgressListener, create_progress_listener_handle
+from src.hooks.progressListener import ProgressListener
+from src.hooks.subTaskProgressListener import SubTaskProgressListener
+from src.hooks.whisperProgressHook import create_progress_listener_handle
 from src.modelCache import GLOBAL_MODEL_CACHE, ModelCache
 
 from src.segments import merge_timestamps
-from src.whisperContainer import WhisperCallback
+from src.whisper.abstractWhisperContainer import AbstractWhisperCallback
 
 # Workaround for https://github.com/tensorflow/tensorflow/issues/48797
 try:
@@ -136,7 +138,7 @@ class AbstractTranscription(ABC):
         pprint(merged)
         return merged
 
-    def transcribe(self, audio: str, whisperCallable: WhisperCallback, config: TranscriptionConfig,
+    def transcribe(self, audio: str, whisperCallable: AbstractWhisperCallback, config: TranscriptionConfig,
                    progressListener: ProgressListener = None):
         """
         Transcribe the given audio file.
src/vadParallel.py CHANGED
@@ -2,15 +2,16 @@ import multiprocessing
 from queue import Empty
 import threading
 import time
-from src.hooks.whisperProgressHook import ProgressListener
+from src.hooks.progressListener import ProgressListener
 from src.vad import AbstractTranscription, TranscriptionConfig, get_audio_duration
-from src.whisperContainer import WhisperCallback
 
 from multiprocessing import Pool, Queue
 
 from typing import Any, Dict, List, Union
 import os
 
+from src.whisper.abstractWhisperContainer import AbstractWhisperCallback
+
 class _ProgressListenerToQueue(ProgressListener):
     def __init__(self, progress_queue: Queue):
         self.progress_queue = progress_queue
@@ -104,7 +105,7 @@ class ParallelTranscription(AbstractTranscription):
     def __init__(self, sampling_rate: int = 16000):
         super().__init__(sampling_rate=sampling_rate)
 
-    def transcribe_parallel(self, transcription: AbstractTranscription, audio: str, whisperCallable: WhisperCallback, config: TranscriptionConfig,
+    def transcribe_parallel(self, transcription: AbstractTranscription, audio: str, whisperCallable: AbstractWhisperCallback, config: TranscriptionConfig,
                             cpu_device_count: int, gpu_devices: List[str], cpu_parallel_context: ParallelContext = None, gpu_parallel_context: ParallelContext = None,
                             progress_listener: ProgressListener = None):
         total_duration = get_audio_duration(audio)
@@ -276,7 +277,7 @@ class ParallelTranscription(AbstractTranscription):
             return config.override_timestamps
         return super().get_merged_timestamps(timestamps, config, total_duration)
 
-    def transcribe(self, audio: str, whisperCallable: WhisperCallback, config: ParallelTranscriptionConfig,
+    def transcribe(self, audio: str, whisperCallable: AbstractWhisperCallback, config: ParallelTranscriptionConfig,
                    progressListener: ProgressListener = None):
         # Override device ID the first time
         if (os.environ.get("INITIALIZED", None) is None):
src/whisper/abstractWhisperContainer.py ADDED
@@ -0,0 +1,101 @@
+import abc
+from typing import List
+from src.config import ModelConfig
+
+from src.hooks.progressListener import ProgressListener
+from src.modelCache import GLOBAL_MODEL_CACHE, ModelCache
+
+class AbstractWhisperCallback:
+    @abc.abstractmethod
+    def invoke(self, audio, segment_index: int, prompt: str, detected_language: str, progress_listener: ProgressListener = None):
+        """
+        Perform the transcription of the given audio file or data.
+
+        Parameters
+        ----------
+        audio: Union[str, np.ndarray, torch.Tensor]
+            The audio file to transcribe, or the audio data as a numpy array or torch tensor.
+        segment_index: int
+            The index of the segment being transcribed.
+        prompt: str
+            The prompt to use for the transcription.
+        detected_language: str
+            The detected language of the audio file.
+        progress_listener: ProgressListener
+            A callback to receive progress updates.
+        """
+        raise NotImplementedError()
+
+    def _concat_prompt(self, prompt1, prompt2):
+        if (prompt1 is None):
+            return prompt2
+        elif (prompt2 is None):
+            return prompt1
+        else:
+            return prompt1 + " " + prompt2
+
+class AbstractWhisperContainer:
+    def __init__(self, model_name: str, device: str = None, download_root: str = None,
+                 cache: ModelCache = None, models: List[ModelConfig] = []):
+        self.model_name = model_name
+        self.device = device
+        self.download_root = download_root
+        self.cache = cache
+
+        # Will be created on demand
+        self.model = None
+
+        # List of known models
+        self.models = models
+
+    def get_model(self):
+        if self.model is None:
+
+            if (self.cache is None):
+                self.model = self._create_model()
+            else:
+                model_key = "WhisperContainer." + self.model_name + ":" + (self.device if self.device else '')
+                self.model = self.cache.get(model_key, self._create_model)
+        return self.model
+
+    @abc.abstractmethod
+    def _create_model(self):
+        raise NotImplementedError()
+
+    def ensure_downloaded(self):
+        pass
+
+    @abc.abstractmethod
+    def create_callback(self, language: str = None, task: str = None, initial_prompt: str = None, **decodeOptions: dict) -> AbstractWhisperCallback:
+        """
+        Create a WhisperCallback object that can be used to transcribe audio files.
+
+        Parameters
+        ----------
+        language: str
+            The target language of the transcription. If not specified, the language will be inferred from the audio content.
+        task: str
+            The task - either translate or transcribe.
+        initial_prompt: str
+            The initial prompt to use for the transcription.
+        decodeOptions: dict
+            Additional options to pass to the decoder. Must be pickleable.
+
+        Returns
+        -------
+        A WhisperCallback object.
+        """
+        raise NotImplementedError()
+
+    # This is required for multiprocessing
+    def __getstate__(self):
+        return { "model_name": self.model_name, "device": self.device, "download_root": self.download_root, "models": self.models }
+
+    def __setstate__(self, state):
+        self.model_name = state["model_name"]
+        self.device = state["device"]
+        self.download_root = state["download_root"]
+        self.models = state["models"]
+        self.model = None
+        # Depickled objects must use the global cache
+        self.cache = GLOBAL_MODEL_CACHE
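A backend only has to provide _create_model and create_callback; lazy loading, the model cache key, and pickling for the multiprocessing VAD path all come from the base classes. A minimal sketch of a hypothetical backend (DummyContainer and DummyCallback are illustrative, not part of this commit):

from src.whisper.abstractWhisperContainer import (AbstractWhisperCallback,
                                                  AbstractWhisperContainer)

class DummyCallback(AbstractWhisperCallback):
    def invoke(self, audio, segment_index, prompt, detected_language, progress_listener=None):
        # A real backend would run inference here, reporting progress as it goes
        if progress_listener is not None:
            progress_listener.on_finished()
        return {"segments": [], "text": "", "language": detected_language}

class DummyContainer(AbstractWhisperContainer):
    def _create_model(self):
        return object()  # a real backend returns its loaded model here

    def create_callback(self, language=None, task=None, initial_prompt=None, **decodeOptions):
        return DummyCallback()

container = DummyContainer("dummy")
print(container.get_model())  # created lazily on first use, then cached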
src/whisper/fasterWhisperContainer.py ADDED
@@ -0,0 +1,167 @@
+import os
+from typing import List
+
+from faster_whisper import WhisperModel, download_model
+from src.config import ModelConfig
+from src.hooks.progressListener import ProgressListener
+from src.modelCache import ModelCache
+from src.whisper.abstractWhisperContainer import AbstractWhisperCallback, AbstractWhisperContainer
+
+class FasterWhisperContainer(AbstractWhisperContainer):
+    def __init__(self, model_name: str, device: str = None, download_root: str = None,
+                 cache: ModelCache = None,
+                 models: List[ModelConfig] = []):
+        super().__init__(model_name, device, download_root, cache, models)
+
+    def ensure_downloaded(self):
+        """
+        Ensure that the model is downloaded. This is useful if you want to ensure that the model is downloaded before
+        passing the container to a subprocess.
+        """
+        model_config = self._get_model_config()
+
+        if os.path.isdir(model_config.url):
+            model_config.path = model_config.url
+        else:
+            model_config.path = download_model(model_config.url, output_dir=self.download_root)
+
+    def _get_model_config(self) -> ModelConfig:
+        """
+        Get the model configuration for the model.
+        """
+        for model in self.models:
+            if model.name == self.model_name:
+                return model
+        return None
+
+    def _create_model(self):
+        print("Loading faster whisper model " + self.model_name)
+        model_config = self._get_model_config()
+
+        if model_config.type == "whisper" and model_config.url not in ["tiny", "base", "small", "medium", "large", "large-v2"]:
+            raise Exception("FasterWhisperContainer does not yet support Whisper models. Use ct2-transformers-converter to convert the model to a faster-whisper model.")
+
+        device = self.device
+
+        if (device is None):
+            device = "auto"
+
+        model = WhisperModel(model_config.url, device=device, compute_type="float16")
+        return model
+
+    def create_callback(self, language: str = None, task: str = None, initial_prompt: str = None, **decodeOptions: dict):
+        """
+        Create a WhisperCallback object that can be used to transcribe audio files.
+
+        Parameters
+        ----------
+        language: str
+            The target language of the transcription. If not specified, the language will be inferred from the audio content.
+        task: str
+            The task - either translate or transcribe.
+        initial_prompt: str
+            The initial prompt to use for the transcription.
+        decodeOptions: dict
+            Additional options to pass to the decoder. Must be pickleable.
+
+        Returns
+        -------
+        A WhisperCallback object.
+        """
+        return FasterWhisperCallback(self, language=language, task=task, initial_prompt=initial_prompt, **decodeOptions)
+
+class FasterWhisperCallback(AbstractWhisperCallback):
+    def __init__(self, model_container: FasterWhisperContainer, language: str = None, task: str = None, initial_prompt: str = None, **decodeOptions: dict):
+        self.model_container = model_container
+        self.language = language
+        self.task = task
+        self.initial_prompt = initial_prompt
+        self.decodeOptions = decodeOptions
+
+    def invoke(self, audio, segment_index: int, prompt: str, detected_language: str, progress_listener: ProgressListener = None):
+        """
+        Perform the transcription of the given audio file or data.
+
+        Parameters
+        ----------
+        audio: Union[str, np.ndarray, torch.Tensor]
+            The audio file to transcribe, or the audio data as a numpy array or torch tensor.
+        segment_index: int
+            The index of the segment being transcribed.
+        prompt: str
+            The prompt to use for the transcription.
+        detected_language: str
+            The detected language of the audio file.
+        progress_listener: ProgressListener
+            A callback to receive progress updates.
+        """
+        model: WhisperModel = self.model_container.get_model()
+        language_code = self._lookup_language_code(self.language) if self.language else None
+
+        segments_generator, info = model.transcribe(audio, \
+            language=language_code if language_code else detected_language, task=self.task, \
+            initial_prompt=self._concat_prompt(self.initial_prompt, prompt) if segment_index == 0 else prompt, \
+            **self.decodeOptions
+        )
+
+        segments = []
+
+        for segment in segments_generator:
+            segments.append(segment)
+
+            if progress_listener is not None:
+                progress_listener.on_progress(segment.end, info.duration)
+
+        text = " ".join([segment.text for segment in segments])
+
+        # Convert the segments to a format that is easier to serialize
+        whisper_segments = [{
+            "text": segment.text,
+            "start": segment.start,
+            "end": segment.end,
+
+            # Extra fields added by faster-whisper
+            "words": [{
+                "start": word.start,
+                "end": word.end,
+                "word": word.word,
+                "probability": word.probability
+            } for word in (segment.words if segment.words is not None else []) ]
+        } for segment in segments]
+
+        result = {
+            "segments": whisper_segments,
+            "text": text,
+            "language": info.language if info else None,
+
+            # Extra fields added by faster-whisper
+            "language_probability": info.language_probability if info else None,
+            "duration": info.duration if info else None
+        }
+
+        if progress_listener is not None:
+            progress_listener.on_finished()
+        return result
+
+    def _lookup_language_code(self, language: str):
+        lookup = {
+            "english": "en", "chinese": "zh-cn", "german": "de", "spanish": "es", "russian": "ru", "korean": "ko",
+            "french": "fr", "japanese": "ja", "portuguese": "pt", "turkish": "tr", "polish": "pl", "catalan": "ca",
+            "dutch": "nl", "arabic": "ar", "swedish": "sv", "italian": "it", "indonesian": "id", "hindi": "hi",
+            "finnish": "fi", "vietnamese": "vi", "hebrew": "he", "ukrainian": "uk", "greek": "el", "malay": "ms",
+            "czech": "cs", "romanian": "ro", "danish": "da", "hungarian": "hu", "tamil": "ta", "norwegian": "no",
+            "thai": "th", "urdu": "ur", "croatian": "hr", "bulgarian": "bg", "lithuanian": "lt", "latin": "la",
+            "maori": "mi", "malayalam": "ml", "welsh": "cy", "slovak": "sk", "telugu": "te", "persian": "fa",
+            "latvian": "lv", "bengali": "bn", "serbian": "sr", "azerbaijani": "az", "slovenian": "sl",
+            "kannada": "kn", "estonian": "et", "macedonian": "mk", "breton": "br", "basque": "eu", "icelandic": "is",
+            "armenian": "hy", "nepali": "ne", "mongolian": "mn", "bosnian": "bs", "kazakh": "kk", "albanian": "sq",
+            "swahili": "sw", "galician": "gl", "marathi": "mr", "punjabi": "pa", "sinhala": "si", "khmer": "km",
+            "shona": "sn", "yoruba": "yo", "somali": "so", "afrikaans": "af", "occitan": "oc", "georgian": "ka",
+            "belarusian": "be", "tajik": "tg", "sindhi": "sd", "gujarati": "gu", "amharic": "am", "yiddish": "yi",
+            "lao": "lo", "uzbek": "uz", "faroese": "fo", "haitian creole": "ht", "pashto": "ps", "turkmen": "tk",
+            "nynorsk": "nn", "maltese": "mt", "sanskrit": "sa", "luxembourgish": "lb", "myanmar": "my", "tibetan": "bo",
+            "tagalog": "tl", "malagasy": "mg", "assamese": "as", "tatar": "tt", "hawaiian": "haw", "lingala": "ln",
+            "hausa": "ha", "bashkir": "ba", "javanese": "jv", "sundanese": "su"
+        }
+
+        return lookup.get(language.lower() if language is not None else None, language)
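End to end, the container lazily loads a CTranslate2 model on first use and the callback streams segments, reporting progress from each segment's end time against info.duration. A sketch, assuming a CUDA device (the container hard-codes compute_type="float16"), an audio file at the given path, and a faster-whisper compatible model named "medium" in the config (beam_size is an illustrative decode option):

from src.config import ApplicationConfig
from src.whisper.fasterWhisperContainer import FasterWhisperContainer

app_config = ApplicationConfig.create_default()
container = FasterWhisperContainer("medium", models=app_config.models)

callback = container.create_callback(language="english", task="transcribe", beam_size=5)
result = callback.invoke("audio.wav", segment_index=0, prompt=None, detected_language=None)

print(result["language"], result["duration"])
for segment in result["segments"]:
    print(segment["start"], segment["end"], segment["text"])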
src/{whisperContainer.py → whisper/whisperContainer.py} RENAMED
@@ -1,40 +1,27 @@
 # External programs
+import abc
 import os
 import sys
 from typing import List
+from urllib.parse import urlparse
+
+from src.hooks.progressListener import ProgressListener
 
 import whisper
 from whisper import Whisper
 
 from src.config import ModelConfig
-from src.hooks.whisperProgressHook import ProgressListener, create_progress_listener_handle
+from src.hooks.whisperProgressHook import create_progress_listener_handle
 
 from src.modelCache import GLOBAL_MODEL_CACHE, ModelCache
+from src.utils import download_file
+from src.whisper.abstractWhisperContainer import AbstractWhisperCallback, AbstractWhisperContainer
 
-class WhisperContainer:
-    def __init__(self, model_name: str, device: str = None, download_root: str = None,
-                 cache: ModelCache = None, models: List[ModelConfig] = []):
-        self.model_name = model_name
-        self.device = device
-        self.download_root = download_root
-        self.cache = cache
-
-        # Will be created on demand
-        self.model = None
-
-        # List of known models
-        self.models = models
-
-    def get_model(self):
-        if self.model is None:
-
-            if (self.cache is None):
-                self.model = self._create_model()
-            else:
-                model_key = "WhisperContainer." + self.model_name + ":" + (self.device if self.device else '')
-                self.model = self.cache.get(model_key, self._create_model)
-        return self.model
+class WhisperContainer(AbstractWhisperContainer):
+    def __init__(self, model_name: str, device: str = None, download_root: str = None,
+                 cache: ModelCache = None, models: List[ModelConfig] = []):
+        super().__init__(model_name, device, download_root, cache, models)
 
     def ensure_downloaded(self):
         """
         Ensure that the model is downloaded. This is useful if you want to ensure that the model is downloaded before
@@ -43,7 +30,7 @@ class WhisperContainer:
         # Warning: Using private API here
         try:
             root_dir = self.download_root
-            model_config = self.get_model_config()
+            model_config = self._get_model_config()
 
             if root_dir is None:
                 root_dir = os.path.join(os.path.expanduser("~"), ".cache", "whisper")
@@ -60,7 +47,7 @@ class WhisperContainer:
             print("Error pre-downloading model: " + str(e))
             return False
 
-    def get_model_config(self) -> ModelConfig:
+    def _get_model_config(self) -> ModelConfig:
         """
         Get the model configuration for the model.
         """
@@ -71,10 +58,10 @@ class WhisperContainer:
 
     def _create_model(self):
         print("Loading whisper model " + self.model_name)
-
-        model_config = self.get_model_config()
+        model_config = self._get_model_config()
+
         # Note that the model will not be downloaded in the case of an official Whisper model
-        model_path = model_config.download_url(self.download_root)
+        model_path = self._get_model_path(model_config, self.download_root)
 
         return whisper.load_model(model_path, device=self.device, download_root=self.download_root)
 
@@ -99,21 +86,73 @@ class WhisperContainer:
         """
         return WhisperCallback(self, language=language, task=task, initial_prompt=initial_prompt, **decodeOptions)
 
-    # This is required for multiprocessing
-    def __getstate__(self):
-        return { "model_name": self.model_name, "device": self.device, "download_root": self.download_root, "models": self.models }
-
-    def __setstate__(self, state):
-        self.model_name = state["model_name"]
-        self.device = state["device"]
-        self.download_root = state["download_root"]
-        self.models = state["models"]
-        self.model = None
-        # Depickled objects must use the global cache
-        self.cache = GLOBAL_MODEL_CACHE
-
-class WhisperCallback:
+    def _get_model_path(self, model_config: ModelConfig, root_dir: str = None):
+        """
+        Get the path to the model, downloading or converting it if necessary.
+
+        Parameters
+        ----------
+        model_config: ModelConfig
+            The model configuration.
+        """
+        from src.conversion.hf_converter import convert_hf_whisper
+
+        # See if path is already set
+        if model_config.path is not None:
+            return model_config.path
+
+        if root_dir is None:
+            root_dir = os.path.join(os.path.expanduser("~"), ".cache", "whisper")
+
+        model_type = model_config.type.lower() if model_config.type is not None else "whisper"
+
+        if model_type in ["huggingface", "hf"]:
+            model_config.path = model_config.url
+            destination_target = os.path.join(root_dir, model_config.name + ".pt")
+
+            # Convert from HuggingFace format to Whisper format
+            if os.path.exists(destination_target):
+                print(f"File {destination_target} already exists, skipping conversion")
+            else:
+                print("Saving HuggingFace model in Whisper format to " + destination_target)
+                convert_hf_whisper(model_config.url, destination_target)
+
+            model_config.path = destination_target
+
+        elif model_type in ["whisper", "w"]:
+            model_config.path = model_config.url
+
+            # See if URL is just a file
+            if model_config.url in whisper._MODELS:
+                # No need to download anything - Whisper will handle it
+                model_config.path = model_config.url
+            elif model_config.url.startswith("file://"):
+                # Get file path
+                model_config.path = urlparse(model_config.url).path
+            # See if it is an URL
+            elif model_config.url.startswith("http://") or model_config.url.startswith("https://"):
+                # Extension (or file name)
+                extension = os.path.splitext(model_config.url)[-1]
+                download_target = os.path.join(root_dir, model_config.name + extension)
+
+                if os.path.exists(download_target) and not os.path.isfile(download_target):
+                    raise RuntimeError(f"{download_target} exists and is not a regular file")
+
+                if not os.path.isfile(download_target):
+                    download_file(model_config.url, download_target)
+                else:
+                    print(f"File {download_target} already exists, skipping download")
+
+                model_config.path = download_target
+            # Must be a local file
+            else:
+                model_config.path = model_config.url
+
+        else:
+            raise ValueError(f"Unknown model type {model_type}")
+
+        return model_config.path
+
+class WhisperCallback(AbstractWhisperCallback):
     def __init__(self, model_container: WhisperContainer, language: str = None, task: str = None, initial_prompt: str = None, **decodeOptions: dict):
         self.model_container = model_container
         self.language = language
@@ -133,14 +172,8 @@ class WhisperCallback:
             The target language of the transcription. If not specified, the language will be inferred from the audio content.
         task: str
             The task - either translate or transcribe.
-        prompt: str
-            The prompt to use for the transcription.
-        detected_language: str
-            The detected language of the audio file.
-
-        Returns
-        -------
-        The result of the Whisper call.
+        progress_listener: ProgressListener
+            A callback to receive progress updates.
         """
         model = self.model_container.get_model()
 
@@ -155,12 +188,4 @@ class WhisperCallback:
                                language=self.language if self.language else detected_language, task=self.task, \
                                initial_prompt=self._concat_prompt(self.initial_prompt, prompt) if segment_index == 0 else prompt, \
                                **self.decodeOptions
-        )
-
-    def _concat_prompt(self, prompt1, prompt2):
-        if (prompt1 is None):
-            return prompt2
-        elif (prompt2 is None):
-            return prompt1
-        else:
-            return prompt1 + " " + prompt2
+        )
src/whisper/whisperFactory.py ADDED
@@ -0,0 +1,16 @@
+from typing import List
+from src.config import ModelConfig
+from src.modelCache import ModelCache
+from src.whisper.abstractWhisperContainer import AbstractWhisperContainer
+
+def create_whisper_container(whisper_implementation: str,
+                             model_name: str, device: str = None, download_root: str = None,
+                             cache: ModelCache = None, models: List[ModelConfig] = []) -> AbstractWhisperContainer:
+    if (whisper_implementation == "whisper"):
+        from src.whisper.whisperContainer import WhisperContainer
+        return WhisperContainer(model_name, device, download_root, cache, models)
+    elif (whisper_implementation == "faster-whisper" or whisper_implementation == "faster_whisper"):
+        from src.whisper.fasterWhisperContainer import FasterWhisperContainer
+        return FasterWhisperContainer(model_name, device, download_root, cache, models)
+    else:
+        raise ValueError("Unknown Whisper implementation: " + whisper_implementation)
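The factory accepts both spellings of the faster-whisper name and fails fast on anything else; containers are cheap to construct since models are only loaded on first use. A quick sketch of the contract:

from src.whisper.whisperFactory import create_whisper_container

# The hyphenated and underscored spellings select the same backend
m1 = create_whisper_container("faster-whisper", model_name="medium")
m2 = create_whisper_container("faster_whisper", model_name="medium")

try:
    create_whisper_container("whisper.cpp", model_name="medium")  # not supported here
except ValueError as e:
    print(e)  # Unknown Whisper implementation: whisper.cpp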