aadnk committed
Commit 95261ed
1 Parent(s): 8f3aedf

Add support for parallel execution on multiple GPUs

Files changed (5)
  1. app.py +37 -14
  2. cli.py +2 -0
  3. src/vad.py +42 -24
  4. src/vadParallel.py +81 -0
  5. src/whisperContainer.py +91 -0
app.py CHANGED
@@ -1,9 +1,13 @@
 from typing import Iterator
+import argparse
 
 from io import StringIO
 import os
 import pathlib
 import tempfile
+from src.vadParallel import ParallelTranscription
+
+from src.whisperContainer import WhisperContainer
 
 # External programs
 import whisper
@@ -14,7 +18,7 @@ import gradio as gr
 
 from src.download import ExceededMaximumDuration, download_url
 from src.utils import slugify, write_srt, write_vtt
-from src.vad import NonSpeechStrategy, PeriodicTranscriptionConfig, TranscriptionConfig, VadPeriodicTranscription, VadSileroTranscription
+from src.vad import AbstractTranscription, NonSpeechStrategy, PeriodicTranscriptionConfig, TranscriptionConfig, VadPeriodicTranscription, VadSileroTranscription
 
 # Limitations (set to -1 to disable)
 DEFAULT_INPUT_AUDIO_MAX_DURATION = 600 # seconds
@@ -48,6 +52,7 @@ LANGUAGES = [
 class WhisperTranscriber:
     def __init__(self, inputAudioMaxDuration: float = DEFAULT_INPUT_AUDIO_MAX_DURATION, deleteUploadedFiles: bool = DELETE_UPLOADED_FILES):
         self.model_cache = dict()
+        self.parallel_device_list = None
 
         self.vad_model = None
         self.inputAudioMaxDuration = inputAudioMaxDuration
@@ -64,7 +69,7 @@ class WhisperTranscriber:
         model = self.model_cache.get(selectedModel, None)
 
         if not model:
-            model = whisper.load_model(selectedModel)
+            model = WhisperContainer(selectedModel)
             self.model_cache[selectedModel] = model
 
         # Execute whisper
@@ -87,7 +92,7 @@ class WhisperTranscriber:
         except ExceededMaximumDuration as e:
             return [], ("[ERROR]: Maximum remote video length is " + str(e.maxDuration) + "s, file was " + str(e.videoDuration) + "s"), "[ERROR]"
 
-    def transcribe_file(self, model: whisper.Whisper, audio_path: str, language: str, task: str = None, vad: str = None,
+    def transcribe_file(self, model: WhisperContainer, audio_path: str, language: str, task: str = None, vad: str = None,
                         vadMergeWindow: float = 5, vadMaxMergeSize: float = 150, vadPadding: float = 1, vadPromptWindow: float = 1, **decodeOptions: dict):
 
         initial_prompt = decodeOptions.pop('initial_prompt', None)
@@ -96,35 +101,42 @@ class WhisperTranscriber:
             task = decodeOptions.pop('task')
 
         # Callable for processing an audio file
-        whisperCallable = lambda audio, segment_index, prompt, detected_language : model.transcribe(audio, \
-            language=language if language else detected_language, task=task, \
-            initial_prompt=self._concat_prompt(initial_prompt, prompt) if segment_index == 0 else prompt, \
-            **decodeOptions)
+        whisperCallable = model.create_callback(language, task, initial_prompt, **decodeOptions)
 
         # The results
         if (vad == 'silero-vad'):
             # Silero VAD where non-speech gaps are transcribed
             process_gaps = self._create_silero_config(NonSpeechStrategy.CREATE_SEGMENT, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow)
-            result = self.vad_model.transcribe(audio_path, whisperCallable, process_gaps)
+            result = self.process_vad(audio_path, whisperCallable, self.vad_model, process_gaps)
         elif (vad == 'silero-vad-skip-gaps'):
             # Silero VAD where non-speech gaps are simply ignored
             skip_gaps = self._create_silero_config(NonSpeechStrategy.SKIP, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow)
-            result = self.vad_model.transcribe(audio_path, whisperCallable, skip_gaps)
+            result = self.process_vad(audio_path, whisperCallable, self.vad_model, skip_gaps)
         elif (vad == 'silero-vad-expand-into-gaps'):
             # Use Silero VAD where speech-segments are expanded into non-speech gaps
             expand_gaps = self._create_silero_config(NonSpeechStrategy.EXPAND_SEGMENT, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow)
-            result = self.vad_model.transcribe(audio_path, whisperCallable, expand_gaps)
+            result = self.process_vad(audio_path, whisperCallable, self.vad_model, expand_gaps)
         elif (vad == 'periodic-vad'):
             # Very simple VAD - mark every 5 minutes as speech. This makes it less likely that Whisper enters an infinite loop, but
             # it may create a break in the middle of a sentence, causing some artifacts.
             periodic_vad = VadPeriodicTranscription()
-            result = periodic_vad.transcribe(audio_path, whisperCallable, PeriodicTranscriptionConfig(periodic_duration=vadMaxMergeSize, max_prompt_window=vadPromptWindow))
+            period_config = PeriodicTranscriptionConfig(periodic_duration=vadMaxMergeSize, max_prompt_window=vadPromptWindow)
+            result = self.process_vad(audio_path, whisperCallable, periodic_vad, period_config)
+
         else:
             # Default VAD
             result = whisperCallable(audio_path, 0, None, None)
 
         return result
 
+    def process_vad(self, audio_path, whisperCallable, vadModel: AbstractTranscription, vadConfig: TranscriptionConfig):
+        if (self.parallel_device_list is None or len(self.parallel_device_list) == 0):
+            # No parallel devices, so just run the VAD and Whisper in sequence
+            return vadModel.transcribe(audio_path, whisperCallable, vadConfig)
+
+        parallel_vad = ParallelTranscription()
+        return parallel_vad.transcribe_parallel(transcription=vadModel, audio=audio_path, whisperCallable=whisperCallable, config=vadConfig, devices=self.parallel_device_list)
+
     def _concat_prompt(self, prompt1, prompt2):
         if (prompt1 is None):
             return prompt2
@@ -218,9 +230,12 @@ class WhisperTranscriber:
         return file.name
 
 
-def create_ui(inputAudioMaxDuration, share=False, server_name: str = None):
+def create_ui(inputAudioMaxDuration, share=False, server_name: str = None, server_port: int = 7860, vad_parallel_devices: str = None):
     ui = WhisperTranscriber(inputAudioMaxDuration)
 
+    # Specify a list of devices to use for parallel processing
+    ui.parallel_device_list = [ device.strip() for device in vad_parallel_devices.split(",") ] if vad_parallel_devices else None
+
     ui_description = "Whisper is a general-purpose speech recognition model. It is trained on a large dataset of diverse "
     ui_description += " audio and is also a multi-task model that can perform multilingual speech recognition "
     ui_description += " as well as speech translation and language identification. "
@@ -250,7 +265,15 @@ def create_ui(inputAudioMaxDuration, share=False, server_name: str = None):
         gr.Text(label="Segments")
     ])
 
-    demo.launch(share=share, server_name=server_name)
+    demo.launch(share=share, server_name=server_name, server_port=server_port)
 
 if __name__ == '__main__':
-    create_ui(DEFAULT_INPUT_AUDIO_MAX_DURATION)
+    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument("--inputAudioMaxDuration", type=int, default=600, help="Maximum audio file length in seconds, or -1 for no limit.")
+    parser.add_argument("--share", type=bool, default=False, help="True to share the app on HuggingFace.")
+    parser.add_argument("--server_name", type=str, default=None, help="The host or IP to bind to. If None, bind to localhost.")
+    parser.add_argument("--server_port", type=int, default=7860, help="The port to bind to.")
+    parser.add_argument("--vad_parallel_devices", type=str, default="0,1", help="A comma delimited list of CUDA devices to use for parallel processing. If None, disable parallel processing.")
+
+    args = parser.parse_args().__dict__
+    create_ui(**args)
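With the new `__main__` block, the UI can also be started programmatically. A minimal sketch of an equivalent call (the argument values are illustrative, not defaults taken from the commit):

    # Illustrative only: start the web UI with two CUDA devices used for parallel VAD/transcription.
    # Mirrors the argparse options added above.
    from app import create_ui

    create_ui(inputAudioMaxDuration=600, share=False, server_name=None,
              server_port=7860, vad_parallel_devices="0,1")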
cli.py CHANGED
@@ -31,6 +31,7 @@ def cli():
     parser.add_argument("--vad_max_merge_size", type=optional_float, default=30, help="The maximum size (in seconds) of a voice segment")
     parser.add_argument("--vad_padding", type=optional_float, default=1, help="The padding (in seconds) to add to each voice segment")
     parser.add_argument("--vad_prompt_window", type=optional_float, default=3, help="The window size of the prompt to pass to Whisper")
+    parser.add_argument("--vad_parallel_devices", type=str, default="0", help="A comma delimited list of CUDA devices to use for parallel processing. If None, disable parallel processing.")
 
     parser.add_argument("--temperature", type=float, default=0, help="temperature to use for sampling")
     parser.add_argument("--best_of", type=optional_int, default=5, help="number of candidates when sampling with non-zero temperature")
@@ -74,6 +75,7 @@ def cli():
 
     model = whisper.load_model(model_name, device=device, download_root=model_dir)
     transcriber = WhisperTranscriber(deleteUploadedFiles=False)
+    transcriber.parallel_device_list = args.pop("vad_parallel_devices")
 
     for audio_path in args.pop("audio"):
         sources = []
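Unlike `create_ui` in app.py, the CLI assigns the raw `--vad_parallel_devices` string to `parallel_device_list`. A sketch of normalizing it into a list the same way app.py does (this normalization is not part of the commit; shown only for illustration):

    # Hypothetical normalization (not in the commit): turn "0,1" into ["0", "1"],
    # matching how create_ui populates parallel_device_list in app.py.
    devices_arg = args.pop("vad_parallel_devices")
    transcriber.parallel_device_list = [d.strip() for d in devices_arg.split(",")] if devices_arg else None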
src/vad.py CHANGED
@@ -6,6 +6,7 @@ from typing import Any, Deque, Iterator, List, Dict
 from pprint import pprint
 
 from src.segments import merge_timestamps
+from src.whisperContainer import WhisperCallback
 
 # Workaround for https://github.com/tensorflow/tensorflow/issues/48797
 try:
@@ -51,19 +52,20 @@ VAD_MAX_PROCESSING_CHUNK = 60 * 60 # 60 minutes of audio
 class TranscriptionConfig(ABC):
     def __init__(self, non_speech_strategy: NonSpeechStrategy = NonSpeechStrategy.SKIP,
                  segment_padding_left: float = None, segment_padding_right = None, max_silent_period: float = None,
-                 max_merge_size: float = None, max_prompt_window: float = None):
+                 max_merge_size: float = None, max_prompt_window: float = None, initial_segment_index = -1):
         self.non_speech_strategy = non_speech_strategy
         self.segment_padding_left = segment_padding_left
         self.segment_padding_right = segment_padding_right
         self.max_silent_period = max_silent_period
         self.max_merge_size = max_merge_size
         self.max_prompt_window = max_prompt_window
+        self.initial_segment_index = initial_segment_index
 
 class PeriodicTranscriptionConfig(TranscriptionConfig):
     def __init__(self, periodic_duration: float, non_speech_strategy: NonSpeechStrategy = NonSpeechStrategy.SKIP,
                  segment_padding_left: float = None, segment_padding_right = None, max_silent_period: float = None,
-                 max_merge_size: float = None, max_prompt_window: float = None):
-        super().__init__(non_speech_strategy, segment_padding_left, segment_padding_right, max_silent_period, max_merge_size, max_prompt_window)
+                 max_merge_size: float = None, max_prompt_window: float = None, initial_segment_index = -1):
+        super().__init__(non_speech_strategy, segment_padding_left, segment_padding_right, max_silent_period, max_merge_size, max_prompt_window, initial_segment_index)
         self.periodic_duration = periodic_duration
 
 class AbstractTranscription(ABC):
@@ -91,37 +93,26 @@ class AbstractTranscription(ABC):
         """
         return
 
-    def transcribe(self, audio: str, whisperCallable, config: TranscriptionConfig):
+    def get_merged_timestamps(self, audio: str, config: TranscriptionConfig):
         """
-        Transcribe the given audo file.
+        Get the start and end timestamps of the sections that should be transcribed by this VAD method,
+        after merging the segments using the specified configuration.
 
         Parameters
         ----------
         audio: str
-            The audio file.
-
-        whisperCallable: Callable[[Union[str, np.ndarray, torch.Tensor], int, str, str], dict[str, Union[dict, Any]]]
-            The callback that is used to invoke Whisper on an audio file/buffer. The first parameter is the audio file/buffer,
-            the second parameter is an optional text prompt, and the last is the current detected language. The return value is the result of the Whisper call.
+            The audio file.
+        config: TranscriptionConfig
+            The transcription configuration.
 
         Returns
         -------
         A list of start and end timestamps, in fractional seconds.
         """
-
-        # get speech timestamps from full audio file
         seconds_timestamps = self.get_transcribe_timestamps(audio, config)
 
-        #for seconds_timestamp in seconds_timestamps:
-        #    print("VAD timestamp ", format_timestamp(seconds_timestamp['start']), " to ", format_timestamp(seconds_timestamp['end']))
-
-        merged = merge_timestamps(seconds_timestamps, config.max_silent_period, config.max_merge_size, config.segment_padding_left, config.segment_padding_right)
-
-        # A deque of transcribed segments that is passed to the next segment as a prompt
-        prompt_window = deque()
-
-        print("Timestamps:")
-        pprint(merged)
+        merged = merge_timestamps(seconds_timestamps, config.max_silent_period, config.max_merge_size,
+                                  config.segment_padding_left, config.segment_padding_right)
 
         if config.non_speech_strategy != NonSpeechStrategy.SKIP:
             max_audio_duration = get_audio_duration(audio)
@@ -138,6 +129,32 @@ class AbstractTranscription(ABC):
 
             print("Transcribing non-speech:")
             pprint(merged)
+        return merged
+
+    def transcribe(self, audio: str, whisperCallable: WhisperCallback, config: TranscriptionConfig):
+        """
+        Transcribe the given audio file.
+
+        Parameters
+        ----------
+        audio: str
+            The audio file.
+        whisperCallable: WhisperCallback
+            A callback object to call to transcribe each segment.
+
+        Returns
+        -------
+        A list of start and end timestamps, in fractional seconds.
+        """
+
+        # Get speech timestamps from full audio file
+        merged = self.get_merged_timestamps(audio, config)
+
+        # A deque of transcribed segments that is passed to the next segment as a prompt
+        prompt_window = deque()
+
+        print("Processing timestamps:")
+        pprint(merged)
 
         result = {
             'text': "",
@@ -147,7 +164,7 @@ class AbstractTranscription(ABC):
         languageCounter = Counter()
         detected_language = None
 
-        segment_index = -1
+        segment_index = config.initial_segment_index
 
         # For each time segment, run whisper
         for segment in merged:
@@ -172,7 +189,7 @@ class AbstractTranscription(ABC):
 
             print("Running whisper from ", format_timestamp(segment_start), " to ", format_timestamp(segment_end), ", duration: ",
                   segment_duration, "expanded: ", segment_expand_amount, "prompt: ", segment_prompt, "language: ", detected_language)
-            segment_result = whisperCallable(segment_audio, segment_index, segment_prompt, detected_language)
+            segment_result = whisperCallable.invoke(segment_audio, segment_index, segment_prompt, detected_language)
 
             adjusted_segments = self.adjust_timestamp(segment_result["segments"], adjust_seconds=segment_start, max_source_time=segment_duration)
 
@@ -373,6 +390,7 @@ class AbstractTranscription(ABC):
         })
         return result
 
+
 class VadSileroTranscription(AbstractTranscription):
     def __init__(self, sampling_rate: int = 16000):
         super().__init__(sampling_rate=sampling_rate)
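After this refactor, `transcribe` only requires an object that exposes `invoke(audio, segment_index, prompt, detected_language)` and returns a Whisper-style result dict. A minimal stand-in (purely illustrative, not part of the commit) that satisfies the contract, e.g. for exercising the VAD logic without loading a Whisper model:

    # Illustrative stub (not in the commit): any object with this invoke() signature
    # can be passed to AbstractTranscription.transcribe in place of a WhisperCallback.
    class FakeCallback:
        def invoke(self, audio, segment_index, prompt, detected_language):
            # Return a Whisper-style result for the given audio segment
            return {"text": "hello", "segments": [{"start": 0.0, "end": 1.0, "text": "hello"}], "language": "en"}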
src/vadParallel.py ADDED
@@ -0,0 +1,81 @@
+from src.vad import AbstractTranscription, TranscriptionConfig
+from src.whisperContainer import WhisperCallback
+
+from multiprocessing import Pool
+
+from typing import List
+import os
+
+class ParallelTranscriptionConfig(TranscriptionConfig):
+    def __init__(self, device_id: str, override_timestamps, initial_segment_index, copy: TranscriptionConfig = None):
+        super().__init__(copy.non_speech_strategy, copy.segment_padding_left, copy.segment_padding_right, copy.max_silent_period, copy.max_merge_size, copy.max_prompt_window, initial_segment_index)
+        self.device_id = device_id
+        self.override_timestamps = override_timestamps
+
+class ParallelTranscription(AbstractTranscription):
+    def __init__(self, sampling_rate: int = 16000):
+        super().__init__(sampling_rate=sampling_rate)
+
+    def transcribe_parallel(self, transcription: AbstractTranscription, audio: str, whisperCallable: WhisperCallback, config: TranscriptionConfig, devices: List[str]):
+        # First, get the timestamps for the original audio
+        merged = transcription.get_merged_timestamps(audio, config)
+
+        # Split into a list for each device
+        merged_split = self._chunks(merged, len(merged) // len(devices))
+
+        # Parameters that will be passed to the transcribe function
+        parameters = []
+        segment_index = config.initial_segment_index
+
+        for i in range(len(devices)):
+            device_segment_list = merged_split[i]
+
+            # Create a new config with the given device ID
+            device_config = ParallelTranscriptionConfig(devices[i], device_segment_list, segment_index, config)
+            segment_index += len(device_segment_list)
+
+            parameters.append([audio, whisperCallable, device_config])
+
+        merged = {
+            'text': '',
+            'segments': [],
+            'language': None
+        }
+
+        with Pool(len(devices)) as p:
+            # Run the transcription in parallel
+            results = p.starmap(self.transcribe, parameters)
+
+            for result in results:
+                # Merge the results
+                if (result['text'] is not None):
+                    merged['text'] += result['text']
+                if (result['segments'] is not None):
+                    merged['segments'].extend(result['segments'])
+                if (result['language'] is not None):
+                    merged['language'] = result['language']
+
+        return merged
+
+    def get_transcribe_timestamps(self, audio: str, config: ParallelTranscriptionConfig):
+        return []
+
+    def get_merged_timestamps(self, audio: str, config: ParallelTranscriptionConfig):
+        # Override timestamps that will be processed
+        if (config.override_timestamps is not None):
+            print("Using override timestamps of size " + str(len(config.override_timestamps)))
+            return config.override_timestamps
+        return super().get_merged_timestamps(audio, config)
+
+    def transcribe(self, audio: str, whisperCallable: WhisperCallback, config: ParallelTranscriptionConfig):
+        # Override device ID
+        if (config.device_id is not None):
+            print("Using device " + config.device_id)
+            os.environ["CUDA_VISIBLE_DEVICES"] = config.device_id
+        return super().transcribe(audio, whisperCallable, config)
+
+    def _chunks(self, lst, n):
+        """Yield successive n-sized chunks from lst."""
+        return [lst[i:i + n] for i in range(0, len(lst), n)]
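The core of `transcribe_parallel` is splitting the merged VAD segments into one chunk per device while keeping segment indices globally unique across workers. A small worked example (the timestamps are made up for illustration):

    # Worked example (made-up values): four merged segments split across two devices.
    segments = [{"start": 0.0, "end": 30.0}, {"start": 40.0, "end": 70.0},
                {"start": 90.0, "end": 120.0}, {"start": 130.0, "end": 160.0}]
    devices = ["0", "1"]

    chunk_size = len(segments) // len(devices)  # 2 segments per device
    chunks = [segments[i:i + chunk_size] for i in range(0, len(segments), chunk_size)]

    # Device "0" gets segments 0-1, device "1" gets segments 2-3. Each worker receives a
    # ParallelTranscriptionConfig carrying its own initial_segment_index and device_id, so
    # CUDA_VISIBLE_DEVICES is set per process and the merged output keeps consecutive indices.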
src/whisperContainer.py ADDED
@@ -0,0 +1,91 @@
+# External programs
+import whisper
+
+class WhisperContainer:
+    def __init__(self, model_name: str, device: str = None):
+        self.model_name = model_name
+        self.device = device
+
+        # Will be created on demand
+        self.model = None
+
+    def get_model(self):
+        if self.model is None:
+            print("Loading model " + self.model_name)
+            self.model = whisper.load_model(self.model_name, device=self.device)
+        return self.model
+
+    def create_callback(self, language: str = None, task: str = None, initial_prompt: str = None, **decodeOptions: dict):
+        """
+        Create a WhisperCallback object that can be used to transcribe audio files.
+
+        Parameters
+        ----------
+        language: str
+            The target language of the transcription. If not specified, the language will be inferred from the audio content.
+        task: str
+            The task - either translate or transcribe.
+        initial_prompt: str
+            The initial prompt to use for the transcription.
+        decodeOptions: dict
+            Additional options to pass to the decoder. Must be pickleable.
+
+        Returns
+        -------
+        A WhisperCallback object.
+        """
+        return WhisperCallback(self, language=language, task=task, initial_prompt=initial_prompt, **decodeOptions)
+
+    # This is required for multiprocessing
+    def __getstate__(self):
+        return { "model_name": self.model_name, "device": self.device }
+
+    def __setstate__(self, state):
+        self.model_name = state["model_name"]
+        self.device = state["device"]
+        self.model = None
+
+
+class WhisperCallback:
+    def __init__(self, model_container: WhisperContainer, language: str = None, task: str = None, initial_prompt: str = None, **decodeOptions: dict):
+        self.model_container = model_container
+        self.language = language
+        self.task = task
+        self.initial_prompt = initial_prompt
+        self.decodeOptions = decodeOptions
+
+    def invoke(self, audio, segment_index: int, prompt: str, detected_language: str):
+        """
+        Perform the transcription of the given audio file or data.
+
+        Parameters
+        ----------
+        audio: Union[str, np.ndarray, torch.Tensor]
+            The audio file to transcribe, or the audio data as a numpy array or torch tensor.
+        segment_index: int
+            The index of the segment being transcribed. The initial prompt is only included for the first segment (index 0).
+        prompt: str
+            The prompt to use for the transcription.
+        detected_language: str
+            The detected language of the audio file.
+
+        Returns
+        -------
+        The result of the Whisper call.
+        """
+        model = self.model_container.get_model()
+
+        return model.transcribe(audio, \
+            language=self.language if self.language else detected_language, task=self.task, \
+            initial_prompt=self._concat_prompt(self.initial_prompt, prompt) if segment_index == 0 else prompt, \
+            **self.decodeOptions)
+
+    def _concat_prompt(self, prompt1, prompt2):
+        if (prompt1 is None):
+            return prompt2
+        elif (prompt2 is None):
+            return prompt1
+        else:
+            return prompt1 + " " + prompt2
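Because `multiprocessing` pickles the callback for each worker, `__getstate__`/`__setstate__` ensure only the model name and device cross the process boundary, and the model itself is reloaded lazily via `get_model()`. A short sketch of the intended behaviour (illustrative, not part of the commit):

    import pickle

    # Illustrative: the loaded model is dropped when pickling; each worker reloads it on demand.
    container = WhisperContainer("medium", device="cuda")
    callback = container.create_callback(language="en", task="transcribe")

    clone = pickle.loads(pickle.dumps(callback))
    assert clone.model_container.model is None  # weights are reloaded on the first invoke() in the worker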