Rolando committed
Commit 8718761 · 1 Parent(s): e9ccfaf
.gitignore ADDED
@@ -0,0 +1,129 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ pip-wheel-metadata/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2022 jian
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
README.md ADDED
@@ -0,0 +1,1872 @@
+ # Stabilizing Timestamps for Whisper
+
+ This library modifies [Whisper](https://github.com/openai/whisper) to produce more reliable timestamps and extends its functionality.
+
+ https://github.com/jianfch/stable-ts/assets/28970749/7adf0540-3620-4b2b-b2d4-e316906d6dfa
+
+ * [Setup](#setup)
+ * [Usage](#usage)
+ * [Transcribe](#transcribe)
+ * [Output](#output)
+ * [Alignment](#alignment)
+ * [Adjustments](#adjustments)
+ * [Refinement](#refinement)
+ * [Regrouping Words](#regrouping-words)
+ * [Editing](#editing)
+ * [Locating Words](#locating-words)
+ * [Silence Suppression](#silence-suppression)
+ * [Tips](#tips)
+ * [Visualizing Suppression](#visualizing-suppression)
+ * [Encode Comparison](#encode-comparison)
+ * [Use with any ASR](#any-asr)
+ * [Quick 1.X → 2.X Guide](#quick-1x--2x-guide)
+
+ ## Setup
+ ```
+ pip install -U stable-ts
+ ```
+
+ To install the latest commit:
+ ```
+ pip install -U git+https://github.com/jianfch/stable-ts.git
+ ```
+
+ ## Usage
+
+ ### Transcribe
+
+ ```python
+ import stable_whisper
+ model = stable_whisper.load_model('base')
+ result = model.transcribe('audio.mp3')
+ result.to_srt_vtt('audio.srt')
+ ```
+ <details>
+ <summary>CLI</summary>
+
+ ```commandline
+ stable-ts audio.mp3 -o audio.srt
+ ```
+ </details>
+
+ Docstrings:
+ <details>
+ <summary>load_model()</summary>
+
+ Load an instance of :class:`whisper.model.Whisper`.
+
+ Parameters
+ ----------
+ name : {'tiny', 'tiny.en', 'base', 'base.en', 'small', 'small.en', 'medium', 'medium.en', 'large-v1',
+ 'large-v2', 'large-v3', or 'large'}
+ One of the official model names listed by :func:`whisper.available_models`, or
+ path to a model checkpoint containing the model dimensions and the model state_dict.
+ device : str or torch.device, optional
+ PyTorch device to put the model into.
+ download_root : str, optional
+ Path to download the model files; by default, it uses "~/.cache/whisper".
+ in_memory : bool, default False
+ Whether to preload the model weights into host memory.
+ cpu_preload : bool, default True
+ Load the model into CPU memory first, then move it to the specified device
+ to reduce GPU memory usage when loading the model.
+ dq : bool, default False
+ Whether to apply Dynamic Quantization to the model to reduce memory usage and increase inference speed,
+ but at the cost of a slight decrease in accuracy. Only for CPU.
+
+ Returns
+ -------
+ model : "Whisper"
+ The Whisper ASR model instance.
+
+ Notes
+ -----
+ The overhead from ``dq = True`` might make inference slower for models smaller than 'large'.
+
+ </details>
+
+ <details>
+ <summary>transcribe()</summary>
+
+ Transcribe audio using Whisper.
+
+ This is a modified version of :func:`whisper.transcribe.transcribe` with slightly different decoding logic while
+ allowing additional preprocessing and postprocessing. The preprocessing performed on the audio includes isolating
+ voice / removing noise with Demucs and a low/high-pass filter. The postprocessing performed on the transcription
+ result includes adjusting timestamps with VAD and custom regrouping of segments based on punctuation and speech gaps.
+
+ Parameters
+ ----------
+ model : whisper.model.Whisper
+ An instance of Whisper ASR model.
+ audio : str or numpy.ndarray or torch.Tensor or bytes
+ Path/URL to the audio file, the audio waveform, or bytes of audio file.
+ If audio is :class:`numpy.ndarray` or :class:`torch.Tensor`, the audio must already be sampled at 16kHz.
+ verbose : bool or None, default False
+ Whether to display the text being decoded to the console.
+ Displays all the details if ``True``. Displays progressbar if ``False``. Displays nothing if ``None``.
+ temperature : float or iterable of float, default (0.0, 0.2, 0.4, 0.6, 0.8, 1.0)
+ Temperature for sampling. It can be a tuple of temperatures, which will be successively used
+ upon failures according to either ``compression_ratio_threshold`` or ``logprob_threshold``.
+ compression_ratio_threshold : float, default 2.4
+ If the gzip compression ratio is above this value, treat as failed.
+ logprob_threshold : float, default -1
+ If the average log probability over sampled tokens is below this value, treat as failed.
+ no_speech_threshold : float, default 0.6
+ If the no_speech probability is higher than this value AND the average log probability
+ over sampled tokens is below ``logprob_threshold``, consider the segment as silent.
+ condition_on_previous_text : bool, default True
+ If ``True``, the previous output of the model is provided as a prompt for the next window;
+ disabling may make the text inconsistent across windows, but the model becomes less prone to
+ getting stuck in a failure loop, such as repetition looping or timestamps going out of sync.
+ initial_prompt : str, optional
+ Text to provide as a prompt for the first window. This can be used to provide, or
+ "prompt-engineer" a context for transcription, e.g. custom vocabularies or proper nouns
+ to make it more likely to predict those words correctly.
+ word_timestamps : bool, default True
+ Extract word-level timestamps using the cross-attention pattern and dynamic time warping,
+ and include the timestamps for each word in each segment.
+ Disabling this will prevent segments from splitting/merging properly.
+ regroup : bool or str, default True, meaning the default regroup algorithm
+ String for customizing the regrouping algorithm. ``False`` disables regrouping.
+ Ignored if ``word_timestamps = False``.
+ ts_num : int, default 0, meaning disable this option
+ Number of extra timestamp inferences to perform, then use the average of these extra timestamps.
+ An experimental option that might hurt performance.
+ ts_noise : float, default 0.1
+ Percentage of noise to add to ``audio_features`` to perform inferences for ``ts_num``.
+ suppress_silence : bool, default True
+ Whether to enable timestamp adjustments based on the detected silence.
+ suppress_word_ts : bool, default True
+ Whether to adjust word timestamps based on the detected silence. Only enabled if ``suppress_silence = True``.
+ use_word_position : bool, default True
+ Whether to use the position of the word in its segment to determine whether to keep the end or start timestamp if
+ adjustments are required. If it is the first word, keep the end. Else if it is the last word, keep the start.
+ q_levels : int, default 20
+ Quantization levels for generating the timestamp suppression mask; ignored if ``vad = True``.
+ Acts as a threshold for marking sound as silent.
+ Fewer levels will increase the threshold of volume at which to mark a sound as silent.
+ k_size : int, default 5
+ Kernel size for avg-pooling the waveform to generate the timestamp suppression mask; ignored if ``vad = True``.
+ Recommend 5 or 3; higher sizes will reduce detection of silence.
+ time_scale : float, optional
+ Factor for scaling audio duration for inference.
+ Greater than 1.0 'slows down' the audio, and less than 1.0 'speeds up' the audio. ``None`` is the same as 1.0.
+ A factor of 1.5 will stretch 10s audio to 15s for inference. This increases the effective resolution
+ of the model but can increase word error rate.
+ demucs : bool or torch.nn.Module, default False
+ Whether to preprocess ``audio`` with Demucs to isolate vocals / remove noise. Set ``demucs`` to an instance of
+ a Demucs model to avoid reloading the model for each run.
+ Demucs must be installed to use. Official repo: https://github.com/facebookresearch/demucs.
+ demucs_output : str, optional
+ Path to save the vocals isolated by Demucs as a WAV file. Ignored if ``demucs = False``.
+ Demucs must be installed to use. Official repo: https://github.com/facebookresearch/demucs.
+ demucs_options : dict, optional
+ Options to use for :func:`stable_whisper.audio.demucs_audio`.
+ vad : bool, default False
+ Whether to use Silero VAD to generate the timestamp suppression mask.
+ Silero VAD requires PyTorch 1.12.0+. Official repo: https://github.com/snakers4/silero-vad.
+ vad_threshold : float, default 0.35
+ Threshold for detecting speech with Silero VAD. A low threshold reduces false positives for silence detection.
+ vad_onnx : bool, default False
+ Whether to use ONNX for Silero VAD.
+ min_word_dur : float, default 0.1
+ Shortest duration each word is allowed to reach for silence suppression.
+ nonspeech_error : float, default 0.3
+ Relative error of non-speech sections that appear in between a word for silence suppression.
+ only_voice_freq : bool, default False
+ Whether to only use sound between 200 - 5000 Hz, where the majority of human speech is.
+ prepend_punctuations : str, default '"\'“¿([{-)'
+ Punctuations to prepend to the next word.
+ append_punctuations : str, default '.。,,!!??::”)]}、)'
+ Punctuations to append to the previous word.
+ mel_first : bool, default False
+ Process the entire audio track into a log-Mel spectrogram first instead of in chunks.
+ Use if odd behavior is seen in stable-ts but not in whisper; uses significantly more memory for long audio.
+ split_callback : Callable, optional
+ Custom callback for grouping tokens up with their corresponding words.
+ The callback must take two arguments: a list of tokens and the tokenizer.
+ The callback returns a tuple with a list of words and a corresponding nested list of tokens.
+ suppress_ts_tokens : bool, default False
+ Whether to suppress timestamp tokens during inference for timestamps that are detected at silence.
+ Reduces hallucinations in some cases, but is also prone to ignoring disfluencies and repetitions.
+ This option is ignored if ``suppress_silence = False``.
+ gap_padding : str, default ' ...'
+ Padding prepended to each segment for word timing alignment.
+ Used to reduce the probability of the model predicting timestamps earlier than the first utterance.
+ only_ffmpeg : bool, default False
+ Whether to use only FFmpeg (instead of yt-dlp) for URLs.
+ max_instant_words : float, default 0.5
+ If the percentage of instantaneous words in a segment exceeds this amount, the segment is removed.
+ avg_prob_threshold: float or None, default None
+ Transcribe the gap after the previous word, and if the average word probability of a segment falls below this
+ value, discard the segment. If ``None``, skip transcribing the gap to reduce the chance of timestamps starting
+ before the next utterance.
+ progress_callback : Callable, optional
+ A function that will be called when transcription progress is updated.
+ The callback needs two parameters.
+ The first parameter is a float for seconds of the audio that has been transcribed.
+ The second parameter is a float for the total duration of the audio in seconds.
+ ignore_compatibility : bool, default False
+ Whether to ignore warnings for compatibility issues with the detected Whisper version.
+ decode_options
+ Keyword arguments to construct :class:`whisper.decode.DecodingOptions` instances.
+
+ Returns
+ -------
+ stable_whisper.result.WhisperResult
+ All timestamps, words, probabilities, and other data from the transcription of ``audio``.
+
+ See Also
+ --------
+ stable_whisper.non_whisper.transcribe_any : Return :class:`stable_whisper.result.WhisperResult` containing all the
+ data from transcribing audio with unmodified :func:`whisper.transcribe.transcribe` with preprocessing and
+ postprocessing.
+ stable_whisper.whisper_word_level.load_faster_whisper.faster_transcribe : Return
+ :class:`stable_whisper.result.WhisperResult` containing all the data from transcribing audio with
+ :meth:`faster_whisper.WhisperModel.transcribe` with preprocessing and postprocessing.
+
+ Examples
+ --------
+ >>> import stable_whisper
+ >>> model = stable_whisper.load_model('base')
+ >>> result = model.transcribe('audio.mp3', vad=True)
+ >>> result.to_srt_vtt('audio.srt')
+ Saved: audio.srt
+
+ </details>
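+
+ For example, a minimal sketch of a ``progress_callback`` with the two-float signature described above (the function name here is illustrative):
+ ```python
+ def on_progress(seconds_done: float, total_seconds: float):
+     # print a simple percentage as transcription progresses
+     print(f'transcribed: {seconds_done / total_seconds:.0%}')
+
+ result = model.transcribe('audio.mp3', progress_callback=on_progress)
+ ```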
+
+ <details>
+ <summary>transcribe_minimal()</summary>
+
+ Transcribe audio using Whisper.
+
+ This uses the original whisper transcribe function, :func:`whisper.transcribe.transcribe`, while still allowing
+ additional preprocessing and postprocessing. The preprocessing performed on the audio includes isolating voice /
+ removing noise with Demucs and a low/high-pass filter. The postprocessing performed on the transcription
+ result includes adjusting timestamps with VAD and custom regrouping of segments based on punctuation and speech gaps.
+
+ Parameters
+ ----------
+ model : whisper.model.Whisper
+ An instance of Whisper ASR model.
+ audio : str or numpy.ndarray or torch.Tensor or bytes
+ Path/URL to the audio file, the audio waveform, or bytes of audio file.
+ If audio is ``numpy.ndarray`` or ``torch.Tensor``, the audio must already be sampled at 16kHz.
+ verbose : bool or None, default False
+ Whether to display the text being decoded to the console.
+ Displays all the details if ``True``. Displays progressbar if ``False``. Displays nothing if ``None``.
+ word_timestamps : bool, default True
+ Extract word-level timestamps using the cross-attention pattern and dynamic time warping,
+ and include the timestamps for each word in each segment.
+ Disabling this will prevent segments from splitting/merging properly.
+ regroup : bool or str, default True, meaning the default regroup algorithm
+ String for customizing the regrouping algorithm. ``False`` disables regrouping.
+ Ignored if ``word_timestamps = False``.
+ suppress_silence : bool, default True
+ Whether to enable timestamp adjustments based on the detected silence.
+ suppress_word_ts : bool, default True
+ Whether to adjust word timestamps based on the detected silence. Only enabled if ``suppress_silence = True``.
+ use_word_position : bool, default True
+ Whether to use the position of the word in its segment to determine whether to keep the end or start timestamp if
+ adjustments are required. If it is the first word, keep the end. Else if it is the last word, keep the start.
+ q_levels : int, default 20
+ Quantization levels for generating the timestamp suppression mask; ignored if ``vad = True``.
+ Acts as a threshold for marking sound as silent.
+ Fewer levels will increase the threshold of volume at which to mark a sound as silent.
+ k_size : int, default 5
+ Kernel size for avg-pooling the waveform to generate the timestamp suppression mask; ignored if ``vad = True``.
+ Recommend 5 or 3; higher sizes will reduce detection of silence.
+ demucs : bool or torch.nn.Module, default False
+ Whether to preprocess ``audio`` with Demucs to isolate vocals / remove noise. Set ``demucs`` to an instance of
+ a Demucs model to avoid reloading the model for each run.
+ Demucs must be installed to use. Official repo: https://github.com/facebookresearch/demucs.
+ demucs_output : str, optional
+ Path to save the vocals isolated by Demucs as a WAV file. Ignored if ``demucs = False``.
+ Demucs must be installed to use. Official repo: https://github.com/facebookresearch/demucs.
+ demucs_options : dict, optional
+ Options to use for :func:`stable_whisper.audio.demucs_audio`.
+ vad : bool, default False
+ Whether to use Silero VAD to generate the timestamp suppression mask.
+ Silero VAD requires PyTorch 1.12.0+. Official repo: https://github.com/snakers4/silero-vad.
+ vad_threshold : float, default 0.35
+ Threshold for detecting speech with Silero VAD. A low threshold reduces false positives for silence detection.
+ vad_onnx : bool, default False
+ Whether to use ONNX for Silero VAD.
+ min_word_dur : float, default 0.1
+ Shortest duration each word is allowed to reach for silence suppression.
+ nonspeech_error : float, default 0.3
+ Relative error of non-speech sections that appear in between a word for silence suppression.
+ only_voice_freq : bool, default False
+ Whether to only use sound between 200 - 5000 Hz, where the majority of human speech is.
+ only_ffmpeg : bool, default False
+ Whether to use only FFmpeg (instead of yt-dlp) for URLs.
+ options
+ Additional options used for :func:`whisper.transcribe.transcribe` and
+ :func:`stable_whisper.non_whisper.transcribe_any`.
+
+ Returns
+ -------
+ stable_whisper.result.WhisperResult
+ All timestamps, words, probabilities, and other data from the transcription of ``audio``.
+
+ Examples
+ --------
+ >>> import stable_whisper
+ >>> model = stable_whisper.load_model('base')
+ >>> result = model.transcribe_minimal('audio.mp3', vad=True)
+ >>> result.to_srt_vtt('audio.srt')
+ Saved: audio.srt
+
+ </details>
+
+ <br>
+ <details>
+ <summary>faster-whisper</summary>
+
+ Use with [faster-whisper](https://github.com/guillaumekln/faster-whisper):
+ ```python
+ model = stable_whisper.load_faster_whisper('base')
+ result = model.transcribe_stable('audio.mp3')
+ ```
+ ```commandline
+ stable-ts audio.mp3 -o audio.srt -fw
+ ```
+ Docstring:
+ <details>
+ <summary>load_faster_whisper()</summary>
+
+ Load an instance of :class:`faster_whisper.WhisperModel`.
+
+ Parameters
+ ----------
+ model_size_or_path : {'tiny', 'tiny.en', 'base', 'base.en', 'small', 'small.en', 'medium', 'medium.en', 'large-v1',
+ 'large-v2', 'large-v3', or 'large'}
+ Size of the model.
+
+ model_init_options
+ Additional options to use for initialization of :class:`faster_whisper.WhisperModel`.
+
+ Returns
+ -------
+ faster_whisper.WhisperModel
+ A modified instance with :func:`stable_whisper.whisper_word_level.load_faster_whisper.faster_transcribe`
+ assigned to :meth:`faster_whisper.WhisperModel.transcribe_stable`.
+
+ </details>
+
+ <details>
+ <summary>transcribe_stable()</summary>
+
+ Transcribe audio using faster-whisper (https://github.com/guillaumekln/faster-whisper).
+
+ This uses the transcribe method from faster-whisper, :meth:`faster_whisper.WhisperModel.transcribe`, while
+ still allowing additional preprocessing and postprocessing. The preprocessing performed on the audio includes
+ isolating voice / removing noise with Demucs and a low/high-pass filter. The postprocessing performed on the
+ transcription result includes adjusting timestamps with VAD and custom regrouping of segments based on punctuation
+ and speech gaps.
+
+ Parameters
+ ----------
+ model : faster_whisper.WhisperModel
+ The faster-whisper ASR model instance.
+ audio : str or numpy.ndarray or torch.Tensor or bytes
+ Path/URL to the audio file, the audio waveform, or bytes of audio file.
+ If audio is :class:`numpy.ndarray` or :class:`torch.Tensor`, the audio must already be sampled at 16kHz.
+ verbose : bool or None, default False
+ Whether to display the text being decoded to the console.
+ Displays all the details if ``True``. Displays progressbar if ``False``. Displays nothing if ``None``.
+ word_timestamps : bool, default True
+ Extract word-level timestamps using the cross-attention pattern and dynamic time warping,
+ and include the timestamps for each word in each segment.
+ Disabling this will prevent segments from splitting/merging properly.
+ regroup : bool or str, default True, meaning the default regroup algorithm
+ String for customizing the regrouping algorithm. ``False`` disables regrouping.
+ Ignored if ``word_timestamps = False``.
+ suppress_silence : bool, default True
+ Whether to enable timestamp adjustments based on the detected silence.
+ suppress_word_ts : bool, default True
+ Whether to adjust word timestamps based on the detected silence. Only enabled if ``suppress_silence = True``.
+ use_word_position : bool, default True
+ Whether to use the position of the word in its segment to determine whether to keep the end or start timestamp if
+ adjustments are required. If it is the first word, keep the end. Else if it is the last word, keep the start.
+ q_levels : int, default 20
+ Quantization levels for generating the timestamp suppression mask; ignored if ``vad = True``.
+ Acts as a threshold for marking sound as silent.
+ Fewer levels will increase the threshold of volume at which to mark a sound as silent.
+ k_size : int, default 5
+ Kernel size for avg-pooling the waveform to generate the timestamp suppression mask; ignored if ``vad = True``.
+ Recommend 5 or 3; higher sizes will reduce detection of silence.
+ demucs : bool or torch.nn.Module, default False
+ Whether to preprocess ``audio`` with Demucs to isolate vocals / remove noise. Set ``demucs`` to an instance
+ of a Demucs model to avoid reloading the model for each run.
+ Demucs must be installed to use. Official repo: https://github.com/facebookresearch/demucs.
+ demucs_output : str, optional
+ Path to save the vocals isolated by Demucs as a WAV file. Ignored if ``demucs = False``.
+ Demucs must be installed to use. Official repo: https://github.com/facebookresearch/demucs.
+ demucs_options : dict, optional
+ Options to use for :func:`stable_whisper.audio.demucs_audio`.
+ vad : bool, default False
+ Whether to use Silero VAD to generate the timestamp suppression mask.
+ Silero VAD requires PyTorch 1.12.0+. Official repo: https://github.com/snakers4/silero-vad.
+ vad_threshold : float, default 0.35
+ Threshold for detecting speech with Silero VAD. A low threshold reduces false positives for silence detection.
+ vad_onnx : bool, default False
+ Whether to use ONNX for Silero VAD.
+ min_word_dur : float, default 0.1
+ Shortest duration each word is allowed to reach for silence suppression.
+ nonspeech_error : float, default 0.3
+ Relative error of non-speech sections that appear in between a word for silence suppression.
+ only_voice_freq : bool, default False
+ Whether to only use sound between 200 - 5000 Hz, where the majority of human speech is.
+ only_ffmpeg : bool, default False
+ Whether to use only FFmpeg (instead of yt-dlp) for URLs.
+ check_sorted : bool, default True
+ Whether to raise an error when timestamps returned by faster-whisper are not in ascending order.
+ progress_callback : Callable, optional
+ A function that will be called when transcription progress is updated.
+ The callback needs two parameters.
+ The first parameter is a float for seconds of the audio that has been transcribed.
+ The second parameter is a float for the total duration of the audio in seconds.
+ options
+ Additional options used for :meth:`faster_whisper.WhisperModel.transcribe` and
+ :func:`stable_whisper.non_whisper.transcribe_any`.
+
+ Returns
+ -------
+ stable_whisper.result.WhisperResult
+ All timestamps, words, probabilities, and other data from the transcription of ``audio``.
+
+ Examples
+ --------
+ >>> import stable_whisper
+ >>> model = stable_whisper.load_faster_whisper('base')
+ >>> result = model.transcribe_stable('audio.mp3', vad=True)
+ >>> result.to_srt_vtt('audio.srt')
+ Saved: audio.srt
+
+ </details>
+
+ </details>
+
+ ### Output
+ Stable-ts supports various text output formats.
+ ```python
+ result.to_srt_vtt('audio.srt') #SRT
+ result.to_srt_vtt('audio.vtt') #VTT
+ result.to_ass('audio.ass') #ASS
+ result.to_tsv('audio.tsv') #TSV
+ ```
+ Docstrings:
+ <details>
+ <summary>result_to_srt_vtt()</summary>
+
+ Generate SRT/VTT from ``result`` to display segment-level and/or word-level timestamps.
+
+ Parameters
+ ----------
+ result : dict or list or stable_whisper.result.WhisperResult
+ Result of transcription.
+ filepath : str, default None, meaning content will be returned as a ``str``
+ Path to save file.
+ segment_level : bool, default True
+ Whether to use segment-level timestamps in output.
+ word_level : bool, default True
+ Whether to use word-level timestamps in output.
+ min_dur : float, default 0.2
+ Minimum duration allowed for any word/segment before the word/segments are merged with adjacent word/segments.
+ tag: tuple of (str, str), default None, meaning ('<font color="#00ff00">', '</font>') if SRT else ('<u>', '</u>')
+ Tag used to change the properties of a word at its timestamp.
+ vtt : bool, default None, meaning determined by the extension of ``filepath``, or ``False`` if no valid extension.
+ Whether to output VTT.
+ strip : bool, default True
+ Whether to remove spaces before and after the text of each segment for output.
+ reverse_text: bool or tuple, default False
+ Whether to reverse the order of words for each segment, or provide the ``prepend_punctuations`` and
+ ``append_punctuations`` as a tuple pair instead of ``True``, which uses the default punctuations.
+
+ Returns
+ -------
+ str
+ String of the content if ``filepath`` is ``None``.
+
+ Notes
+ -----
+ ``reverse_text`` will not fix RTL text not displaying tags properly, which is an issue with some video players. VLC
+ seems not to suffer from this issue.
+
+ Examples
+ --------
+ >>> import stable_whisper
+ >>> model = stable_whisper.load_model('base')
+ >>> result = model.transcribe('audio.mp3')
+ >>> result.to_srt_vtt('audio.srt')
+ Saved: audio.srt
+
+ </details>
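+
+ As a sketch of the ``tag`` parameter documented above (the bold tags are illustrative values):
+ ```python
+ # bold the currently spoken word instead of the default green font tag
+ result.to_srt_vtt('audio.srt', tag=('<b>', '</b>'))
+ ```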
+
+ <details>
+ <summary>result_to_ass()</summary>
+
+ Generate Advanced SubStation Alpha (ASS) file from ``result`` to display segment-level and/or word-level timestamps.
+
+ Parameters
+ ----------
+ result : dict or list or stable_whisper.result.WhisperResult
+ Result of transcription.
+ filepath : str, default None, meaning content will be returned as a ``str``
+ Path to save file.
+ segment_level : bool, default True
+ Whether to use segment-level timestamps in output.
+ word_level : bool, default True
+ Whether to use word-level timestamps in output.
+ min_dur : float, default 0.2
+ Minimum duration allowed for any word/segment before the word/segments are merged with adjacent word/segments.
+ tag: tuple of (str, str) or int, default None, meaning use default highlighting
+ Tag used to change the properties of a word at its timestamp. -1 for individual word highlight tag.
+ font : str, default `Arial`
+ Word font.
+ font_size : int, default 48
+ Word font size.
+ strip : bool, default True
+ Whether to remove spaces before and after the text of each segment for output.
+ highlight_color : str, default '00ff00'
+ Hexadecimal of the color used for default highlights as '<bb><gg><rr>'.
+ karaoke : bool, default False
+ Whether to use progressive filling highlights (for karaoke effect).
+ reverse_text: bool or tuple, default False
+ Whether to reverse the order of words for each segment, or provide the ``prepend_punctuations`` and
+ ``append_punctuations`` as a tuple pair instead of ``True``, which uses the default punctuations.
+ kwargs:
+ Format styles:
+ 'Name', 'Fontname', 'Fontsize', 'PrimaryColour', 'SecondaryColour', 'OutlineColour', 'BackColour', 'Bold',
+ 'Italic', 'Underline', 'StrikeOut', 'ScaleX', 'ScaleY', 'Spacing', 'Angle', 'BorderStyle', 'Outline',
+ 'Shadow', 'Alignment', 'MarginL', 'MarginR', 'MarginV', 'Encoding'
+
+ Returns
+ -------
+ str
+ String of the content if ``filepath`` is ``None``.
+
+ Notes
+ -----
+ ``reverse_text`` will not fix RTL text not displaying tags properly, which is an issue with some video players. VLC
+ seems not to suffer from this issue.
+
+ Examples
+ --------
+ >>> import stable_whisper
+ >>> model = stable_whisper.load_model('base')
+ >>> result = model.transcribe('audio.mp3')
+ >>> result.to_ass('audio.ass')
+ Saved: audio.ass
+
+ </details>
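+
+ For instance, a karaoke-style output using the ``karaoke`` and ``font_size`` parameters documented above (a sketch; the values are illustrative):
+ ```python
+ # progressive filling highlights with a larger font
+ result.to_ass('audio.ass', karaoke=True, font_size=64)
+ ```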
+
+ <details>
+ <summary>result_to_tsv()</summary>
+
+ Generate TSV from ``result`` to display segment-level and/or word-level timestamps.
+
+ Parameters
+ ----------
+ result : dict or list or stable_whisper.result.WhisperResult
+ Result of transcription.
+ filepath : str, default None, meaning content will be returned as a ``str``
+ Path to save file.
+ segment_level : bool, default True
+ Whether to use segment-level timestamps in output.
+ word_level : bool, default True
+ Whether to use word-level timestamps in output.
+ min_dur : float, default 0.2
+ Minimum duration allowed for any word/segment before the word/segments are merged with adjacent word/segments.
+ strip : bool, default True
+ Whether to remove spaces before and after the text of each segment for output.
+ reverse_text: bool or tuple, default False
+ Whether to reverse the order of words for each segment, or provide the ``prepend_punctuations`` and
+ ``append_punctuations`` as a tuple pair instead of ``True``, which uses the default punctuations.
+
+ Returns
+ -------
+ str
+ String of the content if ``filepath`` is ``None``.
+
+ Notes
+ -----
+ ``reverse_text`` will not fix RTL text not displaying tags properly, which is an issue with some video players. VLC
+ seems not to suffer from this issue.
+
+ Examples
+ --------
+ >>> import stable_whisper
+ >>> model = stable_whisper.load_model('base')
+ >>> result = model.transcribe('audio.mp3')
+ >>> result.to_tsv('audio.tsv')
+ Saved: audio.tsv
+
+ </details>
+
+ <details>
+ <summary>result_to_txt()</summary>
+
+ Generate plain-text without timestamps from ``result``.
+
+ Parameters
+ ----------
+ result : dict or list or stable_whisper.result.WhisperResult
+ Result of transcription.
+ filepath : str, default None, meaning content will be returned as a ``str``
+ Path to save file.
+ min_dur : float, default 0.2
+ Minimum duration allowed for any word/segment before the word/segments are merged with adjacent word/segments.
+ strip : bool, default True
+ Whether to remove spaces before and after the text of each segment for output.
+ reverse_text: bool or tuple, default False
+ Whether to reverse the order of words for each segment, or provide the ``prepend_punctuations`` and
+ ``append_punctuations`` as a tuple pair instead of ``True``, which uses the default punctuations.
+
+ Returns
+ -------
+ str
+ String of the content if ``filepath`` is ``None``.
+
+ Notes
+ -----
+ ``reverse_text`` will not fix RTL text not displaying tags properly, which is an issue with some video players. VLC
+ seems not to suffer from this issue.
+
+ Examples
+ --------
+ >>> import stable_whisper
+ >>> model = stable_whisper.load_model('base')
+ >>> result = model.transcribe('audio.mp3')
+ >>> result.to_txt('audio.txt')
+ Saved: audio.txt
+
+ </details>
+
+ <details>
+ <summary>save_as_json()</summary>
+
+ Save ``result`` as a JSON file to ``path``.
+
+ Parameters
+ ----------
+ result : dict or list or stable_whisper.result.WhisperResult
+ Result of transcription.
+ path : str
+ Path to save file.
+ ensure_ascii : bool, default False
+ Whether to escape non-ASCII characters.
+
+ Examples
+ --------
+ >>> import stable_whisper
+ >>> model = stable_whisper.load_model('base')
+ >>> result = model.transcribe('audio.mp3')
+ >>> result.save_as_json('audio.json')
+ Saved: audio.json
+
+ </details>
+
+ <br /><br />
+ There are word-level and segment-level timestamps. All output formats support them,
+ and all of them except TSV also support both levels simultaneously.
+ By default, `segment_level` and `word_level` are both `True` for all the formats that support both simultaneously.<br /><br />
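+
+ A sketch of toggling the levels with the parameters documented above:
+ ```python
+ result.to_srt_vtt('audio.vtt', word_level=False)     # segment-level only
+ result.to_srt_vtt('audio.vtt', segment_level=False)  # word-level only
+ ```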
+ Examples in VTT.
+
+ Default: `segment_level=True` + `word_level=True`
+ <details>
+ <summary>CLI</summary>
+
+ `--segment_level true` + `--word_level true`
+
+ </details>
+
+ ```
+ 00:00:07.760 --> 00:00:09.900
+ But<00:00:07.860> when<00:00:08.040> you<00:00:08.280> arrived<00:00:08.580> at<00:00:08.800> that<00:00:09.000> distant<00:00:09.400> world,
+ ```
+
+ `segment_level=True` + `word_level=False`
+ ```
+ 00:00:07.760 --> 00:00:09.900
+ But when you arrived at that distant world,
+ ```
+
+ `segment_level=False` + `word_level=True`
+ ```
+ 00:00:07.760 --> 00:00:07.860
+ But
+
+ 00:00:07.860 --> 00:00:08.040
+ when
+
+ 00:00:08.040 --> 00:00:08.280
+ you
+
+ 00:00:08.280 --> 00:00:08.580
+ arrived
+
+ ...
+ ```
+
+ #### JSON
+ The result can also be saved as a JSON file to preserve all the data for future reprocessing.
+ This is useful for testing different sets of postprocessing arguments without the need to redo inference.
+
+ ```python
+ result.save_as_json('audio.json')
+ ```
+ <details>
+ <summary>CLI</summary>
+
+ ```commandline
+ stable-ts audio.mp3 -o audio.json
+ ```
+
+ </details>
+
+ Processing a JSON file of the results into SRT.
+ ```python
+ result = stable_whisper.WhisperResult('audio.json')
+ result.to_srt_vtt('audio.srt')
+ ```
+ <details>
+ <summary>CLI</summary>
+
+ ```commandline
+ stable-ts audio.json -o audio.srt
+ ```
+
+ </details>
+
+ ### Alignment
+ Audio can be aligned/synced with plain text on word-level.
+ ```python
+ text = 'Machines thinking, breeding. You were to bear us a new, promised land.'
+ result = model.align('audio.mp3', text, language='en')
+ ```
+ When the text is correct but the timestamps need more work,
+ `align()` is a faster alternative for testing various settings/models.
+ ```python
+ new_result = model.align('audio.mp3', result, language='en')
+ ```
+ <details>
+ <summary>CLI</summary>
+
+ ```commandline
+ stable-ts audio.mp3 --align text.txt --language en
+ ```
+ `--align` can also be a JSON file of a result.
+
+ </details>
+
+ Docstring:
+ <details>
+ <summary>align()</summary>
+
+ Align plain text or tokens with audio at word-level.
+
+ Since this is significantly faster than transcribing, it is a more efficient method for testing various settings
+ without re-transcribing. This is also useful for timing a more correct transcript than one that Whisper can produce.
+
+ Parameters
+ ----------
+ model : "Whisper"
+ The modified Whisper ASR model instance.
+ audio : str or numpy.ndarray or torch.Tensor or bytes
+ Path/URL to the audio file, the audio waveform, or bytes of audio file.
+ If audio is :class:`numpy.ndarray` or :class:`torch.Tensor`, the audio must already be sampled at 16kHz.
+ text : str or list of int or stable_whisper.result.WhisperResult
+ String of plain-text, list of tokens, or instance of :class:`stable_whisper.result.WhisperResult`.
+ language : str, default None, uses ``language`` in ``text`` if it is a :class:`stable_whisper.result.WhisperResult`
+ Language of ``text``. Required if ``text`` does not contain ``language``.
+ remove_instant_words : bool, default False
+ Whether to truncate any words with zero duration.
+ token_step : int, default 100
+ Max number of tokens to align in each pass. Use higher values to reduce the chance of misalignment.
+ original_split : bool, default False
+ Whether to preserve the original segment groupings. Segments are split by line breaks if ``text`` is plain-text.
+ max_word_dur : float or None, default 3.0
+ Global maximum word duration in seconds. Re-align words that exceed the global maximum word duration.
+ word_dur_factor : float or None, default 2.0
+ Factor to compute the local maximum word duration, which is ``word_dur_factor`` * the local median word duration.
+ Words that need re-alignment are re-aligned with duration <= the local/global maximum word duration.
+ nonspeech_skip : float or None, default 3.0
+ Skip non-speech sections that are equal or longer than this duration in seconds. Disable skipping if ``None``.
+ fast_mode : bool, default False
+ Whether to speed up alignment by re-alignment with local/global maximum word duration.
+ ``True`` tends to produce better timestamps when ``text`` is accurate and there are no large speechless gaps.
+ tokenizer : "Tokenizer", default None, meaning a new tokenizer is created according to ``language`` and ``model``
+ A tokenizer used to tokenize text and detokenize tokens.
+ verbose : bool or None, default False
+ Whether to display the text being decoded to the console.
+ Displays all the details if ``True``. Displays progressbar if ``False``. Displays nothing if ``None``.
+ regroup : bool or str, default True, meaning the default regroup algorithm
+ String for customizing the regrouping algorithm. ``False`` disables regrouping.
+ Ignored if ``word_timestamps = False``.
+ suppress_silence : bool, default True
+ Whether to enable timestamp adjustments based on the detected silence.
+ suppress_word_ts : bool, default True
+ Whether to adjust word timestamps based on the detected silence. Only enabled if ``suppress_silence = True``.
+ use_word_position : bool, default True
+ Whether to use the position of the word in its segment to determine whether to keep the end or start timestamp if
+ adjustments are required. If it is the first word, keep the end. Else if it is the last word, keep the start.
+ q_levels : int, default 20
+ Quantization levels for generating the timestamp suppression mask; ignored if ``vad = True``.
+ Acts as a threshold for marking sound as silent.
+ Fewer levels will increase the threshold of volume at which to mark a sound as silent.
+ k_size : int, default 5
+ Kernel size for avg-pooling the waveform to generate the timestamp suppression mask; ignored if ``vad = True``.
+ Recommend 5 or 3; higher sizes will reduce detection of silence.
+ demucs : bool or torch.nn.Module, default False
+ Whether to preprocess ``audio`` with Demucs to isolate vocals / remove noise. Set ``demucs`` to an instance of
+ a Demucs model to avoid reloading the model for each run.
+ Demucs must be installed to use. Official repo: https://github.com/facebookresearch/demucs.
+ demucs_output : str, optional
+ Path to save the vocals isolated by Demucs as a WAV file. Ignored if ``demucs = False``.
+ Demucs must be installed to use. Official repo: https://github.com/facebookresearch/demucs.
+ demucs_options : dict, optional
+ Options to use for :func:`stable_whisper.audio.demucs_audio`.
+ vad : bool, default False
+ Whether to use Silero VAD to generate the timestamp suppression mask.
+ Silero VAD requires PyTorch 1.12.0+. Official repo: https://github.com/snakers4/silero-vad.
+ vad_threshold : float, default 0.35
+ Threshold for detecting speech with Silero VAD. A low threshold reduces false positives for silence detection.
+ vad_onnx : bool, default False
+ Whether to use ONNX for Silero VAD.
+ min_word_dur : float, default 0.1
+ Shortest duration each word is allowed to reach for silence suppression.
+ nonspeech_error : float, default 0.3
+ Relative error of non-speech sections that appear in between a word for silence suppression.
+ only_voice_freq : bool, default False
+ Whether to only use sound between 200 - 5000 Hz, where the majority of human speech is.
+ prepend_punctuations : str, default '"\'“¿([{-)'
+ Punctuations to prepend to the next word.
+ append_punctuations : str, default '.。,,!!??::”)]}、)'
+ Punctuations to append to the previous word.
+ progress_callback : Callable, optional
+ A function that will be called when transcription progress is updated.
+ The callback needs two parameters.
+ The first parameter is a float for seconds of the audio that has been transcribed.
+ The second parameter is a float for the total duration of the audio in seconds.
+ ignore_compatibility : bool, default False
+ Whether to ignore warnings for compatibility issues with the detected Whisper version.
+
+ Returns
+ -------
+ stable_whisper.result.WhisperResult or None
+ All timestamps, words, probabilities, and other data from the alignment of ``audio``. Returns None if alignment
+ fails and ``remove_instant_words = True``.
+
+ Notes
+ -----
+ If ``token_step`` is less than 1, ``token_step`` will be set to its maximum value, 442. This value is computed with
+ ``whisper.model.Whisper.dims.n_text_ctx`` - 6.
+
+ If ``original_split = True`` and a line break is found in the middle of a word in ``text``, the split will occur
+ after that word.
+
+ ``regroup`` is ignored if ``original_split = True``.
+
+ Examples
+ --------
+ >>> import stable_whisper
+ >>> model = stable_whisper.load_model('base')
+ >>> result = model.align('helloworld.mp3', 'Hello, World!', 'English')
+ >>> result.to_srt_vtt('helloworld.srt')
+ Saved 'helloworld.srt'
+
+ </details>
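+
+ For example, a sketch using ``original_split`` (documented above) to keep the transcript's own line breaks as segment boundaries:
+ ```python
+ text = 'Machines thinking, breeding.\nYou were to bear us a new, promised land.'
+ # regroup is ignored when original_split=True
+ result = model.align('audio.mp3', text, language='en', original_split=True)
+ ```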
+
+ #### Adjustments
+ Timestamps are adjusted after the model predicts them.
+ When `suppress_silence=True` (default), `transcribe()`/`transcribe_minimal()`/`align()` adjust based on silence/non-speech.
+ The timestamps can be further adjusted based on another result with `adjust_by_result()`,
+ which acts as a logical AND operation for the timestamps of both results, further reducing the duration of each word.
+ Note: both results are required to have word timestamps and matching words.
+ ```python
+ # the adjustments are in-place for `result`
+ result.adjust_by_result(new_result)
+ ```
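+
+ For instance, a sketch of producing a second result with a different setting and adjusting by it (the ``vad=True`` second pass is illustrative; both passes must share matching words):
+ ```python
+ result = model.transcribe('audio.mp3')
+ new_result = model.transcribe('audio.mp3', vad=True)
+ # logical AND of the word timings, reducing word durations in-place
+ result.adjust_by_result(new_result)
+ ```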
+ Docstring:
+ <details>
+ <summary>adjust_by_result()</summary>
+
+ Minimize the duration of words using timestamps of another result.
+
+ Parameters
+ ----------
+ other_result : "WhisperResult"
+ Timing data of the same words in a WhisperResult instance.
+ min_word_dur : float, default 0.1
+ Prevent changes to timestamps if the resultant word duration is less than ``min_word_dur``.
+ verbose : bool, default False
+ Whether to print out the timestamp changes.
+
+ </details>
+
+ ### Refinement
+ Timestamps can be further improved with `refine()`.
+ This method iteratively mutes portions of the audio based on the current timestamps,
+ then computes the probabilities of the tokens.
+ Then, by monitoring the fluctuation of the probabilities, it tries to find the most precise timestamps.
+ "Most precise" in this case means the latest start and earliest end for the word
+ such that it still meets the specified conditions.
+ ```python
+ model.refine('audio.mp3', result)
+ ```
+ <details>
+ <summary>CLI</summary>
+
+ ```commandline
+ stable-ts audio.mp3 --refine -o audio.srt
+ ```
+ Input can also be a JSON file of a result.
+ ```commandline
+ stable-ts result.json --refine -o audio.srt --refine_option "audio=audio.mp3"
+ ```
+
+ </details>
+
+ Docstring:
+ <details>
+ <summary>refine()</summary>
+
+ Improve existing timestamps.
+
+ This function iteratively mutes portions of the audio and monitors token probabilities to find the most precise
+ timestamps. "Most precise" in this case means the latest start and earliest end of a word that maintains an
+ acceptable probability determined by the specified arguments.
+
+ This is useful for readjusting timestamps when they start too early or end too late.
+
+ Parameters
+ ----------
+ model : "Whisper"
+ The modified Whisper ASR model instance.
+ audio : str or numpy.ndarray or torch.Tensor or bytes
+ Path/URL to the audio file, the audio waveform, or bytes of audio file.
+ If audio is :class:`numpy.ndarray` or :class:`torch.Tensor`, the audio must already be sampled at 16kHz.
+ result : stable_whisper.result.WhisperResult
+ All timestamps, words, probabilities, and other data from the transcription of ``audio``.
+ steps : str, default 'se'
+ Instructions for refinement. An 's' means refine start-timestamps. An 'e' means refine end-timestamps.
+ rel_prob_decrease : float, default 0.3
+ Maximum percent decrease in probability relative to the original probability, which is the probability from
+ muting according to the initial timestamps.
+ abs_prob_decrease : float, default 0.05
+ Maximum decrease in probability from the original probability.
+ rel_rel_prob_decrease : float, optional
+ Maximum percent decrease in probability relative to the previous probability, which is the probability from the
+ previous iteration of muting.
+ prob_threshold : float, default 0.5
+ Stop refining the timestamp if the probability of its token goes below this value.
+ rel_dur_change : float, default 0.5
+ Maximum percent change in duration of a word relative to its original duration.
+ abs_dur_change : float, optional
+ Maximum seconds a word is allowed to deviate from its original duration.
+ word_level : bool, default True
+ Whether to refine timestamps on word-level. If ``False``, only refine start/end timestamps of each segment.
+ precision : float, default 0.1
+ Precision of refined timestamps in seconds. The lowest precision is 0.02 seconds.
+ single_batch : bool, default False
+ Whether to process with a batch size of only one to reduce memory usage.
+ inplace : bool, default True
+ Whether to alter timestamps in-place. If ``False``, return a deepcopy of ``result`` with the refined timestamps.
+ demucs : bool or torch.nn.Module, default False
+ Whether to preprocess ``audio`` with Demucs to isolate vocals / remove noise. Set ``demucs`` to an instance of
+ a Demucs model to avoid reloading the model for each run.
+ Demucs must be installed to use. Official repo: https://github.com/facebookresearch/demucs.
+ demucs_options : dict, optional
+ Options to use for :func:`stable_whisper.audio.demucs_audio`.
+ only_voice_freq : bool, default False
+ Whether to only use sound between 200 - 5000 Hz, where the majority of human speech is.
+ verbose : bool or None, default False
+ Whether to display the text being decoded to the console.
+ Displays all the details if ``True``. Displays progressbar if ``False``. Displays nothing if ``None``.
+
+ Returns
+ -------
+ stable_whisper.result.WhisperResult
+ All timestamps, words, probabilities, and other data from the refinement of ``text`` with ``audio``.
+
+ Notes
+ -----
+ The lower the ``precision``, the longer the processing time.
+
+ Examples
+ --------
+ >>> import stable_whisper
+ >>> model = stable_whisper.load_model('base')
+ >>> result = model.transcribe('audio.mp3')
+ >>> model.refine('audio.mp3', result)
+ >>> result.to_srt_vtt('audio.srt')
+ Saved 'audio.srt'
+
+ </details>
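+
+ A sketch using the ``steps`` and ``precision`` parameters documented above (the values are illustrative):
+ ```python
+ # refine only the end-timestamps at a finer precision
+ model.refine('audio.mp3', result, steps='e', precision=0.05)
+ result.to_srt_vtt('audio.srt')
+ ```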
1008
+
1009
+
1010
+ ### Regrouping Words
1011
+ Stable-ts has a preset for regrouping words into different segments with more natural boundaries.
1012
+ This preset is enabled by `regroup=True` (default).
1013
+ But there are other built-in [regrouping methods](#regrouping-methods) that allow you to customize the regrouping algorithm.
1014
+ This preset is just a predefined combination of those methods.
1015
+
1016
+ https://github.com/jianfch/stable-ts/assets/28970749/7b6164a3-50e2-4368-8b75-853cb14045ec
1017
+
1018
+ ```python
1019
+ # The following results are all functionally equivalent:
1020
+ result0 = model.transcribe('audio.mp3', regroup=True) # regroup is True by default
1021
+ result1 = model.transcribe('audio.mp3', regroup=False)
1022
+ (
1023
+ result1
1024
+ .clamp_max()
1025
+ .split_by_punctuation([('.', ' '), '。', '?', '?', (',', ' '), ','])
1026
+ .split_by_gap(.5)
1027
+ .merge_by_gap(.3, max_words=3)
1028
+ .split_by_punctuation([('.', ' '), '。', '?', '?'])
1029
+ )
1030
+ result2 = model.transcribe('audio.mp3', regroup='cm_sp=.* /。/?/?/,* /,_sg=.5_mg=.3+3_sp=.* /。/?/?')
1031
+
1032
+ # To undo all regrouping operations:
1033
+ result0.reset()
1034
+ ```
1035
+ Any regrouping algorithm can be expressed as a string. Please feel free to share your strings [here](https://github.com/jianfch/stable-ts/discussions/162).
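+
+ For example, the string form of Example 3 from the `regroup()` docstring below can be applied directly to an existing result (a minimal sketch; `result` is assumed to be a `WhisperResult` from `transcribe()`):
+ ```python
+ # merge all segments, split at gaps > 0.5s, then merge segments separated
+ # by gaps <= 0.15s when each has no more than 3 words
+ result.regroup('ms_sg=.5_mg=.15+3')
+ ```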
1036
+ #### Regrouping Methods
1037
+ <details>
1038
+ <summary>regroup()</summary>
1039
+
1040
+ Regroup (in-place) words into segments.
1041
+
1042
+ Parameters
1043
+ ----------
1044
+ regroup_algo: str or bool, default 'da'
1045
+ String representation of a custom regrouping algorithm, or ``True`` to use the default algorithm 'da'.
1046
+ verbose : bool, default False
1047
+ Whether to show all the methods and arguments parsed from ``regroup_algo``.
1048
+ only_show : bool, default False
1049
+ Whether to show all the methods and arguments parsed from ``regroup_algo`` without running them.
1050
+
1051
+ Returns
1052
+ -------
1053
+ stable_whisper.result.WhisperResult
1054
+ The current instance after the changes.
1055
+
1056
+ Notes
1057
+ -----
1058
+ Syntax for string representation of custom regrouping algorithm.
1059
+ Method keys:
1060
+ sg: split_by_gap
1061
+ sp: split_by_punctuation
1062
+ sl: split_by_length
1063
+ sd: split_by_duration
1064
+ mg: merge_by_gap
1065
+ mp: merge_by_punctuation
1066
+ ms: merge_all_segments
1067
+ cm: clamp_max
1068
+ l: lock
1069
+ us: unlock_all_segments
1070
+ da: default algorithm (cm_sp=.* /。/?/?/,* /,_sg=.5_mg=.3+3_sp=.* /。/?/?)
1071
+ rw: remove_word
1072
+ rs: remove_segment
1073
+ rp: remove_repetition
1074
+ rws: remove_words_by_str
1075
+ fg: fill_in_gaps
1076
+ Metacharacters:
1077
+ = separates a method key and its arguments (not used if no argument)
1078
+ _ separates method keys (after arguments if there are any)
1079
+ + separates arguments for a method key
1080
+ / separates an argument into list of strings
1081
+ * separates an item in list of strings into a nested list of strings
1082
+ Notes:
1083
+ -arguments are parsed positionally
1084
+ -if no argument is provided, the default ones will be used
1085
+ -use 1 or 0 to represent True or False
1086
+ Example 1:
1087
+ merge_by_gap(.2, 10, lock=True)
1088
+ mg=.2+10+++1
1089
+ Note: [lock] is the 5th argument, hence the 2 missing arguments in between, represented by the three + before the 1
1090
+ Example 2:
1091
+ split_by_punctuation([('.', ' '), '。', '?', '?'], True)
1092
+ sp=.* /。/?/?+1
1093
+ Example 3:
1094
+ merge_all_segments().split_by_gap(.5).merge_by_gap(.15, 3)
1095
+ ms_sg=.5_mg=.15+3
1096
+
1097
+ </details>
1098
+
1099
+ <details>
1100
+ <summary>split_by_gap()</summary>
1101
+
1102
+ Split (in-place) any segment where the gap between two of its words is greater than ``max_gap``.
1103
+
1104
+ Parameters
1105
+ ----------
1106
+ max_gap : float, default 0.1
1107
+ Maximum second(s) allowed between two words of the same segment.
1108
+ lock : bool, default False
1109
+ Whether to prevent future splits/merges from altering changes made by this method.
1110
+ newline: bool, default False
1111
+ Whether to insert line break at the split points instead of splitting into separate segments.
1112
+
1113
+ Returns
1114
+ -------
1115
+ stable_whisper.result.WhisperResult
1116
+ The current instance after the changes.
1117
+
1118
+ </details>
1119
+
1120
+ <details>
1121
+ <summary>split_by_punctuation()</summary>
1122
+
1123
+ Split (in-place) segments at words that start/end with ``punctuation``.
1124
+
1125
+ Parameters
1126
+ ----------
1127
+ punctuation : list of str or list of tuple of (str, str) or str
1128
+ Punctuation(s) to split segments by.
1129
+ lock : bool, default False
1130
+ Whether to prevent future splits/merges from altering changes made by this method.
1131
+ newline : bool, default False
1132
+ Whether to insert line break at the split points instead of splitting into separate segments.
1133
+ min_words : int, optional
1134
+ Split segments with words >= ``min_words``.
1135
+ min_chars : int, optional
1136
+ Split segments with characters >= ``min_chars``.
1137
+ min_dur : int, optional
1138
+ Split segments with duration (in seconds) >= ``min_dur``.
1139
+
1140
+ Returns
1141
+ -------
1142
+ stable_whisper.result.WhisperResult
1143
+ The current instance after the changes.
1144
+
1145
+ </details>
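+
+ A usage sketch of calling `split_by_punctuation()` directly, reusing the punctuation list from the preset above (assumes `result` is an existing `WhisperResult`; the `min_words` value is only illustrative):
+ ```python
+ # split at sentence-ending punctuations, but only segments with at least 5 words
+ result.split_by_punctuation([('.', ' '), '。', '?', '?'], min_words=5)
+ ```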
1146
+
1147
+ <details>
1148
+ <summary>split_by_length()</summary>
1149
+
1150
+ Split (in-place) any segment that exceeds ``max_chars`` or ``max_words`` into smaller segments.
1151
+
1152
+ Parameters
1153
+ ----------
1154
+ max_chars : int, optional
1155
+ Maximum number of characters allowed in each segment.
1156
+ max_words : int, optional
1157
+ Maximum number of words allowed in each segment.
1158
+ even_split : bool, default True
1159
+ Whether to evenly split a segment in length if it exceeds ``max_chars`` or ``max_words``.
1160
+ force_len : bool, default False
1161
+ Whether to force a constant length for each segment except the last segment.
1162
+ This will ignore all previous non-locked segment boundaries.
1163
+ lock : bool, default False
1164
+ Whether to prevent future splits/merges from altering changes made by this method.
1165
+ include_lock: bool, default False
1166
+ Whether to include previous lock before splitting based on max_words, if ``even_split = False``.
1167
+ Splitting will be done after the first non-locked word > ``max_chars`` / ``max_words``.
1168
+ newline: bool, default False
1169
+ Whether to insert line break at the split points instead of splitting into separate segments.
1170
+
1171
+ Returns
1172
+ -------
1173
+ stable_whisper.result.WhisperResult
1174
+ The current instance after the changes.
1175
+
1176
+ Notes
1177
+ -----
1178
+ If ``even_split = True``, segments can still exceed ``max_chars`` and locked words will be ignored to avoid
1179
+ uneven splitting.
1180
+
1181
+ </details>
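+
+ For instance, to keep subtitle lines short (a sketch; the limit is only illustrative):
+ ```python
+ # evenly split any segment longer than 30 characters
+ # (even_split is True by default)
+ result.split_by_length(max_chars=30)
+ ```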
1182
+
1183
+ <details>
1184
+ <summary>split_by_duration()</summary>
1185
+
1186
+ Split (in-place) any segment that exceeds ``max_dur`` into smaller segments.
1187
+
1188
+ Parameters
1189
+ ----------
1190
+ max_dur : float
1191
+ Maximum duration (in seconds) per segment.
1192
+ even_split : bool, default True
1193
+ Whether to evenly split a segment in length if it exceeds ``max_dur``.
1194
+ force_len : bool, default False
1195
+ Whether to force a constant length for each segment except the last segment.
1196
+ This will ignore all previous non-locked segment boundaries.
1197
+ lock : bool, default False
1198
+ Whether to prevent future splits/merges from altering changes made by this method.
1199
+ include_lock: bool, default False
1200
+ Whether to include previous lock before splitting based on ``max_dur``, if ``even_split = False``.
1201
+ Splitting will be done after the first non-locked word > ``max_dur``.
1202
+ newline: bool, default False
1203
+ Whether to insert line break at the split points instead of splitting into separate segments.
1204
+
1205
+ Returns
1206
+ -------
1207
+ stable_whisper.result.WhisperResult
1208
+ The current instance after the changes.
1209
+
1210
+ Notes
1211
+ -----
1212
+ If ``even_split = True``, segments can still exceed ``max_dur`` and locked words will be ignored to avoid
1213
+ uneven splitting.
1214
+
1215
+ </details>
1216
+
1217
+ <details>
1218
+ <summary>merge_by_gap()</summary>
1219
+
1220
+ Merge (in-place) any pair of adjacent segments if the gap between them <= ``min_gap``.
1221
+
1222
+ Parameters
1223
+ ----------
1224
+ min_gap : float, default 0.1
1225
+ Minimum second(s) allowed between two segments.
1226
+ max_words : int, optional
1227
+ Maximum number of words allowed in each segment.
1228
+ max_chars : int, optional
1229
+ Maximum number of characters allowed in each segment.
1230
+ is_sum_max : bool, default False
1231
+ Whether ``max_words`` and ``max_chars`` are applied to the merged segment instead of the individual segments
1232
+ to be merged.
1233
+ lock : bool, default False
1234
+ Whether to prevent future splits/merges from altering changes made by this method.
1235
+
1236
+ Returns
1237
+ -------
1238
+ stable_whisper.result.WhisperResult
1239
+ The current instance after the changes.
1240
+
1241
+ </details>
1242
+
1243
+ <details>
1244
+ <summary>merge_by_punctuation()</summary>
1245
+
1246
+ Merge (in-place) any two segments that have specific punctuation in between.
1247
+
1248
+ Parameters
1249
+ ----------
1250
+ punctuation : list of str or list of tuple of (str, str) or str
1251
+ Punctuation(s) to merge segments by.
1252
+ max_words : int, optional
1253
+ Maximum number of words allowed in each segment.
1254
+ max_chars : int, optional
1255
+ Maximum number of characters allowed in each segment.
1256
+ is_sum_max : bool, default False
1257
+ Whether ``max_words`` and ``max_chars`` are applied to the merged segment instead of the individual segments
1258
+ to be merged.
1259
+ lock : bool, default False
1260
+ Whether to prevent future splits/merges from altering changes made by this method.
1261
+
1262
+ Returns
1263
+ -------
1264
+ stable_whisper.result.WhisperResult
1265
+ The current instance after the changes.
1266
+
1267
+ </details>
1268
+
1269
+ <details>
1270
+ <summary>merge_all_segments()</summary>
1271
+
1272
+ Merge all segments into one segment.
1273
+
1274
+ Returns
1275
+ -------
1276
+ stable_whisper.result.WhisperResult
1277
+ The current instance after the changes.
1278
+
1279
+ </details>
1280
+
1281
+ <details>
1282
+ <summary>clamp_max()</summary>
1283
+
1284
+ Clamp all word durations above a certain value.
1285
+
1286
+ This is most effective when applied before and after other regroup operations.
1287
+
1288
+ Parameters
1289
+ ----------
1290
+ medium_factor : float, default 2.5
1291
+ Clamp durations above (``medium_factor`` * medium duration) per segment.
1292
+ If ``medium_factor = None/0`` or the segment has fewer than 3 words, it will be ignored and only ``max_dur`` will be used.
1293
+ max_dur : float, optional
1294
+ Clamp durations above ``max_dur``.
1295
+ clip_start : bool or None, default None
1296
+ Whether to clamp the start of a word. If ``None``, clamp the start of first word and end of last word per
1297
+ segment.
1298
+ verbose : bool, default False
1299
+ Whether to print out the timestamp changes.
1300
+
1301
+ Returns
1302
+ -------
1303
+ stable_whisper.result.WhisperResult
1304
+ The current instance after the changes.
1305
+
1306
+ </details>
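+
+ A sketch of combining the per-segment factor with an absolute cap (values are only illustrative):
+ ```python
+ # clamp word durations above 2.5x the segment's medium duration
+ # or above 3 seconds, whichever applies
+ result.clamp_max(medium_factor=2.5, max_dur=3)
+ ```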
1307
+
1308
+ <details>
1309
+ <summary>lock()</summary>
1310
+
1311
+ Lock words/segments with matching prefix/suffix to prevent splitting/merging.
1312
+
1313
+ Parameters
1314
+ ----------
1315
+ startswith: str or list of str
1316
+ Prefixes to lock.
1317
+ endswith: str or list of str
1318
+ Suffixes to lock.
1319
+ right : bool, default True
1320
+ Whether to prevent splits/merges with the next word/segment.
1321
+ left : bool, default False
1322
+ Whether to prevent splits/merges with the previous word/segment.
1323
+ case_sensitive : bool, default False
1324
+ Whether to match the case of the prefixes/suffixes with the words/segments.
1325
+ strip : bool, default True
1326
+ Whether to ignore spaces before and after both words/segments and prefixes/suffixes.
1327
+
1328
+ Returns
1329
+ -------
1330
+ stable_whisper.result.WhisperResult
1331
+ The current instance after the changes.
1332
+
1333
+ </details>
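+
+ For example, to keep a word ending with a closing quote from being separated from the word after it (a sketch; the suffix is only illustrative):
+ ```python
+ # lock the boundary after any word ending with '"'
+ # so future splits/merges cannot break it
+ result.lock(endswith='"', right=True)
+ ```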
1334
+
1335
+ ### Editing
1336
+ The editing methods in stable-ts can be chained with [Regrouping Methods](#regrouping-methods) and used in `regroup()`.
1337
+
1338
+ Remove specific instances of words or segments:
1339
+ ```python
1340
+ # Remove first word of the first segment:
1341
+ first_word = result[0][0]
1342
+ result.remove_word(first_word)
1343
+ # The following also does the same:
1344
+ del result[0][0]
1345
+
1346
+ # Remove the last segment:
1347
+ last_segment = result[-1]
1348
+ result.remove_segment(last_segment)
1349
+ # The following also does the same:
1350
+ del result[-1]
1351
+ ```
1352
+ Docstrings:
1353
+ <details>
1354
+ <summary>remove_word()</summary>
1355
+
1356
+ Remove a word.
1357
+
1358
+ Parameters
1359
+ ----------
1360
+ word : WordTiming or tuple of (int, int)
1361
+ Instance of :class:`stable_whisper.result.WordTiming` or tuple of (segment index, word index).
1362
+ reassign_ids : bool, default True
1363
+ Whether to reassign segment and word ids (indices) after removing ``word``.
1364
+ verbose : bool, default True
1365
+ Whether to print detail of the removed word.
1366
+
1367
+ Returns
1368
+ -------
1369
+ stable_whisper.result.WhisperResult
1370
+ The current instance after the changes.
1371
+
1372
+ </details>
1373
+
1374
+ <details>
1375
+ <summary>remove_segment()</summary>
1376
+
1377
+ Remove a segment.
1378
+
1379
+ Parameters
1380
+ ----------
1381
+ segment : Segment or int
1382
+ Instance of :class:`stable_whisper.result.Segment` or segment index.
1383
+ reassign_ids : bool, default True
1384
+ Whether to reassign segment IDs (indices) after removing ``segment``.
1385
+ verbose : bool, default True
1386
+ Whether to print detail of the removed segment.
1387
+
1388
+ Returns
1389
+ -------
1390
+ stable_whisper.result.WhisperResult
1391
+ The current instance after the changes.
1392
+
1393
+ </details>
1394
+
1395
+
1396
+ Removing repetitions:
1397
+ ```python
1398
+ # Example 1: "This is is is a test." -> "This is a test."
1399
+ # The following removes the last two " is":
1400
+ result.remove_repetition(1)
1401
+
1402
+ # Example 2: "This is is is a test this is a test." -> "This is a test."
1403
+ # The following removes the second " is" and third " is", then removes the last "this is a test"
1404
+ # The first parameter `max_words` is `4` because "this is a test" consists of 4 words
1405
+ result.remove_repetition(4)
1406
+ ```
1407
+ Docstring:
1408
+ <details>
1409
+ <summary>remove_repetition()</summary>
1410
+
1411
+ Remove words that repeat consecutively.
1412
+
1413
+ Parameters
1414
+ ----------
1415
+ max_words : int
1416
+ Maximum number of words to look for consecutively.
1417
+ case_sensitive : bool, default False
1418
+ Whether the case of words needs to match to be considered a repetition.
1419
+ strip : bool, default True
1420
+ Whether to ignore spaces before and after each word.
1421
+ ignore_punctuations : str, default '"',.?!'
1422
+ Ending punctuations to ignore.
1423
+ extend_duration: bool, default True
1424
+ Whether to extend the duration of the previous word to cover the duration of the repetition.
1425
+ verbose: bool, default True
1426
+ Whether to print detail of the removed repetitions.
1427
+
1428
+ Returns
1429
+ -------
1430
+ stable_whisper.result.WhisperResult
1431
+ The current instance after the changes.
1432
+
1433
+ </details>
1434
+
1435
+ Removing specific word(s) by string content:
1436
+ ```python
1437
+ # Remove all " ok" from " ok ok this is a test."
1438
+ result.remove_words_by_str('ok')
1439
+
1440
+ # Remove all " ok" and " Um..." from " ok this is a test. Um..."
1441
+ result.remove_words_by_str(['ok', 'um'])
1442
+ ```
1443
+ Docstring:
1444
+ <details>
1445
+ <summary>remove_words_by_str()</summary>
1446
+
1447
+ Remove words that match ``words``.
1448
+
1449
+ Parameters
1450
+ ----------
1451
+ words : str or list of str or None
1452
+ A word or list of words to remove. ``None`` for all words to be passed into ``filters``.
1453
+ case_sensitive : bool, default False
1454
+ Whether the case of words needs to match to be considered a match.
1455
+ strip : bool, default True
1456
+ Whether to ignore spaces before and after each word.
1457
+ ignore_punctuations : str, default '"',.?!'
1458
+ Ending punctuations to ignore.
1459
+ min_prob : float, optional
1460
+ Acts as the first filter for the words that match ``words``. Words with probability < ``min_prob`` will
1461
+ be removed if ``filters`` is ``None``, else pass the words into ``filters``. Words without probability will
1462
+ be treated as having probability < ``min_prob``.
1463
+ filters : Callable, optional
1464
+ A function that takes an instance of :class:`stable_whisper.result.WordTiming` as its only argument.
1465
+ This function is a custom filter for the words that match ``words`` and were not caught by ``min_prob``.
1466
+ verbose:
1467
+ Whether to print detail of the removed words.
1468
+
1469
+ Returns
1470
+ -------
1471
+ stable_whisper.result.WhisperResult
1472
+ The current instance after the changes.
1473
+
1474
+ </details>
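+
+ The `min_prob` and `filters` parameters above allow conditional removal. A sketch (the threshold is only illustrative):
+ ```python
+ # remove only the instances of "ok" that the model was unsure about,
+ # i.e. words with probability below 0.5
+ result.remove_words_by_str('ok', min_prob=0.5)
+ ```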
1475
+
1476
+ Filling in segment gaps:
1477
+ ```python
1478
+ # result0: [" How are you?"] [" I'm good."] [" Good!"]
1479
+ # result1: [" Hello!"] [" How are you?"] [" How about you?"] [" Good!"]
1480
+ result0.fill_in_gaps(result1)
1481
+ # After filling in the gaps in `result0` with contents in `result1`:
1482
+ # result0: [" Hello!"] [" How are you?"] [" I'm good."] [" How about you?"] [" Good!"]
1483
+ ```
1484
+ Docstring:
1485
+ <details>
1486
+ <summary>fill_in_gaps()</summary>
1487
+
1488
+ Fill in segment gaps larger than ``min_gap`` with content from ``other_result`` at the times of gaps.
1489
+
1490
+ Parameters
1491
+ ----------
1492
+ other_result : WhisperResult or str
1493
+ Another transcription result as an instance of :class:`stable_whisper.result.WhisperResult` or path to the
1494
+ JSON of the result.
1495
+ min_gap : float, default 0.1
1496
+ The minimum seconds of a gap between segments that must be exceeded to be filled in.
1497
+ case_sensitive : bool, default False
1498
+ Whether to consider the case of the first and last word of the gap to determine overlapping words to remove
1499
+ before filling in.
1500
+ strip : bool, default True
1501
+ Whether to ignore spaces before and after the first and last word of the gap to determine overlapping words
1502
+ to remove before filling in.
1503
+ ignore_punctuations : str, default '"',.?!'
1504
+ Ending punctuations to ignore in the first and last word of the gap to determine overlapping words to
1505
+ remove before filling in.
1506
+ verbose:
1507
+ Whether to print detail of the filled content.
1508
+
1509
+ Returns
1510
+ -------
1511
+ stable_whisper.result.WhisperResult
1512
+ The current instance after the changes.
1513
+
1514
+ </details>
1515
+
1516
+ ### Locating Words
1517
+ There are two ways to locate words.
1518
+ The first way is by approximating the time at which the words are spoken
1519
+ then transcribing a few seconds around the approximated time.
1520
+ This is also the faster way to locate words.
1521
+ ```python
1522
+ matches = model.locate('audio.mp3', 'are', language='en', count=0)
1523
+ for match in matches:
1524
+ print(match.to_display_str())
1525
+ # verbose=True does the same thing as this for-loop.
1526
+ ```
1527
+ Docstring:
1528
+ <details>
1529
+ <summary>locate()</summary>
1530
+
1531
+ Locate when specific words are spoken in ``audio`` without fully transcribing.
1532
+
1533
+ This is useful for quickly finding at what time specific words or phrases are spoken in the audio. Since it
1534
+ does not need to transcribe the audio to approximate the time, it is significantly faster than transcribing and then
1535
+ locating the word in the transcript.
1536
+
1537
+ It can also transcribe a few seconds around the approximated time to find out what was said around those words or
1538
+ confirm if the word was even spoken near that time.
1539
+
1540
+ Parameters
1541
+ ----------
1542
+ model : whisper.model.Whisper
1543
+ An instance of Whisper ASR model.
1544
+ audio : str or numpy.ndarray or torch.Tensor or bytes
1545
+ Path/URL to the audio file, the audio waveform, or bytes of audio file.
1546
+ If audio is :class:`numpy.ndarray` or :class:`torch.Tensor`, the audio must already be sampled at 16kHz.
1547
+ text: str or list of int
1548
+ Words/phrase or list of tokens to search for in ``audio``.
1549
+ language : str
1550
+ Language of the ``text``.
1551
+ count : int, default 1, meaning stop search after 1 match
1552
+ Number of matches to find. Use 0 to look for all.
1553
+ duration_window : float or tuple of (float, float), default 3.0, same as (3.0, 3.0)
1554
+ Seconds before and after the end timestamp approximations to transcribe after mode 1.
1555
+ If a tuple of two values, the 1st value will be seconds before the end and the 2nd value will be seconds after.
1556
+ mode : int, default 0
1557
+ Mode of search.
1558
+ 2, Approximates the end timestamp of ``text`` in the audio. This mode does not confirm whether ``text`` is
1559
+ spoken at the timestamp.
1560
+ 1, Completes mode 2 then transcribes audio within ``duration_window`` to confirm whether `text` is a match at
1561
+ the approximated timestamp by checking if ``text`` at that ``duration_window`` is within
1562
+ ``probability_threshold`` or matching the string content of ``text`` with the transcribed text at the
1563
+ ``duration_window``.
1564
+ 0, Completes mode 1 then adds word timestamps to the transcriptions of each match.
1565
+ Modes from fastest to slowest: 2, 1, 0
1566
+ start : float, optional, meaning it starts from 0s
1567
+ Seconds into the audio to start searching for ``text``.
1568
+ end : float, optional
1569
+ Seconds into the audio to stop searching for ``text``.
1570
+ probability_threshold : float, default 0.5
1571
+ Minimum probability of each token in ``text`` for it to be considered a match.
1572
+ eots : int, default 1
1573
+ Number of EOTs to reach before stopping transcription at mode 1. When the transcription reaches an EOT, it usually
1574
+ means the end of the segment or audio. Once ``text`` is found in the ``duration_window``, the transcription
1575
+ will stop immediately upon reaching an EOT.
1576
+ max_token_per_seg : int, default 20
1577
+ Maximum number of tokens to transcribe in the ``duration_window`` before stopping.
1578
+ exact_token : bool, default False
1579
+ Whether to find a match based on the exact tokens that make up ``text``.
1580
+ case_sensitive : bool, default False
1581
+ Whether to consider the case of ``text`` when matching in string content.
1582
+ verbose : bool or None, default False
1583
+ Whether to display the text being decoded to the console.
1584
+ Displays all the details if ``True``. Displays progressbar if ``False``. Display nothing if ``None``.
1585
+ initial_prompt : str, optional
1586
+ Text to provide as a prompt for the first window. This can be used to provide, or
1587
+ "prompt-engineer" a context for transcription, e.g. custom vocabularies or proper nouns
1588
+ to make it more likely to predict those words correctly.
1589
+ suppress_tokens : str or list of int, default '-1', meaning suppress special characters except common punctuations
1590
+ List of tokens to suppress.
1591
+ demucs : bool or torch.nn.Module, default False
1592
+ Whether to preprocess ``audio`` with Demucs to isolate vocals / remove noise. Set ``demucs`` to an instance of
1593
+ a Demucs model to avoid reloading the model for each run.
1594
+ Demucs must be installed to use. Official repo, https://github.com/facebookresearch/demucs.
1595
+ demucs_options : dict, optional
1596
+ Options to use for :func:`stable_whisper.audio.demucs_audio`.
1597
+ only_voice_freq : bool, default False
1598
+ Whether to only use sound between 200 - 5000 Hz, where the majority of human speech is.
1599
+
1600
+ Returns
1601
+ -------
1602
+ stable_whisper.result.Segment or list of dict or list of float
1603
+ Mode 0, list of instances of :class:`stable_whisper.result.Segment`.
1604
+ Mode 1, list of dictionaries with end timestamp approximation of matches and transcribed neighboring words.
1605
+ Mode 2, list of timestamps in seconds for each end timestamp approximation.
1606
+
1607
+ Notes
1608
+ -----
1609
+ For ``text``, the case and spacing matter since 'on', ' on', ' On' are different tokens, therefore choose the one that
1610
+ best suits the context (e.g. ' On' to look for it at the beginning of a sentence).
1611
+
1612
+ Use a sufficiently large first value for ``duration_window``, i.e. a value greater than the time it takes to speak ``text``.
1613
+
1614
+ If ``exact_token = False`` and the string content matches, then ``probability_threshold`` is not used.
1615
+
1616
+ Examples
1617
+ --------
1618
+ >>> import stable_whisper
1619
+ >>> model = stable_whisper.load_model('base')
1620
+ >>> matches = model.locate('audio.mp3', 'are', 'English', verbose=True)
1621
+
1622
+ Some words can sound the same but have different spellings. To increase the chance of finding such words, use
1623
+ ``initial_prompt``.
1624
+
1625
+ >>> matches = model.locate('audio.mp3', ' Nickie', 'English', verbose=True, initial_prompt='Nickie')
1626
+
1627
+ </details>
1628
+
1629
+ <details>
1630
+ <summary>CLI</summary>
1631
+
1632
+ ```
1633
+ stable-ts audio.mp3 --locate "are" --language en -to "count=0"
1634
+ ```
1635
+
1636
+ </details>
1637
+
1638
+ The second way allows you to locate words with regular expressions,
1639
+ but it requires the audio to be fully transcribed first.
1640
+ ```python
1641
+ result = model.transcribe('audio.mp3')
1642
+ # Find every sentence that contains "and"
1643
+ matches = result.find(r'[^.]+and[^.]+\.')
1644
+ # print all the matches if there are any
1645
+ for match in matches:
1646
+ print(f'match: {match.text_match}\n'
1647
+ f'text: {match.text}\n'
1648
+ f'start: {match.start}\n'
1649
+ f'end: {match.end}\n')
1650
+
1651
+ # Find the word before and after "and" in the matches
1652
+ matches = matches.find(r'\s\S+\sand\s\S+')
1653
+ for match in matches:
1654
+ print(f'match: {match.text_match}\n'
1655
+ f'text: {match.text}\n'
1656
+ f'start: {match.start}\n'
1657
+ f'end: {match.end}\n')
1658
+ ```
1659
+ Docstring:
1660
+ <details>
1661
+ <summary>find()</summary>
1662
+
1663
+ Find segments/words and timestamps with regular expression.
1664
+
1665
+ Parameters
1666
+ ----------
1667
+ pattern : str
1668
+ RegEx pattern to search for.
1669
+ word_level : bool, default True
1670
+ Whether to search at word-level.
1671
+ flags : optional
1672
+ RegEx flags.
1673
+
1674
+ Returns
1675
+ -------
1676
+ stable_whisper.result.WhisperResultMatches
1677
+ An instance of :class:`stable_whisper.result.WhisperResultMatches` with word/segment that match ``pattern``.
1678
+
1679
+ </details>
1680
+
1681
+ ### Silence Suppression
1682
+ While the timestamps predicted by Whisper are generally accurate,
1683
+ it sometimes predicts the start of a word way before the word is spoken
1684
+ or the end of a word long after the word has been spoken.
1685
+ This is where "silence suppression" helps. It is enabled by default (`suppress_silence=True`).
1686
+ The idea is to adjust the timestamps based on the timestamps of non-speech portions of the audio.
1687
+ ![silence_suppresion0](./silence_suppresion0.png)
1688
+ *Note: In 1.X, "silence suppression" refers to the process of suppressing timestamp tokens of the silent portions during inference,
1689
+ but changed to post-inference timestamp adjustments in 2.X, which allows stable-ts to be used with other ASR models.
1690
+ The timestamp token suppression feature is disabled by default, but can still be enabled with `suppress_ts_tokens=True`.*
1691
+
1692
+ By default, stable-ts determines the non-speech timestamps based on
1693
+ how loud a section of the audio is relative to the neighboring sections.
1694
+ This method is most effective for cases where the speech is significantly louder than the background noise.
1695
+ The other method is to use [Silero VAD](https://github.com/snakers4/silero-vad) (enabled with `vad=True`).
1696
+ To visualize the differences between non-VAD and VAD, see [Visualizing Suppression](#visualizing-suppression).
1697
+
1698
+ Besides the parameters for non-speech detection sensitivity (see [Visualizing Suppression](#visualizing-suppression)),
1699
+ the following parameters are used to combat inaccurate non-speech detection.<br>
1700
+ `min_word_dur` is the shortest duration each word is allowed to have after adjustments.<br>
1701
+ `nonspeech_error` is the relative error of the non-speech that appears in between a word.<br>
1702
+ `use_word_position` is whether to use a word's position in its segment to determine whether to keep the end or start timestamp.
1703
+ *Note: `nonspeech_error` was not available before 2.14.0; `use_word_position` was not available before 2.14.2;
1704
+ `min_word_dur` prevented any adjustments that resulted in word duration shorter than `min_word_dur`.*
1705
+
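+ These parameters are passed to `transcribe()` like any other option. A minimal sketch using the values from the example below:
+ ```python
+ result = model.transcribe(
+     'audio.mp3',
+     suppress_silence=True,   # default
+     min_word_dur=0.5,        # default: 0.1
+     nonspeech_error=0.3,     # default: 0.3
+     use_word_position=True,  # default
+ )
+ ```
+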
1706
+ For the following example, `min_word_dur=0.5` (default: 0.1) and `nonspeech_error=0.3` (default: 0.3).
1707
+ ![silence_suppresion1](./silence_suppresion1.png)
1708
+ `nonspeech_error=0.3` allows each non-speech section to be treated as 1.3 times its actual duration.
1709
+ Either from the start of the corresponding word to the end of the non-speech
1710
+ or from the start of the non-speech to the end of the corresponding word.
1711
+ In the case that both conditions are met, the shorter one is used.
1712
+ Or if both are equal, then the start of the non-speech to the end of the word is used.<br>
1713
+ The second non-speech from 1.375s to 1.75s is ignored for 'world.' because it failed both conditions.<br>
1714
+ The first word, 'Hello', satisfies only the former condition from 0s to 0.625s, thus the new start for 'Hello'
1715
+ would be 0.625s. However, `min_word_dur=0.5` requires the resultant duration to be at least 0.5s.
1716
+ As a result, the start of 'Hello' is changed to 0.375s instead of 0.625s.
1717
+ Furthermore, the default setting, `use_word_position=True`, also ensures the start is adjusted for the first word
1718
+ and the end is adjusted for the last word of the segment as long as one of the conditions is true.
1719
+
1720
+ ### Tips
1721
+ - do not disable word timestamps with `word_timestamps=False` for reliable segment timestamps
1722
+ - use `vad=True` for more accurate non-speech detection
1723
+ - use `demucs=True` to isolate vocals with [Demucs](https://github.com/facebookresearch/demucs); it is also effective at isolating vocals even if there is no music
1724
+ - use `demucs=True` and `vad=True` for music
1725
+ - set same seed for each transcription (e.g. `random.seed(0)`) for `demucs=True` to produce deterministic outputs
1726
+ - to enable dynamic quantization for inference on CPU use `--dq true` for CLI or `dq=True` for `stable_whisper.load_model`
1727
+ - use `encode_video_comparison()` to encode multiple transcripts into one video for synced comparison; see [Encode Comparison](#encode-comparison)
1728
+ - use `visualize_suppression()` to visualize the differences between non-VAD and VAD options; see [Visualizing Suppression](#visualizing-suppression)
1729
+ - [refinement](#refinement) can be an effective (but slow) alternative for polishing timestamps if silence suppression isn't effective
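+
+ For instance, combining a few of the tips above for audio with music (a minimal sketch):
+ ```python
+ import random
+
+ # same seed for each transcription so demucs=True produces deterministic output
+ random.seed(0)
+ result = model.transcribe('audio.mp3', demucs=True, vad=True)
+ ```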
1730
+
1731
+ ### Visualizing Suppression
1732
+ You can visualize which parts of the audio will likely be suppressed (i.e. marked as silent).
1733
+ Requires: [Pillow](https://github.com/python-pillow/Pillow) or [opencv-python](https://github.com/opencv/opencv-python).
1734
+
1735
+ #### Without VAD
1736
+ ```python
1737
+ import stable_whisper
1738
+ # regions on the waveform colored red are where it will likely be suppressed and marked as silent
1739
+ # [q_levels]=20 and [k_size]=5 (default)
1740
+ stable_whisper.visualize_suppression('audio.mp3', 'image.png', q_levels=20, k_size=5)
1741
+ ```
1742
+ ![novad](https://user-images.githubusercontent.com/28970749/225825408-aca63dbf-9571-40be-b399-1259d98f93be.png)
1743
+
1744
+ #### With [Silero VAD](https://github.com/snakers4/silero-vad)
1745
+ ```python
1746
+ # [vad_threshold]=0.35 (default)
1747
+ stable_whisper.visualize_suppression('audio.mp3', 'image.png', vad=True, vad_threshold=0.35)
1748
+ ```
1749
+ ![vad](https://user-images.githubusercontent.com/28970749/225825446-980924a5-7485-41e1-b0d9-c9b069d605f2.png)
1750
+ Docstring:
1751
+ <details>
1752
+ <summary>visualize_suppression()</summary>
1753
+
1754
+ Visualize regions on the waveform of ``audio`` detected as silent.
1755
+
1756
+ Regions on the waveform colored red are detected as silent.
1757
+
1758
+ Parameters
1759
+ ----------
1760
+ audio : str or numpy.ndarray or torch.Tensor or bytes
1761
+ Path/URL to the audio file, the audio waveform, or bytes of audio file.
1762
+ If audio is ``numpy.ndarray`` or ``torch.Tensor``, the audio must already be sampled at 16kHz.
1763
+ output : str, default None, meaning image will be shown directly via Pillow or opencv-python
1764
+ Path to save visualization.
1765
+ q_levels : int, default 20
1766
+ Quantization levels for generating timestamp suppression mask; ignored if ``vad = True``.
1767
+ Acts as a threshold to marking sound as silent.
1768
+ Fewer levels will increase the threshold of volume at which to mark a sound as silent.
1769
+ k_size : int, default 5
1770
+ Kernel size for avg-pooling waveform to generate timestamp suppression mask; ignored if ``vad = True``.
1771
+ Recommend 5 or 3; higher sizes will reduce detection of silence.
1772
+ vad : bool, default False
1773
+ Whether to use Silero VAD to generate timestamp suppression mask.
1774
+ Silero VAD requires PyTorch 1.12.0+. Official repo, https://github.com/snakers4/silero-vad.
1775
+ vad_threshold : float, default 0.35
1776
+ Threshold for detecting speech with Silero VAD. Low threshold reduces false positives for silence detection.
1777
+ max_width : int, default 1500
1778
+ Maximum width of visualization to avoid overly large image from long audio.
1779
+ Each unit of pixel is equivalent to 1 token. Use -1 to visualize the entire audio track.
1780
+ height : int, default 200
1781
+ Height of visualization.
1782
+
1783
+ </details>
1784
+
1785
+ ### Encode Comparison
1786
+ You can encode videos similar to the ones in the doc for comparing transcriptions of the same audio.
1787
+ ```python
1788
+ stable_whisper.encode_video_comparison(
1789
+ 'audio.mp3',
1790
+ ['audio_sub1.srt', 'audio_sub2.srt'],
1791
+ output_videopath='audio.mp4',
1792
+ labels=['Example 1', 'Example 2']
1793
+ )
1794
+ ```
1795
+ Docstring:
1796
+ <details>
1797
+ <summary>encode_video_comparison()</summary>
1798
+
1799
+ Encode multiple subtitle files into one video with the subtitles vertically stacked.
1800
+
1801
+ Parameters
1802
+ ----------
1803
+ audiofile : str
1804
+ Path of audio file.
1805
+ subtitle_files : list of str
1806
+ List of paths for subtitle file.
1807
+ output_videopath : str, optional
1808
+ Output video path.
1809
+ labels : list of str, default None, meaning use ``subtitle_files`` as labels
1810
+ List of labels for ``subtitle_files``.
1811
+ height : int, default 90
1812
+ Height for each subtitle section.
1813
+ width : int, default 720
1814
+ Width for each subtitle section.
1815
+ color : str, default 'black'
1816
+ Background color of the video.
1817
+ fontsize: int, default 70
1818
+ Font size for subtitles.
1819
+ border_color : str, default 'white'
1820
+ Border color for separating the sections of subtitle.
1821
+ label_color : str, default 'white'
1822
+ Color of labels.
1823
+ label_size : int, default 14
1824
+ Font size of labels.
1825
+ fps : int, default 25
1826
+ Frame-rate of the video.
1827
+ video_codec : str, optional
1828
+ Video codec opf the video.
1829
+ audio_codec : str, optional
1830
+ Audio codec opf the video.
1831
+ overwrite : bool, default False
1832
+ Whether to overwrite existing video files with the same path as the output video.
1833
+ only_cmd : bool, default False
1834
+ Whether to skip encoding and only return the full command generated from the specified options.
1835
+ verbose : bool, default True
1836
+ Whether to display ffmpeg processing info.
1837
+
1838
+ Returns
1839
+ -------
1840
+ str or None
1841
+ Encoding command as a string if ``only_cmd = True``.
1842
+
1843
+ </details>
1844
+
1845
+ #### Multiple Files with CLI
1846
+ Transcribe multiple audio files then process the results directly into SRT files.
1847
+ ```commandline
1848
+ stable-ts audio1.mp3 audio2.mp3 audio3.mp3 -o audio1.srt audio2.srt audio3.srt
1849
+ ```
1850
+
1851
+ ### Any ASR
1852
+ You can use most of the features of Stable-ts to improve the results of any ASR model/API.
1853
+ [Just follow this notebook](https://github.com/jianfch/stable-ts/blob/main/examples/non-whisper.ipynb).
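+
+ In short (a minimal sketch distilled from that notebook): wrap the ASR as a function that takes `audio` and returns data in a supported mapping, then pass it to `transcribe_any()`:
+ ```python
+ import stable_whisper
+
+ def inference(audio, **kwargs) -> list:
+     # run any ASR model/API on `audio` and return its output as
+     # segments of word dicts (word timings may be omitted)
+     return [[{'word': ' Hello', 'start': 0.0, 'end': 0.5}]]
+
+ result = stable_whisper.transcribe_any(inference, 'audio.mp3', vad=True)
+ ```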
1854
+
1855
+ ## Quick 1.X → 2.X Guide
1856
+ ### What's new in 2.0.0?
1857
+ - updated to use Whisper's more reliable word-level timestamps method.
1858
+ - the more reliable word timestamps allow regrouping all words into segments with more natural boundaries.
1859
+ - can now suppress silence with [Silero VAD](https://github.com/snakers4/silero-vad) (requires PyTorch 1.12.0+)
1860
+ - non-VAD silence suppression is also more robust
1861
+ ### Usage changes
1862
+ - `results_to_sentence_srt(result, 'audio.srt')` → `result.to_srt_vtt('audio.srt', word_level=False)`
1863
+ - `results_to_word_srt(result, 'audio.srt')` → `result.to_srt_vtt('output.srt', segment_level=False)`
1864
+ - `results_to_sentence_word_ass(result, 'audio.srt')` → `result.to_ass('output.ass')`
1865
+ - there's no need to stabilize segments after inference because they're already stabilized during inference
1866
+ - `transcribe()` returns a `WhisperResult` object which can be converted to `dict` with `.to_dict()`, e.g. `result.to_dict()`
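+
+ A minimal 2.X sketch of the equivalents above:
+ ```python
+ result = model.transcribe('audio.mp3')            # segments already stabilized
+ result.to_srt_vtt('audio.srt', word_level=False)  # 1.X: results_to_sentence_srt
+ data = result.to_dict()                           # plain dict of the result
+ ```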
1867
+
1868
+ ## License
1869
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
1870
+
1871
+ ## Acknowledgments
1872
+ Includes slight modification of the original work: [Whisper](https://github.com/openai/whisper)
examples/non-whisper.ipynb ADDED
@@ -0,0 +1,425 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "13dc05a3-de12-4d7a-a926-e99d6d97826e",
6
+ "metadata": {},
7
+ "source": [
8
+ "## Using Stable-ts with any ASR"
9
+ ]
10
+ },
11
+ {
12
+ "cell_type": "code",
13
+ "execution_count": null,
14
+ "id": "5cfee322-ebca-4c23-87a4-a109a2f85203",
15
+ "metadata": {},
16
+ "outputs": [],
17
+ "source": [
18
+ "import stable_whisper\n",
19
+ "assert int(stable_whisper.__version__.replace('.', '')) >= 270, f\"Requires Stable-ts 2.7.0+. Current version is {stable_whisper.__version__}.\""
20
+ ]
21
+ },
22
+ {
23
+ "cell_type": "markdown",
24
+ "id": "e6c2dab2-f4df-46f9-b2e8-94dd88522c7d",
25
+ "metadata": {},
26
+ "source": [
27
+ "<br />\n",
28
+ "\n",
29
+ "Stable-ts can be used for other ASR models or web APIs by wrapping them as a function then passing it as the first argument to `non_whisper.transcribe_any()`."
30
+ ]
31
+ },
32
+ {
33
+ "cell_type": "code",
34
+ "execution_count": 2,
35
+ "id": "7d32fa9f-a54c-4996-97c3-3b360230d029",
36
+ "metadata": {
37
+ "tags": []
38
+ },
39
+ "outputs": [],
40
+ "source": [
41
+ "def inference(audio, **kwargs) -> dict:\n",
42
+ " # run model/API \n",
43
+ " # return data as a dictionary\n",
44
+ " data = {}\n",
45
+ " return data"
46
+ ]
47
+ },
48
+ {
49
+ "cell_type": "markdown",
50
+ "id": "856ef1fd-f489-42af-a90c-97323fd05a6b",
51
+ "metadata": {},
52
+ "source": [
53
+ "The data returned by the function must be one of the following:\n",
54
+ "- an instance of `WhisperResult` containing the data\n",
55
+ "- a dictionary in an appropriate mapping\n",
56
+ "- a path of JSON file containing data in an appropriate mapping"
57
+ ]
58
+ },
59
+ {
60
+ "cell_type": "markdown",
61
+ "id": "bbdebdad-af1d-4077-8e99-20e767a0fd91",
62
+ "metadata": {},
63
+ "source": [
64
+ "Here are the 3 types of mappings:"
65
+ ]
66
+ },
67
+ {
68
+ "cell_type": "code",
69
+ "execution_count": 3,
70
+ "id": "06bc4ce7-5117-4674-8eb9-c343c13c18bc",
71
+ "metadata": {},
72
+ "outputs": [],
73
+ "source": [
74
+ "#1:\n",
75
+ "essential_mapping = [\n",
76
+ " [ # 1st Segment\n",
77
+ " {'word': ' And', 'start': 0.0, 'end': 1.28}, \n",
78
+ " {'word': ' when', 'start': 1.28, 'end': 1.52}, \n",
79
+ " {'word': ' no', 'start': 1.52, 'end': 2.26}, \n",
80
+ " {'word': ' ocean,', 'start': 2.26, 'end': 2.68},\n",
81
+ " {'word': ' mountain,', 'start': 3.28, 'end': 3.58}\n",
82
+ " ], \n",
83
+ " [ # 2nd Segment\n",
84
+ " {'word': ' or', 'start': 4.0, 'end': 4.08}, \n",
85
+ " {'word': ' sky', 'start': 4.08, 'end': 4.56}, \n",
86
+ " {'word': ' could', 'start': 4.56, 'end': 4.84}, \n",
87
+ " {'word': ' contain', 'start': 4.84, 'end': 5.26}, \n",
88
+ " {'word': ' us,', 'start': 5.26, 'end': 6.27},\n",
89
+ " {'word': ' our', 'start': 6.27, 'end': 6.58}, \n",
90
+ " {'word': ' gaze', 'start': 6.58, 'end': 6.98}, \n",
91
+ " {'word': ' hungered', 'start': 6.98, 'end': 7.88}, \n",
92
+ " {'word': ' starward.', 'start': 7.88, 'end': 8.64}\n",
93
+ " ]\n",
94
+ "]"
95
+ ]
96
+ },
97
+ {
98
+ "cell_type": "markdown",
99
+ "id": "b53bd812-2838-4f47-ab5f-5e729801aaee",
100
+ "metadata": {},
101
+ "source": [
102
+ "<br />\n",
103
+ "\n",
104
+ "If word timings are not available they can be omitted, but operations that can be performed on this data will be limited."
105
+ ]
106
+ },
107
+ {
108
+ "cell_type": "code",
109
+ "execution_count": 4,
110
+ "id": "8c6bf720-5bfd-4e79-90e7-7049a2ca1d3a",
111
+ "metadata": {},
112
+ "outputs": [],
113
+ "source": [
114
+ "#2:\n",
115
+ "no_word_mapping = [\n",
116
+ " {\n",
117
+ " 'start': 0.0, \n",
118
+ " 'end': 3.58, \n",
119
+ " 'text': ' And when no ocean, mountain,',\n",
120
+ " }, \n",
121
+ " {\n",
122
+ " 'start': 4.0, \n",
123
+ " 'end': 8.64, \n",
124
+ " 'text': ' or sky could contain us, our gaze hungered starward.', \n",
125
+ " }\n",
126
+ "]"
127
+ ]
128
+ },
129
+ {
130
+ "cell_type": "markdown",
131
+ "id": "108e960f-8bd1-4d2a-92bf-cc8cb56f4615",
132
+ "metadata": {},
133
+ "source": [
134
+ "<br />\n",
135
+ "\n",
136
+ "Below is the full mapping for normal Stable-ts results. `None` takes the place of any omitted values except for `start`, `end`, and `text`/`word` which are required."
137
+ ]
138
+ },
139
+ {
140
+ "cell_type": "code",
141
+ "execution_count": 5,
142
+ "id": "2969aad2-c8bf-4043-8015-669a3102e158",
143
+ "metadata": {},
144
+ "outputs": [],
145
+ "source": [
146
+ "#3:\n",
147
+ "full_mapping = {\n",
148
+ " 'language': 'en',\n",
149
+ " 'text': ' And when no ocean, mountain, or sky could contain us, our gaze hungered starward.', \n",
150
+ " 'segments': [\n",
151
+ " {\n",
152
+ " 'seek': 0.0, \n",
153
+ " 'start': 0.0, \n",
154
+ " 'end': 3.58, \n",
155
+ " 'text': ' And when no ocean, mountain,', \n",
156
+ " 'tokens': [400, 562, 572, 7810, 11, 6937, 11], \n",
157
+ " 'temperature': 0.0, \n",
158
+ " 'avg_logprob': -0.48702024376910663, \n",
159
+ " 'compression_ratio': 1.0657894736842106, \n",
160
+ " 'no_speech_prob': 0.3386174440383911, \n",
161
+ " 'id': 0, \n",
162
+ " 'words': [\n",
163
+ " {'word': ' And', 'start': 0.04, 'end': 1.28, 'probability': 0.6481522917747498, 'tokens': [400]}, \n",
164
+ " {'word': ' when', 'start': 1.28, 'end': 1.52, 'probability': 0.9869539141654968, 'tokens': [562]}, \n",
165
+ " {'word': ' no', 'start': 1.52, 'end': 2.26, 'probability': 0.57384192943573, 'tokens': [572]}, \n",
166
+ " {'word': ' ocean,', 'start': 2.26, 'end': 2.68, 'probability': 0.9484889507293701, 'tokens': [7810, 11]},\n",
167
+ " {'word': ' mountain,', 'start': 3.28, 'end': 3.58, 'probability': 0.9581122398376465, 'tokens': [6937, 11]}\n",
168
+ " ]\n",
169
+ " }, \n",
170
+ " {\n",
171
+ " 'seek': 0.0, \n",
172
+ " 'start': 4.0, \n",
173
+ " 'end': 8.64, \n",
174
+ " 'text': ' or sky could contain us, our gaze hungered starward.', \n",
175
+ " 'tokens': [420, 5443, 727, 5304, 505, 11, 527, 24294, 5753, 4073, 3543, 1007, 13], \n",
176
+ " 'temperature': 0.0, \n",
177
+ " 'avg_logprob': -0.48702024376910663, \n",
178
+ " 'compression_ratio': 1.0657894736842106, \n",
179
+ " 'no_speech_prob': 0.3386174440383911, \n",
180
+ " 'id': 1, \n",
181
+ " 'words': [\n",
182
+ " {'word': ' or', 'start': 4.0, 'end': 4.08, 'probability': 0.9937937259674072, 'tokens': [420]}, \n",
183
+ " {'word': ' sky', 'start': 4.08, 'end': 4.56, 'probability': 0.9950089454650879, 'tokens': [5443]}, \n",
184
+ " {'word': ' could', 'start': 4.56, 'end': 4.84, 'probability': 0.9915681481361389, 'tokens': [727]}, \n",
185
+ " {'word': ' contain', 'start': 4.84, 'end': 5.26, 'probability': 0.898974597454071, 'tokens': [5304]}, \n",
186
+ " {'word': ' us,', 'start': 5.26, 'end': 6.27, 'probability': 0.999351441860199, 'tokens': [505, 11]},\n",
187
+ " {'word': ' our', 'start': 6.27, 'end': 6.58, 'probability': 0.9634224772453308, 'tokens': [527]}, \n",
188
+ " {'word': ' gaze', 'start': 6.58, 'end': 6.98, 'probability': 0.8934874534606934, 'tokens': [24294]}, \n",
189
+ " {'word': ' hungered', 'start': 6.98, 'end': 7.88, 'probability': 0.7424876093864441, 'tokens': [5753, 4073]}, \n",
190
+ " {'word': ' starward.', 'start': 7.88, 'end': 8.64, 'probability': 0.464096799492836, 'tokens': [3543, 1007, 13]}\n",
191
+ " ]\n",
192
+ " }\n",
193
+ " ]\n",
194
+ "}"
195
+ ]
196
+ },
197
+ {
198
+ "cell_type": "markdown",
199
+ "id": "49d136e4-0f7d-4dcf-84f9-efb6f0eda491",
200
+ "metadata": {},
201
+ "source": [
202
+ "<br />\n",
203
+ "\n",
204
+ "The function must also have `audio` as a parameter."
205
+ ]
206
+ },
207
+ {
208
+ "cell_type": "code",
209
+ "execution_count": 6,
210
+ "id": "33f03286-69f9-4ae1-aec0-250fd92a8cb6",
211
+ "metadata": {
212
+ "tags": []
213
+ },
214
+ "outputs": [],
215
+ "source": [
216
+ "def inference(audio, **kwargs) -> dict:\n",
217
+ " # run model/API on the audio\n",
218
+ " # return data in a proper format\n",
219
+ " return essential_mapping"
220
+ ]
221
+ },
222
+ {
223
+ "cell_type": "code",
224
+ "execution_count": 7,
225
+ "id": "d6710eb5-5386-42cf-b6e7-02a84b5fad40",
226
+ "metadata": {
227
+ "tags": []
228
+ },
229
+ "outputs": [],
230
+ "source": [
231
+ "result = stable_whisper.transcribe_any(inference, './demo.wav', vad=True)"
232
+ ]
233
+ },
234
+ {
235
+ "cell_type": "code",
236
+ "execution_count": 8,
237
+ "id": "6d7f9de6-5c9b-4c73-808d-640b13efb051",
238
+ "metadata": {},
239
+ "outputs": [
240
+ {
241
+ "name": "stdout",
242
+ "output_type": "stream",
243
+ "text": [
244
+ "0\n",
245
+ "00:00:01,122 --> 00:00:02,680\n",
246
+ "And when no ocean,\n",
247
+ "\n",
248
+ "1\n",
249
+ "00:00:03,280 --> 00:00:03,580\n",
250
+ "mountain,\n",
251
+ "\n",
252
+ "2\n",
253
+ "00:00:04,000 --> 00:00:06,046\n",
254
+ "or sky could contain us,\n",
255
+ "\n",
256
+ "3\n",
257
+ "00:00:06,402 --> 00:00:08,640\n",
258
+ "our gaze hungered starward.\n"
259
+ ]
260
+ }
261
+ ],
262
+ "source": [
263
+ "print(result.to_srt_vtt(word_level=False))"
264
+ ]
265
+ },
266
+ {
267
+ "cell_type": "code",
268
+ "execution_count": 9,
269
+ "id": "be5a45e8-1b25-4a70-9af6-94bc5379fc7d",
270
+ "metadata": {},
271
+ "outputs": [
272
+ {
273
+ "name": "stdout",
274
+ "output_type": "stream",
275
+ "text": [
276
+ "\n",
277
+ " Transcribe an audio file using any ASR system.\n",
278
+ "\n",
279
+ " Parameters\n",
280
+ " ----------\n",
281
+ " inference_func: Callable\n",
282
+ " Function that runs ASR when provided the [audio] and return data in the appropriate format.\n",
283
+ " For format examples: https://github.com/jianfch/stable-ts/blob/main/examples/non-whisper.ipynb\n",
284
+ "\n",
285
+ " audio: Union[str, np.ndarray, torch.Tensor, bytes]\n",
286
+ " The path/URL to the audio file, the audio waveform, or bytes of audio file.\n",
287
+ "\n",
288
+ " audio_type: str\n",
289
+ " The type that [audio] needs to be for [inference_func]. (Default: Same type as [audio])\n",
290
+ "\n",
291
+ " Types:\n",
292
+ " None (default)\n",
293
+ " same type as [audio]\n",
294
+ "\n",
295
+ " 'str'\n",
296
+ " a path to the file\n",
297
+ " -if [audio] is a file and not audio preprocessing is done,\n",
298
+ " [audio] will be directly passed into [inference_func]\n",
299
+ " -if audio preprocessing is performed (from [demucs] and/or [only_voice_freq]),\n",
300
+ " the processed audio will be encoded into [temp_file] and then passed into [inference_func]\n",
301
+ "\n",
302
+ " 'byte'\n",
303
+ " bytes (used for APIs or to avoid writing any data to hard drive)\n",
304
+ " -if [audio] is file, the bytes of file is used\n",
305
+ " -if [audio] PyTorch tensor or NumPy array, the bytes of the [audio] encoded into WAV format is used\n",
306
+ "\n",
307
+ " 'torch'\n",
308
+ " a PyTorch tensor containing the audio waveform, in float32 dtype, on CPU\n",
309
+ "\n",
310
+ " 'numpy'\n",
311
+ " a NumPy array containing the audio waveform, in float32 dtype\n",
312
+ "\n",
313
+ " input_sr: int\n",
314
+ " The sample rate of [audio]. (Default: Auto-detected if [audio] is str/bytes)\n",
315
+ "\n",
316
+ " model_sr: int\n",
317
+ " The sample rate to resample the audio into for [inference_func]. (Default: Same as [input_sr])\n",
318
+ " Resampling is only performed when [model_sr] do not match the sample rate of the final audio due to:\n",
319
+ " -[input_sr] not matching\n",
320
+ " -sample rate changed due to audio preprocessing from [demucs]=True\n",
321
+ "\n",
322
+ " inference_kwargs: dict\n",
323
+ " Dictionary of arguments provided to [inference_func]. (Default: None)\n",
324
+ "\n",
325
+ " temp_file: str\n",
326
+ " Temporary path for the preprocessed audio when [audio_type]='str'. (Default: './_temp_stable-ts_audio_.wav')\n",
327
+ "\n",
328
+ " verbose: bool\n",
329
+ " Whether to display the text being decoded to the console. If True, displays all the details,\n",
330
+ " If False, displays progressbar. If None, does not display anything (Default: False)\n",
331
+ "\n",
332
+ " regroup: Union[bool, str]\n",
333
+ " Whether to regroup all words into segments with more natural boundaries. (Default: True)\n",
334
+ " Specify string for customizing the regrouping algorithm.\n",
335
+ " Ignored if [word_timestamps]=False.\n",
336
+ "\n",
337
+ " suppress_silence: bool\n",
338
+ " Whether to suppress timestamp where audio is silent at segment-level\n",
339
+ " and word-level if [suppress_word_ts]=True. (Default: True)\n",
340
+ "\n",
341
+ " suppress_word_ts: bool\n",
342
+ " Whether to suppress timestamps, if [suppress_silence]=True, where audio is silent at word-level. (Default: True)\n",
343
+ "\n",
344
+ " q_levels: int\n",
345
+ " Quantization levels for generating timestamp suppression mask; ignored if [vad]=true. (Default: 20)\n",
346
+ " Acts as a threshold to marking sound as silent.\n",
347
+ " Fewer levels will increase the threshold of volume at which to mark a sound as silent.\n",
348
+ "\n",
349
+ " k_size: int\n",
350
+ " Kernel size for avg-pooling waveform to generate timestamp suppression mask; ignored if [vad]=true. (Default: 5)\n",
351
+ " Recommend 5 or 3; higher sizes will reduce detection of silence.\n",
352
+ "\n",
353
+ " demucs: bool\n",
354
+ " Whether to preprocess the audio track with Demucs to isolate vocals/remove noise. (Default: False)\n",
355
+ " Demucs must be installed to use. Official repo: https://github.com/facebookresearch/demucs\n",
356
+ "\n",
357
+ " demucs_device: str\n",
358
+ " Device to use for demucs: 'cuda' or 'cpu'. (Default. 'cuda' if torch.cuda.is_available() else 'cpu')\n",
359
+ "\n",
360
+ " demucs_output: str\n",
361
+ " Path to save the vocals isolated by Demucs as WAV file. Ignored if [demucs]=False.\n",
362
+ " Demucs must be installed to use. Official repo: https://github.com/facebookresearch/demucs\n",
363
+ "\n",
364
+ " vad: bool\n",
365
+ " Whether to use Silero VAD to generate timestamp suppression mask. (Default: False)\n",
366
+ " Silero VAD requires PyTorch 1.12.0+. Official repo: https://github.com/snakers4/silero-vad\n",
367
+ "\n",
368
+ " vad_threshold: float\n",
369
+ " Threshold for detecting speech with Silero VAD. (Default: 0.35)\n",
370
+ " Low threshold reduces false positives for silence detection.\n",
371
+ "\n",
372
+ " vad_onnx: bool\n",
373
+ " Whether to use ONNX for Silero VAD. (Default: False)\n",
374
+ "\n",
375
+ " min_word_dur: float\n",
376
+ " Only allow suppressing timestamps that result in word durations greater than this value. (default: 0.1)\n",
377
+ "\n",
378
+ " only_voice_freq: bool\n",
379
+ " Whether to only use sound between 200 - 5000 Hz, where majority of human speech are. (Default: False)\n",
380
+ "\n",
381
+ " only_ffmpeg: bool\n",
382
+ " Whether to use only FFmpeg (and not yt-dlp) for URls. (Default: False)\n",
383
+ "\n",
384
+ " Returns\n",
385
+ " -------\n",
386
+ " An instance of WhisperResult.\n",
387
+ " \n"
388
+ ]
389
+ }
390
+ ],
391
+ "source": [
392
+ "print(stable_whisper.transcribe_any.__doc__)"
393
+ ]
394
+ },
395
+ {
396
+ "cell_type": "code",
397
+ "execution_count": null,
398
+ "id": "a99ee627-6ab4-411d-ba27-d372d3647593",
399
+ "metadata": {},
400
+ "outputs": [],
401
+ "source": []
402
+ }
403
+ ],
404
+ "metadata": {
405
+ "kernelspec": {
406
+ "display_name": "Python 3 (ipykernel)",
407
+ "language": "python",
408
+ "name": "python3"
409
+ },
410
+ "language_info": {
411
+ "codemirror_mode": {
412
+ "name": "ipython",
413
+ "version": 3
414
+ },
415
+ "file_extension": ".py",
416
+ "mimetype": "text/x-python",
417
+ "name": "python",
418
+ "nbconvert_exporter": "python",
419
+ "pygments_lexer": "ipython3",
420
+ "version": "3.8.15"
421
+ }
422
+ },
423
+ "nbformat": 4,
424
+ "nbformat_minor": 5
425
+ }
setup.py ADDED
@@ -0,0 +1,40 @@
1
+ import os
2
+ from setuptools import setup
3
+
4
+
5
+ def version() -> str:
6
+ with open(os.path.join(os.path.dirname(__file__), 'stable_whisper/_version.py')) as f:
7
+ return f.read().split('=')[-1].strip().strip('"').strip("'")
8
+
9
+
10
+ def read_me() -> str:
11
+ with open('README.md', 'r', encoding='utf-8') as f:
12
+ return f.read()
13
+
14
+
15
+ setup(
16
+ name="stable-ts",
17
+ version=version(),
18
+ description="Modifies OpenAI's Whisper to produce more reliable timestamps.",
19
+ long_description=read_me(),
20
+ long_description_content_type='text/markdown',
21
+ python_requires=">=3.8",
22
+ author="Jian",
23
+ url="https://github.com/jianfch/stable-ts",
24
+ license="MIT",
25
+ packages=['stable_whisper'],
26
+ install_requires=[
27
+ "numpy",
28
+ "torch",
29
+ "torchaudio",
30
+ "tqdm",
31
+ "more-itertools",
32
+ "transformers>=4.19.0",
33
+ "ffmpeg-python==0.2.0",
34
+ "openai-whisper==20231117"
35
+ ],
36
+ entry_points={
37
+ "console_scripts": ["stable-ts=stable_whisper.whisper_word_level:cli"],
38
+ },
39
+ include_package_data=False
40
+ )
silence_suppresion0.png ADDED
silence_suppresion1.png ADDED
stable_whisper/__init__.py ADDED
@@ -0,0 +1,8 @@
1
+ from .whisper_word_level import *
2
+ from .result import *
3
+ from .text_output import *
4
+ from .video_output import *
5
+ from .stabilization import visualize_suppression
6
+ from .non_whisper import transcribe_any
7
+ from ._version import __version__
8
+ from .whisper_compatibility import _required_whisper_ver, _COMPATIBLE_WHISPER_VERSIONS
stable_whisper/__main__.py ADDED
@@ -0,0 +1,3 @@
1
+ from .whisper_word_level import cli
2
+
3
+ cli()
stable_whisper/_version.py ADDED
@@ -0,0 +1 @@
1
+ __version__ = "2.14.3"
stable_whisper/alignment.py ADDED
@@ -0,0 +1,1265 @@
1
+ import copy
2
+ import re
3
+ import warnings
4
+
5
+ import torch
6
+ import numpy as np
7
+ from tqdm import tqdm
8
+ from typing import TYPE_CHECKING, Union, List, Callable, Optional, Tuple
9
+
10
+ import whisper
11
+ from whisper.audio import (
12
+ SAMPLE_RATE, N_FRAMES, N_SAMPLES, N_FFT, pad_or_trim, log_mel_spectrogram, FRAMES_PER_SECOND, CHUNK_LENGTH
13
+ )
14
+
15
+ from .result import WhisperResult, Segment
16
+ from .timing import add_word_timestamps_stable, split_word_tokens
17
+ from .audio import prep_audio
18
+ from .utils import safe_print, format_timestamp
19
+ from .whisper_compatibility import warn_compatibility_issues, get_tokenizer
20
+ from .stabilization import get_vad_silence_func, wav2mask, mask2timing
21
+
22
+ if TYPE_CHECKING:
23
+ from whisper.model import Whisper
24
+
25
+ __all__ = ['align', 'refine', 'locate']
26
+
27
+
28
+ def align(
29
+ model: "Whisper",
30
+ audio: Union[str, np.ndarray, torch.Tensor, bytes],
31
+ text: Union[str, List[int], WhisperResult],
32
+ language: str = None,
33
+ *,
34
+ verbose: Optional[bool] = False,
35
+ regroup: bool = True,
36
+ suppress_silence: bool = True,
37
+ suppress_word_ts: bool = True,
38
+ use_word_position: bool = True,
39
+ min_word_dur: bool = 0.1,
40
+ nonspeech_error: float = 0.3,
41
+ q_levels: int = 20,
42
+ k_size: int = 5,
43
+ vad: bool = False,
44
+ vad_threshold: float = 0.35,
45
+ vad_onnx: bool = False,
46
+ demucs: Union[bool, torch.nn.Module] = False,
47
+ demucs_output: str = None,
48
+ demucs_options: dict = None,
49
+ only_voice_freq: bool = False,
50
+ prepend_punctuations: str = "\"'“¿([{-",
51
+ append_punctuations: str = "\"'.。,,!!??::”)]}、",
52
+ progress_callback: Callable = None,
53
+ ignore_compatibility: bool = False,
54
+ remove_instant_words: bool = False,
55
+ token_step: int = 100,
56
+ original_split: bool = False,
57
+ word_dur_factor: Optional[float] = 2.0,
58
+ max_word_dur: Optional[float] = 3.0,
59
+ nonspeech_skip: Optional[float] = 3.0,
60
+ fast_mode: bool = False,
61
+ tokenizer: "Tokenizer" = None
62
+ ) -> Union[WhisperResult, None]:
63
+ """
64
+ Align plain text or tokens with audio at word-level.
65
+
66
+ Since this is significantly faster than transcribing, it is a more efficient method for testing various settings
67
+ without re-transcribing. This is also useful for timing a more correct transcript than one that Whisper can produce.
68
+
69
+ Parameters
70
+ ----------
71
+ model : "Whisper"
72
+ The modified instance of the Whisper ASR model.
73
+ audio : str or numpy.ndarray or torch.Tensor or bytes
74
+ Path/URL to the audio file, the audio waveform, or bytes of audio file.
75
+ If audio is :class:`numpy.ndarray` or :class:`torch.Tensor`, the audio must already be sampled at 16kHz.
76
+ text : str or list of int or stable_whisper.result.WhisperResult
77
+ String of plain-text, list of tokens, or instance of :class:`stable_whisper.result.WhisperResult`.
78
+ language : str, default None, uses ``language`` in ``text`` if it is a :class:`stable_whisper.result.WhisperResult`
79
+ Language of ``text``. Required if ``text`` does not contain ``language``.
80
+ remove_instant_words : bool, default False
81
+ Whether to truncate any words with zero duration.
82
+ token_step : int, default 100
83
+ Max number of tokens to align in each pass. Use higher values to reduce the chance of misalignment.
84
+ original_split : bool, default False
85
+ Whether to preserve the original segment groupings. Segments are split by line breaks if ``text`` is plain-text.
86
+ max_word_dur : float or None, default 3.0
87
+ Global maximum word duration in seconds. Re-align words that exceed the global maximum word duration.
88
+ word_dur_factor : float or None, default 2.0
89
+ Factor to compute the local maximum word duration, which is ``word_dur_factor`` * the local median word duration.
90
+ Words that need re-alignment are re-aligned with duration <= the local/global maximum word duration.
91
+ nonspeech_skip : float or None, default 3.0
92
+ Skip non-speech sections that are equal to or longer than this duration in seconds. Disable skipping if ``None``.
93
+ fast_mode : bool, default False
94
+ Whether to speed up alignment by re-alignment with local/global maximum word duration.
95
+ ``True`` tends to produce better timestamps when ``text`` is accurate and there are no large speechless gaps.
96
+ tokenizer : "Tokenizer", default None, meaning a new tokenizer is created according ``language`` and ``model``
97
+ A tokenizer used to tokenize text and detokenize tokens.
98
+ verbose : bool or None, default False
99
+ Whether to display the text being decoded to the console.
100
+ Displays all the details if ``True``. Displays a progress bar if ``False``. Displays nothing if ``None``.
101
+ regroup : bool or str, default True, meaning the default regroup algorithm
102
+ String for customizing the regrouping algorithm. False disables regrouping.
103
+ Ignored if ``word_timestamps = False``.
104
+ suppress_silence : bool, default True
105
+ Whether to enable timestamps adjustments based on the detected silence.
106
+ suppress_word_ts : bool, default True
107
+ Whether to adjust word timestamps based on the detected silence. Only enabled if ``suppress_silence = True``.
108
+ use_word_position : bool, default True
109
+ Whether to use position of the word in its segment to determine whether to keep end or start timestamps if
110
+ adjustments are required. If it is the first word, keep the end; else if it is the last word, keep the start.
111
+ q_levels : int, default 20
112
+ Quantization levels for generating the timestamp suppression mask; ignored if ``vad = True``.
113
+ Acts as a threshold for marking sound as silent.
114
+ Fewer levels will increase the threshold of volume at which to mark a sound as silent.
115
+ k_size : int, default 5
116
+ Kernel size for avg-pooling the waveform to generate the timestamp suppression mask; ignored if ``vad = True``.
117
+ Recommend 5 or 3; higher sizes will reduce detection of silence.
118
+ demucs : bool or torch.nn.Module, default False
119
+ Whether to preprocess ``audio`` with Demucs to isolate vocals / remove noise. Set ``demucs`` to an instance of
120
+ a Demucs model to avoid reloading the model for each run.
121
+ Demucs must be installed to use. Official repo, https://github.com/facebookresearch/demucs.
122
+ demucs_output : str, optional
123
+ Path to save the vocals isolated by Demucs as WAV file. Ignored if ``demucs = False``.
124
+ Demucs must be installed to use. Official repo, https://github.com/facebookresearch/demucs.
125
+ demucs_options : dict, optional
126
+ Options to use for :func:`stable_whisper.audio.demucs_audio`.
127
+ vad : bool, default False
128
+ Whether to use Silero VAD to generate timestamp suppression mask.
129
+ Silero VAD requires PyTorch 1.12.0+. Official repo, https://github.com/snakers4/silero-vad.
130
+ vad_threshold : float, default 0.35
131
+ Threshold for detecting speech with Silero VAD. Low threshold reduces false positives for silence detection.
132
+ vad_onnx : bool, default False
133
+ Whether to use ONNX for Silero VAD.
134
+ min_word_dur : float, default 0.1
135
+ Shortest duration each word is allowed to reach for silence suppression.
136
+ nonspeech_error : float, default 0.3
137
+ Relative error of non-speech sections that appear in between a word for silence suppression.
138
+ only_voice_freq : bool, default False
139
+ Whether to only use sound between 200-5000 Hz, where the majority of human speech is.
140
+ prepend_punctuations : str, default '"'“¿([{-'
141
+ Punctuations to prepend to next word.
142
+ append_punctuations : str, default '.。,,!!??::”)]}、'
143
+ Punctuations to append to previous word.
144
+ progress_callback : Callable, optional
145
+ A function that will be called when transcription progress is updated.
146
+ The callback needs two parameters.
147
+ The first parameter is a float for seconds of the audio that has been transcribed.
148
+ The second parameter is a float for total duration of audio in seconds.
149
+ ignore_compatibility : bool, default False
150
+ Whether to ignore warnings for compatibility issues with the detected Whisper version.
151
+
152
+ Returns
153
+ -------
154
+ stable_whisper.result.WhisperResult or None
155
+ All timestamps, words, probabilities, and other data from the alignment of ``audio``. Returns None if alignment
156
+ fails and ``remove_instant_words = True``.
157
+
158
+ Notes
159
+ -----
160
+ If ``token_step`` is less than 1, ``token_step`` will be set to its maximum value, 442. This value is computed with
161
+ ``whisper.model.Whisper.dims.n_text_ctx`` - 6.
162
+
163
+ If ``original_split = True`` and a line break is found in the middle of a word in ``text``, the split will occur after
164
+ that word.
165
+
166
+ ``regroup`` is ignored if ``original_split = True``.
167
+
168
+ Examples
169
+ --------
170
+ >>> import stable_whisper
171
+ >>> model = stable_whisper.load_model('base')
172
+ >>> result = model.align('helloworld.mp3', 'Hello, World!', 'English')
173
+ >>> result.to_srt_vtt('helloworld.srt')
174
+ Saved 'helloworld.srt'
175
+ """
176
+ is_faster_model = model.__module__.startswith('faster_whisper.')
177
+ if demucs_options is None:
178
+ demucs_options = {}
179
+ if demucs_output:
180
+ if 'save_path' not in demucs_options:
181
+ demucs_options['save_path'] = demucs_output
182
+ warnings.warn('``demucs_output`` is deprecated. Use ``demucs_options`` with ``save_path`` instead. '
183
+ 'E.g. demucs_options=dict(save_path="demucs_output.mp3")',
184
+ DeprecationWarning, stacklevel=2)
185
+ max_token_step = (model.max_length if is_faster_model else model.dims.n_text_ctx) - 6
186
+ if token_step < 1:
187
+ token_step = max_token_step
188
+ elif token_step > max_token_step:
189
+ raise ValueError(f'The max value for [token_step] is {max_token_step} but got {token_step}.')
190
+
191
+ warn_compatibility_issues(whisper, ignore_compatibility)
192
+ split_indices_by_char = []
193
+ if isinstance(text, WhisperResult):
194
+ if language is None:
195
+ language = text.language
196
+ if original_split and len(text.segments) > 1 and text.has_words:
197
+ split_indices_by_char = np.cumsum([sum(len(w.word) for w in seg.words) for seg in text.segments])
198
+ text = text.all_tokens() if text.has_words and all(w.tokens for w in text.all_words()) else text.text
199
+ elif isinstance(text, str):
200
+ if original_split and '\n' in text:
201
+ text_split = [line if line.startswith(' ') else ' '+line for line in text.splitlines()]
202
+ split_indices_by_char = np.cumsum([len(seg) for seg in text_split])
203
+ text = ''.join(re.sub(r'\s', ' ', seg) for seg in text_split)
204
+ else:
205
+ text = re.sub(r'\s', ' ', text)
206
+ if not text.startswith(' '):
207
+ text = ' ' + text
208
+ if language is None:
209
+ raise TypeError('expected argument for language')
210
+ if tokenizer is None:
211
+ tokenizer = get_tokenizer(model, is_faster_model=is_faster_model, language=language, task='transcribe')
212
+ tokens = tokenizer.encode(text) if isinstance(text, str) else text
213
+ tokens = [t for t in tokens if t < tokenizer.eot]
214
+ _, (words, word_tokens), _ = split_word_tokens([dict(tokens=tokens)], tokenizer)
215
+
216
+ audio = prep_audio(
217
+ audio,
218
+ demucs=demucs,
219
+ demucs_options=demucs_options,
220
+ only_voice_freq=only_voice_freq,
221
+ verbose=verbose
222
+ )
223
+
224
+ sample_padding = int(N_FFT // 2) + 1
225
+ seek_sample = 0
226
+ total_samples = audio.shape[-1]
227
+ total_duration = round(total_samples / SAMPLE_RATE, 2)
228
+ total_words = len(words)
229
+
230
+ if is_faster_model:
231
+ def timestamp_words():
232
+ temp_segment = dict(
233
+ seek=0,
234
+ start=0.0,
235
+ end=round(segment_samples / model.feature_extractor.sampling_rate, 3),
236
+ tokens=[t for wt in curr_word_tokens for t in wt],
237
+ )
238
+ features = model.feature_extractor(audio_segment.numpy())
239
+ encoder_output = model.encode(features[:, : model.feature_extractor.nb_max_frames])
240
+
241
+ model.add_word_timestamps(
242
+ segments=[temp_segment],
243
+ tokenizer=tokenizer,
244
+ encoder_output=encoder_output,
245
+ num_frames=round(segment_samples / model.feature_extractor.hop_length),
246
+ prepend_punctuations=prepend_punctuations,
247
+ append_punctuations=append_punctuations,
248
+ last_speech_timestamp=temp_segment['start'],
249
+ )
250
+
251
+ cumsum_lens = np.cumsum([len(w) for w in curr_words]).tolist()
252
+ final_cumsum_lens = np.cumsum([len(w['word']) for w in temp_segment['words']]).tolist()
253
+
254
+ assert not (set(final_cumsum_lens) - set(cumsum_lens)), 'word mismatch'
255
+ prev_l_idx = 0
256
+ for w_idx, cs_len in enumerate(final_cumsum_lens):
257
+ temp_segment['words'][w_idx]['start'] = round(temp_segment['words'][w_idx]['start'] + time_offset, 3)
258
+ temp_segment['words'][w_idx]['end'] = round(temp_segment['words'][w_idx]['end'] + time_offset, 3)
259
+ l_idx = cumsum_lens.index(cs_len)+1
260
+ temp_segment['words'][w_idx]['tokens'] = [t for wt in curr_word_tokens[prev_l_idx:l_idx] for t in wt]
261
+ prev_l_idx = l_idx
262
+
263
+ return temp_segment
264
+
265
+ else:
266
+ def timestamp_words():
267
+ temp_segment = dict(
268
+ seek=time_offset,
269
+ tokens=(curr_words, curr_word_tokens)
270
+ )
271
+
272
+ mel_segment = log_mel_spectrogram(audio_segment, model.dims.n_mels, padding=sample_padding)
273
+ mel_segment = pad_or_trim(mel_segment, N_FRAMES).to(device=model.device)
274
+
275
+ add_word_timestamps_stable(
276
+ segments=[temp_segment],
277
+ model=model,
278
+ tokenizer=tokenizer,
279
+ mel=mel_segment,
280
+ num_samples=segment_samples,
281
+ split_callback=(lambda x, _: x),
282
+ prepend_punctuations=prepend_punctuations,
283
+ append_punctuations=append_punctuations,
284
+ gap_padding=None
285
+ )
286
+
287
+ return temp_segment
288
+
289
+ def get_curr_words():
290
+ nonlocal words, word_tokens
291
+ curr_tk_count = 0
292
+ w, wt = [], []
293
+ for _ in range(len(words)):
294
+ tk_count = len(word_tokens[0])
295
+ if curr_tk_count + tk_count > token_step and w:
296
+ break
297
+ w.append(words.pop(0))
298
+ wt.append(word_tokens.pop(0))
299
+ curr_tk_count += tk_count
300
+ return w, wt
301
+ result = []
302
+
303
+ nonspeech_timings = [[], []]
304
+ nonspeech_vad_timings = None
305
+ if (suppress_silence or nonspeech_skip is not None) and vad:
306
+ nonspeech_vad_timings = (
307
+ get_vad_silence_func(onnx=vad_onnx, verbose=verbose)(audio, speech_threshold=vad_threshold)
308
+ )
309
+ if nonspeech_vad_timings is not None:
310
+ nonspeech_timings = nonspeech_vad_timings[0].copy(), nonspeech_vad_timings[1].copy()
311
+
312
+ with tqdm(total=total_duration, unit='sec', disable=verbose is not False, desc='Align') as tqdm_pbar:
313
+
314
+ def update_pbar(finish: bool = False):
315
+ tqdm_pbar.update((total_duration if finish else min(round(last_ts, 2), total_duration)) - tqdm_pbar.n)
316
+ if progress_callback is not None:
317
+ progress_callback(seek=tqdm_pbar.n, total=tqdm_pbar.total)
318
+
319
+ def redo_words(_idx: int = None):
320
+ nonlocal seg_words, seg_tokens, seg_words, words, word_tokens, curr_words, temp_word
321
+ if curr_words and temp_word is not None:
322
+ assert curr_words[0]['word'] == temp_word['word']
323
+ if curr_words[0]['probability'] >= temp_word['probability']:
324
+ temp_word = curr_words[0]
325
+ if _idx is None: # redo all
326
+ words = seg_words + words
327
+ word_tokens = seg_tokens + word_tokens
328
+ curr_words = []
329
+ elif _idx != len(seg_words): # redo from _idx
330
+ words = seg_words[_idx:] + words
331
+ word_tokens = seg_tokens[_idx:] + word_tokens
332
+ curr_words = curr_words[:_idx]
333
+ if curr_words:
334
+ if temp_word is not None:
335
+ curr_words[0] = temp_word
336
+ temp_word = None
337
+ words = seg_words[_idx-1:_idx] + words
338
+ word_tokens = seg_tokens[_idx-1:_idx] + word_tokens
339
+ temp_word = curr_words.pop(-1)
340
+ else:
341
+ if temp_word is not None:
342
+ curr_words[0] = temp_word
343
+ temp_word = None
344
+
345
+ n_samples = model.feature_extractor.n_samples if is_faster_model else N_SAMPLES
346
+
347
+ temp_word = None
348
+
349
+ while words and seek_sample < total_samples:
350
+
351
+ time_offset = seek_sample / SAMPLE_RATE
352
+ seek_sample_end = seek_sample + n_samples
353
+ audio_segment = audio[seek_sample:seek_sample_end]
354
+ segment_samples = audio_segment.shape[-1]
355
+
356
+ if nonspeech_skip is not None:
357
+ segment_nonspeech_timings = None
358
+ if not vad:
359
+ ts_token_mask = wav2mask(audio_segment, q_levels=q_levels, k_size=k_size)
360
+ segment_nonspeech_timings = mask2timing(ts_token_mask, time_offset=time_offset)
361
+ if segment_nonspeech_timings is not None:
362
+ nonspeech_timings[0].extend(segment_nonspeech_timings[0])
363
+ nonspeech_timings[1].extend(segment_nonspeech_timings[1])
364
+ elif nonspeech_vad_timings:
365
+ timing_indices = np.logical_and(
366
+ nonspeech_vad_timings[1] > time_offset,
367
+ nonspeech_vad_timings[0] < time_offset + 30.0
368
+ )
369
+
370
+ if timing_indices.any():
371
+ segment_nonspeech_timings = (
372
+ nonspeech_vad_timings[0][timing_indices], nonspeech_vad_timings[1][timing_indices]
373
+ )
374
+ else:
375
+ segment_nonspeech_timings = None
376
+
377
+ if mn := timing_indices.argmax():
378
+ nonspeech_vad_timings = (nonspeech_vad_timings[0][mn:], nonspeech_vad_timings[1][mn:])
379
+
380
+ if segment_nonspeech_timings is not None:
381
+ # segment has no detectable speech
382
+ if (
383
+ (segment_nonspeech_timings[0][0] <= time_offset + min_word_dur) and
384
+ (segment_nonspeech_timings[1][0] >= time_offset + segment_samples - min_word_dur)
385
+ ):
386
+ seek_sample += segment_samples
387
+ continue
388
+
389
+ timing_indices = (segment_nonspeech_timings[1] - segment_nonspeech_timings[0]) >= nonspeech_skip
390
+ if any(timing_indices):
391
+ nonspeech_starts = segment_nonspeech_timings[0][timing_indices]
392
+ nonspeech_ends = segment_nonspeech_timings[1][timing_indices]
393
+
394
+ if round(time_offset, 3) >= nonspeech_starts[0]:
395
+ seek_sample = round(nonspeech_ends[0] * SAMPLE_RATE)
396
+ if seek_sample + (min_word_dur * SAMPLE_RATE) >= total_samples:
397
+ seek_sample = total_samples
398
+ continue
399
+ time_offset = seek_sample / SAMPLE_RATE
400
+
401
+ if len(nonspeech_starts) > 1:
402
+ seek_sample_end = (
403
+ seek_sample + round((nonspeech_starts[1] - nonspeech_ends[0]) * SAMPLE_RATE)
404
+ )
405
+ audio_segment = audio[seek_sample:seek_sample_end]
406
+ segment_samples = audio_segment.shape[-1]
407
+
408
+ curr_words, curr_word_tokens = get_curr_words()
409
+
410
+ segment = timestamp_words()
411
+ curr_words = segment['words']
412
+ seg_words = [w['word'] for w in curr_words]
413
+ seg_tokens = [w['tokens'] for w in curr_words]
414
+ durations = np.array([w['end'] - w['start'] for w in curr_words]).round(3)
415
+ nonzero_mask = durations > 0
416
+ nonzero_indices = np.flatnonzero(nonzero_mask)
417
+ if len(nonzero_indices):
418
+ redo_index = nonzero_indices[-1] + 1
419
+ if (
420
+ words and
421
+ redo_index > 1 and
422
+ curr_words[nonzero_indices[-1]]['end'] >= np.floor(time_offset + segment_samples / SAMPLE_RATE)
423
+ ):
424
+ nonzero_mask[nonzero_indices[-1]] = False
425
+ nonzero_indices = nonzero_indices[:-1]
426
+ redo_index = nonzero_indices[-1] + 1
427
+ med_dur = np.median(durations[:redo_index])
428
+
429
+ if fast_mode:
430
+ new_start = None
431
+ global_max_dur = None
432
+ else:
433
+ local_max_dur = round(med_dur * word_dur_factor, 3) if word_dur_factor else None
434
+ if max_word_dur:
435
+ local_max_dur = min(local_max_dur, max_word_dur) if local_max_dur else max_word_dur
436
+ global_max_dur = max_word_dur
437
+ else:
438
+ global_max_dur = local_max_dur or None
439
+ if global_max_dur and med_dur > global_max_dur:
440
+ med_dur = global_max_dur
441
+ if (
442
+ local_max_dur and durations[nonzero_indices[0]] > global_max_dur
443
+ ):
444
+ new_start = round(max(
445
+ curr_words[nonzero_indices[0]]['end'] - (med_dur * nonzero_indices[0] + local_max_dur),
446
+ curr_words[nonzero_indices[0]]['start']
447
+ ), 3)
448
+ if new_start <= time_offset:
449
+ new_start = None
450
+ else:
451
+ new_start = None
452
+ if new_start is None:
453
+ if global_max_dur:
454
+ index_offset = nonzero_indices[0] + 1
455
+ redo_indices = \
456
+ np.flatnonzero(durations[index_offset:redo_index] > global_max_dur) + index_offset
457
+ if len(redo_indices):
458
+ redo_index = redo_indices[0]
459
+ last_ts = curr_words[redo_index - 1]['end']
460
+ redo_words(redo_index)
461
+ else:
462
+ last_ts = new_start
463
+ redo_words()
464
+ seek_sample = round(last_ts * SAMPLE_RATE)
465
+ else:
466
+ seek_sample += audio_segment.shape[-1]
467
+ last_ts = round(seek_sample / SAMPLE_RATE, 2)
468
+ redo_words()
469
+
470
+ update_pbar()
471
+
472
+ result.extend(curr_words)
473
+
474
+ if verbose:
475
+ line = '\n'.join(
476
+ f"[{format_timestamp(word['start'])}] -> "
477
+ f"[{format_timestamp(word['end'])}] \"{word['word']}\""
478
+ for word in curr_words
479
+ )
480
+ safe_print(line)
481
+ update_pbar(True)
482
+
483
+ if temp_word is not None:
484
+ result.append(temp_word)
485
+ if not result:
486
+ warnings.warn('Failed to align text.', stacklevel=2)
487
+ elif words:
488
+ warnings.warn(f'Failed to align the last {len(words)}/{total_words} words after '
489
+ f'{format_timestamp(result[-1]["end"])}.', stacklevel=2)
490
+
491
+ if words and not remove_instant_words:
492
+ result.extend(
493
+ [
494
+ dict(word=w, start=total_duration, end=total_duration, probability=0.0, tokens=wt)
495
+ for w, wt in zip(words, word_tokens)
496
+ ]
497
+ )
498
+
499
+ if not result:
500
+ return
501
+
502
+ if len(split_indices_by_char):
503
+ word_lens = np.cumsum([[len(w['word']) for w in result]])
504
+ split_indices = [(word_lens >= i).nonzero()[0][0]+1 for i in split_indices_by_char]
505
+ result = WhisperResult([result[i:j] for i, j in zip([0]+split_indices[:-1], split_indices)])
506
+ else:
507
+ result = WhisperResult([result])
508
+
509
+ if suppress_silence:
510
+ result.suppress_silence(
511
+ *nonspeech_timings,
512
+ min_word_dur=min_word_dur,
513
+ word_level=suppress_word_ts,
514
+ nonspeech_error=nonspeech_error,
515
+ use_word_position=use_word_position
516
+ )
517
+ result.update_nonspeech_sections(*nonspeech_timings)
518
+ if not original_split:
519
+ result.regroup(regroup)
520
+
521
+ if fail_segs := len([None for s in result.segments if s.end-s.start <= 0]):
522
+ warnings.warn(f'{fail_segs}/{len(result.segments)} segments failed to align.', stacklevel=2)
523
+
524
+ return result
525
+
526
+
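A minimal sketch of calling ``align()`` with a corrected plain-text transcript; 'audio.mp3' and 'corrected.txt' are placeholder paths:

import stable_whisper

model = stable_whisper.load_model('base')
with open('corrected.txt', encoding='utf-8') as f:
    text = f.read()
# original_split=True keeps the transcript's line breaks as segment breaks,
# in which case regrouping is skipped (see Notes in the docstring above).
result = model.align('audio.mp3', text, language='en', original_split=True)
result.to_srt_vtt('corrected.srt')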
527
+ def refine(
528
+ model: "Whisper",
529
+ audio: Union[str, np.ndarray, torch.Tensor, bytes],
530
+ result: WhisperResult,
531
+ *,
532
+ steps: str = None,
533
+ rel_prob_decrease: float = .03,
534
+ abs_prob_decrease: float = .05,
535
+ rel_rel_prob_decrease: Optional[float] = None,
536
+ prob_threshold: float = .5,
537
+ rel_dur_change: Optional[float] = .5,
538
+ abs_dur_change: Optional[float] = None,
539
+ word_level: bool = True,
540
+ precision: float = None,
541
+ single_batch: bool = False,
542
+ inplace: bool = True,
543
+ demucs: Union[bool, torch.nn.Module] = False,
544
+ demucs_options: dict = None,
545
+ only_voice_freq: bool = False,
546
+ verbose: Optional[bool] = False
547
+ ) -> WhisperResult:
548
+ """
549
+ Improve existing timestamps.
550
+
551
+ This function iteratively mutes portions of the audio and monitors token probabilities to find the most precise
552
+ timestamps. This "most precise" in this case means the latest start and earliest end of a word that maintains an
553
+ acceptable probability determined by the specified arguments.
554
+
555
+ This is useful for readjusting timestamps that start too early or end too late.
556
+
557
+ Parameters
558
+ ----------
559
+ model : "Whisper"
560
+ The modified instance of the Whisper ASR model.
561
+ audio : str or numpy.ndarray or torch.Tensor or bytes
562
+ Path/URL to the audio file, the audio waveform, or bytes of audio file.
563
+ If audio is :class:`numpy.ndarray` or :class:`torch.Tensor`, the audio must already be sampled at 16kHz.
564
+ result : stable_whisper.result.WhisperResult
565
+ All timestamps, words, probabilities, and other data from the transcription of ``audio``.
566
+ steps : str, default 'se'
567
+ Instructions for refinement. An 's' means refine start-timestamps; an 'e' means refine end-timestamps.
568
+ rel_prob_decrease : float, default 0.03
569
+ Maximum percent decrease in probability relative to the original probability, which is the probability from muting
570
+ according to the initial timestamps.
571
+ abs_prob_decrease : float, default 0.05
572
+ Maximum decrease in probability from original probability.
573
+ rel_rel_prob_decrease : float, optional
574
+ Maximum percent decrease in probability relative to the previous probability, which is the probability from the
575
+ previous iteration of muting.
576
+ prob_threshold : float, default 0.5
577
+ Stop refining the timestamp if the probability of its token goes below this value.
578
+ rel_dur_change : float, default 0.5
579
+ Maximum percent change in duration of a word relative to its original duration.
580
+ abs_dur_change : float, optional
581
+ Maximum seconds a word is allowed to deviate from its original duration.
582
+ word_level : bool, default True
583
+ Whether to refine timestamps on word-level. If ``False``, only refine start/end timestamps of each segment.
584
+ precision : float, default 0.1
585
+ Precision of refined timestamps in seconds. The lowest precision is 0.02 seconds.
586
+ single_batch : bool, default False
587
+ Whether to process in only batch size of one to reduce memory usage.
588
+ inplace : bool, default True
589
+ Whether to alter timestamps in-place. If ``False``, operate on and return a deepcopy of ``result``.
590
+ demucs : bool or torch.nn.Module, default False
591
+ Whether to preprocess ``audio`` with Demucs to isolate vocals / remove noise. Set ``demucs`` to an instance of
592
+ a Demucs model to avoid reloading the model for each run.
593
+ Demucs must be installed to use. Official repo, https://github.com/facebookresearch/demucs.
594
+ demucs_options : dict, optional
595
+ Options to use for :func:`stable_whisper.audio.demucs_audio`.
596
+ only_voice_freq : bool, default False
597
+ Whether to only use sound between 200-5000 Hz, where the majority of human speech is.
598
+ verbose : bool or None, default False
599
+ Whether to display the text being decoded to the console.
600
+ Displays all the details if ``True``. Displays a progress bar if ``False``. Displays nothing if ``None``.
601
+
602
+ Returns
603
+ -------
604
+ stable_whisper.result.WhisperResult
605
+ All timestamps, words, probabilities, and other data from the refinement of ``result`` with ``audio``.
606
+
607
+ Notes
608
+ -----
609
+ The lower the ``precision``, the longer the processing time.
610
+
611
+ Examples
612
+ --------
613
+ >>> import stable_whisper
614
+ >>> model = stable_whisper.load_model('base')
615
+ >>> result = model.transcribe('audio.mp3')
616
+ >>> model.refine('audio.mp3', result)
617
+ >>> result.to_srt_vtt('audio.srt')
618
+ Saved 'audio.srt'
619
+ """
620
+ if not steps:
621
+ steps = 'se'
622
+ if precision is None:
623
+ precision = 0.1
624
+ if invalid_steps := steps.replace('s', '').replace('e', ''):
625
+ raise ValueError(f'Invalid step(s): {", ".join(invalid_steps)}')
626
+ if not result.has_words:
627
+ raise NotImplementedError('Result must have word timestamps.')
628
+
629
+ if not inplace:
630
+ result = copy.deepcopy(result)
631
+
632
+ audio = prep_audio(
633
+ audio,
634
+ demucs=demucs,
635
+ demucs_options=demucs_options,
636
+ only_voice_freq=only_voice_freq,
637
+ verbose=verbose
638
+ )
639
+ max_inference_tokens = model.dims.n_text_ctx - 6
640
+ sample_padding = int(N_FFT // 2) + 1
641
+ frame_precision = max(round(precision * FRAMES_PER_SECOND), 2)
642
+ total_duration = round(audio.shape[-1] / SAMPLE_RATE, 3)
643
+ tokenizer = get_tokenizer(model, language=result.language, task='transcribe')
644
+
645
+ def ts_to_frames(timestamps: Union[np.ndarray, list]) -> np.ndarray:
646
+ if isinstance(timestamps, list):
647
+ timestamps = np.array(timestamps)
648
+ return (timestamps * FRAMES_PER_SECOND).round().astype(int)
649
+
650
+ def curr_segments():
651
+ all_words = result.all_words()
652
+ seg_edge_mask = np.array([
653
+ 1 if _i == 0 else (2 if _i == len(seg.words)-1 else 0)
654
+ for seg in result.segments
655
+ for _i, w in enumerate(seg.words)
656
+ ])
657
+ start_times = [
658
+ max(
659
+ 0 if abs_dur_change is None else (w.start - abs_dur_change),
660
+ 0 if rel_dur_change is None else (w.start - w.duration * rel_dur_change),
661
+ 0 if i == 0 else max(all_words[i - 1].end, w.end - 14.5, 0)
662
+ )
663
+ for i, w in enumerate(all_words)
664
+ ]
665
+ end_times = [
666
+ min(
667
+ total_duration if abs_dur_change is None else (w.end + abs_dur_change),
668
+ total_duration if rel_dur_change is None else (w.end + w.duration * rel_dur_change),
669
+ total_duration if i == len(all_words) else min(all_words[i].start, w.start + 14.5, total_duration)
670
+ )
671
+ for i, w in enumerate(all_words, 1)
672
+ ]
673
+ start = start_times[0]
674
+
675
+ prev_i = 0
676
+ curr_words, curr_starts, curr_ends = [], [], []
677
+
678
+ for i, w in enumerate(all_words, 1):
679
+ if (
680
+ (end_times[0] - start > 30) or
681
+ (len(curr_words) + 1 > max_inference_tokens)
682
+ ):
683
+ if curr_words:
684
+ yield curr_words, curr_starts, curr_ends, seg_edge_mask[prev_i:prev_i+len(curr_words)]
685
+ curr_words, curr_starts, curr_ends = [], [], []
686
+ start = start_times[0]
687
+ prev_i = i - 1
688
+
689
+ curr_words.append(w)
690
+ curr_starts.append(start_times.pop(0))
691
+ curr_ends.append(end_times.pop(0))
692
+
693
+ if i == len(all_words):
694
+ yield curr_words, curr_starts, curr_ends, seg_edge_mask[prev_i:prev_i+len(curr_words)]
695
+
696
+ def _refine(_step: str):
697
+
698
+ for words, min_starts, max_ends, edge_mask in curr_segments():
699
+
700
+ time_offset = min_starts[0]
701
+ start_sample = round(time_offset * SAMPLE_RATE)
702
+ end_sample = round(max_ends[-1] * SAMPLE_RATE)
703
+ audio_segment = audio[start_sample:end_sample + 1].unsqueeze(0)
704
+
705
+ max_starts = ts_to_frames(np.array([w.end for w in words]) - time_offset)
706
+ min_ends = ts_to_frames(np.array([w.start for w in words]) - time_offset)
707
+ min_starts = ts_to_frames(np.array(min_starts) - time_offset)
708
+ max_ends = ts_to_frames(np.array(max_ends) - time_offset)
709
+
710
+ mid_starts = min_starts + ((max_starts - min_starts) / 2).round().astype(int)
711
+ mid_ends = min_ends + ((max_ends - min_ends) / 2).round().astype(int)
712
+
713
+ text_tokens = [t for w in words for t in w.tokens if t < tokenizer.eot]
714
+ word_tokens = [[t for t in w.tokens if t < tokenizer.eot] for w in words]
715
+ orig_mel_segment = log_mel_spectrogram(audio_segment, model.dims.n_mels, padding=sample_padding)
716
+ orig_mel_segment = pad_or_trim(orig_mel_segment, N_FRAMES).to(device=model.device)
717
+
718
+ def get_prob():
719
+
720
+ tokens = torch.tensor(
721
+ [
722
+ *tokenizer.sot_sequence,
723
+ tokenizer.no_timestamps,
724
+ *text_tokens,
725
+ tokenizer.eot,
726
+ ]
727
+ ).to(model.device)
728
+
729
+ with torch.no_grad():
730
+ curr_mel_segment = mel_segment if prob_indices else orig_mel_segment
731
+ if single_batch:
732
+ logits = torch.cat(
733
+ [model(_mel.unsqueeze(0), tokens.unsqueeze(0)) for _mel in curr_mel_segment]
734
+ )
735
+ else:
736
+ logits = model(curr_mel_segment, tokens.unsqueeze(0))
737
+
738
+ sampled_logits = logits[:, len(tokenizer.sot_sequence):, : tokenizer.eot]
739
+ token_probs = sampled_logits.softmax(dim=-1)
740
+
741
+ text_token_probs = token_probs[:, np.arange(len(text_tokens)), text_tokens]
742
+ token_positions = token_probs[:, np.arange(len(text_tokens))]
743
+ if logits.shape[0] != 1 and prob_indices is not None:
744
+ indices1 = np.arange(len(prob_indices))
745
+ text_token_probs = text_token_probs[prob_indices, indices1]
746
+ token_positions = token_positions[prob_indices, indices1]
747
+ else:
748
+ text_token_probs.squeeze_(0)
749
+
750
+ text_token_probs = text_token_probs.tolist()
751
+ token_positions = \
752
+ (
753
+ token_positions.sort().indices == tokens[len(tokenizer.sot_sequence) + 1:-1][:, None]
754
+ ).nonzero()[:, -1].tolist()
755
+
756
+ word_boundaries = np.pad(np.cumsum([len(t) for t in word_tokens]), (1, 0))
757
+ word_probabilities = np.array([
758
+ text_token_probs[j-1] if is_end_ts else text_token_probs[i]
759
+ for i, j in zip(word_boundaries[:-1], word_boundaries[1:])
760
+ ])
761
+ token_positions = [
762
+ token_positions[j-1] if is_end_ts else token_positions[i]
763
+ for i, j in zip(word_boundaries[:-1], word_boundaries[1:])
764
+ ]
765
+
766
+ return word_probabilities, token_positions
767
+
768
+ def update_ts():
769
+ if not is_finish[idx] or changes[idx, -1] == -1:
770
+ return
771
+ new_ts = round(time_offset + (changes[idx, -1] / FRAMES_PER_SECOND), 3)
772
+ if changes[idx, 0] and not changes[idx, 1]:
773
+ if is_end_ts:
774
+ if new_ts <= words[idx].end:
775
+ return
776
+ elif new_ts >= words[idx].start:
777
+ return
778
+ if not verbose:
779
+ return
780
+ curr_word = words[idx]
781
+ word_info = (f'[Word="{curr_word.word}"] '
782
+ f'[Segment ID: {curr_word.segment_id}] '
783
+ f'[Word ID: {curr_word.id}]')
784
+ if is_end_ts:
785
+ print(f'End: {words[idx].end} -> {new_ts} {word_info}')
786
+ words[idx].end = new_ts
787
+ else:
788
+ print(f'Start: {words[idx].start} -> {new_ts} {word_info}')
789
+ words[idx].start = new_ts
790
+
791
+ mel_segment = orig_mel_segment.clone().repeat_interleave(2, 0)
792
+ is_end_ts = _step == 'e'
793
+
794
+ prob_indices = []
795
+ is_finish = np.less([w.probability for w in words], prob_threshold)
796
+ is_finish = np.logical_or(is_finish, [w.duration == 0 for w in words])
797
+ if not word_level:
798
+ is_finish[edge_mask != (2 if is_end_ts else 1)] = True
799
+ for idx, _i in enumerate(max_starts if is_end_ts else min_ends):
800
+ row = idx % 2
801
+ prob_indices.extend([row] * len(words[idx].tokens))
802
+ if is_finish[idx]:
803
+ continue
804
+ if is_end_ts:
805
+ _p = mel_segment.shape[-1] if idx == len(words)-1 else mid_ends[idx+1]
806
+ mel_segment[row, :, _i:_p] = 0
807
+ else:
808
+ _p = 0 if idx == 0 else mid_starts[idx-1]
809
+ mel_segment[row, :, _p:_i] = 0
810
+ orig_probs, orig_tk_poss = get_prob()
811
+ changes = np.zeros((orig_probs.shape[-1], 3), dtype=int)
812
+ changes[:, -1] = -1
813
+ frame_indices = (mid_ends, max_starts) if is_end_ts else (min_ends, mid_starts)
814
+ for idx, (_s, _e) in enumerate(zip(*frame_indices)):
815
+ row = idx % 2
816
+ if is_finish[idx]:
817
+ continue
818
+ mel_segment[row, :, _s:_e] = 0
819
+
820
+ new_probs = prev_probs = orig_probs
821
+ while not np.all(is_finish):
822
+ probs, tk_poss = get_prob()
823
+ abs_diffs = orig_probs - probs
824
+ rel_diffs = abs_diffs / orig_probs
825
+ rel_change_diffs = (prev_probs - probs) / prev_probs
826
+ prev_probs = probs
827
+ for idx, (abs_diff, rel_diff, rel_change_diff, prob) \
828
+ in enumerate(zip(abs_diffs, rel_diffs, rel_change_diffs, probs)):
829
+ if is_finish[idx]:
830
+ continue
831
+ if is_end_ts:
832
+ curr_min, curr_max, curr_mid = min_ends[idx], max_ends[idx], mid_ends[idx]
833
+ else:
834
+ curr_min, curr_max, curr_mid = min_starts[idx], max_starts[idx], mid_starts[idx]
835
+
836
+ row = prob_indices[idx]
837
+ best_tks_changed = orig_tk_poss[idx] > tk_poss[idx]
838
+ failed_requirements = (
839
+ abs_diff > abs_prob_decrease or
840
+ rel_diff > rel_prob_decrease or
841
+ (rel_rel_prob_decrease is not None and rel_change_diff > rel_rel_prob_decrease) or
842
+ prob < prob_threshold or
843
+ best_tks_changed
844
+ )
845
+
846
+ if failed_requirements:
847
+ changes[idx][0] = 1
848
+ if is_end_ts:
849
+ curr_min = curr_mid
850
+ else:
851
+ curr_max = curr_mid
852
+ else:
853
+ changes[idx][1] = 1
854
+ if is_end_ts:
855
+ curr_max = curr_mid
856
+ else:
857
+ curr_min = curr_mid
858
+
859
+ if (new_mid_change := round((curr_max - curr_min) / 2)) < frame_precision:
860
+ is_finish[idx] = True
861
+ update_ts()
862
+ continue
863
+
864
+ new_mid = curr_min + new_mid_change
865
+ if failed_requirements:
866
+ if is_end_ts:
867
+ mel_segment[row, :, curr_min:new_mid] = orig_mel_segment[0, :, curr_min:new_mid]
868
+ else:
869
+ mel_segment[row, :, new_mid:curr_max] = orig_mel_segment[0, :, new_mid:curr_max]
870
+
871
+ else:
872
+ if is_end_ts:
873
+ mel_segment[row, :, new_mid:curr_max] = 0
874
+ else:
875
+ mel_segment[row, :, curr_min:new_mid] = 0
876
+
877
+ if is_end_ts:
878
+ min_ends[idx], max_ends[idx], mid_ends[idx] = curr_min, curr_max, new_mid
879
+ else:
880
+ min_starts[idx], max_starts[idx], mid_starts[idx] = curr_min, curr_max, new_mid
881
+ if not best_tks_changed:
882
+ changes[idx][-1] = new_mid
883
+ new_probs[idx] = prob
884
+
885
+ update_pbar(words[-1].end)
886
+
887
+ with tqdm(total=round(total_duration, 2), unit='sec', disable=verbose is not False, desc='Refine') as tqdm_pbar:
888
+
889
+ def update_pbar(last_ts: float):
890
+ nonlocal prev_ts
891
+ tqdm_pbar.update(round(((last_ts - prev_ts) / len(steps)), 2))
892
+ prev_ts = last_ts
893
+
894
+ for step_count, step in enumerate(steps, 1):
895
+ prev_ts = 0
896
+ _refine(step)
897
+ update_pbar(round(tqdm_pbar.total / len(step), 2))
898
+ tqdm_pbar.update(tqdm_pbar.total - tqdm_pbar.n)
899
+
900
+ result.update_all_segs_with_words()
901
+
902
+ return result
903
+
904
+
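A minimal sketch of ``refine()`` following a transcription; by default it alters ``result`` in-place. 'audio.mp3' is a placeholder path:

import stable_whisper

model = stable_whisper.load_model('base')
result = model.transcribe('audio.mp3')
# Refine only the end timestamps ('e') at a coarser 0.2s precision;
# lower precision values take longer (see Notes in the docstring above).
model.refine('audio.mp3', result, steps='e', precision=0.2)
result.to_srt_vtt('audio.srt')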
905
+ def locate(
906
+ model: "Whisper",
907
+ audio: Union[str, np.ndarray, torch.Tensor, bytes],
908
+ text: Union[str, List[int]],
909
+ language: str,
910
+ count: int = 1,
911
+ duration_window: Union[float, Tuple[float, float]] = 3.0,
912
+ *,
913
+ mode: int = 0,
914
+ start: float = None,
915
+ end: float = None,
916
+ probability_threshold: float = 0.5,
917
+ eots: int = 1,
918
+ max_token_per_seg: int = 20,
919
+ exact_token: bool = False,
920
+ case_sensitive: bool = False,
921
+ verbose: bool = False,
922
+ initial_prompt: str = None,
923
+ suppress_tokens: Union[str, List[int]] = '-1',
924
+ demucs: Union[bool, torch.nn.Module] = False,
925
+ demucs_options: dict = None,
926
+ only_voice_freq: bool = False,
927
+ ) -> Union[List[Segment], List[dict]]:
928
+ """
929
+ Locate when specific words are spoken in ``audio`` without fully transcribing.
930
+
931
+ This is useful for quickly finding at what time specific words or phrases are spoken in an audio. Since it
932
+ does not need to transcribe the audio to approximate the time, it is significantly faster than transcribing and then
933
+ locating the word in the transcript.
934
+
935
+ It can also transcribe a few seconds around the approximated time to find out what was said around those words or
936
+ confirm if the word was even spoken near that time.
937
+
938
+ Parameters
939
+ ----------
940
+ model : whisper.model.Whisper
941
+ An instance of Whisper ASR model.
942
+ audio : str or numpy.ndarray or torch.Tensor or bytes
943
+ Path/URL to the audio file, the audio waveform, or bytes of audio file.
944
+ If audio is :class:`numpy.ndarray` or :class:`torch.Tensor`, the audio must already be sampled at 16kHz.
945
+ text: str or list of int
946
+ Words/phrase or list of tokens to search for in ``audio``.
947
+ language : str
948
+ Language of the ``text``.
949
+ count : int, default 1, meaning stop search after 1 match
950
+ Number of matches to find. Use 0 to look for all.
951
+ duration_window : float or tuple of (float, float), default 3.0, same as (3.0, 3.0)
952
+ Seconds before and after each end timestamp approximation to transcribe in mode 1.
953
+ If tuple pair of values, then the 1st value will be seconds before the end and 2nd value will be seconds after.
954
+ mode : int, default 0
955
+ Mode of search.
956
+ 2, Approximates the end timestamp of ``text`` in the audio. This mode does not confirm whether ``text`` is
957
+ spoken at the timestamp.
958
+ 1, Completes mode 2 then transcribes audio within ``duration_window`` to confirm whether `text` is a match at
959
+ the approximated timestamp by checking if ``text`` at that ``duration_window`` is within
960
+ ``probability_threshold`` or matching the string content if ``text`` with the transcribed text at the
961
+ ``duration_window``.
962
+ 0, Completes mode 1 then add word timestamps to the transcriptions of each match.
963
+ Modes from fastest to slowest: 2, 1, 0
964
+ start : float, optional, meaning it starts from 0s
965
+ Seconds into the audio to start searching for ``text``.
966
+ end : float, optional
967
+ Seconds into the audio to stop searching for ``text``.
968
+ probability_threshold : float, default 0.5
969
+ Minimum probability of each token in ``text`` for it to be considered a match.
970
+ eots : int, default 1
971
+ Number of EOTs to reach before stopping transcription in mode 1. When transcription reaches an EOT, it usually
972
+ means the end of the segment or audio. Once ``text`` is found in the ``duration_window``, the transcription
973
+ will stop immediately upon reaching an EOT.
974
+ max_token_per_seg : int, default 20
975
+ Maximum number of tokens to transcribe in the ``duration_window`` before stopping.
976
+ exact_token : bool, default False
977
+ Whether to find a match based on the exact tokens that make up ``text``.
978
+ case_sensitive : bool, default False
979
+ Whether to consider the case of ``text`` when matching in string content.
980
+ verbose : bool or None, default False
981
+ Whether to display the text being decoded to the console.
982
+ Displays all the details if ``True``. Displays a progress bar if ``False``. Displays nothing if ``None``.
983
+ initial_prompt : str, optional
984
+ Text to provide as a prompt for the first window. This can be used to provide, or
985
+ "prompt-engineer" a context for transcription, e.g. custom vocabularies or proper nouns
986
+ to make it more likely to predict those words correctly.
987
+ suppress_tokens : str or list of int, default '-1', meaning suppress special characters except common punctuations
988
+ List of tokens to suppress.
989
+ demucs : bool or torch.nn.Module, default False
990
+ Whether to preprocess ``audio`` with Demucs to isolate vocals / remove noise. Set ``demucs`` to an instance of
991
+ a Demucs model to avoid reloading the model for each run.
992
+ Demucs must be installed to use. Official repo, https://github.com/facebookresearch/demucs.
993
+ demucs_options : dict, optional
994
+ Options to use for :func:`stable_whisper.audio.demucs_audio`.
995
+ only_voice_freq : bool, default False
996
+ Whether to only use sound between 200-5000 Hz, where the majority of human speech is.
997
+
998
+ Returns
999
+ -------
1000
+ stable_whisper.result.Segment or list of dict or list of float
1001
+ Mode 0, list of instances of :class:`stable_whisper.result.Segment`.
1002
+ Mode 1, list of dictionaries with end timestamp approximation of matches and transcribed neighboring words.
1003
+ Mode 2, list of timestamps in seconds for each end timestamp approximation.
1004
+
1005
+ Notes
1006
+ -----
1007
+ For ``text``, the case and spacing matter, as 'on', ' on', and ' On' are different tokens; therefore choose the one that
1008
+ best suits the context (e.g. ' On' to look for it at the beginning of a sentence).
1009
+
1010
+ Use a sufficiently large first value for ``duration_window``, i.e. a value greater than the time it takes to speak ``text``.
1011
+
1012
+ If ``exact_token = False`` and the string content matches, then ``probability_threshold`` is not used.
1013
+
1014
+ Examples
1015
+ --------
1016
+ >>> import stable_whisper
1017
+ >>> model = stable_whisper.load_model('base')
1018
+ >>> matches = model.locate('audio.mp3', 'are', 'English', verbose=True)
1019
+
1020
+ Some words can sound the same but have different spellings; to increase the chance of finding such words, use
1021
+ ``initial_prompt``.
1022
+
1023
+ >>> matches = model.locate('audio.mp3', ' Nickie', 'English', verbose=True, initial_prompt='Nickie')
1024
+ """
1025
+ from whisper.timing import median_filter
1026
+ from whisper.decoding import DecodingTask, DecodingOptions, SuppressTokens
1027
+ from .timing import split_word_tokens
1028
+
1029
+ sample_padding = int(N_FFT // 2) + 1
1030
+ sec_per_emb = model.dims.n_audio_ctx / CHUNK_LENGTH
1031
+ CHUNK_SAMPLES = round(CHUNK_LENGTH * SAMPLE_RATE)
1032
+ if isinstance(duration_window, (float, int)):
1033
+ duration_window = [duration_window] * 2
1034
+ window_sum = sum(duration_window)
1035
+ assert CHUNK_SAMPLES > window_sum, \
1036
+ f'Sum of [duration_window] must be less than {CHUNK_SAMPLES}, got {window_sum}'
1037
+ adjusted_chunk_size = CHUNK_SAMPLES - round(duration_window[0]*SAMPLE_RATE)
1038
+ if initial_prompt:
1039
+ initial_prompt = ' ' + initial_prompt.strip()
1040
+ task = DecodingTask(model, DecodingOptions(
1041
+ language=language, prompt=initial_prompt, suppress_tokens=suppress_tokens, without_timestamps=True,
1042
+ ))
1043
+ tokenizer = task.tokenizer
1044
+ initial_tokens = list(task.initial_tokens)
1045
+ text_tokens, text = (tokenizer.encode(text), text) if isinstance(text, str) else (text, tokenizer.decode(text))
1046
+ if not exact_token and not case_sensitive:
1047
+ text = text.lower()
1048
+
1049
+ tk_suppress_masks = [
1050
+ [i for i in fil.suppress_tokens if i < tokenizer.eot]
1051
+ for fil in task.logit_filters if isinstance(fil, SuppressTokens)
1052
+ ]
1053
+
1054
+ audio = prep_audio(
1055
+ audio,
1056
+ demucs=demucs,
1057
+ demucs_options=demucs_options,
1058
+ only_voice_freq=only_voice_freq,
1059
+ verbose=verbose
1060
+ )
1061
+ prev_target_end = None
1062
+ found = 0
1063
+ if end:
1064
+ audio = audio[:round(end * SAMPLE_RATE)]
1065
+ seek_sample = round(start * SAMPLE_RATE) if start else 0
1066
+ total_samples = audio.shape[-1]
1067
+
1068
+ def _locate():
1069
+ nonlocal seek_sample, found
1070
+ seek = round(seek_sample / SAMPLE_RATE, 3)
1071
+ audio_segment = audio[seek_sample: seek_sample + CHUNK_SAMPLES]
1072
+ mel_segment = log_mel_spectrogram(audio_segment, model.dims.n_mels, padding=sample_padding)
1073
+ mel_segment = pad_or_trim(mel_segment, N_FRAMES).to(device=model.device)
1074
+
1075
+ QKs = [None] * model.dims.n_text_layer
1076
+ hooks = [
1077
+ block.cross_attn.register_forward_hook(
1078
+ lambda _, ins, outs, index=i: QKs.__setitem__(index, outs[-1])
1079
+ )
1080
+ for i, block in enumerate(model.decoder.blocks)
1081
+ ]
1082
+ tokens = torch.tensor([initial_tokens + text_tokens]).to(model.device)
1083
+ with torch.no_grad():
1084
+ audio_features = model.encoder(mel_segment.unsqueeze(0))
1085
+ model.decoder(tokens, audio_features)
1086
+
1087
+ for hook in hooks:
1088
+ hook.remove()
1089
+
1090
+ weights = torch.cat([QKs[_l][:, _h] for _l, _h in model.alignment_heads.indices().T], dim=0)
1091
+ weights = weights.softmax(dim=-1)
1092
+ std, mean = torch.std_mean(weights, dim=-2, keepdim=True, unbiased=False)
1093
+ weights = (weights - mean) / std
1094
+ weights = median_filter(weights, 7)
1095
+
1096
+ matrix = weights.mean(axis=0)
1097
+ target_end = round((matrix[-1].argmax()/sec_per_emb).item(), 3)
1098
+ found_msg = f'"{text}" ending at ~{format_timestamp(target_end+seek)}' if verbose else ''
1099
+
1100
+ if mode == 2:
1101
+ if found_msg:
1102
+ safe_print('Unconfirmed:' + found_msg)
1103
+ nonlocal prev_target_end
1104
+ found += 1
1105
+ if (
1106
+ (seek_sample + CHUNK_SAMPLES >= total_samples) or
1107
+ (count and found >= count) or
1108
+ (prev_target_end == target_end)
1109
+ ):
1110
+ seek_sample = total_samples
1111
+ else:
1112
+ seek_sample += round(target_end * SAMPLE_RATE)
1113
+ prev_target_end = target_end
1114
+ return dict(tokens=[], target_end=target_end+seek)
1115
+
1116
+ curr_start = round(max(target_end - duration_window[0], 0.), 3)
1117
+ curr_end = round(target_end + duration_window[1], 3)
1118
+ start_frame = round(curr_start * FRAMES_PER_SECOND)
1119
+ end_frame = round(curr_end * FRAMES_PER_SECOND)
1120
+ mel_segment_section = pad_or_trim(mel_segment[..., start_frame:end_frame], N_FRAMES)
1121
+ temp_tokens = torch.tensor([initial_tokens]).to(model.device)
1122
+
1123
+ predictions = []
1124
+
1125
+ target_token_idx = 0
1126
+ not_end = True
1127
+ found_target = False
1128
+ curr_eots = 0
1129
+ temp_audio_features = model.encoder(mel_segment_section.unsqueeze(0))
1130
+ tokens_to_decode = []
1131
+ replace_found_tokens = []
1132
+ infer_tokens = [temp_tokens[0]]
1133
+ kv_cache, hooks = model.install_kv_cache_hooks()
1134
+ while not_end:
1135
+ with torch.no_grad():
1136
+ logits = model.decoder(temp_tokens, temp_audio_features, kv_cache=kv_cache)[0, -1, :tokenizer.eot+1]
1137
+ for tks in tk_suppress_masks:
1138
+ logits[tks] = -np.inf
1139
+ sorted_logits_idxs = logits.sort(dim=-1).indices[-2:]
1140
+ best_token = sorted_logits_idxs[-1]
1141
+ best_non_eot_token = sorted_logits_idxs[-2] if best_token == tokenizer.eot else best_token
1142
+
1143
+ logits = logits[:tokenizer.eot].softmax(dim=-1)
1144
+ if found_target:
1145
+ target_word_prob = is_match = None
1146
+ else:
1147
+ if exact_token:
1148
+ is_match = False
1149
+ else:
1150
+ tokens_to_decode.append(best_non_eot_token)
1151
+ temp_text = tokenizer.decode(tokens_to_decode)
1152
+ if not case_sensitive:
1153
+ temp_text = temp_text.lower()
1154
+ if is_match := temp_text.endswith(text):
1155
+ tokens_to_decode = []
1156
+ target_word_prob = logits[text_tokens[target_token_idx]].item()
1157
+ if (
1158
+ target_word_prob is not None and
1159
+ (
1160
+ target_word_prob >= probability_threshold or
1161
+ best_non_eot_token == text_tokens[target_token_idx] or
1162
+ is_match
1163
+ )
1164
+ ):
1165
+ if is_match:
1166
+ best_token = best_non_eot_token
1167
+ token_prob = logits[best_token].item()
1168
+ found_target = True
1169
+ else:
1170
+ best_token[None] = text_tokens[target_token_idx]
1171
+ if len(replace_found_tokens) or best_non_eot_token != text_tokens[target_token_idx]:
1172
+ replace_found_tokens.append(best_non_eot_token)
1173
+ target_token_idx += 1
1174
+ if target_token_idx == len(text_tokens):
1175
+ found_target = True
1176
+ token_prob = target_word_prob
1177
+ if found_target:
1178
+ found += 1
1179
+ curr_eots = 0
1180
+ else:
1181
+ if not found_target:
1182
+ if len(replace_found_tokens):
1183
+ temp_tokens = torch.cat(infer_tokens)[None]
1184
+ temp_tokens = torch.cat(
1185
+ [temp_tokens[..., :-len(replace_found_tokens)],
1186
+ torch.stack(replace_found_tokens)[None]]
1187
+ )
1188
+ replace_found_tokens = []
1189
+ kv_cache.clear()
1190
+ target_token_idx = 0
1191
+ if best_token == tokenizer.eot:
1192
+ if curr_eots >= eots or found_target:
1193
+ not_end = False
1194
+ else:
1195
+ curr_eots += 1
1196
+ best_token = best_non_eot_token
1197
+ else:
1198
+ curr_eots = 0
1199
+ token_prob = None if best_token == tokenizer.eot else logits[best_token].item()
1200
+
1201
+ predictions.append(dict(token=best_token.item(), prob=token_prob))
1202
+ if len(predictions) > max_token_per_seg:
1203
+ not_end = False
1204
+ if not_end:
1205
+ infer_tokens.append(best_token[None])
1206
+ temp_tokens = best_token[None, None]
1207
+ kv_cache.clear()
1208
+ for hook in hooks:
1209
+ hook.remove()
1210
+ segment = None
1211
+
1212
+ if found_target:
1213
+ if found_msg:
1214
+ safe_print('Confirmed: ' + found_msg, tqdm_pbar.write)
1215
+ final_tokens = [p['token'] for p in predictions]
1216
+ if mode == 1:
1217
+ _, (ws, wts), _ = split_word_tokens([dict(tokens=final_tokens)], tokenizer)
1218
+ final_token_probs = [p['prob'] for p in predictions]
1219
+ wps = [float(np.mean([final_token_probs.pop(0) for _ in wt])) for wt in wts]
1220
+ words = [dict(word=w, tokens=wt, probability=wp) for w, wt, wp in zip(ws, wts, wps)]
1221
+ final_end = target_end+seek
1222
+ near_text = "".join(ws)
1223
+ segment = dict(end=final_end, text=text, duration_window_text=near_text, duration_window_word=words)
1224
+ if verbose:
1225
+ safe_print(f'Duration Window: "{near_text}"\n', tqdm_pbar.write)
1226
+ seek_sample += round(curr_end * SAMPLE_RATE)
1227
+ else:
1228
+
1229
+ segment = dict(
1230
+ seek=0,
1231
+ tokens=final_tokens
1232
+ )
1233
+
1234
+ add_word_timestamps_stable(
1235
+ segments=[segment],
1236
+ model=model,
1237
+ tokenizer=tokenizer,
1238
+ mel=mel_segment,
1239
+ num_samples=round(curr_end*SAMPLE_RATE),
1240
+ gap_padding=None
1241
+ )
1242
+ segment = Segment(0, 0, '', words=segment['words'])
1243
+ segment.update_seg_with_words()
1244
+ seek_sample += round(segment.words[-1].end * SAMPLE_RATE)
1245
+ segment.offset_time(seek)
1246
+ segment.seek = curr_start
1247
+ if verbose:
1248
+ safe_print(segment.to_display_str(), tqdm_pbar.write)
1249
+
1250
+ else:
1251
+ seek_sample += adjusted_chunk_size if audio_segment.shape[-1] == CHUNK_SAMPLES else audio_segment.shape[-1]
1252
+
1253
+ return segment
1254
+
1255
+ total_duration = round(total_samples / SAMPLE_RATE, 2)
1256
+ matches = []
1257
+ with tqdm(total=total_duration, unit='sec', disable=verbose is None, desc='Locate') as tqdm_pbar:
1258
+ while seek_sample < total_samples and (not count or found < count):
1259
+ if match := _locate():
1260
+ matches.append(match)
1261
+ tqdm_pbar.update(round(seek_sample/SAMPLE_RATE, 2) - tqdm_pbar.n)
1262
+ tqdm_pbar.update(tqdm_pbar.total - tqdm_pbar.n)
1263
+ if verbose and not matches:
1264
+ safe_print(f'Failed to locate "{text}".')
1265
+ return matches
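A minimal sketch of ``locate()`` in two of its modes; 'audio.mp3' is a placeholder path, and ' hello' follows the leading-space tokenization note in the docstring above:

import stable_whisper

model = stable_whisper.load_model('base')
# Mode 2: fastest; returns unconfirmed end-timestamp approximations for all matches.
approx = model.locate('audio.mp3', ' hello', 'English', mode=2, count=0)
# Mode 0 (default): confirms matches and returns Segment objects with word timestamps.
matches = model.locate('audio.mp3', ' hello', 'English', verbose=True)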
stable_whisper/audio.py ADDED
@@ -0,0 +1,288 @@
1
+ import subprocess
2
+ import warnings
3
+ import ffmpeg
4
+ import torch
5
+ import torchaudio
6
+ import numpy as np
7
+ from typing import Union, Optional
8
+
9
+ from whisper.audio import SAMPLE_RATE
10
+
11
+
12
+ def is_ytdlp_available():
13
+ return subprocess.run('yt-dlp -h', shell=True, capture_output=True).returncode == 0
14
+
15
+
16
+ def _load_file(file: Union[str, bytes], verbose: bool = False, only_ffmpeg: bool = False):
17
+ if isinstance(file, str) and '://' in file:
32
+ if not only_ffmpeg:
33
+ if is_ytdlp_available():
34
+ verbosity = ' -q' if verbose is None else (' --progress' if verbose else ' --progress -q')
35
+ p = subprocess.run(
36
+ f'yt-dlp "{file}" -f ba/w -I 1{verbosity} -o -',
37
+ shell=True,
38
+ stdout=subprocess.PIPE
39
+ )
40
+ if p.returncode != 0 or len(p.stdout) == 0:
41
+ raise RuntimeError(f'Failed to download media from "{file}" with yt-dlp')
42
+ return p.stdout
43
+ else:
44
+ warnings.warn('URL detected but yt-dlp not available. '
45
+ 'To handle a greater variety of URLs (i.e. non-direct links), '
46
+ 'install yt-dlp, \'pip install yt-dlp\' (repo: https://github.com/yt-dlp/yt-dlp).')
47
+ return file
48
+
49
+
50
+ # modified version of whisper.audio.load_audio
51
+ def load_audio(file: Union[str, bytes], sr: int = SAMPLE_RATE, verbose: bool = True, only_ffmpeg: bool = False):
52
+ """
53
+ Open an audio file and read it as a mono waveform, resampling as necessary.
54
+
55
+ Parameters
56
+ ----------
57
+ file : str or bytes
58
+ The audio file to open, bytes of file, or URL to audio/video.
59
+ sr : int, default ``whisper.model.SAMPLE_RATE``
60
+ The sample rate to resample the audio if necessary.
61
+ verbose : bool, default True
62
+ Whether to print yt-dlp log.
63
+ only_ffmpeg : bool, default False
64
+ Whether to use only FFmpeg (instead of yt-dlp) for URLs.
65
+
66
+ Returns
67
+ -------
68
+ numpy.ndarray
69
+ An array containing the audio waveform, in float32 dtype.
70
+ """
71
+ file = _load_file(file, verbose=verbose, only_ffmpeg=only_ffmpeg)
72
+ if isinstance(file, bytes):
73
+ inp, file = file, 'pipe:'
74
+ else:
75
+ inp = None
76
+ try:
77
+ # This launches a subprocess to decode audio while down-mixing and resampling as necessary.
78
+ # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.
79
+ out, _ = (
80
+ ffmpeg.input(file, threads=0)
81
+ .output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=sr)
82
+ .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True, input=inp)
83
+ )
84
+ except ffmpeg.Error as e:
85
+ raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e
86
+
87
+ return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0
88
+
89
+
90
+ def voice_freq_filter(wf: (torch.Tensor, np.ndarray), sr: int,
91
+ upper_freq: int = None,
92
+ lower_freq: int = None) -> torch.Tensor:
93
+ if isinstance(wf, np.ndarray):
94
+ wf = torch.from_numpy(wf)
95
+ if upper_freq is None:
96
+ upper_freq = 5000
97
+ if lower_freq is None:
98
+ lower_freq = 200
99
+ assert upper_freq > lower_freq, f'upper_freq {upper_freq} must be greater than lower_freq {lower_freq}'
100
+ return torchaudio.functional.highpass_biquad(torchaudio.functional.lowpass_biquad(wf, sr, upper_freq),
101
+ sr,
102
+ lower_freq)
103
+
104
+
105
+ def is_demucs_available():
106
+ from importlib.util import find_spec
107
+ if find_spec('demucs') is None:
108
+ raise ModuleNotFoundError("Please install Demucs; "
109
+ "'pip install -U demucs' or "
110
+ "'pip install -U git+https://github.com/facebookresearch/demucs#egg=demucs'; "
111
+ "Official Demucs repo: https://github.com/facebookresearch/demucs")
112
+
113
+
114
+ def load_demucs_model():
115
+ is_demucs_available()
116
+ from demucs.pretrained import get_model_from_args
117
+ return get_model_from_args(type('args', (object,), dict(name='htdemucs', repo=None))).cpu().eval()
118
+
119
+
120
+ def demucs_audio(audio: (torch.Tensor, str),
121
+ input_sr: int = None,
122
+ output_sr: int = None,
123
+ model=None,
124
+ device=None,
125
+ verbose: bool = True,
126
+ track_name: str = None,
127
+ save_path: str = None,
128
+ **demucs_options) -> torch.Tensor:
129
+ """
130
+ Isolates vocals / removes noise from ``audio`` with Demucs.
131
+
132
+ Official repo, https://github.com/facebookresearch/demucs.
133
+ """
134
+ if model is None:
135
+ model = load_demucs_model()
136
+ else:
137
+ is_demucs_available()
138
+ from demucs.apply import apply_model
139
+
140
+ if track_name:
141
+ track_name = f'"{track_name}"'
142
+
143
+ if isinstance(audio, (str, bytes)):
144
+ if isinstance(audio, str) and not track_name:
145
+ track_name = f'"{audio}"'
146
+ audio = torch.from_numpy(load_audio(audio, model.samplerate))
147
+ elif input_sr != model.samplerate:
148
+ if input_sr is None:
149
+ raise ValueError('No [input_sr] specified for audio tensor.')
150
+ audio = torchaudio.functional.resample(audio,
151
+ orig_freq=input_sr,
152
+ new_freq=model.samplerate)
153
+ if not track_name:
154
+ track_name = 'audio track'
155
+ audio_dims = audio.dim()
156
+ if audio_dims == 1:
157
+ audio = audio[None, None].repeat_interleave(2, -2)
158
+ else:
159
+ if audio.shape[-2] == 1:
160
+ audio = audio.repeat_interleave(2, -2)
161
+ if audio_dims < 3:
162
+ audio = audio[None]
163
+
164
+ if 'mix' in demucs_options:
165
+ audio = demucs_options.pop('mix')
166
+
167
+ if device is None:
168
+ device = "cuda" if torch.cuda.is_available() else "cpu"
169
+
170
+ vocals_idx = model.sources.index('vocals')
171
+ if verbose:
172
+ print(f'Isolating vocals from {track_name}')
173
+ apply_kwarg = dict(
174
+ model=model,
175
+ mix=audio,
176
+ device=device,
177
+ split=True,
178
+ overlap=.25,
179
+ progress=verbose is not None,
180
+ )
181
+ apply_kwarg.update(demucs_options)
182
+ vocals = apply_model(
183
+ **apply_kwarg
184
+ )[0, vocals_idx].mean(0)
185
+
186
+ if device != 'cpu':
187
+ torch.cuda.empty_cache()
188
+
189
+ if output_sr is not None and model.samplerate != output_sr:
190
+ vocals = torchaudio.functional.resample(vocals,
191
+ orig_freq=model.samplerate,
192
+ new_freq=output_sr)
193
+
194
+ if save_path is not None:
195
+ if isinstance(save_path, str) and not save_path.lower().endswith('.wav'):
196
+ save_path += '.wav'
197
+ torchaudio.save(save_path, vocals[None], output_sr or model.samplerate)
198
+ print(f'Saved: {save_path}')
199
+
200
+ return vocals
201
+
202
+
203
+ def get_samplerate(audiofile: (str, bytes)) -> (int, None):
204
+ import re
205
+ if isinstance(audiofile, str):
206
+ metadata = subprocess.run(f'ffmpeg -i "{audiofile}"', capture_output=True, shell=True).stderr.decode()  # path quoted for shell safety
207
+ else:
208
+ p = subprocess.Popen(f'ffmpeg -i -', stderr=subprocess.PIPE, stdin=subprocess.PIPE, shell=True)
209
+ try:
210
+ p.stdin.write(audiofile)
211
+ except BrokenPipeError:
212
+ pass
213
+ finally:
214
+ metadata = p.communicate()[-1]
215
+ if metadata is not None:
216
+ metadata = metadata.decode()
217
+ sr = re.findall(r'\n.+Stream.+Audio.+\D+(\d+) Hz', metadata)
218
+ if sr:
219
+ return int(sr[0])
220
+
221
+
222
+ def prep_audio(
223
+ audio: Union[str, np.ndarray, torch.Tensor, bytes],
224
+ demucs: Union[bool, torch.nn.Module] = False,
225
+ demucs_options: dict = None,
226
+ only_voice_freq: bool = False,
227
+ only_ffmpeg: bool = False,
228
+ verbose: Optional[bool] = False,
229
+ sr: int = None
230
+ ) -> torch.Tensor:
231
+ """
232
+ Converts input audio of many types into a mono waveform as a torch.Tensor.
233
+
234
+ Parameters
235
+ ----------
236
+ audio : str or numpy.ndarray or torch.Tensor or bytes
237
+ Path/URL to the audio file, the audio waveform, or bytes of audio file.
238
+ If audio is :class:`numpy.ndarray` or :class:`torch.Tensor`, the audio must already be sampled at 16kHz.
239
+ demucs : bool or torch.nn.Module, default False
240
+ Whether to preprocess ``audio`` with Demucs to isolate vocals / remove noise. Set ``demucs`` to an instance of
241
+ a Demucs model to avoid reloading the model for each run.
242
+ Demucs must be installed to use. Official repo, https://github.com/facebookresearch/demucs.
243
+ demucs_options : dict, optional
244
+ Options to use for :func:`stable_whisper.audio.demucs_audio`.
245
+ only_voice_freq : bool, default False
246
+ Whether to only use sound between 200 - 5000 Hz, where the majority of human speech is.
247
+ sr : int, default None, meaning ``whisper.audio.SAMPLE_RATE``, 16kHz
248
+ The sample rate of ``audio``.
249
+ verbose : bool, default False
250
+ Whether to print yt-dlp log.
251
+ only_ffmpeg : bool, default False
252
+ Whether to use only FFmpeg (and not yt-dlp) for URLs.
253
+
254
+ Returns
255
+ -------
256
+ torch.Tensor
257
+ A mono waveform.
258
+ """
259
+ if not sr:
260
+ sr = SAMPLE_RATE
261
+ if isinstance(audio, (str, bytes)):
262
+ if demucs:
263
+ demucs_kwargs = dict(
264
+ audio=audio,
265
+ output_sr=sr,
266
+ verbose=verbose,
267
+ )
268
+ demucs_kwargs.update(demucs_options or {})
269
+ audio = demucs_audio(**demucs_kwargs)
270
+ else:
271
+ audio = torch.from_numpy(load_audio(audio, sr=sr, verbose=verbose, only_ffmpeg=only_ffmpeg))
272
+ else:
273
+ if isinstance(audio, np.ndarray):
274
+ audio = torch.from_numpy(audio)
275
+ if demucs:
276
+ demucs_kwargs = dict(
277
+ audio=audio,
278
+ input_sr=sr,
279
+ output_sr=sr,
280
+ verbose=verbose,
281
+ )
282
+ demucs_kwargs.update(demucs_options or {})
283
+ audio = demucs_audio(**demucs_kwargs)
284
+ if only_voice_freq:
285
+ audio = voice_freq_filter(audio, sr)
286
+
287
+ return audio
288
+
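A brief usage sketch for the helpers in this file; 'audio.mp3' is a placeholder path, and the Demucs call assumes the optional dependency is installed:

from stable_whisper.audio import load_audio, prep_audio

# Decode a local file or URL into a 16 kHz float32 mono waveform (NumPy) via FFmpeg,
# with yt-dlp handling non-direct URLs when available.
waveform = load_audio('audio.mp3')

# Same input handling, but returns a torch.Tensor and optionally isolates vocals
# with Demucs and band-passes to the 200-5000 Hz speech range.
clean = prep_audio('audio.mp3', demucs=True, only_voice_freq=True)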
stable_whisper/decode.py ADDED
@@ -0,0 +1,109 @@
1
+ from typing import TYPE_CHECKING, List, Union
2
+ from dataclasses import replace
3
+
4
+ import torch
5
+ import numpy as np
6
+
7
+ from whisper.decoding import DecodingTask, DecodingOptions, DecodingResult
8
+
9
+
10
+ if TYPE_CHECKING:
11
+ from whisper.model import Whisper
12
+
13
+
14
+ def _suppress_ts(ts_logits: torch.Tensor, ts_token_mask: torch.Tensor = None):
15
+ if ts_token_mask is not None:
16
+ ts_logits[:, ts_token_mask] = -np.inf
17
+
18
+
19
+ # modified version of whisper.decoding.DecodingTask
20
+ class DecodingTaskStable(DecodingTask):
21
+
22
+ def __init__(self, *args, **kwargs):
23
+ self.ts_token_mask: torch.Tensor = kwargs.pop('ts_token_mask', None)
24
+ self.audio_features: torch.Tensor = kwargs.pop('audio_features', None)
25
+ super(DecodingTaskStable, self).__init__(*args, **kwargs)
26
+
27
+ def _get_audio_features(self, mel: torch.Tensor):
28
+ if self.audio_features is None:
29
+ audio_features = super()._get_audio_features(mel)
30
+ self.audio_features = audio_features.detach().clone()
31
+ return audio_features
32
+ return self.audio_features.clone()
33
+
34
+ # modified version of whisper.DecodingTask._main_loop
35
+ def _main_loop(self, audio_features: torch.Tensor, tokens: torch.Tensor):
36
+ n_batch = tokens.shape[0]
37
+ sum_logprobs: torch.Tensor = torch.zeros(n_batch, device=audio_features.device)
38
+ no_speech_probs = [np.nan] * n_batch
39
+
40
+ try:
41
+ for i in range(self.sample_len):
42
+ logits = self.inference.logits(tokens, audio_features)
43
+
44
+ if i == 0 and self.tokenizer.no_speech is not None: # save no_speech_probs
45
+ probs_at_sot = logits[:, self.sot_index].float().softmax(dim=-1)
46
+ no_speech_probs = probs_at_sot[:, self.tokenizer.no_speech].tolist()
47
+
48
+ # now we need to consider the logits at the last token only
49
+ logits = logits[:, -1]
50
+
51
+ # apply the logit filters, e.g. for suppressing or applying penalty to
52
+ for logit_filter in self.logit_filters:
53
+ logit_filter.apply(logits, tokens)
54
+
55
+ # suppress timestamp tokens where the audio is silent so that decoder ignores those timestamps
56
+ _suppress_ts(logits[:, self.tokenizer.timestamp_begin:], self.ts_token_mask)
57
+
58
+ logits.nan_to_num_(-np.inf)
59
+ # expand the tokens tensor with the selected next tokens
60
+ tokens, completed = self.decoder.update(tokens, logits, sum_logprobs)
61
+
62
+ if completed or tokens.shape[-1] > self.n_ctx:
63
+ break
64
+ finally:
65
+ self.inference.cleanup_caching()
66
+
67
+ return tokens, sum_logprobs, no_speech_probs
68
+
69
+
70
+ # modified version of whisper.decoding.decode
71
+ @torch.no_grad()
72
+ def decode_stable(model: "Whisper",
73
+ mel: torch.Tensor,
74
+ options: DecodingOptions = DecodingOptions(),
75
+ ts_token_mask: torch.Tensor = None,
76
+ audio_features: torch.Tensor = None,
77
+ **kwargs, ) -> \
78
+ Union[DecodingResult, List[DecodingResult], tuple]:
79
+ """
80
+ Performs decoding of 30-second audio segment(s), provided as Mel spectrogram(s).
81
+
82
+ Parameters
83
+ ----------
84
+ model : whisper.model.Whisper
85
+ An instance of Whisper ASR model.
86
+ mel : torch.Tensor
87
+ A tensor containing the Mel spectrogram(s). ``mel.shape`` must be (80, 3000) or (*, 80, 3000).
88
+ options : whisper.decode.DecodingOptions, default whisper.decode.DecodingOptions()
89
+ A dataclass that contains all necessary options for decoding 30-second segments.
90
+ ts_token_mask : torch.Tensor, optional
91
+ Mask for suppressing timestamp token(s) during decoding.
92
+ audio_features : torch.Tensor, optional
93
+ Reused ``audio_features`` from the encoder for fallback.
94
+
95
+ Returns
96
+ -------
97
+ whisper.decode.DecodingResult or list of whisper.decode.DecodingResult
98
+ The result(s) of decoding contained in ``whisper.decode.DecodingResult`` dataclass instance(s).
99
+ """
100
+ if single := mel.ndim == 2:
101
+ mel = mel.unsqueeze(0)
102
+
103
+ if kwargs:
104
+ options = replace(options, **kwargs)
105
+
106
+ task = DecodingTaskStable(model, options, ts_token_mask=ts_token_mask, audio_features=audio_features)
107
+ result = task.run(mel)
108
+
109
+ return result[0] if single else result, task.audio_features
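A hedged sketch of calling decode_stable directly; it mirrors whisper.decode but also returns the encoder output so a retry can skip re-encoding:

import whisper
from whisper.audio import log_mel_spectrogram, pad_or_trim, N_FRAMES
from stable_whisper.decode import decode_stable

model = whisper.load_model('base')
mel = pad_or_trim(log_mel_spectrogram(whisper.load_audio('audio.mp3')), N_FRAMES).to(model.device)

# First pass encodes the mel segment and caches the audio features.
result, audio_features = decode_stable(model, mel, language='en')
# A higher-temperature retry reuses the cached features instead of re-encoding.
retry, _ = decode_stable(model, mel, language='en', temperature=0.2, audio_features=audio_features)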
stable_whisper/non_whisper.py ADDED
@@ -0,0 +1,348 @@
1
+ import os
2
+ import warnings
3
+ import io
4
+ import torch
5
+ import torchaudio
6
+ import numpy as np
7
+ from typing import Union, Callable, Optional
8
+
9
+ from .audio import load_audio
10
+ from .result import WhisperResult
11
+
12
+ AUDIO_TYPES = ('str', 'byte', 'torch', 'numpy')
13
+
14
+
15
+ def transcribe_any(
16
+ inference_func: Callable,
17
+ audio: Union[str, np.ndarray, torch.Tensor, bytes],
18
+ audio_type: str = None,
19
+ input_sr: int = None,
20
+ model_sr: int = None,
21
+ inference_kwargs: dict = None,
22
+ temp_file: str = None,
23
+ verbose: Optional[bool] = False,
24
+ regroup: Union[bool, str] = True,
25
+ suppress_silence: bool = True,
26
+ suppress_word_ts: bool = True,
27
+ q_levels: int = 20,
28
+ k_size: int = 5,
29
+ demucs: bool = False,
30
+ demucs_device: str = None,
31
+ demucs_output: str = None,
32
+ demucs_options: dict = None,
33
+ vad: bool = False,
34
+ vad_threshold: float = 0.35,
35
+ vad_onnx: bool = False,
36
+ min_word_dur: float = 0.1,
37
+ nonspeech_error: float = 0.3,
38
+ use_word_position: bool = True,
39
+ only_voice_freq: bool = False,
40
+ only_ffmpeg: bool = False,
41
+ force_order: bool = False,
42
+ check_sorted: bool = True
43
+ ) -> WhisperResult:
44
+ """
45
+ Transcribe ``audio`` using any ASR system.
46
+
47
+ Parameters
48
+ ----------
49
+ inference_func : Callable
50
+ Function that runs ASR when provided the [audio] and return data in the appropriate format.
51
+ For format examples see, https://github.com/jianfch/stable-ts/blob/main/examples/non-whisper.ipynb.
52
+ audio : str or numpy.ndarray or torch.Tensor or bytes
53
+ Path/URL to the audio file, the audio waveform, or bytes of audio file.
54
+ audio_type : {'str', 'byte', 'torch', 'numpy', None}, default None, meaning same type as ``audio``
55
+ The type that ``audio`` needs to be for ``inference_func``.
56
+ 'str' is a path to the file.
57
+ 'byte' is bytes (used for APIs or to avoid writing any data to hard drive).
58
+ 'torch' is an instance of :class:`torch.Tensor` containing the audio waveform, in float32 dtype, on CPU.
59
+ 'numpy' is an instance of :class:`numpy.ndarray` containing the audio waveform, in float32 dtype.
60
+ input_sr : int, default None, meaning auto-detected if ``audio`` is ``str`` or ``bytes``
61
+ The sample rate of ``audio``.
62
+ model_sr : int, default None, meaning same sample rate as ``input_sr``
63
+ The sample rate to resample the audio into for ``inference_func``.
64
+ inference_kwargs : dict, optional
65
+ Dictionary of arguments to pass into ``inference_func``.
66
+ temp_file : str, default './_temp_stable-ts_audio_.wav'
67
+ Temporary path for the preprocessed audio when ``audio_type = 'str'``.
68
+ verbose: bool, False
69
+ Whether to displays all the details during transcription, If ``False``, displays progressbar. If ``None``, does
70
+ not display anything.
71
+ regroup: str or bool, default True
72
+ String representation of a custom regrouping algorithm or ``True`` use to the default algorithm 'da'. Only
73
+ applies if ``word_timestamps = False``.
74
+ suppress_silence : bool, default True
75
+ Whether to enable timestamps adjustments based on the detected silence.
76
+ suppress_word_ts : bool, default True
77
+ Whether to adjust word timestamps based on the detected silence. Only enabled if ``suppress_silence = True``.
78
+ q_levels : int, default 20
79
+ Quantization levels for generating timestamp suppression mask; ignored if ``vad = true``.
80
+ Acts as a threshold to marking sound as silent.
81
+ Fewer levels will increase the threshold of volume at which to mark a sound as silent.
82
+ k_size : int, default 5
83
+ Kernel size for avg-pooling waveform to generate timestamp suppression mask; ignored if ``vad = true``.
84
+ Recommend 5 or 3; higher sizes will reduce detection of silence.
85
+ demucs : bool or torch.nn.Module, default False
86
+ Whether to preprocess ``audio`` with Demucs to isolate vocals / remove noise. Set ``demucs`` to an instance of
87
+ a Demucs model to avoid reloading the model for each run.
88
+ Demucs must be installed to use. Official repo, https://github.com/facebookresearch/demucs.
89
+ demucs_output : str, optional
90
+ Path to save the vocals isolated by Demucs as WAV file. Ignored if ``demucs = False``.
91
+ Demucs must be installed to use. Official repo, https://github.com/facebookresearch/demucs.
92
+ demucs_options : dict, optional
93
+ Options to use for :func:`stable_whisper.audio.demucs_audio`.
94
+ demucs_device : str, default None, meaning 'cuda' if cuda is available with ``torch`` else 'cpu'
95
+ Device to use for demucs.
96
+ vad : bool, default False
97
+ Whether to use Silero VAD to generate timestamp suppression mask.
98
+ Silero VAD requires PyTorch 1.12.0+. Official repo, https://github.com/snakers4/silero-vad.
99
+ vad_threshold : float, default 0.35
100
+ Threshold for detecting speech with Silero VAD. Low threshold reduces false positives for silence detection.
101
+ vad_onnx : bool, default False
102
+ Whether to use ONNX for Silero VAD.
103
+ min_word_dur : float, default 0.1
104
+ Shortest duration each word is allowed to reach for silence suppression.
105
+ nonspeech_error : float, default 0.3
106
+ Relative error of non-speech sections that appear in between a word for silence suppression.
107
+ use_word_position : bool, default True
108
+ Whether to use position of the word in its segment to determine whether to keep end or start timestamps if
109
+ adjustments are required. If it is the first word, keep end. Else if it is the last word, keep the start.
110
+ only_voice_freq : bool, default False
111
+ Whether to only use sound between 200 - 5000 Hz, where majority of human speech are.
112
+ only_ffmpeg : bool, default False
113
+ Whether to use only FFmpeg (instead of not yt-dlp) for URls
114
+ force_order : bool, default False
115
+ Whether to use adjacent timestamps to replace timestamps that are out of order. Use this parameter only if
116
+ the words/segments returned by ``inference_func`` are expected to be in chronological order.
117
+ check_sorted : bool, default True
118
+ Whether to raise an error when timestamps returned by ``inference_func`` are not in ascending order.
119
+
120
+ Returns
121
+ -------
122
+ stable_whisper.result.WhisperResult
123
+ All timestamps, words, probabilities, and other data from the transcription of ``audio``.
124
+
125
+ Notes
126
+ -----
127
+ For ``audio_type = 'str'``:
128
+ If ``audio`` is a file and no audio preprocessing is set, ``audio`` will be directly passed into
129
+ ``inference_func``.
130
+ If audio preprocessing is ``demucs`` or ``only_voice_freq``, the processed audio will be encoded into
131
+ ``temp_file`` and then passed into ``inference_func``.
132
+
133
+ For ``audio_type = 'byte'``:
134
+ If ``audio`` is file, the bytes of file will be passed into ``inference_func``.
135
+ If ``audio`` is :class:`torch.Tensor` or :class:`numpy.ndarray`, the bytes of the ``audio`` will be encoded
136
+ into WAV format then passed into ``inference_func``.
137
+
138
+ Resampling is only performed on ``audio`` when ``model_sr`` does not match the sample rate of the ``audio`` before
139
+ passing into ``inference_func`` due to ``input_sr`` not matching ``model_sr``, or sample rate changes due to
140
+ audio preprocessing from ``demucs = True``.
141
+ """
142
+ if demucs_options is None:
143
+ demucs_options = {}
144
+ if demucs_output:
145
+ if 'save_path' not in demucs_options:
146
+ demucs_options['save_path'] = demucs_output
147
+ warnings.warn('``demucs_output`` is deprecated. Use ``demucs_options`` with ``save_path`` instead. '
148
+ 'E.g. demucs_options=dict(save_path="demucs_output.mp3")',
149
+ DeprecationWarning, stacklevel=2)
150
+ if demucs_device:
151
+ if 'device' not in demucs_options:
152
+ demucs_options['device'] = demucs_device
153
+ warnings.warn('``demucs_device`` is deprecated. Use ``demucs_options`` with ``device`` instead. '
154
+ 'E.g. demucs_options=dict(device="cpu")',
155
+ DeprecationWarning, stacklevel=2)
156
+
157
+ if audio_type is not None and (audio_type := audio_type.lower()) not in AUDIO_TYPES:
158
+ raise NotImplementedError(f'[audio_type]={audio_type} is not supported. Types: {AUDIO_TYPES}')
159
+
160
+ if audio_type is None:
161
+ if isinstance(audio, str):
162
+ audio_type = 'str'
163
+ elif isinstance(audio, bytes):
164
+ audio_type = 'byte'
165
+ elif isinstance(audio, torch.Tensor):
166
+ audio_type = 'pytorch'
167
+ elif isinstance(audio, np.ndarray):
168
+ audio_type = 'numpy'
169
+ else:
170
+ raise TypeError(f'{type(audio)} is not supported for [audio].')
171
+
172
+ if (
173
+ input_sr is None and
174
+ isinstance(audio, (np.ndarray, torch.Tensor)) and
175
+ (demucs or only_voice_freq or suppress_silence or model_sr)
176
+ ):
177
+ raise ValueError('[input_sr] is required when [audio] is a PyTorch tensor or NumPy array.')
178
+
179
+ if (
180
+ model_sr is None and
181
+ isinstance(audio, (str, bytes)) and
182
+ audio_type in ('torch', 'numpy')
183
+ ):
184
+ raise ValueError('[model_sr] is required when [audio_type] is a "pytorch" or "numpy".')
185
+
186
+ if isinstance(audio, str):
187
+ from .audio import _load_file
188
+ audio = _load_file(audio, verbose=verbose, only_ffmpeg=only_ffmpeg)
189
+
190
+ if inference_kwargs is None:
191
+ inference_kwargs = {}
192
+
193
+ temp_file = os.path.abspath(temp_file or './_temp_stable-ts_audio_.wav')
194
+ temp_audio_file = None
195
+
196
+ curr_sr = input_sr
197
+
198
+ if demucs:
199
+ if demucs is True:
200
+ from .audio import load_demucs_model
201
+ demucs_model = load_demucs_model()
202
+ else:
203
+ demucs_model = demucs
204
+ demucs = True
205
+ else:
206
+ demucs_model = None
207
+
208
+ def get_input_sr():
209
+ nonlocal input_sr
210
+ if not input_sr and isinstance(audio, (str, bytes)):
211
+ from .audio import get_samplerate
212
+ input_sr = get_samplerate(audio)
213
+ return input_sr
214
+
215
+ if only_voice_freq:
216
+ from .audio import voice_freq_filter
217
+ if demucs_model is None:
218
+ curr_sr = model_sr or get_input_sr()
219
+ else:
220
+ curr_sr = demucs_model.samplerate
221
+ if model_sr is None:
222
+ model_sr = get_input_sr()
223
+ audio = load_audio(audio, sr=curr_sr, verbose=verbose, only_ffmpeg=only_ffmpeg)
224
+ audio = voice_freq_filter(audio, curr_sr)
225
+
226
+ if demucs:
227
+ from .audio import demucs_audio
228
+ if demucs_device is None:
229
+ demucs_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
230
+ demucs_kwargs = dict(
231
+ audio=audio,
232
+ input_sr=curr_sr,
233
+ model=demucs_model,
234
+ save_path=demucs_output,
235
+ device=demucs_device,
236
+ verbose=verbose
237
+ )
238
+ demucs_kwargs.update(demucs_options or {})
239
+ audio = demucs_audio(
240
+ **demucs_kwargs
241
+ )
242
+ curr_sr = demucs_model.samplerate
243
+ if demucs_output and audio_type == 'str':
244
+ audio = demucs_output
245
+
246
+ final_audio = audio
247
+
248
+ if model_sr is not None:
249
+
250
+ if curr_sr is None:
251
+ curr_sr = get_input_sr()
252
+
253
+ if curr_sr != model_sr:
254
+ if isinstance(final_audio, (str, bytes)):
255
+ final_audio = load_audio(
256
+ final_audio,
257
+ sr=model_sr,
258
+ verbose=verbose,
259
+ only_ffmpeg=only_ffmpeg
260
+ )
261
+ else:
262
+ if isinstance(final_audio, np.ndarray):
263
+ final_audio = torch.from_numpy(final_audio)
264
+ if isinstance(final_audio, torch.Tensor):
265
+ final_audio = torchaudio.functional.resample(
266
+ final_audio,
267
+ orig_freq=curr_sr,
268
+ new_freq=model_sr,
269
+ resampling_method="kaiser_window"
270
+ )
271
+
272
+ if audio_type in ('torch', 'numpy'):
273
+
274
+ if isinstance(final_audio, (str, bytes)):
275
+ final_audio = load_audio(
276
+ final_audio,
277
+ sr=model_sr,
278
+ verbose=verbose,
279
+ only_ffmpeg=only_ffmpeg
280
+ )
281
+
282
+ else:
283
+ if audio_type == 'torch':
284
+ if isinstance(final_audio, np.ndarray):
285
+ final_audio = torch.from_numpy(final_audio)
286
+ elif audio_type == 'numpy' and isinstance(final_audio, torch.Tensor):
287
+ final_audio = final_audio.cpu().numpy()
288
+
289
+ elif audio_type == 'str':
290
+
291
+ if isinstance(final_audio, (torch.Tensor, np.ndarray)):
292
+ if isinstance(final_audio, np.ndarray):
293
+ final_audio = torch.from_numpy(final_audio)
294
+ if final_audio.ndim < 2:
295
+ final_audio = final_audio[None]
296
+ torchaudio.save(temp_file, final_audio, model_sr)
297
+ final_audio = temp_audio_file = temp_file
298
+
299
+ elif isinstance(final_audio, bytes):
300
+ with open(temp_file, 'wb') as f:
301
+ f.write(final_audio)
302
+ final_audio = temp_audio_file = temp_file
303
+
304
+ else: # audio_type == 'byte'
305
+
306
+ if isinstance(final_audio, (torch.Tensor, np.ndarray)):
307
+ if isinstance(final_audio, np.ndarray):
308
+ final_audio = torch.from_numpy(final_audio)
309
+ if final_audio.ndim < 2:
310
+ final_audio = final_audio[None]
311
+ with io.BytesIO() as f:
312
+ torchaudio.save(f, final_audio, model_sr, format="wav")
313
+ f.seek(0)
314
+ final_audio = f.read()
315
+
316
+ elif isinstance(final_audio, str):
317
+ with open(final_audio, 'rb') as f:
318
+ final_audio = f.read()
319
+
320
+ inference_kwargs['audio'] = final_audio
321
+
322
+ result = None
323
+ try:
324
+ result = inference_func(**inference_kwargs)
325
+ if not isinstance(result, WhisperResult):
326
+ result = WhisperResult(result, force_order=force_order, check_sorted=check_sorted)
327
+ if suppress_silence:
328
+ result.adjust_by_silence(
329
+ audio, vad,
330
+ vad_onnx=vad_onnx, vad_threshold=vad_threshold,
331
+ q_levels=q_levels, k_size=k_size,
332
+ sample_rate=curr_sr, min_word_dur=min_word_dur,
333
+ word_level=suppress_word_ts, verbose=True,
334
+ nonspeech_error=nonspeech_error,
335
+ use_word_position=use_word_position
336
+ )
337
+
338
+ if result.has_words and regroup:
339
+ result.regroup(regroup)
340
+
341
+ finally:
342
+ if temp_audio_file is not None:
343
+ try:
344
+ os.unlink(temp_audio_file)
345
+ except Exception as e:
346
+ warnings.warn(f'Failed to remove temporary audio file {temp_audio_file}. {e}')
347
+
348
+ return result
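A hedged sketch of plugging a non-Whisper ASR into transcribe_any; my_asr is a hypothetical stand-in, and the accepted output formats are the ones documented in the linked non-whisper.ipynb:

from stable_whisper.non_whisper import transcribe_any

def inference(audio, **kwargs):
    # Hypothetical ASR call; must return timing data in an accepted format,
    # e.g. [dict(start=0.0, end=1.5, text='hello'), ...]
    return my_asr.transcribe(audio)

# Hand the ASR a file path and let Silero VAD refine the resulting timestamps.
result = transcribe_any(inference, 'audio.mp3', audio_type='str', vad=True)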
stable_whisper/quantization.py ADDED
@@ -0,0 +1,40 @@
1
+ import torch
2
+ from torch import nn
3
+ from whisper.model import Linear, Conv1d, LayerNorm, Whisper
4
+
5
+
6
+ def replace_modules(model: nn.Module, only_linear: bool = False):
7
+ """
8
+ Replace ``Linear``/``Conv1d``/``LayerNorm`` from :class:`whisper.model` with equivalent module in
9
+ :class:`torch.nn`.
10
+ """
11
+ for m in model.__dict__.get('_modules', []):
12
+ module = model.__getattr__(m)
13
+ update = True
14
+ if isinstance(module, Linear):
15
+ model.__setattr__(m, nn.Linear(module.in_features, module.out_features,
16
+ bias=module.bias is not None))
17
+ elif not only_linear and isinstance(module, Conv1d):
18
+ model.__setattr__(m, nn.Conv1d(module.in_channels, module.out_channels,
19
+ kernel_size=module.kernel_size,
20
+ stride=module.stride,
21
+ padding=module.padding,
22
+ bias=module.bias is not None))
23
+ elif not only_linear and isinstance(module, LayerNorm):
24
+ model.__setattr__(m, nn.LayerNorm(module.normalized_shape[0]))
25
+ else:
26
+ update = False
27
+ replace_modules(module)
28
+
29
+ if update:
30
+ model.__getattr__(m).load_state_dict(module.state_dict())
31
+
32
+
33
+ def ptdq_linear(model: "Whisper"):
34
+ """
35
+ Apply Dynamic Quantization to an instance of :class:`whisper.model.Whisper`.
36
+ """
37
+ model.cpu()
38
+ replace_modules(model, only_linear=True)
39
+ torch.quantization.quantize_dynamic(model, {nn.Linear}, dtype=torch.qint8, inplace=True)
40
+ setattr(model, 'dq', True)
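A minimal sketch of applying the dynamic quantization above to shrink a Whisper model for CPU inference:

import whisper
from stable_whisper.quantization import ptdq_linear

model = whisper.load_model('base')
ptdq_linear(model)  # moves the model to CPU and quantizes its Linear layers to int8
assert getattr(model, 'dq', False)  # flag set by ptdq_linear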
stable_whisper/result.py ADDED
@@ -0,0 +1,2281 @@
1
+ import warnings
2
+ import re
3
+ import torch
4
+ import numpy as np
5
+ from typing import Union, List, Tuple, Optional, Callable
6
+ from dataclasses import dataclass
7
+ from copy import deepcopy
8
+ from itertools import chain
9
+
10
+ from .stabilization import suppress_silence, get_vad_silence_func, mask2timing, wav2mask
11
+ from .text_output import *
12
+ from .utils import str_to_valid_type, format_timestamp, UnsortedException
13
+
14
+
15
+ __all__ = ['WhisperResult', 'Segment']
16
+
17
+
18
+ def _combine_attr(obj: object, other_obj: object, attr: str):
19
+ if (val := getattr(obj, attr)) is not None:
20
+ other_val = getattr(other_obj, attr)
21
+ if isinstance(val, list):
22
+ if other_val is None:
23
+ setattr(obj, attr, None)
24
+ else:
25
+ val.extend(other_val)
26
+ else:
27
+ new_val = None if other_val is None else ((val + other_val) / 2)
28
+ setattr(obj, attr, new_val)
29
+
30
+
31
+ def _increment_attr(obj: object, attr: str, val: Union[int, float]):
32
+ if (curr_val := getattr(obj, attr, None)) is not None:
33
+ setattr(obj, attr, curr_val + val)
34
+
35
+
36
+ @dataclass
37
+ class WordTiming:
38
+ word: str
39
+ start: float
40
+ end: float
41
+ probability: float = None
42
+ tokens: List[int] = None
43
+ left_locked: bool = False
44
+ right_locked: bool = False
45
+ segment_id: Optional[int] = None
46
+ id: Optional[int] = None
47
+
48
+ def __len__(self):
49
+ return len(self.word)
50
+
51
+ def __add__(self, other: 'WordTiming'):
52
+ self_copy = deepcopy(self)
53
+
54
+ self_copy.start = min(self_copy.start, other.start)
55
+ self_copy.end = max(other.end, self_copy.end)
56
+ self_copy.word += other.word
57
+ self_copy.left_locked = self_copy.left_locked or other.left_locked
58
+ self_copy.right_locked = self_copy.right_locked or other.right_locked
59
+ _combine_attr(self_copy, other, 'probability')
60
+ _combine_attr(self_copy, other, 'tokens')
61
+
62
+ return self_copy
63
+
64
+ def __deepcopy__(self, memo=None):
65
+ return self.copy()
66
+
67
+ def copy(self):
68
+ return WordTiming(
69
+ word=self.word,
70
+ start=self.start,
71
+ end=self.end,
72
+ probability=self.probability,
73
+ tokens=None if self.tokens is None else self.tokens.copy(),
74
+ left_locked=self.left_locked,
75
+ right_locked=self.right_locked,
76
+ segment_id=self.segment_id,
77
+ id=self.id
78
+ )
79
+
80
+ @property
81
+ def duration(self):
82
+ return round(self.end - self.start, 3)
83
+
84
+ def round_all_timestamps(self):
85
+ self.start = round(self.start, 3)
86
+ self.end = round(self.end, 3)
87
+
88
+ def offset_time(self, offset_seconds: float):
89
+ self.start = round(self.start + offset_seconds, 3)
90
+ self.end = round(self.end + offset_seconds, 3)
91
+
92
+ def to_dict(self):
93
+ dict_ = deepcopy(self).__dict__
94
+ dict_.pop('left_locked')
95
+ dict_.pop('right_locked')
96
+ return dict_
97
+
98
+ def lock_left(self):
99
+ self.left_locked = True
100
+
101
+ def lock_right(self):
102
+ self.right_locked = True
103
+
104
+ def lock_both(self):
105
+ self.lock_left()
106
+ self.lock_right()
107
+
108
+ def unlock_both(self):
109
+ self.left_locked = False
110
+ self.right_locked = False
111
+
112
+ def suppress_silence(self,
113
+ silent_starts: np.ndarray,
114
+ silent_ends: np.ndarray,
115
+ min_word_dur: float = 0.1,
116
+ nonspeech_error: float = 0.3,
117
+ keep_end: Optional[bool] = True):
118
+ suppress_silence(self, silent_starts, silent_ends, min_word_dur, nonspeech_error, keep_end)
119
+ return self
120
+
121
+ def rescale_time(self, scale_factor: float):
122
+ self.start = round(self.start * scale_factor, 3)
123
+ self.end = round(self.end * scale_factor, 3)
124
+
125
+ def clamp_max(self, max_dur: float, clip_start: bool = False, verbose: bool = False):
126
+ if self.duration > max_dur:
127
+ if clip_start:
128
+ new_start = round(self.end - max_dur, 3)
129
+ if verbose:
130
+ print(f'Start: {self.start} -> {new_start}\nEnd: {self.end}\nText: "{self.word}"\n')
131
+ self.start = new_start
132
+
133
+ else:
134
+ new_end = round(self.start + max_dur, 3)
135
+ if verbose:
136
+ print(f'Start: {self.start}\nEnd: {self.end} -> {new_end}\nText: "{self.word}"\n')
137
+ self.end = new_end
138
+
139
+ def set_segment(self, segment: 'Segment'):
140
+ self._segment = segment
141
+
142
+ def get_segment(self) -> Union['Segment', None]:
143
+ """
144
+ Return instance of :class:`stable_whisper.result.Segment` that this instance is a part of.
145
+ """
146
+ return getattr(self, '_segment', None)
147
+
148
+
149
+ def _words_by_lock(words: List[WordTiming], only_text: bool = False, include_single: bool = False):
150
+ """
151
+ Return a nested list of words such that each sublist contains words that are locked together.
152
+ """
153
+ all_words = []
154
+ for word in words:
155
+ if len(all_words) == 0 or not (all_words[-1][-1].right_locked or word.left_locked):
156
+ all_words.append([word])
157
+ else:
158
+ all_words[-1].append(word)
159
+ if only_text:
160
+ all_words = list(map(lambda ws: list(map(lambda w: w.word, ws)), all_words))
161
+ if not include_single:
162
+ all_words = [ws for ws in all_words if len(ws) > 1]
163
+ return all_words
164
+
165
+
166
+ @dataclass
167
+ class Segment:
168
+ start: float
169
+ end: float
170
+ text: str
171
+ seek: float = None
172
+ tokens: List[int] = None
173
+ temperature: float = None
174
+ avg_logprob: float = None
175
+ compression_ratio: float = None
176
+ no_speech_prob: float = None
177
+ words: Union[List[WordTiming], List[dict]] = None
178
+ ori_has_words: bool = None
179
+ id: int = None
180
+
181
+ def __getitem__(self, index: int) -> WordTiming:
182
+ if self.words is None:
183
+ raise ValueError('segment contains no words')
184
+ return self.words[index]
185
+
186
+ def __delitem__(self, index: int):
187
+ if self.words is None:
188
+ raise ValueError('segment contains no words')
189
+ del self.words[index]
190
+ self.reassign_ids()
191
+ self.update_seg_with_words()
192
+
193
+ def __deepcopy__(self, memo=None):
194
+ return self.copy()
195
+
196
+ def copy(self, new_words: Optional[List[WordTiming]] = None):
197
+ if new_words is None:
198
+ words = None if self.words is None else [w.copy() for w in self.words]
199
+ else:
200
+ words = [w.copy() for w in new_words]
201
+
202
+ new_seg = Segment(
203
+ start=self.start,
204
+ end=self.end,
205
+ text=self.text,
206
+ seek=self.seek,
207
+ tokens=self.tokens,
208
+ temperature=self.temperature,
209
+ avg_logprob=self.avg_logprob,
210
+ compression_ratio=self.compression_ratio,
211
+ no_speech_prob=self.no_speech_prob,
212
+ words=words,
213
+ id=self.id
214
+ )
215
+ new_seg.update_seg_with_words()
216
+ return new_seg
217
+
218
+ def to_display_str(self, only_segment: bool = False):
219
+ line = f'[{format_timestamp(self.start)} --> {format_timestamp(self.end)}] "{self.text}"'
220
+ if self.has_words and not only_segment:
221
+ line += '\n' + '\n'.join(
222
+ f"-[{format_timestamp(w.start)}] -> [{format_timestamp(w.end)}] \"{w.word}\"" for w in self.words
223
+ ) + '\n'
224
+ return line
225
+
226
+ @property
227
+ def has_words(self):
228
+ return bool(self.words)
229
+
230
+ @property
231
+ def duration(self):
232
+ return self.end - self.start
233
+
234
+ def word_count(self):
235
+ if self.has_words:
236
+ return len(self.words)
237
+ return -1
238
+
239
+ def char_count(self):
240
+ if self.has_words:
241
+ return sum(len(w) for w in self.words)
242
+ return len(self.text)
243
+
244
+ def __post_init__(self):
245
+ if self.has_words:
246
+ self.words: List[WordTiming] = \
247
+ [WordTiming(**word) if isinstance(word, dict) else word for word in self.words]
248
+ for w in self.words:
249
+ w.set_segment(self)
250
+ if self.ori_has_words is None:
251
+ self.ori_has_words = self.has_words
252
+ self.round_all_timestamps()
253
+
254
+ def __add__(self, other: 'Segment'):
255
+ self_copy = deepcopy(self)
256
+
257
+ self_copy.start = min(self_copy.start, other.start)
258
+ self_copy.end = max(other.end, self_copy.end)
259
+ self_copy.text += other.text
260
+
261
+ _combine_attr(self_copy, other, 'tokens')
262
+ _combine_attr(self_copy, other, 'temperature')
263
+ _combine_attr(self_copy, other, 'avg_logprob')
264
+ _combine_attr(self_copy, other, 'compression_ratio')
265
+ _combine_attr(self_copy, other, 'no_speech_prob')
266
+ if self_copy.has_words:
267
+ if other.has_words:
268
+ self_copy.words.extend(other.words)
269
+ else:
270
+ self_copy.words = None
271
+
272
+ return self_copy
273
+
274
+ def _word_operations(self, operation: str, *args, **kwargs):
275
+ if self.has_words:
276
+ for w in self.words:
277
+ getattr(w, operation)(*args, **kwargs)
278
+
279
+ def round_all_timestamps(self):
280
+ self.start = round(self.start, 3)
281
+ self.end = round(self.end, 3)
282
+ if self.has_words:
283
+ for word in self.words:
284
+ word.round_all_timestamps()
285
+
286
+ def offset_time(self, offset_seconds: float):
287
+ self.start = round(self.start + offset_seconds, 3)
288
+ self.end = round(self.end + offset_seconds, 3)
289
+ _increment_attr(self, 'seek', offset_seconds)
290
+ self._word_operations('offset_time', offset_seconds)
291
+
292
+ def add_words(self, index0: int, index1: int, inplace: bool = False):
293
+ if self.has_words:
294
+ new_word = self.words[index0] + self.words[index1]
295
+ if inplace:
296
+ i0, i1 = sorted([index0, index1])
297
+ self.words[i0] = new_word
298
+ del self.words[i1]
299
+ return new_word
300
+
301
+ def rescale_time(self, scale_factor: float):
302
+ self.start = round(self.start * scale_factor, 3)
303
+ self.end = round(self.end * scale_factor, 3)
304
+ if self.seek is not None:
305
+ self.seek = round(self.seek * scale_factor, 3)
306
+ self._word_operations('rescale_time', scale_factor)
307
+ self.update_seg_with_words()
308
+
309
+ def apply_min_dur(self, min_dur: float, inplace: bool = False):
310
+ """
311
+ Merge any word with an adjacent word if its duration is less than ``min_dur``.
312
+ """
313
+ segment = self if inplace else deepcopy(self)
314
+ if not self.has_words:
315
+ return segment
316
+ max_i = len(segment.words) - 1
317
+ if max_i == 0:
318
+ return segment
319
+ for i in reversed(range(len(segment.words))):
320
+ if max_i == 0:
321
+ break
322
+ if segment.words[i].duration < min_dur:
323
+ if i == max_i:
324
+ segment.add_words(i-1, i, inplace=True)
325
+ elif i == 0:
326
+ segment.add_words(i, i+1, inplace=True)
327
+ else:
328
+ if segment.words[i+1].duration < segment.words[i-1].duration:
329
+ segment.add_words(i-1, i, inplace=True)
330
+ else:
331
+ segment.add_words(i, i+1, inplace=True)
332
+ max_i -= 1
333
+ return segment
334
+
335
+ def _to_reverse_text(
336
+ self,
337
+ prepend_punctuations: str = None,
338
+ append_punctuations: str = None
339
+ ):
340
+ """
341
+ Return a copy with the word order reversed per segment.
342
+ """
343
+ if prepend_punctuations is None:
344
+ prepend_punctuations = "\"'“¿([{-"
345
+ if prepend_punctuations and ' ' not in prepend_punctuations:
346
+ prepend_punctuations += ' '
347
+ if append_punctuations is None:
348
+ append_punctuations = "\"'.。,,!!??::”)]}、"
349
+ self_copy = deepcopy(self)
350
+ has_prepend = bool(prepend_punctuations)
351
+ has_append = bool(append_punctuations)
352
+ if has_prepend or has_append:
353
+ word_objs = (
354
+ self_copy.words
355
+ if self_copy.has_words else
356
+ [WordTiming(w, 0, 1, 0) for w in self_copy.text.split(' ')]
357
+ )
358
+ for word in word_objs:
359
+ new_append = ''
360
+ if has_prepend:
361
+ for _ in range(len(word)):
362
+ char = word.word[0]
363
+ if char in prepend_punctuations:
364
+ new_append += char
365
+ word.word = word.word[1:]
366
+ else:
367
+ break
368
+ new_prepend = ''
369
+ if has_append:
370
+ for _ in range(len(word)):
371
+ char = word.word[-1]
372
+ if char in append_punctuations:
373
+ new_prepend += char
374
+ word.word = word.word[:-1]
375
+ else:
376
+ break
377
+ word.word = f'{new_prepend}{word.word}{new_append[::-1]}'
378
+ self_copy.text = ''.join(w.word for w in reversed(word_objs))
379
+
380
+ return self_copy
381
+
382
+ def to_dict(self, reverse_text: Union[bool, tuple] = False):
383
+ if reverse_text:
384
+ seg_dict = (
385
+ (self._to_reverse_text(*reverse_text)
386
+ if isinstance(reverse_text, tuple) else
387
+ self._to_reverse_text()).__dict__
388
+ )
389
+ else:
390
+ seg_dict = deepcopy(self).__dict__
391
+ seg_dict.pop('ori_has_words')
392
+ if self.has_words:
393
+ seg_dict['words'] = [w.to_dict() for w in seg_dict['words']]
394
+ elif self.ori_has_words:
395
+ seg_dict['words'] = []
396
+ else:
397
+ seg_dict.pop('words')
398
+ if self.id is None:
399
+ seg_dict.pop('id')
400
+ if reverse_text:
401
+ seg_dict['reversed_text'] = True
402
+ return seg_dict
403
+
404
+ def words_by_lock(self, only_text: bool = True, include_single: bool = False):
405
+ return _words_by_lock(self.words, only_text=only_text, include_single=include_single)
406
+
407
+ @property
408
+ def left_locked(self):
409
+ if self.has_words:
410
+ return self.words[0].left_locked
411
+ return False
412
+
413
+ @property
414
+ def right_locked(self):
415
+ if self.has_words:
416
+ return self.words[-1].right_locked
417
+ return False
418
+
419
+ def lock_left(self):
420
+ if self.has_words:
421
+ self.words[0].lock_left()
422
+
423
+ def lock_right(self):
424
+ if self.has_words:
425
+ self.words[-1].lock_right()
426
+
427
+ def lock_both(self):
428
+ self.lock_left()
429
+ self.lock_right()
430
+
431
+ def unlock_all_words(self):
432
+ self._word_operations('unlock_both')
433
+
434
+ def reassign_ids(self):
435
+ if self.has_words:
436
+ for i, w in enumerate(self.words):
437
+ w.segment_id = self.id
438
+ w.id = i
439
+
440
+ def update_seg_with_words(self):
441
+ if self.has_words:
442
+ self.start = self.words[0].start
443
+ self.end = self.words[-1].end
444
+ self.text = ''.join(w.word for w in self.words)
445
+ self.tokens = (
446
+ None
447
+ if any(w.tokens is None for w in self.words) else
448
+ [t for w in self.words for t in w.tokens]
449
+ )
450
+ for w in self.words:
451
+ w.set_segment(self)
452
+
453
+ def suppress_silence(self,
454
+ silent_starts: np.ndarray,
455
+ silent_ends: np.ndarray,
456
+ min_word_dur: float = 0.1,
457
+ word_level: bool = True,
458
+ nonspeech_error: float = 0.3,
459
+ use_word_position: bool = True):
460
+ if self.has_words:
461
+ words = self.words if word_level or len(self.words) == 1 else [self.words[0], self.words[-1]]
462
+ for i, w in enumerate(words, 1):
463
+ if use_word_position:
464
+ keep_end = True if i == 1 else (False if i == len(words) else None)
465
+ else:
466
+ keep_end = None
467
+ w.suppress_silence(silent_starts, silent_ends, min_word_dur, nonspeech_error, keep_end)
468
+ self.update_seg_with_words()
469
+ else:
470
+ suppress_silence(self,
471
+ silent_starts,
472
+ silent_ends,
473
+ min_word_dur,
474
+ nonspeech_error)
475
+
476
+ return self
477
+
478
+ def get_locked_indices(self):
479
+ locked_indices = [i
480
+ for i, (left, right) in enumerate(zip(self.words[1:], self.words[:-1]))
481
+ if left.left_locked or right.right_locked]
482
+ return locked_indices
483
+
484
+ def get_gaps(self, as_ndarray=False):
485
+ if self.has_words:
486
+ s_ts = np.array([w.start for w in self.words])
487
+ e_ts = np.array([w.end for w in self.words])
488
+ gap = s_ts[1:] - e_ts[:-1]
489
+ return gap if as_ndarray else gap.tolist()
490
+ return []
491
+
492
+ def get_gap_indices(self, max_gap: float = 0.1): # for splitting
493
+ if not self.has_words or len(self.words) < 2:
494
+ return []
495
+ if max_gap is None:
496
+ max_gap = 0
497
+ indices = (self.get_gaps(True) > max_gap).nonzero()[0].tolist()
498
+ return sorted(set(indices) - set(self.get_locked_indices()))
499
+
500
+ def get_punctuation_indices(self, punctuation: Union[List[str], List[Tuple[str, str]], str]): # for splitting
501
+ if not self.has_words or len(self.words) < 2:
502
+ return []
503
+ if isinstance(punctuation, str):
504
+ punctuation = [punctuation]
505
+ indices = []
506
+ for p in punctuation:
507
+ if isinstance(p, str):
508
+ for i, s in enumerate(self.words[:-1]):
509
+ if s.word.endswith(p):
510
+ indices.append(i)
511
+ elif i != 0 and s.word.startswith(p):
512
+ indices.append(i-1)
513
+ else:
514
+ ending, beginning = p
515
+ indices.extend([i for i, (w0, w1) in enumerate(zip(self.words[:-1], self.words[1:]))
516
+ if w0.word.endswith(ending) and w1.word.startswith(beginning)])
517
+
518
+ return sorted(set(indices) - set(self.get_locked_indices()))
519
+
520
+ def get_length_indices(self, max_chars: int = None, max_words: int = None, even_split: bool = True,
521
+ include_lock: bool = False):
522
+ # for splitting
523
+ if not self.has_words or (max_chars is None and max_words is None):
524
+ return []
525
+ assert max_chars != 0 and max_words != 0, \
526
+ f'max_chars and max_words must be greater than 0, but got {max_chars} and {max_words}'
527
+ if len(self.words) < 2:
528
+ return []
529
+ indices = []
530
+ if even_split:
531
+ char_count = -1 if max_chars is None else sum(map(len, self.words))
532
+ word_count = -1 if max_words is None else len(self.words)
533
+ exceed_chars = max_chars is not None and char_count > max_chars
534
+ exceed_words = max_words is not None and word_count > max_words
535
+ if exceed_chars:
536
+ splits = np.ceil(char_count / max_chars)
537
+ chars_per_split = char_count / splits
538
+ cum_char_count = np.cumsum([len(w.word) for w in self.words[:-1]])
539
+ indices = [
540
+ (np.abs(cum_char_count-(i*chars_per_split))).argmin()
541
+ for i in range(1, int(splits))
542
+ ]
543
+ if max_words is not None:
544
+ exceed_words = any(j-i+1 > max_words for i, j in zip([0]+indices, indices+[len(self.words)]))
545
+
546
+ if exceed_words:
547
+ splits = np.ceil(word_count / max_words)
548
+ words_per_split = word_count / splits
549
+ cum_word_count = np.array(range(1, len(self.words)+1))
550
+ indices = [
551
+ np.abs(cum_word_count-(i*words_per_split)).argmin()
552
+ for i in range(1, int(splits))
553
+ ]
554
+
555
+ else:
556
+ curr_words = 0
557
+ curr_chars = 0
558
+ locked_indices = []
559
+ if include_lock:
560
+ locked_indices = self.get_locked_indices()
561
+ for i, word in enumerate(self.words):
562
+ curr_words += 1
563
+ curr_chars += len(word)
564
+ if i != 0:
565
+ if (
566
+ max_chars is not None and curr_chars > max_chars
567
+ or
568
+ max_words is not None and curr_words > max_words
569
+ ) and i-1 not in locked_indices:
570
+ indices.append(i-1)
571
+ curr_words = 1
572
+ curr_chars = len(word)
573
+ return indices
574
+
575
+ def get_duration_indices(self, max_dur: float, even_split: bool = True, include_lock: bool = False):
576
+ if not self.has_words or (total_duration := np.sum([w.duration for w in self.words])) <= max_dur:
577
+ return []
578
+ if even_split:
579
+ splits = np.ceil(total_duration / max_dur)
580
+ dur_per_split = total_duration / splits
581
+ cum_dur = np.cumsum([w.duration for w in self.words[:-1]])
582
+ indices = [
583
+ (np.abs(cum_dur - (i * dur_per_split))).argmin()
584
+ for i in range(1, int(splits))
585
+ ]
586
+ else:
587
+ indices = []
588
+ curr_total_dur = 0.0
589
+ locked_indices = self.get_locked_indices() if include_lock else []
590
+ for i, word in enumerate(self.words):
591
+ curr_total_dur += word.duration
592
+ if i != 0:
593
+ if curr_total_dur > max_dur and i - 1 not in locked_indices:
594
+ indices.append(i - 1)
595
+ curr_total_dur = word.duration
596
+ return indices
597
+
598
+ def split(self, indices: List[int]):
599
+ if len(indices) == 0:
600
+ return []
601
+ if indices[-1] != len(self.words) - 1:
602
+ indices.append(len(self.words) - 1)
603
+ seg_copies = []
604
+ prev_i = 0
605
+ for i in indices:
606
+ i += 1
607
+ c = deepcopy(self)
608
+ c.words = c.words[prev_i:i]
609
+ c.update_seg_with_words()
610
+ seg_copies.append(c)
611
+ prev_i = i
612
+ return seg_copies
613
+
614
+ def set_result(self, result: 'WhisperResult'):
615
+ self._result = result
616
+
617
+ def get_result(self) -> Union['WhisperResult', None]:
618
+ """
619
+ Return outer instance of :class:`stable_whisper.result.WhisperResult` that ``self`` is a part of.
620
+ """
621
+ return getattr(self, '_result', None)
622
+
623
+
624
+ class WhisperResult:
625
+
626
+ def __init__(
627
+ self,
628
+ result: Union[str, dict, list],
629
+ force_order: bool = False,
630
+ check_sorted: Union[bool, str] = True,
631
+ show_unsorted: bool = True
632
+ ):
633
+ result, self.path = self._standardize_result(result)
634
+ self.ori_dict = result.get('ori_dict') or result
635
+ self.language = self.ori_dict.get('language')
636
+ self._regroup_history = result.get('regroup_history', '')
637
+ self._nonspeech_sections = result.get('nonspeech_sections', [])
638
+ segments = deepcopy(result.get('segments', self.ori_dict.get('segments')))
639
+ self.segments: List[Segment] = [Segment(**s) for s in segments] if segments else []
640
+ self._forced_order = force_order
641
+ if self._forced_order:
642
+ self.force_order()
643
+ self.raise_for_unsorted(check_sorted, show_unsorted)
644
+ self.remove_no_word_segments(any(seg.has_words for seg in self.segments))
645
+ self.update_all_segs_with_words()
646
+
647
+ def __getitem__(self, index: int) -> Segment:
648
+ return self.segments[index]
649
+
650
+ def __delitem__(self, index: int):
651
+ del self.segments[index]
652
+ self.reassign_ids(True)
653
+
654
+ @staticmethod
655
+ def _standardize_result(result: Union[str, dict, list]):
656
+ path = None
657
+ if isinstance(result, str):
658
+ path = result
659
+ result = load_result(path)
660
+ if isinstance(result, list):
661
+ if isinstance(result[0], list):
662
+ if not isinstance(result[0][0], dict):
663
+ raise NotImplementedError(f'Got list of list of {type(result[0][0])} but expects list of list of dict')
664
+ result = dict(
665
+ segments=[
666
+ dict(
667
+ start=words[0]['start'],
668
+ end=words[-1]['end'],
669
+ text=''.join(w['word'] for w in words),
670
+ words=words
671
+ )
672
+ for words in result
673
+ ]
674
+ )
675
+
676
+ elif isinstance(result[0], dict):
677
+ result = dict(segments=result)
678
+ else:
679
+ raise NotImplementedError(f'Got list of {type(result[0])} but expects list of list/dict')
680
+ return result, path
681
+
682
+ def force_order(self):
683
+ prev_ts_end = 0
684
+ timestamps = self.all_words_or_segments()
685
+ for i, ts in enumerate(timestamps, 1):
686
+ if ts.start < prev_ts_end:
687
+ ts.start = prev_ts_end
688
+ if ts.start > ts.end:
689
+ if prev_ts_end > ts.end:
690
+ warnings.warn('Multiple consecutive timestamps are out of order. Some parts will have no duration.')
691
+ ts.start = ts.end
692
+ for j in range(i-2, -1, -1):
693
+ if timestamps[j].end > ts.end:
694
+ timestamps[j].end = ts.end
695
+ if timestamps[j].start > ts.end:
696
+ timestamps[j].start = ts.end
697
+ else:
698
+ if ts.start != prev_ts_end:
699
+ ts.start = prev_ts_end
700
+ else:
701
+ ts.end = ts.start if i == len(timestamps) else timestamps[i].start
702
+ prev_ts_end = ts.end
703
+ if self.has_words:
704
+ self.update_all_segs_with_words()
705
+
706
+ def raise_for_unsorted(self, check_sorted: Union[bool, str] = True, show_unsorted: bool = True):
707
+ if check_sorted is False:
708
+ return
709
+ all_parts = self.all_words_or_segments()
710
+ has_words = self.has_words
711
+ timestamps = np.array(list(chain.from_iterable((p.start, p.end) for p in all_parts)))
712
+ if len(timestamps) > 1 and (unsorted_mask := timestamps[:-1] > timestamps[1:]).any():
713
+ if show_unsorted:
714
+ def get_part_info(idx):
715
+ curr_part = all_parts[idx]
716
+ seg_id = curr_part.segment_id if has_words else curr_part.id
717
+ word_id_str = f'Word ID: {curr_part.id}\n' if has_words else ''
718
+ return (
719
+ f'Segment ID: {seg_id}\n{word_id_str}'
720
+ f'Start: {curr_part.start}\nEnd: {curr_part.end}\n'
721
+ f'Text: "{curr_part.word if has_words else curr_part.text}"'
722
+ ), curr_part.start, curr_part.end
723
+
724
+ for i, unsorted in enumerate(unsorted_mask, 2):
725
+ if unsorted:
726
+ word_id = i//2-1
727
+ part_info, start, end = get_part_info(word_id)
728
+ if i % 2 == 1:
729
+ next_info, next_start, _ = get_part_info(word_id+1)
730
+ part_info += f'\nConflict: end ({end}) > next start ({next_start})\n{next_info}'
731
+ else:
732
+ part_info += f'\nConflict: start ({start}) > end ({end})'
733
+ print(part_info, end='\n\n')
734
+
735
+ data = self.to_dict()
736
+ if check_sorted is True:
737
+ raise UnsortedException(data=data)
738
+ warnings.warn('Timestamps are not in ascending order. '
739
+ 'If data is produced by Stable-ts, please submit an issue with the saved data.')
740
+ save_as_json(data, check_sorted)
741
+
742
+ def update_all_segs_with_words(self):
743
+ for seg in self.segments:
744
+ seg.update_seg_with_words()
745
+ seg.set_result(self)
746
+
747
+ def update_nonspeech_sections(self, silent_starts, silent_ends):
748
+ self._nonspeech_sections = [dict(start=s, end=e) for s, e in zip(silent_starts, silent_ends)]
749
+
750
+ def add_segments(self, index0: int, index1: int, inplace: bool = False, lock: bool = False):
751
+ new_seg = self.segments[index0] + self.segments[index1]
752
+ new_seg.update_seg_with_words()
753
+ if lock and self.segments[index0].has_words:
754
+ lock_idx = len(self.segments[index0].words)
755
+ new_seg.words[lock_idx - 1].lock_right()
756
+ if lock_idx < len(new_seg.words):
757
+ new_seg.words[lock_idx].lock_left()
758
+ if inplace:
759
+ i0, i1 = sorted([index0, index1])
760
+ self.segments[i0] = new_seg
761
+ del self.segments[i1]
762
+ return new_seg
763
+
764
+ def rescale_time(self, scale_factor: float):
765
+ for s in self.segments:
766
+ s.rescale_time(scale_factor)
767
+
768
+ def apply_min_dur(self, min_dur: float, inplace: bool = False):
769
+ """
770
+ Merge any word/segment with adjacent word/segment if its duration is less than ``min_dur``.
771
+ """
772
+ result = self if inplace else deepcopy(self)
773
+ max_i = len(result.segments) - 1
774
+ if max_i == 0:
775
+ return result
776
+ for i in reversed(range(len(result.segments))):
777
+ if max_i == 0:
778
+ break
779
+ if result.segments[i].duration < min_dur:
780
+ if i == max_i:
781
+ result.add_segments(i-1, i, inplace=True)
782
+ elif i == 0:
783
+ result.add_segments(i, i+1, inplace=True)
784
+ else:
785
+ if result.segments[i+1].duration < result.segments[i-1].duration:
786
+ result.add_segments(i-1, i, inplace=True)
787
+ else:
788
+ result.add_segments(i, i+1, inplace=True)
789
+ max_i -= 1
790
+ result.reassign_ids()
791
+ for s in result.segments:
792
+ s.apply_min_dur(min_dur, inplace=True)
793
+ return result
794
+
795
+ def offset_time(self, offset_seconds: float):
796
+ for s in self.segments:
797
+ s.offset_time(offset_seconds)
798
+
799
+ def suppress_silence(
800
+ self,
801
+ silent_starts: np.ndarray,
802
+ silent_ends: np.ndarray,
803
+ min_word_dur: float = 0.1,
804
+ word_level: bool = True,
805
+ nonspeech_error: float = 0.3,
806
+ use_word_position: bool = True
807
+ ) -> "WhisperResult":
808
+ """
809
+ Move any start/end timestamps in silence parts of audio to the boundaries of the silence.
810
+
811
+ Parameters
812
+ ----------
813
+ silent_starts : numpy.ndarray
+ An array of start timestamps of silent sections of audio.
+ silent_ends : numpy.ndarray
+ An array of end timestamps of silent sections of audio.
+ min_word_dur : float, default 0.1
+ Shortest duration each word is allowed to reach for adjustments.
+ word_level : bool, default True
+ Whether to apply the adjustments to word-level timestamps.
821
+ nonspeech_error : float, default 0.3
822
+ Relative error of non-speech sections that appear in between a word for adjustments.
823
+ use_word_position : bool, default True
824
+ Whether to use position of the word in its segment to determine whether to keep end or start timestamps if
825
+ adjustments are required. If it is the first word, keep end. Else if it is the last word, keep the start.
826
+
827
+ Returns
828
+ -------
829
+ stable_whisper.result.WhisperResult
830
+ The current instance after the changes.
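+
+ Examples
+ --------
+ A minimal doctest-style sketch; ``result`` is a hypothetical instance of this class and the
+ arrays are placeholders for the output of an actual silence/VAD detector:
+
+ >>> import numpy as np
+ >>> result.suppress_silence(np.array([0.0, 5.2]), np.array([1.5, 6.0]), min_word_dur=0.1)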
831
+ """
832
+ for s in self.segments:
833
+ s.suppress_silence(
834
+ silent_starts,
835
+ silent_ends,
836
+ min_word_dur,
837
+ word_level=word_level,
838
+ nonspeech_error=nonspeech_error,
839
+ use_word_position=use_word_position
840
+ )
841
+
842
+ return self
843
+
844
+ def adjust_by_silence(
845
+ self,
846
+ audio: Union[torch.Tensor, np.ndarray, str, bytes],
847
+ vad: bool = False,
848
+ *,
849
+ verbose: (bool, None) = False,
850
+ sample_rate: int = None,
851
+ vad_onnx: bool = False,
852
+ vad_threshold: float = 0.35,
853
+ q_levels: int = 20,
854
+ k_size: int = 5,
855
+ min_word_dur: float = 0.1,
856
+ word_level: bool = True,
857
+ nonspeech_error: float = 0.3,
858
+ use_word_position: bool = True
+ ) -> "WhisperResult":
861
+ """
862
+ Adjust timestamps based on detected speech gaps.
+
+ This method combines :meth:`stable_whisper.result.WhisperResult.suppress_silence` with silence detection.
865
+
866
+ Parameters
867
+ ----------
868
+ audio : str or numpy.ndarray or torch.Tensor or bytes
869
+ Path/URL to the audio file, the audio waveform, or bytes of audio file.
870
+ vad : bool, default False
871
+ Whether to use Silero VAD to generate timestamp suppression mask.
872
+ Silero VAD requires PyTorch 1.12.0+. Official repo, https://github.com/snakers4/silero-vad.
873
+ verbose : bool or None, default False
874
+ If ``False``, mute messages about hitting local caches. Note that the message about first download cannot be
875
+ muted. Only applies if ``vad = True``.
876
+ sample_rate : int, default None, meaning ``whisper.audio.SAMPLE_RATE``, 16kHz
+ The sample rate of ``audio``.
+ vad_onnx : bool, default False
+ Whether to use ONNX for Silero VAD.
+ vad_threshold : float, default 0.35
+ Threshold for detecting speech with Silero VAD. Low threshold reduces false positives for silence detection.
+ q_levels : int, default 20
+ Quantization levels for generating timestamp suppression mask; ignored if ``vad = True``.
+ Acts as a threshold for marking sound as silent.
+ Fewer levels will increase the threshold of volume at which to mark a sound as silent.
+ k_size : int, default 5
+ Kernel size for avg-pooling waveform to generate timestamp suppression mask; ignored if ``vad = True``.
+ Recommended 5 or 3; higher sizes will reduce detection of silence.
+ min_word_dur : float, default 0.1
+ Shortest duration each word is allowed to reach for adjustments.
+ word_level : bool, default True
+ Whether to apply the adjustments to word-level timestamps.
893
+ nonspeech_error : float, default 0.3
894
+ Relative error of non-speech sections that appear in between a word for adjustments.
895
+ use_word_position : bool, default True
896
+ Whether to use position of the word in its segment to determine whether to keep end or start timestamps if
897
+ adjustments are required. If it is the first word, keep end. Else if it is the last word, keep the start.
898
+
899
+ Returns
900
+ -------
901
+ stable_whisper.result.WhisperResult
902
+ The current instance after the changes.
903
+
904
+ Notes
905
+ -----
906
+ This operation is already performed by :func:`stable_whisper.whisper_word_level.transcribe_stable` /
907
+ :func:`stable_whisper.whisper_word_level.transcribe_minimal`/
908
+ :func:`stable_whisper.non_whisper.transcribe_any` / :func:`stable_whisper.alignment.align`
909
+ if ``suppress_silence = True``.
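+
+ Examples
+ --------
+ A minimal sketch; ``result`` is a hypothetical instance of this class and ``'audio.mp3'`` a placeholder path:
+
+ >>> result.adjust_by_silence('audio.mp3', vad=True, vad_threshold=0.35)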
910
+ """
911
+ if vad:
912
+ silent_timings = get_vad_silence_func(
913
+ onnx=vad_onnx,
914
+ verbose=verbose
915
+ )(audio, speech_threshold=vad_threshold, sr=sample_rate)
916
+ else:
917
+ silent_timings = mask2timing(
918
+ wav2mask(audio, q_levels=q_levels, k_size=k_size, sr=sample_rate)
919
+ )
920
+ if silent_timings is None:
921
+ return self
922
+ self.suppress_silence(
923
+ *silent_timings,
924
+ min_word_dur=min_word_dur,
925
+ word_level=word_level,
926
+ nonspeech_error=nonspeech_error,
927
+ use_word_position=use_word_position
928
+ )
929
+ self.update_nonspeech_sections(*silent_timings)
930
+ return self
931
+
932
+ def adjust_by_result(
933
+ self,
934
+ other_result: "WhisperResult",
935
+ min_word_dur: float = 0.1,
936
+ verbose: bool = False
937
+ ):
938
+ """
939
+ Minimize the duration of words using timestamps of another result.
940
+
941
+ Parameters
942
+ ----------
943
+ other_result : "WhisperResult"
944
+ Timing data of the same words in a WhisperResult instance.
945
+ min_word_dur : float, default 0.1
946
+ Prevent changes to timestamps if the resultant word duration is less than ``min_word_dur``.
947
+ verbose : bool, default False
948
+ Whether to print out the timestamp changes.
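+
+ Examples
+ --------
+ A minimal sketch; ``other_result`` is a hypothetical result that contains the same words as ``result``:
+
+ >>> result.adjust_by_result(other_result, min_word_dur=0.1, verbose=True)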
949
+ """
950
+ if not (self.has_words and other_result.has_words):
951
+ raise NotImplementedError('This operation can only be performed on results with word timestamps')
952
+ assert [w.word for w in self.all_words()] == [w.word for w in other_result.all_words()], \
953
+ 'The words in [other_result] do not match the current words.'
954
+ for word, other_word in zip(self.all_words(), other_result.all_words()):
955
+ if word.end > other_word.start:
956
+ new_start = max(word.start, other_word.start)
957
+ new_end = min(word.end, other_word.end)
958
+ if new_end - new_start >= min_word_dur:
959
+ line = ''
960
+ if word.start != new_start:
961
+ if verbose:
962
+ line += f'[Start:{word.start:.3f}->{new_start:.3f}] '
963
+ word.start = new_start
964
+ if word.end != new_end:
965
+ if verbose:
966
+ line += f'[End:{word.end:.3f}->{new_end:.3f}] '
967
+ word.end = new_end
968
+ if line:
969
+ print(f'{line}"{word.word}"')
970
+ self.update_all_segs_with_words()
971
+
972
+ def reassign_ids(self, only_segments: bool = False):
973
+ for i, s in enumerate(self.segments):
974
+ s.id = i
975
+ if not only_segments:
976
+ s.reassign_ids()
977
+
978
+ def remove_no_word_segments(self, ignore_ori=False):
979
+ for i in reversed(range(len(self.segments))):
980
+ if (ignore_ori or self.segments[i].ori_has_words) and not self.segments[i].has_words:
981
+ del self.segments[i]
982
+ self.reassign_ids()
983
+
984
+ def get_locked_indices(self):
985
+ locked_indices = [i
986
+ for i, (left, right) in enumerate(zip(self.segments[:-1], self.segments[1:]))
+ if left.right_locked or right.left_locked]
988
+ return locked_indices
989
+
990
+ def get_gaps(self, as_ndarray=False):
991
+ s_ts = np.array([s.start for s in self.segments])
992
+ e_ts = np.array([s.end for s in self.segments])
993
+ gap = s_ts[1:] - e_ts[:-1]
994
+ return gap if as_ndarray else gap.tolist()
995
+
996
+ def get_gap_indices(self, min_gap: float = 0.1): # for merging
997
+ if len(self.segments) < 2:
998
+ return []
999
+ if min_gap is None:
1000
+ min_gap = 0
1001
+ indices = (self.get_gaps(True) <= min_gap).nonzero()[0].tolist()
1002
+ return sorted(set(indices) - set(self.get_locked_indices()))
1003
+
1004
+ def get_punctuation_indices(self, punctuation: Union[List[str], List[Tuple[str, str]], str]): # for merging
1005
+ if len(self.segments) < 2:
1006
+ return []
1007
+ if isinstance(punctuation, str):
1008
+ punctuation = [punctuation]
1009
+ indices = []
1010
+ for p in punctuation:
1011
+ if isinstance(p, str):
1012
+ for i, s in enumerate(self.segments[:-1]):
1013
+ if s.text.endswith(p):
1014
+ indices.append(i)
1015
+ elif i != 0 and s.text.startswith(p):
1016
+ indices.append(i-1)
1017
+ else:
1018
+ ending, beginning = p
1019
+ indices.extend([i for i, (s0, s1) in enumerate(zip(self.segments[:-1], self.segments[1:]))
1020
+ if s0.text.endswith(ending) and s1.text.startswith(beginning)])
1021
+
1022
+ return sorted(set(indices) - set(self.get_locked_indices()))
1023
+
1024
+ def all_words(self):
1025
+ return list(chain.from_iterable(s.words for s in self.segments))
1026
+
1027
+ def all_words_or_segments(self):
1028
+ return self.all_words() if self.has_words else self.segments
1029
+
1030
+ def all_words_by_lock(self, only_text: bool = True, by_segment: bool = False, include_single: bool = False):
1031
+ if by_segment:
1032
+ return [
1033
+ segment.words_by_lock(only_text=only_text, include_single=include_single)
1034
+ for segment in self.segments
1035
+ ]
1036
+ return _words_by_lock(self.all_words(), only_text=only_text, include_single=include_single)
1037
+
1038
+ def all_tokens(self):
1039
+ return list(chain.from_iterable(s.tokens for s in self.all_words()))
1040
+
1041
+ def to_dict(self):
1042
+ return dict(text=self.text,
1043
+ segments=self.segments_to_dicts(),
1044
+ language=self.language,
1045
+ ori_dict=self.ori_dict,
1046
+ regroup_history=self._regroup_history,
1047
+ nonspeech_sections=self._nonspeech_sections)
1048
+
1049
+ def segments_to_dicts(self, reverse_text: Union[bool, tuple] = False):
1050
+ return [s.to_dict(reverse_text=reverse_text) for s in self.segments]
1051
+
1052
+ def _split_segments(self, get_indices, args: list = None, *, lock: bool = False, newline: bool = False):
1053
+ if args is None:
1054
+ args = []
1055
+ no_words = False
1056
+ for i in reversed(range(0, len(self.segments))):
1057
+ no_words = no_words or not self.segments[i].has_words
1058
+ indices = sorted(set(get_indices(self.segments[i], *args)))
1059
+ if not indices:
1060
+ continue
1061
+ if newline:
1062
+ if indices[-1] == len(self.segments[i].words) - 1:
1063
+ del indices[-1]
1064
+ if not indices:
1065
+ continue
1066
+
1067
+ for word_idx in indices:
1068
+ if self.segments[i].words[word_idx].word.endswith('\n'):
1069
+ continue
1070
+ self.segments[i].words[word_idx].word += '\n'
1071
+ if lock:
1072
+ self.segments[i].words[word_idx].lock_right()
1073
+ if word_idx + 1 < len(self.segments[i].words):
1074
+ self.segments[i].words[word_idx+1].lock_left()
1075
+ self.segments[i].update_seg_with_words()
1076
+ else:
1077
+ new_segments = self.segments[i].split(indices)
1078
+ if lock:
1079
+ for s in new_segments:
1080
+ if s == new_segments[0]:
1081
+ s.lock_right()
1082
+ elif s == new_segments[-1]:
1083
+ s.lock_left()
1084
+ else:
1085
+ s.lock_both()
1086
+ del self.segments[i]
1087
+ for s in reversed(new_segments):
1088
+ self.segments.insert(i, s)
1089
+ if no_words:
1090
+ warnings.warn('Found segment(s) without word timings. These segment(s) cannot be split.')
1091
+ self.remove_no_word_segments()
1092
+
1093
+ def _merge_segments(self, indices: List[int],
1094
+ *, max_words: int = None, max_chars: int = None, is_sum_max: bool = False, lock: bool = False):
1095
+ if len(indices) == 0:
1096
+ return
1097
+ for i in reversed(indices):
1098
+ seg = self.segments[i]
1099
+ if (
1100
+ (
1101
+ max_words and
1102
+ seg.has_words and
1103
+ (
1104
+ (seg.word_count() + self.segments[i + 1].word_count() > max_words)
1105
+ if is_sum_max else
1106
+ (seg.word_count() > max_words and self.segments[i + 1].word_count() > max_words)
1107
+ )
1108
+ ) or
1109
+ (
1110
+ max_chars and
1111
+ (
1112
+ (seg.char_count() + self.segments[i + 1].char_count() > max_chars)
1113
+ if is_sum_max else
1114
+ (seg.char_count() > max_chars and self.segments[i + 1].char_count() > max_chars)
1115
+ )
1116
+ )
1117
+ ):
1118
+ continue
1119
+ self.add_segments(i, i + 1, inplace=True, lock=lock)
1120
+ self.remove_no_word_segments()
1121
+
1122
+ def get_content_by_time(
1123
+ self,
1124
+ time: Union[float, Tuple[float, float], dict],
1125
+ within: bool = False,
1126
+ segment_level: bool = False
1127
+ ) -> Union[List[WordTiming], List[Segment]]:
1128
+ """
1129
+ Return content in the ``time`` range.
1130
+
1131
+ Parameters
1132
+ ----------
1133
+ time : float or tuple of (float, float) or dict
1134
+ Range of time to find content. For tuple of two floats, first value is the start time and second value is
1135
+ the end time. For a single float value, it is treated as both the start and end time.
1136
+ within : bool, default False
1137
+ Whether to only return content that is fully within the ``time`` range.
1138
+ segment_level : bool, default False
1139
+ Whether to look only on the segment level and return instances of :class:`stable_whisper.result.Segment`
1140
+ instead of :class:`stable_whisper.result.WordTiming`.
1141
+
1142
+ Returns
1143
+ -------
1144
+ list of stable_whisper.result.WordTiming or list of stable_whisper.result.Segment
1145
+ List of contents in the ``time`` range. The contents are instances of
1146
+ :class:`stable_whisper.result.Segment` if ``segment_level = True`` else
1147
+ :class:`stable_whisper.result.WordTiming`.
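+
+ Examples
+ --------
+ A minimal sketch with an arbitrary time range; ``result`` is a hypothetical instance of this class:
+
+ >>> words = result.get_content_by_time((1.5, 4.0))
+ >>> segments = result.get_content_by_time((1.5, 4.0), segment_level=True)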
1148
+ """
1149
+ if not segment_level and not self.has_words:
1150
+ raise ValueError('Missing word timestamps in result. Use ``segment_level=True`` instead.')
1151
+ contents = self.segments if segment_level else self.all_words()
1152
+ if isinstance(time, (float, int)):
1153
+ time = [time, time]
1154
+ elif isinstance(time, dict):
1155
+ time = [time['start'], time['end']]
1156
+ start, end = time
1157
+
1158
+ if within:
1159
+ def is_in_range(c):
1160
+ return start <= c.start and end >= c.end
1161
+ else:
1162
+ def is_in_range(c):
1163
+ return start <= c.end and end >= c.start
1164
+
1165
+ return [c for c in contents if is_in_range(c)]
1166
+
1167
+ def split_by_gap(
1168
+ self,
1169
+ max_gap: float = 0.1,
1170
+ lock: bool = False,
1171
+ newline: bool = False
1172
+ ) -> "WhisperResult":
1173
+ """
1174
+ Split (in-place) any segment where the gap between two of its words is greater than ``max_gap``.
1175
+
1176
+ Parameters
1177
+ ----------
1178
+ max_gap : float, default 0.1
1179
+ Maximum second(s) allowed between two words of the same segment.
1180
+ lock : bool, default False
1181
+ Whether to prevent future splits/merges from altering changes made by this method.
1182
+ newline: bool, default False
1183
+ Whether to insert line break at the split points instead of splitting into separate segments.
1184
+
1185
+ Returns
1186
+ -------
1187
+ stable_whisper.result.WhisperResult
1188
+ The current instance after the changes.
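+
+ Examples
+ --------
+ A minimal sketch; ``result`` is a hypothetical instance of this class:
+
+ >>> result.split_by_gap(max_gap=0.5)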
1189
+ """
1190
+ self._split_segments(lambda x: x.get_gap_indices(max_gap), lock=lock, newline=newline)
1191
+ if self._regroup_history:
1192
+ self._regroup_history += '_'
1193
+ self._regroup_history += f'sg={max_gap}+{int(lock)}+{int(newline)}'
1194
+ return self
1195
+
1196
+ def merge_by_gap(
1197
+ self,
1198
+ min_gap: float = 0.1,
1199
+ max_words: int = None,
1200
+ max_chars: int = None,
1201
+ is_sum_max: bool = False,
1202
+ lock: bool = False
1203
+ ) -> "WhisperResult":
1204
+ """
1205
+ Merge (in-place) any pair of adjacent segments if the gap between them <= ``min_gap``.
1206
+
1207
+ Parameters
1208
+ ----------
1209
+ min_gap : float, default 0.1
1210
+ Minimum second(s) allowed between two segments.
1211
+ max_words : int, optional
1212
+ Maximum number of words allowed in each segment.
1213
+ max_chars : int, optional
1214
+ Maximum number of characters allowed in each segment.
1215
+ is_sum_max : bool, default False
1216
+ Whether ``max_words`` and ``max_chars`` are applied to the merged segment instead of the individual segments
1217
+ to be merged.
1218
+ lock : bool, default False
1219
+ Whether to prevent future splits/merges from altering changes made by this method.
1220
+
1221
+ Returns
1222
+ -------
1223
+ stable_whisper.result.WhisperResult
1224
+ The current instance after the changes.
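+
+ Examples
+ --------
+ A minimal sketch with arbitrary thresholds:
+
+ >>> result.merge_by_gap(min_gap=0.3, max_words=10)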
1225
+ """
1226
+ indices = self.get_gap_indices(min_gap)
1227
+ self._merge_segments(indices,
1228
+ max_words=max_words, max_chars=max_chars, is_sum_max=is_sum_max, lock=lock)
1229
+ if self._regroup_history:
1230
+ self._regroup_history += '_'
1231
+ self._regroup_history += f'mg={min_gap}+{max_words or ""}+{max_chars or ""}+{int(is_sum_max)}+{int(lock)}'
1232
+ return self
1233
+
1234
+ def split_by_punctuation(
1235
+ self,
1236
+ punctuation: Union[List[str], List[Tuple[str, str]], str],
1237
+ lock: bool = False,
1238
+ newline: bool = False,
1239
+ min_words: Optional[int] = None,
1240
+ min_chars: Optional[int] = None,
1241
+ min_dur: Optional[int] = None
1242
+ ) -> "WhisperResult":
1243
+ """
1244
+ Split (in-place) segments at words that start/end with ``punctuation``.
1245
+
1246
+ Parameters
1247
+ ----------
1248
+ punctuation : list of str or list of tuple of (str, str) or str
1249
+ Punctuation(s) to split segments by.
1250
+ lock : bool, default False
1251
+ Whether to prevent future splits/merges from altering changes made by this method.
1252
+ newline : bool, default False
1253
+ Whether to insert line break at the split points instead of splitting into separate segments.
1254
+ min_words : int, optional
+ Only split segments with at least ``min_words`` words.
+ min_chars : int, optional
+ Only split segments with at least ``min_chars`` characters.
+ min_dur : int, optional
+ Only split segments with duration (in seconds) of at least ``min_dur``.
1260
+
1261
+ Returns
1262
+ -------
1263
+ stable_whisper.result.WhisperResult
1264
+ The current instance after the changes.
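+
+ Examples
+ --------
+ A minimal sketch; the tuple ``('.', ' ')`` splits between a period and a following space:
+
+ >>> result.split_by_punctuation([('.', ' '), '。', '?', '?'])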
1265
+ """
1266
+ def _over_max(x: Segment):
1267
+ return (
1268
+ (min_words and len(x.words) >= min_words) or
1269
+ (min_chars and x.char_count() >= min_chars) or
1270
+ (min_dur and x.duration >= min_dur)
1271
+ )
1272
+
1273
+ indices = set(s.id for s in self.segments if _over_max(s)) if any((min_words, min_chars, min_dur)) else None
1274
+
1275
+ def _get_indices(x: Segment):
1276
+ return x.get_punctuation_indices(punctuation) if indices is None or x.id in indices else []
1277
+
1278
+ self._split_segments(_get_indices, lock=lock, newline=newline)
1279
+ if self._regroup_history:
1280
+ self._regroup_history += '_'
1281
+ punct_str = '/'.join(p if isinstance(p, str) else '*'.join(p) for p in punctuation)
1282
+ self._regroup_history += f'sp={punct_str}+{int(lock)}+{int(newline)}'
1283
+ self._regroup_history += f'+{min_words or ""}+{min_chars or ""}+{min_dur or ""}'.rstrip('+')
1284
+ return self
1285
+
1286
+ def merge_by_punctuation(
1287
+ self,
1288
+ punctuation: Union[List[str], List[Tuple[str, str]], str],
1289
+ max_words: int = None,
1290
+ max_chars: int = None,
1291
+ is_sum_max: bool = False,
1292
+ lock: bool = False
1293
+ ) -> "WhisperResult":
1294
+ """
1295
+ Merge (in-place) any two segments that have the specified punctuation(s) in between.
1296
+
1297
+ Parameters
1298
+ ----------
1299
+ punctuation : list of str or list of tuple of (str, str) or str
1300
+ Punctuation(s) to merge segments by.
1301
+ max_words : int, optional
1302
+ Maximum number of words allowed in each segment.
1303
+ max_chars : int, optional
1304
+ Maximum number of characters allowed in each segment.
1305
+ is_sum_max : bool, default False
1306
+ Whether ``max_words`` and ``max_chars`` are applied to the merged segment instead of the individual segments
1307
+ to be merged.
1308
+ lock : bool, default False
1309
+ Whether to prevent future splits/merges from altering changes made by this method.
1310
+
1311
+ Returns
1312
+ -------
1313
+ stable_whisper.result.WhisperResult
1314
+ The current instance after the changes.
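+
+ Examples
+ --------
+ A minimal sketch with arbitrary limits:
+
+ >>> result.merge_by_punctuation([(',', ' ')], max_chars=60)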
1315
+ """
1316
+ indices = self.get_punctuation_indices(punctuation)
1317
+ self._merge_segments(indices,
1318
+ max_words=max_words, max_chars=max_chars, is_sum_max=is_sum_max, lock=lock)
1319
+ if self._regroup_history:
1320
+ self._regroup_history += '_'
1321
+ punct_str = '/'.join(p if isinstance(p, str) else '*'.join(p) for p in punctuation)
1322
+ self._regroup_history += f'mp={punct_str}+{max_words or ""}+{max_chars or ""}+{int(is_sum_max)}+{int(lock)}'
1323
+ return self
1324
+
1325
+ def merge_all_segments(self) -> "WhisperResult":
1326
+ """
1327
+ Merge all segments into one segment.
1328
+
1329
+ Returns
1330
+ -------
1331
+ stable_whisper.result.WhisperResult
1332
+ The current instance after the changes.
1333
+ """
1334
+ if not self.segments:
1335
+ return self
1336
+ if self.has_words:
1337
+ self.segments[0].words = self.all_words()
1338
+ else:
1339
+ self.segments[0].text += ''.join(s.text for s in self.segments[1:])
1340
+ if all(s.tokens is not None for s in self.segments):
1341
+ self.segments[0].tokens += list(chain.from_iterable(s.tokens for s in self.segments[1:]))
1342
+ self.segments[0].end = self.segments[-1].end
1343
+ self.segments = [self.segments[0]]
1344
+ self.reassign_ids()
1345
+ self.update_all_segs_with_words()
1346
+ if self._regroup_history:
1347
+ self._regroup_history += '_'
1348
+ self._regroup_history += 'ms'
1349
+ return self
1350
+
1351
+ def split_by_length(
1352
+ self,
1353
+ max_chars: int = None,
1354
+ max_words: int = None,
1355
+ even_split: bool = True,
1356
+ force_len: bool = False,
1357
+ lock: bool = False,
1358
+ include_lock: bool = False,
1359
+ newline: bool = False
1360
+ ) -> "WhisperResult":
1361
+ """
1362
+ Split (in-place) any segment that exceeds ``max_chars`` or ``max_words`` into smaller segments.
1363
+
1364
+ Parameters
1365
+ ----------
1366
+ max_chars : int, optional
1367
+ Maximum number of characters allowed in each segment.
1368
+ max_words : int, optional
1369
+ Maximum number of words allowed in each segment.
1370
+ even_split : bool, default True
1371
+ Whether to evenly split a segment in length if it exceeds ``max_chars`` or ``max_words``.
1372
+ force_len : bool, default False
1373
+ Whether to force a constant length for each segment except the last segment.
1374
+ This will ignore all previous non-locked segment boundaries.
1375
+ lock : bool, default False
1376
+ Whether to prevent future splits/merges from altering changes made by this method.
1377
+ include_lock: bool, default False
1378
+ Whether to include previous lock before splitting based on max_words, if ``even_split = False``.
1379
+ Splitting will be done after the first non-locked word > ``max_chars`` / ``max_words``.
1380
+ newline: bool, default False
1381
+ Whether to insert line break at the split points instead of splitting into separate segments.
1382
+
1383
+ Returns
1384
+ -------
1385
+ stable_whisper.result.WhisperResult
1386
+ The current instance after the changes.
1387
+
1388
+ Notes
1389
+ -----
1390
+ If ``even_split = True``, segments can still exceed ``max_chars`` and locked words will be ignored to avoid
1391
+ uneven splitting.
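+
+ Examples
+ --------
+ A minimal sketch with arbitrary, roughly subtitle-sized limits:
+
+ >>> result.split_by_length(max_chars=42, max_words=12)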
1392
+ """
1393
+ if force_len:
1394
+ self.merge_all_segments()
1395
+ self._split_segments(
1396
+ lambda x: x.get_length_indices(
1397
+ max_chars=max_chars,
1398
+ max_words=max_words,
1399
+ even_split=even_split,
1400
+ include_lock=include_lock
1401
+ ),
1402
+ lock=lock,
1403
+ newline=newline
1404
+ )
1405
+ if self._regroup_history:
1406
+ self._regroup_history += '_'
1407
+ self._regroup_history += (f'sl={max_chars or ""}+{max_words or ""}+{int(even_split)}+{int(force_len)}'
1408
+ f'+{int(lock)}+{int(include_lock)}+{int(newline)}')
1409
+ return self
1410
+
1411
+ def split_by_duration(
1412
+ self,
1413
+ max_dur: float,
1414
+ even_split: bool = True,
1415
+ force_len: bool = False,
1416
+ lock: bool = False,
1417
+ include_lock: bool = False,
1418
+ newline: bool = False
1419
+ ) -> "WhisperResult":
1420
+ """
1421
+ Split (in-place) any segment that exceeds ``max_dur`` into smaller segments.
1422
+
1423
+ Parameters
1424
+ ----------
1425
+ max_dur : float
1426
+ Maximum duration (in seconds) per segment.
1427
+ even_split : bool, default True
1428
+ Whether to evenly split a segment in length if it exceeds ``max_dur``.
1429
+ force_len : bool, default False
1430
+ Whether to force a constant length for each segment except the last segment.
1431
+ This will ignore all previous non-locked segment boundaries.
1432
+ lock : bool, default False
1433
+ Whether to prevent future splits/merges from altering changes made by this method.
1434
+ include_lock: bool, default False
1435
+ Whether to include previous lock before splitting based on ``max_dur``, if ``even_split = False``.
1436
+ Splitting will be done after the first non-locked word > ``max_dur``.
1437
+ newline: bool, default False
1438
+ Whether to insert line break at the split points instead of splitting into separate segments.
1439
+
1440
+ Returns
1441
+ -------
1442
+ stable_whisper.result.WhisperResult
1443
+ The current instance after the changes.
1444
+
1445
+ Notes
1446
+ -----
1447
+ If ``even_split = True``, segments can still exceed ``max_dur`` and locked words will be ignored to avoid
1448
+ uneven splitting.
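+
+ Examples
+ --------
+ A minimal sketch; caps each segment at an arbitrary 7 seconds:
+
+ >>> result.split_by_duration(max_dur=7.0)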
1449
+ """
1450
+ if force_len:
1451
+ self.merge_all_segments()
1452
+ self._split_segments(
1453
+ lambda x: x.get_duration_indices(
1454
+ max_dur=max_dur,
1455
+ even_split=even_split,
1456
+ include_lock=include_lock
1457
+ ),
1458
+ lock=lock,
1459
+ newline=newline
1460
+ )
1461
+ if self._regroup_history:
1462
+ self._regroup_history += '_'
1463
+ self._regroup_history += (f'sd={max_dur}+{int(even_split)}+{int(force_len)}'
1464
+ f'+{int(lock)}+{int(include_lock)}+{int(newline)}')
1465
+ return self
1466
+
1467
+ def clamp_max(
1468
+ self,
1469
+ medium_factor: float = 2.5,
1470
+ max_dur: float = None,
1471
+ clip_start: Optional[bool] = None,
1472
+ verbose: bool = False
1473
+ ) -> "WhisperResult":
1474
+ """
1475
+ Clamp all word durations above a certain value.
1476
+
1477
+ This is most effective when applied before and after other regroup operations.
1478
+
1479
+ Parameters
1480
+ ----------
1481
+ medium_factor : float, default 2.5
1482
+ Clamp durations above (``medium_factor`` * medium duration) per segment.
1483
+ If ``medium_factor = None/0`` or the segment has fewer than 3 words, it will be ignored and only ``max_dur`` will be used.
1484
+ max_dur : float, optional
1485
+ Clamp durations above ``max_dur``.
1486
+ clip_start : bool or None, default None
1487
+ Whether to clamp the start of a word. If ``None``, clamp the start of first word and end of last word per
1488
+ segment.
1489
+ verbose : bool, default False
1490
+ Whether to print out the timestamp changes.
1491
+
1492
+ Returns
1493
+ -------
1494
+ stable_whisper.result.WhisperResult
1495
+ The current instance after the changes.
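+
+ Examples
+ --------
+ A minimal sketch with arbitrary values:
+
+ >>> result.clamp_max(medium_factor=2.5, max_dur=3.0)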
1496
+ """
1497
+ if not (medium_factor or max_dur):
1498
+ raise ValueError('At least one of following arguments requires non-zero value: medium_factor; max_dur')
1499
+
1500
+ if not self.has_words:
1501
+ warnings.warn('Cannot clamp due to missing/no word-timestamps')
1502
+ return self
1503
+
1504
+ for seg in self.segments:
1505
+ curr_max_dur = None
1506
+ if medium_factor and len(seg.words) > 2:
1507
+ durations = np.array([word.duration for word in seg.words])
1508
+ durations.sort()
1509
+ curr_max_dur = medium_factor * durations[len(durations)//2 + 1]
1510
+
1511
+ if max_dur and (not curr_max_dur or curr_max_dur > max_dur):
1512
+ curr_max_dur = max_dur
1513
+
1514
+ if not curr_max_dur:
1515
+ continue
1516
+
1517
+ if clip_start is None:
1518
+ seg.words[0].clamp_max(curr_max_dur, clip_start=True, verbose=verbose)
1519
+ seg.words[-1].clamp_max(curr_max_dur, clip_start=False, verbose=verbose)
1520
+ else:
1521
+ for word in seg.words:
1522
+ word.clamp_max(curr_max_dur, clip_start=clip_start, verbose=verbose)
1523
+
1524
+ seg.update_seg_with_words()
1525
+ if self._regroup_history:
1526
+ self._regroup_history += '_'
1527
+ self._regroup_history += f'cm={medium_factor}+{max_dur or ""}+{clip_start or ""}+{int(verbose)}'
1528
+ return self
1529
+
1530
+ def lock(
1531
+ self,
1532
+ startswith: Union[str, List[str]] = None,
1533
+ endswith: Union[str, List[str]] = None,
1534
+ right: bool = True,
1535
+ left: bool = False,
1536
+ case_sensitive: bool = False,
1537
+ strip: bool = True
1538
+ ) -> "WhisperResult":
1539
+ """
1540
+ Lock words/segments with matching prefix/suffix to prevent splitting/merging.
1541
+
1542
+ Parameters
1543
+ ----------
1544
+ startswith: str or list of str
1545
+ Prefixes to lock.
1546
+ endswith: str or list of str
1547
+ Suffixes to lock.
1548
+ right : bool, default True
1549
+ Whether prevent splits/merges with the next word/segment.
1550
+ left : bool, default False
1551
+ Whether prevent splits/merges with the previous word/segment.
1552
+ case_sensitive : bool, default False
1553
+ Whether to match the case of the prefixes/suffixes with the words/segments.
1554
+ strip : bool, default True
1555
+ Whether to ignore spaces before and after both words/segments and prefixes/suffixes.
1556
+
1557
+ Returns
1558
+ -------
1559
+ stable_whisper.result.WhisperResult
1560
+ The current instance after the changes.
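+
+ Examples
+ --------
+ A minimal sketch; locks any word ending with a comma to prevent a split from the following word:
+
+ >>> result.lock(endswith=',', right=True)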
1561
+ """
1562
+ assert startswith or endswith, 'Must specify [startswith] or/and [endswith].'
1563
+ startswith = [] if startswith is None else ([startswith] if isinstance(startswith, str) else startswith)
1564
+ endswith = [] if endswith is None else ([endswith] if isinstance(endswith, str) else endswith)
1565
+ if not case_sensitive:
1566
+ startswith = [t.lower() for t in startswith]
1567
+ endswith = [t.lower() for t in endswith]
1568
+ if strip:
1569
+ startswith = [t.strip() for t in startswith]
1570
+ endswith = [t.strip() for t in endswith]
1571
+ for part in self.all_words_or_segments():
1572
+ text = part.word if hasattr(part, 'word') else part.text
1573
+ if not case_sensitive:
1574
+ text = text.lower()
1575
+ if strip:
1576
+ text = text.strip()
1577
+ for prefix in startswith:
1578
+ if text.startswith(prefix):
1579
+ if right:
1580
+ part.lock_right()
1581
+ if left:
1582
+ part.lock_left()
1583
+ for suffix in endswith:
1584
+ if text.endswith(suffix):
1585
+ if right:
1586
+ part.lock_right()
1587
+ if left:
1588
+ part.lock_left()
1589
+ if self._regroup_history:
1590
+ self._regroup_history += '_'
1591
+ startswith_str = (startswith if isinstance(startswith, str) else '/'.join(startswith)) if startswith else ""
1592
+ endswith_str = (endswith if isinstance(endswith, str) else '/'.join(endswith)) if endswith else ""
1593
+ self._regroup_history += (f'l={startswith_str}+{endswith_str}'
1594
+ f'+{int(right)}+{int(left)}+{int(case_sensitive)}+{int(strip)}')
1595
+ return self
1596
+
1597
+ def remove_word(
1598
+ self,
1599
+ word: Union[WordTiming, Tuple[int, int]],
1600
+ reassign_ids: bool = True,
1601
+ verbose: bool = True
1602
+ ) -> 'WhisperResult':
1603
+ """
1604
+ Remove a word.
1605
+
1606
+ Parameters
1607
+ ----------
1608
+ word : WordTiming or tuple of (int, int)
1609
+ Instance of :class:`stable_whisper.result.WordTiming` or tuple of (segment index, word index).
1610
+ reassign_ids : bool, default True
1611
+ Whether to reassign segment and word ids (indices) after removing ``word``.
1612
+ verbose : bool, default True
1613
+ Whether to print detail of the removed word.
1614
+
1615
+ Returns
1616
+ -------
1617
+ stable_whisper.result.WhisperResult
1618
+ The current instance after the changes.
1619
+ """
1620
+ if isinstance(word, WordTiming):
1621
+ if self[word.segment_id][word.id] is not word:
1622
+ self.reassign_ids()
1623
+ if self[word.segment_id][word.id] is not word:
1624
+ raise ValueError('word not in result')
1625
+ seg_id, word_id = word.segment_id, word.id
1626
+ else:
1627
+ seg_id, word_id = word
1628
+ if verbose:
1629
+ print(f'Removed: {self[seg_id][word_id].to_dict()}')
1630
+ del self.segments[seg_id].words[word_id]
1631
+ if not reassign_ids:
1632
+ return self
1633
+ if self[seg_id].has_words:
1634
+ self[seg_id].reassign_ids()
1635
+ else:
1636
+ self.remove_no_word_segments()
1637
+ return self
1638
+
1639
+ def remove_segment(
1640
+ self,
1641
+ segment: Union[Segment, int],
1642
+ reassign_ids: bool = True,
1643
+ verbose: bool = True
1644
+ ) -> 'WhisperResult':
1645
+ """
1646
+ Remove a segment.
1647
+
1648
+ Parameters
1649
+ ----------
1650
+ segment : Segment or int
1651
+ Instance of :class:`stable_whisper.result.Segment` or segment index.
1652
+ reassign_ids : bool, default True
1653
+ Whether to reassign segment IDs (indices) after removing ``segment``.
1654
+ verbose : bool, default True
1655
+ Whether to print detail of the removed segment.
1656
+
1657
+ Returns
1658
+ -------
1659
+ stable_whisper.result.WhisperResult
1660
+ The current instance after the changes.
1661
+ """
1662
+ if isinstance(segment, Segment):
1663
+ if self[segment.id] is not segment:
1664
+ self.reassign_ids()
1665
+ if self[segment.id] is not segment:
1666
+ raise ValueError('segment not in result')
1667
+ segment = segment.id
1668
+ if verbose:
1669
+ print(f'Removed: [id:{self[segment].id}] {self[segment].to_display_str(True)}')
1670
+ del self.segments[segment]
1671
+ if not reassign_ids:
1672
+ return self
1673
+ self.reassign_ids(True)
1674
+ return self
1675
+
1676
+ def remove_repetition(
1677
+ self,
1678
+ max_words: int = 1,
1679
+ case_sensitive: bool = False,
1680
+ strip: bool = True,
1681
+ ignore_punctuations: str = "\"',.?!",
1682
+ extend_duration: bool = True,
1683
+ verbose: bool = True
1684
+ ) -> 'WhisperResult':
1685
+ """
1686
+ Remove words that repeat consecutively.
1687
+
1688
+ Parameters
1689
+ ----------
1690
+ max_words : int
1691
+ Maximum number of words to look for consecutively.
1692
+ case_sensitive : bool, default False
1693
+ Whether the case of words needs to match for them to be considered a repetition.
1694
+ strip : bool, default True
1695
+ Whether to ignore spaces before and after each word.
1696
+ ignore_punctuations : str, default "\"',.?!"
1697
+ Ending punctuations to ignore.
1698
+ extend_duration: bool, default True
1699
+ Whether to extend the duration of the previous word to cover the duration of the repetition.
1700
+ verbose: bool, default True
1701
+ Whether to print detail of the removed repetitions.
1702
+
1703
+ Returns
1704
+ -------
1705
+ stable_whisper.result.WhisperResult
1706
+ The current instance after the changes.
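+
+ Examples
+ --------
+ A minimal sketch; checks for repetitions of up to 3 consecutive words:
+
+ >>> result.remove_repetition(max_words=3)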
1707
+ """
1708
+ if not self.has_words:
1709
+ return self
1710
+
1711
+ for count in range(1, max_words + 1):
1712
+ all_words = self.all_words()
1713
+ if len(all_words) < 2:
1714
+ return self
1715
+ all_words_str = [w.word for w in all_words]
1716
+ if strip:
1717
+ all_words_str = [w.strip() for w in all_words_str]
1718
+ if ignore_punctuations:
1719
+ ptn = f'[{ignore_punctuations}]+$'
1720
+ all_words_str = [re.sub(ptn, '', w) for w in all_words_str]
1721
+ if not case_sensitive:
1722
+ all_words_str = [w.lower() for w in all_words_str]
1723
+ next_i = None
1724
+ changes = []
1725
+ for i in reversed(range(count*2, len(all_words_str)+1)):
1726
+ if next_i is not None:
1727
+ if next_i != i:
1728
+ continue
1729
+ else:
1730
+ next_i = None
1731
+ s = i - count
1732
+ if all_words_str[s - count:s] != all_words_str[s:i]:
1733
+ continue
1734
+ next_i = s
1735
+ if extend_duration:
1736
+ all_words[s-1].end = all_words[i-1].end
1737
+ temp_changes = []
1738
+ for j in reversed(range(s, i)):
1739
+ if verbose:
1740
+ temp_changes.append(f'- {all_words[j].to_dict()}')
1741
+ self.remove_word(all_words[j], False, verbose=False)
1742
+ if temp_changes:
1743
+ changes.append(
1744
+ f'Remove: [{format_timestamp(all_words[s].start)} -> {format_timestamp(all_words[i-1].end)}] '
1745
+ + ''.join(_w.word for _w in all_words[s:i]) + '\n'
1746
+ + '\n'.join(reversed(temp_changes)) + '\n'
1747
+ )
1748
+ for i0, i1 in zip(range(s - count, s), range(s, i)):
1749
+ if len(all_words[i0].word) < len(all_words[i1].word):
1750
+ all_words[i1].start = all_words[i0].start
1751
+ all_words[i1].end = all_words[i0].end
1752
+ _sid, _wid = all_words[i0].segment_id, all_words[i0].id
1753
+ self.segments[_sid].words[_wid] = all_words[i1]
1754
+
1755
+ if changes:
1756
+ print('\n'.join(reversed(changes)))
1757
+
1758
+ self.remove_no_word_segments()
1759
+ self.update_all_segs_with_words()
1760
+
1761
+ return self
1762
+
1763
+ def remove_words_by_str(
1764
+ self,
1765
+ words: Union[str, List[str], None],
1766
+ case_sensitive: bool = False,
1767
+ strip: bool = True,
1768
+ ignore_punctuations: str = "\"',.?!",
1769
+ min_prob: float = None,
1770
+ filters: Callable = None,
1771
+ verbose: bool = True
1772
+ ) -> 'WhisperResult':
1773
+ """
1774
+ Remove words that match ``words``.
1775
+
1776
+ Parameters
1777
+ ----------
1778
+ words : str or list of str or None
1779
+ A word or list of words to remove. Use ``None`` for all words to be passed into ``filters``.
+ case_sensitive : bool, default False
+ Whether the case of words needs to match for them to be considered a match.
1782
+ strip : bool, default True
1783
+ Whether to ignore spaces before and after each word.
1784
+ ignore_punctuations : str, default "\"',.?!"
1785
+ Ending punctuations to ignore.
1786
+ min_prob : float, optional
1787
+ Acts as the first filter for the words that match ``words``. Words with probability < ``min_prob`` will
1788
+ be removed if ``filters`` is ``None``, else pass the words into ``filters``. Words without probability will
1789
+ be treated as having probability < ``min_prob``.
1790
+ filters : Callable, optional
1791
+ A function that takes an instance of :class:`stable_whisper.result.WordTiming` as its only argument.
1792
+ This function is custom filter for the words that match ``words`` and were not caught by ``min_prob``.
1793
+ verbose : bool, default True
1794
+ Whether to print detail of the removed words.
1795
+
1796
+ Returns
1797
+ -------
1798
+ stable_whisper.result.WhisperResult
1799
+ The current instance after the changes.
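+
+ Examples
+ --------
+ A minimal sketch; the filler words and probability threshold are arbitrary:
+
+ >>> result.remove_words_by_str(['uh', 'um'], min_prob=0.5)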
1800
+ """
1801
+ if not self.has_words:
1802
+ return self
1803
+ if isinstance(words, str):
1804
+ words = [words]
1805
+ all_words = self.all_words()
1806
+ all_words_str = [w.word for w in all_words]
1807
+ if strip:
1808
+ all_words_str = [w.strip() for w in all_words_str]
1809
+ words = [w.strip() for w in words]
1810
+ if ignore_punctuations:
1811
+ ptn = f'[{ignore_punctuations}]+$'
1812
+ all_words_str = [re.sub(ptn, '', w) for w in all_words_str]
1813
+ words = [re.sub(ptn, '', w) for w in words]
1814
+ if not case_sensitive:
1815
+ all_words_str = [w.lower() for w in all_words_str]
1816
+ words = [w.lower() for w in words]
1817
+
1818
+ changes = []
1819
+ for i, w in reversed(list(enumerate(all_words_str))):
1820
+ if not (words is None or any(w == _w for _w in words)):
1821
+ continue
1822
+ if (
1823
+ (min_prob is None or all_words[i].probability is None or min_prob > all_words[i].probability) and
1824
+ (filters is None or filters(all_words[i]))
1825
+ ):
1826
+ if verbose:
1827
+ changes.append(f'Removed: {all_words[i].to_dict()}')
1828
+ self.remove_word(all_words[i], False, verbose=False)
1829
+ if changes:
1830
+ print('\n'.join(reversed(changes)))
1831
+ self.remove_no_word_segments()
1832
+ self.update_all_segs_with_words()
1833
+
1834
+ return self
1835
+
1836
+ def fill_in_gaps(
1837
+ self,
1838
+ other_result: Union['WhisperResult', str],
1839
+ min_gap: float = 0.1,
1840
+ case_sensitive: bool = False,
1841
+ strip: bool = True,
1842
+ ignore_punctuations: str = "\"',.?!",
1843
+ verbose: bool = True
1844
+ ) -> 'WhisperResult':
1845
+ """
1846
+ Fill in segment gaps larger than ``min_gap`` with content from ``other_result`` at the times of gaps.
1847
+
1848
+ Parameters
1849
+ ----------
1850
+ other_result : WhisperResult or str
1851
+ Another transcription result as an instance of :class:`stable_whisper.result.WhisperResult` or path to the
1852
+ JSON of the result.
1853
+ min_gap : float, default 0.1
1854
+ The minimum seconds of a gap between segments that must be exceeded to be filled in.
1855
+ case_sensitive : bool, default False
1856
+ Whether to consider the case of the first and last word of the gap to determine overlapping words to remove
1857
+ before filling in.
1858
+ strip : bool, default True
1859
+ Whether to ignore spaces before and after the first and last word of the gap to determine overlapping words
1860
+ to remove before filling in.
1861
+ ignore_punctuations : str, default "\"',.?!"
1862
+ Ending punctuations to ignore in the first and last word of the gap to determine overlapping words to
1863
+ remove before filling in.
1864
+ verbose : bool, default True
1865
+ Whether to print detail of the filled content.
1866
+
1867
+ Returns
1868
+ -------
1869
+ stable_whisper.result.WhisperResult
1870
+ The current instance after the changes.
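+
+ Examples
+ --------
+ A minimal sketch; ``'other_result.json'`` is a placeholder path to another saved result:
+
+ >>> result.fill_in_gaps('other_result.json', min_gap=0.5)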
1871
+ """
1872
+ if len(self.segments) < 2:
1873
+ return self
1874
+ if isinstance(other_result, str):
1875
+ other_result = WhisperResult(other_result)
1876
+
1877
+ if strip:
1878
+ def strip_space(w):
1879
+ return w.strip()
1880
+ else:
1881
+ def strip_space(w):
1882
+ return w
1883
+
1884
+ if ignore_punctuations:
1885
+ ptn = f'[{ignore_punctuations}]+$'
1886
+
1887
+ def strip_punctuations(w):
1888
+ return re.sub(ptn, '', strip_space(w))
1889
+ else:
1890
+ strip_punctuations = strip_space
1891
+
1892
+ if case_sensitive:
1893
+ strip = strip_punctuations
1894
+ else:
1895
+ def strip(w):
1896
+ return strip_punctuations(w).lower()
1897
+
1898
+ seg_pairs = list(enumerate(zip(self.segments[:-1], self.segments[1:])))
1899
+ seg_pairs.insert(0, (-1, (None, self.segments[0])))
1900
+ seg_pairs.append((seg_pairs[-1][0]+1, (self.segments[-1], None)))
1901
+
1902
+ changes = []
1903
+ for i, (seg0, seg1) in reversed(seg_pairs):
1904
+ first_word = None if seg0 is None else seg0.words[-1]
1905
+ last_word = None if seg1 is None else seg1.words[0]
1906
+ start = (other_result[0].start if first_word is None else first_word.end)
1907
+ end = other_result[-1].end if last_word is None else last_word.start
1908
+ if end - start <= min_gap:
1909
+ continue
1910
+ gap_words = other_result.get_content_by_time((start, end))
1911
+ if first_word is not None and gap_words and strip(first_word.word) == strip(gap_words[0].word):
1912
+ first_word.end = gap_words[0].end
1913
+ gap_words = gap_words[1:]
1914
+ if last_word is not None and gap_words and strip(last_word.word) == strip(gap_words[-1].word):
1915
+ last_word.start = gap_words[-1].start
1916
+ gap_words = gap_words[:-1]
1917
+ if not gap_words:
1918
+ continue
1919
+ if last_word is not None and last_word.start < gap_words[-1].end:
1920
+ last_word.start = gap_words[-1].end
1921
+ new_segments = [other_result[gap_words[0].segment_id].copy([])]
1922
+ for j, new_word in enumerate(gap_words):
1923
+ new_word = deepcopy(new_word)
1924
+ if j == 0 and first_word is not None and first_word.end > gap_words[0].start:
1925
+ new_word.start = first_word.end
1926
+ if new_segments[-1].id != new_word.segment_id:
1927
+ new_segments.append(other_result[new_word.segment_id].copy([]))
1928
+ new_segments[-1].words.append(new_word)
1929
+ if verbose:
1930
+ changes.append('\n'.join('Added: ' + s.to_display_str(True) for s in new_segments))
1931
+ self.segments = self.segments[:i+1] + new_segments + self.segments[i+1:]
1932
+ if changes:
1933
+ print('\n'.join(reversed(changes)))
1934
+ self.reassign_ids()
1935
+ self.update_all_segs_with_words()
1936
+
1937
+ return self
1938
+
1939
+ def regroup(
1940
+ self,
1941
+ regroup_algo: Union[str, bool] = None,
1942
+ verbose: bool = False,
1943
+ only_show: bool = False
1944
+ ) -> "WhisperResult":
1945
+ """
1946
+ Regroup (in-place) words into segments.
1947
+
1948
+ Parameters
1949
+ ----------
1950
+ regroup_algo: str or bool, default 'da'
1951
+ String representation of a custom regrouping algorithm or ``True`` to use the default algorithm, 'da'.
1952
+ verbose : bool, default False
1953
+ Whether to show all the methods and arguments parsed from ``regroup_algo``.
1954
+ only_show : bool, default False
1955
+ Whether to show all the methods and arguments parsed from ``regroup_algo`` without running the methods.
1956
+
1957
+ Returns
1958
+ -------
1959
+ stable_whisper.result.WhisperResult
1960
+ The current instance after the changes.
1961
+
1962
+ Notes
1963
+ -----
1964
+ Syntax for string representation of custom regrouping algorithm.
1965
+ Method keys:
1966
+ sg: split_by_gap
1967
+ sp: split_by_punctuation
1968
+ sl: split_by_length
1969
+ sd: split_by_duration
1970
+ mg: merge_by_gap
1971
+ mp: merge_by_punctuation
1972
+ ms: merge_all_segments
1973
+ cm: clamp_max
1974
+ l: lock
1975
+ us: unlock_all_segments
1976
+ da: default algorithm (cm_sp=.* /。/?/?/,* /,_sg=.5_mg=.3+3_sp=.* /。/?/?)
1977
+ rw: remove_word
1978
+ rs: remove_segment
1979
+ rp: remove_repetition
1980
+ rws: remove_words_by_str
1981
+ fg: fill_in_gaps
1982
+ Metacharacters:
1983
+ = separates a method key and its arguments (not used if no argument)
1984
+ _ separates method keys (after arguments if there are any)
1985
+ + separates arguments for a method key
1986
+ / separates an argument into list of strings
1987
+ * separates an item in list of strings into a nested list of strings
1988
+ Notes:
1989
+ -arguments are parsed positionally
1990
+ -if no argument is provided, the default ones will be used
1991
+ -use 1 or 0 to represent True or False
1992
+ Example 1:
1993
+ merge_by_gap(.2, 10, lock=True)
1994
+ mg=.2+10+++1
1995
+ Note: [lock] is the 5th argument, hence the two empty arguments in between are represented by the three +'s before the 1
1996
+ Example 2:
1997
+ split_by_punctuation([('.', ' '), '。', '?', '?'], True)
1998
+ sp=.* /。/?/?+1
1999
+ Example 3:
2000
+ merge_all_segments().split_by_gap(.5).merge_by_gap(.15, 3)
2001
+ ms_sg=.5_mg=.15+3
2002
+ """
2003
+ if regroup_algo is False:
2004
+ return self
2005
+ if regroup_algo is None or regroup_algo is True:
2006
+ regroup_algo = 'da'
2007
+
2008
+ for method, kwargs, msg in self.parse_regroup_algo(regroup_algo, include_str=verbose or only_show):
2009
+ if msg:
2010
+ print(msg)
2011
+ if not only_show:
2012
+ method(**kwargs)
2013
+
2014
+ return self
2015
+
2016
+ def parse_regroup_algo(self, regroup_algo: str, include_str: bool = True) -> List[Tuple[Callable, dict, str]]:
2017
+ methods = dict(
2018
+ sg=self.split_by_gap,
2019
+ sp=self.split_by_punctuation,
2020
+ sl=self.split_by_length,
2021
+ sd=self.split_by_duration,
2022
+ mg=self.merge_by_gap,
2023
+ mp=self.merge_by_punctuation,
2024
+ ms=self.merge_all_segments,
2025
+ cm=self.clamp_max,
2026
+ us=self.unlock_all_segments,
2027
+ l=self.lock,
2028
+ rw=self.remove_word,
2029
+ rs=self.remove_segment,
2030
+ rp=self.remove_repetition,
2031
+ rws=self.remove_words_by_str,
2032
+ fg=self.fill_in_gaps,
2033
+ )
2034
+ if not regroup_algo:
2035
+ return []
2036
+
2037
+ calls = regroup_algo.split('_')
2038
+ if 'da' in calls:
2039
+ default_calls = 'cm_sp=.* /。/?/?/,* /,_sg=.5_mg=.3+3_sp=.* /。/?/?'.split('_')
2040
+ calls = chain.from_iterable(default_calls if method == 'da' else [method] for method in calls)
2041
+ operations = []
2042
+ for method in calls:
2043
+ method, args = method.split('=', maxsplit=1) if '=' in method else (method, '')
2044
+ if method not in methods:
2045
+ raise NotImplementedError(f'{method} is not one of the available methods: {tuple(methods.keys())}')
2046
+ args = [] if len(args) == 0 else list(map(str_to_valid_type, args.split('+')))
2047
+ kwargs = {k: v for k, v in zip(methods[method].__code__.co_varnames[1:], args) if v is not None}
2048
+ if include_str:
2049
+ kwargs_str = ', '.join(f'{k}="{v}"' if isinstance(v, str) else f'{k}={v}' for k, v in kwargs.items())
2050
+ op_str = f'{methods[method].__name__}({kwargs_str})'
2051
+ else:
2052
+ op_str = None
2053
+ operations.append((methods[method], kwargs, op_str))
2054
+
2055
+ return operations
2056
+
2057
+ def find(self, pattern: str, word_level=True, flags=None) -> "WhisperResultMatches":
2058
+ """
2059
+ Find segments/words and timestamps with regular expression.
2060
+
2061
+ Parameters
2062
+ ----------
2063
+ pattern : str
2064
+ RegEx pattern to search for.
2065
+ word_level : bool, default True
2066
+ Whether to search at word-level.
2067
+ flags : optional
2068
+ RegEx flags.
2069
+
2070
+ Returns
2071
+ -------
2072
+ stable_whisper.result.WhisperResultMatches
2073
+ An instance of :class:`stable_whisper.result.WhisperResultMatches` with word/segment that match ``pattern``.
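+
+ Examples
+ --------
+ A minimal sketch with an arbitrary pattern; ``result`` is a hypothetical instance of this class:
+
+ >>> matches = result.find(r'\bhello\b')
+ >>> [(m.text_match, m.start, m.end) for m in matches.matches]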
2074
+ """
2075
+ return WhisperResultMatches(self).find(pattern, word_level=word_level, flags=flags)
2076
+
2077
+ @property
2078
+ def text(self):
2079
+ return ''.join(s.text for s in self.segments)
2080
+
2081
+ @property
2082
+ def regroup_history(self):
2083
+ # same syntax as ``regroup_algo`` for :meth:`stable_whisper.result.WhisperResult.regroup`
2084
+ return self._regroup_history
2085
+
2086
+ @property
2087
+ def nonspeech_sections(self):
2088
+ return self._nonspeech_sections
2089
+
2090
+ def show_regroup_history(self):
2091
+ """
2092
+ Print details of all regrouping operations that have been performed on the data.
2093
+ """
2094
+ if not self._regroup_history:
2095
+ print('Result has no history.')
2096
+ for *_, msg in self.parse_regroup_algo(self._regroup_history):
2097
+ print(f'.{msg}')
2098
+
2099
+ def __len__(self):
2100
+ return len(self.segments)
2101
+
2102
+ def unlock_all_segments(self):
2103
+ for s in self.segments:
2104
+ s.unlock_all_words()
2105
+ return self
2106
+
2107
+ def reset(self):
2108
+ """
2109
+ Restore all values to those at initialization.
2110
+ """
2111
+ self.language = self.ori_dict.get('language')
2112
+ self._regroup_history = ''
2113
+ segments = self.ori_dict.get('segments')
2114
+ self.segments: List[Segment] = [Segment(**s) for s in segments] if segments else []
2115
+ if self._forced_order:
2116
+ self.force_order()
2117
+ self.remove_no_word_segments(any(seg.has_words for seg in self.segments))
2118
+ self.update_all_segs_with_words()
2119
+
2120
+ @property
2121
+ def has_words(self):
2122
+ return all(seg.has_words for seg in self.segments)
2123
+
2124
+ to_srt_vtt = result_to_srt_vtt
2125
+ to_ass = result_to_ass
2126
+ to_tsv = result_to_tsv
2127
+ to_txt = result_to_txt
2128
+ save_as_json = save_as_json
2129
+
2130
+
2131
+ class SegmentMatch:
2132
+
2133
+ def __init__(
2134
+ self,
2135
+ segments: Union[List[Segment], Segment],
2136
+ _word_indices: List[List[int]] = None,
2137
+ _text_match: str = None
2138
+ ):
2139
+ self.segments = [segments] if isinstance(segments, Segment) else segments
2140
+ self.word_indices = [] if _word_indices is None else _word_indices
2141
+ self.words = [self.segments[i].words[j] for i, indices in enumerate(self.word_indices) for j in indices]
2142
+ if len(self.words) != 0:
2143
+ self.text = ''.join(
2144
+ self.segments[i].words[j].word
2145
+ for i, indices in enumerate(self.word_indices)
2146
+ for j in indices
2147
+ )
2148
+ else:
2149
+ self.text = ''.join(seg.text for seg in self.segments)
2150
+ self.text_match = _text_match
2151
+
2152
+ @property
2153
+ def start(self):
2154
+ return (
2155
+ self.words[0].start
2156
+ if len(self.words) != 0 else
2157
+ (self.segments[0].start if len(self.segments) != 0 else None)
2158
+ )
2159
+
2160
+ @property
2161
+ def end(self):
2162
+ return (
2163
+ self.words[-1].end
2164
+ if len(self.words) != 0 else
2165
+ (self.segments[-1].end if len(self.segments) != 0 else None)
2166
+ )
2167
+
2168
+ def __len__(self):
2169
+ return len(self.segments)
2170
+
2171
+ def __repr__(self):
2172
+ return self.__dict__.__repr__()
2173
+
2174
+ def __str__(self):
2175
+ return self.__dict__.__str__()
2176
+
2177
+
2178
+ class WhisperResultMatches:
2179
+ """
2180
+ RegEx matches for WhisperResults.
2181
+ """
2182
+ # Use WhisperResult.find() instead of instantiating this class directly.
2183
+ def __init__(
2184
+ self,
2185
+ matches: Union[List[SegmentMatch], WhisperResult],
2186
+ _segment_indices: List[List[int]] = None
2187
+ ):
2188
+ if isinstance(matches, WhisperResult):
2189
+ self.matches = list(map(SegmentMatch, matches.segments))
2190
+ self._segment_indices = [[i] for i in range(len(matches.segments))]
2191
+ else:
2192
+ self.matches = matches
2193
+ assert _segment_indices is not None
2194
+ assert len(self.matches) == len(_segment_indices)
2195
+ assert all(len(match.segments) == len(_segment_indices[i]) for i, match in enumerate(self.matches))
2196
+ self._segment_indices = _segment_indices
2197
+
2198
+ @property
2199
+ def segment_indices(self):
2200
+ return self._segment_indices
2201
+
2202
+ def _curr_seg_groups(self) -> List[List[Tuple[int, Segment]]]:
2203
+ seg_groups, curr_segs = [], []
2204
+ curr_max = -1
2205
+ for seg_indices, match in zip(self._segment_indices, self.matches):
2206
+ for i, seg in zip(sorted(seg_indices), match.segments):
2207
+ if i > curr_max:
2208
+ curr_segs.append((i, seg))
2209
+ if i - 1 != curr_max:
2210
+ seg_groups.append(curr_segs)
2211
+ curr_segs = []
2212
+ curr_max = i
2213
+
2214
+ if curr_segs:
2215
+ seg_groups.append(curr_segs)
2216
+ return seg_groups
2217
+
2218
+ def find(self, pattern: str, word_level=True, flags=None) -> "WhisperResultMatches":
2219
+ """
2220
+ Find segments/words and timestamps with regular expression.
2221
+
2222
+ Parameters
2223
+ ----------
2224
+ pattern : str
2225
+ RegEx pattern to search for.
2226
+ word_level : bool, default True
2227
+ Whether to search at word-level.
2228
+ flags : optional
2229
+ RegEx flags.
2230
+
2231
+ Returns
2232
+ -------
2233
+ stable_whisper.result.WhisperResultMatches
2234
+ An instance of :class:`stable_whisper.result.WhisperResultMatches` with the words/segments that match ``pattern``.
2235
+ """
2236
+
2237
+ seg_groups = self._curr_seg_groups()
2238
+ matches: List[SegmentMatch] = []
2239
+ match_seg_indices: List[List[int]] = []
2240
+ if word_level:
2241
+ if not all(all(seg.has_words for seg in match.segments) for match in self.matches):
2242
+ warnings.warn('Cannot perform word-level search with segment(s) missing word timestamps.')
2243
+ word_level = False
2244
+
2245
+ for segs in seg_groups:
2246
+ if word_level:
2247
+ idxs = list(chain.from_iterable(
2248
+ [(i, j)]*len(word.word) for (i, seg) in segs for j, word in enumerate(seg.words)
2249
+ ))
2250
+ text = ''.join(word.word for (_, seg) in segs for word in seg.words)
2251
+ else:
2252
+ idxs = list(chain.from_iterable([(i, None)]*len(seg.text) for (i, seg) in segs))
2253
+ text = ''.join(seg.text for (_, seg) in segs)
2254
+ assert len(idxs) == len(text)
2255
+ for curr_match in re.finditer(pattern, text, flags=flags or 0):
2256
+ start, end = curr_match.span()
2257
+ curr_idxs = idxs[start: end]
2258
+ curr_seg_idxs = sorted(set(i[0] for i in curr_idxs))
2259
+ if word_level:
2260
+ curr_word_idxs = [
2261
+ sorted(set(j for i, j in curr_idxs if i == seg_idx))
2262
+ for seg_idx in curr_seg_idxs
2263
+ ]
2264
+ else:
2265
+ curr_word_idxs = None
2266
+ matches.append(SegmentMatch(
2267
+ segments=[s for i, s in segs if i in curr_seg_idxs],
2268
+ _word_indices=curr_word_idxs,
2269
+ _text_match=curr_match.group()
2270
+ ))
2271
+ match_seg_indices.append(curr_seg_idxs)
2272
+ return WhisperResultMatches(matches, match_seg_indices)
2273
+
2274
+ def __len__(self):
2275
+ return len(self.matches)
2276
+
2277
+ def __bool__(self):
2278
+ return self.__len__() != 0
2279
+
2280
+ def __getitem__(self, idx):
2281
+ return self.matches[idx]
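A brief usage sketch for the search API above (illustrative, not part of this commit; the audio path is hypothetical):

    import re
    import stable_whisper

    model = stable_whisper.load_model('base')
    result = model.transcribe('audio.mp3')
    matches = result.find(r'\bwhisper\b', flags=re.IGNORECASE)
    for match in matches:
        print(match.text_match, match.start, match.end)
    # matches are chainable, e.g. matches.find(r'\w+$') searches within the matched segments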
stable_whisper/stabilization.py ADDED
@@ -0,0 +1,424 @@
1
+ import warnings
2
+ from typing import List, Union, Tuple, Optional
3
+ from itertools import chain
4
+
5
+ import torch
6
+ import torch.nn.functional as F
7
+ import numpy as np
8
+
9
+ from whisper.audio import TOKENS_PER_SECOND, SAMPLE_RATE, N_SAMPLES_PER_TOKEN
10
+
11
+
12
+ NONVAD_SAMPLE_RATES = (16000,)
13
+ VAD_SAMPLE_RATES = (16000, 8000)
14
+
15
+
16
+ def is_ascending_sequence(
17
+ seq: List[Union[int, float]],
18
+ verbose=True
19
+ ) -> bool:
20
+ """
21
+ check if a sequence of numbers are in ascending order
22
+ """
23
+ is_ascending = True
24
+ for idx, (i, j) in enumerate(zip(seq[:-1], seq[1:])):
25
+ if i > j:
26
+ is_ascending = False
27
+ if verbose:
28
+ print(f'[Index{idx}]:{i} > [Index{idx + 1}]:{j}')
29
+ else:
30
+ break
31
+
32
+ return is_ascending
33
+
34
+
35
+ def valid_ts(
36
+ ts: List[dict],
37
+ warn=True
38
+ ) -> bool:
39
+ valid = is_ascending_sequence(list(chain.from_iterable([s['start'], s['end']] for s in ts)), False)
40
+ if warn and not valid:
41
+ warnings.warn(message='Found timestamp(s) jumping backwards in time. '
42
+ 'Use word_timestamps=True to avoid the issue.')
43
+ return valid
44
+
45
+
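For illustration (not part of the commit), the two checks behave as follows:

    assert is_ascending_sequence([0.0, 0.5, 1.0])
    # 1.0 -> 0.9 jumps backwards, so the flattened start/end pairs are not ascending
    assert not valid_ts([{'start': 0.0, 'end': 1.0}, {'start': 0.9, 'end': 1.5}], warn=False)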
46
+ def mask2timing(
47
+ silence_mask: (np.ndarray, torch.Tensor),
48
+ time_offset: float = 0.0,
49
+ ) -> (Tuple[np.ndarray, np.ndarray], None):
50
+ if silence_mask is None or not silence_mask.any():
51
+ return
52
+ assert silence_mask.ndim == 1
53
+ if isinstance(silence_mask, torch.Tensor):
54
+ silences = silence_mask.cpu().numpy().copy()
55
+ elif isinstance(silence_mask, np.ndarray):
56
+ silences = silence_mask.copy()
57
+ else:
58
+ raise NotImplementedError(f'Expected torch.Tensor or numpy.ndarray, but got {type(silence_mask)}')
59
+ silences[0] = False
60
+ silences[-1] = False
61
+ silent_starts = np.logical_and(~silences[:-1], silences[1:]).nonzero()[0] / TOKENS_PER_SECOND
62
+ silent_ends = (np.logical_and(silences[:-1], ~silences[1:]).nonzero()[0] + 1) / TOKENS_PER_SECOND
63
+ if time_offset:
64
+ silent_starts += time_offset
65
+ silent_ends += time_offset
66
+ return silent_starts, silent_ends
67
+
68
+
69
+ def timing2mask(
70
+ silent_starts: np.ndarray,
71
+ silent_ends: np.ndarray,
72
+ size: int,
73
+ time_offset: float = None
74
+ ) -> torch.Tensor:
75
+ assert len(silent_starts) == len(silent_ends)
76
+ ts_token_mask = torch.zeros(size, dtype=torch.bool)
77
+ if time_offset:
78
+ silent_starts = (silent_starts - time_offset).clip(min=0)
79
+ silent_ends = (silent_ends - time_offset).clip(min=0)
80
+ mask_i = (silent_starts * TOKENS_PER_SECOND).round().astype(np.int16)
81
+ mask_e = (silent_ends * TOKENS_PER_SECOND).round().astype(np.int16)
82
+ for mi, me in zip(mask_i, mask_e):
83
+ ts_token_mask[mi:me+1] = True
84
+
85
+ return ts_token_mask
86
+
87
+
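A rough round-trip sketch of the two converters above (illustrative; with Whisper's TOKENS_PER_SECOND of 50, each mask entry covers 20 ms):

    import torch

    mask = torch.zeros(200, dtype=torch.bool)
    mask[50:100] = True  # silence spanning roughly 1.0s to 2.0s
    starts, ends = mask2timing(mask)  # starts ~ [0.98], ends ~ [2.0]
    rebuilt = timing2mask(starts, ends, size=200)  # boolean mask marking the same span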
88
+ def suppress_silence(
89
+ result_obj,
90
+ silent_starts: Union[np.ndarray, List[float]],
91
+ silent_ends: Union[np.ndarray, List[float]],
92
+ min_word_dur: float,
93
+ nonspeech_error: float = 0.3,
94
+ keep_end: Optional[bool] = True
95
+ ):
96
+ assert len(silent_starts) == len(silent_ends)
97
+ if len(silent_starts) == 0 or (result_obj.end - result_obj.start) <= min_word_dur:
98
+ return
99
+ if isinstance(silent_starts, list):
100
+ silent_starts = np.array(silent_starts)
101
+ if isinstance(silent_ends, list):
102
+ silent_ends = np.array(silent_ends)
103
+
104
+ start_overlaps = np.all(
105
+ (silent_starts <= result_obj.start, result_obj.start < silent_ends, silent_ends <= result_obj.end),
106
+ axis=0
107
+ ).nonzero()[0].tolist()
108
+ if start_overlaps:
109
+ new_start = silent_ends[start_overlaps[0]]
110
+ result_obj.start = min(new_start, round(result_obj.end - min_word_dur, 3))
111
+ if (result_obj.end - result_obj.start) <= min_word_dur:
112
+ return
113
+
114
+ end_overlaps = np.all(
115
+ (result_obj.start <= silent_starts, silent_starts < result_obj.end, result_obj.end <= silent_ends),
116
+ axis=0
117
+ ).nonzero()[0].tolist()
118
+ if end_overlaps:
119
+ new_end = silent_starts[end_overlaps[0]]
120
+ result_obj.end = max(new_end, round(result_obj.start + min_word_dur, 3))
121
+ if (result_obj.end - result_obj.start) <= min_word_dur:
122
+ return
123
+
124
+ if nonspeech_error:
125
+ matches = np.logical_and(
126
+ result_obj.start <= silent_starts,
127
+ result_obj.end >= silent_ends,
128
+ ).nonzero()[0].tolist()
129
+ if len(matches) == 0:
130
+ return
131
+ silence_start = np.min(silent_starts[matches])
132
+ silence_end = np.max(silent_ends[matches])
133
+ start_extra = silence_start - result_obj.start
134
+ end_extra = result_obj.end - silence_end
135
+ silent_duration = silence_end - silence_start
136
+ start_within_error = (start_extra / silent_duration) <= nonspeech_error
137
+ end_within_error = (end_extra / silent_duration) <= nonspeech_error
138
+ if keep_end is None:
139
+ keep_end = start_extra <= end_extra
140
+ within_error = start_within_error if keep_end else end_within_error
141
+ else:
142
+ within_error = start_within_error or end_within_error
143
+
144
+ if within_error:
145
+ if keep_end:
146
+ result_obj.start = min(silence_end, round(result_obj.end - min_word_dur, 3))
147
+ else:
148
+ result_obj.end = max(silence_start, round(result_obj.start + min_word_dur, 3))
149
+
150
+
151
+ def standardize_audio(
152
+ audio: Union[torch.Tensor, np.ndarray, str, bytes],
153
+ resample_sr: Tuple[Optional[int], Union[int, Tuple[int]]] = None
154
+ ) -> torch.Tensor:
155
+ if isinstance(audio, (str, bytes)):
156
+ from .audio import load_audio
157
+ audio = load_audio(audio)
158
+ if isinstance(audio, np.ndarray):
159
+ audio = torch.from_numpy(audio)
160
+ audio = audio.float()
161
+ if resample_sr:
162
+ in_sr, out_sr = resample_sr
163
+ if in_sr:
164
+ if isinstance(out_sr, int):
165
+ out_sr = [out_sr]
166
+ if in_sr not in out_sr:
167
+ from torchaudio.functional import resample
168
+ audio = resample(audio, in_sr, out_sr[0])
169
+
170
+ return audio
171
+
172
+
173
+ def audio2loudness(
174
+ audio_tensor: torch.Tensor
175
+ ) -> (torch.Tensor, None):
176
+ assert audio_tensor.dim() == 1, f'waveform must be 1D, but got {audio_tensor.dim()}D'
177
+ audio_tensor = audio_tensor.abs()
178
+ k = int(audio_tensor.numel() * 0.001)
179
+ if k:
180
+ top_values, _ = torch.topk(audio_tensor, k)
181
+ threshold = top_values[-1]
182
+ else:
183
+ threshold = audio_tensor.quantile(0.999, dim=-1)
184
+ if (token_count := round(audio_tensor.shape[-1] / N_SAMPLES_PER_TOKEN)+1) > 2:
185
+ if threshold < 1e-5:
186
+ return torch.zeros(token_count, dtype=audio_tensor.dtype, device=audio_tensor.device)
187
+ audio_tensor = audio_tensor / min(1., threshold * 1.75)
188
+ audio_tensor = F.interpolate(
189
+ audio_tensor[None, None],
190
+ size=token_count,
191
+ mode='linear',
192
+ align_corners=False
193
+ )[0, 0]
194
+ return audio_tensor
195
+
196
+
197
+ def visualize_mask(
198
+ loudness_tensor: torch.Tensor,
199
+ silence_mask: torch.Tensor = None,
200
+ width: int = 1500,
201
+ height: int = 200,
202
+ output: str = None,
203
+ ):
204
+ no_silence = silence_mask is None or not silence_mask.any()
205
+ assert no_silence or silence_mask.shape[0] == loudness_tensor.shape[0]
206
+ if loudness_tensor.shape[0] < 2:
207
+ raise NotImplementedError(f'audio size, {loudness_tensor.shape[0]}, is too short to visualize')
208
+ else:
209
+ width = loudness_tensor.shape[0] if width == -1 else width
210
+ im = torch.zeros((height, width, 3), dtype=torch.uint8)
211
+ mid = round(height / 2)
212
+ for i, j in enumerate(loudness_tensor.tolist()):
213
+ j = round(abs(j) * mid)
214
+ if j == 0 or width <= i:
215
+ continue
216
+ im[mid - j:mid + 1, i] = 255
217
+ im[mid + 1:mid + j + 1, i] = 255
218
+ if not no_silence:
219
+ im[:, silence_mask[:width], 1:] = 0
220
+ im = im.cpu().numpy()
221
+ if output and not output.endswith('.png'):
222
+ output += '.png'
223
+ try:
224
+ from PIL import Image
225
+ except ModuleNotFoundError:
226
+ try:
227
+ import cv2
228
+ except ModuleNotFoundError:
229
+ raise ModuleNotFoundError('Failed to import "PIL" or "cv2" to visualize suppression mask. '
230
+ 'Try "pip install Pillow" or "pip install opencv-python"')
231
+ else:
232
+ im = im[..., [2, 1, 0]]
233
+ if isinstance(output, str):
234
+ cv2.imwrite(output, im)
235
+ else:
236
+ cv2.imshow('image', im)
237
+ cv2.waitKey(0)
238
+ else:
239
+ im = Image.fromarray(im)
240
+ if isinstance(output, str):
241
+ im.save(output)
242
+ else:
243
+ im.show()
244
+ if output:
245
+ print(f'Saved: {output}')
246
+
247
+
248
+ def wav2mask(
249
+ audio: (torch.Tensor, np.ndarray, str, bytes),
250
+ q_levels: int = 20,
251
+ k_size: int = 5,
252
+ sr: int = None
253
+ ) -> (Tuple[torch.Tensor, Tuple[np.ndarray, np.ndarray]], None):
254
+ """
255
+ Generate 1D mask from waveform for suppressing timestamp tokens.
256
+ """
257
+ audio = standardize_audio(audio, (sr, NONVAD_SAMPLE_RATES))
258
+ loudness_tensor = audio2loudness(audio)
259
+ if loudness_tensor is None:
260
+ return
261
+ p = k_size // 2 if k_size else 0
262
+ if p and p < loudness_tensor.shape[-1]:
263
+ assert k_size % 2, f'kernel_size must be odd but got {k_size}'
264
+ mask = torch.avg_pool1d(
265
+ F.pad(
266
+ loudness_tensor[None],
267
+ (p, p),
268
+ 'reflect'
269
+ ),
270
+ kernel_size=k_size,
271
+ stride=1
272
+ )[0]
273
+ else:
274
+ mask = loudness_tensor.clone()
275
+
276
+ if q_levels:
277
+ mask = mask.mul(q_levels).round()
278
+
279
+ mask = mask.bool()
280
+
281
+ if not mask.any(): # entirely silent
282
+ return ~mask
283
+ temp_timings = mask2timing(mask)
284
+ s, e = temp_timings
285
+ se_mask = (e - s) > 0.1
286
+ s = s[se_mask]
287
+ e = e[se_mask]
288
+ mask = ~timing2mask(s, e, loudness_tensor.shape[-1])
289
+
290
+ if not mask.any(): # no silence
291
+ return
292
+
293
+ return mask
294
+
295
+
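A hedged usage sketch tying `wav2mask` back to `mask2timing` (the audio path is hypothetical):

    mask = wav2mask('audio.mp3', q_levels=20, k_size=5)
    if mask is not None:
        timings = mask2timing(mask)
        if timings is not None:
            for s, e in zip(*timings):
                print(f'silence: {s:.2f}s -> {e:.2f}s')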
296
+ _model_cache = {}
297
+
298
+
299
+ def get_vad_silence_func(
300
+ onnx=False,
301
+ verbose: (bool, None) = False
302
+ ):
303
+ if onnx in _model_cache:
304
+ model, get_ts = _model_cache[onnx]
305
+ else:
306
+ model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad:master',
307
+ model='silero_vad',
308
+ verbose=verbose,
309
+ onnx=onnx,
310
+ trust_repo=True)
311
+ get_ts = utils[0]
312
+ _model_cache[onnx] = (model, get_ts)
313
+
314
+ warnings.filterwarnings('ignore', message=r'operator \(\) profile_node.*', category=UserWarning)
315
+
316
+ def get_speech_timestamps(wav: torch.Tensor, threshold: float = .35):
317
+ return get_ts(wav, model, threshold, min_speech_duration_ms=100, min_silence_duration_ms=20)
318
+
319
+ def vad_silence_timing(
320
+ audio: (torch.Tensor, np.ndarray, str, bytes),
321
+ speech_threshold: float = .35,
322
+ sr: int = None
323
+ ) -> (Tuple[np.ndarray, np.ndarray], None):
324
+
325
+ audio = standardize_audio(audio, (sr, VAD_SAMPLE_RATES))
326
+
327
+ total_duration = round(audio.shape[-1] / SAMPLE_RATE, 3)
328
+ if not total_duration:
329
+ return
330
+ ori_t = torch.get_num_threads()
331
+ if verbose is not None:
332
+ print('Predicting silence(s) with VAD...\r', end='')
333
+ torch.set_num_threads(1)  # the VAD model is optimized for single-threaded performance
334
+ speech_ts = get_speech_timestamps(audio, speech_threshold)
335
+ if verbose is not None:
336
+ print('Predicted silence(s) with VAD. ')
337
+ torch.set_num_threads(ori_t)
338
+ if len(speech_ts) == 0: # all silent
339
+ return np.array([0.0]), np.array([total_duration])
340
+ silent_starts = []
341
+ silent_ends = []
342
+ for ts in speech_ts:
343
+ start = round(ts['start'] / SAMPLE_RATE, 3)
344
+ end = round(ts['end'] / SAMPLE_RATE, 3)
345
+ if start != 0:
346
+ silent_ends.append(start)
347
+ if len(silent_starts) == 0:
348
+ silent_starts.append(0.0)
349
+ if end < total_duration:
350
+ silent_starts.append(end)
351
+
352
+ if len(silent_starts) == 0 and len(silent_ends) == 0:
353
+ return
354
+
355
+ if len(silent_starts) != 0 and (len(silent_ends) == 0 or silent_ends[-1] < silent_starts[-1]):
356
+ silent_ends.append(total_duration)
357
+
358
+ silent_starts = np.array(silent_starts)
359
+ silent_ends = np.array(silent_ends)
360
+
361
+ return silent_starts, silent_ends
362
+
363
+ return vad_silence_timing
364
+
365
+
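A usage sketch for the factory above (illustrative; the first call downloads the Silero VAD model through torch.hub):

    vad_timing = get_vad_silence_func(onnx=False, verbose=None)
    timings = vad_timing('audio.mp3', speech_threshold=0.35)
    if timings is not None:
        silent_starts, silent_ends = timings  # numpy arrays of seconds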
366
+ def visualize_suppression(
367
+ audio: Union[torch.Tensor, np.ndarray, str, bytes],
368
+ output: str = None,
369
+ q_levels: int = 20,
370
+ k_size: int = 5,
371
+ vad_threshold: float = 0.35,
372
+ vad: bool = False,
373
+ max_width: int = 1500,
374
+ height: int = 200
375
+ ):
376
+ """
377
+ Visualize regions on the waveform of ``audio`` detected as silent.
378
+
379
+ Regions on the waveform colored red are detected as silent.
380
+
381
+ Parameters
382
+ ----------
383
+ audio : str or numpy.ndarray or torch.Tensor or bytes
384
+ Path/URL to the audio file, the audio waveform, or bytes of audio file.
385
+ If audio is ``numpy.ndarray`` or ``torch.Tensor``, the audio must already be sampled at 16 kHz.
386
+ output : str, default None, meaning image will be shown directly via Pillow or opencv-python
387
+ Path to save visualization.
388
+ q_levels : int, default 20
389
+ Quantization levels for generating timestamp suppression mask; ignored if ``vad = True``.
390
+ Acts as a threshold for marking sound as silent.
391
+ Fewer levels raise the volume threshold at which sound is marked as silent.
392
+ k_size : int, default 5
393
+ Kernel size for avg-pooling waveform to generate timestamp suppression mask; ignored if ``vad = True``.
394
+ Recommend 5 or 3; higher sizes will reduce detection of silence.
395
+ vad : bool, default False
396
+ Whether to use Silero VAD to generate timestamp suppression mask.
397
+ Silero VAD requires PyTorch 1.12.0+. Official repo, https://github.com/snakers4/silero-vad.
398
+ vad_threshold : float, default 0.35
399
+ Threshold for detecting speech with Silero VAD. Low threshold reduces false positives for silence detection.
400
+ max_width : int, default 1500
401
+ Maximum width of visualization to avoid overly large image from long audio.
402
+ Each pixel is equivalent to 1 token. Use -1 to visualize the entire audio track.
403
+ height : int, default 200
404
+ Height of visualization.
405
+ """
406
+ max_n_samples = None if max_width == -1 else round(max_width * N_SAMPLES_PER_TOKEN)
407
+
408
+ audio = standardize_audio(audio)
409
+ if max_n_samples is None:
410
+ max_width = audio.shape[-1]
411
+ else:
412
+ audio = audio[:max_n_samples]
413
+ loudness_tensor = audio2loudness(audio)
414
+ width = min(max_width, loudness_tensor.shape[-1])
415
+ if loudness_tensor is None:
416
+ raise NotImplementedError(f'Audio is too short and cannot visualized.')
417
+
418
+ if vad:
419
+ silence_timings = get_vad_silence_func()(audio, vad_threshold)
420
+ silence_mask = None if silence_timings is None else timing2mask(*silence_timings, size=loudness_tensor.shape[0])
421
+ else:
422
+ silence_mask = wav2mask(audio, q_levels=q_levels, k_size=k_size)
423
+
424
+ visualize_mask(loudness_tensor, silence_mask, width=width, height=height, output=output)
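For example (paths hypothetical; displaying or saving the image requires Pillow or opencv-python, as handled in `visualize_mask`):

    # default mask from quantization and average-pooling
    visualize_suppression('audio.mp3', output='suppression.png', q_levels=20, k_size=5)
    # Silero VAD-based mask instead
    visualize_suppression('audio.mp3', output='suppression_vad.png', vad=True, vad_threshold=0.35)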
stable_whisper/text_output.py ADDED
@@ -0,0 +1,620 @@
1
+ import json
2
+ import os
3
+ import warnings
4
+ from typing import List, Tuple, Union, Callable
5
+ from itertools import chain
6
+ from .stabilization import valid_ts
7
+
8
+ __all__ = ['result_to_srt_vtt', 'result_to_ass', 'result_to_tsv', 'result_to_txt', 'save_as_json', 'load_result']
9
+ SUPPORTED_FORMATS = ('srt', 'vtt', 'ass', 'tsv', 'txt')
10
+
11
+
12
+ def _save_as_file(content: str, path: str):
13
+ with open(path, 'w', encoding='utf-8') as f:
14
+ f.write(content)
15
+ print(f'Saved: {os.path.abspath(path)}')
16
+
17
+
18
+ def _get_segments(result: (dict, list), min_dur: float, reverse_text: Union[bool, tuple] = False):
19
+ if isinstance(result, dict):
20
+ if reverse_text:
21
+ warnings.warn(f'[reverse_text]=True only applies to WhisperResult but result is {type(result)}')
22
+ return result.get('segments')
23
+ elif not isinstance(result, list) and callable(getattr(result, 'segments_to_dicts', None)):
24
+ return result.apply_min_dur(min_dur, inplace=False).segments_to_dicts(reverse_text=reverse_text)
25
+ return result
26
+
27
+
28
+ def finalize_text(text: str, strip: bool = True):
29
+ if not strip:
30
+ return text
31
+ return text.strip().replace('\n ', '\n')
32
+
33
+
34
+ def sec2hhmmss(seconds: (float, int)):
35
+ mm, ss = divmod(seconds, 60)
36
+ hh, mm = divmod(mm, 60)
37
+ return hh, mm, ss
38
+
39
+
40
+ def sec2milliseconds(seconds: (float, int)) -> int:
41
+ return round(seconds * 1000)
42
+
43
+
44
+ def sec2centiseconds(seconds: (float, int)) -> int:
45
+ return round(seconds * 100)
46
+
47
+
48
+ def sec2vtt(seconds: (float, int)) -> str:
49
+ hh, mm, ss = sec2hhmmss(seconds)
50
+ return f'{hh:0>2.0f}:{mm:0>2.0f}:{ss:0>6.3f}'
51
+
52
+
53
+ def sec2srt(seconds: (float, int)) -> str:
54
+ return sec2vtt(seconds).replace(".", ",")
55
+
56
+
57
+ def sec2ass(seconds: (float, int)) -> str:
58
+ hh, mm, ss = sec2hhmmss(seconds)
59
+ return f'{hh:0>1.0f}:{mm:0>2.0f}:{ss:0>5.2f}'  # zero-pad seconds to 2 digits for valid ASS timestamps
60
+
61
+
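Illustrative outputs of the converters above for 3725.5 seconds (1h 2m 5.5s):

    sec2vtt(3725.5)           # '01:02:05.500'
    sec2srt(3725.5)           # '01:02:05,500'
    sec2milliseconds(3725.5)  # 3725500
    sec2centiseconds(3725.5)  # 372550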
62
+ def segment2vttblock(segment: dict, strip=True) -> str:
63
+ return f'{sec2vtt(segment["start"])} --> {sec2vtt(segment["end"])}\n' \
64
+ f'{finalize_text(segment["text"], strip)}'
65
+
66
+
67
+ def segment2srtblock(segment: dict, idx: int, strip=True) -> str:
68
+ return f'{idx}\n{sec2srt(segment["start"])} --> {sec2srt(segment["end"])}\n' \
69
+ f'{finalize_text(segment["text"], strip)}'
70
+
71
+
72
+ def segment2assblock(segment: dict, idx: int, strip=True) -> str:
73
+ return f'Dialogue: {idx},{sec2ass(segment["start"])},{sec2ass(segment["end"])},Default,,0,0,0,,' \
74
+ f'{finalize_text(segment["text"], strip)}'
75
+
76
+
77
+ def segment2tsvblock(segment: dict, strip=True) -> str:
78
+ return f'{sec2milliseconds(segment["start"])}' \
79
+ f'\t{sec2milliseconds(segment["end"])}' \
80
+ f'\t{segment["text"].strip() if strip else segment["text"]}'
81
+
82
+
83
+ def words2segments(words: List[dict], tag: Tuple[str, str], reverse_text: bool = False) -> List[dict]:
84
+ def add_tag(idx: int):
85
+ return ''.join(
86
+ (
87
+ f" {tag[0]}{w['word'][1:]}{tag[1]}"
88
+ if w['word'].startswith(' ') else
89
+ f"{tag[0]}{w['word']}{tag[1]}"
90
+ )
91
+ if w['word'] not in ('', ' ') and idx_ == idx else
92
+ w['word']
93
+ for idx_, w in idx_filled_words
94
+ )
95
+
96
+ filled_words = []
97
+ for i, word in enumerate(words):
98
+ curr_end = round(word['end'], 3)
99
+ filled_words.append(dict(word=word['word'], start=round(word['start'], 3), end=curr_end))
100
+ if word != words[-1]:
101
+ next_start = round(words[i + 1]['start'], 3)
102
+ if next_start - curr_end != 0:
103
+ filled_words.append(dict(word='', start=curr_end, end=next_start))
104
+ idx_filled_words = list(enumerate(filled_words))
105
+ if reverse_text:
106
+ idx_filled_words = list(reversed(idx_filled_words))
107
+
108
+ segments = [dict(text=add_tag(i), start=filled_words[i]['start'], end=filled_words[i]['end'])
109
+ for i in range(len(filled_words))]
110
+ return segments
111
+
112
+
113
+ def to_word_level_segments(segments: List[dict], tag: Tuple[str, str]) -> List[dict]:
114
+ return list(
115
+ chain.from_iterable(
116
+ words2segments(s['words'], tag, reverse_text=s.get('reversed_text'))
117
+ for s in segments
118
+ )
119
+ )
120
+
121
+
122
+ def to_vtt_word_level_segments(segments: List[dict], tag: Tuple[str, str] = None) -> List[dict]:
123
+ def to_segment_string(segment: dict):
124
+ segment_string = ''
125
+ prev_end = 0
126
+ for i, word in enumerate(segment['words']):
127
+ if i != 0:
128
+ curr_start = word['start']
129
+ if prev_end == curr_start:
130
+ segment_string += f"<{sec2vtt(curr_start)}>"
131
+ else:
132
+ if segment_string.endswith(' '):
133
+ segment_string = segment_string[:-1]
134
+ elif segment['words'][i]['word'].startswith(' '):
135
+ segment['words'][i]['word'] = segment['words'][i]['word'][1:]
136
+ segment_string += f"<{sec2vtt(prev_end)}> <{sec2vtt(curr_start)}>"
137
+ segment_string += word['word']
138
+ prev_end = word['end']
139
+ return segment_string
140
+
141
+ return [
142
+ dict(
143
+ text=to_segment_string(s),
144
+ start=s['start'],
145
+ end=s['end']
146
+ )
147
+ for s in segments
148
+ ]
149
+
150
+
151
+ def to_ass_word_level_segments(segments: List[dict], tag: Tuple[str, str], karaoke: bool = False) -> List[dict]:
152
+
153
+ def to_segment_string(segment: dict):
154
+ segment_string = ''
155
+ for i, word in enumerate(segment['words']):
156
+ curr_word, space = (word['word'][1:], " ") if word['word'].startswith(" ") else (word['word'], "")
157
+ segment_string += (
158
+ space +
159
+ r"{\k" +
160
+ ("f" if karaoke else "") +
161
+ f"{sec2centiseconds(word['end']-word['start'])}" +
162
+ r"}" +
163
+ curr_word
164
+ )
165
+ return segment_string
166
+
167
+ return [
168
+ dict(
169
+ text=to_segment_string(s),
170
+ start=s['start'],
171
+ end=s['end']
172
+ )
173
+ for s in segments
174
+ ]
175
+
176
+
177
+ def to_word_level(segments: List[dict]) -> List[dict]:
178
+ return [dict(text=w['word'], start=w['start'], end=w['end']) for s in segments for w in s['words']]
179
+
180
+
181
+ def _confirm_word_level(segments: List[dict]) -> bool:
182
+ if not all(bool(s.get('words')) for s in segments):
183
+ warnings.warn('Result is missing word timestamps. Word-level timing cannot be exported. '
184
+ 'Use "word_level=False" to avoid this warning')
185
+ return False
186
+ return True
187
+
188
+
189
+ def _preprocess_args(result: (dict, list),
190
+ segment_level: bool,
191
+ word_level: bool,
192
+ min_dur: float,
193
+ reverse_text: Union[bool, tuple] = False):
194
+ assert segment_level or word_level, '`segment_level` or `word_level` must be True'
195
+ segments = _get_segments(result, min_dur, reverse_text=reverse_text)
196
+ if word_level:
197
+ word_level = _confirm_word_level(segments)
198
+ return segments, segment_level, word_level
199
+
200
+
201
+ def result_to_any(result: (dict, list),
202
+ filepath: str = None,
203
+ filetype: str = None,
204
+ segments2blocks: Callable = None,
205
+ segment_level=True,
206
+ word_level=True,
207
+ min_dur: float = 0.02,
208
+ tag: Tuple[str, str] = None,
209
+ default_tag: Tuple[str, str] = None,
210
+ strip=True,
211
+ reverse_text: Union[bool, tuple] = False,
212
+ to_word_level_string_callback: Callable = None):
213
+ """
214
+ Generate file from ``result`` to display segment-level and/or word-level timestamp.
215
+
216
+ Returns
217
+ -------
218
+ str
219
+ String of the content if ``filepath`` is ``None``.
220
+ """
221
+ segments, segment_level, word_level = _preprocess_args(
222
+ result, segment_level, word_level, min_dur, reverse_text=reverse_text
223
+ )
224
+
225
+ if filetype is None:
226
+ filetype = os.path.splitext(filepath)[-1][1:] or 'srt'
227
+ if filetype.lower() not in SUPPORTED_FORMATS:
228
+ raise NotImplementedError(f'{filetype} not supported')
229
+ if filepath and not filepath.lower().endswith(f'.{filetype}'):
230
+ filepath += f'.{filetype}'
231
+
232
+ if word_level and segment_level:
233
+ if tag is None:
234
+ if default_tag is None:
235
+ tag = ('<font color="#00ff00">', '</font>') if filetype == 'srt' else ('<u>', '</u>')
236
+ else:
237
+ tag = default_tag
238
+ if to_word_level_string_callback is None:
239
+ to_word_level_string_callback = to_word_level_segments
240
+ segments = to_word_level_string_callback(segments, tag)
241
+ elif word_level:
242
+ segments = to_word_level(segments)
243
+
244
+ valid_ts(segments)
245
+
246
+ if segments2blocks is None:
247
+ sub_str = '\n\n'.join(segment2srtblock(s, i, strip=strip) for i, s in enumerate(segments))
248
+ else:
249
+ sub_str = segments2blocks(segments)
250
+
251
+ if filepath:
252
+ _save_as_file(sub_str, filepath)
253
+ else:
254
+ return sub_str
255
+
256
+
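`result_to_any` is the shared backend of the exporters that follow; as a rough sketch (not an official format), a custom `segments2blocks` can emit any text layout. `result` is assumed to come from a prior transcription:

    def segments2csv(segments):
        return '\n'.join(f"{s['start']},{s['end']},{s['text'].strip()}" for s in segments)

    csv_str = result_to_any(result, filetype='txt', segments2blocks=segments2csv, word_level=False)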
257
+ def result_to_srt_vtt(result: (dict, list),
258
+ filepath: str = None,
259
+ segment_level=True,
260
+ word_level=True,
261
+ min_dur: float = 0.02,
262
+ tag: Tuple[str, str] = None,
263
+ vtt: bool = None,
264
+ strip=True,
265
+ reverse_text: Union[bool, tuple] = False):
266
+ """
267
+ Generate SRT/VTT from ``result`` to display segment-level and/or word-level timestamp.
268
+
269
+ Parameters
270
+ ----------
271
+ result : dict or list or stable_whisper.result.WhisperResult
272
+ Result of transcription.
273
+ filepath : str, default None, meaning content will be returned as a ``str``
274
+ Path to save file.
275
+ segment_level : bool, default True
276
+ Whether to use segment-level timestamps in output.
277
+ word_level : bool, default True
278
+ Whether to use word-level timestamps in output.
279
+ min_dur : float, default 0.02
280
+ Minimum duration allowed for any word/segment before the word/segments are merged with adjacent word/segments.
281
+ tag: tuple of (str, str), default None, meaning ('<font color="#00ff00">', '</font>') if SRT else ('<u>', '</u>')
282
+ Tag used to change the properties of a word at its timestamp.
283
+ vtt : bool, default None, meaning determined by extension of ``filepath`` or ``False`` if no valid extension.
284
+ Whether to output VTT.
285
+ strip : bool, default True
286
+ Whether to remove spaces before and after text on each segment for output.
287
+ reverse_text: bool or tuple, default False
288
+ Whether to reverse the order of words for each segment. A tuple pair of ``prepend_punctuations`` and
289
+ ``append_punctuations`` can be provided instead of ``True`` to override the default punctuations.
290
+
291
+ Returns
292
+ -------
293
+ str
294
+ String of the content if ``filepath`` is ``None``.
295
+
296
+ Notes
297
+ -----
298
+ ``reverse_text`` will not fix RTL text not displaying tags properly, which is an issue with some video players. VLC
299
+ does not appear to suffer from this issue.
300
+
301
+ Examples
302
+ --------
303
+ >>> import stable_whisper
304
+ >>> model = stable_whisper.load_model('base')
305
+ >>> result = model.transcribe('audio.mp3')
306
+ >>> result.to_srt_vtt('audio.srt')
307
+ Saved: audio.srt
308
+ """
309
+ is_srt = (filepath is None or not filepath.lower().endswith('.vtt')) if vtt is None else not vtt
310
+ if is_srt:
311
+ segments2blocks = None
312
+ to_word_level_string_callback = None
313
+ else:
314
+ def segments2blocks(segments):
315
+ return 'WEBVTT\n\n' + '\n\n'.join(segment2vttblock(s, strip=strip) for i, s in enumerate(segments))
316
+ to_word_level_string_callback = to_vtt_word_level_segments if tag is None else None  # custom tags use the generic tag-wrapping path
317
+
318
+ return result_to_any(
319
+ result=result,
320
+ filepath=filepath,
321
+ filetype=('vtt', 'srt')[is_srt],
322
+ segments2blocks=segments2blocks,
323
+ segment_level=segment_level,
324
+ word_level=word_level,
325
+ min_dur=min_dur,
326
+ tag=tag,
327
+ strip=strip,
328
+ reverse_text=reverse_text,
329
+ to_word_level_string_callback=to_word_level_string_callback
330
+ )
331
+
332
+
333
+ def result_to_tsv(result: (dict, list),
334
+ filepath: str = None,
335
+ segment_level: bool = None,
336
+ word_level: bool = None,
337
+ min_dur: float = 0.02,
338
+ strip=True,
339
+ reverse_text: Union[bool, tuple] = False):
340
+ """
341
+ Generate TSV from ``result`` to display segment-level and/or word-level timestamp.
342
+
343
+ Parameters
344
+ ----------
345
+ result : dict or list or stable_whisper.result.WhisperResult
346
+ Result of transcription.
347
+ filepath : str, default None, meaning content will be returned as a ``str``
348
+ Path to save file.
349
+ segment_level : bool, default True
350
+ Whether to use segment-level timestamps in output.
351
+ word_level : bool, default True
352
+ Whether to use word-level timestamps in output.
353
+ min_dur : float, default 0.02
354
+ Minimum duration allowed for any word/segment before the word/segments are merged with adjacent word/segments.
355
+ strip : bool, default True
356
+ Whether to remove spaces before and after text on each segment for output.
357
+ reverse_text: bool or tuple, default False
358
+ Whether to reverse the order of words for each segment. A tuple pair of ``prepend_punctuations`` and
359
+ ``append_punctuations`` can be provided instead of ``True`` to override the default punctuations.
360
+
361
+ Returns
362
+ -------
363
+ str
364
+ String of the content if ``filepath`` is ``None``.
365
+
366
+ Notes
367
+ -----
368
+ ``reverse_text`` will not fix RTL text not displaying tags properly, which is an issue with some video players. VLC
369
+ does not appear to suffer from this issue.
370
+
371
+ Examples
372
+ --------
373
+ >>> import stable_whisper
374
+ >>> model = stable_whisper.load_model('base')
375
+ >>> result = model.transcribe('audio.mp3')
376
+ >>> result.to_tsv('audio.tsv')
377
+ Saved: audio.tsv
378
+ """
379
+ if segment_level is None and word_level is None:
380
+ segment_level = True
381
+ assert word_level is not segment_level, '[word_level] and [segment_level] cannot be the same ' \
382
+ 'since [tag] is not supported for this format'
383
+
384
+ def segments2blocks(segments):
385
+ return '\n\n'.join(segment2tsvblock(s, strip=strip) for i, s in enumerate(segments))
386
+ return result_to_any(
387
+ result=result,
388
+ filepath=filepath,
389
+ filetype='tsv',
390
+ segments2blocks=segments2blocks,
391
+ segment_level=segment_level,
392
+ word_level=word_level,
393
+ min_dur=min_dur,
394
+ strip=strip,
395
+ reverse_text=reverse_text
396
+ )
397
+
398
+
399
+ def result_to_ass(result: (dict, list),
400
+ filepath: str = None,
401
+ segment_level=True,
402
+ word_level=True,
403
+ min_dur: float = 0.02,
404
+ tag: Union[Tuple[str, str], int] = None,
405
+ font: str = None,
406
+ font_size: int = 24,
407
+ strip=True,
408
+ highlight_color: str = None,
409
+ karaoke=False,
410
+ reverse_text: Union[bool, tuple] = False,
411
+ **kwargs):
412
+ """
413
+ Generate Advanced SubStation Alpha (ASS) file from ``result`` to display segment-level and/or word-level timestamp.
414
+
415
+ Parameters
416
+ ----------
417
+ result : dict or list or stable_whisper.result.WhisperResult
418
+ Result of transcription.
419
+ filepath : str, default None, meaning content will be returned as a ``str``
420
+ Path to save file.
421
+ segment_level : bool, default True
422
+ Whether to use segment-level timestamps in output.
423
+ word_level : bool, default True
424
+ Whether to use word-level timestamps in output.
425
+ min_dur : float, default 0.02
426
+ Minimum duration allowed for any word/segment before the word/segments are merged with adjacent word/segments.
427
+ tag: tuple of (str, str) or int, default None, meaning use default highlighting
428
+ Tag used to change the properties of a word at its timestamp. -1 for individual word highlight tag.
429
+ font : str, default ``Arial``
430
+ Word font.
431
+ font_size : int, default 24
432
+ Word font size.
433
+ strip : bool, default True
434
+ Whether to remove spaces before and after text on each segment for output.
435
+ highlight_color : str, default '00ff00'
436
+ Hexadecimal of the color used for default highlights as '<bb><gg><rr>'.
437
+ karaoke : bool, default False
438
+ Whether to use progressive filling highlights (for karaoke effect).
439
+ reverse_text: bool or tuple, default False
440
+ Whether to reverse the order of words for each segment. A tuple pair of ``prepend_punctuations`` and
441
+ ``append_punctuations`` can be provided instead of ``True`` to override the default punctuations.
442
+ kwargs:
443
+ Format styles:
444
+ 'Name', 'Fontname', 'Fontsize', 'PrimaryColour', 'SecondaryColour', 'OutlineColour', 'BackColour', 'Bold',
445
+ 'Italic', 'Underline', 'StrikeOut', 'ScaleX', 'ScaleY', 'Spacing', 'Angle', 'BorderStyle', 'Outline',
446
+ 'Shadow', 'Alignment', 'MarginL', 'MarginR', 'MarginV', 'Encoding'
447
+
448
+ Returns
449
+ -------
450
+ str
451
+ String of the content if ``filepath`` is ``None``.
452
+
453
+ Notes
454
+ -----
455
+ ``reverse_text`` will not fix RTL text not displaying tags properly, which is an issue with some video players. VLC
456
+ does not appear to suffer from this issue.
457
+
458
+ Examples
459
+ --------
460
+ >>> import stable_whisper
461
+ >>> model = stable_whisper.load_model('base')
462
+ >>> result = model.transcribe('audio.mp3')
463
+ >>> result.to_ass('audio.ass')
464
+ Saved: audio.ass
465
+ """
466
+ if tag == ['-1']: # CLI
467
+ tag = -1
468
+ if highlight_color is None:
469
+ highlight_color = '00ff00'
470
+
471
+ def segments2blocks(segments):
472
+ fmt_style_dict = {'Name': 'Default', 'Fontname': 'Arial', 'Fontsize': '48', 'PrimaryColour': '&Hffffff',
473
+ 'SecondaryColour': '&Hffffff', 'OutlineColour': '&H0', 'BackColour': '&H0', 'Bold': '0',
474
+ 'Italic': '0', 'Underline': '0', 'StrikeOut': '0', 'ScaleX': '100', 'ScaleY': '100',
475
+ 'Spacing': '0', 'Angle': '0', 'BorderStyle': '1', 'Outline': '1', 'Shadow': '0',
476
+ 'Alignment': '2', 'MarginL': '10', 'MarginR': '10', 'MarginV': '10', 'Encoding': '0'}
477
+
478
+ for k, v in filter(lambda x: 'colour' in x[0].lower() and not str(x[1]).startswith('&H'), kwargs.items()):
479
+ kwargs[k] = f'&H{kwargs[k]}'
480
+
481
+ fmt_style_dict.update((k, v) for k, v in kwargs.items() if k in fmt_style_dict)
482
+
483
+ if tag is None and 'PrimaryColour' not in kwargs:
484
+ fmt_style_dict['PrimaryColour'] = \
485
+ highlight_color if highlight_color.startswith('&H') else f'&H{highlight_color}'
486
+
487
+ if font:
488
+ fmt_style_dict.update(Fontname=font)
489
+ if font_size:
490
+ fmt_style_dict.update(Fontsize=font_size)
491
+
492
+ fmts = f'Format: {", ".join(map(str, fmt_style_dict.keys()))}'
493
+
494
+ styles = f'Style: {",".join(map(str, fmt_style_dict.values()))}'
495
+
496
+ sub_str = f'[Script Info]\nScriptType: v4.00+\nPlayResX: 384\nPlayResY: 288\nScaledBorderAndShadow: yes\n\n' \
497
+ f'[V4+ Styles]\n{fmts}\n{styles}\n\n' \
498
+ f'[Events]\nFormat: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text\n\n'
499
+
500
+ sub_str += '\n'.join(segment2assblock(s, i, strip=strip) for i, s in enumerate(segments))
501
+
502
+ return sub_str
503
+
504
+ if tag is not None and karaoke:
505
+ warnings.warn('[tag] is not supported for [karaoke]=True; [tag] will be ignored.')
506
+
507
+ return result_to_any(
508
+ result=result,
509
+ filepath=filepath,
510
+ filetype='ass',
511
+ segments2blocks=segments2blocks,
512
+ segment_level=segment_level,
513
+ word_level=word_level,
514
+ min_dur=min_dur,
515
+ tag=None if tag == -1 else tag,
516
+ default_tag=(r'{\1c' + f'{highlight_color}&' + '}', r'{\r}'),
517
+ strip=strip,
518
+ reverse_text=reverse_text,
519
+ to_word_level_string_callback=(
520
+ (lambda s, t: to_ass_word_level_segments(s, t, karaoke=karaoke))
521
+ if karaoke or (word_level and segment_level and tag is None)
522
+ else None
523
+ )
524
+ )
525
+
526
+
527
+ def result_to_txt(
528
+ result: (dict, list),
529
+ filepath: str = None,
530
+ min_dur: float = 0.02,
531
+ strip=True,
532
+ reverse_text: Union[bool, tuple] = False
533
+ ):
534
+ """
535
+ Generate plain-text without timestamps from ``result``.
536
+
537
+ Parameters
538
+ ----------
539
+ result : dict or list or stable_whisper.result.WhisperResult
540
+ Result of transcription.
541
+ filepath : str, default None, meaning content will be returned as a ``str``
542
+ Path to save file.
543
+ min_dur : float, default 0.02
544
+ Minimum duration allowed for any word/segment before the word/segments are merged with adjacent word/segments.
545
+ strip : bool, default True
546
+ Whether to remove spaces before and after text on each segment for output.
547
+ reverse_text: bool or tuple, default False
548
+ Whether to reverse the order of words for each segment. A tuple pair of ``prepend_punctuations`` and
549
+ ``append_punctuations`` can be provided instead of ``True`` to override the default punctuations.
550
+
551
+ Returns
552
+ -------
553
+ str
554
+ String of the content if ``filepath`` is ``None``.
555
+
556
+ Notes
557
+ -----
558
+ ``reverse_text`` will not fix RTL text not displaying tags properly, which is an issue with some video players. VLC
559
+ does not appear to suffer from this issue.
560
+
561
+ Examples
562
+ --------
563
+ >>> import stable_whisper
564
+ >>> model = stable_whisper.load_model('base')
565
+ >>> result = model.transcribe('audio.mp3')
566
+ >>> result.to_txt('audio.txt')
567
+ Saved: audio.txt
568
+ """
569
+
570
+ def segments2blocks(segments: dict, _strip=True) -> str:
571
+ return '\n'.join(f'{segment["text"].strip() if _strip else segment["text"]}' for segment in segments)
572
+
573
+ return result_to_any(
574
+ result=result,
575
+ filepath=filepath,
576
+ filetype='txt',
577
+ segments2blocks=segments2blocks,
578
+ segment_level=True,
579
+ word_level=False,
580
+ min_dur=min_dur,
581
+ strip=strip,
582
+ reverse_text=reverse_text
583
+ )
584
+
585
+
586
+ def save_as_json(result: dict, path: str, ensure_ascii: bool = False, **kwargs):
587
+ """
588
+ Save ``result`` as JSON file to ``path``.
589
+
590
+ Parameters
591
+ ----------
592
+ result : dict or list or stable_whisper.result.WhisperResult
593
+ Result of transcription.
594
+ path : str
595
+ Path to save file.
596
+ ensure_ascii : bool, default False
597
+ Whether to escape non-ASCII characters.
598
+
599
+ Examples
600
+ --------
601
+ >>> import stable_whisper
602
+ >>> model = stable_whisper.load_model('base')
603
+ >>> result = model.transcribe('audio.mp3')
604
+ >>> result.save_as_json('audio.json')
605
+ Saved: audio.json
606
+ """
607
+ if not isinstance(result, dict) and callable(getattr(result, 'to_dict', None)):
608
+ result = result.to_dict()
609
+ if not path.lower().endswith('.json'):
610
+ path += '.json'
611
+ result = json.dumps(result, allow_nan=True, ensure_ascii=ensure_ascii, **kwargs)
612
+ _save_as_file(result, path)
613
+
614
+
615
+ def load_result(json_path: str) -> dict:
616
+ """
617
+ Return a ``dict`` of the contents in ``json_path``.
618
+ """
619
+ with open(json_path, 'r', encoding='utf-8') as f:
620
+ return json.load(f)
stable_whisper/timing.py ADDED
@@ -0,0 +1,275 @@
1
+ import string
2
+ import torch
3
+ import numpy as np
4
+ from typing import TYPE_CHECKING, List, Callable, Optional
5
+ from itertools import chain
6
+ from whisper.audio import TOKENS_PER_SECOND, N_SAMPLES_PER_TOKEN
7
+ from whisper.timing import WordTiming, median_filter, dtw, merge_punctuations
8
+
9
+ if TYPE_CHECKING:
10
+ from whisper.tokenizer import Tokenizer
11
+ from whisper.model import Whisper
12
+
13
+
14
+ # modified version of whisper.timing.find_alignment
15
+ def find_alignment_stable(
16
+ model: "Whisper",
17
+ tokenizer: "Tokenizer",
18
+ text_tokens: List[int],
19
+ mel: torch.Tensor,
20
+ num_samples: int,
21
+ *,
22
+ medfilt_width: int = 7,
23
+ qk_scale: float = 1.0,
24
+ ts_num: int = 0,
25
+ ts_noise: float = 0.1,
26
+ token_split=None,
27
+ audio_features: torch.Tensor = None
28
+ ) -> List[WordTiming]:
29
+ tokens = torch.tensor(
30
+ [
31
+ *tokenizer.sot_sequence,
32
+ tokenizer.no_timestamps,
33
+ *text_tokens,
34
+ tokenizer.eot,
35
+ ]
36
+ ).to(model.device)
37
+
38
+ # install hooks on the cross attention layers to retrieve the attention weights
39
+ QKs = [None] * model.dims.n_text_layer
40
+ hooks = [
41
+ block.cross_attn.register_forward_hook(
42
+ lambda _, ins, outs, index=i: QKs.__setitem__(index, outs[-1])
43
+ )
44
+ for i, block in enumerate(model.decoder.blocks)
45
+ ]
46
+
47
+ with torch.no_grad():
48
+ if audio_features is None:
49
+ audio_features = model.encoder(mel.unsqueeze(0))
50
+ if ts_num:
51
+ if ts_noise is None:
52
+ ts_noise = 0.1
53
+ extra_audio_features = audio_features.repeat_interleave(ts_num, 0)
54
+ torch.manual_seed(0)
55
+ audio_features = torch.cat([audio_features,
56
+ extra_audio_features *
57
+ (1 - (torch.rand_like(extra_audio_features) * ts_noise))],
58
+ dim=0)
59
+ logits = model.decoder(tokens.unsqueeze(0).repeat_interleave(audio_features.shape[0], 0),
60
+ audio_features)
61
+ else:
62
+ logits = model.decoder(tokens.unsqueeze(0), audio_features)
63
+
64
+ logits = logits[0]
65
+ sampled_logits = logits[len(tokenizer.sot_sequence):, : tokenizer.eot]
66
+ token_probs = sampled_logits.softmax(dim=-1)
67
+ text_token_probs = token_probs[np.arange(len(text_tokens)), text_tokens]
68
+ text_token_probs = text_token_probs.tolist()
69
+
70
+ for hook in hooks:
71
+ hook.remove()
72
+
73
+ # heads * tokens * frames
74
+ weights = torch.cat([QKs[_l][:, _h] for _l, _h in model.alignment_heads.indices().T], dim=0)
75
+ weights = weights[:, :, : round(num_samples / N_SAMPLES_PER_TOKEN)]
76
+ weights = (weights * qk_scale).softmax(dim=-1)
77
+ std, mean = torch.std_mean(weights, dim=-2, keepdim=True, unbiased=False)
78
+ weights = (weights - mean) / std
79
+ weights = median_filter(weights, medfilt_width)
80
+
81
+ matrix = weights.mean(axis=0)
82
+ matrix = matrix[len(tokenizer.sot_sequence): -1]
83
+ text_indices, time_indices = dtw(-matrix)
84
+
85
+ if token_split is None:
86
+ words, word_tokens = tokenizer.split_to_word_tokens(text_tokens + [tokenizer.eot])
87
+ else:
88
+ words, word_tokens = token_split
89
+ words.append(tokenizer.decode([tokenizer.eot]))
90
+ word_tokens.append([tokenizer.eot])
91
+ word_boundaries = np.pad(np.cumsum([len(t) for t in word_tokens[:-1]]), (1, 0))
92
+
93
+ jumps = np.pad(np.diff(text_indices), (1, 0), constant_values=1).astype(bool)
94
+ jump_times = time_indices[jumps].clip(min=0) / TOKENS_PER_SECOND
95
+ start_times = jump_times[word_boundaries[:-1]]
96
+ end_times = jump_times[word_boundaries[1:]]
97
+ word_probabilities = [
98
+ np.mean(text_token_probs[i:j])
99
+ for i, j in zip(word_boundaries[:-1], word_boundaries[1:])
100
+ ]
101
+
102
+ return [
103
+ WordTiming(word, tokens, start, end, probability)
104
+ for word, tokens, start, end, probability in zip(
105
+ words, word_tokens, start_times, end_times, word_probabilities
106
+ )
107
+ ]
108
+
109
+
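The word timings above hinge on the DTW path: wherever `text_indices` advances, the paired `time_indices` entry marks where that token begins. A toy sketch of that step with synthetic data (not real attention weights):

    import numpy as np

    text_indices = np.array([0, 0, 1, 1, 1, 2, 3])  # token index along the DTW path
    time_indices = np.array([0, 1, 2, 3, 4, 5, 6])  # audio frame along the DTW path
    jumps = np.pad(np.diff(text_indices), (1, 0), constant_values=1).astype(bool)
    jump_times = time_indices[jumps] / 50  # TOKENS_PER_SECOND == 50
    # jump_times == [0.0, 0.04, 0.1, 0.12]: the start time of tokens 0 through 3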
110
+ def _split_tokens(tokens: List[int], tokenizer: "Tokenizer"):
111
+ split_by_space = getattr(tokenizer, 'language_code', tokenizer.language) not in {"zh", "ja", "th", "lo", "my"}
112
+ text = tokenizer.decode_with_timestamps(tokens)
113
+ words = []
114
+ word_tokens = []
115
+ curr_tokens = []
116
+ is_append = False
117
+ for token in tokens:
118
+ curr_tokens.append(token)
119
+ curr_text = tokenizer.decode(curr_tokens)
120
+ is_whole = token >= tokenizer.eot
121
+ if not is_whole:
122
+ is_whole = text[:len(curr_text)] == curr_text
123
+ if is_whole and split_by_space:
124
+ is_append = not (curr_text.startswith(" ") or curr_text.strip() in string.punctuation)
125
+
126
+ if is_whole:
127
+ if is_append and len(words) != 0:
128
+ words[-1] += curr_text
129
+ word_tokens[-1].extend(curr_tokens)
130
+ else:
131
+ words.append(curr_text)
132
+ word_tokens.append(curr_tokens)
133
+ text = text[len(curr_text):]
134
+ curr_tokens = []
135
+
136
+ if len(curr_tokens) != 0:
137
+ words.append(curr_text if len(text) == 0 else text)
138
+ word_tokens.append(curr_tokens)
139
+ elif len(text) != 0:
140
+ words[-1] += text
141
+
142
+ return words, word_tokens
143
+
144
+
145
+ def split_word_tokens(segments: List[dict],
146
+ tokenizer: "Tokenizer",
147
+ *,
148
+ padding: (str, int) = None,
149
+ split_callback: Callable = None):
150
+ if padding is not None:
151
+ if isinstance(padding, str):
152
+ padding = tokenizer.encode(padding)
153
+ else:
154
+ padding = [padding]
155
+ tokens = []
156
+ seg_indices = []
157
+ words = []
158
+ word_tokens = []
159
+ for i, s in enumerate(segments):
160
+ temp_word_tokens = [t for t in s['tokens'] if not isinstance(t, int) or t < tokenizer.eot]
161
+ curr_words, curr_word_tokens = (
162
+ _split_tokens(temp_word_tokens, tokenizer)
163
+ if split_callback is None else
164
+ split_callback(temp_word_tokens, tokenizer)
165
+ )
166
+ assert len(curr_words) == len(curr_word_tokens), \
167
+ f'word count and token group count do not match, {len(curr_words)} and {len(curr_word_tokens)}'
168
+ if (
169
+ padding is not None and
170
+ curr_word_tokens[0][0] != padding and
171
+ (len(tokens) == 0 or tokens[-1] != padding)
172
+ ):
173
+ tokens.extend(padding)
174
+ words.append(None)
175
+ word_tokens.append(padding)
176
+ seg_indices.extend([i] * len(curr_words))
177
+ tokens.extend(list(chain.from_iterable(curr_word_tokens)))
178
+ words.extend(curr_words)
179
+ word_tokens.extend(curr_word_tokens)
180
+
181
+ return tokens, (words, word_tokens), seg_indices
182
+
183
+
184
+ def pop_empty_alignment(alignment: List[WordTiming]):
185
+ return list(reversed([alignment.pop(i) for i in reversed(range(len(alignment))) if alignment[i].word is None]))
186
+
187
+
188
+ # modified version of whisper.timing.add_word_timestamps
189
+ def add_word_timestamps_stable(
190
+ *,
191
+ segments: List[dict],
192
+ model: "Whisper",
193
+ tokenizer: "Tokenizer",
194
+ mel: torch.Tensor,
195
+ num_samples: int,
196
+ prepend_punctuations: str = "\"'“¿([{-",
197
+ append_punctuations: str = "\"'.。,,!!??::”)]}、",
198
+ audio_features: torch.Tensor = None,
199
+ ts_num: int = 0,
200
+ ts_noise: float = 0.1,
201
+ min_word_dur: float = 0.1,
202
+ split_callback: Callable = None,
203
+ gap_padding: Optional[str] = ' ...',
204
+ **kwargs,
205
+ ):
206
+ if len(segments) == 0:
207
+ return
208
+
209
+ if min_word_dur is None:
210
+ min_word_dur = 0
211
+
212
+ if prepend_punctuations is None:
213
+ prepend_punctuations = "\"'“¿([{-"
214
+
215
+ if append_punctuations is None:
216
+ append_punctuations = "\"'.。,,!!??::”)]}、"
217
+
218
+ def align():
219
+ for seg in segments:
220
+ seg['words'] = []
221
+
222
+ text_tokens, token_split, seg_indices = split_word_tokens(segments, tokenizer,
223
+ padding=gap_padding, split_callback=split_callback)
224
+
225
+ alignment = find_alignment_stable(model, tokenizer, text_tokens, mel, num_samples,
226
+ **kwargs,
227
+ token_split=token_split,
228
+ audio_features=audio_features,
229
+ ts_num=ts_num,
230
+ ts_noise=ts_noise)
231
+ alt_beginning_alignment = pop_empty_alignment(alignment)
232
+
233
+ merge_punctuations(alignment, prepend_punctuations, append_punctuations)
234
+
235
+ time_offset = segments[0]["seek"]
236
+
237
+ assert len(alignment) == len(seg_indices)
238
+ assert (gap_padding is None or len(segments) == len(alt_beginning_alignment))
239
+ for i, timing in zip(seg_indices, alignment):
240
+ if len(timing.tokens) != 0:
241
+ start = timing.start
242
+ end = timing.end
243
+ if (
244
+ len(segments[i]['words']) == 0 and
245
+ ((end - start) < min_word_dur) and
246
+ len(alt_beginning_alignment)
247
+ ):
248
+ start = alt_beginning_alignment[i].start
249
+ segments[i]['words'].append(
250
+ dict(
251
+ word=timing.word,
252
+ start=round(time_offset + start, 3),
253
+ end=round(time_offset + end, 3),
254
+ probability=timing.probability,
255
+ tokens=timing.tokens
256
+ )
257
+ )
258
+
259
+ align()
260
+ if (
261
+ gap_padding is not None and
262
+ any(
263
+ (word['end'] - word['start']) < min_word_dur
264
+ for seg in segments
265
+ for word in seg['words']
266
+ )
267
+ ):
268
+ gap_padding = None
269
+ align()
270
+
271
+ for segment in segments:
272
+ if len(words := segment["words"]) > 0:
273
+ # adjust the segment-level timestamps based on the word-level timestamps
274
+ segment["start"] = words[0]["start"]
275
+ segment["end"] = words[-1]["end"]
stable_whisper/utils.py ADDED
@@ -0,0 +1,78 @@
1
+ import inspect
2
+ import sys
3
+
4
+
5
+ system_encoding = sys.getdefaultencoding()
6
+
7
+ if system_encoding != "utf-8":
8
+
9
+ def make_safe(string):
10
+ # replaces any character not representable using the system default encoding with an '?',
11
+ # avoiding UnicodeEncodeError (https://github.com/openai/whisper/discussions/729).
12
+ return string.encode(system_encoding, errors="replace").decode(system_encoding)
13
+
14
+ else:
15
+
16
+ def make_safe(string):
17
+ # utf-8 can encode any Unicode code point, so no need to do the round-trip encoding
18
+ return string
19
+
20
+
21
+ def str_to_valid_type(val: str):
22
+ if len(val) == 0:
23
+ return None
24
+ if '/' in val:
25
+ return [a.split('*') if '*' in a else a for a in val.split('/')]
26
+ try:
27
+ val = float(val) if '.' in val else int(val)
28
+ except ValueError:
29
+ pass
30
+ finally:
31
+ return val
32
+
33
+
34
+ def get_func_parameters(func):
35
+ return inspect.signature(func).parameters.keys()
36
+
37
+
38
+ def isolate_useful_options(options: dict, method, pop: bool = False) -> dict:
39
+ _get = dict.pop if pop else dict.get
40
+ return {k: _get(options, k) for k in get_func_parameters(method) if k in options}
41
+
42
+
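For instance, with a hypothetical function `f`:

    def f(a, b=1):
        return a + b

    opts = {'a': 2, 'b': 3, 'c': 4}
    isolate_useful_options(opts, f)            # {'a': 2, 'b': 3}
    isolate_useful_options(opts, f, pop=True)  # same result, and removes 'a'/'b' from opts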
43
+ def safe_print(msg: str, _print=None):
44
+ if msg:
45
+ (_print or print)(make_safe(msg))
46
+
47
+
48
+ def format_timestamp(
49
+ seconds: float, always_include_hours: bool = False, decimal_marker: str = "."
50
+ ):
51
+ assert seconds >= 0, "non-negative timestamp expected"
52
+ milliseconds = round(seconds * 1000.0)
53
+
54
+ hours = milliseconds // 3_600_000
55
+ milliseconds -= hours * 3_600_000
56
+
57
+ minutes = milliseconds // 60_000
58
+ milliseconds -= minutes * 60_000
59
+
60
+ seconds = milliseconds // 1_000
61
+ milliseconds -= seconds * 1_000
62
+
63
+ hours_marker = f"{hours:02d}:" if always_include_hours or hours > 0 else ""
64
+ return (
65
+ f"{hours_marker}{minutes:02d}:{seconds:02d}{decimal_marker}{milliseconds:03d}"
66
+ )
67
+
68
+
69
+ class UnsortedException(Exception):
70
+
71
+ def __init__(self, message: str = None, data: dict = None):
72
+ if not message:
73
+ message = 'Timestamps are not in ascending order. If data is produced by Stable-ts, please submit an issue.'
74
+ super().__init__(message)
75
+ self.data = data
76
+
77
+ def get_data(self):
78
+ return self.data
stable_whisper/video_output.py ADDED
@@ -0,0 +1,111 @@
1
+ import os
2
+ import subprocess as sp
3
+ import warnings
4
+ from typing import List
5
+
6
+ __all__ = ['encode_video_comparison']
7
+
8
+
9
+ def encode_video_comparison(
10
+ audiofile: str,
11
+ subtitle_files: List[str],
12
+ output_videopath: str = None,
13
+ *,
14
+ labels: List[str] = None,
15
+ height: int = 90,
16
+ width: int = 720,
17
+ color: str = 'black',
18
+ fontsize: int = 70,
19
+ border_color: str = 'white',
20
+ label_color: str = 'white',
21
+ label_size: int = 14,
22
+ fps: int = 25,
23
+ video_codec: str = None,
24
+ audio_codec: str = None,
25
+ overwrite=False,
26
+ only_cmd: bool = False,
27
+ verbose=True
28
+ ) -> (str, None):
29
+ """
30
+ Encode multiple subtitle files into one video with the subtitles vertically stacked.
31
+
32
+ Parameters
33
+ ----------
34
+ audiofile : str
35
+ Path of audio file.
36
+ subtitle_files : list of str
37
+ List of paths to subtitle files.
38
+ output_videopath : str, optional
39
+ Output video path.
40
+ labels : list of str, default None, meaning use ``subtitle_files`` as labels
41
+ List of labels for ``subtitle_files``.
42
+ height : int, default 90
43
+ Height for each subtitle section.
44
+ width : int, default 720
45
+ Width for each subtitle section.
46
+ color : str, default 'black'
47
+ Background color of the video.
48
+ fontsize : int, default 70
49
+ Font size for subtitles.
50
+ border_color : str, default 'white'
51
+ Border color separating the subtitle sections.
52
+ label_color : str, default 'white'
53
+ Color of labels.
54
+ label_size : int, default 14
55
+ Font size of labels.
56
+ fps : int, default 25
57
+ Frame-rate of the video.
58
+ video_codec : str, optional
59
+ Video codec of the video.
60
+ audio_codec : str, optional
61
+ Audio codec of the video.
62
+ overwrite : bool, default False
63
+ Whether to overwrite existing video files with the same path as the output video.
64
+ only_cmd : bool, default False
65
+ Whether to skip encoding and only return the full command generated from the specified options.
66
+ verbose : bool, default True
67
+ Whether to display ffmpeg processing info.
68
+
69
+ Returns
70
+ -------
71
+ str or None
72
+ Encoding command as a string if ``only_cmd = True``.
73
+ """
74
+ vc = '' if video_codec is None else f' -c:v {video_codec}'
75
+ ac = '' if audio_codec is None else f' -c:a {audio_codec}'
76
+ background = f'-f lavfi -i color=size={width}x{height}:rate={fps}:color={color}'
77
+ border = f'-f lavfi -i color=size={width}x3:rate={fps}:color={border_color}'
78
+ audio = f'-i "{audiofile}"'
79
+ cfilters0 = []
80
+ assert labels is None or len(labels) == len(subtitle_files)
81
+ for i, sub in enumerate(subtitle_files):
82
+ label = sub if labels is None else labels[i]
83
+ label = label.replace("'", '"')
84
+ fil = f"[0]drawtext=text='{label}':fontcolor={label_color}:fontsize={label_size}:x=10:y=10[a{i}]," \
85
+ f"[a{i}]subtitles='{sub}':force_style='Fontsize={fontsize}'[b{i}]"
86
+ cfilters0.append(fil)
87
+ cfilters1 = (
88
+ '[1]'.join(
89
+ f'[b{i}]' for i in range(len(cfilters0))
90
+ )
91
+ +
92
+ f'vstack=inputs={len(cfilters0) * 2 - 1}'
93
+ )
94
+ final_fil = ','.join(cfilters0) + f';{cfilters1}'
95
+ ow = '-y' if overwrite else '-n'
96
+ if output_videopath is None:
97
+ name = os.path.split(os.path.splitext(audiofile)[0])[1]
98
+ output_videopath = f'{name}_sub_comparison.mp4'
99
+ cmd = (f'ffmpeg {ow} {background} {border} {audio} '
100
+ f'-filter_complex "{final_fil}"{vc}{ac} -shortest "{output_videopath}"')
101
+ if only_cmd:
102
+ return cmd
103
+ if verbose:
104
+ print(cmd)
105
+ rc = sp.run(cmd, capture_output=not verbose).returncode
106
+ if rc == 0:
107
+ if verbose:
108
+ print(f'Encoded: {output_videopath}')
109
+ else:
110
+ warnings.warn(f'Failed to encode {output_videopath}')
111
+
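+ # Hedged usage sketch for ``encode_video_comparison``; the paths and labels are
+ # hypothetical. With ``only_cmd=True`` the function only builds and returns the
+ # ffmpeg command string instead of running it:
+ #
+ # >>> cmd = encode_video_comparison(
+ # ...     'audio.mp3',
+ # ...     ['model_a.srt', 'model_b.srt'],
+ # ...     labels=['Model A', 'Model B'],
+ # ...     only_cmd=True,
+ # ... )
+ # >>> cmd.startswith('ffmpeg -n')
+ # True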
stable_whisper/whisper_compatibility.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import warnings
2
+ import importlib.metadata
3
+
4
+ import whisper.tokenizer
5
+
6
+ from .utils import get_func_parameters
7
+
8
+ _COMPATIBLE_WHISPER_VERSIONS = (
9
+ '20230314',
10
+ '20230918',
11
+ '20231105',
12
+ '20231106',
13
+ '20231117',
14
+ )
15
+ _required_whisper_ver = _COMPATIBLE_WHISPER_VERSIONS[-1]
16
+
17
+ _TOKENIZER_PARAMS = get_func_parameters(whisper.tokenizer.get_tokenizer)
18
+
19
+
20
+ def warn_compatibility_issues(
21
+ whisper_module,
22
+ ignore: bool = False,
23
+ additional_msg: str = ''
24
+ ):
25
+ compatibility_warning = ''
26
+ if not ignore:
27
+ if whisper_module.__version__ not in _COMPATIBLE_WHISPER_VERSIONS:
28
+ compatibility_warning += (f'Whisper {whisper_module.__version__} is installed. '
29
+ f'Versions confirmed to be compatible: {", ".join(_COMPATIBLE_WHISPER_VERSIONS)}\n')
30
+ _is_whisper_repo_version = bool(importlib.metadata.distribution('openai-whisper').read_text('direct_url.json'))
31
+ if _is_whisper_repo_version:
32
+ compatibility_warning += ('The detected version appears to be installed from the repository '
33
+ 'which can have compatibility issues '
34
+ 'due to multiple commits sharing the same version number. '
35
+ f'It is recommended to install version {_required_whisper_ver} from PyPI.\n')
36
+
37
+ if compatibility_warning:
38
+ compatibility_warning = (
39
+ 'The installed version of Whisper might be incompatible.\n'
40
+ + compatibility_warning +
41
+ 'To prevent errors and performance issues, reinstall the correct version with: '
42
+ f'"pip install --upgrade --no-deps --force-reinstall openai-whisper=={_required_whisper_ver}".'
43
+ )
44
+ if additional_msg:
45
+ compatibility_warning += f' {additional_msg}'
46
+ warnings.warn(compatibility_warning)
47
+
48
+
49
+ def get_tokenizer(model=None, is_faster_model: bool = False, **kwargs):
50
+ """
51
+ Backward compatible wrapper of :func:`whisper.tokenizer.get_tokenizer` and
52
+ :class:`faster_whisper.tokenizer.Tokenizer`.
53
+ """
54
+ if is_faster_model:
55
+ import faster_whisper.tokenizer
56
+ tokenizer = faster_whisper.tokenizer.Tokenizer
57
+ params = get_func_parameters(tokenizer)
58
+ if model is not None and 'tokenizer' not in kwargs:
59
+ kwargs['tokenizer'] = model.hf_tokenizer
60
+ else:
61
+ tokenizer = whisper.tokenizer.get_tokenizer
62
+ params = _TOKENIZER_PARAMS
63
+ if model is not None and 'multilingual' not in kwargs:
64
+ kwargs['multilingual'] = \
65
+ (model.is_multilingual if hasattr(model, 'is_multilingual') else model.model.is_multilingual)
66
+ if 'num_languages' in params:
67
+ if hasattr(model, 'num_languages'):
68
+ kwargs['num_languages'] = \
69
+ (model.num_languages if hasattr(model, 'num_languages') else model.model.num_languages)
70
+ elif 'num_languages' in kwargs:
71
+ del kwargs['num_languages']
72
+ return tokenizer(**kwargs)
73
+
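+ # Hedged usage sketch (assumes ``model`` is an already-loaded vanilla Whisper
+ # model; arguments are illustrative):
+ #
+ # >>> tokenizer = get_tokenizer(model, language='en', task='transcribe')  # doctest: +SKIP
+ #
+ # For a faster-whisper model, pass ``is_faster_model=True`` so the wrapper
+ # constructs :class:`faster_whisper.tokenizer.Tokenizer` instead.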
stable_whisper/whisper_word_level.py ADDED
@@ -0,0 +1,1651 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import warnings
2
+ import torch
3
+ import numpy as np
4
+ from typing import TYPE_CHECKING, List, Optional, Tuple, Union, Callable
5
+ from types import MethodType
6
+ from tqdm import tqdm
7
+
8
+ import whisper
9
+ from whisper.audio import (
10
+ SAMPLE_RATE, N_FRAMES, HOP_LENGTH, N_SAMPLES, N_SAMPLES_PER_TOKEN, TOKENS_PER_SECOND, FRAMES_PER_SECOND, N_FFT,
11
+ pad_or_trim, log_mel_spectrogram
12
+ )
13
+ from whisper.utils import exact_div
14
+ from whisper.tokenizer import LANGUAGES, TO_LANGUAGE_CODE
15
+ from whisper.decoding import DecodingOptions, DecodingResult
16
+
17
+ from .audio import prep_audio
18
+ from .decode import decode_stable
19
+ from .result import WhisperResult, Segment
20
+ from .timing import add_word_timestamps_stable
21
+ from .stabilization import get_vad_silence_func, wav2mask, mask2timing, timing2mask
22
+ from .non_whisper import transcribe_any
23
+ from .utils import isolate_useful_options, safe_print
24
+ from .whisper_compatibility import warn_compatibility_issues, get_tokenizer
25
+
26
+ if TYPE_CHECKING:
27
+ from whisper.model import Whisper
28
+
29
+ __all__ = ['modify_model', 'load_model', 'load_faster_whisper']
30
+
31
+ warnings.filterwarnings('ignore', module='whisper', message='.*Triton.*', category=UserWarning)
32
+
33
+
34
+ # modified version of whisper.transcribe.transcribe
35
+ def transcribe_stable(
36
+ model: "Whisper",
37
+ audio: Union[str, np.ndarray, torch.Tensor, bytes],
38
+ *,
39
+ verbose: Optional[bool] = False,
40
+ temperature: Union[float, Tuple[float, ...]] = (0.0, 0.2, 0.4, 0.6, 0.8, 1.0),
41
+ compression_ratio_threshold: Optional[float] = 2.4,
42
+ logprob_threshold: Optional[float] = -1.0,
43
+ no_speech_threshold: Optional[float] = 0.6,
44
+ condition_on_previous_text: bool = True,
45
+ initial_prompt: Optional[str] = None,
46
+ word_timestamps: bool = True,
47
+ regroup: Union[bool, str] = True,
48
+ ts_num: int = 0,
49
+ ts_noise: float = 0.1,
50
+ suppress_silence: bool = True,
51
+ suppress_word_ts: bool = True,
52
+ use_word_position: bool = True,
53
+ q_levels: int = 20,
54
+ k_size: int = 5,
55
+ time_scale: float = None,
56
+ demucs: Union[bool, torch.nn.Module] = False,
57
+ demucs_output: str = None,
58
+ demucs_options: dict = None,
59
+ vad: bool = False,
60
+ vad_threshold: float = 0.35,
61
+ vad_onnx: bool = False,
62
+ min_word_dur: float = 0.1,
63
+ nonspeech_error: float = 0.3,
64
+ only_voice_freq: bool = False,
65
+ prepend_punctuations: str = "\"'“¿([{-",
66
+ append_punctuations: str = "\"'.。,,!!??::”)]}、",
67
+ mel_first: bool = False,
68
+ split_callback: Callable = None,
69
+ suppress_ts_tokens: bool = False,
70
+ gap_padding: str = ' ...',
71
+ only_ffmpeg: bool = False,
72
+ max_instant_words: float = 0.5,
73
+ avg_prob_threshold: Optional[float] = None,
74
+ progress_callback: Callable = None,
75
+ ignore_compatibility: bool = False,
76
+ **decode_options) \
77
+ -> WhisperResult:
78
+ """
79
+ Transcribe audio using Whisper.
80
+
81
+ This is a modified version of :func:`whisper.transcribe.transcribe` with slightly different decoding logic while
82
+ allowing additional preprocessing and postprocessing. The preprocessing performed on the audio includes: isolating
83
+ voice / removing noise with Demucs and low/high-pass filter. The postprocessing performed on the transcription
84
+ result includes: adjusting timestamps with VAD and custom regrouping of segments based on punctuation and speech gaps.
85
+
86
+ Parameters
87
+ ----------
88
+ model : whisper.model.Whisper
89
+ An instance of Whisper ASR model.
90
+ audio : str or numpy.ndarray or torch.Tensor or bytes
91
+ Path/URL to the audio file, the audio waveform, or bytes of audio file.
92
+ If audio is :class:`numpy.ndarray` or :class:`torch.Tensor`, the audio must already be sampled at 16kHz.
93
+ verbose : bool or None, default False
94
+ Whether to display the text being decoded to the console.
95
+ Displays all the details if ``True``. Displays a progress bar if ``False``. Displays nothing if ``None``.
96
+ temperature : float or iterable of float, default (0.0, 0.2, 0.4, 0.6, 0.8, 1.0)
97
+ Temperature for sampling. It can be a tuple of temperatures, which will be successively used
98
+ upon failures according to either ``compression_ratio_threshold`` or ``logprob_threshold``.
99
+ compression_ratio_threshold : float, default 2.4
100
+ If the gzip compression ratio is above this value, treat as failed.
101
+ logprob_threshold : float, default -1
102
+ If the average log probability over sampled tokens is below this value, treat as failed
103
+ no_speech_threshold : float, default 0.6
104
+ If the no_speech probability is higher than this value AND the average log probability
105
+ over sampled tokens is below ``logprob_threshold``, consider the segment as silent
106
+ condition_on_previous_text : bool, default True
107
+ If ``True``, the previous output of the model is provided as a prompt for the next window;
108
+ disabling may make the text inconsistent across windows, but the model becomes less prone to
109
+ getting stuck in a failure loop, such as repetition looping or timestamps going out of sync.
110
+ initial_prompt : str, optional
111
+ Text to provide as a prompt for the first window. This can be used to provide, or
112
+ "prompt-engineer" a context for transcription, e.g. custom vocabularies or proper nouns
113
+ to make it more likely to predict those words correctly.
114
+ word_timestamps : bool, default True
115
+ Extract word-level timestamps using the cross-attention pattern and dynamic time warping,
116
+ and include the timestamps for each word in each segment.
117
+ Disabling this will prevent segments from splitting/merging properly.
118
+ regroup : bool or str, default True, meaning the default regroup algorithm
119
+ String for customizing the regrouping algorithm. False disables regrouping.
120
+ Ignored if ``word_timestamps = False``.
121
+ ts_num : int, default 0, meaning disable this option
122
+ Number of extra timestamp inferences to perform, then use the average of these extra timestamps.
123
+ An experimental option that might hurt performance.
124
+ ts_noise : float, default 0.1
125
+ Percentage of noise to add to audio_features to perform inferences for ``ts_num``.
126
+ suppress_silence : bool, default True
127
+ Whether to enable timestamp adjustments based on the detected silence.
128
+ suppress_word_ts : bool, default True
129
+ Whether to adjust word timestamps based on the detected silence. Only enabled if ``suppress_silence = True``.
130
+ use_word_position : bool, default True
131
+ Whether to use position of the word in its segment to determine whether to keep end or start timestamps if
132
+ adjustments are required. If it is the first word, keep the end. Else if it is the last word, keep the start.
133
+ q_levels : int, default 20
134
+ Quantization levels for generating timestamp suppression mask; ignored if ``vad = true``.
135
+ Acts as a threshold to marking sound as silent.
136
+ Fewer levels will increase the threshold of volume at which to mark a sound as silent.
137
+ k_size : int, default 5
138
+ Kernel size for avg-pooling waveform to generate timestamp suppression mask; ignored if ``vad = true``.
139
+ Recommend 5 or 3; higher sizes will reduce detection of silence.
140
+ time_scale : float, optional
141
+ Factor for scaling audio duration for inference.
142
+ Greater than 1.0 'slows down' the audio, and less than 1.0 'speeds up' the audio. None is the same as 1.0.
143
+ A factor of 1.5 will stretch 10s audio to 15s for inference. This increases the effective resolution
144
+ of the model but can increase word error rate.
145
+ demucs : bool or torch.nn.Module, default False
146
+ Whether to preprocess ``audio`` with Demucs to isolate vocals / remove noise. Set ``demucs`` to an instance of
147
+ a Demucs model to avoid reloading the model for each run.
148
+ Demucs must be installed to use. Official repo: https://github.com/facebookresearch/demucs.
149
+ demucs_output : str, optional
150
+ Path to save the vocals isolated by Demucs as WAV file. Ignored if ``demucs = False``.
151
+ Demucs must be installed to use. Official repo: https://github.com/facebookresearch/demucs.
152
+ demucs_options : dict, optional
153
+ Options to use for :func:`stable_whisper.audio.demucs_audio`.
154
+ vad : bool, default False
155
+ Whether to use Silero VAD to generate timestamp suppression mask.
156
+ Silero VAD requires PyTorch 1.12.0+. Official repo: https://github.com/snakers4/silero-vad.
157
+ vad_threshold : float, default 0.35
158
+ Threshold for detecting speech with Silero VAD. Low threshold reduces false positives for silence detection.
159
+ vad_onnx : bool, default False
160
+ Whether to use ONNX for Silero VAD.
161
+ min_word_dur : float, default 0.1
162
+ Shortest duration each word is allowed to reach for silence suppression.
163
+ nonspeech_error : float, default 0.3
164
+ Relative error of non-speech sections that appear in between a word for silence suppression.
165
+ only_voice_freq : bool, default False
166
+ Whether to only use sound between 200 - 5000 Hz, where the majority of human speech is.
167
+ prepend_punctuations : str, default "\"'“¿([{-"
168
+ Punctuations to prepend to next word.
169
+ append_punctuations : str, default "\"'.。,,!!??::”)]}、"
170
+ Punctuations to append to previous word.
171
+ mel_first : bool, default False
172
+ Process the entire audio track into a log-Mel spectrogram first instead of in chunks.
173
+ Used if odd behavior is seen in stable-ts but not in whisper; uses significantly more memory for long audio.
174
+ split_callback : Callable, optional
175
+ Custom callback for grouping tokens up with their corresponding words.
176
+ The callback must take two arguments, list of tokens and tokenizer.
177
+ The callback returns a tuple with a list of words and a corresponding nested list of tokens.
178
+ suppress_ts_tokens : bool, default False
179
+ Whether to suppress timestamp tokens during inference at timestamps detected as silent.
180
+ Reduces hallucinations in some cases, but also prone to ignore disfluencies and repetitions.
181
+ This option is ignored if ``suppress_silence = False``.
182
+ gap_padding : str, default ' ...'
183
+ Padding prepended to each segment for word timing alignment.
184
+ Used to reduce the probability of model predicting timestamps earlier than the first utterance.
185
+ only_ffmpeg : bool, default False
186
+ Whether to use only FFmpeg (instead of yt-dlp) for URLs.
187
+ max_instant_words : float, default 0.5
188
+ If the percentage of instantaneous words in a segment exceeds this amount, the segment is removed.
189
+ avg_prob_threshold: float or None, default None
190
+ Transcribe the gap after the previous word and if the average word probability of a segment falls below this
191
+ value, discard the segment. If ``None``, skip transcribing the gap to reduce chance of timestamps starting
192
+ before the next utterance.
193
+ progress_callback : Callable, optional
194
+ A function that will be called when transcription progress is updated.
195
+ The callback needs two parameters.
196
+ The first parameter is a float for seconds of the audio that has been transcribed.
197
+ The second parameter is a float for total duration of audio in seconds.
198
+ ignore_compatibility : bool, default False
199
+ Whether to ignore warnings for compatibility issues with the detected Whisper version.
200
+ decode_options
201
+ Keyword arguments to construct :class:`whisper.decoding.DecodingOptions` instances.
202
+
203
+ Returns
204
+ -------
205
+ stable_whisper.result.WhisperResult
206
+ All timestamps, words, probabilities, and other data from the transcription of ``audio``.
207
+
208
+ See Also
209
+ --------
210
+ stable_whisper.non_whisper.transcribe_any : Return :class:`stable_whisper.result.WhisperResult` containing all the
211
+ data from transcribing audio with unmodified :func:`whisper.transcribe.transcribe` with preprocessing and
212
+ postprocessing.
213
+ stable_whisper.whisper_word_level.load_faster_whisper.faster_transcribe : Return
214
+ :class:`stable_whisper.result.WhisperResult` containing all the data from transcribing audio with
215
+ :meth:`faster_whisper.WhisperModel.transcribe` with preprocessing and postprocessing.
216
+
217
+ Examples
218
+ --------
219
+ >>> import stable_whisper
220
+ >>> model = stable_whisper.load_model('base')
221
+ >>> result = model.transcribe('audio.mp3', vad=True)
222
+ >>> result.to_srt_vtt('audio.srt')
223
+ Saved: audio.srt
224
+ """
225
+ warn_compatibility_issues(whisper, ignore_compatibility, 'Or use transcribe_minimal().')
226
+ dtype = torch.float16 if decode_options.get("fp16", True) and not getattr(model, 'dq', False) else torch.float32
227
+ if model.device == torch.device("cpu"):
228
+ if torch.cuda.is_available():
229
+ warnings.warn("Performing inference on CPU when CUDA is available")
230
+ if dtype == torch.float16:
231
+ warnings.warn("FP16 is not supported on CPU; using FP32 instead")
232
+ dtype = torch.float32
233
+
234
+ if dtype == torch.float32:
235
+ decode_options["fp16"] = False
236
+
237
+ if 'max_initial_timestamp' not in decode_options:
238
+ decode_options['max_initial_timestamp'] = None
239
+
240
+ device = model.device
241
+
242
+ if time_scale:
243
+ warnings.warn('``time_scale`` is deprecated. It will not affect results.',
244
+ DeprecationWarning, stacklevel=2)
245
+ if decode_options.pop('input_sr', None):
246
+ warnings.warn('``input_sr`` is deprecated. '
247
+ '``audio`` of types numpy.ndarray and torch.Tensor must already be sampled at 16kHz. '
248
+ 'For higher sample rates, pass ``audio`` as str or bytes.',
249
+ DeprecationWarning, stacklevel=2)
250
+ if not demucs_options:
251
+ demucs_options = {}
252
+ if demucs_output:
253
+ if 'save_path' not in demucs_options:
254
+ demucs_options['save_path'] = demucs_output
255
+ warnings.warn('``demucs_output`` is deprecated. Use ``demucs_options`` with ``save_path`` instead. '
256
+ 'E.g. demucs_options=dict(save_path="demucs_output.mp3")',
257
+ DeprecationWarning, stacklevel=2)
258
+ if 'device' not in demucs_options:
259
+ demucs_options['device'] = device
260
+ audio = prep_audio(
261
+ audio,
262
+ demucs=demucs,
263
+ demucs_options=demucs_options,
264
+ only_voice_freq=only_voice_freq,
265
+ only_ffmpeg=only_ffmpeg,
266
+ verbose=verbose
267
+ )
268
+ sample_padding = int(N_FFT // 2) + 1
269
+ whole_mel = log_mel_spectrogram(audio, model.dims.n_mels, padding=sample_padding) if mel_first else None
270
+ tokenizer = None
271
+ language = None
272
+ initial_prompt_tokens = []
273
+ task = decode_options.get("task", "transcribe")
274
+
275
+ def detect_language():
276
+ nonlocal tokenizer
277
+ if tokenizer is None:
278
+ if decode_options.get("language", None) is None and model:
279
+ if not model.is_multilingual:
280
+ decode_options["language"] = "en"
281
+ else:
282
+ if verbose:
283
+ print("Detecting language using up to 30 seconds following first non-silent sample. "
284
+ "Use `--language` to specify the language")
285
+ timing_mask = None
286
+ if segment_silence_timing is not None:
287
+ timing_mask = np.logical_and(
288
+ segment_silence_timing[0] <= time_offset,
289
+ segment_silence_timing[1] >= time_offset
290
+ )
291
+ start_sample = (
292
+ None
293
+ if segment_silence_timing is None or not timing_mask.any() else
294
+ round(segment_silence_timing[1][timing_mask.nonzero()[0]][0] * SAMPLE_RATE)
295
+ )
296
+ if start_sample is None:
297
+ nonlocal mel_segment
298
+ curr_mel_segment = mel_segment
299
+ else:
300
+ if whole_mel is None:
301
+ curr_mel_segment = log_mel_spectrogram(
302
+ audio[..., start_sample:start_sample+N_SAMPLES],
303
+ model.dims.n_mels,
304
+ padding=sample_padding
305
+ )
306
+ else:
307
+ start_frame = int(start_sample/HOP_LENGTH)
308
+ curr_mel_segment = whole_mel[..., start_frame:start_frame+N_FRAMES]
309
+ curr_mel_segment = pad_or_trim(curr_mel_segment, N_FRAMES).to(device=device, dtype=dtype)
310
+ _, probs = model.detect_language(curr_mel_segment)
311
+ decode_options["language"] = max(probs, key=probs.get)
312
+ if verbose is not None:
313
+ detected_msg = f"Detected language: {LANGUAGES[decode_options['language']]}"
314
+ if tqdm_pbar.disable:
315
+ print(detected_msg)
316
+ else:
317
+ tqdm_pbar.write(detected_msg)
318
+
319
+ nonlocal language
320
+ language = decode_options["language"]
321
+ tokenizer = get_tokenizer(model, language=language, task=task)
322
+
323
+ if word_timestamps and task == "translate":
324
+ warnings.warn("Word-level timestamps on translations may not be reliable.")
325
+
326
+ if initial_prompt is not None:
327
+ nonlocal initial_prompt_tokens
328
+ initial_prompt_tokens = tokenizer.encode(" " + initial_prompt.strip())
329
+ all_tokens.extend(initial_prompt_tokens)
330
+
331
+ audio_features = None
332
+
333
+ def decode_with_fallback(seg: torch.Tensor,
334
+ ts_token_mask: torch.Tensor = None) \
335
+ -> DecodingResult:
336
+ nonlocal audio_features
337
+ temperatures = [temperature] if isinstance(temperature, (int, float)) else temperature
338
+ decode_result = None
339
+
340
+ for t in temperatures:
341
+ kwargs = {**decode_options}
342
+ if t > 0:
343
+ # disable beam_size and patience when t > 0
344
+ kwargs.pop("beam_size", None)
345
+ kwargs.pop("patience", None)
346
+ else:
347
+ # disable best_of when t == 0
348
+ kwargs.pop("best_of", None)
349
+
350
+ options = DecodingOptions(**kwargs, temperature=t)
351
+ decode_result, audio_features = decode_stable(model,
352
+ seg,
353
+ options,
354
+ ts_token_mask=ts_token_mask if suppress_ts_tokens else None,
355
+ audio_features=audio_features)
356
+
357
+ needs_fallback = False
358
+ if (
359
+ compression_ratio_threshold is not None
360
+ and decode_result.compression_ratio > compression_ratio_threshold
361
+ ):
362
+ needs_fallback = True # too repetitive
363
+ if (
364
+ logprob_threshold is not None
365
+ and decode_result.avg_logprob < logprob_threshold
366
+ ):
367
+ needs_fallback = True # average log probability is too low
368
+ if (
369
+ no_speech_threshold is not None
370
+ and decode_result.no_speech_prob > no_speech_threshold
371
+ ):
372
+ needs_fallback = False # silence
373
+
374
+ if not needs_fallback:
375
+ break
376
+
377
+ return decode_result
378
+
379
+ seek_sample = 0 # samples
380
+ input_stride = exact_div(
381
+ N_FRAMES, model.dims.n_audio_ctx
382
+ ) # mel frames per output token: 2
383
+ time_precision = (
384
+ input_stride * HOP_LENGTH / SAMPLE_RATE
385
+ ) # time per output token: 0.02 (seconds)
386
+ all_tokens = []
387
+ all_segments = []
388
+ prompt_reset_since = 0
389
+
390
+ def new_segment(
391
+ *, start: float, end: float, tokens: torch.Tensor, result: DecodingResult
392
+ ):
393
+ tokens = tokens.tolist()
394
+ text_tokens = [token for token in tokens if token < tokenizer.eot]
395
+ return {
396
+ "seek": round(seek_sample / SAMPLE_RATE, 3), # units in seconds
397
+ "start": start,
398
+ "end": end,
399
+ "text": tokenizer.decode(text_tokens),
400
+ "tokens": tokens,
401
+ "temperature": result.temperature,
402
+ "avg_logprob": result.avg_logprob,
403
+ "compression_ratio": result.compression_ratio,
404
+ "no_speech_prob": result.no_speech_prob,
405
+ }
406
+
407
+ punctuations = prepend_punctuations + append_punctuations
408
+
409
+ total_samples = audio.shape[-1]
410
+ total_duration = round(total_samples / SAMPLE_RATE, 2)
411
+ n_samples_per_frame = exact_div(N_SAMPLES_PER_TOKEN * TOKENS_PER_SECOND, FRAMES_PER_SECOND)
412
+
413
+ silent_timings = [[], []]
414
+ silence_timing = None
415
+ if suppress_silence and vad:
416
+ silence_timing = get_vad_silence_func(onnx=vad_onnx, verbose=verbose)(audio, speech_threshold=vad_threshold)
417
+
418
+ with tqdm(total=total_duration, unit='sec', disable=verbose is not False, desc=task.title()) as tqdm_pbar:
419
+
420
+ def update_pbar():
421
+ nonlocal audio_features
422
+ audio_features = None
423
+ seek_duration = min(total_duration, round(seek_sample / SAMPLE_RATE, 2))
424
+ if not tqdm_pbar.disable:
425
+ tqdm_pbar.update(seek_duration - tqdm_pbar.n)
426
+ if progress_callback is not None:
427
+ progress_callback(seek=seek_duration, total=total_duration)
428
+
429
+ def update_seek():
430
+ nonlocal seek_sample
431
+ seek_sample += segment_samples
432
+
433
+ def fast_forward():
434
+ # fast-forward to the next segment boundary
435
+ update_seek()
436
+ update_pbar()
437
+
438
+ while seek_sample < audio.shape[-1]:
439
+ seek_sample_end = seek_sample + N_SAMPLES
440
+ audio_segment = audio[seek_sample:seek_sample_end]
441
+ time_offset = seek_sample / SAMPLE_RATE
442
+ segment_samples = audio_segment.shape[-1]
443
+ segment_duration = segment_samples / SAMPLE_RATE
444
+
445
+ mel_segment = (
446
+ log_mel_spectrogram(audio_segment, model.dims.n_mels, padding=sample_padding)
447
+ if whole_mel is None else
448
+ whole_mel[..., round(seek_sample / n_samples_per_frame): round(seek_sample_end / n_samples_per_frame)]
449
+ )
450
+
451
+ mel_segment = pad_or_trim(mel_segment, N_FRAMES).to(device=model.device, dtype=dtype)
452
+
453
+ segment_silence_timing = None
454
+ ts_token_mask = None
455
+ if suppress_silence:
456
+ if silence_timing is None:
457
+ ts_token_mask = wav2mask(audio_segment, q_levels=q_levels, k_size=k_size)
458
+ segment_silence_timing = mask2timing(ts_token_mask, time_offset=time_offset)
459
+ else:
460
+ timing_indices = np.logical_and(
461
+ silence_timing[1] > time_offset,
462
+ silence_timing[0] < time_offset + segment_duration
463
+ )
464
+ segment_silence_timing = (silence_timing[0][timing_indices], silence_timing[1][timing_indices])
465
+
466
+ ts_token_mask = timing2mask(*segment_silence_timing, size=1501, time_offset=time_offset)
467
+
468
+ if mn := timing_indices.argmax():
469
+ silence_timing = (silence_timing[0][mn:], silence_timing[1][mn:])
470
+
471
+ if ts_token_mask is not None:
472
+ if ts_token_mask.all(): # segment is silent
473
+ fast_forward()
474
+ continue
475
+ ts_token_mask = pad_or_trim(ts_token_mask, 1501)
476
+
477
+ detect_language()
478
+ decode_options["prompt"] = all_tokens[prompt_reset_since:]
479
+ result: DecodingResult = decode_with_fallback(mel_segment, ts_token_mask=ts_token_mask)
480
+ tokens = torch.tensor(result.tokens)
481
+
482
+ if no_speech_threshold is not None:
483
+ # no voice activity check
484
+ should_skip = result.no_speech_prob > no_speech_threshold
485
+ if logprob_threshold is not None and result.avg_logprob > logprob_threshold:
486
+ # don't skip if the logprob is high enough, despite the no_speech_prob
487
+ should_skip = False
488
+
489
+ if should_skip:
490
+ fast_forward()
491
+ continue
492
+
493
+ current_segments = []
494
+
495
+ timestamp_tokens: torch.Tensor = tokens.ge(tokenizer.timestamp_begin)
496
+ single_timestamp_ending = timestamp_tokens[-2:].tolist() == [False, True]
497
+
498
+ consecutive = torch.where(timestamp_tokens[:-1] & timestamp_tokens[1:])[0]
499
+ consecutive.add_(1)
500
+ if len(consecutive) > 0:
501
+ # if the output contains two consecutive timestamp tokens
502
+ slices = consecutive.tolist()
503
+ if single_timestamp_ending:
504
+ slices.append(len(tokens))
505
+
506
+ last_slice = 0
507
+ for current_slice in slices:
508
+ sliced_tokens = tokens[last_slice:current_slice]
509
+ start_timestamp_pos = (
510
+ sliced_tokens[0].item() - tokenizer.timestamp_begin
511
+ )
512
+ end_timestamp_pos = (
513
+ sliced_tokens[-1].item() - tokenizer.timestamp_begin
514
+ )
515
+ current_segments.append(
516
+ new_segment(
517
+ start=round(time_offset + start_timestamp_pos * time_precision, 3),
518
+ end=round(time_offset + min(end_timestamp_pos * time_precision, segment_duration), 3),
519
+ tokens=sliced_tokens,
520
+ result=result,
521
+ )
522
+ )
523
+ last_slice = current_slice
524
+
525
+ else:
526
+ duration = segment_duration
527
+ timestamps = tokens[timestamp_tokens.nonzero().flatten()]
528
+ if (
529
+ len(timestamps) > 0
530
+ and timestamps[-1].item() != tokenizer.timestamp_begin
531
+ ):
532
+ # no consecutive timestamps but it has a timestamp; use the last one.
533
+ end_timestamp_pos = (
534
+ timestamps[-1].item() - tokenizer.timestamp_begin
535
+ )
536
+ duration = min(end_timestamp_pos * time_precision, segment_duration)
537
+ else:
538
+ end_timestamp_pos = 0
539
+
540
+ current_segments.append(
541
+ new_segment(
542
+ start=round(time_offset, 3),
543
+ end=round(time_offset + duration, 3),
544
+ tokens=tokens,
545
+ result=result,
546
+ )
547
+ )
548
+
549
+ # if a segment is instantaneous or does not contain text, remove it
550
+ for i in reversed(range(len(current_segments))):
551
+ seg = current_segments[i]
552
+ if seg["start"] == seg["end"] or seg["text"].strip() in punctuations:
553
+ del current_segments[i]
554
+
555
+ num_samples = (
556
+ min(round(end_timestamp_pos * N_SAMPLES_PER_TOKEN), segment_samples)
557
+ if end_timestamp_pos > 0 else
558
+ segment_samples
559
+ )
560
+
561
+ if word_timestamps:
562
+ add_word_timestamps_stable(
563
+ segments=current_segments,
564
+ model=model,
565
+ tokenizer=tokenizer,
566
+ mel=mel_segment,
567
+ num_samples=num_samples,
568
+ prepend_punctuations=prepend_punctuations,
569
+ append_punctuations=append_punctuations,
570
+ audio_features=audio_features,
571
+ ts_num=ts_num,
572
+ ts_noise=ts_noise,
573
+ split_callback=split_callback,
574
+ gap_padding=gap_padding
575
+ )
576
+
577
+ # if the fraction of instantaneous words in a segment exceeds ``max_instant_words``, remove it
578
+ for i in reversed(range(len(current_segments))):
579
+ zero_duration_percent = (
580
+ np.array(
581
+ [w['start'] == w['end'] for w in current_segments[i]['words']]
582
+ )
583
+ .astype(np.float16)
584
+ .mean()
585
+ )
586
+ if zero_duration_percent > max_instant_words:
587
+ del current_segments[i]
588
+
589
+ if avg_prob_threshold and current_segments:
590
+ if (
591
+ single_timestamp_ending and
592
+ (np.mean([w['probability'] for s in current_segments for w in s['words']]) <
593
+ avg_prob_threshold)
594
+ ):
595
+ num_samples = segment_samples
596
+ current_segments = []
597
+ else:
598
+ num_samples = round((current_segments[-1]['words'][-1]['end']-time_offset) * SAMPLE_RATE)
599
+
600
+ if len(current_segments) == 0:
601
+ fast_forward()
602
+ continue
603
+
604
+ if segment_silence_timing is not None:
605
+ silent_timings[0].extend(segment_silence_timing[0])
606
+ silent_timings[1].extend(segment_silence_timing[1])
607
+ for seg_i, segment in enumerate(current_segments):
608
+ segment = Segment(**segment).suppress_silence(
609
+ *segment_silence_timing,
610
+ min_word_dur=min_word_dur,
611
+ word_level=suppress_word_ts,
612
+ nonspeech_error=nonspeech_error,
613
+ use_word_position=use_word_position,
614
+ )
615
+ if verbose:
616
+ safe_print(segment.to_display_str())
617
+ current_segments[seg_i] = segment.to_dict()
618
+
619
+ all_segments.extend(
620
+ [
621
+ {"id": i, **segment}
622
+ for i, segment in enumerate(current_segments, start=len(all_segments))
623
+ ]
624
+ )
625
+ all_tokens.extend(
626
+ [token for segment in current_segments for token in segment["tokens"]]
627
+ )
628
+ if not single_timestamp_ending or avg_prob_threshold:
629
+ segment_samples = num_samples
630
+
631
+ if not condition_on_previous_text or result.temperature > 0.5:
632
+ # do not feed the prompt tokens if a high temperature was used
633
+ prompt_reset_since = len(all_tokens)
634
+
635
+ fast_forward()
636
+
637
+ # final update
638
+ update_pbar()
639
+
640
+ if model.device != torch.device('cpu'):
641
+ torch.cuda.empty_cache()
642
+
643
+ text = '' if tokenizer is None else tokenizer.decode(all_tokens[len(initial_prompt_tokens):])
644
+ final_result = WhisperResult(dict(text=text,
645
+ segments=all_segments,
646
+ language=language,
647
+ time_scale=time_scale))
648
+ if word_timestamps and regroup:
649
+ final_result.regroup(regroup)
650
+
651
+ if time_scale is not None:
652
+ final_result.rescale_time(1 / time_scale)
653
+
654
+ if len(final_result.text) == 0:
655
+ warnings.warn(f'Failed to {task} audio. Result contains no text.')
656
+
657
+ final_result.update_nonspeech_sections(*silent_timings)
658
+
659
+ return final_result
660
+
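+ # Hedged usage sketch of ``transcribe_stable`` through a patched model (see
+ # ``modify_model`` below); the audio path is hypothetical:
+ #
+ # >>> import stable_whisper
+ # >>> model = stable_whisper.load_model('base')  # doctest: +SKIP
+ # >>> result = model.transcribe('audio.mp3', vad=True)  # doctest: +SKIP
+ # >>> result = model.transcribe('audio.mp3', regroup=False)  # disable regrouping  # doctest: +SKIP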
661
+
662
+ def transcribe_minimal(
663
+ model: "Whisper",
664
+ audio: Union[str, np.ndarray, torch.Tensor, bytes],
665
+ *,
666
+ verbose: Optional[bool] = False,
667
+ word_timestamps: bool = True,
668
+ regroup: Union[bool, str] = True,
669
+ suppress_silence: bool = True,
670
+ suppress_word_ts: bool = True,
671
+ use_word_position: bool = True,
672
+ q_levels: int = 20,
673
+ k_size: int = 5,
674
+ demucs: bool = False,
675
+ demucs_output: str = None,
676
+ demucs_options: dict = None,
677
+ vad: bool = False,
678
+ vad_threshold: float = 0.35,
679
+ vad_onnx: bool = False,
680
+ min_word_dur: float = 0.1,
681
+ nonspeech_error: float = 0.3,
682
+ only_voice_freq: bool = False,
683
+ only_ffmpeg: bool = False,
684
+ **options) \
685
+ -> WhisperResult:
686
+ """
687
+ Transcribe audio using Whisper.
688
+
689
+ This uses the original whisper transcribe function, :func:`whisper.transcribe.transcribe`, while still allowing
690
+ additional preprocessing and postprocessing. The preprocessing performed on the audio includes: isolating voice /
691
+ removing noise with Demucs and low/high-pass filter. The postprocessing performed on the transcription
692
+ result includes: adjusting timestamps with VAD and custom regrouping of segments based on punctuation and speech gaps.
693
+
694
+ Parameters
695
+ ----------
696
+ model : whisper.model.Whisper
697
+ An instance of Whisper ASR model.
698
+ audio : str or numpy.ndarray or torch.Tensor or bytes
699
+ Path/URL to the audio file, the audio waveform, or bytes of audio file.
700
+ If audio is ``numpy.ndarray`` or ``torch.Tensor``, the audio must already be sampled at 16kHz.
701
+ verbose : bool or None, default False
702
+ Whether to display the text being decoded to the console.
703
+ Displays all the details if ``True``. Displays a progress bar if ``False``. Displays nothing if ``None``.
704
+ word_timestamps : bool, default True
705
+ Extract word-level timestamps using the cross-attention pattern and dynamic time warping,
706
+ and include the timestamps for each word in each segment.
707
+ Disabling this will prevent segments from splitting/merging properly.
708
+ regroup : bool or str, default True, meaning the default regroup algorithm
709
+ String for customizing the regrouping algorithm. False disables regrouping.
710
+ Ignored if ``word_timestamps = False``.
711
+ suppress_silence : bool, default True
712
+ Whether to enable timestamp adjustments based on the detected silence.
713
+ suppress_word_ts : bool, default True
714
+ Whether to adjust word timestamps based on the detected silence. Only enabled if ``suppress_silence = True``.
715
+ use_word_position : bool, default True
716
+ Whether to use position of the word in its segment to determine whether to keep end or start timestamps if
717
+ adjustments are required. If it is the first word, keep the end. Else if it is the last word, keep the start.
718
+ q_levels : int, default 20
719
+ Quantization levels for generating timestamp suppression mask; ignored if ``vad = true``.
720
+ Acts as a threshold to marking sound as silent.
721
+ Fewer levels will increase the threshold of volume at which to mark a sound as silent.
722
+ k_size : int, default 5
723
+ Kernel size for avg-pooling waveform to generate timestamp suppression mask; ignored if ``vad = true``.
724
+ Recommend 5 or 3; higher sizes will reduce detection of silence.
725
+ demucs : bool or torch.nn.Module, default False
726
+ Whether to preprocess ``audio`` with Demucs to isolate vocals / remove noise. Set ``demucs`` to an instance of
727
+ a Demucs model to avoid reloading the model for each run.
728
+ Demucs must be installed to use. Official repo: https://github.com/facebookresearch/demucs.
729
+ demucs_output : str, optional
730
+ Path to save the vocals isolated by Demucs as WAV file. Ignored if ``demucs = False``.
731
+ Demucs must be installed to use. Official repo: https://github.com/facebookresearch/demucs.
732
+ demucs_options : dict, optional
733
+ Options to use for :func:`stable_whisper.audio.demucs_audio`.
734
+ vad : bool, default False
735
+ Whether to use Silero VAD to generate timestamp suppression mask.
736
+ Silero VAD requires PyTorch 1.12.0+. Official repo: https://github.com/snakers4/silero-vad.
737
+ vad_threshold : float, default 0.35
738
+ Threshold for detecting speech with Silero VAD. Low threshold reduces false positives for silence detection.
739
+ vad_onnx : bool, default False
740
+ Whether to use ONNX for Silero VAD.
741
+ min_word_dur : float, default 0.1
742
+ Shortest duration each word is allowed to reach for silence suppression.
743
+ nonspeech_error : float, default 0.3
744
+ Relative error of non-speech sections that appear in between a word for silence suppression.
745
+ only_voice_freq : bool, default False
746
+ Whether to only use sound between 200 - 5000 Hz, where the majority of human speech is.
747
+ only_ffmpeg : bool, default False
748
+ Whether to use only FFmpeg (instead of yt-dlp) for URLs.
749
+ options
750
+ Additional options used for :func:`whisper.transcribe.transcribe` and
751
+ :func:`stable_whisper.non_whisper.transcribe_any`.
+
752
+ Returns
753
+ -------
754
+ stable_whisper.result.WhisperResult
755
+ All timestamps, words, probabilities, and other data from the transcription of ``audio``.
756
+
757
+ Examples
758
+ --------
759
+ >>> import stable_whisper
760
+ >>> model = stable_whisper.load_model('base')
761
+ >>> result = model.transcribe_minimal('audio.mp3', vad=True)
762
+ >>> result.to_srt_vtt('audio.srt')
763
+ Saved: audio.srt
764
+ """
765
+ inference_kwargs = dict(
766
+ model=model,
767
+ audio=audio,
768
+ word_timestamps=word_timestamps,
769
+ verbose=verbose
770
+ )
771
+ extra_options = isolate_useful_options(options, transcribe_any, True)
772
+ if demucs or only_voice_freq:
773
+ if 'audio_type' not in extra_options:
774
+ extra_options['audio_type'] = 'torch'
775
+ if 'model_sr' not in extra_options:
776
+ extra_options['model_sr'] = SAMPLE_RATE
777
+ inference_kwargs.update(options)
778
+ return transcribe_any(
779
+ inference_func=whisper.transcribe,
780
+ audio=audio,
781
+ inference_kwargs=inference_kwargs,
782
+ verbose=verbose,
783
+ regroup=regroup,
784
+ suppress_silence=suppress_silence,
785
+ suppress_word_ts=suppress_word_ts,
786
+ q_levels=q_levels,
787
+ k_size=k_size,
788
+ demucs=demucs,
789
+ demucs_output=demucs_output,
790
+ demucs_options=demucs_options,
791
+ vad=vad,
792
+ vad_threshold=vad_threshold,
793
+ vad_onnx=vad_onnx,
794
+ min_word_dur=min_word_dur,
795
+ nonspeech_error=nonspeech_error,
796
+ use_word_position=use_word_position,
797
+ only_voice_freq=only_voice_freq,
798
+ only_ffmpeg=only_ffmpeg,
799
+ force_order=True,
800
+ **extra_options
801
+ )
802
+
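+ # Hedged usage sketch: options that ``transcribe_any`` does not accept are
+ # forwarded to :func:`whisper.transcribe.transcribe` unchanged; ``beam_size``
+ # below is one of Whisper's own decoding options and the path is hypothetical:
+ #
+ # >>> result = model.transcribe_minimal('audio.mp3', beam_size=5, vad=True)  # doctest: +SKIP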
803
+
804
+ def load_faster_whisper(model_size_or_path: str, **model_init_options):
805
+ """
806
+ Load an instance of :class:`faster_whisper.WhisperModel`.
807
+
808
+ Parameters
809
+ ----------
810
+ model_size_or_path : {'tiny', 'tiny.en', 'base', 'base.en', 'small', 'small.en', 'medium', 'medium.en', 'large-v1',
811
+ 'large-v2', 'large-v3', or 'large'}
812
+ Size of the model.
813
+
814
+ model_init_options
815
+ Additional options to use for initialization of :class:`faster_whisper.WhisperModel`.
816
+
817
+ Returns
818
+ -------
819
+ faster_whisper.WhisperModel
820
+ A modified instance with :func:`stable_whisper.whisper_word_level.load_faster_whisper.faster_transcribe`
821
+ assigned to :meth:`faster_whisper.WhisperModel.transcribe_stable`.
822
+ """
823
+ from faster_whisper import WhisperModel
824
+ faster_model = WhisperModel(model_size_or_path, **model_init_options)
825
+
826
+ def _inner_transcribe(model, audio, verbose, **faster_transcribe_options):
827
+ if isinstance(audio, bytes):
828
+ import io
829
+ audio = io.BytesIO(audio)
830
+ progress_callback = faster_transcribe_options.pop('progress_callback', None)
831
+ segments, info = model.transcribe(audio, **faster_transcribe_options)
832
+ language = LANGUAGES.get(info.language, info.language)
833
+ if verbose is not None:
834
+ print(f'Detected Language: {language}')
835
+ print(f'Transcribing with faster-whisper ({model_size_or_path})...\r', end='')
836
+
837
+ final_segments = []
838
+ task = faster_transcribe_options.get('task', 'transcribe').title()
839
+ total_duration = round(info.duration, 2)
840
+
841
+ with tqdm(total=total_duration, unit='sec', disable=verbose is not False, desc=task) as tqdm_pbar:
842
+
843
+ def update_pbar(seek):
844
+ tqdm_pbar.update(seek - tqdm_pbar.n)
845
+ if progress_callback is not None:
846
+ progress_callback(seek, total_duration)
847
+
848
+ for segment in segments:
849
+ segment = segment._asdict()
850
+ if (words := segment.get('words')) is not None:
851
+ segment['words'] = [w._asdict() for w in words]
852
+ else:
853
+ del segment['words']
854
+ if verbose:
855
+ safe_print(Segment(**segment).to_display_str())
856
+ final_segments.append(segment)
857
+ update_pbar(segment["end"])
858
+ update_pbar(tqdm_pbar.total)
859
+
860
+ if verbose:
861
+ print(f'Completed transcription with faster-whisper ({model_size_or_path}).')
862
+
863
+ return dict(language=language, segments=final_segments)
864
+
865
+ def faster_transcribe(
866
+ model: WhisperModel,
867
+ audio: Union[str, bytes, np.ndarray],
868
+ *,
869
+ word_timestamps: bool = True,
870
+ verbose: Optional[bool] = False,
871
+ regroup: Union[bool, str] = True,
872
+ suppress_silence: bool = True,
873
+ suppress_word_ts: bool = True,
874
+ use_word_position: bool = True,
875
+ q_levels: int = 20,
876
+ k_size: int = 5,
877
+ demucs: bool = False,
878
+ demucs_output: str = None,
879
+ demucs_options: dict = None,
880
+ vad: bool = False,
881
+ vad_threshold: float = 0.35,
882
+ vad_onnx: bool = False,
883
+ min_word_dur: float = 0.1,
884
+ nonspeech_error: float = 0.3,
885
+ only_voice_freq: bool = False,
886
+ only_ffmpeg: bool = False,
887
+ check_sorted: bool = True,
888
+ progress_callback: Callable = None,
889
+ **options
890
+ ) -> WhisperResult:
891
+ """
892
+ Transcribe audio using faster-whisper (https://github.com/guillaumekln/faster-whisper).
893
+
894
+ This uses the transcribe method from faster-whisper, :meth:`faster_whisper.WhisperModel.transcribe`, while
895
+ still allowing additional preprocessing and postprocessing. The preprocessing performed on the audio includes:
896
+ isolating voice / removing noise with Demucs and low/high-pass filter. The postprocessing performed on the
897
+ transcription result includes: adjusting timestamps with VAD and custom regrouping of segments based on punctuation
898
+ and speech gaps.
899
+
900
+ Parameters
901
+ ----------
902
+ model : faster_whisper.WhisperModel
903
+ The faster-whisper ASR model instance.
904
+ audio : str or numpy.ndarray or torch.Tensor or bytes
905
+ Path/URL to the audio file, the audio waveform, or bytes of audio file.
906
+ If audio is :class:`numpy.ndarray` or :class:`torch.Tensor`, the audio must already be sampled at 16kHz.
907
+ verbose : bool or None, default False
908
+ Whether to display the text being decoded to the console.
909
+ Displays all the details if ``True``. Displays a progress bar if ``False``. Displays nothing if ``None``.
910
+ word_timestamps : bool, default True
911
+ Extract word-level timestamps using the cross-attention pattern and dynamic time warping,
912
+ and include the timestamps for each word in each segment.
913
+ Disabling this will prevent segments from splitting/merging properly.
914
+ regroup : bool or str, default True, meaning the default regroup algorithm
915
+ String for customizing the regrouping algorithm. False disables regrouping.
916
+ Ignored if ``word_timestamps = False``.
917
+ suppress_silence : bool, default True
918
+ Whether to enable timestamp adjustments based on the detected silence.
919
+ suppress_word_ts : bool, default True
920
+ Whether to adjust word timestamps based on the detected silence. Only enabled if ``suppress_silence = True``.
921
+ use_word_position : bool, default True
922
+ Whether to use position of the word in its segment to determine whether to keep end or start timestamps if
923
+ adjustments are required. If it is the first word, keep the end. Else if it is the last word, keep the start.
924
+ q_levels : int, default 20
925
+ Quantization levels for generating timestamp suppression mask; ignored if ``vad = true``.
926
+ Acts as a threshold to marking sound as silent.
927
+ Fewer levels will increase the threshold of volume at which to mark a sound as silent.
928
+ k_size : int, default 5
929
+ Kernel size for avg-pooling waveform to generate timestamp suppression mask; ignored if ``vad = true``.
930
+ Recommend 5 or 3; higher sizes will reduce detection of silence.
931
+ demucs : bool or torch.nn.Module, default False
932
+ Whether to preprocess ``audio`` with Demucs to isolate vocals / remove noise. Set ``demucs`` to an instance
933
+ of a Demucs model to avoid reloading the model for each run.
934
+ Demucs must be installed to use. Official repo: https://github.com/facebookresearch/demucs.
935
+ demucs_output : str, optional
936
+ Path to save the vocals isolated by Demucs as WAV file. Ignored if ``demucs = False``.
937
+ Demucs must be installed to use. Official repo: https://github.com/facebookresearch/demucs.
938
+ demucs_options : dict, optional
939
+ Options to use for :func:`stable_whisper.audio.demucs_audio`.
940
+ vad : bool, default False
941
+ Whether to use Silero VAD to generate timestamp suppression mask.
942
+ Silero VAD requires PyTorch 1.12.0+. Official repo: https://github.com/snakers4/silero-vad.
943
+ vad_threshold : float, default 0.35
944
+ Threshold for detecting speech with Silero VAD. Low threshold reduces false positives for silence detection.
945
+ vad_onnx : bool, default False
946
+ Whether to use ONNX for Silero VAD.
947
+ min_word_dur : float, default 0.1
948
+ Shortest duration each word is allowed to reach for silence suppression.
949
+ nonspeech_error : float, default 0.3
950
+ Relative error of non-speech sections that appear in between a word for silence suppression.
951
+ only_voice_freq : bool, default False
952
+ Whether to only use sound between 200 - 5000 Hz, where the majority of human speech is.
953
+ only_ffmpeg : bool, default False
954
+ Whether to use only FFmpeg (instead of yt-dlp) for URLs.
955
+ check_sorted : bool, default True
956
+ Whether to raise an error when timestamps returned by faster-whisper are not in ascending order.
957
+ progress_callback : Callable, optional
958
+ A function that will be called when transcription progress is updated.
959
+ The callback needs two parameters.
960
+ The first parameter is a float for seconds of the audio that has been transcribed.
961
+ The second parameter is a float for total duration of audio in seconds.
962
+ options
963
+ Additional options used for :meth:`faster_whisper.WhisperModel.transcribe` and
964
+ :func:`stable_whisper.non_whisper.transcribe_any`.
965
+
966
+ Returns
967
+ -------
968
+ stable_whisper.result.WhisperResult
969
+ All timestamps, words, probabilities, and other data from the transcription of ``audio``.
970
+
971
+ Examples
972
+ --------
973
+ >>> import stable_whisper
974
+ >>> model = stable_whisper.load_faster_whisper('base')
975
+ >>> result = model.transcribe_stable('audio.mp3', vad=True)
976
+ >>> result.to_srt_vtt('audio.srt')
977
+ Saved: audio.srt
978
+ """
979
+ extra_options = isolate_useful_options(options, transcribe_any, pop=True)
980
+ if demucs or only_voice_freq:
981
+ if 'audio_type' not in extra_options:
982
+ extra_options['audio_type'] = 'numpy'
983
+ if 'model_sr' not in extra_options:
984
+ extra_options['model_sr'] = SAMPLE_RATE
985
+ faster_whisper_options = options
986
+ faster_whisper_options['model'] = model
987
+ faster_whisper_options['audio'] = audio
988
+ faster_whisper_options['word_timestamps'] = word_timestamps
989
+ faster_whisper_options['verbose'] = verbose
990
+ faster_whisper_options['progress_callback'] = progress_callback
991
+ if not demucs_options:
992
+ demucs_options = {}
993
+ if demucs_output:
994
+ if 'save_path' not in demucs_options:
995
+ demucs_options['save_path'] = demucs_output
996
+ warnings.warn('``demucs_output`` is deprecated. Use ``demucs_options`` with ``save_path`` instead. '
997
+ 'E.g. demucs_options=dict(save_path="demucs_output.mp3")',
998
+ DeprecationWarning, stacklevel=2)
999
+
1000
+ return transcribe_any(
1001
+ inference_func=_inner_transcribe,
1002
+ audio=audio,
1003
+ inference_kwargs=faster_whisper_options,
1004
+ verbose=verbose,
1005
+ regroup=regroup,
1006
+ suppress_silence=suppress_silence,
1007
+ suppress_word_ts=suppress_word_ts,
1008
+ q_levels=q_levels,
1009
+ k_size=k_size,
1010
+ demucs=demucs,
1011
+ demucs_options=demucs_options,
1012
+ vad=vad,
1013
+ vad_threshold=vad_threshold,
1014
+ vad_onnx=vad_onnx,
1015
+ min_word_dur=min_word_dur,
1016
+ nonspeech_error=nonspeech_error,
1017
+ use_word_position=use_word_position,
1018
+ only_voice_freq=only_voice_freq,
1019
+ only_ffmpeg=only_ffmpeg,
1020
+ force_order=True,
1021
+ check_sorted=check_sorted,
1022
+ **extra_options
1023
+ )
1024
+
1025
+ faster_model.transcribe_stable = MethodType(faster_transcribe, faster_model)
1026
+ from .alignment import align
1027
+ faster_model.align = MethodType(align, faster_model)
1028
+
1029
+ return faster_model
1030
+
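+ # Hedged usage sketch mirroring the docstring above; ``compute_type`` is a
+ # faster-whisper initialization option and the paths are hypothetical:
+ #
+ # >>> model = load_faster_whisper('base', compute_type='int8')  # doctest: +SKIP
+ # >>> result = model.transcribe_stable('audio.mp3', vad=True)  # doctest: +SKIP
+ #
+ # ``model.align`` is also attached (see :func:`stable_whisper.alignment.align`).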
1031
+
1032
+ def modify_model(model: "Whisper"):
1033
+ """
1034
+ Modify an instance of :class:`whisper.model.Whisper`.
1035
+
1036
+ The following are performed:
1037
+ -replace :meth:`whisper.model.Whisper.transcribe` with :func:`stable_whisper.whisper_word_level.transcribe_stable`
1038
+ -assign :meth:`whisper.model.Whisper.transcribe_minimal` to :func:`stable_whisper.whisper_word_level.transcribe_minimal`
1039
+ -assign :meth:`whisper.model.Whisper.transcribe_original` to :meth:`whisper.model.Whisper.transcribe`
1040
+ -assign :meth:`whisper.model.Whisper.align` to :func:`stable_whisper.alignment.align`
1041
+ -assign :meth:`whisper.model.Whisper.locate` to :func:`stable_whisper.alignment.locate`
1042
+ """
1043
+ model.transcribe = MethodType(transcribe_stable, model)
1044
+ model.transcribe_minimal = MethodType(transcribe_minimal, model)
1045
+ model.transcribe_original = MethodType(whisper.transcribe, model)
1046
+ from .alignment import align, refine, locate
1047
+ model.align = MethodType(align, model)
1048
+ model.refine = MethodType(refine, model)
1049
+ model.locate = MethodType(locate, model)
1050
+
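+ # Hedged usage sketch: ``modify_model`` patches an existing Whisper instance in
+ # place (loading shown with the vanilla whisper API; path is hypothetical):
+ #
+ # >>> import whisper
+ # >>> model = whisper.load_model('base')  # doctest: +SKIP
+ # >>> modify_model(model)  # doctest: +SKIP
+ # >>> result = model.transcribe('audio.mp3')  # now runs transcribe_stable  # doctest: +SKIP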
1051
+
1052
+ # modified version of whisper.load_model
1053
+ def load_model(name: str, device: Optional[Union[str, torch.device]] = None,
1054
+ download_root: str = None, in_memory: bool = False,
1055
+ cpu_preload: bool = True, dq: bool = False) -> "Whisper":
1056
+ """
1057
+ Load an instance of :class:`whisper.model.Whisper`.
1058
+
1059
+ Parameters
1060
+ ----------
1061
+ name : {'tiny', 'tiny.en', 'base', 'base.en', 'small', 'small.en', 'medium', 'medium.en', 'large-v1',
1062
+ 'large-v2', 'large-v3', or 'large'}
1063
+ One of the official model names listed by :func:`whisper.available_models`, or
1064
+ path to a model checkpoint containing the model dimensions and the model state_dict.
1065
+ device : str or torch.device, optional
1066
+ PyTorch device to put the model into.
1067
+ download_root : str, optional
1068
+ Path to download the model files; by default, it uses "~/.cache/whisper".
1069
+ in_memory : bool, default False
1070
+ Whether to preload the model weights into host memory.
1071
+ cpu_preload : bool, default True
1072
+ Load the model into CPU memory first, then move it to the specified device
1073
+ to reduce GPU memory usage when loading the model.
1074
+ dq : bool, default False
1075
+ Whether to apply Dynamic Quantization to the model to reduce memory usage and increase inference speed
1076
+ but at the cost of a slight decrease in accuracy. Only for CPU.
1077
+
1078
+ Returns
1079
+ -------
1080
+ model : "Whisper"
1081
+ The Whisper ASR model instance.
1082
+
1083
+ Notes
1084
+ -----
1085
+ The overhead from ``dq = True`` might make inference slower for models smaller than 'large'.
1086
+ """
1087
+ if device is None or dq:
1088
+ device = "cuda" if torch.cuda.is_available() and not dq else "cpu"
1089
+ if cpu_preload:
1090
+ model = whisper.load_model(name, device='cpu', download_root=download_root, in_memory=in_memory)
1091
+ cuda_index = None
1092
+ if isinstance(device, str) and device.startswith('cuda'):
1093
+ try:
1094
+ cuda_index = [] if device == 'cuda' else [int(device.split(':')[-1])]
1095
+ except ValueError:
1096
+ pass
1097
+ model = model.to(device=device) if cuda_index is None else model.cuda(*cuda_index)
1098
+ else:
1099
+ model = whisper.load_model(name, device=device, download_root=download_root, in_memory=in_memory)
1100
+ modify_model(model)
1101
+ if dq:
1102
+ from .quantization import ptdq_linear
1103
+ ptdq_linear(model)
1104
+ return model
1105
+
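+ # Example (illustrative sketch, not part of the original commit): ``dq=True``
+ # forces CPU and applies Dynamic Quantization, which mainly pays off on the
+ # larger models:
+ #
+ #     model = load_model('large-v3', dq=True)
+ #     result = model.transcribe('audio.mp3')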
1106
+
1107
+ # modified version of whisper.transcribe.cli
1108
+ def cli():
1109
+ import argparse
1110
+ import os
1111
+ from os.path import splitext, split, isfile, join
1112
+ from whisper import available_models
1113
+ from whisper.utils import optional_int, optional_float
1114
+ from .utils import str_to_valid_type, get_func_parameters
1115
+
1116
+ str2val = {"true": True, "false": False, "1": True, "0": False}
1117
+
1118
+ def str2bool(string: str) -> bool:
1119
+ string = string.lower()
1120
+ if string in str2val:
1121
+ return str2val[string]
1122
+ raise ValueError(f"Expected one of {set(str2val.keys())}, got {string}")
1123
+
1124
+ def valid_model_name(name):
1125
+ if name in available_models() or os.path.exists(name):
1126
+ return name
1127
+ raise ValueError(
1128
+ f"model should be one of {available_models()} or path to a model checkpoint"
1129
+ )
1130
+
1131
+ def update_options_with_args(arg_key: str, options: Optional[dict] = None, pop: bool = False):
1132
+ extra_options = args.pop(arg_key) if pop else args.get(arg_key)
1133
+ if not extra_options:
1134
+ return
1135
+ extra_options = [kv.split('=', maxsplit=1) for kv in extra_options]
1136
+ missing_val = [kv[0] for kv in extra_options if len(kv) == 1]
1137
+ if missing_val:
1138
+ raise ValueError(f'Expected values for the following custom options: {missing_val}')
1139
+ extra_options = dict((k, str_to_valid_type(v)) for k, v in extra_options)
1140
+ if options is None:
1141
+ return extra_options
1142
+ options.update(extra_options)
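+ # Example (illustrative sketch): a CLI value like --transcribe_option "vad_threshold=0.5"
+ # arrives here as ['vad_threshold=0.5'] and becomes {'vad_threshold': 0.5} via
+ # str_to_valid_type; when ``options`` is given, the parsed pairs are merged into it in place.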
1143
+
1144
+ OUTPUT_FORMATS_METHODS = {
1145
+ "srt": "to_srt_vtt",
1146
+ "ass": "to_ass",
1147
+ "json": "save_as_json",
1148
+ "vtt": "to_srt_vtt",
1149
+ "tsv": "to_tsv",
1150
+ "txt": "to_txt",
1151
+ }
1152
+
1153
+ OUTPUT_FORMATS = set(OUTPUT_FORMATS_METHODS.keys())
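+ # Example (illustrative sketch): the mapping above picks a save method by file
+ # extension, e.g. for 'audio.srt':
+ #
+ #     save = getattr(result, OUTPUT_FORMATS_METHODS['srt'])  # -> result.to_srt_vtt
+ #     save('audio.srt')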
1154
+
1155
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
1156
+ parser.add_argument("inputs", nargs="+", type=str,
1157
+ help="audio/video filepath/URL(s) to transcribe "
1158
+ "or json file(s) to process into [output_format]")
1159
+ parser.add_argument("--output", "-o", action="extend", nargs="+", type=str,
1160
+ help="output filepaths(s);"
1161
+ "if not specified, auto-named output file(s) will be saved to "
1162
+ "[output_dir] or current dir if not specified.")
1163
+ parser.add_argument("--model", '-m', default="base", type=valid_model_name,
1164
+ help="name of the Whisper model to use")
1165
+ parser.add_argument("--model_dir", type=str, default=None,
1166
+ help="the path to save model files; uses ~/.cache/whisper by default")
1167
+ parser.add_argument("--device", default="cuda" if torch.cuda.is_available() else "cpu",
1168
+ help="device to use for PyTorch inference")
1169
+ parser.add_argument("--cpu_preload", type=str2bool, default=True,
1170
+ help="load model into CPU memory first then move model to specified device; "
1171
+ "this reduces GPU memory usage when loading model.")
1172
+ parser.add_argument("--output_dir", "-d", type=str,
1173
+ help="directory to save the outputs;"
1174
+ "if a path in [output] does not have parent, that output will be save to this directory")
1175
+ parser.add_argument("--output_format", "-f", type=str,
1176
+ help="format of the output file(s); "
1177
+ f"Supported Formats: {OUTPUT_FORMATS}; "
1178
+ "use ',' to separate multiple formats")
1179
+ parser.add_argument("--verbose", '-v', type=int, default=1, choices=(0, 1, 2),
1180
+ help="whether to display the text being decoded to the console; "
1181
+ "if 2, display all the details; "
1182
+ "if 1, display progressbar; "
1183
+ "if 0, display nothing")
1184
+
1185
+ parser.add_argument("--dynamic_quantization", "-dq", action='store_true',
1186
+ help="whether to apply Dynamic Quantization to model "
1187
+ "to reduced memory usage (~half less) and increase inference speed "
1188
+ "at cost of slight decrease in accuracy; Only for CPU; "
1189
+ "NOTE: overhead might make inference slower for models smaller than 'large'")
1190
+
1191
+ parser.add_argument("--task", type=str, default="transcribe",
1192
+ choices=["transcribe", "translate"],
1193
+ help="whether to perform X->X speech recognition ('transcribe') "
1194
+ "or X->English translation ('translate')")
1195
+ parser.add_argument("--language", '-l', type=str, default=None,
1196
+ choices=sorted(LANGUAGES.keys()) + sorted([k.title() for k in TO_LANGUAGE_CODE.keys()]),
1197
+ help="language spoken in the audio, specify None to perform language detection")
1198
+
1199
+ parser.add_argument("--prepend_punctuations", '-pp', type=str, default="\"'“¿([{-",
1200
+ help="Punctuations to prepend to next word")
1201
+ parser.add_argument("--append_punctuations", '-ap', type=str, default="\"'.。,,!!??::”)]}、",
1202
+ help="Punctuations to append to previous word")
1203
+
1204
+ parser.add_argument("--gap_padding", type=str, default=" ...",
1205
+ help="padding prepend to each segments for word timing alignment;"
1206
+ "used to reduce the probability of model predicting timestamps "
1207
+ "earlier than the first utterance")
1208
+
1209
+ parser.add_argument("--word_timestamps", type=str2bool, default=True,
1210
+ help="extract word-level timestamps using the cross-attention pattern and dynamic time warping,"
1211
+ "and include the timestamps for each word in each segment;"
1212
+ "disabling this will prevent segments from splitting/merging properly.")
1213
+
1214
+ parser.add_argument("--regroup", type=str, default="True",
1215
+ help="whether to regroup all words into segments with more natural boundaries;"
1216
+ "specify string for customizing the regrouping algorithm"
1217
+ "ignored if [word_timestamps]=False.")
1218
+
1219
+ parser.add_argument('--ts_num', type=int, default=0,
1220
+ help="number of extra inferences to perform to find the mean timestamps")
1221
+ parser.add_argument('--ts_noise', type=float, default=0.1,
1222
+ help="percentage of noise to add to audio_features to perform inferences for [ts_num]")
1223
+
1224
+ parser.add_argument('--suppress_silence', type=str2bool, default=True,
1225
+ help="whether to suppress timestamp where audio is silent at segment-level"
1226
+ "and word-level if [suppress_word_ts]=True")
1227
+ parser.add_argument('--suppress_word_ts', type=str2bool, default=True,
1228
+ help="whether to suppress timestamps where audio is silent at word-level; "
1229
+ "ignored if [suppress_silence]=False")
1230
+
1231
+ parser.add_argument('--suppress_ts_tokens', type=str2bool, default=False,
1232
+ help="whether to use silence mask to suppress silent timestamp tokens during inference; "
1233
+ "increases word accuracy in some cases, but tends reduce 'verbatimness' of the transcript"
1234
+ "ignored if [suppress_silence]=False")
1235
+
1236
+ parser.add_argument("--q_levels", type=int, default=20,
1237
+ help="quantization levels for generating timestamp suppression mask; "
1238
+ "acts as a threshold to marking sound as silent;"
1239
+ "fewer levels will increase the threshold of volume at which to mark a sound as silent")
1240
+
1241
+ parser.add_argument("--k_size", type=int, default=5,
1242
+ help="Kernel size for average pooling waveform to generate suppression mask; "
1243
+ "recommend 5 or 3; higher sizes will reduce detection of silence")
1244
+
1245
+ parser.add_argument('--time_scale', type=float,
1246
+ help="factor for scaling audio duration for inference;"
1247
+ "greater than 1.0 'slows down' the audio; "
1248
+ "less than 1.0 'speeds up' the audio; "
1249
+ "1.0 is no scaling")
1250
+
1251
+ parser.add_argument('--vad', type=str2bool, default=False,
1252
+ help='whether to use Silero VAD to generate timestamp suppression mask; '
1253
+ 'Silero VAD requires PyTorch 1.12.0+; '
1254
+ 'Official repo: https://github.com/snakers4/silero-vad')
1255
+ parser.add_argument('--vad_threshold', type=float, default=0.35,
1256
+ help='threshold for detecting speech with Silero VAD (default: 0.35); '
1257
+ 'a low threshold reduces false positives in silence detection')
1258
+ parser.add_argument('--vad_onnx', type=str2bool, default=False,
1259
+ help='whether to use ONNX for Silero VAD')
1260
+
1261
+ parser.add_argument('--min_word_dur', type=float, default=0.1,
1262
+ help="shortest duration each word is allowed to reach for silence suppression")
1263
+ parser.add_argument('--nonspeech_error', type=float, default=0.3,
1264
+ help="relative error of non-speech sections that appear in between a word for "
1265
+ "silence suppression.")
1266
+
1267
+ parser.add_argument('--max_chars', type=int,
1268
+ help="maximum number of character allowed in each segment")
1269
+ parser.add_argument('--max_words', type=int,
1270
+ help="maximum number of words allowed in each segment")
1271
+
1272
+ parser.add_argument('--demucs', type=str2bool, default=False,
1273
+ help='whether to reprocess the audio track with Demucs to isolate vocals/remove noise; '
1274
+ 'Demucs official repo: https://github.com/facebookresearch/demucs')
1275
+ parser.add_argument('--demucs_output', action="extend", nargs="+", type=str,
1276
+ help='path(s) to save the vocals isolated by Demucs as WAV file(s); '
1277
+ 'ignored if [demucs]=False')
1278
+ parser.add_argument('--only_voice_freq', '-ovf', action='store_true',
1279
+ help='whether to only use sound between 200-5000 Hz, where the majority of human speech is.')
1280
+
1281
+ parser.add_argument('--strip', type=str2bool, default=True,
1282
+ help="whether to remove spaces before and after text on each segment for output")
1283
+
1284
+ parser.add_argument('--tag', type=str, action="extend", nargs="+",
1285
+ help="a pair tags used to change the properties a word at its predicted time"
1286
+ "SRT Default: '<font color=\"#00ff00\">', '</font>'"
1287
+ "VTT Default: '<u>', '</u>'"
1288
+ "ASS Default: '{\\1c&HFF00&}', '{\\r}'")
1289
+ parser.add_argument('--segment_level', type=str2bool, default=True,
1290
+ help="whether to use segment-level timestamps in output")
1291
+ parser.add_argument('--word_level', type=str2bool, default=True,
1292
+ help="whether to use word-level timestamps in output")
1293
+
1294
+ parser.add_argument('--reverse_text', type=str2bool, default=False,
1295
+ help="whether to reverse the order of words for each segment of text output")
1296
+
1297
+ # ass output
1298
+ parser.add_argument('--font', type=str, default='Arial',
1299
+ help="word font for ASS output(s)")
1300
+ parser.add_argument('--font_size', type=int, default=48,
1301
+ help="word font size for ASS output(s)")
1302
+ parser.add_argument('--karaoke', type=str2bool, default=False,
1303
+ help="whether to use progressive filling highlights for karaoke effect (only for ASS outputs)")
1304
+
1305
+ parser.add_argument("--temperature", type=float, default=0,
1306
+ help="temperature to use for sampling")
1307
+ parser.add_argument("--best_of", type=optional_int,
1308
+ help="number of candidates when sampling with non-zero temperature")
1309
+ parser.add_argument("--beam_size", type=optional_int,
1310
+ help="number of beams in beam search, only applicable when temperature is zero")
1311
+ parser.add_argument("--patience", type=float, default=None,
1312
+ help="optional patience value to use in beam decoding, "
1313
+ "as in https://arxiv.org/abs/2204.05424, "
1314
+ "the default (1.0) is equivalent to conventional beam search")
1315
+ parser.add_argument("--length_penalty", type=float, default=None,
1316
+ help="optional token length penalty coefficient (alpha) "
1317
+ "as in https://arxiv.org/abs/1609.08144, uses simple length normalization by default")
1318
+
1319
+ parser.add_argument("--suppress_tokens", type=str, default="-1",
1320
+ help="comma-separated list of token ids to suppress during sampling; "
1321
+ "'-1' will suppress most special characters except common punctuations")
1322
+ parser.add_argument("--initial_prompt", type=str, default=None,
1323
+ help="optional text to provide as a prompt for the first window.")
1324
+ parser.add_argument("--condition_on_previous_text", type=str2bool, default=True,
1325
+ help="if True, provide the previous output of the model as a prompt for the next window; "
1326
+ "disabling may make the text inconsistent across windows, "
1327
+ "but the model becomes less prone to getting stuck in a failure loop")
1328
+ parser.add_argument("--fp16", type=str2bool, default=True,
1329
+ help="whether to perform inference in fp16; True by default")
1330
+
1331
+ parser.add_argument("--temperature_increment_on_fallback", type=optional_float, default=0.2,
1332
+ help="temperature to increase when falling back when the decoding fails to meet either of "
1333
+ "the thresholds below")
1334
+ parser.add_argument("--compression_ratio_threshold", type=optional_float, default=2.4,
1335
+ help="if the gzip compression ratio is higher than this value, treat the decoding as failed")
1336
+ parser.add_argument("--logprob_threshold", type=optional_float, default=-1.0,
1337
+ help="if the average log probability is lower than this value, treat the decoding as failed")
1338
+ parser.add_argument("--no_speech_threshold", type=optional_float, default=0.6,
1339
+ help="if the probability of the <|nospeech|> token is higher than this value AND the decoding "
1340
+ "has failed due to `logprob_threshold`, consider the segment as silence")
1341
+ parser.add_argument("--threads", type=optional_int, default=0,
1342
+ help="number of threads used by torch for CPU inference; "
1343
+ "supercedes MKL_NUM_THREADS/OMP_NUM_THREADS")
1344
+
1345
+ parser.add_argument('--mel_first', action='store_true',
1346
+ help='process the entire audio track into a log-Mel spectrogram first instead of in chunks')
1347
+
1348
+ parser.add_argument('--only_ffmpeg', action='store_true',
1349
+ help='whether to use only FFmpeg (and not yt-dlp) for URLs')
1350
+
1351
+ parser.add_argument('--overwrite', '-y', action='store_true',
1352
+ help='overwrite all output files')
1353
+
1354
+ parser.add_argument('--debug', action='store_true',
1355
+ help='print all input/output pair(s) and all arguments used for transcribing/translating')
1356
+
1357
+ parser.add_argument('--transcribe_method', '-tm', type=str, default='transcribe',
1358
+ choices=('transcribe', 'transcribe_minimal'))
1359
+
1360
+ parser.add_argument('--align', '-a', action="extend", nargs='+', type=str,
1361
+ help='path(s) to TXT file(s) or JSON previous result(s)')
1362
+
1363
+ parser.add_argument('--refine', '-r', action='store_true',
1364
+ help='Refine timestamps to increase precision of timestamps')
1365
+
1366
+ parser.add_argument('--locate', '-lc', action="extend", nargs='+', type=str,
1367
+ help='words to locate in the audio(s); skips transcription and output')
1368
+
1369
+ parser.add_argument('--refine_option', '-ro', action="extend", nargs='+', type=str,
1370
+ help='Extra option(s) to use for refining timestamps; Replace True/False with 1/0; '
1371
+ 'E.g. --refine_option "steps=sese" --refine_option "rel_prob_decrease=0.05"')
1372
+ parser.add_argument('--demucs_option', '-do', action="extend", nargs='+', type=str,
1373
+ help='Extra option(s) to use for demucs; Replace True/False with 1/0; '
1374
+ 'E.g. --demucs_option "shifts=3" --demucs_option "overlap=0.5"')
1375
+ parser.add_argument('--model_option', '-mo', action="extend", nargs='+', type=str,
1376
+ help='Extra option(s) to use for loading model; Replace True/False with 1/0; '
1377
+ 'E.g. --model_option "download_root=./downloads"')
1378
+ parser.add_argument('--transcribe_option', '-to', action="extend", nargs='+', type=str,
1379
+ help='Extra option(s) to use for transcribing/alignment/locating; Replace True/False with 1/0; '
1380
+ 'E.g. --transcribe_option "ignore_compatibility=1"')
1381
+ parser.add_argument('--save_option', '-so', action="extend", nargs='+', type=str,
1382
+ help='Extra option(s) to use for text outputs; Replace True/False with 1/0; '
1383
+ 'E.g. --save_option "highlight_color=ffffff"')
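+ # Example (illustrative sketch, assuming the package installs its console entry
+ # point as ``stable-ts``): combining the extra-option flags above:
+ #
+ #     stable-ts audio.mp3 -o audio.srt --vad 1 --demucs 1 \
+ #         --demucs_option "shifts=3" --save_option "highlight_color=ffffff"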
1384
+
1385
+ parser.add_argument('--faster_whisper', '-fw', action='store_true',
1386
+ help='whether to use faster-whisper (https://github.com/guillaumekln/faster-whisper); '
1387
+ 'note: some features may not be available')
1388
+
1389
+ args = parser.parse_args().__dict__
1390
+ debug = args.pop('debug')
1391
+ if not args['language'] and (args['align'] or args['locate']):
1392
+ raise ValueError('language is required for --align / --locate')
1393
+
1394
+ is_faster_whisper = args.pop('faster_whisper')
1395
+ model_name: str = args.pop("model")
1396
+ model_dir: str = args.pop("model_dir")
1397
+ inputs: List[Union[str, torch.Tensor]] = args.pop("inputs")
1398
+ outputs: List[str] = args.pop("output")
1399
+ output_dir: str = args.pop("output_dir")
1400
+ output_format = args.pop("output_format")
1401
+ overwrite: bool = args.pop("overwrite")
1402
+ use_demucs = args['demucs'] or False
1403
+ demucs_outputs: List[Optional[str]] = args.pop("demucs_output")
1404
+ args['demucs_options'] = update_options_with_args('demucs_option', pop=True)
1405
+ regroup = args.pop('regroup')
1406
+ max_chars = args.pop('max_chars')
1407
+ max_words = args.pop('max_words')
1408
+ args['verbose'] = False if args['verbose'] == 1 else (True if args['verbose'] == 2 else None)
1409
+ show_curr_task = args['verbose'] is not None
1410
+ strings_to_locate = args.pop('locate')
1411
+ if dq := args.pop('dynamic_quantization', False):
1412
+ args['device'] = 'cpu'
1413
+ if args['reverse_text']:
1414
+ args['reverse_text'] = (args.get('prepend_punctuations'), args.get('append_punctuations'))
1415
+
1416
+ if regroup:
1417
+ try:
1418
+ regroup = str2bool(regroup)
1419
+ except ValueError:
1420
+ pass
1421
+ curr_output_formats: List[str] = output_format.split(',') if output_format else []
1422
+ unsupported_formats = list(set(map(str.lower, curr_output_formats)) - OUTPUT_FORMATS)
1423
+ if outputs:
1424
+ unsupported_formats.extend(list(set(splitext(o)[-1].lower().strip('.') for o in outputs) - OUTPUT_FORMATS))
1425
+ if len(unsupported_formats) != 0:
1426
+ raise NotImplementedError(f'{unsupported_formats} are not supported. Supported formats: {OUTPUT_FORMATS}.')
1427
+
1428
+ has_demucs_output = bool(demucs_outputs)
1429
+ if use_demucs and has_demucs_output and len(demucs_outputs) != len(inputs):
1430
+ raise NotImplementedError(f'[demucs_output] and [inputs] do not match in count. '
1431
+ f'Got {len(demucs_outputs)} and {len(inputs)}')
1432
+
1433
+ if tag := args.get('tag'):
1434
+ assert tag == ['-1'] or len(tag) == 2, f'[tag] must be a pair of str but got {tag}'
1435
+
1436
+ def make_parent(filepath: str):
1437
+ if parent := split(filepath)[0]:
1438
+ os.makedirs(parent, exist_ok=True)
1439
+
1440
+ def is_json(file: str):
1441
+ return file.endswith(".json")
1442
+
1443
+ def call_method_with_options(method, options: dict, include_first: bool = True):
1444
+ def val_to_str(val) -> str:
1445
+ if isinstance(val, (np.ndarray, torch.Tensor)):
1446
+ return f'{val.__class__}(shape:{list(val.shape)})'
1447
+ elif isinstance(val, str):
1448
+ return f'"{val}"'
1449
+ elif isinstance(val, bytes):
1450
+ return f'{type(val)}(len:{len(val)})'
1451
+ elif isinstance(val, torch.nn.Module):
1452
+ return str(type(val))
1453
+ return str(val)
1454
+
1455
+ params = tuple(get_func_parameters(method))
1456
+ if debug:
1457
+ temp_options = {k: options.pop(k) for k in params if k in options}
1458
+ temp_options.update(options)
1459
+ options = temp_options
1460
+ options_str = ',\n'.join(
1461
+ f' {k}={val_to_str(v)}'
1462
+ for k, v in options.items()
1463
+ if include_first or k != params[0]
1464
+ )
1465
+ if options_str:
1466
+ options_str = f'\n{options_str}\n'
1467
+ else:
1468
+ print(options, params)
1469
+ print(f'{method.__qualname__}({options_str})')
1470
+ return method(**options)
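+ # Example (illustrative sketch): with --debug, each call is echoed before it
+ # runs, roughly like:
+ #
+ #     load_model(
+ #      name="base",
+ #      device="cuda"
+ #     )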
1471
+
1472
+ if alignments := args['align']:
1473
+ if unsupported_align_fmts := \
1474
+ [_ext for p in alignments if (_ext := splitext(p)[-1].lower()) not in ('.json', '.txt')]:
1475
+ raise NotImplementedError(
1476
+ f'Unsupported format(s) for alignment: {unsupported_align_fmts}'
1477
+ )
1478
+ if len(inputs) != len(alignments):
1479
+ raise NotImplementedError(
1480
+ f'Got {len(inputs)} audio file(s) but specified {len(alignments)} file(s) to align.'
1481
+ )
1482
+ else:
1483
+ alignments = ['']*len(inputs)
1484
+
1485
+ def finalize_outputs(input_file: str, _output: str = None, _alignment: str = None) -> List[str]:
1486
+ _curr_output_formats = curr_output_formats.copy()
1487
+ basename, ext = splitext(_output or input_file)
1488
+ ext = ext[1:]
1489
+ if _output:
1490
+ if ext.lower() in OUTPUT_FORMATS:
1491
+ _curr_output_formats.append(ext)
1492
+ else:
1493
+ basename = _output
1494
+ if not _curr_output_formats:
1495
+ _curr_output_formats = ["srt" if is_json(input_file) or is_json(_alignment) else "json"]
1496
+ _outputs = [f'{basename}.{ext}' for ext in set(_curr_output_formats)]
1497
+ if output_dir:
1498
+ _outputs = [join(output_dir, o) for o in _outputs]
1499
+
1500
+ return _outputs
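+ # Example (illustrative sketch): with --output_format srt,vtt and input
+ # 'audio.mp3', finalize_outputs('audio.mp3') -> ['audio.srt', 'audio.vtt']
+ # (order may vary because the formats are deduplicated with a set).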
1501
+
1502
+ if outputs:
1503
+ if len(outputs) != len(inputs):
1504
+ raise NotImplementedError(f'Got {len(inputs)} audio file(s) but specified {len(outputs)} output file(s).')
1505
+ final_outputs = [finalize_outputs(i, o, a) for i, o, a in zip(inputs, outputs, alignments)]
1506
+ else:
1507
+ if not output_dir:
1508
+ output_dir = '.'
1509
+ final_outputs = [finalize_outputs(i, _alignment=a) for i, a in zip(inputs, alignments)]
1510
+
1511
+ if not overwrite:
1512
+
1513
+ def cancel_overwrite():
1514
+ resp = input(f'{path} already exists, overwrite (y/n)? ').lower()
1515
+ if resp in ('y', 'n'):
1516
+ return resp == 'n'
1517
+ print(f'Expected "y" or "n", but got {resp}.')
1518
+ return True
1519
+
1520
+ for paths in final_outputs:
1521
+ for path in paths:
1522
+ if isfile(path) and cancel_overwrite():
1523
+ return
1524
+
1525
+ if model_name.endswith(".en") and args["language"] not in {"en", "English"}:
1526
+ if args["language"] is not None:
1527
+ warnings.warn(f"{model_name} is an English-only model but receipted "
1528
+ f"'{args['language']}'; using English instead.")
1529
+ args["language"] = "en"
1530
+
1531
+ temperature = args.pop("temperature")
1532
+ increment = args.pop("temperature_increment_on_fallback")
1533
+ if increment is not None:
1534
+ temperature = tuple(np.arange(temperature, 1.0 + 1e-6, increment))
1535
+ else:
1536
+ temperature = [temperature]
1537
+
1538
+ args['temperature'] = temperature
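+ # Example (illustrative sketch): the defaults --temperature 0 and
+ # --temperature_increment_on_fallback 0.2 produce the fallback schedule
+ # (0.0, 0.2, 0.4, 0.6, 0.8, 1.0); passing "None" for the increment decodes at a
+ # single fixed temperature.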
1539
+
1540
+ threads = args.pop("threads")
1541
+ if threads > 0:
1542
+ torch.set_num_threads(threads)
1543
+
1544
+ if debug:
1545
+ print('Input(s) -> Output(s)')
1546
+ for i, (input_audio, output_paths, alignment) in enumerate(zip(inputs, final_outputs, alignments)):
1547
+ dm_output = f' {demucs_outputs[i]} ->' if demucs_outputs else ''
1548
+ alignment = f' + "{alignment}"' if alignment else ''
1549
+ print(f'"{input_audio}"{alignment} ->{dm_output} {output_paths}')
1550
+ print('')
1551
+
1552
+ if show_curr_task:
1553
+ model_from_str = '' if model_dir is None else f' from {model_dir}'
1554
+ model_loading_str = f'{"Faster-Whisper" if is_faster_whisper else "Whisper"} {model_name} model{model_from_str}'
1555
+ print(f'Loading {model_loading_str}\r', end='\n' if debug else '')
1556
+ else:
1557
+ model_loading_str = ''
1558
+
1559
+ alignments = args['align']
1560
+ model = None
1561
+
1562
+ def _load_model():
1563
+ nonlocal model
1564
+ if model is None:
1565
+ model_options = dict(
1566
+ name=model_name,
1567
+ model_size_or_path=model_name,
1568
+ device=args.get('device'),
1569
+ download_root=model_dir,
1570
+ dq=dq,
1571
+ )
1572
+ load_model_func = load_faster_whisper if is_faster_whisper else load_model
1573
+ model_options = isolate_useful_options(model_options, load_model_func)
1574
+ update_options_with_args('model_option', model_options)
1575
+ model = call_method_with_options(load_model_func, model_options)
1576
+ if model_loading_str:
1577
+ print(f'Loaded {model_loading_str} ')
1578
+ return model
1579
+
1580
+ for i, (input_audio, output_paths) in enumerate(zip(inputs, final_outputs)):
1581
+ skip_output = False
1582
+ if isinstance(input_audio, str) and is_json(input_audio):
1583
+ result = WhisperResult(input_audio)
1584
+ else:
1585
+ model = _load_model()
1586
+ args['regroup'] = False
1587
+ args['audio'] = input_audio
1588
+ if has_demucs_output:
1589
+ args['demucs_output'] = demucs_outputs[i]
1590
+ transcribe_method = args.get('transcribe_method')
1591
+ text = None
1592
+ if alignments and (text := alignments[i]):
1593
+ if text.endswith('.json'):
1594
+ text = WhisperResult(text)
1595
+ else:
1596
+ with open(text, 'r', encoding='utf-8') as f:
1597
+ text = f.read()
1598
+ args['text'] = text
1599
+ transcribe_method = 'align'
1600
+ if is_faster_whisper and transcribe_method == 'transcribe':
1601
+ transcribe_method = 'transcribe_stable'
1602
+ if strings_to_locate and (text := strings_to_locate[i]):
1603
+ args['text'] = text
1604
+ transcribe_method = 'locate'
1605
+ skip_output = args['verbose'] = True
1606
+ transcribe_method = getattr(model, transcribe_method)
1607
+ transcribe_options = isolate_useful_options(args, transcribe_method)
1608
+ if not text:
1609
+ decoding_options = (
1610
+ isolate_useful_options(args, model.transcribe if is_faster_whisper else DecodingOptions)
1611
+ )
1612
+ if is_faster_whisper:
1613
+ if decoding_options['suppress_tokens']:
1614
+ decoding_options['suppress_tokens'] = (
1615
+ list(map(int, decoding_options['suppress_tokens'].split(',')))
1616
+ )
1617
+ for k in list(decoding_options.keys()):
1618
+ if decoding_options[k] is None:
1619
+ del decoding_options[k]
1620
+ transcribe_options.update(decoding_options)
1621
+ update_options_with_args('transcribe_option', transcribe_options)
1622
+ result: WhisperResult = call_method_with_options(transcribe_method, transcribe_options)
1623
+
1624
+ if skip_output:
1625
+ continue
1626
+
1627
+ if args['refine']:
1628
+ model = _load_model()
1629
+ refine_options = isolate_useful_options(args, model.refine)
1630
+ refine_options['result'] = result
1631
+ update_options_with_args('refine_option', refine_options)
1632
+ call_method_with_options(model.refine, refine_options)
1633
+
1634
+ if args.get('word_timestamps'):
1635
+ if regroup:
1636
+ result.regroup(regroup, verbose=args['verbose'] or debug)
1637
+ if max_chars or max_words:
1638
+ result.split_by_length(max_chars=max_chars, max_words=max_words)
1639
+
1640
+ for path in output_paths:
1641
+ make_parent(path)
1642
+ save_method = getattr(result, OUTPUT_FORMATS_METHODS[splitext(path)[-1][1:]])
1643
+ args['filepath'] = path
1644
+ args['path'] = path
1645
+ save_options = isolate_useful_options(args, save_method)
1646
+ update_options_with_args('save_option', save_options)
1647
+ call_method_with_options(save_method, save_options)
1648
+
1649
+
1650
+ if __name__ == '__main__':
1651
+ cli()