jhj0517 committed on
Commit 60fd426
2 Parent(s): 451ca33 1189737

Merge branch 'master' into huggingface

.dockerignore CHANGED
@@ -1,10 +1,10 @@
  # from .gitignore
- venv/
- ui/__pycache__/
- outputs/
- modules/__pycache__/
- models/
  modules/yt_tmp.wav
+ **/venv/
+ **/__pycache__/
+ **/outputs/
+ **/models/

- .git
- .github
+ **/.idea
+ **/.git
+ **/.github
.github/pull_request_template.md CHANGED
@@ -1,5 +1,5 @@
- ## Related issues
- - #0
+ ## Related issues / PRs
+ - #

- ## Changed
- 1. Changes
+ ## Summarize Changes
+ 1.
.github/workflows/ci-shell.yml CHANGED
@@ -6,9 +6,11 @@ on:
  push:
    branches:
      - master
+     - intel-gpu
  pull_request:
    branches:
      - master
+     - intel-gpu

jobs:
  test-shell-script:
@@ -16,7 +18,7 @@ jobs:
    runs-on: ubuntu-latest
    strategy:
      matrix:
-       python: [ "3.10" ]
+       python: ["3.10", "3.11", "3.12"]

    steps:
      - name: Clean up space for action
.github/workflows/ci.yml CHANGED
@@ -6,9 +6,11 @@ on:
  push:
    branches:
      - master
+     - intel-gpu
  pull_request:
    branches:
      - master
+     - intel-gpu

jobs:
  build:
@@ -16,7 +18,7 @@ jobs:
    runs-on: ubuntu-latest
    strategy:
      matrix:
-       python: ["3.10"]
+       python: ["3.10", "3.11", "3.12"]

    env:
      DEEPL_API_KEY: ${{ secrets.DEEPL_API_KEY }}
@@ -35,7 +37,7 @@
        run: sudo apt-get update && sudo apt-get install -y git ffmpeg

      - name: Install dependencies
-       run: pip install -r requirements.txt pytest
+       run: pip install -r requirements.txt pytest jiwer

      - name: Run test
        run: python -m pytest -rs tests
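
The test step now installs jiwer alongside pytest, presumably so the transcription tests can score output against reference text by word error rate. A minimal illustration of typical jiwer usage (an assumption; not taken from this repo's tests):

import jiwer

reference = "the quick brown fox"
hypothesis = "the quick brown box"

# Word error rate: word-level edits divided by the number of reference words,
# so one substitution out of four words gives 0.25 here.
print(jiwer.wer(reference, hypothesis))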
.gitignore CHANGED
@@ -10,4 +10,5 @@ outputs/
  modules/__pycache__/
  models/
  modules/yt_tmp.wav
- configs/default_parameters.yaml
+ configs/default_parameters.yaml
+ __pycache__/
Install.bat CHANGED
@@ -8,6 +8,7 @@ echo checked the venv folder. now installing requirements..

  call "%~dp0\venv\scripts\activate"

+ python -m pip install -U pip
  pip install -r requirements.txt

  if errorlevel 1 (
Install.sh CHANGED
@@ -7,6 +7,7 @@ fi

  source venv/bin/activate

+ python -m pip install -U pip
  pip install -r requirements.txt && echo "Requirements installed successfully." || {
    echo ""
    echo "Requirements installation failed. Please remove the venv folder and run the script again."
app.py CHANGED
@@ -1,27 +1,27 @@
1
  import os
2
  import argparse
3
  import gradio as gr
 
4
  import yaml
5
 
6
  from modules.utils.paths import (FASTER_WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR, WHISPER_MODELS_DIR,
7
  INSANELY_FAST_WHISPER_MODELS_DIR, NLLB_MODELS_DIR, DEFAULT_PARAMETERS_CONFIG_PATH,
8
- UVR_MODELS_DIR)
9
  from modules.utils.files_manager import load_yaml
10
  from modules.whisper.whisper_factory import WhisperFactory
11
- from modules.whisper.faster_whisper_inference import FasterWhisperInference
12
- from modules.whisper.insanely_fast_whisper_inference import InsanelyFastWhisperInference
13
  from modules.translation.nllb_inference import NLLBInference
14
  from modules.ui.htmls import *
15
  from modules.utils.cli_manager import str2bool
16
  from modules.utils.youtube_manager import get_ytmetas
17
  from modules.translation.deepl_api import DeepLAPI
18
- from modules.whisper.whisper_parameter import *
19
 
20
 
21
  class App:
22
  def __init__(self, args):
23
  self.args = args
24
  self.app = gr.Blocks(css=CSS, theme=self.args.theme, delete_cache=(60, 3600))
 
25
  self.whisper_inf = WhisperFactory.create_whisper_inference(
26
  whisper_type=self.args.whisper_type,
27
  whisper_model_dir=self.args.whisper_model_dir,
@@ -38,10 +38,10 @@ class App:
38
  output_dir=os.path.join(self.args.output_dir, "translations")
39
  )
40
  self.default_params = load_yaml(DEFAULT_PARAMETERS_CONFIG_PATH)
41
- print(f"Use \"{self.args.whisper_type}\" implementation")
42
- print(f"Device \"{self.whisper_inf.device}\" is detected")
43
 
44
- def create_whisper_parameters(self):
45
  whisper_params = self.default_params["whisper"]
46
  vad_params = self.default_params["vad"]
47
  diarization_params = self.default_params["diarization"]
@@ -49,158 +49,45 @@ class App:
49
 
50
  with gr.Row():
51
  dd_model = gr.Dropdown(choices=self.whisper_inf.available_models, value=whisper_params["model_size"],
52
- label="Model")
53
- dd_lang = gr.Dropdown(choices=["Automatic Detection"] + self.whisper_inf.available_langs,
54
- value=whisper_params["lang"], label="Language")
55
- dd_file_format = gr.Dropdown(choices=["SRT", "WebVTT", "txt"], value="SRT", label="File Format")
 
56
  with gr.Row():
57
- cb_translate = gr.Checkbox(value=whisper_params["is_translate"], label="Translate to English?",
58
  interactive=True)
59
  with gr.Row():
60
- cb_timestamp = gr.Checkbox(value=whisper_params["add_timestamp"], label="Add a timestamp to the end of the filename",
 
61
  interactive=True)
62
 
63
- with gr.Accordion("Advanced Parameters", open=False):
64
- nb_beam_size = gr.Number(label="Beam Size", value=whisper_params["beam_size"], precision=0, interactive=True,
65
- info="Beam size to use for decoding.")
66
- nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=whisper_params["log_prob_threshold"], interactive=True,
67
- info="If the average log probability over sampled tokens is below this value, treat as failed.")
68
- nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=whisper_params["no_speech_threshold"], interactive=True,
69
- info="If the no speech probability is higher than this value AND the average log probability over sampled tokens is below 'Log Prob Threshold', consider the segment as silent.")
70
- dd_compute_type = gr.Dropdown(label="Compute Type", choices=self.whisper_inf.available_compute_types,
71
- value=self.whisper_inf.current_compute_type, interactive=True,
72
- allow_custom_value=True,
73
- info="Select the type of computation to perform.")
74
- nb_best_of = gr.Number(label="Best Of", value=whisper_params["best_of"], interactive=True,
75
- info="Number of candidates when sampling with non-zero temperature.")
76
- nb_patience = gr.Number(label="Patience", value=whisper_params["patience"], interactive=True,
77
- info="Beam search patience factor.")
78
- cb_condition_on_previous_text = gr.Checkbox(label="Condition On Previous Text", value=whisper_params["condition_on_previous_text"],
79
- interactive=True,
80
- info="Condition on previous text during decoding.")
81
- sld_prompt_reset_on_temperature = gr.Slider(label="Prompt Reset On Temperature", value=whisper_params["prompt_reset_on_temperature"],
82
- minimum=0, maximum=1, step=0.01, interactive=True,
83
- info="Resets prompt if temperature is above this value."
84
- " Arg has effect only if 'Condition On Previous Text' is True.")
85
- tb_initial_prompt = gr.Textbox(label="Initial Prompt", value=None, interactive=True,
86
- info="Initial prompt to use for decoding.")
87
- sd_temperature = gr.Slider(label="Temperature", value=whisper_params["temperature"], minimum=0.0,
88
- step=0.01, maximum=1.0, interactive=True,
89
- info="Temperature for sampling. It can be a tuple of temperatures, which will be successively used upon failures according to either `Compression Ratio Threshold` or `Log Prob Threshold`.")
90
- nb_compression_ratio_threshold = gr.Number(label="Compression Ratio Threshold", value=whisper_params["compression_ratio_threshold"],
91
- interactive=True,
92
- info="If the gzip compression ratio is above this value, treat as failed.")
93
- nb_chunk_length = gr.Number(label="Chunk Length (s)", value=lambda: whisper_params["chunk_length"],
94
- precision=0,
95
- info="The length of audio segments. If it is not None, it will overwrite the default chunk_length of the FeatureExtractor.")
96
- with gr.Group(visible=isinstance(self.whisper_inf, FasterWhisperInference)):
97
- nb_length_penalty = gr.Number(label="Length Penalty", value=whisper_params["length_penalty"],
98
- info="Exponential length penalty constant.")
99
- nb_repetition_penalty = gr.Number(label="Repetition Penalty", value=whisper_params["repetition_penalty"],
100
- info="Penalty applied to the score of previously generated tokens (set > 1 to penalize).")
101
- nb_no_repeat_ngram_size = gr.Number(label="No Repeat N-gram Size", value=whisper_params["no_repeat_ngram_size"],
102
- precision=0,
103
- info="Prevent repetitions of n-grams with this size (set 0 to disable).")
104
- tb_prefix = gr.Textbox(label="Prefix", value=lambda: whisper_params["prefix"],
105
- info="Optional text to provide as a prefix for the first window.")
106
- cb_suppress_blank = gr.Checkbox(label="Suppress Blank", value=whisper_params["suppress_blank"],
107
- info="Suppress blank outputs at the beginning of the sampling.")
108
- tb_suppress_tokens = gr.Textbox(label="Suppress Tokens", value=whisper_params["suppress_tokens"],
109
- info="List of token IDs to suppress. -1 will suppress a default set of symbols as defined in the model config.json file.")
110
- nb_max_initial_timestamp = gr.Number(label="Max Initial Timestamp", value=whisper_params["max_initial_timestamp"],
111
- info="The initial timestamp cannot be later than this.")
112
- cb_word_timestamps = gr.Checkbox(label="Word Timestamps", value=whisper_params["word_timestamps"],
113
- info="Extract word-level timestamps using the cross-attention pattern and dynamic time warping, and include the timestamps for each word in each segment.")
114
- tb_prepend_punctuations = gr.Textbox(label="Prepend Punctuations", value=whisper_params["prepend_punctuations"],
115
- info="If 'Word Timestamps' is True, merge these punctuation symbols with the next word.")
116
- tb_append_punctuations = gr.Textbox(label="Append Punctuations", value=whisper_params["append_punctuations"],
117
- info="If 'Word Timestamps' is True, merge these punctuation symbols with the previous word.")
118
- nb_max_new_tokens = gr.Number(label="Max New Tokens", value=lambda: whisper_params["max_new_tokens"],
119
- precision=0,
120
- info="Maximum number of new tokens to generate per-chunk. If not set, the maximum will be set by the default max_length.")
121
- nb_hallucination_silence_threshold = gr.Number(label="Hallucination Silence Threshold (sec)",
122
- value=lambda: whisper_params["hallucination_silence_threshold"],
123
- info="When 'Word Timestamps' is True, skip silent periods longer than this threshold (in seconds) when a possible hallucination is detected.")
124
- tb_hotwords = gr.Textbox(label="Hotwords", value=lambda: whisper_params["hotwords"],
125
- info="Hotwords/hint phrases to provide the model with. Has no effect if prefix is not None.")
126
- nb_language_detection_threshold = gr.Number(label="Language Detection Threshold", value=lambda: whisper_params["language_detection_threshold"],
127
- info="If the maximum probability of the language tokens is higher than this value, the language is detected.")
128
- nb_language_detection_segments = gr.Number(label="Language Detection Segments", value=lambda: whisper_params["language_detection_segments"],
129
- precision=0,
130
- info="Number of segments to consider for the language detection.")
131
- with gr.Group(visible=isinstance(self.whisper_inf, InsanelyFastWhisperInference)):
132
- nb_batch_size = gr.Number(label="Batch Size", value=whisper_params["batch_size"], precision=0)
133
-
134
- with gr.Accordion("Background Music Remover Filter", open=False):
135
- cb_bgm_separation = gr.Checkbox(label="Enable Background Music Remover Filter", value=uvr_params["is_separate_bgm"],
136
- interactive=True,
137
- info="Enabling this will remove background music by submodel before"
138
- " transcribing ")
139
- dd_uvr_device = gr.Dropdown(label="Device", value=self.whisper_inf.music_separator.device,
140
- choices=self.whisper_inf.music_separator.available_devices)
141
- dd_uvr_model_size = gr.Dropdown(label="Model", value=uvr_params["model_size"],
142
- choices=self.whisper_inf.music_separator.available_models)
143
- nb_uvr_segment_size = gr.Number(label="Segment Size", value=uvr_params["segment_size"], precision=0)
144
- cb_uvr_save_file = gr.Checkbox(label="Save separated files to output", value=uvr_params["save_file"])
145
- cb_uvr_enable_offload = gr.Checkbox(label="Offload sub model after removing background music",
146
- value=uvr_params["enable_offload"])
147
-
148
- with gr.Accordion("Voice Detection Filter", open=False):
149
- cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=vad_params["vad_filter"],
150
- interactive=True,
151
- info="Enable this to transcribe only detected voice parts by submodel.")
152
- sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold",
153
- value=vad_params["threshold"],
154
- info="Lower it to be more sensitive to small sounds.")
155
- nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0,
156
- value=vad_params["min_speech_duration_ms"],
157
- info="Final speech chunks shorter than this time are thrown out")
158
- nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)",
159
- value=vad_params["max_speech_duration_s"],
160
- info="Maximum duration of speech chunks in \"seconds\".")
161
- nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0,
162
- value=vad_params["min_silence_duration_ms"],
163
- info="In the end of each speech chunk wait for this time"
164
- " before separating it")
165
- nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=vad_params["speech_pad_ms"],
166
- info="Final speech chunks are padded by this time each side")
167
-
168
- with gr.Accordion("Diarization", open=False):
169
- cb_diarize = gr.Checkbox(label="Enable Diarization", value=diarization_params["is_diarize"])
170
- tb_hf_token = gr.Text(label="HuggingFace Token", value=diarization_params["hf_token"],
171
- info="This is only needed the first time you download the model. If you already have"
172
- " models, you don't need to enter. To download the model, you must manually go "
173
- "to \"https://huggingface.co/pyannote/speaker-diarization-3.1\" and agree to"
174
- " their requirement.")
175
- dd_diarization_device = gr.Dropdown(label="Device",
176
- choices=self.whisper_inf.diarizer.get_available_device(),
177
- value=self.whisper_inf.diarizer.get_device())
178
 
179
  dd_model.change(fn=self.on_change_models, inputs=[dd_model], outputs=[cb_translate])
180
 
 
 
181
  return (
182
- WhisperParameters(
183
- model_size=dd_model, lang=dd_lang, is_translate=cb_translate, beam_size=nb_beam_size,
184
- log_prob_threshold=nb_log_prob_threshold, no_speech_threshold=nb_no_speech_threshold,
185
- compute_type=dd_compute_type, best_of=nb_best_of, patience=nb_patience,
186
- condition_on_previous_text=cb_condition_on_previous_text, initial_prompt=tb_initial_prompt,
187
- temperature=sd_temperature, compression_ratio_threshold=nb_compression_ratio_threshold,
188
- vad_filter=cb_vad_filter, threshold=sd_threshold, min_speech_duration_ms=nb_min_speech_duration_ms,
189
- max_speech_duration_s=nb_max_speech_duration_s, min_silence_duration_ms=nb_min_silence_duration_ms,
190
- speech_pad_ms=nb_speech_pad_ms, chunk_length=nb_chunk_length, batch_size=nb_batch_size,
191
- is_diarize=cb_diarize, hf_token=tb_hf_token, diarization_device=dd_diarization_device,
192
- length_penalty=nb_length_penalty, repetition_penalty=nb_repetition_penalty,
193
- no_repeat_ngram_size=nb_no_repeat_ngram_size, prefix=tb_prefix, suppress_blank=cb_suppress_blank,
194
- suppress_tokens=tb_suppress_tokens, max_initial_timestamp=nb_max_initial_timestamp,
195
- word_timestamps=cb_word_timestamps, prepend_punctuations=tb_prepend_punctuations,
196
- append_punctuations=tb_append_punctuations, max_new_tokens=nb_max_new_tokens,
197
- hallucination_silence_threshold=nb_hallucination_silence_threshold, hotwords=tb_hotwords,
198
- language_detection_threshold=nb_language_detection_threshold,
199
- language_detection_segments=nb_language_detection_segments,
200
- prompt_reset_on_temperature=sld_prompt_reset_on_temperature, is_bgm_separate=cb_bgm_separation,
201
- uvr_device=dd_uvr_device, uvr_model_size=dd_uvr_model_size, uvr_segment_size=nb_uvr_segment_size,
202
- uvr_save_file=cb_uvr_save_file, uvr_enable_offload=cb_uvr_enable_offload
203
- ),
204
  dd_file_format,
205
  cb_timestamp
206
  )
@@ -212,185 +99,194 @@ class App:
212
  uvr_params = self.default_params["bgm_separation"]
213
 
214
  with self.app:
215
- with gr.Row():
216
- with gr.Column():
217
- gr.Markdown(MARKDOWN, elem_id="md_project")
218
- with gr.Tabs():
219
- with gr.TabItem("File"): # tab1
220
  with gr.Column():
221
- input_file = gr.Files(type="filepath", label="Upload File here")
222
- tb_input_folder = gr.Textbox(label="Input Folder Path (Optional)",
223
- info="Optional: Specify the folder path where the input files are located, if you prefer to use local files instead of uploading them."
224
- " Leave this field empty if you do not wish to use a local path.",
225
- visible=self.args.colab,
226
- value="")
227
-
228
- whisper_params, dd_file_format, cb_timestamp = self.create_whisper_parameters()
229
-
230
- with gr.Row():
231
- btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
232
- with gr.Row():
233
- tb_indicator = gr.Textbox(label="Output", scale=5)
234
- files_subtitles = gr.Files(label="Downloadable output file", scale=3, interactive=False)
235
- btn_openfolder = gr.Button('📂', scale=1)
236
-
237
- params = [input_file, tb_input_folder, dd_file_format, cb_timestamp]
238
- btn_run.click(fn=self.whisper_inf.transcribe_file,
239
- inputs=params + whisper_params.as_list(),
240
- outputs=[tb_indicator, files_subtitles])
241
- btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None)
242
-
243
- with gr.TabItem("Youtube"): # tab2
244
- with gr.Row():
245
- tb_youtubelink = gr.Textbox(label="Youtube Link")
246
- with gr.Row(equal_height=True):
247
  with gr.Column():
248
- img_thumbnail = gr.Image(label="Youtube Thumbnail")
249
- with gr.Column():
250
- tb_title = gr.Label(label="Youtube Title")
251
- tb_description = gr.Textbox(label="Youtube Description", max_lines=15)
 
 
252
 
253
- whisper_params, dd_file_format, cb_timestamp = self.create_whisper_parameters()
254
 
255
- with gr.Row():
256
- btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
257
- with gr.Row():
258
- tb_indicator = gr.Textbox(label="Output", scale=5)
259
- files_subtitles = gr.Files(label="Downloadable output file", scale=3)
260
- btn_openfolder = gr.Button('📂', scale=1)
261
 
262
- params = [tb_youtubelink, dd_file_format, cb_timestamp]
 
 
 
 
263
 
264
- btn_run.click(fn=self.whisper_inf.transcribe_youtube,
265
- inputs=params + whisper_params.as_list(),
266
- outputs=[tb_indicator, files_subtitles])
267
- tb_youtubelink.change(get_ytmetas, inputs=[tb_youtubelink],
268
- outputs=[img_thumbnail, tb_title, tb_description])
269
- btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None)
 
 
 
270
 
271
- with gr.TabItem("Mic"): # tab3
272
- with gr.Row():
273
- mic_input = gr.Microphone(label="Record with Mic", type="filepath", interactive=True)
274
 
275
- whisper_params, dd_file_format, cb_timestamp = self.create_whisper_parameters()
 
 
 
 
 
276
 
277
- with gr.Row():
278
- btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
279
- with gr.Row():
280
- tb_indicator = gr.Textbox(label="Output", scale=5)
281
- files_subtitles = gr.Files(label="Downloadable output file", scale=3)
282
- btn_openfolder = gr.Button('📂', scale=1)
283
 
284
- params = [mic_input, dd_file_format, cb_timestamp]
 
 
 
 
 
285
 
286
- btn_run.click(fn=self.whisper_inf.transcribe_mic,
287
- inputs=params + whisper_params.as_list(),
288
- outputs=[tb_indicator, files_subtitles])
289
- btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None)
290
 
291
- with gr.TabItem("T2T Translation"): # tab 4
292
- with gr.Row():
293
- file_subs = gr.Files(type="filepath", label="Upload Subtitle Files to translate here",
294
- file_types=['.vtt', '.srt'])
295
 
296
- with gr.TabItem("DeepL API"): # sub tab1
297
  with gr.Row():
298
- tb_api_key = gr.Textbox(label="Your Auth Key (API KEY)", value=deepl_params["api_key"])
299
  with gr.Row():
300
- dd_source_lang = gr.Dropdown(label="Source Language", value=deepl_params["source_lang"],
301
- choices=list(
302
- self.deepl_api.available_source_langs.keys()))
303
- dd_target_lang = gr.Dropdown(label="Target Language", value=deepl_params["target_lang"],
304
- choices=list(self.deepl_api.available_target_langs.keys()))
305
- with gr.Row():
306
- cb_is_pro = gr.Checkbox(label="Pro User?", value=deepl_params["is_pro"])
307
- with gr.Row():
308
- cb_timestamp = gr.Checkbox(value=translation_params["add_timestamp"], label="Add a timestamp to the end of the filename",
309
- interactive=True)
310
- with gr.Row():
311
- btn_run = gr.Button("TRANSLATE SUBTITLE FILE", variant="primary")
312
- with gr.Row():
313
- tb_indicator = gr.Textbox(label="Output", scale=5)
314
- files_subtitles = gr.Files(label="Downloadable output file", scale=3)
315
  btn_openfolder = gr.Button('📂', scale=1)
316
 
317
- btn_run.click(fn=self.deepl_api.translate_deepl,
318
- inputs=[tb_api_key, file_subs, dd_source_lang, dd_target_lang,
319
- cb_is_pro, cb_timestamp],
320
- outputs=[tb_indicator, files_subtitles])
321
 
322
- btn_openfolder.click(fn=lambda: self.open_folder(os.path.join(self.args.output_dir, "translations")),
323
- inputs=None,
324
- outputs=None)
 
325
 
326
- with gr.TabItem("NLLB"): # sub tab2
327
- with gr.Row():
328
- dd_model_size = gr.Dropdown(label="Model", value=nllb_params["model_size"],
329
- choices=self.nllb_inf.available_models)
330
- dd_source_lang = gr.Dropdown(label="Source Language", value=nllb_params["source_lang"],
331
- choices=self.nllb_inf.available_source_langs)
332
- dd_target_lang = gr.Dropdown(label="Target Language", value=nllb_params["target_lang"],
333
- choices=self.nllb_inf.available_target_langs)
334
- with gr.Row():
335
- nb_max_length = gr.Number(label="Max Length Per Line", value=nllb_params["max_length"],
336
- precision=0)
337
- with gr.Row():
338
- cb_timestamp = gr.Checkbox(value=translation_params["add_timestamp"], label="Add a timestamp to the end of the filename",
339
- interactive=True)
340
  with gr.Row():
341
- btn_run = gr.Button("TRANSLATE SUBTITLE FILE", variant="primary")
342
- with gr.Row():
343
- tb_indicator = gr.Textbox(label="Output", scale=5)
344
- files_subtitles = gr.Files(label="Downloadable output file", scale=3)
345
- btn_openfolder = gr.Button('📂', scale=1)
 
 
 
 
 
 
 
 
346
  with gr.Column():
347
- md_vram_table = gr.HTML(NLLB_VRAM_TABLE, elem_id="md_nllb_vram_table")
348
-
349
- btn_run.click(fn=self.nllb_inf.translate_file,
350
- inputs=[file_subs, dd_model_size, dd_source_lang, dd_target_lang,
351
- nb_max_length, cb_timestamp],
352
- outputs=[tb_indicator, files_subtitles])
353
-
354
- btn_openfolder.click(fn=lambda: self.open_folder(os.path.join(self.args.output_dir, "translations")),
355
- inputs=None,
356
- outputs=None)
357
-
358
- with gr.TabItem("BGM Separation"):
359
- files_audio = gr.Files(type="filepath", label="Upload Audio Files to separate background music")
360
- dd_uvr_device = gr.Dropdown(label="Device", value=self.whisper_inf.music_separator.device,
361
- choices=self.whisper_inf.music_separator.available_devices)
362
- dd_uvr_model_size = gr.Dropdown(label="Model", value=uvr_params["model_size"],
363
- choices=self.whisper_inf.music_separator.available_models)
364
- nb_uvr_segment_size = gr.Number(label="Segment Size", value=uvr_params["segment_size"], precision=0)
365
- cb_uvr_save_file = gr.Checkbox(label="Save separated files to output",
366
- value=True, visible=False)
367
- btn_run = gr.Button("SEPARATE BACKGROUND MUSIC", variant="primary")
368
- with gr.Column():
369
- with gr.Row():
370
- ad_instrumental = gr.Audio(label="Instrumental", scale=8)
371
- btn_open_instrumental_folder = gr.Button('📂', scale=1)
372
- with gr.Row():
373
- ad_vocals = gr.Audio(label="Vocals", scale=8)
374
- btn_open_vocals_folder = gr.Button('📂', scale=1)
375
-
376
- btn_run.click(fn=self.whisper_inf.music_separator.separate_files,
377
- inputs=[files_audio, dd_uvr_model_size, dd_uvr_device, nb_uvr_segment_size,
378
- cb_uvr_save_file],
379
- outputs=[ad_instrumental, ad_vocals])
380
- btn_open_instrumental_folder.click(inputs=None,
381
- outputs=None,
382
- fn=lambda: self.open_folder(os.path.join(
383
- self.args.output_dir, "UVR", "instrumental"
384
- )))
385
- btn_open_vocals_folder.click(inputs=None,
386
- outputs=None,
387
- fn=lambda: self.open_folder(os.path.join(
388
- self.args.output_dir, "UVR", "vocals"
389
- )))
390
 
391
  # Launch the app with optional gradio settings
392
  args = self.args
393
-
394
  self.app.queue(
395
  api_open=args.api_open
396
  ).launch(
@@ -419,10 +315,10 @@ class App:
419
  return gr.Checkbox(visible=True, value=False, label="Translate to English?", interactive=True)
420
 
421
 
422
- # Create the parser for command-line arguments
423
  parser = argparse.ArgumentParser()
424
- parser.add_argument('--whisper_type', type=str, default="faster-whisper",
425
- help='A type of the whisper implementation between: ["whisper", "faster-whisper", "insanely-fast-whisper"]')
 
426
  parser.add_argument('--share', type=str2bool, default=False, nargs='?', const=True, help='Gradio share value')
427
  parser.add_argument('--server_name', type=str, default=None, help='Gradio server host')
428
  parser.add_argument('--server_port', type=int, default=None, help='Gradio server port')
@@ -431,8 +327,10 @@ parser.add_argument('--username', type=str, default=None, help='Gradio authentic
431
  parser.add_argument('--password', type=str, default=None, help='Gradio authentication password')
432
  parser.add_argument('--theme', type=str, default=None, help='Gradio Blocks theme')
433
  parser.add_argument('--colab', type=str2bool, default=False, nargs='?', const=True, help='Is colab user or not')
434
- parser.add_argument('--api_open', type=str2bool, default=False, nargs='?', const=True, help='Enable api or not in Gradio')
435
- parser.add_argument('--inbrowser', type=str2bool, default=True, nargs='?', const=True, help='Whether to automatically start Gradio app or not')
 
 
436
  parser.add_argument('--whisper_model_dir', type=str, default=WHISPER_MODELS_DIR,
437
  help='Directory path of the whisper model')
438
  parser.add_argument('--faster_whisper_model_dir', type=str, default=FASTER_WHISPER_MODELS_DIR,
 
1
  import os
2
  import argparse
3
  import gradio as gr
4
+ from gradio_i18n import Translate, gettext as _
5
  import yaml
6
 
7
  from modules.utils.paths import (FASTER_WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR, WHISPER_MODELS_DIR,
8
  INSANELY_FAST_WHISPER_MODELS_DIR, NLLB_MODELS_DIR, DEFAULT_PARAMETERS_CONFIG_PATH,
9
+ UVR_MODELS_DIR, I18N_YAML_PATH)
10
  from modules.utils.files_manager import load_yaml
11
  from modules.whisper.whisper_factory import WhisperFactory
 
 
12
  from modules.translation.nllb_inference import NLLBInference
13
  from modules.ui.htmls import *
14
  from modules.utils.cli_manager import str2bool
15
  from modules.utils.youtube_manager import get_ytmetas
16
  from modules.translation.deepl_api import DeepLAPI
17
+ from modules.whisper.data_classes import *
18
 
19
 
20
  class App:
21
  def __init__(self, args):
22
  self.args = args
23
  self.app = gr.Blocks(css=CSS, theme=self.args.theme, delete_cache=(60, 3600))
24
+ self.i18n = Translate(I18N_YAML_PATH)
25
  self.whisper_inf = WhisperFactory.create_whisper_inference(
26
  whisper_type=self.args.whisper_type,
27
  whisper_model_dir=self.args.whisper_model_dir,
 
38
  output_dir=os.path.join(self.args.output_dir, "translations")
39
  )
40
  self.default_params = load_yaml(DEFAULT_PARAMETERS_CONFIG_PATH)
41
+ print(f"Use \"{self.args.whisper_type}\" implementation\n"
42
+ f"Device \"{self.whisper_inf.device}\" is detected")
43
 
44
+ def create_pipeline_inputs(self):
45
  whisper_params = self.default_params["whisper"]
46
  vad_params = self.default_params["vad"]
47
  diarization_params = self.default_params["diarization"]
 
49
 
50
  with gr.Row():
51
  dd_model = gr.Dropdown(choices=self.whisper_inf.available_models, value=whisper_params["model_size"],
52
+ label=_("Model"))
53
+ dd_lang = gr.Dropdown(choices=self.whisper_inf.available_langs + [AUTOMATIC_DETECTION],
54
+ value=AUTOMATIC_DETECTION if whisper_params["lang"] == AUTOMATIC_DETECTION.unwrap()
55
+ else whisper_params["lang"], label=_("Language"))
56
+ dd_file_format = gr.Dropdown(choices=["SRT", "WebVTT", "txt", "LRC"], value=whisper_params["file_format"], label=_("File Format"))
57
  with gr.Row():
58
+ cb_translate = gr.Checkbox(value=whisper_params["is_translate"], label=_("Translate to English?"),
59
  interactive=True)
60
  with gr.Row():
61
+ cb_timestamp = gr.Checkbox(value=whisper_params["add_timestamp"],
62
+ label=_("Add a timestamp to the end of the filename"),
63
  interactive=True)
64
 
65
+ with gr.Accordion(_("Advanced Parameters"), open=False):
66
+ whisper_inputs = WhisperParams.to_gradio_inputs(defaults=whisper_params, only_advanced=True,
67
+ whisper_type=self.args.whisper_type,
68
+ available_compute_types=self.whisper_inf.available_compute_types,
69
+ compute_type=self.whisper_inf.current_compute_type)
70
+
71
+ with gr.Accordion(_("Background Music Remover Filter"), open=False):
72
+ uvr_inputs = BGMSeparationParams.to_gradio_input(defaults=uvr_params,
73
+ available_models=self.whisper_inf.music_separator.available_models,
74
+ available_devices=self.whisper_inf.music_separator.available_devices,
75
+ device=self.whisper_inf.music_separator.device)
76
+
77
+ with gr.Accordion(_("Voice Detection Filter"), open=False):
78
+ vad_inputs = VadParams.to_gradio_inputs(defaults=vad_params)
79
+
80
+ with gr.Accordion(_("Diarization"), open=False):
81
+ diarization_inputs = DiarizationParams.to_gradio_inputs(defaults=diarization_params,
82
+ available_devices=self.whisper_inf.diarizer.available_device,
83
+ device=self.whisper_inf.diarizer.device)
 
 
 
 
 
 
 
 
 
84
 
85
  dd_model.change(fn=self.on_change_models, inputs=[dd_model], outputs=[cb_translate])
86
 
87
+ pipeline_inputs = [dd_model, dd_lang, cb_translate] + whisper_inputs + vad_inputs + diarization_inputs + uvr_inputs
88
+
89
  return (
90
+ pipeline_inputs,
 
 
 
 
91
  dd_file_format,
92
  cb_timestamp
93
  )
 
99
  uvr_params = self.default_params["bgm_separation"]
100
 
101
  with self.app:
102
+ with self.i18n:
103
+ with gr.Row():
 
 
 
104
  with gr.Column():
105
+ gr.Markdown(MARKDOWN, elem_id="md_project")
106
+ with gr.Tabs():
107
+ with gr.TabItem(_("File")): # tab1
 
 
 
 
108
  with gr.Column():
109
+ input_file = gr.Files(type="filepath", label=_("Upload File here"))
110
+ tb_input_folder = gr.Textbox(label="Input Folder Path (Optional)",
111
+ info="Optional: Specify the folder path where the input files are located, if you prefer to use local files instead of uploading them."
112
+ " Leave this field empty if you do not wish to use a local path.",
113
+ visible=self.args.colab,
114
+ value="")
115
 
116
+ pipeline_params, dd_file_format, cb_timestamp = self.create_pipeline_inputs()
117
 
118
+ with gr.Row():
119
+ btn_run = gr.Button(_("GENERATE SUBTITLE FILE"), variant="primary")
120
+ with gr.Row():
121
+ tb_indicator = gr.Textbox(label=_("Output"), scale=5)
122
+ files_subtitles = gr.Files(label=_("Downloadable output file"), scale=3, interactive=False)
123
+ btn_openfolder = gr.Button('📂', scale=1)
124
 
125
+ params = [input_file, tb_input_folder, dd_file_format, cb_timestamp]
126
+ btn_run.click(fn=self.whisper_inf.transcribe_file,
127
+ inputs=params + pipeline_params,
128
+ outputs=[tb_indicator, files_subtitles])
129
+ btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None)
130
 
131
+ with gr.TabItem(_("Youtube")): # tab2
132
+ with gr.Row():
133
+ tb_youtubelink = gr.Textbox(label=_("Youtube Link"))
134
+ with gr.Row(equal_height=True):
135
+ with gr.Column():
136
+ img_thumbnail = gr.Image(label=_("Youtube Thumbnail"))
137
+ with gr.Column():
138
+ tb_title = gr.Label(label=_("Youtube Title"))
139
+ tb_description = gr.Textbox(label=_("Youtube Description"), max_lines=15)
140
 
141
+ pipeline_params, dd_file_format, cb_timestamp = self.create_pipeline_inputs()
 
 
142
 
143
+ with gr.Row():
144
+ btn_run = gr.Button(_("GENERATE SUBTITLE FILE"), variant="primary")
145
+ with gr.Row():
146
+ tb_indicator = gr.Textbox(label=_("Output"), scale=5)
147
+ files_subtitles = gr.Files(label=_("Downloadable output file"), scale=3)
148
+ btn_openfolder = gr.Button('📂', scale=1)
149
 
150
+ params = [tb_youtubelink, dd_file_format, cb_timestamp]
 
 
 
 
 
151
 
152
+ btn_run.click(fn=self.whisper_inf.transcribe_youtube,
153
+ inputs=params + pipeline_params,
154
+ outputs=[tb_indicator, files_subtitles])
155
+ tb_youtubelink.change(get_ytmetas, inputs=[tb_youtubelink],
156
+ outputs=[img_thumbnail, tb_title, tb_description])
157
+ btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None)
158
 
159
+ with gr.TabItem(_("Mic")): # tab3
160
+ with gr.Row():
161
+ mic_input = gr.Microphone(label=_("Record with Mic"), type="filepath", interactive=True)
 
162
 
163
+ pipeline_params, dd_file_format, cb_timestamp = self.create_pipeline_inputs()
 
 
 
164
 
 
165
  with gr.Row():
166
+ btn_run = gr.Button(_("GENERATE SUBTITLE FILE"), variant="primary")
167
  with gr.Row():
168
+ tb_indicator = gr.Textbox(label=_("Output"), scale=5)
169
+ files_subtitles = gr.Files(label=_("Downloadable output file"), scale=3)
 
 
 
 
170
  btn_openfolder = gr.Button('📂', scale=1)
171
 
172
+ params = [mic_input, dd_file_format, cb_timestamp]
 
 
 
173
 
174
+ btn_run.click(fn=self.whisper_inf.transcribe_mic,
175
+ inputs=params + pipeline_params,
176
+ outputs=[tb_indicator, files_subtitles])
177
+ btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None)
178
 
179
+ with gr.TabItem(_("T2T Translation")): # tab 4
 
 
 
 
180
  with gr.Row():
181
+ file_subs = gr.Files(type="filepath", label=_("Upload Subtitle Files to translate here"))
182
+
183
+ with gr.TabItem(_("DeepL API")): # sub tab1
184
+ with gr.Row():
185
+ tb_api_key = gr.Textbox(label=_("Your Auth Key (API KEY)"),
186
+ value=deepl_params["api_key"])
187
+ with gr.Row():
188
+ dd_source_lang = gr.Dropdown(label=_("Source Language"),
189
+ value=AUTOMATIC_DETECTION if deepl_params["source_lang"] == AUTOMATIC_DETECTION.unwrap()
190
+ else deepl_params["source_lang"],
191
+ choices=list(self.deepl_api.available_source_langs.keys()))
192
+ dd_target_lang = gr.Dropdown(label=_("Target Language"),
193
+ value=deepl_params["target_lang"],
194
+ choices=list(self.deepl_api.available_target_langs.keys()))
195
+ with gr.Row():
196
+ cb_is_pro = gr.Checkbox(label=_("Pro User?"), value=deepl_params["is_pro"])
197
+ with gr.Row():
198
+ cb_timestamp = gr.Checkbox(value=translation_params["add_timestamp"],
199
+ label=_("Add a timestamp to the end of the filename"),
200
+ interactive=True)
201
+ with gr.Row():
202
+ btn_run = gr.Button(_("TRANSLATE SUBTITLE FILE"), variant="primary")
203
+ with gr.Row():
204
+ tb_indicator = gr.Textbox(label=_("Output"), scale=5)
205
+ files_subtitles = gr.Files(label=_("Downloadable output file"), scale=3)
206
+ btn_openfolder = gr.Button('📂', scale=1)
207
+
208
+ btn_run.click(fn=self.deepl_api.translate_deepl,
209
+ inputs=[tb_api_key, file_subs, dd_source_lang, dd_target_lang,
210
+ cb_is_pro, cb_timestamp],
211
+ outputs=[tb_indicator, files_subtitles])
212
+
213
+ btn_openfolder.click(
214
+ fn=lambda: self.open_folder(os.path.join(self.args.output_dir, "translations")),
215
+ inputs=None,
216
+ outputs=None)
217
+
218
+ with gr.TabItem(_("NLLB")): # sub tab2
219
+ with gr.Row():
220
+ dd_model_size = gr.Dropdown(label=_("Model"), value=nllb_params["model_size"],
221
+ choices=self.nllb_inf.available_models)
222
+ dd_source_lang = gr.Dropdown(label=_("Source Language"),
223
+ value=nllb_params["source_lang"],
224
+ choices=self.nllb_inf.available_source_langs)
225
+ dd_target_lang = gr.Dropdown(label=_("Target Language"),
226
+ value=nllb_params["target_lang"],
227
+ choices=self.nllb_inf.available_target_langs)
228
+ with gr.Row():
229
+ nb_max_length = gr.Number(label="Max Length Per Line", value=nllb_params["max_length"],
230
+ precision=0)
231
+ with gr.Row():
232
+ cb_timestamp = gr.Checkbox(value=translation_params["add_timestamp"],
233
+ label=_("Add a timestamp to the end of the filename"),
234
+ interactive=True)
235
+ with gr.Row():
236
+ btn_run = gr.Button(_("TRANSLATE SUBTITLE FILE"), variant="primary")
237
+ with gr.Row():
238
+ tb_indicator = gr.Textbox(label=_("Output"), scale=5)
239
+ files_subtitles = gr.Files(label=_("Downloadable output file"), scale=3)
240
+ btn_openfolder = gr.Button('📂', scale=1)
241
+ with gr.Column():
242
+ md_vram_table = gr.HTML(NLLB_VRAM_TABLE, elem_id="md_nllb_vram_table")
243
+
244
+ btn_run.click(fn=self.nllb_inf.translate_file,
245
+ inputs=[file_subs, dd_model_size, dd_source_lang, dd_target_lang,
246
+ nb_max_length, cb_timestamp],
247
+ outputs=[tb_indicator, files_subtitles])
248
+
249
+ btn_openfolder.click(
250
+ fn=lambda: self.open_folder(os.path.join(self.args.output_dir, "translations")),
251
+ inputs=None,
252
+ outputs=None)
253
+
254
+ with gr.TabItem(_("BGM Separation")):
255
+ files_audio = gr.Files(type="filepath", label=_("Upload Audio Files to separate background music"))
256
+ dd_uvr_device = gr.Dropdown(label=_("Device"), value=self.whisper_inf.music_separator.device,
257
+ choices=self.whisper_inf.music_separator.available_devices)
258
+ dd_uvr_model_size = gr.Dropdown(label=_("Model"), value=uvr_params["model_size"],
259
+ choices=self.whisper_inf.music_separator.available_models)
260
+ nb_uvr_segment_size = gr.Number(label="Segment Size", value=uvr_params["segment_size"],
261
+ precision=0)
262
+ cb_uvr_save_file = gr.Checkbox(label=_("Save separated files to output"),
263
+ value=True, visible=False)
264
+ btn_run = gr.Button(_("SEPARATE BACKGROUND MUSIC"), variant="primary")
265
  with gr.Column():
266
+ with gr.Row():
267
+ ad_instrumental = gr.Audio(label=_("Instrumental"), scale=8)
268
+ btn_open_instrumental_folder = gr.Button('📂', scale=1)
269
+ with gr.Row():
270
+ ad_vocals = gr.Audio(label=_("Vocals"), scale=8)
271
+ btn_open_vocals_folder = gr.Button('📂', scale=1)
272
+
273
+ btn_run.click(fn=self.whisper_inf.music_separator.separate_files,
274
+ inputs=[files_audio, dd_uvr_model_size, dd_uvr_device, nb_uvr_segment_size,
275
+ cb_uvr_save_file],
276
+ outputs=[ad_instrumental, ad_vocals])
277
+ btn_open_instrumental_folder.click(inputs=None,
278
+ outputs=None,
279
+ fn=lambda: self.open_folder(os.path.join(
280
+ self.args.output_dir, "UVR", "instrumental"
281
+ )))
282
+ btn_open_vocals_folder.click(inputs=None,
283
+ outputs=None,
284
+ fn=lambda: self.open_folder(os.path.join(
285
+ self.args.output_dir, "UVR", "vocals"
286
+ )))
 
 
 
 
 
287
 
288
  # Launch the app with optional gradio settings
289
  args = self.args
 
290
  self.app.queue(
291
  api_open=args.api_open
292
  ).launch(
 
315
  return gr.Checkbox(visible=True, value=False, label="Translate to English?", interactive=True)
316
 
317
 
 
318
  parser = argparse.ArgumentParser()
319
+ parser.add_argument('--whisper_type', type=str, default=WhisperImpl.FASTER_WHISPER.value,
320
+ choices=[item.value for item in WhisperImpl],
321
+ help='A type of the whisper implementation (Github repo name)')
322
  parser.add_argument('--share', type=str2bool, default=False, nargs='?', const=True, help='Gradio share value')
323
  parser.add_argument('--server_name', type=str, default=None, help='Gradio server host')
324
  parser.add_argument('--server_port', type=int, default=None, help='Gradio server port')
 
327
  parser.add_argument('--password', type=str, default=None, help='Gradio authentication password')
328
  parser.add_argument('--theme', type=str, default=None, help='Gradio Blocks theme')
329
  parser.add_argument('--colab', type=str2bool, default=False, nargs='?', const=True, help='Is colab user or not')
330
+ parser.add_argument('--api_open', type=str2bool, default=False, nargs='?', const=True,
331
+ help='Enable api or not in Gradio')
332
+ parser.add_argument('--inbrowser', type=str2bool, default=True, nargs='?', const=True,
333
+ help='Whether to automatically start Gradio app or not')
334
  parser.add_argument('--whisper_model_dir', type=str, default=WHISPER_MODELS_DIR,
335
  help='Directory path of the whisper model')
336
  parser.add_argument('--faster_whisper_model_dir', type=str, default=FASTER_WHISPER_MODELS_DIR,
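
The rewritten --whisper_type argument takes its default and choices from a WhisperImpl enum imported via modules.whisper.data_classes. A hedged sketch of what that enum likely contains, reconstructed from the old help text; the real definition may differ:

from enum import Enum

class WhisperImpl(Enum):
    # Assumed members, based on the implementations named in the old help string.
    WHISPER = "whisper"
    FASTER_WHISPER = "faster-whisper"
    INSANELY_FAST_WHISPER = "insanely-fast-whisper"

# app.py then derives the CLI surface from the enum values:
#   default=WhisperImpl.FASTER_WHISPER.value
#   choices=[item.value for item in WhisperImpl]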
configs/default_parameters.yaml CHANGED
@@ -1,5 +1,6 @@
  whisper:
    model_size: "large-v2"
+   file_format: "SRT"
    lang: "Automatic Detection"
    is_translate: false
    beam_size: 5
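
A short sketch of how the new file_format default is consumed at startup, reusing load_yaml and DEFAULT_PARAMETERS_CONFIG_PATH exactly as the app.py diff above does; the print is only illustrative:

from modules.utils.files_manager import load_yaml
from modules.utils.paths import DEFAULT_PARAMETERS_CONFIG_PATH

default_params = load_yaml(DEFAULT_PARAMETERS_CONFIG_PATH)
whisper_params = default_params["whisper"]

# "SRT" unless the YAML above is edited; create_pipeline_inputs() uses this
# value as the initial selection of the File Format dropdown.
print(whisper_params["file_format"])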
configs/translation.yaml ADDED
@@ -0,0 +1,459 @@
 
 
 
1
+ en: # English
2
+ Language: Language
3
+ File: File
4
+ Youtube: Youtube
5
+ Mic: Mic
6
+ T2T Translation: T2T Translation
7
+ BGM Separation: BGM Separation
8
+ GENERATE SUBTITLE FILE: GENERATE SUBTITLE FILE
9
+ Output: Output
10
+ Downloadable output file: Downloadable output file
11
+ Upload File here: Upload File here
12
+ Model: Model
13
+ Automatic Detection: Automatic Detection
14
+ File Format: File Format
15
+ Translate to English?: Translate to English?
16
+ Add a timestamp to the end of the filename: Add a timestamp to the end of the filename
17
+ Advanced Parameters: Advanced Parameters
18
+ Background Music Remover Filter: Background Music Remover Filter
19
+ Enabling this will remove background music: Enabling this will remove background music by submodel before transcribing
20
+ Enable Background Music Remover Filter: Enable Background Music Remover Filter
21
+ Save separated files to output: Save separated files to output
22
+ Offload sub model after removing background music: Offload sub model after removing background music
23
+ Voice Detection Filter: Voice Detection Filter
24
+ Enable this to transcribe only detected voice: Enable this to transcribe only detected voice parts by submodel.
25
+ Enable Silero VAD Filter: Enable Silero VAD Filter
26
+ Diarization: Diarization
27
+ Enable Diarization: Enable Diarization
28
+ HuggingFace Token: HuggingFace Token
29
+ This is only needed the first time you download the model: This is only needed the first time you download the model. If you already have models, you don't need to enter. To download the model, you must manually go to "https://huggingface.co/pyannote/speaker-diarization-3.1" and "https://huggingface.co/pyannote/segmentation-3.0" and agree to their requirement.
30
+ Device: Device
31
+ Youtube Link: Youtube Link
32
+ Youtube Thumbnail: Youtube Thumbnail
33
+ Youtube Title: Youtube Title
34
+ Youtube Description: Youtube Description
35
+ Record with Mic: Record with Mic
36
+ Upload Subtitle Files to translate here: Upload Subtitle Files to translate here
37
+ Your Auth Key (API KEY): Your Auth Key (API KEY)
38
+ Source Language: Source Language
39
+ Target Language: Target Language
40
+ Pro User?: Pro User?
41
+ TRANSLATE SUBTITLE FILE: TRANSLATE SUBTITLE FILE
42
+ Upload Audio Files to separate background music: Upload Audio Files to separate background music
43
+ Instrumental: Instrumental
44
+ Vocals: Vocals
45
+ SEPARATE BACKGROUND MUSIC: SEPARATE BACKGROUND MUSIC
46
+
47
+ ko: # Korean
48
+ Language: 언어
49
+ File: 파일
50
+ Youtube: 유튜브
51
+ Mic: 마이크
52
+ T2T Translation: T2T 자막 번역
53
+ BGM Separation: 배경 음악 분리
54
+ GENERATE SUBTITLE FILE: 자막 파일 생성
55
+ Output: 결과물
56
+ Downloadable output file: 결과물 파일 다운로드
57
+ Upload File here: 파일을 업로드 하세요
58
+ Model: 모델
59
+ Automatic Detection: 자동 감지
60
+ File Format: 파일 형식
61
+ Translate to English?: 영어로 번역합니까? (위스퍼 모델 자체 번역 기능)
62
+ Add a timestamp to the end of the filename: 파일 이름 끝에 타임스태프 붙이기
63
+ Advanced Parameters: 고급 변수
64
+ Background Music Remover Filter: 배경 음악 제거 필터
65
+ Enabling this will remove background music: 받아쓰기 이전에 먼저 배경 음악 제거용 서브 모델을 활성화 합니다.
66
+ Enable Background Music Remover Filter: 배경 음악 제거 필터 활성화
67
+ Save separated files to output: 분리된 배경 음악 & 음성 파일 따로 출력 폴더에 저장
68
+ Offload sub model after removing background music: 배경 음악 제거 후 서브 모델을 비활성화 합니다. (VRAM 이 부족할 시 체크하세요.)
69
+ Voice Detection Filter: 목소리 감지 필터
70
+ Enable this to transcribe only detected voice: 서브 모델에 의해 목소리라고 판단된 부분만 받아쓰기를 진행합니다.
71
+ Enable Silero VAD Filter: Silero VAD 필터 활성화
72
+ Diarization: 화자 구분
73
+ Enable Diarization: 화자 구분 활성화
74
+ HuggingFace Token: 허깅페이스 토큰
75
+ This is only needed the first time you download the model: 모델을 처음 다운받을 때만 토큰이 필요합니다. 이미 다운로드 받으신 상태라면 입력하지 않아도 됩니다. 모델을 다운 받기 위해선 "https://huggingface.co/pyannote/speaker-diarization-3.1" 와 "https://huggingface.co/pyannote/segmentation-3.0" 에서 먼저 사용 지침에 동의하셔야 합니다.
76
+ Device: 디바이스
77
+ Youtube Link: 유튜브 링크
78
+ Youtube Thumbnail: 유튜브 썸네일
79
+ Youtube Title: 유튜브 제목
80
+ Youtube Description: 유튜브 설명
81
+ Record with Mic: 마이크로 녹음하세요
82
+ Upload Subtitle Files to translate here: 번역할 자막 파일을 업로드 하세요
83
+ Your Auth Key (API KEY): DeepL API 키
84
+ Source Language: 원본 언어
85
+ Target Language: 대상 언어
86
+ Pro User?: Pro 버전 사용자
87
+ TRANSLATE SUBTITLE FILE: 자막 파일 번역
88
+ Upload Audio Files to separate background music: 배경 음악을 분리할 오디오 파일을 업로드 하세요
89
+ Instrumental: 악기
90
+ Vocals: 보컬
91
+ SEPARATE BACKGROUND MUSIC: 배경 음악 분리
92
+
93
+ ja: # Japanese
94
+ Language: 言語
95
+ File: File
96
+ Youtube: Youtube
97
+ Mic: Mic
98
+ T2T Translation: T2T Translation
99
+ BGM Separation: BGM Separation
100
+ GENERATE SUBTITLE FILE: GENERATE SUBTITLE FILE
101
+ Output: Output
102
+ Downloadable output file: Downloadable output file
103
+ Upload File here: Upload File here
104
+ Model: Model
105
+ Automatic Detection: Automatic Detection
106
+ File Format: File Format
107
+ Translate to English?: Translate to English?
108
+ Add a timestamp to the end of the filename: Add a timestamp to the end of the filename
109
+ Advanced Parameters: Advanced Parameters
110
+ Background Music Remover Filter: Background Music Remover Filter
111
+ Enabling this will remove background music: Enabling this will remove background music by submodel before transcribing
112
+ Enable Background Music Remover Filter: Enable Background Music Remover Filter
113
+ Save separated files to output: Save separated files to output
114
+ Offload sub model after removing background music: Offload sub model after removing background music
115
+ Voice Detection Filter: Voice Detection Filter
116
+ Enable this to transcribe only detected voice: Enable this to transcribe only detected voice parts by submodel.
117
+ Enable Silero VAD Filter: Enable Silero VAD Filter
118
+ Diarization: Diarization
119
+ Enable Diarization: Enable Diarization
120
+ HuggingFace Token: HuggingFace Token
121
+ This is only needed the first time you download the model: This is only needed the first time you download the model. If you already have models, you don't need to enter. To download the model, you must manually go to "https://huggingface.co/pyannote/speaker-diarization-3.1" and "https://huggingface.co/pyannote/segmentation-3.0" and agree to their requirement.
122
+ Device: Device
123
+ Youtube Link: Youtube Link
124
+ Youtube Thumbnail: Youtube Thumbnail
125
+ Youtube Title: Youtube Title
126
+ Youtube Description: Youtube Description
127
+ Record with Mic: Record with Mic
128
+ Upload Subtitle Files to translate here: Upload Subtitle Files to translate here
129
+ Your Auth Key (API KEY): Your Auth Key (API KEY)
130
+ Source Language: Source Language
131
+ Target Language: Target Language
132
+ Pro User?: Pro User?
133
+ TRANSLATE SUBTITLE FILE: TRANSLATE SUBTITLE FILE
134
+ Upload Audio Files to separate background music: Upload Audio Files to separate background music
135
+ Instrumental: Instrumental
136
+ Vocals: Vocals
137
+ SEPARATE BACKGROUND MUSIC: SEPARATE BACKGROUND MUSIC
138
+
139
+ es: # Spanish
140
+ Language: Idioma
141
+ File: File
142
+ Youtube: Youtube
143
+ Mic: Mic
144
+ T2T Translation: T2T Translation
145
+ BGM Separation: BGM Separation
146
+ GENERATE SUBTITLE FILE: GENERATE SUBTITLE FILE
147
+ Output: Output
148
+ Downloadable output file: Downloadable output file
149
+ Upload File here: Upload File here
150
+ Model: Model
151
+ Automatic Detection: Automatic Detection
152
+ File Format: File Format
153
+ Translate to English?: Translate to English?
154
+ Add a timestamp to the end of the filename: Add a timestamp to the end of the filename
155
+ Advanced Parameters: Advanced Parameters
156
+ Background Music Remover Filter: Background Music Remover Filter
157
+ Enabling this will remove background music: Enabling this will remove background music by submodel before transcribing
158
+ Enable Background Music Remover Filter: Enable Background Music Remover Filter
159
+ Save separated files to output: Save separated files to output
160
+ Offload sub model after removing background music: Offload sub model after removing background music
161
+ Voice Detection Filter: Voice Detection Filter
162
+ Enable this to transcribe only detected voice: Enable this to transcribe only detected voice parts by submodel.
163
+ Enable Silero VAD Filter: Enable Silero VAD Filter
164
+ Diarization: Diarization
165
+ Enable Diarization: Enable Diarization
166
+ HuggingFace Token: HuggingFace Token
167
+ This is only needed the first time you download the model: This is only needed the first time you download the model. If you already have models, you don't need to enter. To download the model, you must manually go to "https://huggingface.co/pyannote/speaker-diarization-3.1" and "https://huggingface.co/pyannote/segmentation-3.0" and agree to their requirement.
168
+ Device: Device
169
+ Youtube Link: Youtube Link
170
+ Youtube Thumbnail: Youtube Thumbnail
171
+ Youtube Title: Youtube Title
172
+ Youtube Description: Youtube Description
173
+ Record with Mic: Record with Mic
174
+ Upload Subtitle Files to translate here: Upload Subtitle Files to translate here
175
+ Your Auth Key (API KEY): Your Auth Key (API KEY)
176
+ Source Language: Source Language
177
+ Target Language: Target Language
178
+ Pro User?: Pro User?
179
+ TRANSLATE SUBTITLE FILE: TRANSLATE SUBTITLE FILE
180
+ Upload Audio Files to separate background music: Upload Audio Files to separate background music
181
+ Instrumental: Instrumental
182
+ Vocals: Vocals
183
+ SEPARATE BACKGROUND MUSIC: SEPARATE BACKGROUND MUSIC
184
+
185
+ fr: # French
186
+ Language: Langue
187
+ File: File
188
+ Youtube: Youtube
189
+ Mic: Mic
190
+ T2T Translation: T2T Translation
191
+ BGM Separation: BGM Separation
192
+ GENERATE SUBTITLE FILE: GENERATE SUBTITLE FILE
193
+ Output: Output
194
+ Downloadable output file: Downloadable output file
195
+ Upload File here: Upload File here
196
+ Model: Model
197
+ Automatic Detection: Automatic Detection
198
+ File Format: File Format
199
+ Translate to English?: Translate to English?
200
+ Add a timestamp to the end of the filename: Add a timestamp to the end of the filename
201
+ Advanced Parameters: Advanced Parameters
202
+ Background Music Remover Filter: Background Music Remover Filter
203
+ Enabling this will remove background music: Enabling this will remove background music by submodel before transcribing
204
+ Enable Background Music Remover Filter: Enable Background Music Remover Filter
205
+ Save separated files to output: Save separated files to output
206
+ Offload sub model after removing background music: Offload sub model after removing background music
207
+ Voice Detection Filter: Voice Detection Filter
208
+ Enable this to transcribe only detected voice: Enable this to transcribe only detected voice parts by submodel.
209
+ Enable Silero VAD Filter: Enable Silero VAD Filter
210
+ Diarization: Diarization
211
+ Enable Diarization: Enable Diarization
212
+ HuggingFace Token: HuggingFace Token
213
+ This is only needed the first time you download the model: This is only needed the first time you download the model. If you already have models, you don't need to enter. To download the model, you must manually go to "https://huggingface.co/pyannote/speaker-diarization-3.1" and "https://huggingface.co/pyannote/segmentation-3.0" and agree to their requirement.
214
+ Device: Device
215
+ Youtube Link: Youtube Link
216
+ Youtube Thumbnail: Youtube Thumbnail
217
+ Youtube Title: Youtube Title
218
+ Youtube Description: Youtube Description
219
+ Record with Mic: Record with Mic
220
+ Upload Subtitle Files to translate here: Upload Subtitle Files to translate here
221
+ Your Auth Key (API KEY): Your Auth Key (API KEY)
222
+ Source Language: Source Language
223
+ Target Language: Target Language
224
+ Pro User?: Pro User?
225
+ TRANSLATE SUBTITLE FILE: TRANSLATE SUBTITLE FILE
226
+ Upload Audio Files to separate background music: Upload Audio Files to separate background music
227
+ Instrumental: Instrumental
228
+ Vocals: Vocals
229
+ SEPARATE BACKGROUND MUSIC: SEPARATE BACKGROUND MUSIC
230
+
231
+ de: # German
232
+ Language: Sprache
233
+ File: File
234
+ Youtube: Youtube
235
+ Mic: Mic
236
+ T2T Translation: T2T Translation
237
+ BGM Separation: BGM Separation
238
+ GENERATE SUBTITLE FILE: GENERATE SUBTITLE FILE
239
+ Output: Output
240
+ Downloadable output file: Downloadable output file
241
+ Upload File here: Upload File here
242
+ Model: Model
243
+ Automatic Detection: Automatic Detection
244
+ File Format: File Format
245
+ Translate to English?: Translate to English?
246
+ Add a timestamp to the end of the filename: Add a timestamp to the end of the filename
247
+ Advanced Parameters: Advanced Parameters
248
+ Background Music Remover Filter: Background Music Remover Filter
249
+ Enabling this will remove background music: Enabling this will remove background music by submodel before transcribing
250
+ Enable Background Music Remover Filter: Enable Background Music Remover Filter
251
+ Save separated files to output: Save separated files to output
252
+ Offload sub model after removing background music: Offload sub model after removing background music
253
+ Voice Detection Filter: Voice Detection Filter
254
+ Enable this to transcribe only detected voice: Enable this to transcribe only detected voice parts by submodel.
255
+ Enable Silero VAD Filter: Enable Silero VAD Filter
256
+ Diarization: Diarization
257
+ Enable Diarization: Enable Diarization
258
+ HuggingFace Token: HuggingFace Token
259
+ This is only needed the first time you download the model: This is only needed the first time you download the model. If you already have the models, you don't need to enter it. To download the model, you must manually go to "https://huggingface.co/pyannote/speaker-diarization-3.1" and "https://huggingface.co/pyannote/segmentation-3.0" and agree to their requirements.
260
+ Device: Device
261
+ Youtube Link: Youtube Link
262
+ Youtube Thumbnail: Youtube Thumbnail
263
+ Youtube Title: Youtube Title
264
+ Youtube Description: Youtube Description
265
+ Record with Mic: Record with Mic
266
+ Upload Subtitle Files to translate here: Upload Subtitle Files to translate here
267
+ Your Auth Key (API KEY): Your Auth Key (API KEY)
268
+ Source Language: Source Language
269
+ Target Language: Target Language
270
+ Pro User?: Pro User?
271
+ TRANSLATE SUBTITLE FILE: TRANSLATE SUBTITLE FILE
272
+ Upload Audio Files to separate background music: Upload Audio Files to separate background music
273
+ Instrumental: Instrumental
274
+ Vocals: Vocals
275
+ SEPARATE BACKGROUND MUSIC: SEPARATE BACKGROUND MUSIC
276
+
277
+ zh: # Chinese
278
+ Language: 语言
279
+ File: File
280
+ Youtube: Youtube
281
+ Mic: Mic
282
+ T2T Translation: T2T Translation
283
+ BGM Separation: BGM Separation
284
+ GENERATE SUBTITLE FILE: GENERATE SUBTITLE FILE
285
+ Output: Output
286
+ Downloadable output file: Downloadable output file
287
+ Upload File here: Upload File here
288
+ Model: Model
289
+ Automatic Detection: Automatic Detection
290
+ File Format: File Format
291
+ Translate to English?: Translate to English?
292
+ Add a timestamp to the end of the filename: Add a timestamp to the end of the filename
293
+ Advanced Parameters: Advanced Parameters
294
+ Background Music Remover Filter: Background Music Remover Filter
295
+ Enabling this will remove background music: Enabling this will remove background music by submodel before transcribing
296
+ Enable Background Music Remover Filter: Enable Background Music Remover Filter
297
+ Save separated files to output: Save separated files to output
298
+ Offload sub model after removing background music: Offload sub model after removing background music
299
+ Voice Detection Filter: Voice Detection Filter
300
+ Enable this to transcribe only detected voice: Enable this to transcribe only detected voice parts by submodel.
301
+ Enable Silero VAD Filter: Enable Silero VAD Filter
302
+ Diarization: Diarization
303
+ Enable Diarization: Enable Diarization
304
+ HuggingFace Token: HuggingFace Token
305
+ This is only needed the first time you download the model: This is only needed the first time you download the model. If you already have the models, you don't need to enter it. To download the model, you must manually go to "https://huggingface.co/pyannote/speaker-diarization-3.1" and "https://huggingface.co/pyannote/segmentation-3.0" and agree to their requirements.
306
+ Device: Device
307
+ Youtube Link: Youtube Link
308
+ Youtube Thumbnail: Youtube Thumbnail
309
+ Youtube Title: Youtube Title
310
+ Youtube Description: Youtube Description
311
+ Record with Mic: Record with Mic
312
+ Upload Subtitle Files to translate here: Upload Subtitle Files to translate here
313
+ Your Auth Key (API KEY): Your Auth Key (API KEY)
314
+ Source Language: Source Language
315
+ Target Language: Target Language
316
+ Pro User?: Pro User?
317
+ TRANSLATE SUBTITLE FILE: TRANSLATE SUBTITLE FILE
318
+ Upload Audio Files to separate background music: Upload Audio Files to separate background music
319
+ Instrumental: Instrumental
320
+ Vocals: Vocals
321
+ SEPARATE BACKGROUND MUSIC: SEPARATE BACKGROUND MUSIC
322
+
323
+ uk: # Ukrainian
324
+ Language: Мова
325
+ File: Файл
326
+ Youtube: Youtube
327
+ Mic: Мікрофон
328
+ T2T Translation: T2T Переклад
329
+ BGM Separation: Розділення фонової музики
330
+ GENERATE SUBTITLE FILE: СТВОРИТИ ФАЙЛ СУБТИТРІВ
331
+ Output: Результат
332
+ Downloadable output file: Завантажуваний файл результату
333
+ Upload File here: Завантажте файл тут
334
+ Model: Модель
335
+ Automatic Detection: Автоматичне визначення
336
+ File Format: Формат файлу
337
+ Translate to English?: Перекласти на англійську?
338
+ Add a timestamp to the end of the filename: Додати мітку часу до кінця імені файлу
339
+ Advanced Parameters: Розширені параметри
340
+ Background Music Remover Filter: Фільтр видалення фонової музики
341
+ Enabling this will remove background music: Увімкнення цього видалить фонову музику за допомогою підмоделі перед транскрипцією
342
+ Enable Background Music Remover Filter: Увімкнути фільтр видалення фонової музики
343
+ Save separated files to output: Зберегти розділені файли до вихідної папки
344
+ Offload sub model after removing background music: Вивантажити підмодель після видалення фонової музики
345
+ Voice Detection Filter: Фільтр розпізнавання голосу
346
+ Enable this to transcribe only detected voice: Увімкніть це, щоб транскрибувати лише розпізнані голосові частини за допомогою підмоделі
347
+ Enable Silero VAD Filter: Увімкнути фільтр Silero VAD
348
+ Diarization: Діаризація
349
+ Enable Diarization: Увімкнути діаризацію
350
+ HuggingFace Token: Токен HuggingFace
351
+ This is only needed the first time you download the model: Це потрібно лише при першому завантаженні моделі. Якщо у вас вже є моделі, вводити не потрібно. Щоб завантажити модель, потрібно вручну перейти на "https://huggingface.co/pyannote/speaker-diarization-3.1" та "https://huggingface.co/pyannote/segmentation-3.0" і погодитися з їхніми вимогами.
352
+ Device: Пристрій
353
+ Youtube Link: Посилання на Youtube
354
+ Youtube Thumbnail: Ескіз Youtube
355
+ Youtube Title: Назва Youtube
356
+ Youtube Description: Опис Youtube
357
+ Record with Mic: Записати з мікрофона
358
+ Upload Subtitle Files to translate here: Завантажте файли субтитрів для перекладу тут
359
+ Your Auth Key (API KEY): Ваш ключ авторизації (API KEY)
360
+ Source Language: Мова джерела
361
+ Target Language: Мова перекладу
362
+ Pro User?: Професійний користувач?
363
+ TRANSLATE SUBTITLE FILE: ПЕРЕКЛАСТИ ФАЙЛ СУБТИТРІВ
364
+ Upload Audio Files to separate background music: Завантажте аудіофайли для розділення фонової музики
365
+ Instrumental: Інструментал
366
+ Vocals: Вокал
367
+ SEPARATE BACKGROUND MUSIC: РОЗДІЛИТИ ФОНОВУ МУЗИКУ
368
+
369
+ ru: # Russian
370
+ Language: Язык
371
+ File: Файл
372
+ Youtube: Youtube
373
+ Mic: Микрофон
374
+ T2T Translation: Перевод T2T
375
+ BGM Separation: Разделение фоновой музыки
376
+ GENERATE SUBTITLE FILE: СГЕНЕРИРОВАТЬ ФАЙЛ СУБТИТРОВ
377
+ Output: Результат
378
+ Downloadable output file: Загружаемый файл результата
379
+ Upload File here: Загрузите файл здесь
380
+ Model: Модель
381
+ Automatic Detection: Автоматическое определение
382
+ File Format: Формат файла
383
+ Translate to English?: Перевести на английский?
384
+ Add a timestamp to the end of the filename: Добавить метку времени в конец имени файла
385
+ Advanced Parameters: Расширенные параметры
386
+ Background Music Remover Filter: Фильтр удаления фоновой музыки
387
+ Enabling this will remove background music: Включение этого удалит фоновую музыку с помощью подмодели перед транскрипцией
388
+ Enable Background Music Remover Filter: Включить фильтр удаления фоновой музыки
389
+ Save separated files to output: Сохранить разделенные файлы в выходную папку
390
+ Offload sub model after removing background music: Выгрузить подмодель после удаления фоновой музыки
391
+ Voice Detection Filter: Фильтр обнаружения голоса
392
+ Enable this to transcribe only detected voice: Включите это, чтобы транскрибировать только обнаруженные голосовые части с помощью подмодели
393
+ Enable Silero VAD Filter: Включить фильтр Silero VAD
394
+ Diarization: Диаризация
395
+ Enable Diarization: Включить диаризацию
396
+ HuggingFace Token: Токен HuggingFace
397
+ This is only needed the first time you download the model: Это нужно только при первом скачивании модели. Если у вас уже есть модели, вводить не нужно. Чтобы скачать модель, нужно вручную перейти на "https://huggingface.co/pyannote/speaker-diarization-3.1" и "https://huggingface.co/pyannote/segmentation-3.0" и согласиться с их требованиями.
398
+ Device: Устройство
399
+ Youtube Link: Ссылка на Youtube
400
+ Youtube Thumbnail: Миниатюра Youtube
401
+ Youtube Title: Название Youtube
402
+ Youtube Description: Описание Youtube
403
+ Record with Mic: Записать с микрофона
404
+ Upload Subtitle Files to translate here: Загрузите файлы субтитров для перевода здесь
405
+ Your Auth Key (API KEY): Ваш Auth Key (API KEY)
406
+ Source Language: Исходный язык
407
+ Target Language: Целевой язык
408
+ Pro User?: Профессиональный пользователь?
409
+ TRANSLATE SUBTITLE FILE: ПЕРЕВЕСТИ ФАЙЛ СУБТИТРОВ
410
+ Upload Audio Files to separate background music: Загрузите аудиофайлы для разделения фоновой музыки
411
+ Instrumental: Инструментал
412
+ Vocals: Вокал
413
+ SEPARATE BACKGROUND MUSIC: РАЗДЕЛИТЬ ФОНОВУЮ МУЗЫКУ
414
+
415
+ tr: # Turkish
416
+ Language: Dil
417
+ File: Dosya
418
+ Youtube: Youtube
419
+ Mic: Mikrofon
420
+ T2T Translation: T2T Çeviri
421
+ BGM Separation: Arka Plan Müziği Ayırma
422
+ GENERATE SUBTITLE FILE: ALTYAZI DOSYASI OLUŞTUR
423
+ Output: Çıktı
424
+ Downloadable output file: İndirilebilir çıktı dosyası
425
+ Upload File here: Dosya Yükle
426
+ Model: Model
427
+ Automatic Detection: Otomatik Algılama
428
+ File Format: Dosya Formatı
429
+ Translate to English?: İngilizceye Çevir?
430
+ Add a timestamp to the end of the filename: Dosya adının sonuna zaman damgası ekle
431
+ Advanced Parameters: Gelişmiş Parametreler
432
+ Background Music Remover Filter: Arka Plan Müziği Kaldırma Filtresi
433
+ Enabling this will remove background music: Bunu etkinleştirmek, arka plan müziğini alt model tarafından transkripsiyondan önce kaldıracaktır
434
+ Enable Background Music Remover Filter: Arka Plan Müziği Kaldırma Filtresini Etkinleştir
435
+ Save separated files to output: Ayrılmış dosyaları çıktıya kaydet
436
+ Offload sub model after removing background music: Arka plan müziği kaldırıldıktan sonra alt modeli devre dışı bırak
437
+ Voice Detection Filter: Ses Algılama Filtresi
438
+ Enable this to transcribe only detected voice: Bunu etkinleştirerek yalnızca alt model tarafından algılanan ses kısımlarını transkribe et
439
+ Enable Silero VAD Filter: Silero VAD Filtresini Etkinleştir
440
+ Diarization: Konuşmacı Ayrımı
441
+ Enable Diarization: Konuşmacı Ayrımını Etkinleştir
442
+ HuggingFace Token: HuggingFace Anahtarı
443
+ This is only needed the first time you download the model: Bu, modeli ilk kez indirirken gereklidir. Zaten modelleriniz varsa girmenize gerek yok. Modeli indirmek için "https://huggingface.co/pyannote/speaker-diarization-3.1" ve "https://huggingface.co/pyannote/segmentation-3.0" adreslerine gidip gereksinimlerini kabul etmeniz gerekiyor
444
+ Device: Cihaz
445
+ Youtube Link: Youtube Bağlantısı
446
+ Youtube Thumbnail: Youtube Küçük Resmi
447
+ Youtube Title: Youtube Başlığı
448
+ Youtube Description: Youtube Açıklaması
449
+ Record with Mic: Mikrofonla Kaydet
450
+ Upload Subtitle Files to translate here: Çeviri için altyazı dosyalarını buraya yükle
451
+ Your Auth Key (API KEY): Yetki Anahtarınız (API ANAHTARI)
452
+ Source Language: Kaynak Dil
453
+ Target Language: Hedef Dil
454
+ Pro User?: Pro Kullanıcı?
455
+ TRANSLATE SUBTITLE FILE: ALTYAZI DOSYASINI ÇEVİR
456
+ Upload Audio Files to separate background music: Arka plan müziğini ayırmak için ses dosyalarını yükle
457
+ Instrumental: Enstrümantal
458
+ Vocals: Vokal
459
+ SEPARATE BACKGROUND MUSIC: ARKA PLAN MÜZİĞİNİ AYIR
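Note: in the fr, de, and zh blocks above, most values still mirror their English keys, i.e. they are placeholders awaiting translation, while the uk, ru, and tr blocks are fully localized. Below is a small illustrative helper, not part of this commit, for listing such placeholders; it assumes the language blocks sit at the top level of configs/translation.yaml as the hunk suggests, and that PyYAML is available.

import yaml

def untranslated_keys(i18n_yaml_path: str, lang: str) -> list:
    # Entries whose value still equals the English key are untranslated placeholders.
    with open(i18n_yaml_path, "r", encoding="utf-8") as f:
        data = yaml.safe_load(f) or {}
    entries = data.get(lang, {}) or {}
    return [key for key, value in entries.items() if key == value]

# e.g. untranslated_keys("configs/translation.yaml", "fr") would report
# "File", "Youtube", "Mic", ... for the partially translated French block.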
modules/diarize/diarize_pipeline.py CHANGED
@@ -7,6 +7,7 @@ from pyannote.audio import Pipeline
7
  from typing import Optional, Union
8
  import torch
9
 
 
10
  from modules.utils.paths import DIARIZATION_MODELS_DIR
11
  from modules.diarize.audio_loader import load_audio, SAMPLE_RATE
12
 
@@ -43,6 +44,8 @@ class DiarizationPipeline:
43
 
44
  def assign_word_speakers(diarize_df, transcript_result, fill_nearest=False):
45
  transcript_segments = transcript_result["segments"]
 
 
46
  for seg in transcript_segments:
47
  # assign speaker to segment (if any)
48
  diarize_df['intersection'] = np.minimum(diarize_df['end'], seg['end']) - np.maximum(diarize_df['start'],
@@ -63,7 +66,7 @@ def assign_word_speakers(diarize_df, transcript_result, fill_nearest=False):
63
  seg["speaker"] = speaker
64
 
65
  # assign speaker to words
66
- if 'words' in seg:
67
  for word in seg['words']:
68
  if 'start' in word:
69
  diarize_df['intersection'] = np.minimum(diarize_df['end'], word['end']) - np.maximum(
@@ -85,10 +88,10 @@ def assign_word_speakers(diarize_df, transcript_result, fill_nearest=False):
85
  if word_speaker is not None:
86
  word["speaker"] = word_speaker
87
 
88
- return transcript_result
89
 
90
 
91
- class Segment:
92
  def __init__(self, start, end, speaker=None):
93
  self.start = start
94
  self.end = end
 
7
  from typing import Optional, Union
8
  import torch
9
 
10
+ from modules.whisper.data_classes import *
11
  from modules.utils.paths import DIARIZATION_MODELS_DIR
12
  from modules.diarize.audio_loader import load_audio, SAMPLE_RATE
13
 
 
44
 
45
  def assign_word_speakers(diarize_df, transcript_result, fill_nearest=False):
46
  transcript_segments = transcript_result["segments"]
47
+ if transcript_segments and isinstance(transcript_segments[0], Segment):
48
+ transcript_segments = [seg.model_dump() for seg in transcript_segments]
49
  for seg in transcript_segments:
50
  # assign speaker to segment (if any)
51
  diarize_df['intersection'] = np.minimum(diarize_df['end'], seg['end']) - np.maximum(diarize_df['start'],
 
66
  seg["speaker"] = speaker
67
 
68
  # assign speaker to words
69
+ if 'words' in seg and seg['words'] is not None:
70
  for word in seg['words']:
71
  if 'start' in word:
72
  diarize_df['intersection'] = np.minimum(diarize_df['end'], word['end']) - np.maximum(
 
88
  if word_speaker is not None:
89
  word["speaker"] = word_speaker
90
 
91
+ return {"segments": transcript_segments}
92
 
93
 
94
+ class DiarizationSegment:
95
  def __init__(self, start, end, speaker=None):
96
  self.start = start
97
  self.end = end
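The reworked assign_word_speakers above now accepts either plain dicts or the new pydantic Segment models (dumping the latter via model_dump) and always returns a plain {"segments": [...]} dict; the old Segment helper class is renamed to DiarizationSegment to avoid the name clash. A minimal sketch of the call pattern with hypothetical values, assuming a pandas DataFrame with start/end/speaker columns as produced by the diarization pipeline:

import pandas as pd
from modules.whisper.data_classes import Segment
from modules.diarize.diarize_pipeline import assign_word_speakers

# One diarized speaker turn covering the whole clip (hypothetical values).
diarize_df = pd.DataFrame([{"start": 0.0, "end": 3.0, "speaker": "SPEAKER_00"}])
transcript = {"segments": [Segment(start=0.5, end=2.5, text="hello there")]}

result = assign_word_speakers(diarize_df, transcript)
print(result["segments"][0].get("speaker"))  # expected: SPEAKER_00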
modules/diarize/diarizer.py CHANGED
@@ -1,6 +1,6 @@
1
  import os
2
  import torch
3
- from typing import List, Union, BinaryIO, Optional
4
  import numpy as np
5
  import time
6
  import logging
@@ -9,6 +9,7 @@ import spaces
9
  from modules.utils.paths import DIARIZATION_MODELS_DIR
10
  from modules.diarize.diarize_pipeline import DiarizationPipeline, assign_word_speakers
11
  from modules.diarize.audio_loader import load_audio
 
12
 
13
 
14
  class Diarizer:
@@ -25,10 +26,10 @@ class Diarizer:
25
  @spaces.GPU
26
  def run(self,
27
  audio: Union[str, BinaryIO, np.ndarray],
28
- transcribed_result: List[dict],
29
  use_auth_token: str,
30
  device: Optional[str] = None
31
- ):
32
  """
33
  Diarize transcribed result as a post-processing
34
 
@@ -36,7 +37,7 @@ class Diarizer:
36
  ----------
37
  audio: Union[str, BinaryIO, np.ndarray]
38
  Audio input. This can be file path or binary type.
39
- transcribed_result: List[dict]
40
  transcribed result through whisper.
41
  use_auth_token: str
42
  Huggingface token with READ permission. This is only needed the first time you download the model.
@@ -46,8 +47,8 @@ class Diarizer:
46
 
47
  Returns
48
  ----------
49
- segments_result: List[dict]
50
- list of dicts that includes start, end timestamps and transcribed text
51
  elapsed_time: float
52
  elapsed time for running
53
  """
@@ -70,14 +71,20 @@ class Diarizer:
70
  {"segments": transcribed_result}
71
  )
72
 
 
73
  for segment in diarized_result["segments"]:
74
  speaker = "None"
75
  if "speaker" in segment:
76
  speaker = segment["speaker"]
77
- segment["text"] = speaker + "|" + segment["text"].strip()
 
 
 
 
 
78
 
79
  elapsed_time = time.time() - start_time
80
- return diarized_result["segments"], elapsed_time
81
 
82
  @spaces.GPU
83
  def update_pipe(self,
 
1
  import os
2
  import torch
3
+ from typing import List, Union, BinaryIO, Optional, Tuple
4
  import numpy as np
5
  import time
6
  import logging
 
9
  from modules.utils.paths import DIARIZATION_MODELS_DIR
10
  from modules.diarize.diarize_pipeline import DiarizationPipeline, assign_word_speakers
11
  from modules.diarize.audio_loader import load_audio
12
+ from modules.whisper.data_classes import *
13
 
14
 
15
  class Diarizer:
 
26
  @spaces.GPU
27
  def run(self,
28
  audio: Union[str, BinaryIO, np.ndarray],
29
+ transcribed_result: List[Segment],
30
  use_auth_token: str,
31
  device: Optional[str] = None
32
+ ) -> Tuple[List[Segment], float]:
33
  """
34
  Diarize transcribed result as a post-processing
35
 
 
37
  ----------
38
  audio: Union[str, BinaryIO, np.ndarray]
39
  Audio input. This can be file path or binary type.
40
+ transcribed_result: List[Segment]
41
  transcribed result through whisper.
42
  use_auth_token: str
43
  Huggingface token with READ permission. This is only needed the first time you download the model.
 
47
 
48
  Returns
49
  ----------
50
+ segments_result: List[Segment]
51
+ list of Segment that includes start, end timestamps and transcribed text
52
  elapsed_time: float
53
  elapsed time for running
54
  """
 
71
  {"segments": transcribed_result}
72
  )
73
 
74
+ segments_result = []
75
  for segment in diarized_result["segments"]:
76
  speaker = "None"
77
  if "speaker" in segment:
78
  speaker = segment["speaker"]
79
+ diarized_text = speaker + "|" + segment["text"].strip()
80
+ segments_result.append(Segment(
81
+ start=segment["start"],
82
+ end=segment["end"],
83
+ text=diarized_text
84
+ ))
85
 
86
  elapsed_time = time.time() - start_time
87
+ return segments_result, elapsed_time
88
 
89
  @spaces.GPU
90
  def update_pipe(self,
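Diarizer.run now takes and returns List[Segment] instead of lists of dicts, and the detected speaker is folded into each segment's text as a "SPEAKER_XX|..." prefix. A sketch of the new call contract, assuming the default constructor arguments and a hypothetical input file and token:

from modules.diarize.diarizer import Diarizer
from modules.whisper.data_classes import Segment

diarizer = Diarizer()
segments = [Segment(start=0.5, end=2.5, text="hello there")]

diarized, elapsed = diarizer.run(
    audio="sample.wav",          # hypothetical input file
    transcribed_result=segments,
    use_auth_token="hf_xxx",     # only needed for the first model download
)
print(diarized[0].text)          # e.g. "SPEAKER_00|hello there"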
modules/translation/deepl_api.py CHANGED
@@ -5,6 +5,7 @@ from datetime import datetime
5
  import gradio as gr
6
 
7
  from modules.utils.paths import TRANSLATION_OUTPUT_DIR, DEFAULT_PARAMETERS_CONFIG_PATH
 
8
  from modules.utils.subtitle_manager import *
9
  from modules.utils.files_manager import load_yaml, save_yaml
10
 
@@ -50,7 +51,7 @@ DEEPL_AVAILABLE_TARGET_LANGS = {
50
  }
51
 
52
  DEEPL_AVAILABLE_SOURCE_LANGS = {
53
- 'Automatic Detection': None,
54
  'Bulgarian': 'BG',
55
  'Czech': 'CS',
56
  'Danish': 'DA',
@@ -138,37 +139,27 @@ class DeepLAPI:
138
  )
139
 
140
  files_info = {}
141
- for fileobj in fileobjs:
142
- file_path = fileobj
143
- file_name, file_ext = os.path.splitext(os.path.basename(fileobj))
144
-
145
- if file_ext == ".srt":
146
- parsed_dicts = parse_srt(file_path=file_path)
147
-
148
- elif file_ext == ".vtt":
149
- parsed_dicts = parse_vtt(file_path=file_path)
150
 
151
  batch_size = self.max_text_batch_size
152
- for batch_start in range(0, len(parsed_dicts), batch_size):
153
- batch_end = min(batch_start + batch_size, len(parsed_dicts))
154
- sentences_to_translate = [dic["sentence"] for dic in parsed_dicts[batch_start:batch_end]]
155
  translated_texts = self.request_deepl_translate(auth_key, sentences_to_translate, source_lang,
156
  target_lang, is_pro)
157
  for i, translated_text in enumerate(translated_texts):
158
- parsed_dicts[batch_start + i]["sentence"] = translated_text["text"]
159
- progress(batch_end / len(parsed_dicts), desc="Translating..")
160
-
161
- if file_ext == ".srt":
162
- subtitle = get_serialized_srt(parsed_dicts)
163
- elif file_ext == ".vtt":
164
- subtitle = get_serialized_vtt(parsed_dicts)
165
-
166
- if add_timestamp:
167
- timestamp = datetime.now().strftime("%m%d%H%M%S")
168
- file_name += f"-{timestamp}"
169
-
170
- output_path = os.path.join(self.output_dir, f"{file_name}{file_ext}")
171
- write_file(subtitle, output_path)
172
 
173
  files_info[file_name] = {"subtitle": subtitle, "path": output_path}
174
 
 
5
  import gradio as gr
6
 
7
  from modules.utils.paths import TRANSLATION_OUTPUT_DIR, DEFAULT_PARAMETERS_CONFIG_PATH
8
+ from modules.utils.constants import AUTOMATIC_DETECTION
9
  from modules.utils.subtitle_manager import *
10
  from modules.utils.files_manager import load_yaml, save_yaml
11
 
 
51
  }
52
 
53
  DEEPL_AVAILABLE_SOURCE_LANGS = {
54
+ AUTOMATIC_DETECTION: None,
55
  'Bulgarian': 'BG',
56
  'Czech': 'CS',
57
  'Danish': 'DA',
 
139
  )
140
 
141
  files_info = {}
142
+ for file_path in fileobjs:
143
+ file_name, file_ext = os.path.splitext(os.path.basename(file_path))
144
+ writer = get_writer(file_ext, self.output_dir)
145
+ segments = writer.to_segments(file_path)
 
 
 
 
 
146
 
147
  batch_size = self.max_text_batch_size
148
+ for batch_start in range(0, len(segments), batch_size):
149
+ progress(batch_start / len(segments), desc="Translating..")
150
+ sentences_to_translate = [seg.text for seg in segments[batch_start:batch_start+batch_size]]
151
  translated_texts = self.request_deepl_translate(auth_key, sentences_to_translate, source_lang,
152
  target_lang, is_pro)
153
  for i, translated_text in enumerate(translated_texts):
154
+ segments[batch_start + i].text = translated_text["text"]
155
+
156
+ subtitle, output_path = generate_file(
157
+ output_dir=self.output_dir,
158
+ output_file_name=file_name,
159
+ output_format=file_ext,
160
+ result=segments,
161
+ add_timestamp=add_timestamp
162
+ )
 
 
 
 
 
163
 
164
  files_info[file_name] = {"subtitle": subtitle, "path": output_path}
165
 
modules/translation/nllb_inference.py CHANGED
@@ -4,10 +4,10 @@ import os
4
  import spaces
5
 
6
  from modules.utils.paths import TRANSLATION_OUTPUT_DIR, NLLB_MODELS_DIR
7
- from modules.translation.translation_base import TranslationBase
8
 
9
 
10
- class NLLBInference(TranslationBase):
11
  def __init__(self,
12
  model_dir: str = NLLB_MODELS_DIR,
13
  output_dir: str = TRANSLATION_OUTPUT_DIR
@@ -31,7 +31,7 @@ class NLLBInference(TranslationBase):
31
  text,
32
  max_length=max_length
33
  )
34
- return result[0]['translation_text']
35
 
36
  @spaces.GPU(duration=120)
37
  def update_model(self,
@@ -44,8 +44,7 @@ class NLLBInference(TranslationBase):
44
  if lang in NLLB_AVAILABLE_LANGS:
45
  return NLLB_AVAILABLE_LANGS[lang]
46
  elif lang not in NLLB_AVAILABLE_LANGS.values():
47
- raise ValueError(
48
- f"Language '{lang}' is not supported. Use one of: {list(NLLB_AVAILABLE_LANGS.keys())}")
49
  return lang
50
 
51
  src_lang = validate_language(src_lang)
 
4
  import spaces
5
 
6
  from modules.utils.paths import TRANSLATION_OUTPUT_DIR, NLLB_MODELS_DIR
7
+ import modules.translation.translation_base as base
8
 
9
 
10
+ class NLLBInference(base.TranslationBase):
11
  def __init__(self,
12
  model_dir: str = NLLB_MODELS_DIR,
13
  output_dir: str = TRANSLATION_OUTPUT_DIR
 
31
  text,
32
  max_length=max_length
33
  )
34
+ return result[0]["translation_text"]
35
 
36
  @spaces.GPU(duration=120)
37
  def update_model(self,
 
44
  if lang in NLLB_AVAILABLE_LANGS:
45
  return NLLB_AVAILABLE_LANGS[lang]
46
  elif lang not in NLLB_AVAILABLE_LANGS.values():
47
+ raise ValueError(f"Language '{lang}' is not supported. Use one of: {list(NLLB_AVAILABLE_LANGS.keys())}")
 
48
  return lang
49
 
50
  src_lang = validate_language(src_lang)
modules/translation/translation_base.py CHANGED
@@ -6,7 +6,8 @@ from typing import List
6
  from datetime import datetime
7
  import spaces
8
 
9
- from modules.whisper.whisper_parameter import *
 
10
  from modules.utils.subtitle_manager import *
11
  from modules.utils.files_manager import load_yaml, save_yaml
12
  from modules.utils.paths import DEFAULT_PARAMETERS_CONFIG_PATH, NLLB_MODELS_DIR, TRANSLATION_OUTPUT_DIR
@@ -98,32 +99,22 @@ class TranslationBase(ABC):
98
  files_info = {}
99
  for fileobj in fileobjs:
100
  file_name, file_ext = os.path.splitext(os.path.basename(fileobj))
101
- if file_ext == ".srt":
102
- parsed_dicts = parse_srt(file_path=fileobj)
103
- total_progress = len(parsed_dicts)
104
- for index, dic in enumerate(parsed_dicts):
105
- progress(index / total_progress, desc="Translating..")
106
- translated_text = self.translate(dic["sentence"], max_length=max_length)
107
- dic["sentence"] = translated_text
108
- subtitle = get_serialized_srt(parsed_dicts)
109
-
110
- elif file_ext == ".vtt":
111
- parsed_dicts = parse_vtt(file_path=fileobj)
112
- total_progress = len(parsed_dicts)
113
- for index, dic in enumerate(parsed_dicts):
114
- progress(index / total_progress, desc="Translating..")
115
- translated_text = self.translate(dic["sentence"], max_length=max_length)
116
- dic["sentence"] = translated_text
117
- subtitle = get_serialized_vtt(parsed_dicts)
118
-
119
- if add_timestamp:
120
- timestamp = datetime.now().strftime("%m%d%H%M%S")
121
- file_name += f"-{timestamp}"
122
-
123
- output_path = os.path.join(self.output_dir, f"{file_name}{file_ext}")
124
- write_file(subtitle, output_path)
125
-
126
- files_info[file_name] = {"subtitle": subtitle, "path": output_path}
127
 
128
  total_result = ''
129
  for file_name, info in files_info.items():
@@ -136,7 +127,8 @@ class TranslationBase(ABC):
136
  return [gr_str, output_file_paths]
137
 
138
  except Exception as e:
139
- print(f"Error: {str(e)}")
 
140
  finally:
141
  self.release_cuda_memory()
142
 
@@ -172,11 +164,17 @@ class TranslationBase(ABC):
172
  tgt_lang: str,
173
  max_length: int,
174
  add_timestamp: bool):
 
 
 
 
 
 
175
  cached_params = load_yaml(DEFAULT_PARAMETERS_CONFIG_PATH)
176
  cached_params["translation"]["nllb"] = {
177
  "model_size": model_size,
178
- "source_lang": src_lang,
179
- "target_lang": tgt_lang,
180
  "max_length": max_length,
181
  }
182
  cached_params["translation"]["add_timestamp"] = add_timestamp
 
6
  from datetime import datetime
7
  import spaces
8
 
9
+ import modules.translation.nllb_inference as nllb
10
+ from modules.whisper.data_classes import *
11
  from modules.utils.subtitle_manager import *
12
  from modules.utils.files_manager import load_yaml, save_yaml
13
  from modules.utils.paths import DEFAULT_PARAMETERS_CONFIG_PATH, NLLB_MODELS_DIR, TRANSLATION_OUTPUT_DIR
 
99
  files_info = {}
100
  for fileobj in fileobjs:
101
  file_name, file_ext = os.path.splitext(os.path.basename(fileobj))
102
+ writer = get_writer(file_ext, self.output_dir)
103
+ segments = writer.to_segments(fileobj)
104
+ for i, segment in enumerate(segments):
105
+ progress(i / len(segments), desc="Translating..")
106
+ translated_text = self.translate(segment.text, max_length=max_length)
107
+ segment.text = translated_text
108
+
109
+ subtitle, file_path = generate_file(
110
+ output_dir=self.output_dir,
111
+ output_file_name=file_name,
112
+ output_format=file_ext,
113
+ result=segments,
114
+ add_timestamp=add_timestamp
115
+ )
116
+
117
+ files_info[file_name] = {"subtitle": subtitle, "path": file_path}
 
 
 
 
 
 
 
 
 
 
118
 
119
  total_result = ''
120
  for file_name, info in files_info.items():
 
127
  return [gr_str, output_file_paths]
128
 
129
  except Exception as e:
130
+ print(f"Error translating file: {e}")
131
+ raise
132
  finally:
133
  self.release_cuda_memory()
134
 
 
164
  tgt_lang: str,
165
  max_length: int,
166
  add_timestamp: bool):
167
+ def validate_lang(lang: str):
168
+ if lang in list(nllb.NLLB_AVAILABLE_LANGS.values()):
169
+ flipped = {value: key for key, value in nllb.NLLB_AVAILABLE_LANGS.items()}
170
+ return flipped[lang]
171
+ return lang
172
+
173
  cached_params = load_yaml(DEFAULT_PARAMETERS_CONFIG_PATH)
174
  cached_params["translation"]["nllb"] = {
175
  "model_size": model_size,
176
+ "source_lang": validate_lang(src_lang),
177
+ "target_lang": validate_lang(tgt_lang),
178
  "max_length": max_length,
179
  }
180
  cached_params["translation"]["add_timestamp"] = add_timestamp
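The nested validate_lang helper ensures that default_parameters.yaml always caches the display name rather than the raw NLLB code, whichever of the two the UI hands over. A self-contained sketch of that mapping, assuming NLLB_AVAILABLE_LANGS maps display names to codes (e.g. "English": "eng_Latn"):

# Stand-in for modules.translation.nllb_inference.NLLB_AVAILABLE_LANGS
NLLB_AVAILABLE_LANGS = {"English": "eng_Latn", "Korean": "kor_Hang"}

def validate_lang(lang: str) -> str:
    # Flip a code such as "eng_Latn" back to its display name; pass names through.
    if lang in NLLB_AVAILABLE_LANGS.values():
        flipped = {code: name for name, code in NLLB_AVAILABLE_LANGS.items()}
        return flipped[lang]
    return lang

assert validate_lang("eng_Latn") == "English"
assert validate_lang("Korean") == "Korean"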
modules/utils/constants.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ from gradio_i18n import Translate, gettext as _
2
+
3
+ AUTOMATIC_DETECTION = _("Automatic Detection")
4
+ GRADIO_NONE_STR = ""
5
+ GRADIO_NONE_NUMBER_MAX = 9999
6
+ GRADIO_NONE_NUMBER_MIN = 0
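The GRADIO_NONE_* values appear intended as sentinels for gradio components that cannot represent None; the renamed base pipeline further below previously hard-coded the same 9999 check to turn max_speech_duration_s back into float('inf'). A hedged sketch of that conversion (resolve_max_speech_duration is a hypothetical helper, not part of the commit):

GRADIO_NONE_NUMBER_MAX = 9999  # mirrors the constant added above

def resolve_max_speech_duration(value):
    # gr.Number() cannot hold None, so the UI passes a large sentinel instead;
    # anything at or above the sentinel is treated as "no limit".
    return float("inf") if value is None or value >= GRADIO_NONE_NUMBER_MAX else value

assert resolve_max_speech_duration(9999) == float("inf")
assert resolve_max_speech_duration(30.0) == 30.0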
modules/utils/files_manager.py CHANGED
@@ -67,3 +67,9 @@ def is_video(file_path):
67
  video_extensions = ['.mp4', '.mkv', '.avi', '.mov', '.flv', '.wmv', '.webm', '.m4v', '.mpeg', '.mpg', '.3gp']
68
  extension = os.path.splitext(file_path)[1].lower()
69
  return extension in video_extensions
 
 
 
 
 
 
 
67
  video_extensions = ['.mp4', '.mkv', '.avi', '.mov', '.flv', '.wmv', '.webm', '.m4v', '.mpeg', '.mpg', '.3gp']
68
  extension = os.path.splitext(file_path)[1].lower()
69
  return extension in video_extensions
70
+
71
+
72
+ def read_file(file_path):
73
+ with open(file_path, "r", encoding="utf-8") as f:
74
+ subtitle_content = f.read()
75
+ return subtitle_content
modules/utils/paths.py CHANGED
@@ -10,6 +10,7 @@ DIARIZATION_MODELS_DIR = os.path.join(MODELS_DIR, "Diarization")
10
  UVR_MODELS_DIR = os.path.join(MODELS_DIR, "UVR", "MDX_Net_Models")
11
  CONFIGS_DIR = os.path.join(WEBUI_DIR, "configs")
12
  DEFAULT_PARAMETERS_CONFIG_PATH = os.path.join(CONFIGS_DIR, "default_parameters.yaml")
 
13
  OUTPUT_DIR = os.path.join(WEBUI_DIR, "outputs")
14
  TRANSLATION_OUTPUT_DIR = os.path.join(OUTPUT_DIR, "translations")
15
  UVR_OUTPUT_DIR = os.path.join(OUTPUT_DIR, "UVR")
 
10
  UVR_MODELS_DIR = os.path.join(MODELS_DIR, "UVR", "MDX_Net_Models")
11
  CONFIGS_DIR = os.path.join(WEBUI_DIR, "configs")
12
  DEFAULT_PARAMETERS_CONFIG_PATH = os.path.join(CONFIGS_DIR, "default_parameters.yaml")
13
+ I18N_YAML_PATH = os.path.join(CONFIGS_DIR, "translation.yaml")
14
  OUTPUT_DIR = os.path.join(WEBUI_DIR, "outputs")
15
  TRANSLATION_OUTPUT_DIR = os.path.join(OUTPUT_DIR, "translations")
16
  UVR_OUTPUT_DIR = os.path.join(OUTPUT_DIR, "UVR")
modules/utils/subtitle_manager.py CHANGED
@@ -1,123 +1,427 @@
 
 
 
 
1
  import re
 
 
 
 
 
 
 
2
 
3
  # Zero GPU
4
  import spaces
5
 
6
- def timeformat_srt(time):
7
- hours = time // 3600
8
- minutes = (time - hours * 3600) // 60
9
- seconds = time - hours * 3600 - minutes * 60
10
- milliseconds = (time - int(time)) * 1000
11
- return f"{int(hours):02d}:{int(minutes):02d}:{int(seconds):02d},{int(milliseconds):03d}"
12
-
13
-
14
- def timeformat_vtt(time):
15
- hours = time // 3600
16
- minutes = (time - hours * 3600) // 60
17
- seconds = time - hours * 3600 - minutes * 60
18
- milliseconds = (time - int(time)) * 1000
19
- return f"{int(hours):02d}:{int(minutes):02d}:{int(seconds):02d}.{int(milliseconds):03d}"
20
-
21
-
22
- def write_file(subtitle, output_file):
23
- with open(output_file, 'w', encoding='utf-8') as f:
24
- f.write(subtitle)
25
-
26
-
27
- def get_srt(segments):
28
- output = ""
29
- for i, segment in enumerate(segments):
30
- output += f"{i + 1}\n"
31
- output += f"{timeformat_srt(segment['start'])} --> {timeformat_srt(segment['end'])}\n"
32
- if segment['text'].startswith(' '):
33
- segment['text'] = segment['text'][1:]
34
- output += f"{segment['text']}\n\n"
35
- return output
36
-
37
-
38
- def get_vtt(segments):
39
- output = "WebVTT\n\n"
40
- for i, segment in enumerate(segments):
41
- output += f"{i + 1}\n"
42
- output += f"{timeformat_vtt(segment['start'])} --> {timeformat_vtt(segment['end'])}\n"
43
- if segment['text'].startswith(' '):
44
- segment['text'] = segment['text'][1:]
45
- output += f"{segment['text']}\n\n"
46
- return output
47
-
48
-
49
- def get_txt(segments):
50
- output = ""
51
- for i, segment in enumerate(segments):
52
- if segment['text'].startswith(' '):
53
- segment['text'] = segment['text'][1:]
54
- output += f"{segment['text']}\n"
55
- return output
56
-
57
-
58
- def parse_srt(file_path):
59
- """Reads SRT file and returns as dict"""
60
- with open(file_path, 'r', encoding='utf-8') as file:
61
- srt_data = file.read()
62
-
63
- data = []
64
- blocks = srt_data.split('\n\n')
65
-
66
- for block in blocks:
67
- if block.strip() != '':
68
- lines = block.strip().split('\n')
69
- index = lines[0]
70
- timestamp = lines[1]
71
- sentence = ' '.join(lines[2:])
72
-
73
- data.append({
74
- "index": index,
75
- "timestamp": timestamp,
76
- "sentence": sentence
77
- })
78
- return data
79
-
80
-
81
- def parse_vtt(file_path):
82
- """Reads WebVTT file and returns as dict"""
83
- with open(file_path, 'r', encoding='utf-8') as file:
84
- webvtt_data = file.read()
85
-
86
- data = []
87
- blocks = webvtt_data.split('\n\n')
88
-
89
- for block in blocks:
90
- if block.strip() != '' and not block.strip().startswith("WebVTT"):
91
- lines = block.strip().split('\n')
92
- index = lines[0]
93
- timestamp = lines[1]
94
- sentence = ' '.join(lines[2:])
95
-
96
- data.append({
97
- "index": index,
98
- "timestamp": timestamp,
99
- "sentence": sentence
100
- })
101
-
102
- return data
103
-
104
-
105
- def get_serialized_srt(dicts):
106
- output = ""
107
- for dic in dicts:
108
- output += f'{dic["index"]}\n'
109
- output += f'{dic["timestamp"]}\n'
110
- output += f'{dic["sentence"]}\n\n'
111
- return output
112
-
113
-
114
- def get_serialized_vtt(dicts):
115
- output = "WebVTT\n\n"
116
- for dic in dicts:
117
- output += f'{dic["index"]}\n'
118
- output += f'{dic["timestamp"]}\n'
119
- output += f'{dic["sentence"]}\n\n'
120
- return output
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
121
 
122
  @spaces.GPU(duration=120)
123
  def safe_filename(name):
 
1
+ # Ported from https://github.com/openai/whisper/blob/main/whisper/utils.py
2
+
3
+ import json
4
+ import os
5
  import re
6
+ import sys
7
+ import zlib
8
+ from typing import Callable, List, Optional, TextIO, Union, Dict, Tuple
9
+ from datetime import datetime
10
+
11
+ from modules.whisper.data_classes import Segment, Word
12
+ from .files_manager import read_file
13
 
14
  # Zero GPU
15
  import spaces
16
 
17
+ def format_timestamp(
18
+ seconds: float, always_include_hours: bool = True, decimal_marker: str = ","
19
+ ) -> str:
20
+ assert seconds >= 0, "non-negative timestamp expected"
21
+ milliseconds = round(seconds * 1000.0)
22
+
23
+ hours = milliseconds // 3_600_000
24
+ milliseconds -= hours * 3_600_000
25
+
26
+ minutes = milliseconds // 60_000
27
+ milliseconds -= minutes * 60_000
28
+
29
+ seconds = milliseconds // 1_000
30
+ milliseconds -= seconds * 1_000
31
+
32
+ hours_marker = f"{hours:02d}:" if always_include_hours or hours > 0 else ""
33
+ return (
34
+ f"{hours_marker}{minutes:02d}:{seconds:02d}{decimal_marker}{milliseconds:03d}"
35
+ )
36
+
37
+
38
+ def time_str_to_seconds(time_str: str, decimal_marker: str = ",") -> float:
39
+ times = time_str.split(":")
40
+
41
+ if len(times) == 3:
42
+ hours, minutes, rest = times
43
+ hours = int(hours)
44
+ else:
45
+ hours = 0
46
+ minutes, rest = times
47
+
48
+ seconds, fractional = rest.split(decimal_marker)
49
+
50
+ minutes = int(minutes)
51
+ seconds = int(seconds)
52
+ fractional_seconds = float("0." + fractional)
53
+
54
+ return hours * 3600 + minutes * 60 + seconds + fractional_seconds
55
+
56
+
57
+ def get_start(segments: List[dict]) -> Optional[float]:
58
+ return next(
59
+ (w["start"] for s in segments for w in s["words"]),
60
+ segments[0]["start"] if segments else None,
61
+ )
62
+
63
+
64
+ def get_end(segments: List[dict]) -> Optional[float]:
65
+ return next(
66
+ (w["end"] for s in reversed(segments) for w in reversed(s["words"])),
67
+ segments[-1]["end"] if segments else None,
68
+ )
69
+
70
+
71
+ class ResultWriter:
72
+ extension: str
73
+
74
+ def __init__(self, output_dir: str):
75
+ self.output_dir = output_dir
76
+
77
+ def __call__(
78
+ self, result: Union[dict, List[Segment]], output_file_name: str,
79
+ options: Optional[dict] = None, **kwargs
80
+ ):
81
+ if isinstance(result, List) and result and isinstance(result[0], Segment):
82
+ result = {"segments": [seg.model_dump() for seg in result]}
83
+
84
+ output_path = os.path.join(
85
+ self.output_dir, output_file_name + "." + self.extension
86
+ )
87
+
88
+ with open(output_path, "w", encoding="utf-8") as f:
89
+ self.write_result(result, file=f, options=options, **kwargs)
90
+
91
+ def write_result(
92
+ self, result: dict, file: TextIO, options: Optional[dict] = None, **kwargs
93
+ ):
94
+ raise NotImplementedError
95
+
96
+
97
+ class WriteTXT(ResultWriter):
98
+ extension: str = "txt"
99
+
100
+ def write_result(
101
+ self, result: Union[Dict, List[Segment]], file: TextIO, options: Optional[dict] = None, **kwargs
102
+ ):
103
+ for segment in result["segments"]:
104
+ print(segment["text"].strip(), file=file, flush=True)
105
+
106
+
107
+ class SubtitlesWriter(ResultWriter):
108
+ always_include_hours: bool
109
+ decimal_marker: str
110
+
111
+ def iterate_result(
112
+ self,
113
+ result: dict,
114
+ options: Optional[dict] = None,
115
+ *,
116
+ max_line_width: Optional[int] = None,
117
+ max_line_count: Optional[int] = None,
118
+ highlight_words: bool = False,
119
+ align_lrc_words: bool = False,
120
+ max_words_per_line: Optional[int] = None,
121
+ ):
122
+ options = options or {}
123
+ max_line_width = max_line_width or options.get("max_line_width")
124
+ max_line_count = max_line_count or options.get("max_line_count")
125
+ highlight_words = highlight_words or options.get("highlight_words", False)
126
+ align_lrc_words = align_lrc_words or options.get("align_lrc_words", False)
127
+ max_words_per_line = max_words_per_line or options.get("max_words_per_line")
128
+ preserve_segments = max_line_count is None or max_line_width is None
129
+ max_line_width = max_line_width or 1000
130
+ max_words_per_line = max_words_per_line or 1000
131
+
132
+ def iterate_subtitles():
133
+ line_len = 0
134
+ line_count = 1
135
+ # the next subtitle to yield (a list of word timings with whitespace)
136
+ subtitle: List[dict] = []
137
+ last: float = get_start(result["segments"]) or 0.0
138
+ for segment in result["segments"]:
139
+ chunk_index = 0
140
+ words_count = max_words_per_line
141
+ while chunk_index < len(segment["words"]):
142
+ remaining_words = len(segment["words"]) - chunk_index
143
+ if max_words_per_line > len(segment["words"]) - chunk_index:
144
+ words_count = remaining_words
145
+ for i, original_timing in enumerate(
146
+ segment["words"][chunk_index : chunk_index + words_count]
147
+ ):
148
+ timing = original_timing.copy()
149
+ long_pause = (
150
+ not preserve_segments and timing["start"] - last > 3.0
151
+ )
152
+ has_room = line_len + len(timing["word"]) <= max_line_width
153
+ seg_break = i == 0 and len(subtitle) > 0 and preserve_segments
154
+ if (
155
+ line_len > 0
156
+ and has_room
157
+ and not long_pause
158
+ and not seg_break
159
+ ):
160
+ # line continuation
161
+ line_len += len(timing["word"])
162
+ else:
163
+ # new line
164
+ timing["word"] = timing["word"].strip()
165
+ if (
166
+ len(subtitle) > 0
167
+ and max_line_count is not None
168
+ and (long_pause or line_count >= max_line_count)
169
+ or seg_break
170
+ ):
171
+ # subtitle break
172
+ yield subtitle
173
+ subtitle = []
174
+ line_count = 1
175
+ elif line_len > 0:
176
+ # line break
177
+ line_count += 1
178
+ timing["word"] = "\n" + timing["word"]
179
+ line_len = len(timing["word"].strip())
180
+ subtitle.append(timing)
181
+ last = timing["start"]
182
+ chunk_index += max_words_per_line
183
+ if len(subtitle) > 0:
184
+ yield subtitle
185
+
186
+ if len(result["segments"]) > 0 and "words" in result["segments"][0] and result["segments"][0]["words"]:
187
+ for subtitle in iterate_subtitles():
188
+ subtitle_start = self.format_timestamp(subtitle[0]["start"])
189
+ subtitle_end = self.format_timestamp(subtitle[-1]["end"])
190
+ subtitle_text = "".join([word["word"] for word in subtitle])
191
+ if highlight_words:
192
+ last = subtitle_start
193
+ all_words = [timing["word"] for timing in subtitle]
194
+ for i, this_word in enumerate(subtitle):
195
+ start = self.format_timestamp(this_word["start"])
196
+ end = self.format_timestamp(this_word["end"])
197
+ if last != start:
198
+ yield last, start, subtitle_text
199
+
200
+ yield start, end, "".join(
201
+ [
202
+ re.sub(r"^(\s*)(.*)$", r"\1<u>\2</u>", word)
203
+ if j == i
204
+ else word
205
+ for j, word in enumerate(all_words)
206
+ ]
207
+ )
208
+ last = end
209
+
210
+ if align_lrc_words:
211
+ lrc_aligned_words = [f"[{self.format_timestamp(sub['start'])}]{sub['word']}" for sub in subtitle]
212
+ l_start, l_end = self.format_timestamp(subtitle[-1]['start']), self.format_timestamp(subtitle[-1]['end'])
213
+ lrc_aligned_words[-1] = f"[{l_start}]{subtitle[-1]['word']}[{l_end}]"
214
+ lrc_aligned_words = ' '.join(lrc_aligned_words)
215
+ yield None, None, lrc_aligned_words
216
+
217
+ else:
218
+ yield subtitle_start, subtitle_end, subtitle_text
219
+ else:
220
+ for segment in result["segments"]:
221
+ segment_start = self.format_timestamp(segment["start"])
222
+ segment_end = self.format_timestamp(segment["end"])
223
+ segment_text = segment["text"].strip().replace("-->", "->")
224
+ yield segment_start, segment_end, segment_text
225
+
226
+ def format_timestamp(self, seconds: float):
227
+ return format_timestamp(
228
+ seconds=seconds,
229
+ always_include_hours=self.always_include_hours,
230
+ decimal_marker=self.decimal_marker,
231
+ )
232
+
233
+
234
+ class WriteVTT(SubtitlesWriter):
235
+ extension: str = "vtt"
236
+ always_include_hours: bool = False
237
+ decimal_marker: str = "."
238
+
239
+ def write_result(
240
+ self, result: dict, file: TextIO, options: Optional[dict] = None, **kwargs
241
+ ):
242
+ print("WEBVTT\n", file=file)
243
+ for start, end, text in self.iterate_result(result, options, **kwargs):
244
+ print(f"{start} --> {end}\n{text}\n", file=file, flush=True)
245
+
246
+ def to_segments(self, file_path: str) -> List[Segment]:
247
+ segments = []
248
+
249
+ blocks = read_file(file_path).split('\n\n')
250
+
251
+ for block in blocks:
252
+ if block.strip() != '' and not block.strip().startswith("WEBVTT"):
253
+ lines = block.strip().split('\n')
254
+ time_line = lines[0].split(" --> ")
255
+ start, end = time_str_to_seconds(time_line[0], self.decimal_marker), time_str_to_seconds(time_line[1], self.decimal_marker)
256
+ sentence = ' '.join(lines[1:])
257
+
258
+ segments.append(Segment(
259
+ start=start,
260
+ end=end,
261
+ text=sentence
262
+ ))
263
+
264
+ return segments
265
+
266
+
267
+ class WriteSRT(SubtitlesWriter):
268
+ extension: str = "srt"
269
+ always_include_hours: bool = True
270
+ decimal_marker: str = ","
271
+
272
+ def write_result(
273
+ self, result: dict, file: TextIO, options: Optional[dict] = None, **kwargs
274
+ ):
275
+ for i, (start, end, text) in enumerate(
276
+ self.iterate_result(result, options, **kwargs), start=1
277
+ ):
278
+ print(f"{i}\n{start} --> {end}\n{text}\n", file=file, flush=True)
279
+
280
+ def to_segments(self, file_path: str) -> List[Segment]:
281
+ segments = []
282
+
283
+ blocks = read_file(file_path).split('\n\n')
284
+
285
+ for block in blocks:
286
+ if block.strip() != '':
287
+ lines = block.strip().split('\n')
288
+ index = lines[0]
289
+ time_line = lines[1].split(" --> ")
290
+ start, end = time_str_to_seconds(time_line[0], self.decimal_marker), time_str_to_seconds(time_line[1], self.decimal_marker)
291
+ sentence = ' '.join(lines[2:])
292
+
293
+ segments.append(Segment(
294
+ start=start,
295
+ end=end,
296
+ text=sentence
297
+ ))
298
+
299
+ return segments
300
+
301
+
302
+ class WriteLRC(SubtitlesWriter):
303
+ extension: str = "lrc"
304
+ always_include_hours: bool = False
305
+ decimal_marker: str = "."
306
+
307
+ def write_result(
308
+ self, result: dict, file: TextIO, options: Optional[dict] = None, **kwargs
309
+ ):
310
+ for i, (start, end, text) in enumerate(
311
+ self.iterate_result(result, options, **kwargs), start=1
312
+ ):
313
+ if "align_lrc_words" in kwargs and kwargs["align_lrc_words"]:
314
+ print(f"{text}\n", file=file, flush=True)
315
+ else:
316
+ print(f"[{start}]{text}[{end}]\n", file=file, flush=True)
317
+
318
+ def to_segments(self, file_path: str) -> List[Segment]:
319
+ segments = []
320
+
321
+ blocks = read_file(file_path).split('\n')
322
+
323
+ for block in blocks:
324
+ if block.strip() != '':
325
+ lines = block.strip()
326
+ pattern = r'(\[.*?\])'
327
+ parts = re.split(pattern, lines)
328
+ parts = [part.strip() for part in parts if part]
329
+
330
+ for i, part in enumerate(parts):
331
+ sentence_i = i%2
332
+ if sentence_i == 1:
333
+ start_str, text, end_str = parts[sentence_i-1], parts[sentence_i], parts[sentence_i+1]
334
+ start_str, end_str = start_str.replace("[", "").replace("]", ""), end_str.replace("[", "").replace("]", "")
335
+ start, end = time_str_to_seconds(start_str, self.decimal_marker), time_str_to_seconds(end_str, self.decimal_marker)
336
+
337
+ segments.append(Segment(
338
+ start=start,
339
+ end=end,
340
+ text=text,
341
+ ))
342
+
343
+ return segments
344
+
345
+
346
+ class WriteTSV(ResultWriter):
347
+ """
348
+ Write a transcript to a file in TSV (tab-separated values) format containing lines like:
349
+ <start time in integer milliseconds>\t<end time in integer milliseconds>\t<transcript text>
350
+
351
+ Using integer milliseconds as start and end times means there's no chance of interference from
352
+ an environment setting a language encoding that causes the decimal in a floating point number
353
+ to appear as a comma; also is faster and more efficient to parse & store, e.g., in C++.
354
+ """
355
+
356
+ extension: str = "tsv"
357
+
358
+ def write_result(
359
+ self, result: dict, file: TextIO, options: Optional[dict] = None, **kwargs
360
+ ):
361
+ print("start", "end", "text", sep="\t", file=file)
362
+ for segment in result["segments"]:
363
+ print(round(1000 * segment["start"]), file=file, end="\t")
364
+ print(round(1000 * segment["end"]), file=file, end="\t")
365
+ print(segment["text"].strip().replace("\t", " "), file=file, flush=True)
366
+
367
+
368
+ class WriteJSON(ResultWriter):
369
+ extension: str = "json"
370
+
371
+ def write_result(
372
+ self, result: dict, file: TextIO, options: Optional[dict] = None, **kwargs
373
+ ):
374
+ json.dump(result, file)
375
+
376
+
377
+ def get_writer(
378
+ output_format: str, output_dir: str
379
+ ) -> Callable[[dict, TextIO, dict], None]:
380
+ output_format = output_format.strip().lower().replace(".", "")
381
+
382
+ writers = {
383
+ "txt": WriteTXT,
384
+ "vtt": WriteVTT,
385
+ "srt": WriteSRT,
386
+ "tsv": WriteTSV,
387
+ "json": WriteJSON,
388
+ "lrc": WriteLRC
389
+ }
390
+
391
+ if output_format == "all":
392
+ all_writers = [writer(output_dir) for writer in writers.values()]
393
+
394
+ def write_all(
395
+ result: dict, file: TextIO, options: Optional[dict] = None, **kwargs
396
+ ):
397
+ for writer in all_writers:
398
+ writer(result, file, options, **kwargs)
399
+
400
+ return write_all
401
+
402
+ return writers[output_format](output_dir)
403
+
404
+
405
+ def generate_file(
406
+ output_format: str, output_dir: str, result: Union[dict, List[Segment]], output_file_name: str,
407
+ add_timestamp: bool = True, **kwargs
408
+ ) -> Tuple[str, str]:
409
+ output_format = output_format.strip().lower().replace(".", "")
410
+ output_format = "vtt" if output_format == "webvtt" else output_format
411
+
412
+ if add_timestamp:
413
+ timestamp = datetime.now().strftime("%m%d%H%M%S")
414
+ output_file_name += f"-{timestamp}"
415
+
416
+ file_path = os.path.join(output_dir, f"{output_file_name}.{output_format}")
417
+ file_writer = get_writer(output_format=output_format, output_dir=output_dir)
418
+
419
+ if isinstance(file_writer, WriteLRC) and kwargs.get("highlight_words", False):
420
+ kwargs["highlight_words"], kwargs["align_lrc_words"] = False, True
421
+
422
+ file_writer(result=result, output_file_name=output_file_name, **kwargs)
423
+ content = read_file(file_path)
424
+ return content, file_path
425
 
426
  @spaces.GPU(duration=120)
427
  def safe_filename(name):
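The module is now largely ported from openai/whisper's writer classes: get_writer/generate_file plus the new to_segments parsers replace the old parse_srt/parse_vtt and get_serialized_* helpers. A minimal round-trip sketch with hypothetical paths and values (the output directory is assumed to exist):

from modules.whisper.data_classes import Segment
from modules.utils.subtitle_manager import generate_file, get_writer

segments = [Segment(start=0.0, end=2.5, text="Hello world")]

content, path = generate_file(
    output_format="srt",
    output_dir="outputs",      # assumed to exist
    result=segments,
    output_file_name="demo",
    add_timestamp=False,       # keeps the name deterministic: outputs/demo.srt
)

parsed = get_writer("srt", "outputs").to_segments(path)
print(parsed[0].start, parsed[0].text)   # 0.0 "Hello world"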
modules/vad/silero_vad.py CHANGED
@@ -5,7 +5,8 @@ import numpy as np
5
  from typing import BinaryIO, Union, List, Optional, Tuple
6
  import warnings
7
  import faster_whisper
8
- from faster_whisper.transcribe import SpeechTimestampsMap, Segment
 
9
  import gradio as gr
10
 
11
 
@@ -247,18 +248,18 @@ class SileroVAD:
247
 
248
  def restore_speech_timestamps(
249
  self,
250
- segments: List[dict],
251
  speech_chunks: List[dict],
252
  sampling_rate: Optional[int] = None,
253
- ) -> List[dict]:
254
  if sampling_rate is None:
255
  sampling_rate = self.sampling_rate
256
 
257
  ts_map = SpeechTimestampsMap(speech_chunks, sampling_rate)
258
 
259
  for segment in segments:
260
- segment["start"] = ts_map.get_original_time(segment["start"])
261
- segment["end"] = ts_map.get_original_time(segment["end"])
262
 
263
  return segments
264
 
 
5
  from typing import BinaryIO, Union, List, Optional, Tuple
6
  import warnings
7
  import faster_whisper
8
+ from modules.whisper.data_classes import *
9
+ from faster_whisper.transcribe import SpeechTimestampsMap
10
  import gradio as gr
11
 
12
 
 
248
 
249
  def restore_speech_timestamps(
250
  self,
251
+ segments: List[Segment],
252
  speech_chunks: List[dict],
253
  sampling_rate: Optional[int] = None,
254
+ ) -> List[Segment]:
255
  if sampling_rate is None:
256
  sampling_rate = self.sampling_rate
257
 
258
  ts_map = SpeechTimestampsMap(speech_chunks, sampling_rate)
259
 
260
  for segment in segments:
261
+ segment.start = ts_map.get_original_time(segment.start)
262
+ segment.end = ts_map.get_original_time(segment.end)
263
 
264
  return segments
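restore_speech_timestamps now mutates Segment attributes instead of dict keys. A sketch of the VAD round trip as the base pipeline uses it, with a hypothetical audio file and a stand-in transcription result; the default SileroVAD constructor is assumed:

from faster_whisper.vad import VadOptions
from modules.vad.silero_vad import SileroVAD
from modules.whisper.data_classes import Segment

vad = SileroVAD()
audio, speech_chunks = vad.run(
    audio="sample.wav",                       # hypothetical input file
    vad_parameters=VadOptions(threshold=0.5),
)

# Stand-in for the transcription step that would normally produce these:
segments = [Segment(start=0.0, end=1.0, text="hi")]

segments = vad.restore_speech_timestamps(
    segments=segments,
    speech_chunks=speech_chunks,
)
print(segments[0].start, segments[0].end)     # mapped back to the original timeline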
265
 
modules/whisper/{whisper_base.py → base_transcription_pipeline.py} RENAMED
@@ -1,6 +1,6 @@
1
  import os
2
- import torch
3
  import whisper
 
4
  import gradio as gr
5
  import torchaudio
6
  from abc import ABC, abstractmethod
@@ -8,20 +8,20 @@ from typing import BinaryIO, Union, Tuple, List
8
  import numpy as np
9
  from datetime import datetime
10
  from faster_whisper.vad import VadOptions
11
- from dataclasses import astuple
12
 
13
  from modules.uvr.music_separator import MusicSeparator
14
  from modules.utils.paths import (WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR, DEFAULT_PARAMETERS_CONFIG_PATH,
15
  UVR_MODELS_DIR)
16
- from modules.utils.subtitle_manager import get_srt, get_vtt, get_txt, write_file, safe_filename
 
17
  from modules.utils.youtube_manager import get_ytdata, get_ytaudio
18
- from modules.utils.files_manager import get_media_files, format_gradio_files, load_yaml, save_yaml
19
- from modules.whisper.whisper_parameter import *
20
  from modules.diarize.diarizer import Diarizer
21
  from modules.vad.silero_vad import SileroVAD
22
 
23
 
24
- class WhisperBase(ABC):
25
  def __init__(self,
26
  model_dir: str = WHISPER_MODELS_DIR,
27
  diarization_model_dir: str = DIARIZATION_MODELS_DIR,
@@ -47,8 +47,8 @@ class WhisperBase(ABC):
47
  self.available_langs = sorted(list(whisper.tokenizer.LANGUAGES.values()))
48
  self.translatable_models = ["large", "large-v1", "large-v2", "large-v3"]
49
  self.device = self.get_device()
50
- self.available_compute_types = ["float16", "float32"]
51
- self.current_compute_type = "float16" if self.device == "cuda" else "float32"
52
 
53
  @abstractmethod
54
  def transcribe(self,
@@ -71,13 +71,15 @@ class WhisperBase(ABC):
71
  def run(self,
72
  audio: Union[str, BinaryIO, np.ndarray],
73
  progress: gr.Progress = gr.Progress(),
 
74
  add_timestamp: bool = True,
75
- *whisper_params,
76
- ) -> Tuple[List[dict], float]:
77
  """
78
  Run transcription with conditional pre-processing and post-processing.
79
  The VAD will be performed to remove noise from the audio input in pre-processing, if enabled.
80
  The diarization will be performed in post-processing, if enabled.
 
81
 
82
  Parameters
83
  ----------
@@ -85,40 +87,33 @@ class WhisperBase(ABC):
85
  Audio input. This can be file path or binary type.
86
  progress: gr.Progress
87
  Indicator to show progress directly in gradio.
 
 
88
  add_timestamp: bool
89
  Whether to add a timestamp at the end of the filename.
90
- *whisper_params: tuple
91
- Parameters related with whisper. This will be dealt with "WhisperParameters" data class
 
 
92
 
93
  Returns
94
  ----------
95
- segments_result: List[dict]
96
- list of dicts that includes start, end timestamps and transcribed text
97
  elapsed_time: float
98
  elapsed time for running
99
  """
100
- params = WhisperParameters.as_value(*whisper_params)
 
 
101
 
102
- self.cache_parameters(
103
- whisper_params=params,
104
- add_timestamp=add_timestamp
105
- )
106
-
107
- if params.lang is None:
108
- pass
109
- elif params.lang == "Automatic Detection":
110
- params.lang = None
111
- else:
112
- language_code_dict = {value: key for key, value in whisper.tokenizer.LANGUAGES.items()}
113
- params.lang = language_code_dict[params.lang]
114
-
115
- if params.is_bgm_separate:
116
  music, audio, _ = self.music_separator.separate(
117
  audio=audio,
118
- model_name=params.uvr_model_size,
119
- device=params.uvr_device,
120
- segment_size=params.uvr_segment_size,
121
- save_file=params.uvr_save_file,
122
  progress=progress
123
  )
124
 
@@ -130,47 +125,55 @@ class WhisperBase(ABC):
130
  origin_sample_rate = self.music_separator.audio_info.sample_rate
131
  audio = self.resample_audio(audio=audio, original_sample_rate=origin_sample_rate)
132
 
133
- if params.uvr_enable_offload:
134
  self.music_separator.offload()
135
 
136
- if params.vad_filter:
137
- # Explicit value set for float('inf') from gr.Number()
138
- if params.max_speech_duration_s is None or params.max_speech_duration_s >= 9999:
139
- params.max_speech_duration_s = float('inf')
140
-
141
  vad_options = VadOptions(
142
- threshold=params.threshold,
143
- min_speech_duration_ms=params.min_speech_duration_ms,
144
- max_speech_duration_s=params.max_speech_duration_s,
145
- min_silence_duration_ms=params.min_silence_duration_ms,
146
- speech_pad_ms=params.speech_pad_ms
147
  )
148
 
149
- audio, speech_chunks = self.vad.run(
150
  audio=audio,
151
  vad_parameters=vad_options,
152
  progress=progress
153
  )
154
 
 
 
 
 
 
155
  result, elapsed_time = self.transcribe(
156
  audio,
157
  progress,
158
- *astuple(params)
159
  )
160
 
161
- if params.vad_filter:
162
  result = self.vad.restore_speech_timestamps(
163
  segments=result,
164
  speech_chunks=speech_chunks,
165
  )
166
 
167
- if params.is_diarize:
168
  result, elapsed_time_diarization = self.diarizer.run(
169
  audio=audio,
170
- use_auth_token=params.hf_token,
171
  transcribed_result=result,
 
172
  )
173
  elapsed_time += elapsed_time_diarization
 
 
 
 
 
 
174
  return result, elapsed_time
175
 
176
  def transcribe_file(self,
@@ -179,8 +182,8 @@ class WhisperBase(ABC):
179
  file_format: str = "SRT",
180
  add_timestamp: bool = True,
181
  progress=gr.Progress(),
182
- *whisper_params,
183
- ) -> list:
184
  """
185
  Write subtitle file from Files
186
 
@@ -197,8 +200,8 @@ class WhisperBase(ABC):
197
  Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the subtitle filename.
198
  progress: gr.Progress
199
  Indicator to show progress directly in gradio.
200
- *whisper_params: tuple
201
- Parameters related with whisper. This will be dealt with "WhisperParameters" data class
202
 
203
  Returns
204
  ----------
@@ -208,6 +211,11 @@ class WhisperBase(ABC):
208
  Output file path to return to gr.Files()
209
  """
210
  try:
 
 
 
 
 
211
  if input_folder_path:
212
  files = get_media_files(input_folder_path)
213
  if isinstance(files, str):
@@ -220,19 +228,21 @@ class WhisperBase(ABC):
220
  transcribed_segments, time_for_task = self.run(
221
  file,
222
  progress,
 
223
  add_timestamp,
224
- *whisper_params,
225
  )
226
 
227
  file_name, file_ext = os.path.splitext(os.path.basename(file))
228
- subtitle, file_path = self.generate_and_write_file(
229
- file_name=file_name,
230
- transcribed_segments=transcribed_segments,
 
 
231
  add_timestamp=add_timestamp,
232
- file_format=file_format,
233
- output_dir=self.output_dir
234
  )
235
- files_info[file_name] = {"subtitle": subtitle, "time_for_task": time_for_task, "path": file_path}
236
 
237
  total_result = ''
238
  total_time = 0
@@ -245,10 +255,11 @@ class WhisperBase(ABC):
245
  result_str = f"Done in {self.format_time(total_time)}! Subtitle is in the outputs folder.\n\n{total_result}"
246
  result_file_path = [info['path'] for info in files_info.values()]
247
 
248
- return [result_str, result_file_path]
249
 
250
  except Exception as e:
251
  print(f"Error transcribing file: {e}")
 
252
  finally:
253
  self.release_cuda_memory()
254
 
@@ -257,8 +268,8 @@ class WhisperBase(ABC):
257
  file_format: str = "SRT",
258
  add_timestamp: bool = True,
259
  progress=gr.Progress(),
260
- *whisper_params,
261
- ) -> list:
262
  """
263
  Write subtitle file from microphone
264
 
@@ -272,7 +283,7 @@ class WhisperBase(ABC):
272
  Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the filename.
273
  progress: gr.Progress
274
  Indicator to show progress directly in gradio.
275
- *whisper_params: tuple
276
  Parameters related with whisper. This will be dealt with "WhisperParameters" data class
277
 
278
  Returns
@@ -283,27 +294,36 @@ class WhisperBase(ABC):
283
  Output file path to return to gr.Files()
284
  """
285
  try:
 
 
 
 
 
286
  progress(0, desc="Loading Audio..")
287
  transcribed_segments, time_for_task = self.run(
288
  mic_audio,
289
  progress,
 
290
  add_timestamp,
291
- *whisper_params,
292
  )
293
  progress(1, desc="Completed!")
294
 
295
- subtitle, result_file_path = self.generate_and_write_file(
296
- file_name="Mic",
297
- transcribed_segments=transcribed_segments,
 
 
 
298
  add_timestamp=add_timestamp,
299
- file_format=file_format,
300
- output_dir=self.output_dir
301
  )
302
 
303
  result_str = f"Done in {self.format_time(time_for_task)}! Subtitle file is in the outputs folder.\n\n{subtitle}"
304
- return [result_str, result_file_path]
305
  except Exception as e:
306
- print(f"Error transcribing file: {e}")
 
307
  finally:
308
  self.release_cuda_memory()
309
 
@@ -312,8 +332,8 @@ class WhisperBase(ABC):
312
  file_format: str = "SRT",
313
  add_timestamp: bool = True,
314
  progress=gr.Progress(),
315
- *whisper_params,
316
- ) -> list:
317
  """
318
  Write subtitle file from Youtube
319
 
@@ -327,7 +347,7 @@ class WhisperBase(ABC):
327
  Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the filename.
328
  progress: gr.Progress
329
  Indicator to show progress directly in gradio.
330
- *whisper_params: tuple
331
  Parameters related with whisper. This will be dealt with "WhisperParameters" data class
332
 
333
  Returns
@@ -338,6 +358,11 @@ class WhisperBase(ABC):
338
  Output file path to return to gr.Files()
339
  """
340
  try:
 
 
 
 
 
341
  progress(0, desc="Loading Audio from Youtube..")
342
  yt = get_ytdata(youtube_link)
343
  audio = get_ytaudio(yt)
@@ -345,83 +370,49 @@ class WhisperBase(ABC):
345
  transcribed_segments, time_for_task = self.run(
346
  audio,
347
  progress,
 
348
  add_timestamp,
349
- *whisper_params,
350
  )
351
 
352
  progress(1, desc="Completed!")
353
 
354
  file_name = safe_filename(yt.title)
355
- subtitle, result_file_path = self.generate_and_write_file(
356
- file_name=file_name,
357
- transcribed_segments=transcribed_segments,
 
 
358
  add_timestamp=add_timestamp,
359
- file_format=file_format,
360
- output_dir=self.output_dir
361
  )
 
362
  result_str = f"Done in {self.format_time(time_for_task)}! Subtitle file is in the outputs folder.\n\n{subtitle}"
363
 
364
  if os.path.exists(audio):
365
  os.remove(audio)
366
 
367
- return [result_str, result_file_path]
368
 
369
  except Exception as e:
370
- print(f"Error transcribing file: {e}")
 
371
  finally:
372
  self.release_cuda_memory()
373
 
374
- @staticmethod
375
- def generate_and_write_file(file_name: str,
376
- transcribed_segments: list,
377
- add_timestamp: bool,
378
- file_format: str,
379
- output_dir: str
380
- ) -> str:
381
- """
382
- Writes subtitle file
383
-
384
- Parameters
385
- ----------
386
- file_name: str
387
- Output file name
388
- transcribed_segments: list
389
- Text segments transcribed from audio
390
- add_timestamp: bool
391
- Determines whether to add a timestamp to the end of the filename.
392
- file_format: str
393
- File format to write. Supported formats: [SRT, WebVTT, txt]
394
- output_dir: str
395
- Directory path of the output
396
-
397
- Returns
398
- ----------
399
- content: str
400
- Result of the transcription
401
- output_path: str
402
- output file path
403
- """
404
- if add_timestamp:
405
- timestamp = datetime.now().strftime("%m%d%H%M%S")
406
- output_path = os.path.join(output_dir, f"{file_name}-{timestamp}")
407
  else:
408
- output_path = os.path.join(output_dir, f"{file_name}")
409
-
410
- file_format = file_format.strip().lower()
411
- if file_format == "srt":
412
- content = get_srt(transcribed_segments)
413
- output_path += '.srt'
414
-
415
- elif file_format == "webvtt":
416
- content = get_vtt(transcribed_segments)
417
- output_path += '.vtt'
418
 
419
- elif file_format == "txt":
420
- content = get_txt(transcribed_segments)
421
- output_path += '.txt'
422
-
423
- write_file(content, output_path)
424
- return content, output_path
425
 
426
  @staticmethod
427
  def format_time(elapsed_time: float) -> str:
@@ -455,7 +446,7 @@ class WhisperBase(ABC):
455
  if torch.cuda.is_available():
456
  return "cuda"
457
  elif torch.backends.mps.is_available():
458
- if not WhisperBase.is_sparse_api_supported():
459
  # Device `SparseMPS` is not supported for now. See : https://github.com/pytorch/pytorch/issues/87886
460
  return "cpu"
461
  return "mps"
@@ -496,18 +487,65 @@ class WhisperBase(ABC):
496
  if file_path and os.path.exists(file_path):
497
  os.remove(file_path)
498
499
  @staticmethod
500
  def cache_parameters(
501
- whisper_params: WhisperValues,
502
- add_timestamp: bool
 
503
  ):
504
- """cache parameters to the yaml file"""
505
  cached_params = load_yaml(DEFAULT_PARAMETERS_CONFIG_PATH)
506
- cached_whisper_param = whisper_params.to_yaml()
507
- cached_yaml = {**cached_params, **cached_whisper_param}
 
508
  cached_yaml["whisper"]["add_timestamp"] = add_timestamp
 
 
 
 
 
 
 
 
 
 
 
 
 
 
509
 
510
- save_yaml(cached_yaml, DEFAULT_PARAMETERS_CONFIG_PATH)
 
511
 
512
  @staticmethod
513
  def resample_audio(audio: Union[str, np.ndarray],
 
1
  import os
 
2
  import whisper
3
+ import ctranslate2
4
  import gradio as gr
5
  import torchaudio
6
  from abc import ABC, abstractmethod
 
8
  import numpy as np
9
  from datetime import datetime
10
  from faster_whisper.vad import VadOptions
 
11
 
12
  from modules.uvr.music_separator import MusicSeparator
13
  from modules.utils.paths import (WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR, DEFAULT_PARAMETERS_CONFIG_PATH,
14
  UVR_MODELS_DIR)
15
+ from modules.utils.constants import *
16
+ from modules.utils.subtitle_manager import *
17
  from modules.utils.youtube_manager import get_ytdata, get_ytaudio
18
+ from modules.utils.files_manager import get_media_files, format_gradio_files, load_yaml, save_yaml, read_file
19
+ from modules.whisper.data_classes import *
20
  from modules.diarize.diarizer import Diarizer
21
  from modules.vad.silero_vad import SileroVAD
22
 
23
 
24
+ class BaseTranscriptionPipeline(ABC):
25
  def __init__(self,
26
  model_dir: str = WHISPER_MODELS_DIR,
27
  diarization_model_dir: str = DIARIZATION_MODELS_DIR,
 
47
  self.available_langs = sorted(list(whisper.tokenizer.LANGUAGES.values()))
48
  self.translatable_models = ["large", "large-v1", "large-v2", "large-v3"]
49
  self.device = self.get_device()
50
+ self.available_compute_types = self.get_available_compute_type()
51
+ self.current_compute_type = self.get_compute_type()
52
 
53
  @abstractmethod
54
  def transcribe(self,
 
71
  def run(self,
72
  audio: Union[str, BinaryIO, np.ndarray],
73
  progress: gr.Progress = gr.Progress(),
74
+ file_format: str = "SRT",
75
  add_timestamp: bool = True,
76
+ *pipeline_params,
77
+ ) -> Tuple[List[Segment], float]:
78
  """
79
  Run transcription with conditional pre-processing and post-processing.
80
  The VAD will be performed to remove noise from the audio input in pre-processing, if enabled.
81
  The diarization will be performed in post-processing, if enabled.
82
+ Due to the integration with gradio, the parameters have to be specified with a `*` wildcard.
83
 
84
  Parameters
85
  ----------
 
87
  Audio input. This can be file path or binary type.
88
  progress: gr.Progress
89
  Indicator to show progress directly in gradio.
90
+ file_format: str
91
+ Subtitle file format, one of ["SRT", "WebVTT", "txt", "lrc"]
92
  add_timestamp: bool
93
  Whether to add a timestamp at the end of the filename.
94
+ *pipeline_params: tuple
95
+ Parameters for the transcription pipeline, handled by the "TranscriptionPipelineParams" data class.
96
+ They must be provided as a flat list with the * wildcard because of the Gradio integration.
97
+ See more info at : https://github.com/gradio-app/gradio/issues/2471
98
 
99
  Returns
100
  ----------
101
+ segments_result: List[Segment]
102
+ list of Segment that includes start, end timestamps and transcribed text
103
  elapsed_time: float
104
  elapsed time for running
105
  """
106
+ params = TranscriptionPipelineParams.from_list(list(pipeline_params))
107
+ params = self.validate_gradio_values(params)
108
+ bgm_params, vad_params, whisper_params, diarization_params = params.bgm_separation, params.vad, params.whisper, params.diarization
109
 
110
+ if bgm_params.is_separate_bgm:
 
 
 
 
 
 
 
 
 
 
 
 
 
111
  music, audio, _ = self.music_separator.separate(
112
  audio=audio,
113
+ model_name=bgm_params.model_size,
114
+ device=bgm_params.device,
115
+ segment_size=bgm_params.segment_size,
116
+ save_file=bgm_params.save_file,
117
  progress=progress
118
  )
119
 
 
125
  origin_sample_rate = self.music_separator.audio_info.sample_rate
126
  audio = self.resample_audio(audio=audio, original_sample_rate=origin_sample_rate)
127
 
128
+ if bgm_params.enable_offload:
129
  self.music_separator.offload()
130
 
131
+ if vad_params.vad_filter:
 
 
 
 
132
  vad_options = VadOptions(
133
+ threshold=vad_params.threshold,
134
+ min_speech_duration_ms=vad_params.min_speech_duration_ms,
135
+ max_speech_duration_s=vad_params.max_speech_duration_s,
136
+ min_silence_duration_ms=vad_params.min_silence_duration_ms,
137
+ speech_pad_ms=vad_params.speech_pad_ms
138
  )
139
 
140
+ vad_processed, speech_chunks = self.vad.run(
141
  audio=audio,
142
  vad_parameters=vad_options,
143
  progress=progress
144
  )
145
 
146
+ if vad_processed.size > 0:
147
+ audio = vad_processed
148
+ else:
149
+ vad_params.vad_filter = False
150
+
151
  result, elapsed_time = self.transcribe(
152
  audio,
153
  progress,
154
+ *whisper_params.to_list()
155
  )
156
 
157
+ if vad_params.vad_filter:
158
  result = self.vad.restore_speech_timestamps(
159
  segments=result,
160
  speech_chunks=speech_chunks,
161
  )
162
 
163
+ if diarization_params.is_diarize:
164
  result, elapsed_time_diarization = self.diarizer.run(
165
  audio=audio,
166
+ use_auth_token=diarization_params.hf_token,
167
  transcribed_result=result,
168
+ device=diarization_params.device
169
  )
170
  elapsed_time += elapsed_time_diarization
171
+
172
+ self.cache_parameters(
173
+ params=params,
174
+ file_format=file_format,
175
+ add_timestamp=add_timestamp
176
+ )
177
  return result, elapsed_time
178
 
179
  def transcribe_file(self,
 
182
  file_format: str = "SRT",
183
  add_timestamp: bool = True,
184
  progress=gr.Progress(),
185
+ *pipeline_params,
186
+ ) -> Tuple[str, List]:
187
  """
188
  Write subtitle file from Files
189
 
 
200
  Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the subtitle filename.
201
  progress: gr.Progress
202
  Indicator to show progress directly in gradio.
203
+ *pipeline_params: tuple
204
+ Parameters for the transcription pipeline, handled by the "TranscriptionPipelineParams" data class
205
 
206
  Returns
207
  ----------
 
211
  Output file path to return to gr.Files()
212
  """
213
  try:
214
+ params = TranscriptionPipelineParams.from_list(list(pipeline_params))
215
+ writer_options = {
216
+ "highlight_words": True if params.whisper.word_timestamps else False
217
+ }
218
+
219
  if input_folder_path:
220
  files = get_media_files(input_folder_path)
221
  if isinstance(files, str):
 
228
  transcribed_segments, time_for_task = self.run(
229
  file,
230
  progress,
231
+ file_format,
232
  add_timestamp,
233
+ *pipeline_params,
234
  )
235
 
236
  file_name, file_ext = os.path.splitext(os.path.basename(file))
237
+ subtitle, file_path = generate_file(
238
+ output_dir=self.output_dir,
239
+ output_file_name=file_name,
240
+ output_format=file_format,
241
+ result=transcribed_segments,
242
  add_timestamp=add_timestamp,
243
+ **writer_options
 
244
  )
245
+ files_info[file_name] = {"subtitle": read_file(file_path), "time_for_task": time_for_task, "path": file_path}
246
 
247
  total_result = ''
248
  total_time = 0
 
255
  result_str = f"Done in {self.format_time(total_time)}! Subtitle is in the outputs folder.\n\n{total_result}"
256
  result_file_path = [info['path'] for info in files_info.values()]
257
 
258
+ return result_str, result_file_path
259
 
260
  except Exception as e:
261
  print(f"Error transcribing file: {e}")
262
+ raise
263
  finally:
264
  self.release_cuda_memory()
265
 
 
268
  file_format: str = "SRT",
269
  add_timestamp: bool = True,
270
  progress=gr.Progress(),
271
+ *pipeline_params,
272
+ ) -> Tuple[str, str]:
273
  """
274
  Write subtitle file from microphone
275
 
 
283
  Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the filename.
284
  progress: gr.Progress
285
  Indicator to show progress directly in gradio.
286
+ *pipeline_params: tuple
287
  Parameters for the transcription pipeline, handled by the "TranscriptionPipelineParams" data class
288
 
289
  Returns
 
294
  Output file path to return to gr.Files()
295
  """
296
  try:
297
+ params = TranscriptionPipelineParams.from_list(list(pipeline_params))
298
+ writer_options = {
299
+ "highlight_words": True if params.whisper.word_timestamps else False
300
+ }
301
+
302
  progress(0, desc="Loading Audio..")
303
  transcribed_segments, time_for_task = self.run(
304
  mic_audio,
305
  progress,
306
+ file_format,
307
  add_timestamp,
308
+ *pipeline_params,
309
  )
310
  progress(1, desc="Completed!")
311
 
312
+ file_name = "Mic"
313
+ subtitle, file_path = generate_file(
314
+ output_dir=self.output_dir,
315
+ output_file_name=file_name,
316
+ output_format=file_format,
317
+ result=transcribed_segments,
318
  add_timestamp=add_timestamp,
319
+ **writer_options
 
320
  )
321
 
322
  result_str = f"Done in {self.format_time(time_for_task)}! Subtitle file is in the outputs folder.\n\n{subtitle}"
323
+ return result_str, file_path
324
  except Exception as e:
325
+ print(f"Error transcribing mic: {e}")
326
+ raise
327
  finally:
328
  self.release_cuda_memory()
329
 
 
332
  file_format: str = "SRT",
333
  add_timestamp: bool = True,
334
  progress=gr.Progress(),
335
+ *pipeline_params,
336
+ ) -> Tuple[str, str]:
337
  """
338
  Write subtitle file from Youtube
339
 
 
347
  Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the filename.
348
  progress: gr.Progress
349
  Indicator to show progress directly in gradio.
350
+ *pipeline_params: tuple
351
  Parameters for the transcription pipeline, handled by the "TranscriptionPipelineParams" data class
352
 
353
  Returns
 
358
  Output file path to return to gr.Files()
359
  """
360
  try:
361
+ params = TranscriptionPipelineParams.from_list(list(pipeline_params))
362
+ writer_options = {
363
+ "highlight_words": True if params.whisper.word_timestamps else False
364
+ }
365
+
366
  progress(0, desc="Loading Audio from Youtube..")
367
  yt = get_ytdata(youtube_link)
368
  audio = get_ytaudio(yt)
 
370
  transcribed_segments, time_for_task = self.run(
371
  audio,
372
  progress,
373
+ file_format,
374
  add_timestamp,
375
+ *pipeline_params,
376
  )
377
 
378
  progress(1, desc="Completed!")
379
 
380
  file_name = safe_filename(yt.title)
381
+ subtitle, file_path = generate_file(
382
+ output_dir=self.output_dir,
383
+ output_file_name=file_name,
384
+ output_format=file_format,
385
+ result=transcribed_segments,
386
  add_timestamp=add_timestamp,
387
+ **writer_options
 
388
  )
389
+
390
  result_str = f"Done in {self.format_time(time_for_task)}! Subtitle file is in the outputs folder.\n\n{subtitle}"
391
 
392
  if os.path.exists(audio):
393
  os.remove(audio)
394
 
395
+ return result_str, file_path
396
 
397
  except Exception as e:
398
+ print(f"Error transcribing youtube: {e}")
399
+ raise
400
  finally:
401
  self.release_cuda_memory()
402
 
403
+ def get_compute_type(self):
404
+ if "float16" in self.available_compute_types:
405
+ return "float16"
406
+ if "float32" in self.available_compute_types:
407
+ return "float32"
408
  else:
409
+ return self.available_compute_types[0]
 
 
 
 
 
 
 
 
 
410
 
411
+ def get_available_compute_type(self):
412
+ if self.device == "cuda":
413
+ return list(ctranslate2.get_supported_compute_types("cuda"))
414
+ else:
415
+ return list(ctranslate2.get_supported_compute_types("cpu"))
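The two helpers above pick the default compute type from what ctranslate2 reports for the current device. A quick standalone illustration of that selection logic, not part of the commit, with the fallback order float16, then float32, then whatever the backend lists first:

import ctranslate2

def pick_default_compute_type(device: str) -> str:
    # Mirrors get_available_compute_type() + get_compute_type() above.
    available = list(ctranslate2.get_supported_compute_types("cuda" if device == "cuda" else "cpu"))
    if "float16" in available:
        return "float16"
    if "float32" in available:
        return "float32"
    return available[0]

print(pick_default_compute_type("cpu"))  # e.g. "float32" or "int8", depending on the build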
 
416
 
417
  @staticmethod
418
  def format_time(elapsed_time: float) -> str:
 
446
  if torch.cuda.is_available():
447
  return "cuda"
448
  elif torch.backends.mps.is_available():
449
+ if not BaseTranscriptionPipeline.is_sparse_api_supported():
450
  # Device `SparseMPS` is not supported for now. See : https://github.com/pytorch/pytorch/issues/87886
451
  return "cpu"
452
  return "mps"
 
487
  if file_path and os.path.exists(file_path):
488
  os.remove(file_path)
489
 
490
+ @staticmethod
491
+ def validate_gradio_values(params: TranscriptionPipelineParams):
492
+ """
493
+ Validate gradio specific values that can't be displayed as None in the UI.
494
+ Related issue : https://github.com/gradio-app/gradio/issues/8723
495
+ """
496
+ if params.whisper.lang is None:
497
+ pass
498
+ elif params.whisper.lang == AUTOMATIC_DETECTION:
499
+ params.whisper.lang = None
500
+ else:
501
+ language_code_dict = {value: key for key, value in whisper.tokenizer.LANGUAGES.items()}
502
+ params.whisper.lang = language_code_dict[params.whisper.lang]
503
+
504
+ if params.whisper.initial_prompt == GRADIO_NONE_STR:
505
+ params.whisper.initial_prompt = None
506
+ if params.whisper.prefix == GRADIO_NONE_STR:
507
+ params.whisper.prefix = None
508
+ if params.whisper.hotwords == GRADIO_NONE_STR:
509
+ params.whisper.hotwords = None
510
+ if params.whisper.max_new_tokens == GRADIO_NONE_NUMBER_MIN:
511
+ params.whisper.max_new_tokens = None
512
+ if params.whisper.hallucination_silence_threshold == GRADIO_NONE_NUMBER_MIN:
513
+ params.whisper.hallucination_silence_threshold = None
514
+ if params.whisper.language_detection_threshold == GRADIO_NONE_NUMBER_MIN:
515
+ params.whisper.language_detection_threshold = None
516
+ if params.vad.max_speech_duration_s == GRADIO_NONE_NUMBER_MAX:
517
+ params.vad.max_speech_duration_s = float('inf')
518
+ return params
519
+
520
  @staticmethod
521
  def cache_parameters(
522
+ params: TranscriptionPipelineParams,
523
+ file_format: str = "SRT",
524
+ add_timestamp: bool = True
525
  ):
526
+ """Cache parameters to the yaml file"""
527
  cached_params = load_yaml(DEFAULT_PARAMETERS_CONFIG_PATH)
528
+ param_to_cache = params.to_dict()
529
+
530
+ cached_yaml = {**cached_params, **param_to_cache}
531
  cached_yaml["whisper"]["add_timestamp"] = add_timestamp
532
+ cached_yaml["whisper"]["file_format"] = file_format
533
+
534
+ supress_token = cached_yaml["whisper"].get("suppress_tokens", None)
535
+ if supress_token and isinstance(supress_token, list):
536
+ cached_yaml["whisper"]["suppress_tokens"] = str(supress_token)
537
+
538
+ if cached_yaml["whisper"].get("lang", None) is None:
539
+ cached_yaml["whisper"]["lang"] = AUTOMATIC_DETECTION.unwrap()
540
+ else:
541
+ language_dict = whisper.tokenizer.LANGUAGES
542
+ cached_yaml["whisper"]["lang"] = language_dict[cached_yaml["whisper"]["lang"]]
543
+
544
+ if cached_yaml["vad"].get("max_speech_duration_s", float('inf')) == float('inf'):
545
+ cached_yaml["vad"]["max_speech_duration_s"] = GRADIO_NONE_NUMBER_MAX
546
 
547
+ if cached_yaml is not None and cached_yaml:
548
+ save_yaml(cached_yaml, DEFAULT_PARAMETERS_CONFIG_PATH)
549
 
550
  @staticmethod
551
  def resample_audio(audio: Union[str, np.ndarray],
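With this refactor, callers no longer pass a loose tuple of Whisper arguments; they flatten a TranscriptionPipelineParams and splat it into run(), which rebuilds the object with from_list() and normalizes Gradio sentinel values via validate_gradio_values() before caching them. A minimal usage sketch, assuming the repository modules are importable and using a hypothetical "sample.wav" path:

import gradio as gr
from modules.whisper.data_classes import TranscriptionPipelineParams, WhisperParams, VadParams
from modules.whisper.faster_whisper_inference import FasterWhisperInference

pipeline = FasterWhisperInference()

# Structured parameters, flattened exactly the way the Gradio UI passes them.
params = TranscriptionPipelineParams(
    whisper=WhisperParams(model_size="large-v2"),
    vad=VadParams(vad_filter=True),
)

segments, elapsed_time = pipeline.run(
    "sample.wav",        # audio file path (hypothetical)
    gr.Progress(),       # progress indicator
    "SRT",               # file_format, cached alongside the other parameters
    True,                # add_timestamp
    *params.to_list(),   # rebuilt inside run() via TranscriptionPipelineParams.from_list()
)
print(f"{len(segments)} segments in {elapsed_time:.1f}s")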
modules/whisper/data_classes.py ADDED
@@ -0,0 +1,608 @@
1
+ import faster_whisper.transcribe
2
+ import gradio as gr
3
+ import torch
4
+ from typing import Optional, Dict, List, Union, NamedTuple
5
+ from pydantic import BaseModel, Field, field_validator, ConfigDict
6
+ from gradio_i18n import Translate, gettext as _
7
+ from enum import Enum
8
+ from copy import deepcopy
9
+
10
+ import yaml
11
+
12
+ from modules.utils.constants import *
13
+
14
+
15
+ class WhisperImpl(Enum):
16
+ WHISPER = "whisper"
17
+ FASTER_WHISPER = "faster-whisper"
18
+ INSANELY_FAST_WHISPER = "insanely_fast_whisper"
19
+
20
+
21
+ class Segment(BaseModel):
22
+ id: Optional[int] = Field(default=None, description="Incremental id for the segment")
23
+ seek: Optional[int] = Field(default=None, description="Seek of the segment from chunked audio")
24
+ text: Optional[str] = Field(default=None, description="Transcription text of the segment")
25
+ start: Optional[float] = Field(default=None, description="Start time of the segment")
26
+ end: Optional[float] = Field(default=None, description="End time of the segment")
27
+ tokens: Optional[List[int]] = Field(default=None, description="List of token IDs")
28
+ temperature: Optional[float] = Field(default=None, description="Temperature used during the decoding process")
29
+ avg_logprob: Optional[float] = Field(default=None, description="Average log probability of the tokens")
30
+ compression_ratio: Optional[float] = Field(default=None, description="Compression ratio of the segment")
31
+ no_speech_prob: Optional[float] = Field(default=None, description="Probability that it's not speech")
32
+ words: Optional[List['Word']] = Field(default=None, description="List of words contained in the segment")
33
+
34
+ @classmethod
35
+ def from_faster_whisper(cls,
36
+ seg: faster_whisper.transcribe.Segment):
37
+ if seg.words is not None:
38
+ words = [
39
+ Word(
40
+ start=w.start,
41
+ end=w.end,
42
+ word=w.word,
43
+ probability=w.probability
44
+ ) for w in seg.words
45
+ ]
46
+ else:
47
+ words = None
48
+
49
+ return cls(
50
+ id=seg.id,
51
+ seek=seg.seek,
52
+ text=seg.text,
53
+ start=seg.start,
54
+ end=seg.end,
55
+ tokens=seg.tokens,
56
+ temperature=seg.temperature,
57
+ avg_logprob=seg.avg_logprob,
58
+ compression_ratio=seg.compression_ratio,
59
+ no_speech_prob=seg.no_speech_prob,
60
+ words=words
61
+ )
62
+
63
+
64
+ class Word(BaseModel):
65
+ start: Optional[float] = Field(default=None, description="Start time of the word")
66
+ end: Optional[float] = Field(default=None, description="End time of the word")
67
+ word: Optional[str] = Field(default=None, description="Word text")
68
+ probability: Optional[float] = Field(default=None, description="Probability of the word")
69
+
70
+
71
+ class BaseParams(BaseModel):
72
+ model_config = ConfigDict(protected_namespaces=())
73
+
74
+ def to_dict(self) -> Dict:
75
+ return self.model_dump()
76
+
77
+ def to_list(self) -> List:
78
+ return list(self.model_dump().values())
79
+
80
+ @classmethod
81
+ def from_list(cls, data_list: List) -> 'BaseParams':
82
+ field_names = list(cls.model_fields.keys())
83
+ return cls(**dict(zip(field_names, data_list)))
84
+
85
+
86
+ class VadParams(BaseParams):
87
+ """Voice Activity Detection parameters"""
88
+ vad_filter: bool = Field(default=False, description="Enable voice activity detection to filter out non-speech parts")
89
+ threshold: float = Field(
90
+ default=0.5,
91
+ ge=0.0,
92
+ le=1.0,
93
+ description="Speech threshold for Silero VAD. Probabilities above this value are considered speech"
94
+ )
95
+ min_speech_duration_ms: int = Field(
96
+ default=250,
97
+ ge=0,
98
+ description="Final speech chunks shorter than this are discarded"
99
+ )
100
+ max_speech_duration_s: float = Field(
101
+ default=float("inf"),
102
+ gt=0,
103
+ description="Maximum duration of speech chunks in seconds"
104
+ )
105
+ min_silence_duration_ms: int = Field(
106
+ default=2000,
107
+ ge=0,
108
+ description="Minimum silence duration between speech chunks"
109
+ )
110
+ speech_pad_ms: int = Field(
111
+ default=400,
112
+ ge=0,
113
+ description="Padding added to each side of speech chunks"
114
+ )
115
+
116
+ @classmethod
117
+ def to_gradio_inputs(cls, defaults: Optional[Dict] = None) -> List[gr.components.base.FormComponent]:
118
+ return [
119
+ gr.Checkbox(
120
+ label=_("Enable Silero VAD Filter"),
121
+ value=defaults.get("vad_filter", cls.__fields__["vad_filter"].default),
122
+ interactive=True,
123
+ info=_("Enable this to transcribe only detected voice")
124
+ ),
125
+ gr.Slider(
126
+ minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold",
127
+ value=defaults.get("threshold", cls.__fields__["threshold"].default),
128
+ info="Lower it to be more sensitive to small sounds."
129
+ ),
130
+ gr.Number(
131
+ label="Minimum Speech Duration (ms)", precision=0,
132
+ value=defaults.get("min_speech_duration_ms", cls.__fields__["min_speech_duration_ms"].default),
133
+ info="Final speech chunks shorter than this time are thrown out"
134
+ ),
135
+ gr.Number(
136
+ label="Maximum Speech Duration (s)",
137
+ value=defaults.get("max_speech_duration_s", GRADIO_NONE_NUMBER_MAX),
138
+ info="Maximum duration of speech chunks in \"seconds\"."
139
+ ),
140
+ gr.Number(
141
+ label="Minimum Silence Duration (ms)", precision=0,
142
+ value=defaults.get("min_silence_duration_ms", cls.__fields__["min_silence_duration_ms"].default),
143
+ info="In the end of each speech chunk wait for this time before separating it"
144
+ ),
145
+ gr.Number(
146
+ label="Speech Padding (ms)", precision=0,
147
+ value=defaults.get("speech_pad_ms", cls.__fields__["speech_pad_ms"].default),
148
+ info="Final speech chunks are padded by this time each side"
149
+ )
150
+ ]
151
+
152
+
153
+ class DiarizationParams(BaseParams):
154
+ """Speaker diarization parameters"""
155
+ is_diarize: bool = Field(default=False, description="Enable speaker diarization")
156
+ device: str = Field(default="cuda", description="Device to run Diarization model.")
157
+ hf_token: str = Field(
158
+ default="",
159
+ description="Hugging Face token for downloading diarization models"
160
+ )
161
+
162
+ @classmethod
163
+ def to_gradio_inputs(cls,
164
+ defaults: Optional[Dict] = None,
165
+ available_devices: Optional[List] = None,
166
+ device: Optional[str] = None) -> List[gr.components.base.FormComponent]:
167
+ return [
168
+ gr.Checkbox(
169
+ label=_("Enable Diarization"),
170
+ value=defaults.get("is_diarize", cls.__fields__["is_diarize"].default),
171
+ ),
172
+ gr.Dropdown(
173
+ label=_("Device"),
174
+ choices=["cpu", "cuda"] if available_devices is None else available_devices,
175
+ value=defaults.get("device", device),
176
+ ),
177
+ gr.Textbox(
178
+ label=_("HuggingFace Token"),
179
+ value=defaults.get("hf_token", cls.__fields__["hf_token"].default),
180
+ info=_("This is only needed the first time you download the model")
181
+ ),
182
+ ]
183
+
184
+
185
+ class BGMSeparationParams(BaseParams):
186
+ """Background music separation parameters"""
187
+ is_separate_bgm: bool = Field(default=False, description="Enable background music separation")
188
+ model_size: str = Field(
189
+ default="UVR-MDX-NET-Inst_HQ_4",
190
+ description="UVR model size"
191
+ )
192
+ device: str = Field(default="cuda", description="Device to run UVR model.")
193
+ segment_size: int = Field(
194
+ default=256,
195
+ gt=0,
196
+ description="Segment size for UVR model"
197
+ )
198
+ save_file: bool = Field(
199
+ default=False,
200
+ description="Whether to save separated audio files"
201
+ )
202
+ enable_offload: bool = Field(
203
+ default=True,
204
+ description="Offload UVR model after transcription"
205
+ )
206
+
207
+ @classmethod
208
+ def to_gradio_input(cls,
209
+ defaults: Optional[Dict] = None,
210
+ available_devices: Optional[List] = None,
211
+ device: Optional[str] = None,
212
+ available_models: Optional[List] = None) -> List[gr.components.base.FormComponent]:
213
+ return [
214
+ gr.Checkbox(
215
+ label=_("Enable Background Music Remover Filter"),
216
+ value=defaults.get("is_separate_bgm", cls.__fields__["is_separate_bgm"].default),
217
+ interactive=True,
218
+ info=_("Enabling this will remove background music")
219
+ ),
220
+ gr.Dropdown(
221
+ label=_("Model"),
222
+ choices=["UVR-MDX-NET-Inst_HQ_4",
223
+ "UVR-MDX-NET-Inst_3"] if available_models is None else available_models,
224
+ value=defaults.get("model_size", cls.__fields__["model_size"].default),
225
+ ),
226
+ gr.Dropdown(
227
+ label=_("Device"),
228
+ choices=["cpu", "cuda"] if available_devices is None else available_devices,
229
+ value=defaults.get("device", device),
230
+ ),
231
+ gr.Number(
232
+ label="Segment Size",
233
+ value=defaults.get("segment_size", cls.__fields__["segment_size"].default),
234
+ precision=0,
235
+ info="Segment size for UVR model"
236
+ ),
237
+ gr.Checkbox(
238
+ label=_("Save separated files to output"),
239
+ value=defaults.get("save_file", cls.__fields__["save_file"].default),
240
+ ),
241
+ gr.Checkbox(
242
+ label=_("Offload sub model after removing background music"),
243
+ value=defaults.get("enable_offload", cls.__fields__["enable_offload"].default),
244
+ )
245
+ ]
246
+
247
+
248
+ class WhisperParams(BaseParams):
249
+ """Whisper parameters"""
250
+ model_size: str = Field(default="large-v2", description="Whisper model size")
251
+ lang: Optional[str] = Field(default=None, description="Source language of the file to transcribe")
252
+ is_translate: bool = Field(default=False, description="Translate speech to English end-to-end")
253
+ beam_size: int = Field(default=5, ge=1, description="Beam size for decoding")
254
+ log_prob_threshold: float = Field(
255
+ default=-1.0,
256
+ description="Threshold for average log probability of sampled tokens"
257
+ )
258
+ no_speech_threshold: float = Field(
259
+ default=0.6,
260
+ ge=0.0,
261
+ le=1.0,
262
+ description="Threshold for detecting silence"
263
+ )
264
+ compute_type: str = Field(default="float16", description="Computation type for transcription")
265
+ best_of: int = Field(default=5, ge=1, description="Number of candidates when sampling")
266
+ patience: float = Field(default=1.0, gt=0, description="Beam search patience factor")
267
+ condition_on_previous_text: bool = Field(
268
+ default=True,
269
+ description="Use previous output as prompt for next window"
270
+ )
271
+ prompt_reset_on_temperature: float = Field(
272
+ default=0.5,
273
+ ge=0.0,
274
+ le=1.0,
275
+ description="Temperature threshold for resetting prompt"
276
+ )
277
+ initial_prompt: Optional[str] = Field(default=None, description="Initial prompt for first window")
278
+ temperature: float = Field(
279
+ default=0.0,
280
+ ge=0.0,
281
+ description="Temperature for sampling"
282
+ )
283
+ compression_ratio_threshold: float = Field(
284
+ default=2.4,
285
+ gt=0,
286
+ description="Threshold for gzip compression ratio"
287
+ )
288
+ length_penalty: float = Field(default=1.0, gt=0, description="Exponential length penalty")
289
+ repetition_penalty: float = Field(default=1.0, gt=0, description="Penalty for repeated tokens")
290
+ no_repeat_ngram_size: int = Field(default=0, ge=0, description="Size of n-grams to prevent repetition")
291
+ prefix: Optional[str] = Field(default=None, description="Prefix text for first window")
292
+ suppress_blank: bool = Field(
293
+ default=True,
294
+ description="Suppress blank outputs at start of sampling"
295
+ )
296
+ suppress_tokens: Optional[Union[List[int], str]] = Field(default=[-1], description="Token IDs to suppress")
297
+ max_initial_timestamp: float = Field(
298
+ default=1.0,
299
+ ge=0.0,
300
+ description="Maximum initial timestamp"
301
+ )
302
+ word_timestamps: bool = Field(default=False, description="Extract word-level timestamps")
303
+ prepend_punctuations: Optional[str] = Field(
304
+ default="\"'“¿([{-",
305
+ description="Punctuations to merge with next word"
306
+ )
307
+ append_punctuations: Optional[str] = Field(
308
+ default="\"'.。,,!!??::”)]}、",
309
+ description="Punctuations to merge with previous word"
310
+ )
311
+ max_new_tokens: Optional[int] = Field(default=None, description="Maximum number of new tokens per chunk")
312
+ chunk_length: Optional[int] = Field(default=30, description="Length of audio segments in seconds")
313
+ hallucination_silence_threshold: Optional[float] = Field(
314
+ default=None,
315
+ description="Threshold for skipping silent periods in hallucination detection"
316
+ )
317
+ hotwords: Optional[str] = Field(default=None, description="Hotwords/hint phrases for the model")
318
+ language_detection_threshold: Optional[float] = Field(
319
+ default=None,
320
+ description="Threshold for language detection probability"
321
+ )
322
+ language_detection_segments: int = Field(
323
+ default=1,
324
+ gt=0,
325
+ description="Number of segments for language detection"
326
+ )
327
+ batch_size: int = Field(default=24, gt=0, description="Batch size for processing")
328
+
329
+ @field_validator('lang')
330
+ def validate_lang(cls, v):
331
+ from modules.utils.constants import AUTOMATIC_DETECTION
332
+ return None if v == AUTOMATIC_DETECTION.unwrap() else v
333
+
334
+ @field_validator('suppress_tokens')
335
+ def validate_supress_tokens(cls, v):
336
+ import ast
337
+ try:
338
+ if isinstance(v, str):
339
+ suppress_tokens = ast.literal_eval(v)
340
+ if not isinstance(suppress_tokens, list):
341
+ raise ValueError("Invalid Suppress Tokens. The value must be type of List[int]")
342
+ return suppress_tokens
343
+ if isinstance(v, list):
344
+ return v
345
+ except Exception as e:
346
+ raise ValueError(f"Invalid Suppress Tokens. The value must be type of List[int]: {e}")
347
+
348
+ @classmethod
349
+ def to_gradio_inputs(cls,
350
+ defaults: Optional[Dict] = None,
351
+ only_advanced: Optional[bool] = True,
352
+ whisper_type: Optional[str] = None,
353
+ available_models: Optional[List] = None,
354
+ available_langs: Optional[List] = None,
355
+ available_compute_types: Optional[List] = None,
356
+ compute_type: Optional[str] = None):
357
+ whisper_type = WhisperImpl.FASTER_WHISPER.value if whisper_type is None else whisper_type.strip().lower()
358
+
359
+ inputs = []
360
+ if not only_advanced:
361
+ inputs += [
362
+ gr.Dropdown(
363
+ label=_("Model"),
364
+ choices=available_models,
365
+ value=defaults.get("model_size", cls.__fields__["model_size"].default),
366
+ ),
367
+ gr.Dropdown(
368
+ label=_("Language"),
369
+ choices=available_langs,
370
+ value=defaults.get("lang", AUTOMATIC_DETECTION),
371
+ ),
372
+ gr.Checkbox(
373
+ label=_("Translate to English?"),
374
+ value=defaults.get("is_translate", cls.__fields__["is_translate"].default),
375
+ ),
376
+ ]
377
+
378
+ inputs += [
379
+ gr.Number(
380
+ label="Beam Size",
381
+ value=defaults.get("beam_size", cls.__fields__["beam_size"].default),
382
+ precision=0,
383
+ info="Beam size for decoding"
384
+ ),
385
+ gr.Number(
386
+ label="Log Probability Threshold",
387
+ value=defaults.get("log_prob_threshold", cls.__fields__["log_prob_threshold"].default),
388
+ info="Threshold for average log probability of sampled tokens"
389
+ ),
390
+ gr.Number(
391
+ label="No Speech Threshold",
392
+ value=defaults.get("no_speech_threshold", cls.__fields__["no_speech_threshold"].default),
393
+ info="Threshold for detecting silence"
394
+ ),
395
+ gr.Dropdown(
396
+ label="Compute Type",
397
+ choices=["float16", "int8", "int16"] if available_compute_types is None else available_compute_types,
398
+ value=defaults.get("compute_type", compute_type),
399
+ info="Computation type for transcription"
400
+ ),
401
+ gr.Number(
402
+ label="Best Of",
403
+ value=defaults.get("best_of", cls.__fields__["best_of"].default),
404
+ precision=0,
405
+ info="Number of candidates when sampling"
406
+ ),
407
+ gr.Number(
408
+ label="Patience",
409
+ value=defaults.get("patience", cls.__fields__["patience"].default),
410
+ info="Beam search patience factor"
411
+ ),
412
+ gr.Checkbox(
413
+ label="Condition On Previous Text",
414
+ value=defaults.get("condition_on_previous_text", cls.__fields__["condition_on_previous_text"].default),
415
+ info="Use previous output as prompt for next window"
416
+ ),
417
+ gr.Slider(
418
+ label="Prompt Reset On Temperature",
419
+ value=defaults.get("prompt_reset_on_temperature",
420
+ cls.__fields__["prompt_reset_on_temperature"].default),
421
+ minimum=0,
422
+ maximum=1,
423
+ step=0.01,
424
+ info="Temperature threshold for resetting prompt"
425
+ ),
426
+ gr.Textbox(
427
+ label="Initial Prompt",
428
+ value=defaults.get("initial_prompt", GRADIO_NONE_STR),
429
+ info="Initial prompt for first window"
430
+ ),
431
+ gr.Slider(
432
+ label="Temperature",
433
+ value=defaults.get("temperature", cls.__fields__["temperature"].default),
434
+ minimum=0.0,
435
+ step=0.01,
436
+ maximum=1.0,
437
+ info="Temperature for sampling"
438
+ ),
439
+ gr.Number(
440
+ label="Compression Ratio Threshold",
441
+ value=defaults.get("compression_ratio_threshold",
442
+ cls.__fields__["compression_ratio_threshold"].default),
443
+ info="Threshold for gzip compression ratio"
444
+ )
445
+ ]
446
+
447
+ faster_whisper_inputs = [
448
+ gr.Number(
449
+ label="Length Penalty",
450
+ value=defaults.get("length_penalty", cls.__fields__["length_penalty"].default),
451
+ info="Exponential length penalty",
452
+ ),
453
+ gr.Number(
454
+ label="Repetition Penalty",
455
+ value=defaults.get("repetition_penalty", cls.__fields__["repetition_penalty"].default),
456
+ info="Penalty for repeated tokens"
457
+ ),
458
+ gr.Number(
459
+ label="No Repeat N-gram Size",
460
+ value=defaults.get("no_repeat_ngram_size", cls.__fields__["no_repeat_ngram_size"].default),
461
+ precision=0,
462
+ info="Size of n-grams to prevent repetition"
463
+ ),
464
+ gr.Textbox(
465
+ label="Prefix",
466
+ value=defaults.get("prefix", GRADIO_NONE_STR),
467
+ info="Prefix text for first window"
468
+ ),
469
+ gr.Checkbox(
470
+ label="Suppress Blank",
471
+ value=defaults.get("suppress_blank", cls.__fields__["suppress_blank"].default),
472
+ info="Suppress blank outputs at start of sampling"
473
+ ),
474
+ gr.Textbox(
475
+ label="Suppress Tokens",
476
+ value=defaults.get("suppress_tokens", "[-1]"),
477
+ info="Token IDs to suppress"
478
+ ),
479
+ gr.Number(
480
+ label="Max Initial Timestamp",
481
+ value=defaults.get("max_initial_timestamp", cls.__fields__["max_initial_timestamp"].default),
482
+ info="Maximum initial timestamp"
483
+ ),
484
+ gr.Checkbox(
485
+ label="Word Timestamps",
486
+ value=defaults.get("word_timestamps", cls.__fields__["word_timestamps"].default),
487
+ info="Extract word-level timestamps"
488
+ ),
489
+ gr.Textbox(
490
+ label="Prepend Punctuations",
491
+ value=defaults.get("prepend_punctuations", cls.__fields__["prepend_punctuations"].default),
492
+ info="Punctuations to merge with next word"
493
+ ),
494
+ gr.Textbox(
495
+ label="Append Punctuations",
496
+ value=defaults.get("append_punctuations", cls.__fields__["append_punctuations"].default),
497
+ info="Punctuations to merge with previous word"
498
+ ),
499
+ gr.Number(
500
+ label="Max New Tokens",
501
+ value=defaults.get("max_new_tokens", GRADIO_NONE_NUMBER_MIN),
502
+ precision=0,
503
+ info="Maximum number of new tokens per chunk"
504
+ ),
505
+ gr.Number(
506
+ label="Chunk Length (s)",
507
+ value=defaults.get("chunk_length", cls.__fields__["chunk_length"].default),
508
+ precision=0,
509
+ info="Length of audio segments in seconds"
510
+ ),
511
+ gr.Number(
512
+ label="Hallucination Silence Threshold (sec)",
513
+ value=defaults.get("hallucination_silence_threshold",
514
+ GRADIO_NONE_NUMBER_MIN),
515
+ info="Threshold for skipping silent periods in hallucination detection"
516
+ ),
517
+ gr.Textbox(
518
+ label="Hotwords",
519
+ value=defaults.get("hotwords", cls.__fields__["hotwords"].default),
520
+ info="Hotwords/hint phrases for the model"
521
+ ),
522
+ gr.Number(
523
+ label="Language Detection Threshold",
524
+ value=defaults.get("language_detection_threshold",
525
+ GRADIO_NONE_NUMBER_MIN),
526
+ info="Threshold for language detection probability"
527
+ ),
528
+ gr.Number(
529
+ label="Language Detection Segments",
530
+ value=defaults.get("language_detection_segments",
531
+ cls.__fields__["language_detection_segments"].default),
532
+ precision=0,
533
+ info="Number of segments for language detection"
534
+ )
535
+ ]
536
+
537
+ insanely_fast_whisper_inputs = [
538
+ gr.Number(
539
+ label="Batch Size",
540
+ value=defaults.get("batch_size", cls.__fields__["batch_size"].default),
541
+ precision=0,
542
+ info="Batch size for processing"
543
+ )
544
+ ]
545
+
546
+ if whisper_type != WhisperImpl.FASTER_WHISPER.value:
547
+ for input_component in faster_whisper_inputs:
548
+ input_component.visible = False
549
+
550
+ if whisper_type != WhisperImpl.INSANELY_FAST_WHISPER.value:
551
+ for input_component in insanely_fast_whisper_inputs:
552
+ input_component.visible = False
553
+
554
+ inputs += faster_whisper_inputs + insanely_fast_whisper_inputs
555
+
556
+ return inputs
557
+
558
+
559
+ class TranscriptionPipelineParams(BaseModel):
560
+ """Transcription pipeline parameters"""
561
+ whisper: WhisperParams = Field(default_factory=WhisperParams)
562
+ vad: VadParams = Field(default_factory=VadParams)
563
+ diarization: DiarizationParams = Field(default_factory=DiarizationParams)
564
+ bgm_separation: BGMSeparationParams = Field(default_factory=BGMSeparationParams)
565
+
566
+ def to_dict(self) -> Dict:
567
+ data = {
568
+ "whisper": self.whisper.to_dict(),
569
+ "vad": self.vad.to_dict(),
570
+ "diarization": self.diarization.to_dict(),
571
+ "bgm_separation": self.bgm_separation.to_dict()
572
+ }
573
+ return data
574
+
575
+ def to_list(self) -> List:
576
+ """
577
+ Convert the data class to a flat list, because the parameters have to be passed to Gradio as a list.
578
+ Related Gradio issue: https://github.com/gradio-app/gradio/issues/2471
579
+ See more about Gradio pre-processing: https://www.gradio.app/docs/components
580
+ """
581
+ whisper_list = self.whisper.to_list()
582
+ vad_list = self.vad.to_list()
583
+ diarization_list = self.diarization.to_list()
584
+ bgm_sep_list = self.bgm_separation.to_list()
585
+ return whisper_list + vad_list + diarization_list + bgm_sep_list
586
+
587
+ @staticmethod
588
+ def from_list(pipeline_list: List) -> 'TranscriptionPipelineParams':
589
+ """Convert list to the data class again to use it in a function."""
590
+ data_list = deepcopy(pipeline_list)
591
+
592
+ whisper_list = data_list[0:len(WhisperParams.__annotations__)]
593
+ data_list = data_list[len(WhisperParams.__annotations__):]
594
+
595
+ vad_list = data_list[0:len(VadParams.__annotations__)]
596
+ data_list = data_list[len(VadParams.__annotations__):]
597
+
598
+ diarization_list = data_list[0:len(DiarizationParams.__annotations__)]
599
+ data_list = data_list[len(DiarizationParams.__annotations__):]
600
+
601
+ bgm_sep_list = data_list[0:len(BGMSeparationParams.__annotations__)]
602
+
603
+ return TranscriptionPipelineParams(
604
+ whisper=WhisperParams.from_list(whisper_list),
605
+ vad=VadParams.from_list(vad_list),
606
+ diarization=DiarizationParams.from_list(diarization_list),
607
+ bgm_separation=BGMSeparationParams.from_list(bgm_sep_list)
608
+ )
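The flat-list round trip is the contract the rest of the refactor relies on: to_list() emits fields in declaration order, and from_list() slices the combined list back into the four sub-models by their field counts. A small sketch of that round trip, assuming the repository is on PYTHONPATH:

from modules.whisper.data_classes import (
    TranscriptionPipelineParams, WhisperParams, VadParams,
    DiarizationParams, BGMSeparationParams,
)

params = TranscriptionPipelineParams(
    whisper=WhisperParams(model_size="medium", beam_size=3),
    vad=VadParams(vad_filter=True, threshold=0.6),
)

# whisper fields first, then vad, diarization and bgm_separation fields.
flat = params.to_list()
expected_length = sum(
    len(cls.model_fields)
    for cls in (WhisperParams, VadParams, DiarizationParams, BGMSeparationParams)
)
assert len(flat) == expected_length

# Slicing the flat list back restores the same values.
restored = TranscriptionPipelineParams.from_list(flat)
assert restored.whisper.beam_size == 3
assert restored.vad.threshold == 0.6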
modules/whisper/faster_whisper_inference.py CHANGED
@@ -12,11 +12,11 @@ import gradio as gr
12
  from argparse import Namespace
13
 
14
  from modules.utils.paths import (FASTER_WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, UVR_MODELS_DIR, OUTPUT_DIR)
15
- from modules.whisper.whisper_parameter import *
16
- from modules.whisper.whisper_base import WhisperBase
17
 
18
 
19
- class FasterWhisperInference(WhisperBase):
20
  def __init__(self,
21
  model_dir: str = FASTER_WHISPER_MODELS_DIR,
22
  diarization_model_dir: str = DIARIZATION_MODELS_DIR,
@@ -35,14 +35,12 @@ class FasterWhisperInference(WhisperBase):
35
  self.model_paths = self.get_model_paths()
36
  self.device = self.get_device()
37
  self.available_models = self.model_paths.keys()
38
- self.available_compute_types = ctranslate2.get_supported_compute_types(
39
- "cuda") if self.device == "cuda" else ctranslate2.get_supported_compute_types("cpu")
40
 
41
  def transcribe(self,
42
  audio: Union[str, BinaryIO, np.ndarray],
43
  progress: gr.Progress = gr.Progress(),
44
  *whisper_params,
45
- ) -> Tuple[List[dict], float]:
46
  """
47
  transcribe method for faster-whisper.
48
 
@@ -57,28 +55,18 @@ class FasterWhisperInference(WhisperBase):
57
 
58
  Returns
59
  ----------
60
- segments_result: List[dict]
61
- list of dicts that includes start, end timestamps and transcribed text
62
  elapsed_time: float
63
  elapsed time for transcription
64
  """
65
  start_time = time.time()
66
 
67
- params = WhisperParameters.as_value(*whisper_params)
68
 
69
  if params.model_size != self.current_model_size or self.model is None or self.current_compute_type != params.compute_type:
70
  self.update_model(params.model_size, params.compute_type, progress)
71
 
72
- # None parameters with Textboxes: https://github.com/gradio-app/gradio/issues/8723
73
- if not params.initial_prompt:
74
- params.initial_prompt = None
75
- if not params.prefix:
76
- params.prefix = None
77
- if not params.hotwords:
78
- params.hotwords = None
79
-
80
- params.suppress_tokens = self.format_suppress_tokens_str(params.suppress_tokens)
81
-
82
  segments, info = self.model.transcribe(
83
  audio=audio,
84
  language=params.lang,
@@ -114,11 +102,7 @@ class FasterWhisperInference(WhisperBase):
114
  segments_result = []
115
  for segment in segments:
116
  progress(segment.start / info.duration, desc="Transcribing..")
117
- segments_result.append({
118
- "start": segment.start,
119
- "end": segment.end,
120
- "text": segment.text
121
- })
122
 
123
  elapsed_time = time.time() - start_time
124
  return segments_result, elapsed_time
 
12
  from argparse import Namespace
13
 
14
  from modules.utils.paths import (FASTER_WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, UVR_MODELS_DIR, OUTPUT_DIR)
15
+ from modules.whisper.data_classes import *
16
+ from modules.whisper.base_transcription_pipeline import BaseTranscriptionPipeline
17
 
18
 
19
+ class FasterWhisperInference(BaseTranscriptionPipeline):
20
  def __init__(self,
21
  model_dir: str = FASTER_WHISPER_MODELS_DIR,
22
  diarization_model_dir: str = DIARIZATION_MODELS_DIR,
 
35
  self.model_paths = self.get_model_paths()
36
  self.device = self.get_device()
37
  self.available_models = self.model_paths.keys()
 
 
38
 
39
  def transcribe(self,
40
  audio: Union[str, BinaryIO, np.ndarray],
41
  progress: gr.Progress = gr.Progress(),
42
  *whisper_params,
43
+ ) -> Tuple[List[Segment], float]:
44
  """
45
  transcribe method for faster-whisper.
46
 
 
55
 
56
  Returns
57
  ----------
58
+ segments_result: List[Segment]
59
+ list of Segment that includes start, end timestamps and transcribed text
60
  elapsed_time: float
61
  elapsed time for transcription
62
  """
63
  start_time = time.time()
64
 
65
+ params = WhisperParams.from_list(list(whisper_params))
66
 
67
  if params.model_size != self.current_model_size or self.model is None or self.current_compute_type != params.compute_type:
68
  self.update_model(params.model_size, params.compute_type, progress)
69
 
 
 
 
 
 
 
 
 
 
 
70
  segments, info = self.model.transcribe(
71
  audio=audio,
72
  language=params.lang,
 
102
  segments_result = []
103
  for segment in segments:
104
  progress(segment.start / info.duration, desc="Transcribing..")
105
+ segments_result.append(Segment.from_faster_whisper(segment))
 
 
 
 
106
 
107
  elapsed_time = time.time() - start_time
108
  return segments_result, elapsed_time
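At the implementation level the pattern is the same but narrower: transcribe() receives only the flattened WhisperParams and now returns Segment models instead of plain dicts. A hedged sketch of a direct call, assuming the modules are importable; "sample.wav" is a placeholder path:

import gradio as gr
from modules.whisper.data_classes import WhisperParams
from modules.whisper.faster_whisper_inference import FasterWhisperInference

inference = FasterWhisperInference()
whisper_params = WhisperParams(model_size="large-v2", word_timestamps=True)

segments, elapsed_time = inference.transcribe(
    "sample.wav",
    gr.Progress(),
    *whisper_params.to_list(),  # rebuilt inside transcribe() via WhisperParams.from_list()
)

for seg in segments:  # each item is a Segment pydantic model, not a dict
    print(f"[{seg.start:.2f} -> {seg.end:.2f}]{seg.text}")
    for word in seg.words or []:  # word-level timestamps when word_timestamps=True
        print(f"    {word.word}: {word.start:.2f}-{word.end:.2f}")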
modules/whisper/insanely_fast_whisper_inference.py CHANGED
@@ -12,11 +12,11 @@ from rich.progress import Progress, TimeElapsedColumn, BarColumn, TextColumn
12
  from argparse import Namespace
13
 
14
  from modules.utils.paths import (INSANELY_FAST_WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, UVR_MODELS_DIR, OUTPUT_DIR)
15
- from modules.whisper.whisper_parameter import *
16
- from modules.whisper.whisper_base import WhisperBase
17
 
18
 
19
- class InsanelyFastWhisperInference(WhisperBase):
20
  def __init__(self,
21
  model_dir: str = INSANELY_FAST_WHISPER_MODELS_DIR,
22
  diarization_model_dir: str = DIARIZATION_MODELS_DIR,
@@ -32,16 +32,13 @@ class InsanelyFastWhisperInference(WhisperBase):
32
  self.model_dir = model_dir
33
  os.makedirs(self.model_dir, exist_ok=True)
34
 
35
- openai_models = whisper.available_models()
36
- distil_models = ["distil-large-v2", "distil-large-v3", "distil-medium.en", "distil-small.en"]
37
- self.available_models = openai_models + distil_models
38
- self.available_compute_types = ["float16"]
39
 
40
  def transcribe(self,
41
  audio: Union[str, np.ndarray, torch.Tensor],
42
  progress: gr.Progress = gr.Progress(),
43
  *whisper_params,
44
- ) -> Tuple[List[dict], float]:
45
  """
46
  transcribe method for faster-whisper.
47
 
@@ -56,13 +53,13 @@ class InsanelyFastWhisperInference(WhisperBase):
56
 
57
  Returns
58
  ----------
59
- segments_result: List[dict]
60
- list of dicts that includes start, end timestamps and transcribed text
61
  elapsed_time: float
62
  elapsed time for transcription
63
  """
64
  start_time = time.time()
65
- params = WhisperParameters.as_value(*whisper_params)
66
 
67
  if params.model_size != self.current_model_size or self.model is None or self.current_compute_type != params.compute_type:
68
  self.update_model(params.model_size, params.compute_type, progress)
@@ -96,9 +93,17 @@ class InsanelyFastWhisperInference(WhisperBase):
96
  generate_kwargs=kwargs
97
  )
98
 
99
- segments_result = self.format_result(
100
- transcribed_result=segments,
101
- )
 
 
 
 
 
 
 
 
102
  elapsed_time = time.time() - start_time
103
  return segments_result, elapsed_time
104
 
@@ -139,31 +144,26 @@ class InsanelyFastWhisperInference(WhisperBase):
139
  model_kwargs={"attn_implementation": "flash_attention_2"} if is_flash_attn_2_available() else {"attn_implementation": "sdpa"},
140
  )
141
 
142
- @staticmethod
143
- def format_result(
144
- transcribed_result: dict
145
- ) -> List[dict]:
146
  """
147
- Format the transcription result of insanely_fast_whisper as the same with other implementation.
148
-
149
- Parameters
150
- ----------
151
- transcribed_result: dict
152
- Transcription result of the insanely_fast_whisper
153
 
154
  Returns
155
  ----------
156
- result: List[dict]
157
- Formatted result as the same with other implementation
158
  """
159
- result = transcribed_result["chunks"]
160
- for item in result:
161
- start, end = item["timestamp"][0], item["timestamp"][1]
162
- if end is None:
163
- end = start
164
- item["start"] = start
165
- item["end"] = end
166
- return result
 
 
 
 
167
 
168
  @staticmethod
169
  def download_model(
 
12
  from argparse import Namespace
13
 
14
  from modules.utils.paths import (INSANELY_FAST_WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, UVR_MODELS_DIR, OUTPUT_DIR)
15
+ from modules.whisper.data_classes import *
16
+ from modules.whisper.base_transcription_pipeline import BaseTranscriptionPipeline
17
 
18
 
19
+ class InsanelyFastWhisperInference(BaseTranscriptionPipeline):
20
  def __init__(self,
21
  model_dir: str = INSANELY_FAST_WHISPER_MODELS_DIR,
22
  diarization_model_dir: str = DIARIZATION_MODELS_DIR,
 
32
  self.model_dir = model_dir
33
  os.makedirs(self.model_dir, exist_ok=True)
34
 
35
+ self.available_models = self.get_model_paths()
 
 
 
36
 
37
  def transcribe(self,
38
  audio: Union[str, np.ndarray, torch.Tensor],
39
  progress: gr.Progress = gr.Progress(),
40
  *whisper_params,
41
+ ) -> Tuple[List[Segment], float]:
42
  """
43
  transcribe method for insanely-fast-whisper.
44
 
 
53
 
54
  Returns
55
  ----------
56
+ segments_result: List[Segment]
57
+ list of Segment that includes start, end timestamps and transcribed text
58
  elapsed_time: float
59
  elapsed time for transcription
60
  """
61
  start_time = time.time()
62
+ params = WhisperParams.from_list(list(whisper_params))
63
 
64
  if params.model_size != self.current_model_size or self.model is None or self.current_compute_type != params.compute_type:
65
  self.update_model(params.model_size, params.compute_type, progress)
 
93
  generate_kwargs=kwargs
94
  )
95
 
96
+ segments_result = []
97
+ for item in segments["chunks"]:
98
+ start, end = item["timestamp"][0], item["timestamp"][1]
99
+ if end is None:
100
+ end = start
101
+ segments_result.append(Segment(
102
+ text=item["text"],
103
+ start=start,
104
+ end=end
105
+ ))
106
+
107
  elapsed_time = time.time() - start_time
108
  return segments_result, elapsed_time
109
 
 
144
  model_kwargs={"attn_implementation": "flash_attention_2"} if is_flash_attn_2_available() else {"attn_implementation": "sdpa"},
145
  )
146
 
147
+ def get_model_paths(self):
 
 
 
148
  """
149
+ Get available models from the models directory, including fine-tuned models.
 
 
 
 
 
150
 
151
  Returns
152
  ----------
153
+ List of available model names
 
154
  """
155
+ openai_models = whisper.available_models()
156
+ distil_models = ["distil-large-v2", "distil-large-v3", "distil-medium.en", "distil-small.en"]
157
+ default_models = openai_models + distil_models
158
+
159
+ existing_models = os.listdir(self.model_dir)
160
+ wrong_dirs = [".locks"]
161
+
162
+ available_models = default_models + existing_models
163
+ available_models = [model for model in available_models if model not in wrong_dirs]
164
+ available_models = sorted(set(available_models), key=available_models.index)
165
+
166
+ return available_models
167
 
168
  @staticmethod
169
  def download_model(
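get_model_paths() above merges the built-in model names with whatever folders already exist under the model directory, then de-duplicates while keeping the original order. A standalone illustration of that idiom; the folder names here are made up:

default_models = ["tiny", "base", "small", "distil-large-v3"]
existing_models = ["small", "my-finetuned-whisper", ".locks"]  # hypothetical local folders

wrong_dirs = [".locks"]
available_models = default_models + existing_models
available_models = [m for m in available_models if m not in wrong_dirs]

# sorted(set(xs), key=xs.index) keeps the first occurrence of each name in place.
available_models = sorted(set(available_models), key=available_models.index)
print(available_models)
# ['tiny', 'base', 'small', 'distil-large-v3', 'my-finetuned-whisper']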
modules/whisper/whisper_Inference.py CHANGED
@@ -8,11 +8,11 @@ import os
8
  from argparse import Namespace
9
 
10
  from modules.utils.paths import (WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR, UVR_MODELS_DIR)
11
- from modules.whisper.whisper_base import WhisperBase
12
- from modules.whisper.whisper_parameter import *
13
 
14
 
15
- class WhisperInference(WhisperBase):
16
  def __init__(self,
17
  model_dir: str = WHISPER_MODELS_DIR,
18
  diarization_model_dir: str = DIARIZATION_MODELS_DIR,
@@ -30,7 +30,7 @@ class WhisperInference(WhisperBase):
30
  audio: Union[str, np.ndarray, torch.Tensor],
31
  progress: gr.Progress = gr.Progress(),
32
  *whisper_params,
33
- ) -> Tuple[List[dict], float]:
34
  """
35
  transcribe method for whisper.
36
 
@@ -45,13 +45,13 @@ class WhisperInference(WhisperBase):
45
 
46
  Returns
47
  ----------
48
- segments_result: List[dict]
49
- list of dicts that includes start, end timestamps and transcribed text
50
  elapsed_time: float
51
  elapsed time for transcription
52
  """
53
  start_time = time.time()
54
- params = WhisperParameters.as_value(*whisper_params)
55
 
56
  if params.model_size != self.current_model_size or self.model is None or self.current_compute_type != params.compute_type:
57
  self.update_model(params.model_size, params.compute_type, progress)
@@ -59,21 +59,28 @@ class WhisperInference(WhisperBase):
59
  def progress_callback(progress_value):
60
  progress(progress_value, desc="Transcribing..")
61
 
62
- segments_result = self.model.transcribe(audio=audio,
63
- language=params.lang,
64
- verbose=False,
65
- beam_size=params.beam_size,
66
- logprob_threshold=params.log_prob_threshold,
67
- no_speech_threshold=params.no_speech_threshold,
68
- task="translate" if params.is_translate and self.current_model_size in self.translatable_models else "transcribe",
69
- fp16=True if params.compute_type == "float16" else False,
70
- best_of=params.best_of,
71
- patience=params.patience,
72
- temperature=params.temperature,
73
- compression_ratio_threshold=params.compression_ratio_threshold,
74
- progress_callback=progress_callback,)["segments"]
75
- elapsed_time = time.time() - start_time
 
 
 
 
 
 
76
 
 
77
  return segments_result, elapsed_time
78
 
79
  def update_model(self,
 
8
  from argparse import Namespace
9
 
10
  from modules.utils.paths import (WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR, UVR_MODELS_DIR)
11
+ from modules.whisper.base_transcription_pipeline import BaseTranscriptionPipeline
12
+ from modules.whisper.data_classes import *
13
 
14
 
15
+ class WhisperInference(BaseTranscriptionPipeline):
16
  def __init__(self,
17
  model_dir: str = WHISPER_MODELS_DIR,
18
  diarization_model_dir: str = DIARIZATION_MODELS_DIR,
 
30
  audio: Union[str, np.ndarray, torch.Tensor],
31
  progress: gr.Progress = gr.Progress(),
32
  *whisper_params,
33
+ ) -> Tuple[List[Segment], float]:
34
  """
35
  transcribe method for Whisper.
36
 
 
45
 
46
  Returns
47
  ----------
48
+ segments_result: List[Segment]
49
+ list of Segment that includes start, end timestamps and transcribed text
50
  elapsed_time: float
51
  elapsed time for transcription
52
  """
53
  start_time = time.time()
54
+ params = WhisperParams.from_list(list(whisper_params))
55
 
56
  if params.model_size != self.current_model_size or self.model is None or self.current_compute_type != params.compute_type:
57
  self.update_model(params.model_size, params.compute_type, progress)
 
59
  def progress_callback(progress_value):
60
  progress(progress_value, desc="Transcribing..")
61
 
62
+ result = self.model.transcribe(audio=audio,
63
+ language=params.lang,
64
+ verbose=False,
65
+ beam_size=params.beam_size,
66
+ logprob_threshold=params.log_prob_threshold,
67
+ no_speech_threshold=params.no_speech_threshold,
68
+ task="translate" if params.is_translate and self.current_model_size in self.translatable_models else "transcribe",
69
+ fp16=True if params.compute_type == "float16" else False,
70
+ best_of=params.best_of,
71
+ patience=params.patience,
72
+ temperature=params.temperature,
73
+ compression_ratio_threshold=params.compression_ratio_threshold,
74
+ progress_callback=progress_callback,)["segments"]
75
+ segments_result = []
76
+ for segment in result:
77
+ segments_result.append(Segment(
78
+ start=segment["start"],
79
+ end=segment["end"],
80
+ text=segment["text"]
81
+ ))
82
 
83
+ elapsed_time = time.time() - start_time
84
  return segments_result, elapsed_time
85
 
86
  def update_model(self,
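For context on the new return type above: openai-whisper's transcribe() returns a dict whose "segments" entries are plain dicts with "start", "end" and "text" keys, and the loop above repacks them into the shared Segment data class. Segment itself lives in modules.whisper.data_classes and is not shown in this diff, so the sketch below uses a stand-in with only the three fields the loop relies on:

    from dataclasses import dataclass
    from typing import List

    @dataclass
    class Segment:  # stand-in for modules.whisper.data_classes.Segment (assumed fields only)
        start: float
        end: float
        text: str

    def to_segments(result: dict) -> List[Segment]:
        # result is the dict returned by whisper's model.transcribe(...)
        return [Segment(start=s["start"], end=s["end"], text=s["text"])
                for s in result["segments"]]

    # Example with a fabricated transcription result:
    fake_result = {"segments": [{"start": 0.0, "end": 2.5, "text": " Hello world."}]}
    print(to_segments(fake_result))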
modules/whisper/whisper_factory.py CHANGED
@@ -6,7 +6,8 @@ from modules.utils.paths import (FASTER_WHISPER_MODELS_DIR, DIARIZATION_MODELS_D
6
  from modules.whisper.faster_whisper_inference import FasterWhisperInference
7
  from modules.whisper.whisper_Inference import WhisperInference
8
  from modules.whisper.insanely_fast_whisper_inference import InsanelyFastWhisperInference
9
- from modules.whisper.whisper_base import WhisperBase
 
10
 
11
 
12
  class WhisperFactory:
@@ -19,7 +20,7 @@ class WhisperFactory:
19
  diarization_model_dir: str = DIARIZATION_MODELS_DIR,
20
  uvr_model_dir: str = UVR_MODELS_DIR,
21
  output_dir: str = OUTPUT_DIR,
22
- ) -> "WhisperBase":
23
  """
24
  Create a whisper inference class based on the provided whisper_type.
25
 
@@ -45,36 +46,29 @@ class WhisperFactory:
45
 
46
  Returns
47
  -------
48
- WhisperBase
49
  An instance of the appropriate whisper inference class based on the whisper_type.
50
  """
51
  # Temporal fix of the bug : https://github.com/jhj0517/Whisper-WebUI/issues/144
52
  os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
53
 
54
- whisper_type = whisper_type.lower().strip()
55
 
56
- faster_whisper_typos = ["faster_whisper", "faster-whisper", "fasterwhisper"]
57
- whisper_typos = ["whisper"]
58
- insanely_fast_whisper_typos = [
59
- "insanely_fast_whisper", "insanely-fast-whisper", "insanelyfastwhisper",
60
- "insanely_faster_whisper", "insanely-faster-whisper", "insanelyfasterwhisper"
61
- ]
62
-
63
- if whisper_type in faster_whisper_typos:
64
  return FasterWhisperInference(
65
  model_dir=faster_whisper_model_dir,
66
  output_dir=output_dir,
67
  diarization_model_dir=diarization_model_dir,
68
  uvr_model_dir=uvr_model_dir
69
  )
70
- elif whisper_type in whisper_typos:
71
  return WhisperInference(
72
  model_dir=whisper_model_dir,
73
  output_dir=output_dir,
74
  diarization_model_dir=diarization_model_dir,
75
  uvr_model_dir=uvr_model_dir
76
  )
77
- elif whisper_type in insanely_fast_whisper_typos:
78
  return InsanelyFastWhisperInference(
79
  model_dir=insanely_fast_whisper_model_dir,
80
  output_dir=output_dir,
 
6
  from modules.whisper.faster_whisper_inference import FasterWhisperInference
7
  from modules.whisper.whisper_Inference import WhisperInference
8
  from modules.whisper.insanely_fast_whisper_inference import InsanelyFastWhisperInference
9
+ from modules.whisper.base_transcription_pipeline import BaseTranscriptionPipeline
10
+ from modules.whisper.data_classes import *
11
 
12
 
13
  class WhisperFactory:
 
20
  diarization_model_dir: str = DIARIZATION_MODELS_DIR,
21
  uvr_model_dir: str = UVR_MODELS_DIR,
22
  output_dir: str = OUTPUT_DIR,
23
+ ) -> "BaseTranscriptionPipeline":
24
  """
25
  Create a whisper inference class based on the provided whisper_type.
26
 
 
46
 
47
  Returns
48
  -------
49
+ BaseTranscriptionPipeline
50
  An instance of the appropriate whisper inference class based on the whisper_type.
51
  """
52
  # Temporal fix of the bug : https://github.com/jhj0517/Whisper-WebUI/issues/144
53
  os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
54
 
55
+ whisper_type = whisper_type.strip().lower()
56
 
57
+ if whisper_type == WhisperImpl.FASTER_WHISPER.value:
58
  return FasterWhisperInference(
59
  model_dir=faster_whisper_model_dir,
60
  output_dir=output_dir,
61
  diarization_model_dir=diarization_model_dir,
62
  uvr_model_dir=uvr_model_dir
63
  )
64
+ elif whisper_type == WhisperImpl.WHISPER.value:
65
  return WhisperInference(
66
  model_dir=whisper_model_dir,
67
  output_dir=output_dir,
68
  diarization_model_dir=diarization_model_dir,
69
  uvr_model_dir=uvr_model_dir
70
  )
71
+ elif whisper_type == WhisperImpl.INSANELY_FAST_WHISPER.value:
72
  return InsanelyFastWhisperInference(
73
  model_dir=insanely_fast_whisper_model_dir,
74
  output_dir=output_dir,
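The typo lists above are replaced by comparisons against WhisperImpl enum values. WhisperImpl comes from modules.whisper.data_classes and its exact string values are not visible in this diff, so the values in the sketch below are assumptions for illustration; only the dispatch shape mirrors the new factory code:

    from enum import Enum

    class WhisperImpl(Enum):  # assumed values; the real enum lives in modules.whisper.data_classes
        WHISPER = "whisper"
        FASTER_WHISPER = "faster-whisper"
        INSANELY_FAST_WHISPER = "insanely_fast_whisper"

    def resolve_impl(whisper_type: str) -> WhisperImpl:
        whisper_type = whisper_type.strip().lower()
        for impl in WhisperImpl:
            if whisper_type == impl.value:
                return impl
        # The factory's fallback branch is not shown in this diff, so raise here instead.
        raise ValueError(f"Unknown whisper_type: {whisper_type}")

    print(resolve_impl(" Faster-Whisper "))   # WhisperImpl.FASTER_WHISPER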
modules/whisper/whisper_parameter.py DELETED
@@ -1,369 +0,0 @@
1
- from dataclasses import dataclass, fields
2
- import gradio as gr
3
- from typing import Optional, Dict
4
- import yaml
5
-
6
-
7
- @dataclass
8
- class WhisperParameters:
9
- model_size: gr.Dropdown
10
- lang: gr.Dropdown
11
- is_translate: gr.Checkbox
12
- beam_size: gr.Number
13
- log_prob_threshold: gr.Number
14
- no_speech_threshold: gr.Number
15
- compute_type: gr.Dropdown
16
- best_of: gr.Number
17
- patience: gr.Number
18
- condition_on_previous_text: gr.Checkbox
19
- prompt_reset_on_temperature: gr.Slider
20
- initial_prompt: gr.Textbox
21
- temperature: gr.Slider
22
- compression_ratio_threshold: gr.Number
23
- vad_filter: gr.Checkbox
24
- threshold: gr.Slider
25
- min_speech_duration_ms: gr.Number
26
- max_speech_duration_s: gr.Number
27
- min_silence_duration_ms: gr.Number
28
- speech_pad_ms: gr.Number
29
- batch_size: gr.Number
30
- is_diarize: gr.Checkbox
31
- hf_token: gr.Textbox
32
- diarization_device: gr.Dropdown
33
- length_penalty: gr.Number
34
- repetition_penalty: gr.Number
35
- no_repeat_ngram_size: gr.Number
36
- prefix: gr.Textbox
37
- suppress_blank: gr.Checkbox
38
- suppress_tokens: gr.Textbox
39
- max_initial_timestamp: gr.Number
40
- word_timestamps: gr.Checkbox
41
- prepend_punctuations: gr.Textbox
42
- append_punctuations: gr.Textbox
43
- max_new_tokens: gr.Number
44
- chunk_length: gr.Number
45
- hallucination_silence_threshold: gr.Number
46
- hotwords: gr.Textbox
47
- language_detection_threshold: gr.Number
48
- language_detection_segments: gr.Number
49
- is_bgm_separate: gr.Checkbox
50
- uvr_model_size: gr.Dropdown
51
- uvr_device: gr.Dropdown
52
- uvr_segment_size: gr.Number
53
- uvr_save_file: gr.Checkbox
54
- uvr_enable_offload: gr.Checkbox
55
- """
56
- A data class for Gradio components of the Whisper Parameters. Use "before" Gradio pre-processing.
57
- This data class is used to mitigate the key-value problem between Gradio components and function parameters.
58
- Related Gradio issue: https://github.com/gradio-app/gradio/issues/2471
59
- See more about Gradio pre-processing: https://www.gradio.app/docs/components
60
-
61
- Attributes
62
- ----------
63
- model_size: gr.Dropdown
64
- Whisper model size.
65
-
66
- lang: gr.Dropdown
67
- Source language of the file to transcribe.
68
-
69
- is_translate: gr.Checkbox
70
- Boolean value that determines whether to translate to English.
71
- It's Whisper's feature to translate speech from another language directly into English end-to-end.
72
-
73
- beam_size: gr.Number
74
- Int value that is used for decoding option.
75
-
76
- log_prob_threshold: gr.Number
77
- If the average log probability over sampled tokens is below this value, treat as failed.
78
-
79
- no_speech_threshold: gr.Number
80
- If the no_speech probability is higher than this value AND
81
- the average log probability over sampled tokens is below `log_prob_threshold`,
82
- consider the segment as silent.
83
-
84
- compute_type: gr.Dropdown
85
- compute type for transcription.
86
- see more info : https://opennmt.net/CTranslate2/quantization.html
87
-
88
- best_of: gr.Number
89
- Number of candidates when sampling with non-zero temperature.
90
-
91
- patience: gr.Number
92
- Beam search patience factor.
93
-
94
- condition_on_previous_text: gr.Checkbox
95
- if True, the previous output of the model is provided as a prompt for the next window;
96
- disabling may make the text inconsistent across windows, but the model becomes less prone to
97
- getting stuck in a failure loop, such as repetition looping or timestamps going out of sync.
98
-
99
- initial_prompt: gr.Textbox
100
- Optional text to provide as a prompt for the first window. This can be used to provide, or
101
- "prompt-engineer" a context for transcription, e.g. custom vocabularies or proper nouns
102
- to make it more likely to predict those word correctly.
103
-
104
- temperature: gr.Slider
105
- Temperature for sampling. It can be a tuple of temperatures,
106
- which will be successively used upon failures according to either
107
- `compression_ratio_threshold` or `log_prob_threshold`.
108
-
109
- compression_ratio_threshold: gr.Number
110
- If the gzip compression ratio is above this value, treat as failed
111
-
112
- vad_filter: gr.Checkbox
113
- Enable the voice activity detection (VAD) to filter out parts of the audio
114
- without speech. This step is using the Silero VAD model
115
- https://github.com/snakers4/silero-vad.
116
-
117
- threshold: gr.Slider
118
- This parameter is related with Silero VAD. Speech threshold.
119
- Silero VAD outputs speech probabilities for each audio chunk,
120
- probabilities ABOVE this value are considered as SPEECH. It is better to tune this
121
- parameter for each dataset separately, but "lazy" 0.5 is pretty good for most datasets.
122
-
123
- min_speech_duration_ms: gr.Number
124
- This parameter is related with Silero VAD. Final speech chunks shorter min_speech_duration_ms are thrown out.
125
-
126
- max_speech_duration_s: gr.Number
127
- This parameter is related with Silero VAD. Maximum duration of speech chunks in seconds. Chunks longer
128
- than max_speech_duration_s will be split at the timestamp of the last silence that
129
- lasts more than 100ms (if any), to prevent aggressive cutting. Otherwise, they will be
130
- split aggressively just before max_speech_duration_s.
131
-
132
- min_silence_duration_ms: gr.Number
133
- This parameter is related with Silero VAD. In the end of each speech chunk wait for min_silence_duration_ms
134
- before separating it
135
-
136
- speech_pad_ms: gr.Number
137
- This parameter is related with Silero VAD. Final speech chunks are padded by speech_pad_ms each side
138
-
139
- batch_size: gr.Number
140
- This parameter is related with insanely-fast-whisper pipe. Batch size to pass to the pipe
141
-
142
- is_diarize: gr.Checkbox
143
- This parameter is related with whisperx. Boolean value that determines whether to diarize or not.
144
-
145
- hf_token: gr.Textbox
146
- This parameter is related with whisperx. Huggingface token is needed to download diarization models.
147
- Read more about : https://huggingface.co/pyannote/speaker-diarization-3.1#requirements
148
-
149
- diarization_device: gr.Dropdown
150
- This parameter is related with whisperx. Device to run diarization model
151
-
152
- length_penalty: gr.Number
153
- This parameter is related to faster-whisper. Exponential length penalty constant.
154
-
155
- repetition_penalty: gr.Number
156
- This parameter is related to faster-whisper. Penalty applied to the score of previously generated tokens
157
- (set > 1 to penalize).
158
-
159
- no_repeat_ngram_size: gr.Number
160
- This parameter is related to faster-whisper. Prevent repetitions of n-grams with this size (set 0 to disable).
161
-
162
- prefix: gr.Textbox
163
- This parameter is related to faster-whisper. Optional text to provide as a prefix for the first window.
164
-
165
- suppress_blank: gr.Checkbox
166
- This parameter is related to faster-whisper. Suppress blank outputs at the beginning of the sampling.
167
-
168
- suppress_tokens: gr.Textbox
169
- This parameter is related to faster-whisper. List of token IDs to suppress. -1 will suppress a default set
170
- of symbols as defined in the model config.json file.
171
-
172
- max_initial_timestamp: gr.Number
173
- This parameter is related to faster-whisper. The initial timestamp cannot be later than this.
174
-
175
- word_timestamps: gr.Checkbox
176
- This parameter is related to faster-whisper. Extract word-level timestamps using the cross-attention pattern
177
- and dynamic time warping, and include the timestamps for each word in each segment.
178
-
179
- prepend_punctuations: gr.Textbox
180
- This parameter is related to faster-whisper. If word_timestamps is True, merge these punctuation symbols
181
- with the next word.
182
-
183
- append_punctuations: gr.Textbox
184
- This parameter is related to faster-whisper. If word_timestamps is True, merge these punctuation symbols
185
- with the previous word.
186
-
187
- max_new_tokens: gr.Number
188
- This parameter is related to faster-whisper. Maximum number of new tokens to generate per-chunk. If not set,
189
- the maximum will be set by the default max_length.
190
-
191
- chunk_length: gr.Number
192
- This parameter is related to faster-whisper and insanely-fast-whisper. The length of audio segments in seconds.
193
- If it is not None, it will overwrite the default chunk_length of the FeatureExtractor.
194
-
195
- hallucination_silence_threshold: gr.Number
196
- This parameter is related to faster-whisper. When word_timestamps is True, skip silent periods longer than this threshold
197
- (in seconds) when a possible hallucination is detected.
198
-
199
- hotwords: gr.Textbox
200
- This parameter is related to faster-whisper. Hotwords/hint phrases to provide the model with. Has no effect if prefix is not None.
201
-
202
- language_detection_threshold: gr.Number
203
- This parameter is related to faster-whisper. If the maximum probability of the language tokens is higher than this value, the language is detected.
204
-
205
- language_detection_segments: gr.Number
206
- This parameter is related to faster-whisper. Number of segments to consider for the language detection.
207
-
208
- is_separate_bgm: gr.Checkbox
209
- This parameter is related to UVR. Boolean value that determines whether to separate bgm or not.
210
-
211
- uvr_model_size: gr.Dropdown
212
- This parameter is related to UVR. UVR model size.
213
-
214
- uvr_device: gr.Dropdown
215
- This parameter is related to UVR. Device to run UVR model.
216
-
217
- uvr_segment_size: gr.Number
218
- This parameter is related to UVR. Segment size for UVR model.
219
-
220
- uvr_save_file: gr.Checkbox
221
- This parameter is related to UVR. Boolean value that determines whether to save the file or not.
222
-
223
- uvr_enable_offload: gr.Checkbox
224
- This parameter is related to UVR. Boolean value that determines whether to offload the UVR model or not
225
- after each transcription.
226
- """
227
-
228
- def as_list(self) -> list:
229
- """
230
- Converts the data class attributes into a list, Use in Gradio UI before Gradio pre-processing.
231
- See more about Gradio pre-processing: : https://www.gradio.app/docs/components
232
-
233
- Returns
234
- ----------
235
- A list of Gradio components
236
- """
237
- return [getattr(self, f.name) for f in fields(self)]
238
-
239
- @staticmethod
240
- def as_value(*args) -> 'WhisperValues':
241
- """
242
- To use Whisper parameters in function after Gradio post-processing.
243
- See more about Gradio post-processing: : https://www.gradio.app/docs/components
244
-
245
- Returns
246
- ----------
247
- WhisperValues
248
- Data class that has values of parameters
249
- """
250
- return WhisperValues(*args)
251
-
252
-
253
- @dataclass
254
- class WhisperValues:
255
- model_size: str = "large-v2"
256
- lang: Optional[str] = None
257
- is_translate: bool = False
258
- beam_size: int = 5
259
- log_prob_threshold: float = -1.0
260
- no_speech_threshold: float = 0.6
261
- compute_type: str = "float16"
262
- best_of: int = 5
263
- patience: float = 1.0
264
- condition_on_previous_text: bool = True
265
- prompt_reset_on_temperature: float = 0.5
266
- initial_prompt: Optional[str] = None
267
- temperature: float = 0.0
268
- compression_ratio_threshold: float = 2.4
269
- vad_filter: bool = False
270
- threshold: float = 0.5
271
- min_speech_duration_ms: int = 250
272
- max_speech_duration_s: float = float("inf")
273
- min_silence_duration_ms: int = 2000
274
- speech_pad_ms: int = 400
275
- batch_size: int = 24
276
- is_diarize: bool = False
277
- hf_token: str = ""
278
- diarization_device: str = "cuda"
279
- length_penalty: float = 1.0
280
- repetition_penalty: float = 1.0
281
- no_repeat_ngram_size: int = 0
282
- prefix: Optional[str] = None
283
- suppress_blank: bool = True
284
- suppress_tokens: Optional[str] = "[-1]"
285
- max_initial_timestamp: float = 0.0
286
- word_timestamps: bool = False
287
- prepend_punctuations: Optional[str] = "\"'“¿([{-"
288
- append_punctuations: Optional[str] = "\"'.。,,!!??::”)]}、"
289
- max_new_tokens: Optional[int] = None
290
- chunk_length: Optional[int] = 30
291
- hallucination_silence_threshold: Optional[float] = None
292
- hotwords: Optional[str] = None
293
- language_detection_threshold: Optional[float] = None
294
- language_detection_segments: int = 1
295
- is_bgm_separate: bool = False
296
- uvr_model_size: str = "UVR-MDX-NET-Inst_HQ_4"
297
- uvr_device: str = "cuda"
298
- uvr_segment_size: int = 256
299
- uvr_save_file: bool = False
300
- uvr_enable_offload: bool = True
301
- """
302
- A data class to use Whisper parameters.
303
- """
304
-
305
- def to_yaml(self) -> Dict:
306
- data = {
307
- "whisper": {
308
- "model_size": self.model_size,
309
- "lang": "Automatic Detection" if self.lang is None else self.lang,
310
- "is_translate": self.is_translate,
311
- "beam_size": self.beam_size,
312
- "log_prob_threshold": self.log_prob_threshold,
313
- "no_speech_threshold": self.no_speech_threshold,
314
- "best_of": self.best_of,
315
- "patience": self.patience,
316
- "condition_on_previous_text": self.condition_on_previous_text,
317
- "prompt_reset_on_temperature": self.prompt_reset_on_temperature,
318
- "initial_prompt": None if not self.initial_prompt else self.initial_prompt,
319
- "temperature": self.temperature,
320
- "compression_ratio_threshold": self.compression_ratio_threshold,
321
- "batch_size": self.batch_size,
322
- "length_penalty": self.length_penalty,
323
- "repetition_penalty": self.repetition_penalty,
324
- "no_repeat_ngram_size": self.no_repeat_ngram_size,
325
- "prefix": None if not self.prefix else self.prefix,
326
- "suppress_blank": self.suppress_blank,
327
- "suppress_tokens": self.suppress_tokens,
328
- "max_initial_timestamp": self.max_initial_timestamp,
329
- "word_timestamps": self.word_timestamps,
330
- "prepend_punctuations": self.prepend_punctuations,
331
- "append_punctuations": self.append_punctuations,
332
- "max_new_tokens": self.max_new_tokens,
333
- "chunk_length": self.chunk_length,
334
- "hallucination_silence_threshold": self.hallucination_silence_threshold,
335
- "hotwords": None if not self.hotwords else self.hotwords,
336
- "language_detection_threshold": self.language_detection_threshold,
337
- "language_detection_segments": self.language_detection_segments,
338
- },
339
- "vad": {
340
- "vad_filter": self.vad_filter,
341
- "threshold": self.threshold,
342
- "min_speech_duration_ms": self.min_speech_duration_ms,
343
- "max_speech_duration_s": self.max_speech_duration_s,
344
- "min_silence_duration_ms": self.min_silence_duration_ms,
345
- "speech_pad_ms": self.speech_pad_ms,
346
- },
347
- "diarization": {
348
- "is_diarize": self.is_diarize,
349
- "hf_token": self.hf_token
350
- },
351
- "bgm_separation": {
352
- "is_separate_bgm": self.is_bgm_separate,
353
- "model_size": self.uvr_model_size,
354
- "segment_size": self.uvr_segment_size,
355
- "save_file": self.uvr_save_file,
356
- "enable_offload": self.uvr_enable_offload
357
- },
358
- }
359
- return data
360
-
361
- def as_list(self) -> list:
362
- """
363
- Converts the data class attributes into a list
364
-
365
- Returns
366
- ----------
367
- A list of Whisper parameters
368
- """
369
- return [getattr(self, f.name) for f in fields(self)]
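The deleted WhisperParameters/WhisperValues pair exists because Gradio passes component values to event handlers positionally (see the issue linked in the docstring above, gradio-app/gradio#2471), so the UI builds a flat list of components and the handler rebuilds a typed object from *whisper_params. The new WhisperParams.from_list()/to_list() API in modules.whisper.data_classes keeps the same round-trip; a trimmed-down sketch of the pattern (field names reduced for illustration, not the real parameter set):

    from dataclasses import dataclass, fields

    @dataclass
    class DemoParams:  # illustrative stand-in, not the real WhisperParams
        model_size: str = "large-v2"
        beam_size: int = 5
        vad_filter: bool = False

        def to_list(self) -> list:
            # Ordered values, matching the order of the Gradio inputs list
            return [getattr(self, f.name) for f in fields(self)]

        @classmethod
        def from_list(cls, values: list) -> "DemoParams":
            # Rebuild the typed object from the positional values Gradio hands the handler
            return cls(*values)

    flat = DemoParams(model_size="tiny", vad_filter=True).to_list()
    print(DemoParams.from_list(flat))   # DemoParams(model_size='tiny', beam_size=5, vad_filter=True)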
notebook/whisper-webui.ipynb CHANGED
@@ -54,7 +54,9 @@
54
  "%cd Whisper-WebUI\n",
55
  "!pip install git+https://github.com/jhj0517/jhj0517-whisper.git\n",
56
  "!pip install faster-whisper==1.0.3\n",
57
- "!pip install gradio==4.43.0\n",
 
 
58
  "# Temporal bug fix from https://github.com/jhj0517/Whisper-WebUI/issues/256\n",
59
  "!pip install git+https://github.com/JuanBindez/pytubefix.git\n",
60
  "!pip install tokenizers==0.19.1\n",
 
54
  "%cd Whisper-WebUI\n",
55
  "!pip install git+https://github.com/jhj0517/jhj0517-whisper.git\n",
56
  "!pip install faster-whisper==1.0.3\n",
57
+ "!pip install ctranslate2==4.4.0\n",
58
+ "!pip install gradio\n",
59
+ "!pip install gradio-i18n\n",
60
  "# Temporal bug fix from https://github.com/jhj0517/Whisper-WebUI/issues/256\n",
61
  "!pip install git+https://github.com/JuanBindez/pytubefix.git\n",
62
  "!pip install tokenizers==0.19.1\n",
requirements.txt CHANGED
@@ -2,15 +2,16 @@
2
  # If you're using it, update url to your CUDA version (CUDA 12.1 is minimum requirement):
3
  # For CUDA 12.1, use : https://download.pytorch.org/whl/cu121
4
  # For CUDA 12.4, use : https://download.pytorch.org/whl/cu124
5
- --extra-index-url https://download.pytorch.org/whl/cu121
6
 
7
 
8
- torch==2.3.1
9
- torchaudio==2.3.1
10
  git+https://github.com/jhj0517/jhj0517-whisper.git
11
  faster-whisper==1.0.3
12
  transformers
13
  gradio
 
14
  pytubefix
15
  ruamel.yaml==0.18.6
16
  pyannote.audio==3.3.1
 
2
  # If you're using it, update url to your CUDA version (CUDA 12.1 is minimum requirement):
3
  # For CUDA 12.1, use : https://download.pytorch.org/whl/cu121
4
  # For CUDA 12.4, use : https://download.pytorch.org/whl/cu124
5
+ --extra-index-url https://download.pytorch.org/whl/cu124
6
 
7
 
8
+ torch
9
+ torchaudio
10
  git+https://github.com/jhj0517/jhj0517-whisper.git
11
  faster-whisper==1.0.3
12
  transformers
13
  gradio
14
+ gradio-i18n
15
  pytubefix
16
  ruamel.yaml==0.18.6
17
  pyannote.audio==3.3.1
screenshot.png CHANGED
tests/test_bgm_separation.py CHANGED
@@ -1,6 +1,6 @@
1
  from modules.utils.paths import *
2
  from modules.whisper.whisper_factory import WhisperFactory
3
- from modules.whisper.whisper_parameter import WhisperValues
4
  from test_config import *
5
  from test_transcription import download_file, test_transcribe
6
 
@@ -17,9 +17,9 @@ import os
17
  @pytest.mark.parametrize(
18
  "whisper_type,vad_filter,bgm_separation,diarization",
19
  [
20
- ("whisper", False, True, False),
21
- ("faster-whisper", False, True, False),
22
- ("insanely_fast_whisper", False, True, False)
23
  ]
24
  )
25
  def test_bgm_separation_pipeline(
@@ -38,9 +38,9 @@ def test_bgm_separation_pipeline(
38
  @pytest.mark.parametrize(
39
  "whisper_type,vad_filter,bgm_separation,diarization",
40
  [
41
- ("whisper", True, True, False),
42
- ("faster-whisper", True, True, False),
43
- ("insanely_fast_whisper", True, True, False)
44
  ]
45
  )
46
  def test_bgm_separation_with_vad_pipeline(
 
1
  from modules.utils.paths import *
2
  from modules.whisper.whisper_factory import WhisperFactory
3
+ from modules.whisper.data_classes import *
4
  from test_config import *
5
  from test_transcription import download_file, test_transcribe
6
 
 
17
  @pytest.mark.parametrize(
18
  "whisper_type,vad_filter,bgm_separation,diarization",
19
  [
20
+ (WhisperImpl.WHISPER.value, False, True, False),
21
+ (WhisperImpl.FASTER_WHISPER.value, False, True, False),
22
+ (WhisperImpl.INSANELY_FAST_WHISPER.value, False, True, False)
23
  ]
24
  )
25
  def test_bgm_separation_pipeline(
 
38
  @pytest.mark.parametrize(
39
  "whisper_type,vad_filter,bgm_separation,diarization",
40
  [
41
+ (WhisperImpl.WHISPER.value, True, True, False),
42
+ (WhisperImpl.FASTER_WHISPER.value, True, True, False),
43
+ (WhisperImpl.INSANELY_FAST_WHISPER.value, True, True, False)
44
  ]
45
  )
46
  def test_bgm_separation_with_vad_pipeline(
tests/test_config.py CHANGED
@@ -1,10 +1,14 @@
1
- from modules.utils.paths import *
2
-
3
  import os
4
  import torch
5
 
 
 
 
6
  TEST_FILE_DOWNLOAD_URL = "https://github.com/jhj0517/whisper_flutter_new/raw/main/example/assets/jfk.wav"
7
  TEST_FILE_PATH = os.path.join(WEBUI_DIR, "tests", "jfk.wav")
 
8
  TEST_YOUTUBE_URL = "https://www.youtube.com/watch?v=4WEQtgnBu0I&ab_channel=AndriaFitzer"
9
  TEST_WHISPER_MODEL = "tiny"
10
  TEST_UVR_MODEL = "UVR-MDX-NET-Inst_HQ_4"
@@ -13,5 +17,24 @@ TEST_SUBTITLE_SRT_PATH = os.path.join(WEBUI_DIR, "tests", "test_srt.srt")
13
  TEST_SUBTITLE_VTT_PATH = os.path.join(WEBUI_DIR, "tests", "test_vtt.vtt")
14
 
15
 
 
16
  def is_cuda_available():
17
  return torch.cuda.is_available()
 
1
+ import functools
2
+ import jiwer
3
  import os
4
  import torch
5
 
6
+ from modules.utils.paths import *
7
+ from modules.utils.youtube_manager import *
8
+
9
  TEST_FILE_DOWNLOAD_URL = "https://github.com/jhj0517/whisper_flutter_new/raw/main/example/assets/jfk.wav"
10
  TEST_FILE_PATH = os.path.join(WEBUI_DIR, "tests", "jfk.wav")
11
+ TEST_ANSWER = "And so my fellow Americans ask not what your country can do for you ask what you can do for your country"
12
  TEST_YOUTUBE_URL = "https://www.youtube.com/watch?v=4WEQtgnBu0I&ab_channel=AndriaFitzer"
13
  TEST_WHISPER_MODEL = "tiny"
14
  TEST_UVR_MODEL = "UVR-MDX-NET-Inst_HQ_4"
 
17
  TEST_SUBTITLE_VTT_PATH = os.path.join(WEBUI_DIR, "tests", "test_vtt.vtt")
18
 
19
 
20
+ @functools.lru_cache
21
  def is_cuda_available():
22
  return torch.cuda.is_available()
23
+
24
+
25
+ @functools.lru_cache
26
+ def is_pytube_detected_bot(url: str = TEST_YOUTUBE_URL):
27
+ try:
28
+ yt_temp_path = os.path.join("modules", "yt_tmp.wav")
29
+ if os.path.exists(yt_temp_path):
30
+ return False
31
+ yt = get_ytdata(url)
32
+ audio = get_ytaudio(yt)
33
+ return False
34
+ except Exception as e:
35
+ print(f"Pytube has detected as a bot: {e}")
36
+ return True
37
+
38
+
39
+ def calculate_wer(answer, prediction):
40
+ return jiwer.wer(answer, prediction)
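calculate_wer() above wraps jiwer.wer(reference, hypothesis), which returns the word error rate as a float; the transcription tests strip commas and periods from the prediction before comparing it against TEST_ANSWER. A minimal sketch of that check:

    import jiwer

    answer = ("And so my fellow Americans ask not what your country can do for you "
              "ask what you can do for your country")
    prediction = ("And so, my fellow Americans, ask not what your country can do for you, "
                  "ask what you can do for your country.")

    # Mirror the tests: drop commas and periods before measuring WER.
    normalized = prediction.strip().replace(",", "").replace(".", "")
    assert jiwer.wer(answer, normalized) < 0.1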
tests/test_diarization.py CHANGED
@@ -1,6 +1,6 @@
1
  from modules.utils.paths import *
2
  from modules.whisper.whisper_factory import WhisperFactory
3
- from modules.whisper.whisper_parameter import WhisperValues
4
  from test_config import *
5
  from test_transcription import download_file, test_transcribe
6
 
@@ -16,9 +16,9 @@ import os
16
  @pytest.mark.parametrize(
17
  "whisper_type,vad_filter,bgm_separation,diarization",
18
  [
19
- ("whisper", False, False, True),
20
- ("faster-whisper", False, False, True),
21
- ("insanely_fast_whisper", False, False, True)
22
  ]
23
  )
24
  def test_diarization_pipeline(
 
1
  from modules.utils.paths import *
2
  from modules.whisper.whisper_factory import WhisperFactory
3
+ from modules.whisper.data_classes import *
4
  from test_config import *
5
  from test_transcription import download_file, test_transcribe
6
 
 
16
  @pytest.mark.parametrize(
17
  "whisper_type,vad_filter,bgm_separation,diarization",
18
  [
19
+ (WhisperImpl.WHISPER.value, False, False, True),
20
+ (WhisperImpl.FASTER_WHISPER.value, False, False, True),
21
+ (WhisperImpl.INSANELY_FAST_WHISPER.value, False, False, True)
22
  ]
23
  )
24
  def test_diarization_pipeline(
tests/test_transcription.py CHANGED
@@ -1,5 +1,6 @@
1
  from modules.whisper.whisper_factory import WhisperFactory
2
- from modules.whisper.whisper_parameter import WhisperValues
 
3
  from modules.utils.paths import WEBUI_DIR
4
  from test_config import *
5
 
@@ -12,9 +13,9 @@ import os
12
  @pytest.mark.parametrize(
13
  "whisper_type,vad_filter,bgm_separation,diarization",
14
  [
15
- ("whisper", False, False, False),
16
- ("faster-whisper", False, False, False),
17
- ("insanely_fast_whisper", False, False, False)
18
  ]
19
  )
20
  def test_transcribe(
@@ -28,6 +29,10 @@ def test_transcribe(
28
  if not os.path.exists(audio_path):
29
  download_file(TEST_FILE_DOWNLOAD_URL, audio_path_dir)
30
 
 
 
 
 
31
  whisper_inferencer = WhisperFactory.create_whisper_inference(
32
  whisper_type=whisper_type,
33
  )
@@ -37,16 +42,24 @@ def test_transcribe(
37
  f"""Diarization Device: {whisper_inferencer.diarizer.device}"""
38
  )
39
 
40
- hparams = WhisperValues(
41
- model_size=TEST_WHISPER_MODEL,
42
- vad_filter=vad_filter,
43
- is_bgm_separate=bgm_separation,
44
- compute_type=whisper_inferencer.current_compute_type,
45
- uvr_enable_offload=True,
46
- is_diarize=diarization,
47
- ).as_list()
48
-
49
- subtitle_str, file_path = whisper_inferencer.transcribe_file(
50
  [audio_path],
51
  None,
52
  "SRT",
@@ -54,29 +67,29 @@ def test_transcribe(
54
  gr.Progress(),
55
  *hparams,
56
  )
57
-
58
- assert isinstance(subtitle_str, str) and subtitle_str
59
- assert isinstance(file_path[0], str) and file_path
60
-
61
- whisper_inferencer.transcribe_youtube(
62
- TEST_YOUTUBE_URL,
63
- "SRT",
64
- False,
65
- gr.Progress(),
66
- *hparams,
67
- )
68
- assert isinstance(subtitle_str, str) and subtitle_str
69
- assert isinstance(file_path[0], str) and file_path
70
-
71
- whisper_inferencer.transcribe_mic(
72
  audio_path,
73
  "SRT",
74
  False,
75
  gr.Progress(),
76
  *hparams,
77
  )
78
- assert isinstance(subtitle_str, str) and subtitle_str
79
- assert isinstance(file_path[0], str) and file_path
80
 
81
 
82
  def download_file(url, save_dir):
 
1
  from modules.whisper.whisper_factory import WhisperFactory
2
+ from modules.whisper.data_classes import *
3
+ from modules.utils.subtitle_manager import read_file
4
  from modules.utils.paths import WEBUI_DIR
5
  from test_config import *
6
 
 
13
  @pytest.mark.parametrize(
14
  "whisper_type,vad_filter,bgm_separation,diarization",
15
  [
16
+ (WhisperImpl.WHISPER.value, False, False, False),
17
+ (WhisperImpl.FASTER_WHISPER.value, False, False, False),
18
+ (WhisperImpl.INSANELY_FAST_WHISPER.value, False, False, False)
19
  ]
20
  )
21
  def test_transcribe(
 
29
  if not os.path.exists(audio_path):
30
  download_file(TEST_FILE_DOWNLOAD_URL, audio_path_dir)
31
 
32
+ answer = TEST_ANSWER
33
+ if diarization:
34
+ answer = "SPEAKER_00|"+TEST_ANSWER
35
+
36
  whisper_inferencer = WhisperFactory.create_whisper_inference(
37
  whisper_type=whisper_type,
38
  )
 
42
  f"""Diarization Device: {whisper_inferencer.diarizer.device}"""
43
  )
44
 
45
+ hparams = TranscriptionPipelineParams(
46
+ whisper=WhisperParams(
47
+ model_size=TEST_WHISPER_MODEL,
48
+ compute_type=whisper_inferencer.current_compute_type
49
+ ),
50
+ vad=VadParams(
51
+ vad_filter=vad_filter
52
+ ),
53
+ bgm_separation=BGMSeparationParams(
54
+ is_separate_bgm=bgm_separation,
55
+ enable_offload=True
56
+ ),
57
+ diarization=DiarizationParams(
58
+ is_diarize=diarization
59
+ ),
60
+ ).to_list()
61
+
62
+ subtitle_str, file_paths = whisper_inferencer.transcribe_file(
63
  [audio_path],
64
  None,
65
  "SRT",
 
67
  gr.Progress(),
68
  *hparams,
69
  )
70
+ subtitle = read_file(file_paths[0]).split("\n")
71
+ assert calculate_wer(answer, subtitle[2].strip().replace(",", "").replace(".", "")) < 0.1
72
+
73
+ if not is_pytube_detected_bot():
74
+ subtitle_str, file_path = whisper_inferencer.transcribe_youtube(
75
+ TEST_YOUTUBE_URL,
76
+ "SRT",
77
+ False,
78
+ gr.Progress(),
79
+ *hparams,
80
+ )
81
+ assert isinstance(subtitle_str, str) and subtitle_str
82
+ assert os.path.exists(file_path)
83
+
84
+ subtitle_str, file_path = whisper_inferencer.transcribe_mic(
85
  audio_path,
86
  "SRT",
87
  False,
88
  gr.Progress(),
89
  *hparams,
90
  )
91
+ subtitle = read_file(file_path).split("\n")
92
+ assert calculate_wer(answer, subtitle[2].strip().replace(",", "").replace(".", "")) < 0.1
93
 
94
 
95
  def download_file(url, save_dir):
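Why the assertions above read subtitle[2]: each SRT caption block starts with a numeric index line, then the timestamp line, then the caption text, so after splitting the file contents on newlines the first caption's text is at index 2. read_file() from modules.utils.subtitle_manager is not shown in this diff and is assumed here to simply return the file contents as a string; a plain-Python sketch of the same lookup:

    # Layout of the first SRT block:
    #   1
    #   00:00:00,000 --> 00:00:11,000
    #   <caption text>
    srt_content = "1\n00:00:00,000 --> 00:00:11,000\nAnd so my fellow Americans, ask not...\n"
    lines = srt_content.split("\n")
    first_caption_text = lines[2].strip()
    print(first_caption_text)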
tests/test_translation.py CHANGED
@@ -28,6 +28,10 @@ def test_nllb_inference(
28
  assert isinstance(file_paths[0], str)
29
 
30
 
 
 
 
 
31
  @pytest.mark.parametrize("file_path", [
32
  TEST_SUBTITLE_SRT_PATH,
33
  TEST_SUBTITLE_VTT_PATH,
 
28
  assert isinstance(file_paths[0], str)
29
 
30
 
31
+ @pytest.mark.skipif(
32
+ os.getenv("DEEPL_API_KEY") is None or not os.getenv("DEEPL_API_KEY"),
33
+ reason="DeepL API key is unavailable"
34
+ )
35
  @pytest.mark.parametrize("file_path", [
36
  TEST_SUBTITLE_SRT_PATH,
37
  TEST_SUBTITLE_VTT_PATH,
tests/test_vad.py CHANGED
@@ -1,6 +1,6 @@
1
  from modules.utils.paths import *
2
  from modules.whisper.whisper_factory import WhisperFactory
3
- from modules.whisper.whisper_parameter import WhisperValues
4
  from test_config import *
5
  from test_transcription import download_file, test_transcribe
6
 
@@ -12,9 +12,9 @@ import os
12
  @pytest.mark.parametrize(
13
  "whisper_type,vad_filter,bgm_separation,diarization",
14
  [
15
- ("whisper", True, False, False),
16
- ("faster-whisper", True, False, False),
17
- ("insanely_fast_whisper", True, False, False)
18
  ]
19
  )
20
  def test_vad_pipeline(
 
1
  from modules.utils.paths import *
2
  from modules.whisper.whisper_factory import WhisperFactory
3
+ from modules.whisper.data_classes import *
4
  from test_config import *
5
  from test_transcription import download_file, test_transcribe
6
 
 
12
  @pytest.mark.parametrize(
13
  "whisper_type,vad_filter,bgm_separation,diarization",
14
  [
15
+ (WhisperImpl.WHISPER.value, True, False, False),
16
+ (WhisperImpl.FASTER_WHISPER.value, True, False, False),
17
+ (WhisperImpl.INSANELY_FAST_WHISPER.value, True, False, False)
18
  ]
19
  )
20
  def test_vad_pipeline(