jhj0517 committed
Commit 451ca33 · 2 Parent(s): 602c60a 20f48aa

Merge master

.github/ISSUE_TEMPLATE/bug_report.md CHANGED
@@ -3,7 +3,7 @@ name: Bug report
 about: Create a report to help us improve
 title: ''
 labels: bug
-assignees: ''
+assignees: jhj0517
 
 ---
 
.github/ISSUE_TEMPLATE/feature_request.md CHANGED
@@ -3,7 +3,7 @@ name: Feature request
 about: Any feature you want
 title: ''
 labels: enhancement
-assignees: ''
+assignees: jhj0517
 
 ---
 
.github/ISSUE_TEMPLATE/hallucination.md ADDED
@@ -0,0 +1,12 @@
+---
+name: Hallucination
+about: Whisper hallucinations. ( Repeating certain words or subtitles starting too
+  early, etc. )
+title: ''
+labels: hallucination
+assignees: jhj0517
+
+---
+
+**Download URL for sample audio**
+- Please upload download URL for sample audio file so I can test with some settings for better result. You can use https://easyupload.io/ or any other service to share.
.github/pull_request_template.md ADDED
@@ -0,0 +1,5 @@
+## Related issues
+- #0
+
+## Changed
+1. Changes
.github/workflows/{shell-scrpit-test.yml → ci-shell.yml} RENAMED
@@ -1,38 +1,42 @@
-name: Shell Script Test
+name: CI-Shell Script
 
 on:
+  workflow_dispatch:
+
   push:
-    branches: ["feature/shell-script"]
-
-env:
-  PYTHON_VERSION: '3.9'
+    branches:
+      - master
+  pull_request:
+    branches:
+      - master
 
 jobs:
   test-shell-script:
+
     runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python: [ "3.10" ]
+
     steps:
-    - name: 'Checkout GitHub Action'
-      uses: actions/checkout@v3
+    - name: Clean up space for action
+      run: rm -rf /opt/hostedtoolcache
 
-    - name: Setup Python ${{ env.PYTHON_VERSION }} Environment
-      uses: actions/setup-python@v4
+    - uses: actions/checkout@v4
+    - name: Setup Python
+      uses: actions/setup-python@v5
       with:
-        python-version: ${{ env.PYTHON_VERSION }}
+        python-version: ${{ matrix.python }}
 
-    - name: 'Setup FFmpeg'
-      uses: FedericoCarboni/setup-ffmpeg@v3
-      id: setup-ffmpeg
-      with:
-        ffmpeg-version: release
-        architecture: 'arm64'
-        linking-type: static
+    - name: Install git and ffmpeg
+      run: sudo apt-get update && sudo apt-get install -y git ffmpeg
 
-    - name: 'Execute Install.sh'
+    - name: Execute Install.sh
      run: |
        chmod +x ./Install.sh
        ./Install.sh
 
-    - name: 'Execute start-webui.sh'
+    - name: Execute start-webui.sh
      run: |
        chmod +x ./start-webui.sh
        timeout 60s ./start-webui.sh || true
.github/workflows/ci.yml ADDED
@@ -0,0 +1,41 @@
+name: CI
+
+on:
+  workflow_dispatch:
+
+  push:
+    branches:
+      - master
+  pull_request:
+    branches:
+      - master
+
+jobs:
+  build:
+
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python: ["3.10"]
+
+    env:
+      DEEPL_API_KEY: ${{ secrets.DEEPL_API_KEY }}
+
+    steps:
+    - name: Clean up space for action
+      run: rm -rf /opt/hostedtoolcache
+
+    - uses: actions/checkout@v4
+    - name: Setup Python
+      uses: actions/setup-python@v5
+      with:
+        python-version: ${{ matrix.python }}
+
+    - name: Install git and ffmpeg
+      run: sudo apt-get update && sudo apt-get install -y git ffmpeg
+
+    - name: Install dependencies
+      run: pip install -r requirements.txt pytest
+
+    - name: Run test
+      run: python -m pytest -rs tests
.gitignore CHANGED
@@ -2,6 +2,8 @@
 *.png
 *.mp4
 *.mp3
+.idea/
+.pytest_cache/
 venv/
 modules/ui/__pycache__/
 outputs/
app.py CHANGED
@@ -21,7 +21,7 @@ from modules.whisper.whisper_parameter import *
 class App:
     def __init__(self, args):
         self.args = args
-        self.app = gr.Blocks(css=CSS, theme=self.args.theme)
+        self.app = gr.Blocks(css=CSS, theme=self.args.theme, delete_cache=(60, 3600))
         self.whisper_inf = WhisperFactory.create_whisper_inference(
             whisper_type=self.args.whisper_type,
             whisper_model_dir=self.args.whisper_model_dir,
@@ -59,6 +59,7 @@ class App:
             with gr.Row():
                 cb_timestamp = gr.Checkbox(value=whisper_params["add_timestamp"], label="Add a timestamp to the end of the filename",
                                            interactive=True)
+
             with gr.Accordion("Advanced Parameters", open=False):
                 nb_beam_size = gr.Number(label="Beam Size", value=whisper_params["beam_size"], precision=0, interactive=True,
                                          info="Beam size to use for decoding.")
@@ -68,6 +69,7 @@ class App:
                                                   info="If the no speech probability is higher than this value AND the average log probability over sampled tokens is below 'Log Prob Threshold', consider the segment as silent.")
                 dd_compute_type = gr.Dropdown(label="Compute Type", choices=self.whisper_inf.available_compute_types,
                                               value=self.whisper_inf.current_compute_type, interactive=True,
+                                              allow_custom_value=True,
                                               info="Select the type of computation to perform.")
                 nb_best_of = gr.Number(label="Best Of", value=whisper_params["best_of"], interactive=True,
                                        info="Number of candidates when sampling with non-zero temperature.")
@@ -88,6 +90,9 @@ class App:
                 nb_compression_ratio_threshold = gr.Number(label="Compression Ratio Threshold", value=whisper_params["compression_ratio_threshold"],
                                                            interactive=True,
                                                            info="If the gzip compression ratio is above this value, treat as failed.")
+                nb_chunk_length = gr.Number(label="Chunk Length (s)", value=lambda: whisper_params["chunk_length"],
+                                            precision=0,
+                                            info="The length of audio segments. If it is not None, it will overwrite the default chunk_length of the FeatureExtractor.")
                 with gr.Group(visible=isinstance(self.whisper_inf, FasterWhisperInference)):
                     nb_length_penalty = gr.Number(label="Length Penalty", value=whisper_params["length_penalty"],
                                                   info="Exponential length penalty constant.")
@@ -113,9 +118,6 @@ class App:
                     nb_max_new_tokens = gr.Number(label="Max New Tokens", value=lambda: whisper_params["max_new_tokens"],
                                                   precision=0,
                                                   info="Maximum number of new tokens to generate per-chunk. If not set, the maximum will be set by the default max_length.")
-                    nb_chunk_length = gr.Number(label="Chunk Length", value=lambda: whisper_params["chunk_length"],
-                                                precision=0,
-                                                info="The length of audio segments. If it is not None, it will overwrite the default chunk_length of the FeatureExtractor.")
                     nb_hallucination_silence_threshold = gr.Number(label="Hallucination Silence Threshold (sec)",
                                                                    value=lambda: whisper_params["hallucination_silence_threshold"],
                                                                    info="When 'Word Timestamps' is True, skip silent periods longer than this threshold (in seconds) when a possible hallucination is detected.")
@@ -127,32 +129,37 @@ class App:
                                                                 precision=0,
                                                                 info="Number of segments to consider for the language detection.")
                 with gr.Group(visible=isinstance(self.whisper_inf, InsanelyFastWhisperInference)):
-                    nb_chunk_length_s = gr.Number(label="Chunk Lengths (sec)", value=whisper_params["chunk_length_s"],
-                                                  precision=0)
                     nb_batch_size = gr.Number(label="Batch Size", value=whisper_params["batch_size"], precision=0)
 
-            with gr.Accordion("BGM Separation", open=False):
-                cb_bgm_separation = gr.Checkbox(label="Enable BGM Separation Filter", value=uvr_params["is_separate_bgm"],
-                                                interactive=True)
+            with gr.Accordion("Background Music Remover Filter", open=False):
+                cb_bgm_separation = gr.Checkbox(label="Enable Background Music Remover Filter", value=uvr_params["is_separate_bgm"],
+                                                interactive=True,
+                                                info="Enabling this will remove background music by submodel before"
+                                                     " transcribing ")
                 dd_uvr_device = gr.Dropdown(label="Device", value=self.whisper_inf.music_separator.device,
                                             choices=self.whisper_inf.music_separator.available_devices)
                 dd_uvr_model_size = gr.Dropdown(label="Model", value=uvr_params["model_size"],
                                                 choices=self.whisper_inf.music_separator.available_models)
                 nb_uvr_segment_size = gr.Number(label="Segment Size", value=uvr_params["segment_size"], precision=0)
                 cb_uvr_save_file = gr.Checkbox(label="Save separated files to output", value=uvr_params["save_file"])
+                cb_uvr_enable_offload = gr.Checkbox(label="Offload sub model after removing background music",
+                                                    value=uvr_params["enable_offload"])
 
-            with gr.Accordion("VAD", open=False):
+            with gr.Accordion("Voice Detection Filter", open=False):
                 cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=vad_params["vad_filter"],
-                                            interactive=True)
-                sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=vad_params["threshold"],
+                                            interactive=True,
+                                            info="Enable this to transcribe only detected voice parts by submodel.")
+                sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold",
+                                         value=vad_params["threshold"],
                                          info="Lower it to be more sensitive to small sounds.")
-                nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=vad_params["min_speech_duration_ms"],
+                nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0,
+                                                      value=vad_params["min_speech_duration_ms"],
                                                       info="Final speech chunks shorter than this time are thrown out")
-                nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=vad_params["max_speech_duration_s"],
-                                                     info="Maximum duration of speech chunks in \"seconds\". Chunks longer"
-                                                          " than this time will be split at the timestamp of the last silence that"
-                                                          " lasts more than 100ms (if any), to prevent aggressive cutting.")
-                nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=vad_params["min_silence_duration_ms"],
+                nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)",
+                                                     value=vad_params["max_speech_duration_s"],
+                                                     info="Maximum duration of speech chunks in \"seconds\".")
+                nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0,
+                                                       value=vad_params["min_silence_duration_ms"],
                                                        info="In the end of each speech chunk wait for this time"
                                                             " before separating it")
                 nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=vad_params["speech_pad_ms"],
@@ -161,7 +168,10 @@ class App:
             with gr.Accordion("Diarization", open=False):
                 cb_diarize = gr.Checkbox(label="Enable Diarization", value=diarization_params["is_diarize"])
                 tb_hf_token = gr.Text(label="HuggingFace Token", value=diarization_params["hf_token"],
-                                      info="This is only needed the first time you download the model. If you already have models, you don't need to enter. To download the model, you must manually go to \"https://huggingface.co/pyannote/speaker-diarization-3.1\" and agree to their requirement.")
+                                      info="This is only needed the first time you download the model. If you already have"
+                                           " models, you don't need to enter. To download the model, you must manually go "
+                                           "to \"https://huggingface.co/pyannote/speaker-diarization-3.1\" and agree to"
+                                           " their requirement.")
                 dd_diarization_device = gr.Dropdown(label="Device",
                                                     choices=self.whisper_inf.diarizer.get_available_device(),
                                                     value=self.whisper_inf.diarizer.get_device())
@@ -177,19 +187,19 @@ class App:
                     temperature=sd_temperature, compression_ratio_threshold=nb_compression_ratio_threshold,
                     vad_filter=cb_vad_filter, threshold=sd_threshold, min_speech_duration_ms=nb_min_speech_duration_ms,
                     max_speech_duration_s=nb_max_speech_duration_s, min_silence_duration_ms=nb_min_silence_duration_ms,
-                    speech_pad_ms=nb_speech_pad_ms, chunk_length_s=nb_chunk_length_s, batch_size=nb_batch_size,
+                    speech_pad_ms=nb_speech_pad_ms, chunk_length=nb_chunk_length, batch_size=nb_batch_size,
                    is_diarize=cb_diarize, hf_token=tb_hf_token, diarization_device=dd_diarization_device,
                    length_penalty=nb_length_penalty, repetition_penalty=nb_repetition_penalty,
                    no_repeat_ngram_size=nb_no_repeat_ngram_size, prefix=tb_prefix, suppress_blank=cb_suppress_blank,
                    suppress_tokens=tb_suppress_tokens, max_initial_timestamp=nb_max_initial_timestamp,
                    word_timestamps=cb_word_timestamps, prepend_punctuations=tb_prepend_punctuations,
-                    append_punctuations=tb_append_punctuations, max_new_tokens=nb_max_new_tokens, chunk_length=nb_chunk_length,
+                    append_punctuations=tb_append_punctuations, max_new_tokens=nb_max_new_tokens,
                    hallucination_silence_threshold=nb_hallucination_silence_threshold, hotwords=tb_hotwords,
                    language_detection_threshold=nb_language_detection_threshold,
                    language_detection_segments=nb_language_detection_segments,
                    prompt_reset_on_temperature=sld_prompt_reset_on_temperature, is_bgm_separate=cb_bgm_separation,
                    uvr_device=dd_uvr_device, uvr_model_size=dd_uvr_model_size, uvr_segment_size=nb_uvr_segment_size,
-                    uvr_save_file=cb_uvr_save_file
+                    uvr_save_file=cb_uvr_save_file, uvr_enable_offload=cb_uvr_enable_offload
                ),
                dd_file_format,
                cb_timestamp
configs/default_parameters.yaml CHANGED
@@ -12,7 +12,7 @@ whisper:
   initial_prompt: null
   temperature: 0
   compression_ratio_threshold: 2.4
-  chunk_length_s: 30
+  chunk_length: 30
   batch_size: 24
   length_penalty: 1
   repetition_penalty: 1
@@ -25,7 +25,6 @@ whisper:
   prepend_punctuations: "\"'“¿([{-"
   append_punctuations: "\"'.。,,!!??::”)]}、"
   max_new_tokens: null
-  chunk_length: null
   hallucination_silence_threshold: null
   hotwords: null
   language_detection_threshold: null
@@ -37,8 +36,8 @@ vad:
   threshold: 0.5
   min_speech_duration_ms: 250
   max_speech_duration_s: 9999
-  min_silence_duration_ms: 2000
-  speech_pad_ms: 400
+  min_silence_duration_ms: 1000
+  speech_pad_ms: 2000
 
 diarization:
   is_diarize: false
@@ -49,6 +48,7 @@ bgm_separation:
   model_size: "UVR-MDX-NET-Inst_HQ_4"
   segment_size: 256
   save_file: false
+  enable_offload: true
 
 translation:
   deepl:
modules/translation/deepl_api.py CHANGED
@@ -98,8 +98,8 @@ class DeepLAPI:
                         fileobjs: list,
                         source_lang: str,
                         target_lang: str,
-                        is_pro: bool,
-                        add_timestamp: bool,
+                        is_pro: bool = False,
+                        add_timestamp: bool = True,
                         progress=gr.Progress()) -> list:
         """
         Translate subtitle files using DeepL API
@@ -126,6 +126,9 @@
             String to return to gr.Textbox()
             Files to return to gr.Files()
         """
+        if fileobjs and isinstance(fileobjs[0], gr.utils.NamedString):
+            fileobjs = [fileobj.name for fileobj in fileobjs]
+
         self.cache_parameters(
             api_key=auth_key,
             is_pro=is_pro,
@@ -136,37 +139,28 @@
 
         files_info = {}
         for fileobj in fileobjs:
-            file_path = fileobj.name
-            file_name, file_ext = os.path.splitext(os.path.basename(fileobj.name))
+            file_path = fileobj
+            file_name, file_ext = os.path.splitext(os.path.basename(fileobj))
 
             if file_ext == ".srt":
                 parsed_dicts = parse_srt(file_path=file_path)
 
-                batch_size = self.max_text_batch_size
-                for batch_start in range(0, len(parsed_dicts), batch_size):
-                    batch_end = min(batch_start + batch_size, len(parsed_dicts))
-                    sentences_to_translate = [dic["sentence"] for dic in parsed_dicts[batch_start:batch_end]]
-                    translated_texts = self.request_deepl_translate(auth_key, sentences_to_translate, source_lang,
-                                                                    target_lang, is_pro)
-                    for i, translated_text in enumerate(translated_texts):
-                        parsed_dicts[batch_start + i]["sentence"] = translated_text["text"]
-                    progress(batch_end / len(parsed_dicts), desc="Translating..")
-
-                subtitle = get_serialized_srt(parsed_dicts)
-
             elif file_ext == ".vtt":
                 parsed_dicts = parse_vtt(file_path=file_path)
 
-                batch_size = self.max_text_batch_size
-                for batch_start in range(0, len(parsed_dicts), batch_size):
-                    batch_end = min(batch_start + batch_size, len(parsed_dicts))
-                    sentences_to_translate = [dic["sentence"] for dic in parsed_dicts[batch_start:batch_end]]
-                    translated_texts = self.request_deepl_translate(auth_key, sentences_to_translate, source_lang,
-                                                                    target_lang, is_pro)
-                    for i, translated_text in enumerate(translated_texts):
-                        parsed_dicts[batch_start + i]["sentence"] = translated_text["text"]
-                    progress(batch_end / len(parsed_dicts), desc="Translating..")
+            batch_size = self.max_text_batch_size
+            for batch_start in range(0, len(parsed_dicts), batch_size):
+                batch_end = min(batch_start + batch_size, len(parsed_dicts))
+                sentences_to_translate = [dic["sentence"] for dic in parsed_dicts[batch_start:batch_end]]
+                translated_texts = self.request_deepl_translate(auth_key, sentences_to_translate, source_lang,
+                                                                target_lang, is_pro)
+                for i, translated_text in enumerate(translated_texts):
+                    parsed_dicts[batch_start + i]["sentence"] = translated_text["text"]
+                progress(batch_end / len(parsed_dicts), desc="Translating..")
 
+            if file_ext == ".srt":
+                subtitle = get_serialized_srt(parsed_dicts)
+            elif file_ext == ".vtt":
                 subtitle = get_serialized_vtt(parsed_dicts)
 
             if add_timestamp:
@@ -193,8 +187,14 @@
                               text: list,
                               source_lang: str,
                               target_lang: str,
-                              is_pro: bool):
+                              is_pro: bool = False):
         """Request API response to DeepL server"""
+        if source_lang not in list(DEEPL_AVAILABLE_SOURCE_LANGS.keys()):
+            raise ValueError(f"Source language {source_lang} is not supported."
+                             f"Use one of {list(DEEPL_AVAILABLE_SOURCE_LANGS.keys())}")
+        if target_lang not in list(DEEPL_AVAILABLE_TARGET_LANGS.keys()):
+            raise ValueError(f"Target language {target_lang} is not supported."
+                             f"Use one of {list(DEEPL_AVAILABLE_TARGET_LANGS.keys())}")
 
        url = 'https://api.deepl.com/v2/translate' if is_pro else 'https://api-free.deepl.com/v2/translate'
        headers = {
modules/translation/nllb_inference.py CHANGED
@@ -38,8 +38,19 @@ class NLLBInference(TranslationBase):
                      model_size: str,
                      src_lang: str,
                      tgt_lang: str,
-                     progress: gr.Progress
+                     progress: gr.Progress = gr.Progress()
                      ):
+        def validate_language(lang: str) -> str:
+            if lang in NLLB_AVAILABLE_LANGS:
+                return NLLB_AVAILABLE_LANGS[lang]
+            elif lang not in NLLB_AVAILABLE_LANGS.values():
+                raise ValueError(
+                    f"Language '{lang}' is not supported. Use one of: {list(NLLB_AVAILABLE_LANGS.keys())}")
+            return lang
+
+        src_lang = validate_language(src_lang)
+        tgt_lang = validate_language(tgt_lang)
+
         if model_size != self.current_model_size or self.model is None:
             print("\nInitializing NLLB Model..\n")
             progress(0, desc="Initializing NLLB Model..")
@@ -51,8 +62,7 @@
             self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_size,
                                                            cache_dir=os.path.join(self.model_dir, "tokenizers"),
                                                            local_files_only=local_files_only)
-            src_lang = NLLB_AVAILABLE_LANGS[src_lang]
-            tgt_lang = NLLB_AVAILABLE_LANGS[tgt_lang]
+
             self.pipeline = pipeline("translation",
                                      model=self.model,
                                      tokenizer=self.tokenizer,
modules/translation/translation_base.py CHANGED
@@ -40,7 +40,7 @@ class TranslationBase(ABC):
                      model_size: str,
                      src_lang: str,
                      tgt_lang: str,
-                     progress: gr.Progress
+                     progress: gr.Progress = gr.Progress()
                      ):
         pass
 
@@ -50,8 +50,8 @@
                       model_size: str,
                       src_lang: str,
                       tgt_lang: str,
-                      max_length: int,
-                      add_timestamp: bool,
+                      max_length: int = 200,
+                      add_timestamp: bool = True,
                       progress=gr.Progress()) -> list:
         """
         Translate subtitle file from source language to target language
@@ -81,6 +81,9 @@
             Files to return to gr.Files()
         """
         try:
+            if fileobjs and isinstance(fileobjs[0], gr.utils.NamedString):
+                fileobjs = [file.name for file in fileobjs]
+
             self.cache_parameters(model_size=model_size,
                                   src_lang=src_lang,
                                   tgt_lang=tgt_lang,
@@ -94,10 +97,9 @@
 
             files_info = {}
             for fileobj in fileobjs:
-                file_path = fileobj.name
-                file_name, file_ext = os.path.splitext(os.path.basename(fileobj.name))
+                file_name, file_ext = os.path.splitext(os.path.basename(fileobj))
                 if file_ext == ".srt":
-                    parsed_dicts = parse_srt(file_path=file_path)
+                    parsed_dicts = parse_srt(file_path=fileobj)
                     total_progress = len(parsed_dicts)
                     for index, dic in enumerate(parsed_dicts):
                         progress(index / total_progress, desc="Translating..")
@@ -106,7 +108,7 @@
                     subtitle = get_serialized_srt(parsed_dicts)
 
                 elif file_ext == ".vtt":
-                    parsed_dicts = parse_vtt(file_path=file_path)
+                    parsed_dicts = parse_vtt(file_path=fileobj)
                     total_progress = len(parsed_dicts)
                     for index, dic in enumerate(parsed_dicts):
                         progress(index / total_progress, desc="Translating..")
modules/utils/subtitle_manager.py CHANGED
@@ -121,11 +121,8 @@ def get_serialized_vtt(dicts):
 
 @spaces.GPU(duration=120)
 def safe_filename(name):
-    from app import _args
     INVALID_FILENAME_CHARS = r'[<>:"/\\|?*\x00-\x1f]'
     safe_name = re.sub(INVALID_FILENAME_CHARS, '_', name)
-    if not _args.colab:
-        return safe_name
     # Truncate the filename if it exceeds the max_length (20)
     if len(safe_name) > 20:
         file_extension = safe_name.split('.')[-1]
modules/utils/youtube_manager.py CHANGED
@@ -1,4 +1,5 @@
 from pytubefix import YouTube
+import subprocess
 import os
 
 
@@ -12,4 +13,21 @@ def get_ytmetas(link):
 
 
 def get_ytaudio(ytdata: YouTube):
-    return ytdata.streams.get_audio_only().download(filename=os.path.join("modules", "yt_tmp.wav"))
+    # Somehow the audio is corrupted so need to convert to valid audio file.
+    # Fix for : https://github.com/jhj0517/Whisper-WebUI/issues/304
+
+    audio_path = ytdata.streams.get_audio_only().download(filename=os.path.join("modules", "yt_tmp.wav"))
+    temp_audio_path = os.path.join("modules", "yt_tmp_fixed.wav")
+
+    try:
+        subprocess.run([
+            'ffmpeg', '-y',
+            '-i', audio_path,
+            temp_audio_path
+        ], check=True)
+
+        os.replace(temp_audio_path, audio_path)
+        return audio_path
+    except subprocess.CalledProcessError as e:
+        print(f"Error during ffmpeg conversion: {e}")
+        return None
modules/whisper/faster_whisper_inference.py CHANGED
@@ -40,7 +40,7 @@ class FasterWhisperInference(WhisperBase):
 
     def transcribe(self,
                    audio: Union[str, BinaryIO, np.ndarray],
-                   progress: gr.Progress,
+                   progress: gr.Progress = gr.Progress(),
                    *whisper_params,
                    ) -> Tuple[List[dict], float]:
         """
@@ -126,7 +126,7 @@
     def update_model(self,
                      model_size: str,
                      compute_type: str,
-                     progress: gr.Progress
+                     progress: gr.Progress = gr.Progress()
                      ):
         """
         Update current model setting
@@ -159,7 +159,7 @@
         ----------
         Name list of models
         """
-        model_paths = {model:model for model in whisper.available_models()}
+        model_paths = {model:model for model in faster_whisper.available_models()}
         faster_whisper_prefix = "models--Systran--faster-whisper-"
 
         existing_models = os.listdir(self.model_dir)
modules/whisper/insanely_fast_whisper_inference.py CHANGED
@@ -39,7 +39,7 @@ class InsanelyFastWhisperInference(WhisperBase):
 
     def transcribe(self,
                    audio: Union[str, np.ndarray, torch.Tensor],
-                   progress: gr.Progress,
+                   progress: gr.Progress = gr.Progress(),
                    *whisper_params,
                    ) -> Tuple[List[dict], float]:
         """
@@ -75,18 +75,25 @@
         ) as progress:
             progress.add_task("[yellow]Transcribing...", total=None)
 
+            kwargs = {
+                "no_speech_threshold": params.no_speech_threshold,
+                "temperature": params.temperature,
+                "compression_ratio_threshold": params.compression_ratio_threshold,
+                "logprob_threshold": params.log_prob_threshold,
+            }
+
+            if self.current_model_size.endswith(".en"):
+                pass
+            else:
+                kwargs["language"] = params.lang
+                kwargs["task"] = "translate" if params.is_translate else "transcribe"
+
             segments = self.model(
                 inputs=audio,
                 return_timestamps=True,
-                chunk_length_s=params.chunk_length_s,
+                chunk_length_s=params.chunk_length,
                 batch_size=params.batch_size,
-                generate_kwargs={
-                    "language": params.lang,
-                    "task": "translate" if params.is_translate and self.current_model_size in self.translatable_models else "transcribe",
-                    "no_speech_threshold": params.no_speech_threshold,
-                    "temperature": params.temperature,
-                    "compression_ratio_threshold": params.compression_ratio_threshold
-                }
+                generate_kwargs=kwargs
             )
 
             segments_result = self.format_result(
@@ -98,7 +105,7 @@
     def update_model(self,
                      model_size: str,
                      compute_type: str,
-                     progress: gr.Progress,
+                     progress: gr.Progress = gr.Progress(),
                      ):
         """
         Update current model setting
modules/whisper/whisper_Inference.py CHANGED
@@ -28,7 +28,7 @@ class WhisperInference(WhisperBase):
 
     def transcribe(self,
                    audio: Union[str, np.ndarray, torch.Tensor],
-                   progress: gr.Progress,
+                   progress: gr.Progress = gr.Progress(),
                    *whisper_params,
                    ) -> Tuple[List[dict], float]:
         """
@@ -79,7 +79,7 @@
     def update_model(self,
                      model_size: str,
                      compute_type: str,
-                     progress: gr.Progress,
+                     progress: gr.Progress = gr.Progress(),
                      ):
         """
         Update current model setting
modules/whisper/whisper_base.py CHANGED
@@ -53,7 +53,7 @@ class WhisperBase(ABC):
     @abstractmethod
     def transcribe(self,
                    audio: Union[str, BinaryIO, np.ndarray],
-                   progress: gr.Progress,
+                   progress: gr.Progress = gr.Progress(),
                    *whisper_params,
                    ):
         """Inference whisper model to transcribe"""
@@ -63,7 +63,7 @@
     def update_model(self,
                      model_size: str,
                      compute_type: str,
-                     progress: gr.Progress
+                     progress: gr.Progress = gr.Progress()
                      ):
         """Initialize whisper model"""
         pass
@@ -104,7 +104,9 @@
             add_timestamp=add_timestamp
         )
 
-        if params.lang == "Automatic Detection":
+        if params.lang is None:
+            pass
+        elif params.lang == "Automatic Detection":
             params.lang = None
         else:
             language_code_dict = {value: key for key, value in whisper.tokenizer.LANGUAGES.items()}
@@ -128,11 +130,12 @@
                 origin_sample_rate = self.music_separator.audio_info.sample_rate
                 audio = self.resample_audio(audio=audio, original_sample_rate=origin_sample_rate)
 
-            self.music_separator.offload()
+            if params.uvr_enable_offload:
+                self.music_separator.offload()
 
         if params.vad_filter:
             # Explicit value set for float('inf') from gr.Number()
-            if params.max_speech_duration_s >= 9999:
+            if params.max_speech_duration_s is None or params.max_speech_duration_s >= 9999:
                 params.max_speech_duration_s = float('inf')
 
             vad_options = VadOptions(
@@ -171,10 +174,10 @@
         return result, elapsed_time
 
     def transcribe_file(self,
-                        files: list,
-                        input_folder_path: str,
-                        file_format: str,
-                        add_timestamp: bool,
+                        files: Optional[List] = None,
+                        input_folder_path: Optional[str] = None,
+                        file_format: str = "SRT",
+                        add_timestamp: bool = True,
                         progress=gr.Progress(),
                         *whisper_params,
                         ) -> list:
@@ -207,18 +210,21 @@
         try:
             if input_folder_path:
                 files = get_media_files(input_folder_path)
-                files = format_gradio_files(files)
+            if isinstance(files, str):
+                files = [files]
+            if files and isinstance(files[0], gr.utils.NamedString):
+                files = [file.name for file in files]
 
             files_info = {}
             for file in files:
                 transcribed_segments, time_for_task = self.run(
-                    file.name,
+                    file,
                     progress,
                     add_timestamp,
                    *whisper_params,
                )
 
-                file_name, file_ext = os.path.splitext(os.path.basename(file.name))
+                file_name, file_ext = os.path.splitext(os.path.basename(file))
                 subtitle, file_path = self.generate_and_write_file(
                     file_name=file_name,
                     transcribed_segments=transcribed_segments,
@@ -245,13 +251,11 @@
             print(f"Error transcribing file: {e}")
         finally:
             self.release_cuda_memory()
-            if not files:
-                self.remove_input_files([file.name for file in files])
 
     def transcribe_mic(self,
                        mic_audio: str,
-                       file_format: str,
-                       add_timestamp: bool,
+                       file_format: str = "SRT",
+                       add_timestamp: bool = True,
                        progress=gr.Progress(),
                        *whisper_params,
                        ) -> list:
@@ -302,12 +306,11 @@
             print(f"Error transcribing file: {e}")
         finally:
             self.release_cuda_memory()
-            self.remove_input_files([mic_audio])
 
     def transcribe_youtube(self,
                            youtube_link: str,
-                           file_format: str,
-                           add_timestamp: bool,
+                           file_format: str = "SRT",
+                           add_timestamp: bool = True,
                            progress=gr.Progress(),
                            *whisper_params,
                            ) -> list:
@@ -358,22 +361,15 @@
             )
             result_str = f"Done in {self.format_time(time_for_task)}! Subtitle file is in the outputs folder.\n\n{subtitle}"
 
+            if os.path.exists(audio):
+                os.remove(audio)
+
             return [result_str, result_file_path]
 
         except Exception as e:
             print(f"Error transcribing file: {e}")
         finally:
-            try:
-                if 'yt' not in locals():
-                    yt = get_ytdata(youtube_link)
-                    file_path = get_ytaudio(yt)
-                else:
-                    file_path = get_ytaudio(yt)
-
-                self.release_cuda_memory()
-                self.remove_input_files([file_path])
-            except Exception as cleanup_error:
-                pass
+            self.release_cuda_memory()
 
     @staticmethod
     def generate_and_write_file(file_name: str,
@@ -411,11 +407,12 @@
         else:
             output_path = os.path.join(output_dir, f"{file_name}")
 
-        if file_format == "SRT":
+        file_format = file_format.strip().lower()
+        if file_format == "srt":
             content = get_srt(transcribed_segments)
             output_path += '.srt'
 
-        elif file_format == "WebVTT":
+        elif file_format == "webvtt":
             content = get_vtt(transcribed_segments)
             output_path += '.vtt'
 
modules/whisper/whisper_parameter.py CHANGED
@@ -26,7 +26,6 @@ class WhisperParameters:
     max_speech_duration_s: gr.Number
     min_silence_duration_ms: gr.Number
     speech_pad_ms: gr.Number
-    chunk_length_s: gr.Number
     batch_size: gr.Number
     is_diarize: gr.Checkbox
     hf_token: gr.Textbox
@@ -52,6 +51,7 @@
     uvr_device: gr.Dropdown
     uvr_segment_size: gr.Number
     uvr_save_file: gr.Checkbox
+    uvr_enable_offload: gr.Checkbox
     """
     A data class for Gradio components of the Whisper Parameters. Use "before" Gradio pre-processing.
     This data class is used to mitigate the key-value problem between Gradio components and function parameters.
@@ -136,10 +136,6 @@
     speech_pad_ms: gr.Number
         This parameter is related with Silero VAD. Final speech chunks are padded by speech_pad_ms each side
 
-    chunk_length_s: gr.Number
-        This parameter is related with insanely-fast-whisper pipe.
-        Maximum length of each chunk
-
     batch_size: gr.Number
         This parameter is related with insanely-fast-whisper pipe. Batch size to pass to the pipe
 
@@ -193,8 +189,8 @@
        the maximum will be set by the default max_length.
 
     chunk_length: gr.Number
-        This parameter is related to faster-whisper. The length of audio segments. If it is not None, it will overwrite the
-        default chunk_length of the FeatureExtractor.
+        This parameter is related to faster-whisper and insanely-fast-whisper. The length of audio segments in seconds.
+        If it is not None, it will overwrite the default chunk_length of the FeatureExtractor.
 
     hallucination_silence_threshold: gr.Number
         This parameter is related to faster-whisper. When word_timestamps is True, skip silent periods longer than this threshold
@@ -223,6 +219,10 @@
 
     uvr_save_file: gr.Checkbox
         This parameter is related to UVR. Boolean value that determines whether to save the file or not.
+
+    uvr_enable_offload: gr.Checkbox
+        This parameter is related to UVR. Boolean value that determines whether to offload the UVR model or not
+        after each transcription.
     """
 
     def as_list(self) -> list:
@@ -252,52 +252,52 @@
 
 @dataclass
 class WhisperValues:
-    model_size: str
-    lang: str
-    is_translate: bool
-    beam_size: int
-    log_prob_threshold: float
-    no_speech_threshold: float
-    compute_type: str
-    best_of: int
-    patience: float
-    condition_on_previous_text: bool
-    prompt_reset_on_temperature: float
-    initial_prompt: Optional[str]
-    temperature: float
-    compression_ratio_threshold: float
-    vad_filter: bool
-    threshold: float
-    min_speech_duration_ms: int
-    max_speech_duration_s: float
-    min_silence_duration_ms: int
-    speech_pad_ms: int
-    chunk_length_s: int
-    batch_size: int
-    is_diarize: bool
-    hf_token: str
-    diarization_device: str
-    length_penalty: float
-    repetition_penalty: float
-    no_repeat_ngram_size: int
-    prefix: Optional[str]
-    suppress_blank: bool
-    suppress_tokens: Optional[str]
-    max_initial_timestamp: float
-    word_timestamps: bool
-    prepend_punctuations: Optional[str]
-    append_punctuations: Optional[str]
-    max_new_tokens: Optional[int]
-    chunk_length: Optional[int]
-    hallucination_silence_threshold: Optional[float]
-    hotwords: Optional[str]
-    language_detection_threshold: Optional[float]
-    language_detection_segments: int
-    is_bgm_separate: bool
-    uvr_model_size: str
-    uvr_device: str
-    uvr_segment_size: int
-    uvr_save_file: bool
+    model_size: str = "large-v2"
+    lang: Optional[str] = None
+    is_translate: bool = False
+    beam_size: int = 5
+    log_prob_threshold: float = -1.0
+    no_speech_threshold: float = 0.6
+    compute_type: str = "float16"
+    best_of: int = 5
+    patience: float = 1.0
+    condition_on_previous_text: bool = True
+    prompt_reset_on_temperature: float = 0.5
+    initial_prompt: Optional[str] = None
+    temperature: float = 0.0
+    compression_ratio_threshold: float = 2.4
+    vad_filter: bool = False
+    threshold: float = 0.5
+    min_speech_duration_ms: int = 250
+    max_speech_duration_s: float = float("inf")
+    min_silence_duration_ms: int = 2000
+    speech_pad_ms: int = 400
+    batch_size: int = 24
+    is_diarize: bool = False
+    hf_token: str = ""
+    diarization_device: str = "cuda"
+    length_penalty: float = 1.0
+    repetition_penalty: float = 1.0
+    no_repeat_ngram_size: int = 0
+    prefix: Optional[str] = None
+    suppress_blank: bool = True
+    suppress_tokens: Optional[str] = "[-1]"
+    max_initial_timestamp: float = 0.0
+    word_timestamps: bool = False
+    prepend_punctuations: Optional[str] = "\"'“¿([{-"
+    append_punctuations: Optional[str] = "\"'.。,,!!??::”)]}、"
+    max_new_tokens: Optional[int] = None
+    chunk_length: Optional[int] = 30
+    hallucination_silence_threshold: Optional[float] = None
+    hotwords: Optional[str] = None
+    language_detection_threshold: Optional[float] = None
+    language_detection_segments: int = 1
+    is_bgm_separate: bool = False
+    uvr_model_size: str = "UVR-MDX-NET-Inst_HQ_4"
+    uvr_device: str = "cuda"
+    uvr_segment_size: int = 256
+    uvr_save_file: bool = False
+    uvr_enable_offload: bool = True
     """
     A data class to use Whisper parameters.
     """
@@ -318,7 +318,6 @@
             "initial_prompt": None if not self.initial_prompt else self.initial_prompt,
             "temperature": self.temperature,
             "compression_ratio_threshold": self.compression_ratio_threshold,
-            "chunk_length_s": None if self.chunk_length_s is None else self.chunk_length_s,
             "batch_size": self.batch_size,
             "length_penalty": self.length_penalty,
             "repetition_penalty": self.repetition_penalty,
@@ -354,6 +353,17 @@
                 "model_size": self.uvr_model_size,
                 "segment_size": self.uvr_segment_size,
                 "save_file": self.uvr_save_file,
+                "enable_offload": self.uvr_enable_offload
             },
         }
         return data
+
+    def as_list(self) -> list:
+        """
+        Converts the data class attributes into a list
+
+        Returns
+        ----------
+        A list of Whisper parameters
+        """
+        return [getattr(self, f.name) for f in fields(self)]
requirements.txt CHANGED
@@ -2,14 +2,15 @@
 # If you're using it, update url to your CUDA version (CUDA 12.1 is minimum requirement):
 # For CUDA 12.1, use : https://download.pytorch.org/whl/cu121
 # For CUDA 12.4, use : https://download.pytorch.org/whl/cu124
---extra-index-url https://download.pytorch.org/whl/cu124
+--extra-index-url https://download.pytorch.org/whl/cu121
 
 
-torch
+torch==2.3.1
+torchaudio==2.3.1
 git+https://github.com/jhj0517/jhj0517-whisper.git
 faster-whisper==1.0.3
-transformers==4.42.3
-gradio==4.43.0
+transformers
+gradio
 pytubefix
 ruamel.yaml==0.18.6
 pyannote.audio==3.3.1
tests/test_bgm_separation.py ADDED
@@ -0,0 +1,53 @@
+from modules.utils.paths import *
+from modules.whisper.whisper_factory import WhisperFactory
+from modules.whisper.whisper_parameter import WhisperValues
+from test_config import *
+from test_transcription import download_file, test_transcribe
+
+import gradio as gr
+import pytest
+import torch
+import os
+
+
+@pytest.mark.skipif(
+    not is_cuda_available(),
+    reason="Skipping because the test only works on GPU"
+)
+@pytest.mark.parametrize(
+    "whisper_type,vad_filter,bgm_separation,diarization",
+    [
+        ("whisper", False, True, False),
+        ("faster-whisper", False, True, False),
+        ("insanely_fast_whisper", False, True, False)
+    ]
+)
+def test_bgm_separation_pipeline(
+    whisper_type: str,
+    vad_filter: bool,
+    bgm_separation: bool,
+    diarization: bool,
+):
+    test_transcribe(whisper_type, vad_filter, bgm_separation, diarization)
+
+
+@pytest.mark.skipif(
+    not is_cuda_available(),
+    reason="Skipping because the test only works on GPU"
+)
+@pytest.mark.parametrize(
+    "whisper_type,vad_filter,bgm_separation,diarization",
+    [
+        ("whisper", True, True, False),
+        ("faster-whisper", True, True, False),
+        ("insanely_fast_whisper", True, True, False)
+    ]
+)
+def test_bgm_separation_with_vad_pipeline(
+    whisper_type: str,
+    vad_filter: bool,
+    bgm_separation: bool,
+    diarization: bool,
+):
+    test_transcribe(whisper_type, vad_filter, bgm_separation, diarization)
+
tests/test_config.py ADDED
@@ -0,0 +1,17 @@
+from modules.utils.paths import *
+
+import os
+import torch
+
+TEST_FILE_DOWNLOAD_URL = "https://github.com/jhj0517/whisper_flutter_new/raw/main/example/assets/jfk.wav"
+TEST_FILE_PATH = os.path.join(WEBUI_DIR, "tests", "jfk.wav")
+TEST_YOUTUBE_URL = "https://www.youtube.com/watch?v=4WEQtgnBu0I&ab_channel=AndriaFitzer"
+TEST_WHISPER_MODEL = "tiny"
+TEST_UVR_MODEL = "UVR-MDX-NET-Inst_HQ_4"
+TEST_NLLB_MODEL = "facebook/nllb-200-distilled-600M"
+TEST_SUBTITLE_SRT_PATH = os.path.join(WEBUI_DIR, "tests", "test_srt.srt")
+TEST_SUBTITLE_VTT_PATH = os.path.join(WEBUI_DIR, "tests", "test_vtt.vtt")
+
+
+def is_cuda_available():
+    return torch.cuda.is_available()
tests/test_diarization.py ADDED
@@ -0,0 +1,31 @@
+from modules.utils.paths import *
+from modules.whisper.whisper_factory import WhisperFactory
+from modules.whisper.whisper_parameter import WhisperValues
+from test_config import *
+from test_transcription import download_file, test_transcribe
+
+import gradio as gr
+import pytest
+import os
+
+
+@pytest.mark.skipif(
+    not is_cuda_available(),
+    reason="Skipping because the test only works on GPU"
+)
+@pytest.mark.parametrize(
+    "whisper_type,vad_filter,bgm_separation,diarization",
+    [
+        ("whisper", False, False, True),
+        ("faster-whisper", False, False, True),
+        ("insanely_fast_whisper", False, False, True)
+    ]
+)
+def test_diarization_pipeline(
+    whisper_type: str,
+    vad_filter: bool,
+    bgm_separation: bool,
+    diarization: bool,
+):
+    test_transcribe(whisper_type, vad_filter, bgm_separation, diarization)
+
tests/test_srt.srt ADDED
@@ -0,0 +1,7 @@
+1
+00:00:00,000 --> 00:00:02,240
+You've got
+
+2
+00:00:02,240 --> 00:00:04,160
+a friend in me.
tests/test_transcription.py ADDED
@@ -0,0 +1,97 @@
+from modules.whisper.whisper_factory import WhisperFactory
+from modules.whisper.whisper_parameter import WhisperValues
+from modules.utils.paths import WEBUI_DIR
+from test_config import *
+
+import requests
+import pytest
+import gradio as gr
+import os
+
+
+@pytest.mark.parametrize(
+    "whisper_type,vad_filter,bgm_separation,diarization",
+    [
+        ("whisper", False, False, False),
+        ("faster-whisper", False, False, False),
+        ("insanely_fast_whisper", False, False, False)
+    ]
+)
+def test_transcribe(
+    whisper_type: str,
+    vad_filter: bool,
+    bgm_separation: bool,
+    diarization: bool,
+):
+    audio_path_dir = os.path.join(WEBUI_DIR, "tests")
+    audio_path = os.path.join(audio_path_dir, "jfk.wav")
+    if not os.path.exists(audio_path):
+        download_file(TEST_FILE_DOWNLOAD_URL, audio_path_dir)
+
+    whisper_inferencer = WhisperFactory.create_whisper_inference(
+        whisper_type=whisper_type,
+    )
+    print(
+        f"""Whisper Device : {whisper_inferencer.device}\n"""
+        f"""BGM Separation Device: {whisper_inferencer.music_separator.device}\n"""
+        f"""Diarization Device: {whisper_inferencer.diarizer.device}"""
+    )
+
+    hparams = WhisperValues(
+        model_size=TEST_WHISPER_MODEL,
+        vad_filter=vad_filter,
+        is_bgm_separate=bgm_separation,
+        compute_type=whisper_inferencer.current_compute_type,
+        uvr_enable_offload=True,
+        is_diarize=diarization,
+    ).as_list()
+
+    subtitle_str, file_path = whisper_inferencer.transcribe_file(
+        [audio_path],
+        None,
+        "SRT",
+        False,
+        gr.Progress(),
+        *hparams,
+    )
+
+    assert isinstance(subtitle_str, str) and subtitle_str
+    assert isinstance(file_path[0], str) and file_path
+
+    whisper_inferencer.transcribe_youtube(
+        TEST_YOUTUBE_URL,
+        "SRT",
+        False,
+        gr.Progress(),
+        *hparams,
+    )
+    assert isinstance(subtitle_str, str) and subtitle_str
+    assert isinstance(file_path[0], str) and file_path
+
+    whisper_inferencer.transcribe_mic(
+        audio_path,
+        "SRT",
+        False,
+        gr.Progress(),
+        *hparams,
+    )
+    assert isinstance(subtitle_str, str) and subtitle_str
+    assert isinstance(file_path[0], str) and file_path
+
+
+def download_file(url, save_dir):
+    if os.path.exists(TEST_FILE_PATH):
+        return
+
+    if not os.path.exists(save_dir):
+        os.makedirs(save_dir)
+
+    file_name = url.split("/")[-1]
+    file_path = os.path.join(save_dir, file_name)
+
+    response = requests.get(url)
+
+    with open(file_path, "wb") as file:
+        file.write(response.content)
+
+    print(f"File downloaded to: {file_path}")
tests/test_translation.py ADDED
@@ -0,0 +1,52 @@
+from modules.translation.deepl_api import DeepLAPI
+from modules.translation.nllb_inference import NLLBInference
+from test_config import *
+
+import os
+import pytest
+
+
+@pytest.mark.parametrize("model_size, file_path", [
+    (TEST_NLLB_MODEL, TEST_SUBTITLE_SRT_PATH),
+    (TEST_NLLB_MODEL, TEST_SUBTITLE_VTT_PATH),
+])
+def test_nllb_inference(
+    model_size: str,
+    file_path: str
+):
+    nllb_inferencer = NLLBInference()
+    print(f"NLLB Device : {nllb_inferencer.device}")
+
+    result_str, file_paths = nllb_inferencer.translate_file(
+        fileobjs=[file_path],
+        model_size=model_size,
+        src_lang="eng_Latn",
+        tgt_lang="kor_Hang",
+    )
+
+    assert isinstance(result_str, str)
+    assert isinstance(file_paths[0], str)
+
+
+@pytest.mark.parametrize("file_path", [
+    TEST_SUBTITLE_SRT_PATH,
+    TEST_SUBTITLE_VTT_PATH,
+])
+def test_deepl_api(
+    file_path: str
+):
+    deepl_api = DeepLAPI()
+
+    api_key = os.getenv("DEEPL_API_KEY")
+
+    result_str, file_paths = deepl_api.translate_deepl(
+        auth_key=api_key,
+        fileobjs=[file_path],
+        source_lang="English",
+        target_lang="Korean",
+        is_pro=False,
+        add_timestamp=True,
+    )
+
+    assert isinstance(result_str, str)
+    assert isinstance(file_paths[0], str)
tests/test_vad.py ADDED
@@ -0,0 +1,26 @@
+from modules.utils.paths import *
+from modules.whisper.whisper_factory import WhisperFactory
+from modules.whisper.whisper_parameter import WhisperValues
+from test_config import *
+from test_transcription import download_file, test_transcribe
+
+import gradio as gr
+import pytest
+import os
+
+
+@pytest.mark.parametrize(
+    "whisper_type,vad_filter,bgm_separation,diarization",
+    [
+        ("whisper", True, False, False),
+        ("faster-whisper", True, False, False),
+        ("insanely_fast_whisper", True, False, False)
+    ]
+)
+def test_vad_pipeline(
+    whisper_type: str,
+    vad_filter: bool,
+    bgm_separation: bool,
+    diarization: bool,
+):
+    test_transcribe(whisper_type, vad_filter, bgm_separation, diarization)
tests/test_vtt.vtt ADDED
@@ -0,0 +1,6 @@
+WEBVTT
+00:00:00.500 --> 00:00:02.000
+You've got
+
+00:00:02.500 --> 00:00:04.300
+a friend in me.