whisper-webui-translate

Runtime error

App Files Files Community

avans06 committed on Jul 26, 2023

Commit

8077be2

•

1 Parent(s): 7bdbe2a

Added support for translation models (NLLB, NLLB-CT2, MT5)

Browse files

to provide full translation capabilities for Whisper.

The interface now includes optional selectors for the NLLB Model (for translation) and the NLLB Language. If they are left unselected, the translation feature is not activated.
__________________

Whisper’s ‘translate’ task only translates other languages into English; OpenAI does not guarantee translation between arbitrary languages. For other language pairs, you can choose an NLLB Model to perform the translation. Note, however, that the NLLB Model runs slowly, and the total completion time may be twice as long as usual.

The more parameters an NLLB model has, the better it is expected to perform. However, it also requires more computational resources, which makes it slower to run. The versions converted to the ct2 (CTranslate2) format, on the other hand, require fewer resources and run faster.

Currently, word-level timestamps cannot be used together with NLLB Model translation, because Word Timestamps split the source text into words, and after translation the text is no longer a word-level string that matches those timestamps.

The ‘mt5-zh-ja-en-trimmed’ model is fine-tuned from Google’s ‘mt5-base’ model. It translates relatively quickly, but it supports only three languages: Chinese, Japanese, and English.

Files changed (10) hide show

README.md +1 -1
app.py +115 -56
config.json5 +96 -0
requirements-fasterWhisper.txt +3 -2
requirements-whisper.txt +2 -1
requirements.txt +3 -2
src/config.py +11 -4
src/nllb/nllbLangs.py +251 -0
src/nllb/nllbModel.py +221 -0
src/vadParallel.py +1 -1

README.md CHANGED Viewed

@@ -1,5 +1,5 @@
 ---
-title: Faster Whisper Webui
 emoji: ✨
 colorFrom: blue
 colorTo: purple

 ---
+title: Faster Whisper Webui with translate
 emoji: ✨
 colorFrom: blue
 colorTo: purple

app.py CHANGED Viewed

@@ -5,8 +5,8 @@ from typing import Iterator, Union
 import argparse
 from io import StringIO
 import os
-import pathlib
 import tempfile
 import zipfile
 import numpy as np
@@ -37,9 +37,14 @@ from src.utils import optional_int, slugify, write_srt, write_vtt
 from src.vad import AbstractTranscription, NonSpeechStrategy, PeriodicTranscriptionConfig, TranscriptionConfig, VadPeriodicTranscription, VadSileroTranscription
 from src.whisper.abstractWhisperContainer import AbstractWhisperContainer
 from src.whisper.whisperFactory import create_whisper_container
 import shutil
 import zhconv
 # Configure more application defaults in config.json5
@@ -92,26 +97,26 @@ class WhisperTranscriber:
             print("[Auto parallel] Using GPU devices " + str(self.parallel_device_list) + " and " + str(self.vad_cpu_cores) + " CPU cores for VAD/transcription.")
     # Entry function for the simple tab
-    def transcribe_webui_simple(self, modelName, languageName, urlData, multipleFiles, microphoneData, task,
                                 vad, vadMergeWindow, vadMaxMergeSize,
                                 word_timestamps: bool = False, highlight_words: bool = False):
-        return self.transcribe_webui_simple_progress(modelName, languageName, urlData, multipleFiles, microphoneData, task,
                                                      vad, vadMergeWindow, vadMaxMergeSize,
                                                      word_timestamps, highlight_words)
     # Entry function for the simple tab progress
-    def transcribe_webui_simple_progress(self, modelName, languageName, urlData, multipleFiles, microphoneData, task,
                                          vad, vadMergeWindow, vadMaxMergeSize,
                                          word_timestamps: bool = False, highlight_words: bool = False,
                                          progress=gr.Progress()):
         vadOptions = VadOptions(vad, vadMergeWindow, vadMaxMergeSize, self.app_config.vad_padding, self.app_config.vad_prompt_window, self.app_config.vad_initial_prompt_mode)
-        return self.transcribe_webui(modelName, languageName, urlData, multipleFiles, microphoneData, task, vadOptions,
                                      word_timestamps=word_timestamps, highlight_words=highlight_words, progress=progress)
     # Entry function for the full tab
-    def transcribe_webui_full(self, modelName, languageName, urlData, multipleFiles, microphoneData, task,
                               vad, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow, vadInitialPromptMode,
                               # Word timestamps
                               word_timestamps: bool, highlight_words: bool, prepend_punctuations: str, append_punctuations: str,
@@ -119,7 +124,7 @@ class WhisperTranscriber:
                               condition_on_previous_text: bool, fp16: bool, temperature_increment_on_fallback: float,
                               compression_ratio_threshold: float, logprob_threshold: float, no_speech_threshold: float):
-        return self.transcribe_webui_full_progress(modelName, languageName, urlData, multipleFiles, microphoneData, task,
                                 vad, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow, vadInitialPromptMode,
                                 word_timestamps, highlight_words, prepend_punctuations, append_punctuations,
                                 initial_prompt, temperature, best_of, beam_size, patience, length_penalty, suppress_tokens,
@@ -127,7 +132,7 @@ class WhisperTranscriber:
                                 compression_ratio_threshold, logprob_threshold, no_speech_threshold)
     # Entry function for the full tab with progress
-    def transcribe_webui_full_progress(self, modelName, languageName, urlData, multipleFiles, microphoneData, task,
                                         vad, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow, vadInitialPromptMode,
                                         # Word timestamps
                                         word_timestamps: bool, highlight_words: bool, prepend_punctuations: str, append_punctuations: str,
@@ -144,21 +149,21 @@ class WhisperTranscriber:
         vadOptions = VadOptions(vad, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow, vadInitialPromptMode)
-        return self.transcribe_webui(modelName, languageName, urlData, multipleFiles, microphoneData, task, vadOptions,
                                      initial_prompt=initial_prompt, temperature=temperature, best_of=best_of, beam_size=beam_size, patience=patience, length_penalty=length_penalty, suppress_tokens=suppress_tokens,
                                      condition_on_previous_text=condition_on_previous_text, fp16=fp16,
                                      compression_ratio_threshold=compression_ratio_threshold, logprob_threshold=logprob_threshold, no_speech_threshold=no_speech_threshold,
                                      word_timestamps=word_timestamps, prepend_punctuations=prepend_punctuations, append_punctuations=append_punctuations, highlight_words=highlight_words,
                                      progress=progress)
-    def transcribe_webui(self, modelName, languageName, urlData, multipleFiles, microphoneData, task,
                          vadOptions: VadOptions, progress: gr.Progress = None, highlight_words: bool = False,
                          **decodeOptions: dict):
         try:
             sources = self.__get_source(urlData, multipleFiles, microphoneData)
             try:
-                langObj = get_language_from_name(languageName)
                 selectedLanguage = languageName.lower() if languageName is not None and len(languageName) > 0 else None
                 selectedModel = modelName if modelName is not None else "base"
@@ -166,6 +171,12 @@ class WhisperTranscriber:
                                                  model_name=selectedModel, compute_type=self.app_config.compute_type,
                                                  cache=self.model_cache, models=self.app_config.models)
                 # Result
                 download = []
                 zip_file_lookup = {}
@@ -208,7 +219,7 @@ class WhisperTranscriber:
                     # Update progress
                     current_progress += source_audio_duration
-                    source_download, source_text, source_vtt = self.write_result(result, filePrefix, outputDirectory, highlight_words)
                     if len(sources) > 1:
                         # Add new line separators
@@ -252,30 +263,19 @@ class WhisperTranscriber:
                 return download, text, vtt
             finally:
-                if languageName == "Chinese":
-                    for file_path in source_download:
-                        try:
-                            with open(file_path, "r+", encoding="utf-8") as source:
-                                content = source.read()
-                                content = zhconv.convert(content, "zh-tw")
-                                source.seek(0)
-                                source.write(content)
-                        except Exception as e:
-                            # Ignore error - it's just a cleanup
-                            print("Error converting Traditional Chinese with download source file: \n" + file_path + ", \n" + str(e))
                 # Cleanup source
                 if self.deleteUploadedFiles:
                     for source in sources:
                         if self.app_config.merge_subtitle_with_sources and self.app_config.output_dir is not None and len(source_download) > 0:
-                            print("merge subtitle(srt) with source file [" + source.source_name + "]")
                             outRsult = ""
                             try:
                                 srt_path = source_download[0]
                                 save_path = os.path.join(self.app_config.output_dir, source.source_name)
                                 save_without_ext, ext = os.path.splitext(save_path)
-                                lang_ext = "." + langObj.code if langObj is not None else ""
-                                output_with_srt = save_without_ext + lang_ext + ext
                                 #ffmpeg -i "input.mp4" -i "input.srt" -c copy -c:s mov_text output.mp4
                                 input_file = ffmpeg.input(source.source_path)
@@ -435,20 +435,41 @@ class WhisperTranscriber:
         return config
-    def write_result(self, result: dict, source_name: str, output_dir: str, highlight_words: bool = False):
         if not os.path.exists(output_dir):
             os.makedirs(output_dir)
         text = result["text"]
         language = result["language"]
         languageMaxLineWidth = self.__get_max_line_width(language)
-        print("Max line width " + str(languageMaxLineWidth))
         vtt = self.__get_subs(result["segments"], "vtt", languageMaxLineWidth, highlight_words=highlight_words)
         srt = self.__get_subs(result["segments"], "srt", languageMaxLineWidth, highlight_words=highlight_words)
         json_result = json.dumps(result, indent=4, ensure_ascii=False)
-        if language == "zh":
             vtt = zhconv.convert(vtt, "zh-tw")
             srt = zhconv.convert(srt, "zh-tw")
             text = zhconv.convert(text, "zh-tw")
@@ -541,12 +562,29 @@ def create_ui(app_config: ApplicationConfig):
         ui_description += "\n\n" + "Max audio file length: " + str(app_config.input_audio_max_duration) + " s"
     ui_article = "Read the [documentation here](https://gitlab.com/aadnk/whisper-webui/-/blob/main/docs/options.md)."
     whisper_models = app_config.get_model_names()
-    common_inputs = lambda : [
-        gr.Dropdown(choices=whisper_models, value=app_config.default_model_name, label="Model"),
-        gr.Dropdown(choices=sorted(get_language_names()), label="Language", value=app_config.language),
         gr.Text(label="URL (YouTube, etc.)"),
         gr.File(label="Upload Files", file_count="multiple"),
         gr.Audio(source="microphone", type="filepath", label="Microphone Input"),
@@ -579,7 +617,13 @@ def create_ui(app_config: ApplicationConfig):
         with gr.Row():
             with gr.Column():
                 simple_submit = gr.Button("Submit", variant="primary")
-                simple_input = common_inputs() + common_vad_inputs() + common_word_timestamps_inputs()
             with gr.Column():
                 simple_output = common_output()
                 simple_flag = gr.Button("Flag")
@@ -602,27 +646,33 @@ def create_ui(app_config: ApplicationConfig):
         with gr.Row():
             with gr.Column():
                 full_submit = gr.Button("Submit", variant="primary")
-                full_input1 = common_inputs() + common_vad_inputs() + [
-                gr.Number(label="VAD - Padding (s)", precision=None, value=app_config.vad_padding),
-                gr.Number(label="VAD - Prompt Window (s)", precision=None, value=app_config.vad_prompt_window),
-                gr.Dropdown(choices=VAD_INITIAL_PROMPT_MODE_VALUES, label="VAD - Initial Prompt Mode")]
-                full_input2 = common_word_timestamps_inputs() + [
-                gr.Text(label="Word Timestamps - Prepend Punctuations", value=app_config.prepend_punctuations),
-                gr.Text(label="Word Timestamps - Append Punctuations", value=app_config.append_punctuations),
-                gr.TextArea(label="Initial Prompt"),
-                gr.Number(label="Temperature", value=app_config.temperature),
-                gr.Number(label="Best Of - Non-zero temperature", value=app_config.best_of, precision=0),
-                gr.Number(label="Beam Size - Zero temperature", value=app_config.beam_size, precision=0),
-                gr.Number(label="Patience - Zero temperature", value=app_config.patience),
-                gr.Number(label="Length Penalty - Any temperature", value=app_config.length_penalty),
-                gr.Text(label="Suppress Tokens - Comma-separated list of token IDs", value=app_config.suppress_tokens),
-                gr.Checkbox(label="Condition on previous text", value=app_config.condition_on_previous_text),
-                gr.Checkbox(label="FP16", value=app_config.fp16),
-                gr.Number(label="Temperature increment on fallback", value=app_config.temperature_increment_on_fallback),
-                gr.Number(label="Compression ratio threshold", value=app_config.compression_ratio_threshold),
-                gr.Number(label="Logprob threshold", value=app_config.logprob_threshold),
-                gr.Number(label="No speech threshold", value=app_config.no_speech_threshold)]
             with gr.Column():
                 full_output = common_output()
@@ -654,6 +704,7 @@ def create_ui(app_config: ApplicationConfig):
 if __name__ == '__main__':
     default_app_config = ApplicationConfig.create_default()
     whisper_models = default_app_config.get_model_names()
     # Environment variable overrides
     default_whisper_implementation = os.environ.get("WHISPER_IMPLEMENTATION", default_app_config.whisper_implementation)
@@ -707,6 +758,14 @@ if __name__ == '__main__':
     updated_config = default_app_config.update(**args)
     if (threads := args.pop("threads")) > 0:
         torch.set_num_threads(threads)

 import argparse
 from io import StringIO
+import time
 import os
 import tempfile
 import zipfile
 import numpy as np
 from src.vad import AbstractTranscription, NonSpeechStrategy, PeriodicTranscriptionConfig, TranscriptionConfig, VadPeriodicTranscription, VadSileroTranscription
 from src.whisper.abstractWhisperContainer import AbstractWhisperContainer
 from src.whisper.whisperFactory import create_whisper_container
+from src.nllb.nllbModel import NllbModel
+from src.nllb.nllbLangs import _TO_NLLB_LANG_CODE
+from src.nllb.nllbLangs import get_nllb_lang_names
+from src.nllb.nllbLangs import get_nllb_lang_from_name
 import shutil
 import zhconv
+import tqdm
 # Configure more application defaults in config.json5
             print("[Auto parallel] Using GPU devices " + str(self.parallel_device_list) + " and " + str(self.vad_cpu_cores) + " CPU cores for VAD/transcription.")
     # Entry function for the simple tab
+    def transcribe_webui_simple(self, modelName, languageName, nllbModelName, nllbLangName, urlData, multipleFiles, microphoneData, task,
                                 vad, vadMergeWindow, vadMaxMergeSize,
                                 word_timestamps: bool = False, highlight_words: bool = False):
+        return self.transcribe_webui_simple_progress(modelName, languageName, nllbModelName, nllbLangName, urlData, multipleFiles, microphoneData, task,
                                                      vad, vadMergeWindow, vadMaxMergeSize,
                                                      word_timestamps, highlight_words)
     # Entry function for the simple tab progress
+    def transcribe_webui_simple_progress(self, modelName, languageName, nllbModelName, nllbLangName, urlData, multipleFiles, microphoneData, task,
                                          vad, vadMergeWindow, vadMaxMergeSize,
                                          word_timestamps: bool = False, highlight_words: bool = False,
                                          progress=gr.Progress()):
         vadOptions = VadOptions(vad, vadMergeWindow, vadMaxMergeSize, self.app_config.vad_padding, self.app_config.vad_prompt_window, self.app_config.vad_initial_prompt_mode)
+        return self.transcribe_webui(modelName, languageName, nllbModelName, nllbLangName, urlData, multipleFiles, microphoneData, task, vadOptions,
                                      word_timestamps=word_timestamps, highlight_words=highlight_words, progress=progress)
     # Entry function for the full tab
+    def transcribe_webui_full(self, modelName, languageName, nllbModelName, nllbLangName, urlData, multipleFiles, microphoneData, task,
                               vad, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow, vadInitialPromptMode,
                               # Word timestamps
                               word_timestamps: bool, highlight_words: bool, prepend_punctuations: str, append_punctuations: str,
                               condition_on_previous_text: bool, fp16: bool, temperature_increment_on_fallback: float,
                               compression_ratio_threshold: float, logprob_threshold: float, no_speech_threshold: float):
+        return self.transcribe_webui_full_progress(modelName, languageName, nllbModelName, nllbLangName, urlData, multipleFiles, microphoneData, task,
                                 vad, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow, vadInitialPromptMode,
                                 word_timestamps, highlight_words, prepend_punctuations, append_punctuations,
                                 initial_prompt, temperature, best_of, beam_size, patience, length_penalty, suppress_tokens,
                                 compression_ratio_threshold, logprob_threshold, no_speech_threshold)
     # Entry function for the full tab with progress
+    def transcribe_webui_full_progress(self, modelName, languageName, nllbModelName, nllbLangName, urlData, multipleFiles, microphoneData, task,
                                         vad, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow, vadInitialPromptMode,
                                         # Word timestamps
                                         word_timestamps: bool, highlight_words: bool, prepend_punctuations: str, append_punctuations: str,
         vadOptions = VadOptions(vad, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow, vadInitialPromptMode)
+        return self.transcribe_webui(modelName, languageName, nllbModelName, nllbLangName, urlData, multipleFiles, microphoneData, task, vadOptions,
                                      initial_prompt=initial_prompt, temperature=temperature, best_of=best_of, beam_size=beam_size, patience=patience, length_penalty=length_penalty, suppress_tokens=suppress_tokens,
                                      condition_on_previous_text=condition_on_previous_text, fp16=fp16,
                                      compression_ratio_threshold=compression_ratio_threshold, logprob_threshold=logprob_threshold, no_speech_threshold=no_speech_threshold,
                                      word_timestamps=word_timestamps, prepend_punctuations=prepend_punctuations, append_punctuations=append_punctuations, highlight_words=highlight_words,
                                      progress=progress)
+    def transcribe_webui(self, modelName, languageName, nllbModelName, nllbLangName, urlData, multipleFiles, microphoneData, task,
                          vadOptions: VadOptions, progress: gr.Progress = None, highlight_words: bool = False,
                          **decodeOptions: dict):
         try:
             sources = self.__get_source(urlData, multipleFiles, microphoneData)
             try:
+                whisper_lang = get_language_from_name(languageName)
                 selectedLanguage = languageName.lower() if languageName is not None and len(languageName) > 0 else None
                 selectedModel = modelName if modelName is not None else "base"
                                                  model_name=selectedModel, compute_type=self.app_config.compute_type,
                                                  cache=self.model_cache, models=self.app_config.models)
+                nllb_lang = get_nllb_lang_from_name(nllbLangName)
+                selectedNllbModelName = nllbModelName if nllbModelName is not None and len(nllbModelName) > 0 else "nllb-200-distilled-600M/facebook"
+                selectedNllbModel = next((modelConfig for modelConfig in self.app_config.nllb_models if modelConfig.name == selectedNllbModelName), None)
+                nllb_model = NllbModel(model_config=selectedNllbModel, whisper_lang=whisper_lang, nllb_lang=nllb_lang) # load_model=True
                 # Result
                 download = []
                 zip_file_lookup = {}
                     # Update progress
                     current_progress += source_audio_duration
+                    source_download, source_text, source_vtt = self.write_result(result, nllb_model, filePrefix, outputDirectory, highlight_words)
                     if len(sources) > 1:
                         # Add new line separators
                 return download, text, vtt
             finally:
                 # Cleanup source
                 if self.deleteUploadedFiles:
                     for source in sources:
                         if self.app_config.merge_subtitle_with_sources and self.app_config.output_dir is not None and len(source_download) > 0:
+                            print("\nmerge subtitle(srt) with source file [" + source.source_name + "]\n")
                             outRsult = ""
                             try:
                                 srt_path = source_download[0]
                                 save_path = os.path.join(self.app_config.output_dir, source.source_name)
                                 save_without_ext, ext = os.path.splitext(save_path)
+                                source_lang = "." + whisper_lang.code if whisper_lang is not None else ""
+                                translate_lang = "." + nllb_lang.code if nllb_lang is not None else ""
+                                output_with_srt = save_without_ext + source_lang + translate_lang + ext
                                 #ffmpeg -i "input.mp4" -i "input.srt" -c copy -c:s mov_text output.mp4
                                 input_file = ffmpeg.input(source.source_path)
         return config
+    def write_result(self, result: dict, nllb_model: NllbModel, source_name: str, output_dir: str, highlight_words: bool = False):
         if not os.path.exists(output_dir):
             os.makedirs(output_dir)
         text = result["text"]
+        segments = result["segments"]
         language = result["language"]
         languageMaxLineWidth = self.__get_max_line_width(language)
+        if nllb_model.nllb_lang is not None:
+            try:
+                pbar = tqdm.tqdm(total=len(segments))
+                perf_start_time = time.perf_counter()
+                nllb_model.load_model()
+                for idx, segment in enumerate(segments):
+                    seg_text = segment["text"]
+                    if language == "zh":
+                        segment["text"] = zhconv.convert(seg_text, "zh-tw")
+                    if nllb_model.nllb_lang is not None:
+                        segment["text"] = nllb_model.translation(seg_text)
+                    pbar.update(1)
+                nllb_model.release_vram()
+                perf_end_time = time.perf_counter()
+                print("\n\nprocess segments took {} seconds.\n\n".format(perf_end_time - perf_start_time))
+            except Exception as e:
+                # Ignore error - it's just a cleanup
+                print("Error process segments: " + str(e))
+        print("Max line width " + str(languageMaxLineWidth) + " for language:" + language)
         vtt = self.__get_subs(result["segments"], "vtt", languageMaxLineWidth, highlight_words=highlight_words)
         srt = self.__get_subs(result["segments"], "srt", languageMaxLineWidth, highlight_words=highlight_words)
         json_result = json.dumps(result, indent=4, ensure_ascii=False)
+        if language == "zh" or (nllb_model.nllb_lang is not None and nllb_model.nllb_lang.code == "zho_Hant"):
             vtt = zhconv.convert(vtt, "zh-tw")
             srt = zhconv.convert(srt, "zh-tw")
             text = zhconv.convert(text, "zh-tw")
         ui_description += "\n\n" + "Max audio file length: " + str(app_config.input_audio_max_duration) + " s"
     ui_article = "Read the [documentation here](https://gitlab.com/aadnk/whisper-webui/-/blob/main/docs/options.md)."
+    ui_article += "\n\nWhisper's Task 'translate' only implements the functionality of translating other languages into English. "
+    ui_article += "OpenAI does not guarantee translations between arbitrary languages. In such cases, you can choose to use the NLLB Model to implement the translation task. "
+    ui_article += "However, it's important to note that the NLLB Model runs slowly, and the completion time may be twice as long as usual. "
+    ui_article += "\n\nThe larger the parameters of the NLLB model, the better its performance is expected to be. "
+    ui_article += "However, it also requires higher computational resources, making it slower to operate. "
+    ui_article += "On the other hand, the version converted from ct2 (CTranslate2) requires lower resources and operates at a faster speed."
+    ui_article += "\n\nCurrently, enabling word-level timestamps cannot be used in conjunction with NLLB Model translation "
+    ui_article += "because Word Timestamps will split the source text, and after translation, it becomes a non-word-level string. "
+    ui_article += "\n\nThe 'mt5-zh-ja-en-trimmed' model is finetuned from Google's 'mt5-base' model. "
+    ui_article += "This model has a relatively good translation speed, but it only supports three languages: Chinese, Japanese, and English. "
     whisper_models = app_config.get_model_names()
+    nllb_models = app_config.get_nllb_model_names()
+    common_whisper_inputs = lambda : [
+        gr.Dropdown(label="Whisper Model (for audio)", choices=whisper_models, value=app_config.default_model_name),
+        gr.Dropdown(label="Whisper Language", choices=sorted(get_language_names()), value=app_config.language),
+    ]
+    common_nllb_inputs = lambda : [
+        gr.Dropdown(label="NLLB Model (for translate)", choices=nllb_models),
+        gr.Dropdown(label="NLLB Language", choices=sorted(get_nllb_lang_names())),
+    ]
+    common_audio_inputs = lambda : [
         gr.Text(label="URL (YouTube, etc.)"),
         gr.File(label="Upload Files", file_count="multiple"),
         gr.Audio(source="microphone", type="filepath", label="Microphone Input"),
         with gr.Row():
             with gr.Column():
                 simple_submit = gr.Button("Submit", variant="primary")
+                with gr.Column():
+                    with gr.Row():
+                        simple_input = common_whisper_inputs()
+                    with gr.Row():
+                        simple_input += common_nllb_inputs()
+                with gr.Column():
+                    simple_input += common_audio_inputs() + common_vad_inputs() + common_word_timestamps_inputs()
             with gr.Column():
                 simple_output = common_output()
                 simple_flag = gr.Button("Flag")
         with gr.Row():
             with gr.Column():
                 full_submit = gr.Button("Submit", variant="primary")
+                with gr.Column():
+                    with gr.Row():
+                        full_input1 = common_whisper_inputs()
+                    with gr.Row():
+                        full_input1 += common_nllb_inputs()
+                with gr.Column():
+                    full_input1 += common_audio_inputs() + common_vad_inputs() + [
+                    gr.Number(label="VAD - Padding (s)", precision=None, value=app_config.vad_padding),
+                    gr.Number(label="VAD - Prompt Window (s)", precision=None, value=app_config.vad_prompt_window),
+                    gr.Dropdown(choices=VAD_INITIAL_PROMPT_MODE_VALUES, label="VAD - Initial Prompt Mode")]
+                    full_input2 = common_word_timestamps_inputs() + [
+                    gr.Text(label="Word Timestamps - Prepend Punctuations", value=app_config.prepend_punctuations),
+                    gr.Text(label="Word Timestamps - Append Punctuations", value=app_config.append_punctuations),
+                    gr.TextArea(label="Initial Prompt"),
+                    gr.Number(label="Temperature", value=app_config.temperature),
+                    gr.Number(label="Best Of - Non-zero temperature", value=app_config.best_of, precision=0),
+                    gr.Number(label="Beam Size - Zero temperature", value=app_config.beam_size, precision=0),
+                    gr.Number(label="Patience - Zero temperature", value=app_config.patience),
+                    gr.Number(label="Length Penalty - Any temperature", value=app_config.length_penalty),
+                    gr.Text(label="Suppress Tokens - Comma-separated list of token IDs", value=app_config.suppress_tokens),
+                    gr.Checkbox(label="Condition on previous text", value=app_config.condition_on_previous_text),
+                    gr.Checkbox(label="FP16", value=app_config.fp16),
+                    gr.Number(label="Temperature increment on fallback", value=app_config.temperature_increment_on_fallback),
+                    gr.Number(label="Compression ratio threshold", value=app_config.compression_ratio_threshold),
+                    gr.Number(label="Logprob threshold", value=app_config.logprob_threshold),
+                    gr.Number(label="No speech threshold", value=app_config.no_speech_threshold)]
             with gr.Column():
                 full_output = common_output()
 if __name__ == '__main__':
     default_app_config = ApplicationConfig.create_default()
     whisper_models = default_app_config.get_model_names()
+    nllb_models = default_app_config.get_nllb_model_names()
     # Environment variable overrides
     default_whisper_implementation = os.environ.get("WHISPER_IMPLEMENTATION", default_app_config.whisper_implementation)
     updated_config = default_app_config.update(**args)
+    #updated_config.whisper_implementation = "faster-whisper"
+    #updated_config.input_audio_max_duration = -1
+    #updated_config.default_model_name = "large-v2"
+    #updated_config.output_dir = "output"
+    #updated_config.vad_max_merge_size = 90
+    #updated_config.merge_subtitle_with_sources = True
+    #updated_config.autolaunch = True
     if (threads := args.pop("threads")) > 0:
         torch.set_num_threads(threads)

config.json5 CHANGED Viewed

@@ -43,6 +43,102 @@
         //    "url": "https://example.com/path/to/model",
         //}
     ],
     // Configuration options that will be used if they are not specified in the command line arguments.
     // * WEBUI options *

         //    "url": "https://example.com/path/to/model",
         //}
     ],
+    "nllb_models": [
+        {
+            "name": "nllb-200-distilled-1.3B-ct2fast:int8_float16/michaelfeil",
+            "url": "michaelfeil/ct2fast-nllb-200-distilled-1.3B",
+            "type": "huggingface"
+        },
+        {
+            "name": "nllb-200-3.3B-ct2fast:int8_float16/michaelfeil",
+            "url": "michaelfeil/ct2fast-nllb-200-3.3B",
+            "type": "huggingface"
+        },
+        {
+            "name": "nllb-200-1.3B-ct2:float16/JustFrederik",
+            "url": "JustFrederik/nllb-200-1.3B-ct2-float16",
+            "type": "huggingface"
+        },
+        {
+            "name": "nllb-200-distilled-1.3B-ct2:float16/JustFrederik",
+            "url": "JustFrederik/nllb-200-distilled-1.3B-ct2-float16",
+            "type": "huggingface"
+        },
+        {
+            "name": "nllb-200-1.3B-ct2:int8/JustFrederik",
+            "url": "JustFrederik/nllb-200-1.3B-ct2-int8",
+            "type": "huggingface"
+        },
+        {
+            "name": "nllb-200-distilled-1.3B-ct2:int8/JustFrederik",
+            "url": "JustFrederik/nllb-200-distilled-1.3B-ct2-int8",
+            "type": "huggingface"
+        },
+        {
+            "name": "mt5-zh-ja-en-trimmed/K024",
+            "url": "K024/mt5-zh-ja-en-trimmed",
+            "type": "huggingface"
+        },
+        {
+            "name": "mt5-zh-ja-en-trimmed-fine-tuned-v1/engmatic-earth",
+            "url": "engmatic-earth/mt5-zh-ja-en-trimmed-fine-tuned-v1",
+            "type": "huggingface"
+        },
+        {
+            "name": "nllb-200-distilled-600M/facebook",
+            "url": "facebook/nllb-200-distilled-600M",
+            "type": "huggingface"
+        },
+        {
+            "name": "nllb-200-distilled-600M-ct2/JustFrederik",
+            "url": "JustFrederik/nllb-200-distilled-600M-ct2",
+            "type": "huggingface"
+        },
+        {
+            "name": "nllb-200-distilled-600M-ct2:float16/JustFrederik",
+            "url": "JustFrederik/nllb-200-distilled-600M-ct2-float16",
+            "type": "huggingface"
+        },
+        {
+            "name": "nllb-200-distilled-600M-ct2:int8/JustFrederik",
+            "url": "JustFrederik/nllb-200-distilled-600M-ct2-int8",
+            "type": "huggingface"
+        },
+        // Uncomment the entries below to add the larger models
+        //   (the official Facebook 1.3B and 3.3B models and further CTranslate2 conversions).
+        // These model files are too large, so to avoid occupying too much disk space
+        //   on Hugging Face's free Spaces they are not included in the config by default.
+        //{
+        //    "name": "nllb-200-distilled-1.3B/facebook",
+        //    "url": "facebook/nllb-200-distilled-1.3B",
+        //    "type": "huggingface"
+        //},
+        //{
+        //    "name": "nllb-200-1.3B/facebook",
+        //    "url": "facebook/nllb-200-1.3B",
+        //    "type": "huggingface"
+        //},
+        //{
+        //    "name": "nllb-200-3.3B/facebook",
+        //    "url": "facebook/nllb-200-3.3B",
+        //    "type": "huggingface"
+        //},
+        //{
+        //    "name": "nllb-200-distilled-1.3B-ct2/JustFrederik",
+        //    "url": "JustFrederik/nllb-200-distilled-1.3B-ct2",
+        //    "type": "huggingface"
+        //},
+        //{
+        //    "name": "nllb-200-1.3B-ct2/JustFrederik",
+        //    "url": "JustFrederik/nllb-200-1.3B-ct2",
+        //    "type": "huggingface"
+        //},
+        //{
+        //    "name": "nllb-200-3.3B-ct2:float16/JustFrederik",
+        //    "url": "JustFrederik/nllb-200-3.3B-ct2-float16",
+        //    "type": "huggingface"
+        //},
+    ],
     // Configuration options that will be used if they are not specified in the command line arguments.
     // * WEBUI options *

requirements-fasterWhisper.txt CHANGED Viewed

@@ -1,4 +1,4 @@
-ctranslate2
 faster-whisper
 ffmpeg-python==0.2.0
 gradio==3.36.0
@@ -7,4 +7,5 @@ json5
 torch
 torchaudio
 more_itertools
-zhconv

+ctranslate2>=3.16.0
 faster-whisper
 ffmpeg-python==0.2.0
 gradio==3.36.0
 torch
 torchaudio
 more_itertools
+zhconv
+sentencepiece

requirements-whisper.txt CHANGED Viewed

@@ -7,4 +7,5 @@ yt-dlp
 torchaudio
 altair
 json5
-zhconv

 torchaudio
 altair
 json5
+zhconv
+sentencepiece

requirements.txt CHANGED Viewed

@@ -1,4 +1,4 @@
-ctranslate2
 faster-whisper
 ffmpeg-python==0.2.0
 gradio==3.36.0
@@ -7,4 +7,5 @@ json5
 torch
 torchaudio
 more_itertools
-zhconv

+ctranslate2>=3.16.0
 faster-whisper
 ffmpeg-python==0.2.0
 gradio==3.36.0
 torch
 torchaudio
 more_itertools
+zhconv
+sentencepiece

src/config.py CHANGED Viewed

@@ -47,11 +47,11 @@ class VadInitialPromptMode(Enum):
             return None
 class ApplicationConfig:
-    def __init__(self, models: List[ModelConfig] = [], input_audio_max_duration: int = 600,
                  share: bool = False, server_name: str = None, server_port: int = 7860,
                  queue_concurrency_count: int = 1, delete_uploaded_files: bool = True,
                  whisper_implementation: str = "whisper",
-                 default_model_name: str = "medium", default_vad: str = "silero-vad",
                  vad_parallel_devices: str = "", vad_cpu_cores: int = 1, vad_process_timeout: int = 1800,
                  auto_parallel: bool = False, output_dir: str = None,
                  model_dir: str = None, device: str = None,
@@ -72,6 +72,7 @@ class ApplicationConfig:
                  highlight_words: bool = False):
         self.models = models
         # WebUI settings
         self.input_audio_max_duration = input_audio_max_duration
@@ -83,6 +84,7 @@ class ApplicationConfig:
         self.whisper_implementation = whisper_implementation
         self.default_model_name = default_model_name
         self.default_vad = default_vad
         self.vad_parallel_devices = vad_parallel_devices
         self.vad_cpu_cores = vad_cpu_cores
@@ -124,6 +126,9 @@ class ApplicationConfig:
     def get_model_names(self):
         return [ x.name for x in self.models ]
     def update(self, **new_values):
         result = ApplicationConfig(**self.__dict__)
@@ -148,7 +153,9 @@ class ApplicationConfig:
             # Load using json5
             data = json5.load(f)
             data_models = data.pop("models", [])
             models = [ ModelConfig(**x) for x in data_models ]
-            return ApplicationConfig(models, **data)

             return None
 class ApplicationConfig:
+    def __init__(self, models: List[ModelConfig] = [], nllb_models: List[ModelConfig] = [], input_audio_max_duration: int = 600,
                  share: bool = False, server_name: str = None, server_port: int = 7860,
                  queue_concurrency_count: int = 1, delete_uploaded_files: bool = True,
                  whisper_implementation: str = "whisper",
+                 default_model_name: str = "medium", default_nllb_model_name: str = "distilled-600M", default_vad: str = "silero-vad",
                  vad_parallel_devices: str = "", vad_cpu_cores: int = 1, vad_process_timeout: int = 1800,
                  auto_parallel: bool = False, output_dir: str = None,
                  model_dir: str = None, device: str = None,
                  highlight_words: bool = False):
         self.models = models
+        self.nllb_models = nllb_models
         # WebUI settings
         self.input_audio_max_duration = input_audio_max_duration
         self.whisper_implementation = whisper_implementation
         self.default_model_name = default_model_name
+        self.default_nllb_model_name = default_nllb_model_name
         self.default_vad = default_vad
         self.vad_parallel_devices = vad_parallel_devices
         self.vad_cpu_cores = vad_cpu_cores
     def get_model_names(self):
         return [ x.name for x in self.models ]
+    def get_nllb_model_names(self):
+        return [ x.name for x in self.nllb_models ]
     def update(self, **new_values):
         result = ApplicationConfig(**self.__dict__)
             # Load using json5
             data = json5.load(f)
             data_models = data.pop("models", [])
+            data_nllb_models = data.pop("nllb_models", [])
             models = [ ModelConfig(**x) for x in data_models ]
+            nllb_models = [ ModelConfig(**x) for x in data_nllb_models ]
+            return ApplicationConfig(models, nllb_models, **data)

src/nllb/nllbLangs.py ADDED Viewed

	@@ -0,0 +1,251 @@

+class NllbLang():
+    """A single NLLB language entry, optionally paired with a Whisper language."""
+    def __init__(self, code, name, code_whisper=None, name_whisper=None):
+        # NLLB-side identifier (e.g. 'eng_Latn') and display name.
+        self.code, self.name = code, name
+        # Whisper-side code/name for the same language; None when no mapping was supplied.
+        self.code_whisper, self.name_whisper = code_whisper, name_whisper
+    def __str__(self):
+        template = "Language(code={}, name={})"
+        return template.format(self.code, self.name)
+# NLLB languages as NllbLang(nllb_code, nllb_name[, whisper_code, whisper_name]).
+# The two optional fields link an entry to the Whisper language it corresponds to;
+# entries without them have no Whisper mapping.
+NLLB_LANGS = [
+    NllbLang('ace_Arab', 'Acehnese (Arabic script)'),
+    NllbLang('ace_Latn', 'Acehnese (Latin script)'),
+    NllbLang('acm_Arab', 'Mesopotamian Arabic', 'ar', 'Arabic'),
+    NllbLang('acq_Arab', 'Ta’izzi-Adeni Arabic', 'ar', 'Arabic'),
+    NllbLang('aeb_Arab', 'Tunisian Arabic'),
+    # Afrikaans is Whisper 'af'; it was previously mis-mapped to 'am' (Amharic).
+    NllbLang('afr_Latn', 'Afrikaans', 'af', 'Afrikaans'),
+    NllbLang('ajp_Arab', 'South Levantine Arabic', 'ar', 'Arabic'),
+    NllbLang('aka_Latn', 'Akan'),
+    NllbLang('amh_Ethi', 'Amharic', 'am', 'Amharic'),
+    NllbLang('apc_Arab', 'North Levantine Arabic', 'ar', 'Arabic'),
+    NllbLang('arb_Arab', 'Modern Standard Arabic', 'ar', 'Arabic'),
+    NllbLang('arb_Latn', 'Modern Standard Arabic (Romanized)'),
+    NllbLang('ars_Arab', 'Najdi Arabic', 'ar', 'Arabic'),
+    NllbLang('ary_Arab', 'Moroccan Arabic', 'ar', 'Arabic'),
+    NllbLang('arz_Arab', 'Egyptian Arabic', 'ar', 'Arabic'),
+    NllbLang('asm_Beng', 'Assamese', 'as', 'Assamese'),
+    NllbLang('ast_Latn', 'Asturian'),
+    NllbLang('awa_Deva', 'Awadhi'),
+    NllbLang('ayr_Latn', 'Central Aymara'),
+    NllbLang('azb_Arab', 'South Azerbaijani', 'az', 'Azerbaijani'),
+    NllbLang('azj_Latn', 'North Azerbaijani', 'az', 'Azerbaijani'),
+    NllbLang('bak_Cyrl', 'Bashkir', 'ba', 'Bashkir'),
+    NllbLang('bam_Latn', 'Bambara'),
+    NllbLang('ban_Latn', 'Balinese'),
+    NllbLang('bel_Cyrl', 'Belarusian', 'be', 'Belarusian'),
+    NllbLang('bem_Latn', 'Bemba'),
+    NllbLang('ben_Beng', 'Bengali', 'bn', 'Bengali'),
+    NllbLang('bho_Deva', 'Bhojpuri'),
+    NllbLang('bjn_Arab', 'Banjar (Arabic script)'),
+    NllbLang('bjn_Latn', 'Banjar (Latin script)'),
+    NllbLang('bod_Tibt', 'Standard Tibetan', 'bo', 'Tibetan'),
+    NllbLang('bos_Latn', 'Bosnian', 'bs', 'Bosnian'),
+    NllbLang('bug_Latn', 'Buginese'),
+    NllbLang('bul_Cyrl', 'Bulgarian', 'bg', 'Bulgarian'),
+    NllbLang('cat_Latn', 'Catalan', 'ca', 'Catalan'),
+    NllbLang('ceb_Latn', 'Cebuano'),
+    NllbLang('ces_Latn', 'Czech', 'cs', 'Czech'),
+    NllbLang('cjk_Latn', 'Chokwe'),
+    NllbLang('ckb_Arab', 'Central Kurdish'),
+    NllbLang('crh_Latn', 'Crimean Tatar'),
+    NllbLang('cym_Latn', 'Welsh', 'cy', 'Welsh'),
+    NllbLang('dan_Latn', 'Danish', 'da', 'Danish'),
+    NllbLang('deu_Latn', 'German', 'de', 'German'),
+    NllbLang('dik_Latn', 'Southwestern Dinka'),
+    NllbLang('dyu_Latn', 'Dyula'),
+    NllbLang('dzo_Tibt', 'Dzongkha'),
+    NllbLang('ell_Grek', 'Greek', 'el', 'Greek'),
+    NllbLang('eng_Latn', 'English', 'en', 'English'),
+    NllbLang('epo_Latn', 'Esperanto'),
+    NllbLang('est_Latn', 'Estonian', 'et', 'Estonian'),
+    NllbLang('eus_Latn', 'Basque', 'eu', 'Basque'),
+    NllbLang('ewe_Latn', 'Ewe'),
+    NllbLang('fao_Latn', 'Faroese', 'fo', 'Faroese'),
+    NllbLang('fij_Latn', 'Fijian'),
+    NllbLang('fin_Latn', 'Finnish', 'fi', 'Finnish'),
+    NllbLang('fon_Latn', 'Fon'),
+    NllbLang('fra_Latn', 'French', 'fr', 'French'),
+    NllbLang('fur_Latn', 'Friulian'),
+    NllbLang('fuv_Latn', 'Nigerian Fulfulde'),
+    NllbLang('gla_Latn', 'Scottish Gaelic'),
+    NllbLang('gle_Latn', 'Irish'),
+    NllbLang('glg_Latn', 'Galician', 'gl', 'Galician'),
+    NllbLang('grn_Latn', 'Guarani'),
+    NllbLang('guj_Gujr', 'Gujarati', 'gu', 'Gujarati'),
+    NllbLang('hat_Latn', 'Haitian Creole', 'ht', 'Haitian creole'),
+    NllbLang('hau_Latn', 'Hausa', 'ha', 'Hausa'),
+    NllbLang('heb_Hebr', 'Hebrew', 'he', 'Hebrew'),
+    NllbLang('hin_Deva', 'Hindi', 'hi', 'Hindi'),
+    NllbLang('hne_Deva', 'Chhattisgarhi'),
+    NllbLang('hrv_Latn', 'Croatian', 'hr', 'Croatian'),
+    NllbLang('hun_Latn', 'Hungarian', 'hu', 'Hungarian'),
+    NllbLang('hye_Armn', 'Armenian', 'hy', 'Armenian'),
+    NllbLang('ibo_Latn', 'Igbo'),
+    NllbLang('ilo_Latn', 'Ilocano'),
+    NllbLang('ind_Latn', 'Indonesian', 'id', 'Indonesian'),
+    NllbLang('isl_Latn', 'Icelandic', 'is', 'Icelandic'),
+    NllbLang('ita_Latn', 'Italian', 'it', 'Italian'),
+    NllbLang('jav_Latn', 'Javanese', 'jw', 'Javanese'),
+    NllbLang('jpn_Jpan', 'Japanese', 'ja', 'Japanese'),
+    NllbLang('kab_Latn', 'Kabyle'),
+    NllbLang('kac_Latn', 'Jingpho'),
+    NllbLang('kam_Latn', 'Kamba'),
+    NllbLang('kan_Knda', 'Kannada', 'kn', 'Kannada'),
+    NllbLang('kas_Arab', 'Kashmiri (Arabic script)'),
+    NllbLang('kas_Deva', 'Kashmiri (Devanagari script)'),
+    NllbLang('kat_Geor', 'Georgian', 'ka', 'Georgian'),
+    NllbLang('knc_Arab', 'Central Kanuri (Arabic script)'),
+    NllbLang('knc_Latn', 'Central Kanuri (Latin script)'),
+    NllbLang('kaz_Cyrl', 'Kazakh', 'kk', 'Kazakh'),
+    NllbLang('kbp_Latn', 'Kabiyè'),
+    NllbLang('kea_Latn', 'Kabuverdianu'),
+    NllbLang('khm_Khmr', 'Khmer', 'km', 'Khmer'),
+    NllbLang('kik_Latn', 'Kikuyu'),
+    NllbLang('kin_Latn', 'Kinyarwanda'),
+    NllbLang('kir_Cyrl', 'Kyrgyz'),
+    NllbLang('kmb_Latn', 'Kimbundu'),
+    NllbLang('kmr_Latn', 'Northern Kurdish'),
+    NllbLang('kon_Latn', 'Kikongo'),
+    NllbLang('kor_Hang', 'Korean', 'ko', 'Korean'),
+    NllbLang('lao_Laoo', 'Lao', 'lo', 'Lao'),
+    NllbLang('lij_Latn', 'Ligurian'),
+    NllbLang('lim_Latn', 'Limburgish'),
+    NllbLang('lin_Latn', 'Lingala', 'ln', 'Lingala'),
+    NllbLang('lit_Latn', 'Lithuanian', 'lt', 'Lithuanian'),
+    NllbLang('lmo_Latn', 'Lombard'),
+    NllbLang('ltg_Latn', 'Latgalian'),
+    NllbLang('ltz_Latn', 'Luxembourgish', 'lb', 'Luxembourgish'),
+    NllbLang('lua_Latn', 'Luba-Kasai'),
+    NllbLang('lug_Latn', 'Ganda'),
+    NllbLang('luo_Latn', 'Luo'),
+    NllbLang('lus_Latn', 'Mizo'),
+    NllbLang('lvs_Latn', 'Standard Latvian', 'lv', 'Latvian'),
+    NllbLang('mag_Deva', 'Magahi'),
+    NllbLang('mai_Deva', 'Maithili'),
+    NllbLang('mal_Mlym', 'Malayalam', 'ml', 'Malayalam'),
+    NllbLang('mar_Deva', 'Marathi', 'mr', 'Marathi'),
+    NllbLang('min_Arab', 'Minangkabau (Arabic script)'),
+    NllbLang('min_Latn', 'Minangkabau (Latin script)'),
+    NllbLang('mkd_Cyrl', 'Macedonian', 'mk', 'Macedonian'),
+    NllbLang('plt_Latn', 'Plateau Malagasy', 'mg', 'Malagasy'),
+    NllbLang('mlt_Latn', 'Maltese', 'mt', 'Maltese'),
+    NllbLang('mni_Beng', 'Meitei (Bengali script)'),
+    NllbLang('khk_Cyrl', 'Halh Mongolian', 'mn', 'Mongolian'),
+    NllbLang('mos_Latn', 'Mossi'),
+    NllbLang('mri_Latn', 'Maori', 'mi', 'Maori'),
+    NllbLang('mya_Mymr', 'Burmese', 'my', 'Myanmar'),
+    NllbLang('nld_Latn', 'Dutch', 'nl', 'Dutch'),
+    NllbLang('nno_Latn', 'Norwegian Nynorsk', 'nn', 'Nynorsk'),
+    NllbLang('nob_Latn', 'Norwegian Bokmål', 'no', 'Norwegian'),
+    NllbLang('npi_Deva', 'Nepali', 'ne', 'Nepali'),
+    NllbLang('nso_Latn', 'Northern Sotho'),
+    NllbLang('nus_Latn', 'Nuer'),
+    NllbLang('nya_Latn', 'Nyanja'),
+    NllbLang('oci_Latn', 'Occitan', 'oc', 'Occitan'),
+    NllbLang('gaz_Latn', 'West Central Oromo'),
+    NllbLang('ory_Orya', 'Odia'),
+    NllbLang('pag_Latn', 'Pangasinan'),
+    NllbLang('pan_Guru', 'Eastern Panjabi', 'pa', 'Punjabi'),
+    NllbLang('pap_Latn', 'Papiamento'),
+    NllbLang('pes_Arab', 'Western Persian', 'fa', 'Persian'),
+    NllbLang('pol_Latn', 'Polish', 'pl', 'Polish'),
+    NllbLang('por_Latn', 'Portuguese', 'pt', 'Portuguese'),
+    NllbLang('prs_Arab', 'Dari'),
+    NllbLang('pbt_Arab', 'Southern Pashto', 'ps', 'Pashto'),
+    NllbLang('quy_Latn', 'Ayacucho Quechua'),
+    NllbLang('ron_Latn', 'Romanian', 'ro', 'Romanian'),
+    NllbLang('run_Latn', 'Rundi'),
+    NllbLang('rus_Cyrl', 'Russian', 'ru', 'Russian'),
+    NllbLang('sag_Latn', 'Sango'),
+    NllbLang('san_Deva', 'Sanskrit', 'sa', 'Sanskrit'),
+    NllbLang('sat_Olck', 'Santali'),
+    NllbLang('scn_Latn', 'Sicilian'),
+    NllbLang('shn_Mymr', 'Shan'),
+    NllbLang('sin_Sinh', 'Sinhala', 'si', 'Sinhala'),
+    NllbLang('slk_Latn', 'Slovak', 'sk', 'Slovak'),
+    NllbLang('slv_Latn', 'Slovenian', 'sl', 'Slovenian'),
+    NllbLang('smo_Latn', 'Samoan'),
+    NllbLang('sna_Latn', 'Shona', 'sn', 'Shona'),
+    NllbLang('snd_Arab', 'Sindhi', 'sd', 'Sindhi'),
+    NllbLang('som_Latn', 'Somali', 'so', 'Somali'),
+    NllbLang('sot_Latn', 'Southern Sotho'),
+    NllbLang('spa_Latn', 'Spanish', 'es', 'Spanish'),
+    NllbLang('als_Latn', 'Tosk Albanian', 'sq', 'Albanian'),
+    NllbLang('srd_Latn', 'Sardinian'),
+    NllbLang('srp_Cyrl', 'Serbian', 'sr', 'Serbian'),
+    NllbLang('ssw_Latn', 'Swati'),
+    NllbLang('sun_Latn', 'Sundanese', 'su', 'Sundanese'),
+    NllbLang('swe_Latn', 'Swedish', 'sv', 'Swedish'),
+    NllbLang('swh_Latn', 'Swahili', 'sw', 'Swahili'),
+    NllbLang('szl_Latn', 'Silesian'),
+    NllbLang('tam_Taml', 'Tamil', 'ta', 'Tamil'),
+    NllbLang('tat_Cyrl', 'Tatar', 'tt', 'Tatar'),
+    NllbLang('tel_Telu', 'Telugu', 'te', 'Telugu'),
+    NllbLang('tgk_Cyrl', 'Tajik', 'tg', 'Tajik'),
+    NllbLang('tgl_Latn', 'Tagalog', 'tl', 'Tagalog'),
+    NllbLang('tha_Thai', 'Thai', 'th', 'Thai'),
+    NllbLang('tir_Ethi', 'Tigrinya'),
+    NllbLang('taq_Latn', 'Tamasheq (Latin script)'),
+    NllbLang('taq_Tfng', 'Tamasheq (Tifinagh script)'),
+    NllbLang('tpi_Latn', 'Tok Pisin'),
+    NllbLang('tsn_Latn', 'Tswana'),
+    NllbLang('tso_Latn', 'Tsonga'),
+    NllbLang('tuk_Latn', 'Turkmen', 'tk', 'Turkmen'),
+    NllbLang('tum_Latn', 'Tumbuka'),
+    NllbLang('tur_Latn', 'Turkish', 'tr', 'Turkish'),
+    NllbLang('twi_Latn', 'Twi'),
+    NllbLang('tzm_Tfng', 'Central Atlas Tamazight'),
+    NllbLang('uig_Arab', 'Uyghur'),
+    NllbLang('ukr_Cyrl', 'Ukrainian', 'uk', 'Ukrainian'),
+    NllbLang('umb_Latn', 'Umbundu'),
+    NllbLang('urd_Arab', 'Urdu', 'ur', 'Urdu'),
+    NllbLang('uzn_Latn', 'Northern Uzbek', 'uz', 'Uzbek'),
+    NllbLang('vec_Latn', 'Venetian'),
+    NllbLang('vie_Latn', 'Vietnamese', 'vi', 'Vietnamese'),
+    NllbLang('war_Latn', 'Waray'),
+    NllbLang('wol_Latn', 'Wolof'),
+    NllbLang('xho_Latn', 'Xhosa'),
+    NllbLang('ydd_Hebr', 'Eastern Yiddish', 'yi', 'Yiddish'),
+    NllbLang('yor_Latn', 'Yoruba', 'yo', 'Yoruba'),
+    NllbLang('yue_Hant', 'Yue Chinese', 'zh', 'Chinese'),
+    NllbLang('zho_Hans', 'Chinese (Simplified)', 'zh', 'Chinese'),
+    NllbLang('zho_Hant', 'Chinese (Traditional)', 'zh', 'Chinese'),
+    NllbLang('zsm_Latn', 'Standard Malay', 'ms', 'Malay'),
+    NllbLang('zul_Latn', 'Zulu'),
+]
+# Lookup tables over NLLB_LANGS, keyed by the lower-cased attribute value.
+# Entries whose attribute is None are left out. When several entries share a key
+# (for example Whisper code 'zh'), the entry listed last in NLLB_LANGS is the one kept.
+_TO_NLLB_LANG_CODE = dict((lang.code.lower(), lang) for lang in NLLB_LANGS if lang.code is not None)
+_TO_NLLB_LANG_NAME = dict((lang.name.lower(), lang) for lang in NLLB_LANGS if lang.name is not None)
+_TO_NLLB_LANG_WHISPER_CODE = dict((lang.code_whisper.lower(), lang) for lang in NLLB_LANGS if lang.code_whisper is not None)
+_TO_NLLB_LANG_WHISPER_NAME = dict((lang.name_whisper.lower(), lang) for lang in NLLB_LANGS if lang.name_whisper is not None)
+def get_nllb_lang_from_code(lang_code, default=None) -> NllbLang:
+    """Return the language for an NLLB code such as 'eng_Latn', or `default` if unknown.
+
+    The lookup table is keyed by lower-cased codes, so the argument is lower-cased
+    before the lookup; without this, mixed-case codes like 'eng_Latn' never match.
+    A falsy code (None or '') yields `default`.
+    """
+    return _TO_NLLB_LANG_CODE.get(lang_code.lower() if lang_code else None, default)
+def get_nllb_lang_from_name(lang_name, default=None) -> NllbLang:
+    """Look up a language by its NLLB display name, ignoring case; `default` if not found."""
+    key = lang_name.lower() if lang_name else None
+    return _TO_NLLB_LANG_NAME.get(key, default)
+def get_nllb_lang_from_code_whisper(lang_code_whisper, default=None) -> NllbLang:
+    """Return the language mapped to a Whisper language code (e.g. 'en'), or `default`.
+
+    The code is lower-cased before the lookup because the table keys are lower-cased,
+    which keeps this function consistent with the name-based lookups.
+    A falsy code (None or '') yields `default`.
+    """
+    return _TO_NLLB_LANG_WHISPER_CODE.get(lang_code_whisper.lower() if lang_code_whisper else None, default)
+def get_nllb_lang_from_name_whisper(lang_name_whisper, default=None) -> NllbLang:
+    """Look up a language by its Whisper language name, ignoring case; `default` if not found."""
+    key = lang_name_whisper.lower() if lang_name_whisper else None
+    return _TO_NLLB_LANG_WHISPER_NAME.get(key, default)
+def get_nllb_lang_names():
+    """Return the NLLB display names of all entries, in NLLB_LANGS order."""
+    names = []
+    for entry in NLLB_LANGS:
+        names.append(entry.name)
+    return names
+if __name__ == "__main__":
+    # Manual smoke test: print one lookup by code, one by name, then every language name.
+    by_code = get_nllb_lang_from_code('eng_Latn')
+    print(by_code)
+    by_name = get_nllb_lang_from_name('English')
+    print(by_name)
+    all_names = get_nllb_lang_names()
+    print(all_names)

src/nllb/nllbModel.py ADDED Viewed

	@@ -0,0 +1,221 @@

+import os
+import warnings
+import huggingface_hub
+import requests
+import torch
+import ctranslate2
+import transformers
+from typing import Optional
+from src.config import ModelConfig
+from src.languages import Language
+from src.nllb.nllbLangs import NllbLang, get_nllb_lang_from_code_whisper
+class NllbModel:
+    """Translation model wrapper used to translate text from the Whisper language to an NLLB language.
+
+    The backend is chosen by substring checks on the resolved model path:
+    a path containing "ct2" uses ctranslate2.Translator, a path containing "mt5" uses a
+    transformers 'text2text-generation' pipeline, anything else a transformers
+    'translation' pipeline.
+    """
+    def __init__(
+        self,
+        model_config: ModelConfig,
+        device: str = None,
+        whisper_lang: Language = None,
+        nllb_lang: NllbLang = None,
+        download_root: Optional[str] = None,
+        local_files_only: bool = False,
+        load_model: bool = False,
+    ):
+        """Initializes the Nllb-200 model.
+        Args:
+          model_config: Config of the model to use (distilled-600M, distilled-1.3B,
+            1.3B, 3.3B...) or a path to a converted
+            model directory. When model_config.url is not an existing directory, the model
+            is downloaded from the Hugging Face Hub.
+          device: Device to use for computation (cpu, cuda, ipu, xpu, mkldnn, opengl, opencl,
+            ideep, hip, ve, fpga, ort, xla, lazy, vulkan, mps, meta, hpu, mtia).
+            When None, CUDA is selected if torch reports it as available, otherwise "cpu".
+          whisper_lang: Source language as selected in Whisper. When None, English ("en")
+            is assumed.
+          nllb_lang: Target language to translate into.
+          download_root: Directory where the models should be saved. If not set, the models
+            are saved in the standard Hugging Face cache directory.
+          local_files_only:  If True, avoid downloading the file and return the path to the
+            local cached file if it exists.
+          load_model: If True, load the tokenizer and model right away; otherwise
+            load_model() has to be called before translation().
+        """
+        self.whisper_lang = whisper_lang
+        # Source language on the NLLB side; None if the Whisper code has no NLLB mapping.
+        self.nllb_whisper_lang = get_nllb_lang_from_code_whisper(whisper_lang.code.lower() if whisper_lang is not None else "en")
+        self.nllb_lang = nllb_lang
+        self.model_config = model_config
+        if os.path.isdir(model_config.url):
+            self.model_path = model_config.url
+        else:
+            self.model_path = download_model(
+                model_config,
+                local_files_only=local_files_only,
+                cache_dir=download_root,
+            )
+        if device is None:
+            if torch.cuda.is_available():
+                # The CTranslate2 backend is given "cuda", the transformers backends "cuda:0".
+                device = "cuda" if "ct2" in self.model_path else "cuda:0"
+            else:
+                device = "cpu"
+        self.device = device
+        if load_model:
+            self.load_model()
+    def load_model(self):
+        """Load the tokenizer and the translation model for the backend implied by model_path."""
+        print('\n\nLoading model: %s\n\n' % self.model_path)
+        if "ct2" in self.model_path:
+            self.target_prefix = [self.nllb_lang.code]
+            self.trans_tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_path, src_lang=self.nllb_whisper_lang.code)
+            self.trans_model = ctranslate2.Translator(self.model_path, compute_type="auto", device=self.device)
+        elif "mt5" in self.model_path:
+            # Prompt prefix of the form "<source>2<target>: " built from Whisper language codes.
+            # Fall back to "en" when no Whisper language was given, matching __init__,
+            # instead of failing on None.code.
+            source_code = self.whisper_lang.code if self.whisper_lang is not None else "en"
+            self.mt5_prefix = source_code + "2" + self.nllb_lang.code_whisper + ": "
+            self.trans_tokenizer = transformers.T5Tokenizer.from_pretrained(self.model_path) #requires spiece.model
+            self.trans_model = transformers.MT5ForConditionalGeneration.from_pretrained(self.model_path)
+            self.trans_translator = transformers.pipeline('text2text-generation', model=self.trans_model, device=self.device, tokenizer=self.trans_tokenizer)
+        else: #NLLB
+            self.trans_tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_path)
+            self.trans_model = transformers.AutoModelForSeq2SeqLM.from_pretrained(self.model_path)
+            self.trans_translator = transformers.pipeline('translation', model=self.trans_model, device=self.device, tokenizer=self.trans_tokenizer, src_lang=self.nllb_whisper_lang.code, tgt_lang=self.nllb_lang.code)
+    def release_vram(self):
+        """Best-effort release of GPU memory held by the model; does nothing without CUDA.
+
+        Errors are printed rather than raised.
+        """
+        try:
+            if torch.cuda.is_available():
+                if "ct2" not in self.model_path:
+                    # transformers models are moved to the CPU before the reference is dropped.
+                    device = torch.device("cpu")
+                    self.trans_model.to(device)
+                del self.trans_model
+                torch.cuda.empty_cache()
+                print("release vram end.")
+        except Exception as e:
+            print("Error release vram: " + str(e))
+    def translation(self, text: str, max_length: int = 400):
+        """Translate `text` and return the translated string, or None if translation failed.
+
+        Args:
+          text: Text to translate.
+          max_length: Passed to the transformers pipelines (mt5 and NLLB branches);
+            it is not used by the ct2 branch.
+
+        Any exception is printed and swallowed, in which case None is returned.
+        """
+        output = None
+        result = None
+        try:
+            if "ct2" in self.model_path:
+                source = self.trans_tokenizer.convert_ids_to_tokens(self.trans_tokenizer.encode(text))
+                output = self.trans_model.translate_batch([source], target_prefix=[self.target_prefix])
+                # Skip the first token of the hypothesis (presumably the target-language prefix).
+                target = output[0].hypotheses[0][1:]
+                result = self.trans_tokenizer.decode(self.trans_tokenizer.convert_tokens_to_ids(target))
+            elif "mt5" in self.model_path:
+                output = self.trans_translator(self.mt5_prefix + text, max_length=max_length, num_beams=4)
+                result = output[0]['generated_text']
+            else: #NLLB
+                output = self.trans_translator(text, max_length=max_length)
+                result = output[0]['translation_text']
+        except Exception as e:
+            print("Error translation text: " + str(e))
+        return result
+# Identifiers accepted by check_model_name(); a configured model name only needs to
+# contain one of them as a substring.
+_MODELS = [
+    "distilled-600M",
+    "distilled-1.3B",
+    "1.3B",
+    "3.3B",
+    "ct2fast-nllb-200-distilled-1.3B-int8_float16",
+    "ct2fast-nllb-200-3.3B-int8_float16",
+    "nllb-200-3.3B-ct2-float16",
+    "nllb-200-1.3B-ct2",
+    "nllb-200-1.3B-ct2-int8",
+    "nllb-200-1.3B-ct2-float16",
+    "nllb-200-distilled-1.3B-ct2",
+    "nllb-200-distilled-1.3B-ct2-int8",
+    "nllb-200-distilled-1.3B-ct2-float16",
+    "nllb-200-distilled-600M-ct2",
+    "nllb-200-distilled-600M-ct2-int8",
+    "nllb-200-distilled-600M-ct2-float16",
+    "mt5-zh-ja-en-trimmed",
+    "mt5-zh-ja-en-trimmed-fine-tuned-v1",
+]
+def check_model_name(name):
+    """Return True when `name` contains at least one entry of _MODELS as a substring."""
+    for allowed_name in _MODELS:
+        if allowed_name in name:
+            return True
+    return False
+def download_model(
+    model_config: ModelConfig,
+    output_dir: Optional[str] = None,
+    local_files_only: bool = False,
+    cache_dir: Optional[str] = None,
+):
+    """Downloads a translation model from the Hugging Face Hub.
+
+    Adapted from "download_model" in the "utils.py" script of the "faster_whisper"
+    project, authored by guillaumekln.
+    The repository to download is taken from model_config.url.
+    Args:
+      model_config: config of the model to download (facebook/nllb-distilled-600M,
+        facebook/nllb-distilled-1.3B, facebook/nllb-1.3B, facebook/nllb-3.3B...).
+      output_dir: Directory where the model should be saved. If not set, the model is saved in
+        the cache directory.
+      local_files_only:  If True, avoid downloading the file and return the path to the local
+        cached file if it exists.
+      cache_dir: Path to the folder where cached files are stored.
+    Returns:
+      The path to the downloaded model.
+    Raises:
+      ValueError: if model_config.name does not contain a known model identifier.
+    """
+    if not check_model_name(model_config.name):
+        raise ValueError(
+            "Invalid model name '%s', expected one of: %s" % (model_config.name, ", ".join(_MODELS))
+        )
+    repo_id = model_config.url #"facebook/nllb-200-%s" %
+    # Only these files are fetched from the repository.
+    allow_patterns = [
+        "config.json",
+        "generation_config.json",
+        "model.bin",
+        "pytorch_model.bin",
+        "pytorch_model.bin.index.json",
+        "pytorch_model-00001-of-00003.bin",
+        "pytorch_model-00002-of-00003.bin",
+        "pytorch_model-00003-of-00003.bin",
+        "sentencepiece.bpe.model",
+        "tokenizer.json",
+        "tokenizer_config.json",
+        "shared_vocabulary.txt",
+        "shared_vocabulary.json",
+        "special_tokens_map.json",
+        "spiece.model",
+    ]
+    kwargs = {
+        "local_files_only": local_files_only,
+        "allow_patterns": allow_patterns,
+        #"tqdm_class": disabled_tqdm,
+    }
+    if output_dir is not None:
+        kwargs["local_dir"] = output_dir
+        kwargs["local_dir_use_symlinks"] = False
+    if cache_dir is not None:
+        kwargs["cache_dir"] = cache_dir
+    try:
+        return huggingface_hub.snapshot_download(repo_id, **kwargs)
+    except (
+        huggingface_hub.utils.HfHubHTTPError,
+        requests.exceptions.ConnectionError,
+    ) as exception:
+        # warnings.warn() does not do logger-style %-formatting: its second positional
+        # argument is the warning category. Passing repo_id there raised TypeError and
+        # prevented the cache fallback below, so the message is formatted up front.
+        warnings.warn(
+            "An error occured while synchronizing the model %s from the Hugging Face Hub:\n%s"
+            % (repo_id, exception)
+        )
+        warnings.warn(
+            "Trying to load the model directly from the local cache, if it exists."
+        )
+        # Retry without network access, using whatever is already cached.
+        kwargs["local_files_only"] = True
+        return huggingface_hub.snapshot_download(repo_id, **kwargs)

src/vadParallel.py CHANGED Viewed

@@ -204,7 +204,7 @@ class ParallelTranscription(AbstractTranscription):
                 gpu_parallel_context.close()
         perf_end_gpu = time.perf_counter()
-        print("Parallel transcription took " + str(perf_end_gpu - perf_start_gpu) + " seconds")
         return merged

                 gpu_parallel_context.close()
         perf_end_gpu = time.perf_counter()
+        print("\nParallel transcription took " + str(perf_end_gpu - perf_start_gpu) + " seconds")
         return merged