SoybeanMilk committed
Commit 22034c0
1 Parent(s): 2601bfd

Add support for the ALMA model.

Files changed (5)
  1. app.py +19 -0
  2. config.json5 +7 -0
  3. config.py +177 -0
  4. translationModel.py +259 -0
  5. utils.py +314 -0
app.py CHANGED
@@ -231,6 +231,8 @@ class WhisperTranscriber:
         nllbLangName: str = decodeOptions.pop("nllbLangName")
         mt5ModelName: str = decodeOptions.pop("mt5ModelName")
         mt5LangName: str = decodeOptions.pop("mt5LangName")
+        ALMAModelName: str = decodeOptions.pop("ALMAModelName")
+        ALMALangName: str = decodeOptions.pop("ALMALangName")

         translationBatchSize: int = decodeOptions.pop("translationBatchSize")
         translationNoRepeatNgramSize: int = decodeOptions.pop("translationNoRepeatNgramSize")
@@ -334,6 +336,10 @@ class WhisperTranscriber:
             selectedModelName = mt5ModelName if mt5ModelName is not None and len(mt5ModelName) > 0 else "mt5-zh-ja-en-trimmed/K024"
             selectedModel = next((modelConfig for modelConfig in self.app_config.models["mt5"] if modelConfig.name == selectedModelName), None)
             translationLang = get_lang_from_m2m100_name(mt5LangName)
+        elif translateInput == "ALMA" and ALMALangName is not None and len(ALMALangName) > 0:
+            selectedModelName = ALMAModelName if ALMAModelName is not None and len(ALMAModelName) > 0 else "ALMA-13B-GPTQ/TheBloke"
+            selectedModel = next((modelConfig for modelConfig in self.app_config.models["ALMA"] if modelConfig.name == selectedModelName), None)
+            translationLang = get_lang_from_m2m100_name(ALMALangName)

         if translationLang is not None:
             translationModel = TranslationModel(modelConfig=selectedModel, whisperLang=whisperLang, translationLang=translationLang, batchSize=translationBatchSize, noRepeatNgramSize=translationNoRepeatNgramSize, numBeams=translationNumBeams)
@@ -826,6 +832,7 @@ def create_ui(app_config: ApplicationConfig):
     nllb_models = app_config.get_model_names("nllb")
     m2m100_models = app_config.get_model_names("m2m100")
     mt5_models = app_config.get_model_names("mt5")
+    ALMA_models = app_config.get_model_names("ALMA")

     common_whisper_inputs = lambda : {
         gr.Dropdown(label="Whisper - Model (for audio)", choices=whisper_models, value=app_config.default_model_name, elem_id="whisperModelName"),
@@ -843,6 +850,10 @@ def create_ui(app_config: ApplicationConfig):
         gr.Dropdown(label="MT5 - Model (for translate)", choices=mt5_models, elem_id="mt5ModelName"),
         gr.Dropdown(label="MT5 - Language", choices=sorted(get_lang_m2m100_names(["en", "ja", "zh"])), elem_id="mt5LangName"),
     }
+    common_ALMA_inputs = lambda : {
+        gr.Dropdown(label="ALMA - Model (for translate)", choices=ALMA_models, elem_id="ALMAModelName"),
+        gr.Dropdown(label="ALMA - Language", choices=sorted(get_lang_m2m100_names(["en", "ja", "zh"])), elem_id="ALMALangName"),
+    }

     common_translation_inputs = lambda : {
         gr.Number(label="Translation - Batch Size", precision=0, value=app_config.translation_batch_size, elem_id="translationBatchSize"),
@@ -903,9 +914,13 @@
                 with gr.Tab(label="MT5") as simpleMT5Tab:
                     with gr.Row():
                         simpleInputDict.update(common_mt5_inputs())
+                with gr.Tab(label="ALMA") as simpleALMATab:
+                    with gr.Row():
+                        simpleInputDict.update(common_ALMA_inputs())
                 simpleM2M100Tab.select(fn=lambda: "m2m100", inputs = [], outputs= [simpleTranslateInput] )
                 simpleNllbTab.select(fn=lambda: "nllb", inputs = [], outputs= [simpleTranslateInput] )
                 simpleMT5Tab.select(fn=lambda: "mt5", inputs = [], outputs= [simpleTranslateInput] )
+                simpleALMATab.select(fn=lambda: "ALMA", inputs = [], outputs= [simpleTranslateInput] )
             with gr.Column():
                 with gr.Tab(label="URL") as simpleUrlTab:
                     simpleInputDict.update({gr.Text(label="URL (YouTube, etc.)", elem_id = "urlData")})
@@ -962,9 +977,13 @@
                 with gr.Tab(label="MT5") as fullMT5Tab:
                     with gr.Row():
                         fullInputDict.update(common_mt5_inputs())
+                with gr.Tab(label="ALMA") as fullALMATab:
+                    with gr.Row():
+                        fullInputDict.update(common_ALMA_inputs())
                 fullM2M100Tab.select(fn=lambda: "m2m100", inputs = [], outputs= [fullTranslateInput] )
                 fullNllbTab.select(fn=lambda: "nllb", inputs = [], outputs= [fullTranslateInput] )
                 fullMT5Tab.select(fn=lambda: "mt5", inputs = [], outputs= [fullTranslateInput] )
+                fullALMATab.select(fn=lambda: "ALMA", inputs = [], outputs= [fullTranslateInput] )
             with gr.Column():
                 with gr.Tab(label="URL") as fullUrlTab:
                     fullInputDict.update({gr.Text(label="URL (YouTube, etc.)", elem_id = "urlData")})
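
The app.py changes repeat one pattern per translation backend: pop the UI options, resolve a ModelConfig by name with a hard-coded default, and map the language. Below is a minimal, self-contained sketch of that fallback idiom; FakeModelConfig and select_model are hypothetical stand-ins, not part of the commit:

# Sketch of the model-selection fallback the diff repeats for each backend.
from dataclasses import dataclass
from typing import List, Optional

@dataclass
class FakeModelConfig:  # stands in for src.config.ModelConfig
    name: str
    url: str

def select_model(requested: Optional[str], models: List[FakeModelConfig],
                 default_name: str) -> Optional[FakeModelConfig]:
    # Same idiom as the diff: fall back to a default name, then scan the
    # configured models and return None when nothing matches.
    name = requested if requested else default_name
    return next((m for m in models if m.name == name), None)

models = [FakeModelConfig("ALMA-13B-GPTQ/TheBloke", "TheBloke/ALMA-13B-GPTQ")]
print(select_model(None, models, "ALMA-13B-GPTQ/TheBloke"))      # falls back to the default
print(select_model("missing", models, "ALMA-13B-GPTQ/TheBloke")) # -> None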
config.json5 CHANGED
@@ -187,6 +187,13 @@
             "url": "engmatic-earth/mt5-zh-ja-en-trimmed-fine-tuned-v1",
             "type": "huggingface"
         }
+    ],
+    "ALMA": [
+        {
+            "name": "ALMA-13B-GPTQ/TheBloke",
+            "url": "TheBloke/ALMA-13B-GPTQ",
+            "type": "huggingface",
+        },
     ]
 },
 // Configuration options that will be used if they are not specified in the command line arguments.
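
The new "ALMA" block is consumed by ApplicationConfig.parse_file, which turns each entry into a ModelConfig keyed under models["ALMA"]. A short sketch of that parsing step, assuming the json5 package the repo already uses and an inline string in place of the real config.json5:

# Sketch: how the new "ALMA" block is materialized (inline string, not the real file).
import json5  # the same parser config.py imports

snippet = """
{
    "models": {
        "ALMA": [
            {
                "name": "ALMA-13B-GPTQ/TheBloke",
                "url": "TheBloke/ALMA-13B-GPTQ",
                "type": "huggingface",  // json5 tolerates comments and trailing commas
            },
        ]
    }
}
"""

data = json5.loads(snippet)
for item in data["models"]["ALMA"]:
    # config.py does ModelConfig(**item) at this point; here we just show the fields.
    print(item["name"], "->", item["url"], f"({item['type']})")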
config.py ADDED
@@ -0,0 +1,177 @@
+from enum import Enum
+
+import os
+from typing import List, Dict, Literal
+
+
+class ModelConfig:
+    def __init__(self, name: str, url: str, path: str = None, type: str = "whisper", tokenizer_url: str = None):
+        """
+        Initialize a model configuration.
+
+        name: Name of the model
+        url: URL to download the model from
+        path: Path to the model file. If not set, the model will be downloaded from the URL.
+        type: Type of model. Can be whisper or huggingface.
+        """
+        self.name = name
+        self.url = url
+        self.path = path
+        self.type = type
+        self.tokenizer_url = tokenizer_url
+
+VAD_INITIAL_PROMPT_MODE_VALUES=["prepend_all_segments", "prepend_first_segment", "json_prompt_mode"]
+
+class VadInitialPromptMode(Enum):
+    PREPEND_ALL_SEGMENTS = 1
+    PREPREND_FIRST_SEGMENT = 2
+    JSON_PROMPT_MODE = 3
+
+    @staticmethod
+    def from_string(s: str):
+        normalized = s.lower() if s is not None and len(s) > 0 else None
+
+        if normalized == "prepend_all_segments":
+            return VadInitialPromptMode.PREPEND_ALL_SEGMENTS
+        elif normalized == "prepend_first_segment":
+            return VadInitialPromptMode.PREPREND_FIRST_SEGMENT
+        elif normalized == "json_prompt_mode":
+            return VadInitialPromptMode.JSON_PROMPT_MODE
+        elif normalized is not None and normalized != "":
+            raise ValueError(f"Invalid value for VadInitialPromptMode: {s}")
+        else:
+            return None
+
+class ApplicationConfig:
+    def __init__(self, models: Dict[Literal["whisper", "m2m100", "nllb", "mt5", "ALMA"], List[ModelConfig]],
+                 input_audio_max_duration: int = 600, share: bool = False, server_name: str = None, server_port: int = 7860,
+                 queue_concurrency_count: int = 1, delete_uploaded_files: bool = True,
+                 whisper_implementation: str = "whisper", default_model_name: str = "medium",
+                 default_nllb_model_name: str = "distilled-600M", default_vad: str = "silero-vad",
+                 vad_parallel_devices: str = "", vad_cpu_cores: int = 1, vad_process_timeout: int = 1800,
+                 auto_parallel: bool = False, output_dir: str = None,
+                 model_dir: str = None, device: str = None,
+                 verbose: bool = True, task: str = "transcribe", language: str = None,
+                 vad_initial_prompt_mode: str = "prepend_first_segment",
+                 vad_merge_window: float = 5, vad_max_merge_size: float = 30,
+                 vad_padding: float = 1, vad_prompt_window: float = 3,
+                 temperature: float = 0, best_of: int = 5, beam_size: int = 5,
+                 patience: float = None, length_penalty: float = None,
+                 suppress_tokens: str = "-1", initial_prompt: str = None,
+                 condition_on_previous_text: bool = True, fp16: bool = True,
+                 compute_type: str = "float16",
+                 temperature_increment_on_fallback: float = 0.2, compression_ratio_threshold: float = 2.4,
+                 logprob_threshold: float = -1.0, no_speech_threshold: float = 0.6,
+                 repetition_penalty: float = 1.0, no_repeat_ngram_size: int = 0,
+                 # Word timestamp settings
+                 word_timestamps: bool = False, prepend_punctuations: str = "\"\'“¿([{-",
+                 append_punctuations: str = "\"\'.。,,!!??::”)]}、",
+                 highlight_words: bool = False,
+                 # Diarization
+                 auth_token: str = None, diarization: bool = False, diarization_speakers: int = 2,
+                 diarization_min_speakers: int = 1, diarization_max_speakers: int = 5,
+                 diarization_process_timeout: int = 60,
+                 # Translation
+                 translation_batch_size: int = 2,
+                 translation_no_repeat_ngram_size: int = 3,
+                 translation_num_beams: int = 2,
+                 ):
+
+        self.models = models
+
+        # WebUI settings
+        self.input_audio_max_duration = input_audio_max_duration
+        self.share = share
+        self.server_name = server_name
+        self.server_port = server_port
+        self.queue_concurrency_count = queue_concurrency_count
+        self.delete_uploaded_files = delete_uploaded_files
+
+        self.whisper_implementation = whisper_implementation
+        self.default_model_name = default_model_name
+        self.default_nllb_model_name = default_nllb_model_name
+        self.default_vad = default_vad
+        self.vad_parallel_devices = vad_parallel_devices
+        self.vad_cpu_cores = vad_cpu_cores
+        self.vad_process_timeout = vad_process_timeout
+        self.auto_parallel = auto_parallel
+        self.output_dir = output_dir
+
+        self.model_dir = model_dir
+        self.device = device
+        self.verbose = verbose
+        self.task = task
+        self.language = language
+        self.vad_initial_prompt_mode = vad_initial_prompt_mode
+        self.vad_merge_window = vad_merge_window
+        self.vad_max_merge_size = vad_max_merge_size
+        self.vad_padding = vad_padding
+        self.vad_prompt_window = vad_prompt_window
+        self.temperature = temperature
+        self.best_of = best_of
+        self.beam_size = beam_size
+        self.patience = patience
+        self.length_penalty = length_penalty
+        self.suppress_tokens = suppress_tokens
+        self.initial_prompt = initial_prompt
+        self.condition_on_previous_text = condition_on_previous_text
+        self.fp16 = fp16
+        self.compute_type = compute_type
+        self.temperature_increment_on_fallback = temperature_increment_on_fallback
+        self.compression_ratio_threshold = compression_ratio_threshold
+        self.logprob_threshold = logprob_threshold
+        self.no_speech_threshold = no_speech_threshold
+        self.repetition_penalty = repetition_penalty
+        self.no_repeat_ngram_size = no_repeat_ngram_size
+
+        # Word timestamp settings
+        self.word_timestamps = word_timestamps
+        self.prepend_punctuations = prepend_punctuations
+        self.append_punctuations = append_punctuations
+        self.highlight_words = highlight_words
+
+        # Diarization settings
+        self.auth_token = auth_token
+        self.diarization = diarization
+        self.diarization_speakers = diarization_speakers
+        self.diarization_min_speakers = diarization_min_speakers
+        self.diarization_max_speakers = diarization_max_speakers
+        self.diarization_process_timeout = diarization_process_timeout
+        # Translation
+        self.translation_batch_size = translation_batch_size
+        self.translation_no_repeat_ngram_size = translation_no_repeat_ngram_size
+        self.translation_num_beams = translation_num_beams
+
+    def get_model_names(self, name: str):
+        return [ x.name for x in self.models[name] ]
+
+    def update(self, **new_values):
+        result = ApplicationConfig(**self.__dict__)
+
+        for key, value in new_values.items():
+            setattr(result, key, value)
+        return result
+
+    @staticmethod
+    def create_default(**kwargs):
+        app_config = ApplicationConfig.parse_file(os.environ.get("WHISPER_WEBUI_CONFIG", "config.json5"))
+
+        # Update with kwargs
+        if len(kwargs) > 0:
+            app_config = app_config.update(**kwargs)
+        return app_config
+
+    @staticmethod
+    def parse_file(config_path: str):
+        import json5
+
+        with open(config_path, "r", encoding="utf-8") as f:
+            # Load using json5
+            data = json5.load(f)
+            data_models = data.pop("models", [])
+            models: Dict[Literal["whisper", "m2m100", "nllb", "mt5", "ALMA"], List[ModelConfig]] = {
+                key: [ModelConfig(**item) for item in value]
+                for key, value in data_models.items()
+            }
+
+            return ApplicationConfig(models, **data)
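
A usage sketch for the two helpers the UI code calls, get_model_names and update. It builds an ApplicationConfig directly instead of going through create_default, so no config.json5 needs to exist; the model entries are illustrative only:

# Sketch: constructing ApplicationConfig by hand and exercising its helpers.
from src.config import ApplicationConfig, ModelConfig  # assumes the repo layout

models = {
    "whisper": [ModelConfig(name="medium", url="medium")],
    "ALMA": [ModelConfig(name="ALMA-13B-GPTQ/TheBloke",
                         url="TheBloke/ALMA-13B-GPTQ", type="huggingface")],
}

app_config = ApplicationConfig(models, server_port=7861)
print(app_config.get_model_names("ALMA"))  # ['ALMA-13B-GPTQ/TheBloke']

# update() copies the config, then overrides the given fields on the copy
tweaked = app_config.update(translation_batch_size=4)
print(tweaked.translation_batch_size)      # 4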
translationModel.py ADDED
@@ -0,0 +1,259 @@
+import os
+import warnings
+import huggingface_hub
+import requests
+import torch
+
+import ctranslate2
+import transformers
+
+import re
+
+from typing import Optional
+from src.config import ModelConfig
+from src.translation.translationLangs import TranslationLang, get_lang_from_whisper_code
+from peft import PeftModel
+
+class TranslationModel:
+    def __init__(
+        self,
+        modelConfig: ModelConfig,
+        device: str = None,
+        whisperLang: TranslationLang = None,
+        translationLang: TranslationLang = None,
+        batchSize: int = 2,
+        noRepeatNgramSize: int = 3,
+        numBeams: int = 2,
+        downloadRoot: Optional[str] = None,
+        localFilesOnly: bool = False,
+        loadModel: bool = False,
+    ):
+        """Initializes the M2M100 / Nllb-200 / mt5 / ALMA model.
+
+        Args:
+          modelConfig: Config of the model to use (distilled-600M, distilled-1.3B,
+            1.3B, 3.3B...) or a path to a converted
+            model directory. When a size is configured, the converted model is downloaded
+            from the Hugging Face Hub.
+          device: Device to use for computation (cpu, cuda, ipu, xpu, mkldnn, opengl, opencl,
+            ideep, hip, ve, fpga, ort, xla, lazy, vulkan, mps, meta, hpu, mtia).
+          device_index: Device ID to use.
+            The model can also be loaded on multiple GPUs by passing a list of IDs
+            (e.g. [0, 1, 2, 3]). In that case, multiple transcriptions can run in parallel
+            when transcribe() is called from multiple Python threads (see also num_workers).
+          compute_type: Type to use for computation.
+            See https://opennmt.net/CTranslate2/quantization.html.
+          cpu_threads: Number of threads to use when running on CPU (4 by default).
+            A non zero value overrides the OMP_NUM_THREADS environment variable.
+          num_workers: When transcribe() is called from multiple Python threads,
+            having multiple workers enables true parallelism when running the model
+            (concurrent calls to self.model.generate() will run in parallel).
+            This can improve the global throughput at the cost of increased memory usage.
+          downloadRoot: Directory where the models should be saved. If not set, the models
+            are saved in the standard Hugging Face cache directory.
+          localFilesOnly: If True, avoid downloading the file and return the path to the
+            local cached file if it exists.
+        """
+        self.modelConfig = modelConfig
+        self.whisperLang = whisperLang # self.translationLangWhisper = get_lang_from_whisper_code(whisperLang.code.lower() if whisperLang is not None else "en")
+        self.translationLang = translationLang
+
+        if translationLang is None:
+            return
+
+        self.batchSize = batchSize
+        self.noRepeatNgramSize = noRepeatNgramSize
+        self.numBeams = numBeams
+
+        if os.path.isdir(modelConfig.url):
+            self.modelPath = modelConfig.url
+        else:
+            self.modelPath = download_model(
+                modelConfig,
+                localFilesOnly=localFilesOnly,
+                cacheDir=downloadRoot,
+            )
+
+        if device is None:
+            if torch.cuda.is_available():
+                device = "cuda" if "ct2" in self.modelPath else "cuda:0"
+            else:
+                device = "cpu"
+
+        self.device = device
+
+        if loadModel:
+            self.load_model()
+
+    def load_model(self):
+        print('\n\nLoading model: %s\n\n' % self.modelPath)
+        if "ct2" in self.modelPath:
+            if "nllb" in self.modelPath:
+                self.transTokenizer = transformers.AutoTokenizer.from_pretrained(self.modelConfig.tokenizer_url if self.modelConfig.tokenizer_url is not None and len(self.modelConfig.tokenizer_url) > 0 else self.modelPath, src_lang=self.whisperLang.nllb.code)
+                self.targetPrefix = [self.translationLang.nllb.code]
+            elif "m2m100" in self.modelPath:
+                self.transTokenizer = transformers.AutoTokenizer.from_pretrained(self.modelConfig.tokenizer_url if self.modelConfig.tokenizer_url is not None and len(self.modelConfig.tokenizer_url) > 0 else self.modelPath, src_lang=self.whisperLang.m2m100.code)
+                self.targetPrefix = [self.transTokenizer.lang_code_to_token[self.translationLang.m2m100.code]]
+            self.transModel = ctranslate2.Translator(self.modelPath, compute_type="auto", device=self.device)
+        elif "mt5" in self.modelPath:
+            self.mt5Prefix = self.whisperLang.whisper.code + "2" + self.translationLang.whisper.code + ": "
+            self.transTokenizer = transformers.T5Tokenizer.from_pretrained(self.modelPath, legacy=False) # requires spiece.model
+            self.transModel = transformers.MT5ForConditionalGeneration.from_pretrained(self.modelPath)
+            self.transTranslator = transformers.pipeline('text2text-generation', model=self.transModel, device=self.device, tokenizer=self.transTokenizer)
+        elif "ALMA" in self.modelPath:
+            self.ALMAPrefix = "Translate this from " + self.whisperLang.whisper.code + " to " + self.translationLang.whisper.code + ":" + self.whisperLang.whisper.code + ":"
+            self.transTokenizer = transformers.AutoTokenizer.from_pretrained(self.modelPath, use_fast=True)
+            self.transModel = transformers.AutoModelForCausalLM.from_pretrained(self.modelPath, device_map="auto", trust_remote_code=False, revision="main")
+            self.transTranslator = transformers.pipeline("text-generation", model=self.transModel, tokenizer=self.transTokenizer, batch_size=2, do_sample=True, temperature=0.7, top_p=0.95, top_k=40, repetition_penalty=1.1)
+        else:
+            self.transTokenizer = transformers.AutoTokenizer.from_pretrained(self.modelPath)
+            self.transModel = transformers.AutoModelForSeq2SeqLM.from_pretrained(self.modelPath)
+            if "m2m100" in self.modelPath:
+                self.transTranslator = transformers.pipeline('translation', model=self.transModel, device=self.device, tokenizer=self.transTokenizer, src_lang=self.whisperLang.m2m100.code, tgt_lang=self.translationLang.m2m100.code)
+            else: # NLLB
+                self.transTranslator = transformers.pipeline('translation', model=self.transModel, device=self.device, tokenizer=self.transTokenizer, src_lang=self.whisperLang.nllb.code, tgt_lang=self.translationLang.nllb.code)
+
+    def release_vram(self):
+        try:
+            if torch.cuda.is_available():
+                if "ct2" not in self.modelPath:
+                    device = torch.device("cpu")
+                    self.transModel.to(device)
+                del self.transModel
+                torch.cuda.empty_cache()
+            print("release vram end.")
+        except Exception as e:
+            print("Error release vram: " + str(e))
+
+
+    def translation(self, text: str, max_length: int = 400):
+        output = None
+        result = None
+        try:
+            if "ct2" in self.modelPath:
+                source = self.transTokenizer.convert_ids_to_tokens(self.transTokenizer.encode(text))
+                output = self.transModel.translate_batch([source], target_prefix=[self.targetPrefix], max_batch_size=self.batchSize, no_repeat_ngram_size=self.noRepeatNgramSize, beam_size=self.numBeams)
+                target = output[0].hypotheses[0][1:]
+                result = self.transTokenizer.decode(self.transTokenizer.convert_tokens_to_ids(target))
+            elif "mt5" in self.modelPath:
+                output = self.transTranslator(self.mt5Prefix + text, max_length=max_length, batch_size=self.batchSize, no_repeat_ngram_size=self.noRepeatNgramSize, num_beams=self.numBeams) #, num_return_sequences=2
+                result = output[0]['generated_text']
+            elif "ALMA" in self.modelPath:
+                output = self.transTranslator(self.ALMAPrefix + text + self.translationLang.whisper.code + ":", max_length=max_length, batch_size=self.batchSize, no_repeat_ngram_size=self.noRepeatNgramSize, num_beams=self.numBeams)
+                result = output[0]['generated_text']
+                result = re.sub(rf'^(.*{self.translationLang.whisper.code}: )', '', result) # Remove the prompt from the result
+                result = re.sub(rf'^(Translate this from .* to .*:)', '', result) # Remove the translation instruction
+                return result.strip()
+            else: # M2M100 & NLLB
+                output = self.transTranslator(text, max_length=max_length, batch_size=self.batchSize, no_repeat_ngram_size=self.noRepeatNgramSize, num_beams=self.numBeams)
+                result = output[0]['translation_text']
+        except Exception as e:
+            print("Error translation text: " + str(e))
+
+        return result
+
+
+_MODELS = ["distilled-600M", "distilled-1.3B", "1.3B", "3.3B",
+           "ct2fast-nllb-200-distilled-1.3B-int8_float16",
+           "ct2fast-nllb-200-3.3B-int8_float16",
+           "nllb-200-3.3B-ct2-float16", "nllb-200-1.3B-ct2", "nllb-200-1.3B-ct2-int8", "nllb-200-1.3B-ct2-float16",
+           "nllb-200-distilled-1.3B-ct2", "nllb-200-distilled-1.3B-ct2-int8", "nllb-200-distilled-1.3B-ct2-float16",
+           "nllb-200-distilled-600M-ct2", "nllb-200-distilled-600M-ct2-int8", "nllb-200-distilled-600M-ct2-float16",
+           "m2m100_1.2B-ct2", "m2m100_418M-ct2", "m2m100-12B-ct2",
+           "m2m100_1.2B", "m2m100_418M",
+           "mt5-zh-ja-en-trimmed",
+           "mt5-zh-ja-en-trimmed-fine-tuned-v1",
+           "ALMA-13B-GPTQ"]
+
+def check_model_name(name):
+    return any(allowed_name in name for allowed_name in _MODELS)
+
+def download_model(
+    modelConfig: ModelConfig,
+    outputDir: Optional[str] = None,
+    localFilesOnly: bool = False,
+    cacheDir: Optional[str] = None,
+):
+    """"download_model" is referenced from the "utils.py" script
+    of the "faster_whisper" project, authored by guillaumekln.
+
+    Downloads a translation model from the Hugging Face Hub.
+
+    Args:
+      modelConfig: config of the model to download (facebook/nllb-distilled-600M,
+        facebook/nllb-distilled-1.3B, facebook/nllb-1.3B, facebook/nllb-3.3B...).
+      outputDir: Directory where the model should be saved. If not set, the model is saved in
+        the cache directory.
+      localFilesOnly: If True, avoid downloading the file and return the path to the local
+        cached file if it exists.
+      cacheDir: Path to the folder where cached files are stored.
+
+    Returns:
+      The path to the downloaded model.
+
+    Raises:
+      ValueError: if the model size is invalid.
+    """
+    if not check_model_name(modelConfig.name):
+        raise ValueError(
+            "Invalid model name '%s', expected one of: %s" % (modelConfig.name, ", ".join(_MODELS))
+        )
+
+    repoId = modelConfig.url # "facebook/nllb-200-%s" %
+
+    allowPatterns = [
+        "config.json",
+        "generation_config.json",
+        "model.bin",
+        "pytorch_model.bin",
+        "pytorch_model.bin.index.json",
+        "pytorch_model-*.bin",
+        "pytorch_model-00001-of-00003.bin",
+        "pytorch_model-00002-of-00003.bin",
+        "pytorch_model-00003-of-00003.bin",
+        "sentencepiece.bpe.model",
+        "tokenizer.json",
+        "tokenizer_config.json",
+        "shared_vocabulary.txt",
+        "shared_vocabulary.json",
+        "special_tokens_map.json",
+        "spiece.model",
+        "vocab.json", # m2m100
+        "model.safetensors",
+        "quantize_config.json",
+        "tokenizer.model"
+    ]
+
+    kwargs = {
+        "local_files_only": localFilesOnly,
+        "allow_patterns": allowPatterns,
+        #"tqdm_class": disabled_tqdm,
+    }
+
+    if outputDir is not None:
+        kwargs["local_dir"] = outputDir
+        kwargs["local_dir_use_symlinks"] = False
+
+    if cacheDir is not None:
+        kwargs["cache_dir"] = cacheDir
+
+    try:
+        return huggingface_hub.snapshot_download(repoId, **kwargs)
+    except (
+        huggingface_hub.utils.HfHubHTTPError,
+        requests.exceptions.ConnectionError,
+    ) as exception:
+        warnings.warn(
+            "An error occurred while synchronizing the model %s from the Hugging Face Hub:\n%s"
+            % (repoId, exception)
+        )
+        warnings.warn(
+            "Trying to load the model directly from the local cache, if it exists."
+        )
+
+        kwargs["local_files_only"] = True
+        return huggingface_hub.snapshot_download(repoId, **kwargs)
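
How the new ALMA branch is driven end to end, as a hedged sketch: it assumes the repo layout (src.translation.translationModel is an assumed module path), a CUDA GPU, and the auto-gptq/optimum packages that loading a GPTQ checkpoint with transformers requires:

# Sketch: driving the ALMA translation path end to end (assumptions noted above).
from src.config import ModelConfig
from src.translation.translationLangs import get_lang_from_whisper_code
from src.translation.translationModel import TranslationModel  # assumed module path

config = ModelConfig(name="ALMA-13B-GPTQ/TheBloke",
                     url="TheBloke/ALMA-13B-GPTQ", type="huggingface")

model = TranslationModel(
    modelConfig=config,
    whisperLang=get_lang_from_whisper_code("en"),
    translationLang=get_lang_from_whisper_code("ja"),
    loadModel=True,  # downloads the snapshot and builds the text-generation pipeline
)

# The prompt becomes "Translate this from en to ja:en:<text>ja:", and the
# regexes in translation() strip the echoed prompt from the generated text.
print(model.translation("Hello, world!"))
model.release_vram()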
utils.py ADDED
@@ -0,0 +1,314 @@
+import textwrap
+import unicodedata
+import re
+
+import zlib
+from typing import Iterator, TextIO, Union
+from tqdm import tqdm
+
+import urllib.request
+
+
+def exact_div(x, y):
+    assert x % y == 0
+    return x // y
+
+
+def str2bool(string):
+    str2val = {"True": True, "False": False}
+    if string in str2val:
+        return str2val[string]
+    else:
+        raise ValueError(f"Expected one of {set(str2val.keys())}, got {string}")
+
+
+def optional_int(string):
+    return None if string == "None" else int(string)
+
+
+def optional_float(string):
+    return None if string == "None" else float(string)
+
+
+def compression_ratio(text) -> float:
+    return len(text) / len(zlib.compress(text.encode("utf-8")))
+
+
+def format_timestamp(seconds: float, always_include_hours: bool = False, fractionalSeperator: str = '.'):
+    assert seconds >= 0, "non-negative timestamp expected"
+    milliseconds = round(seconds * 1000.0)
+
+    hours = milliseconds // 3_600_000
+    milliseconds -= hours * 3_600_000
+
+    minutes = milliseconds // 60_000
+    milliseconds -= minutes * 60_000
+
+    seconds = milliseconds // 1_000
+    milliseconds -= seconds * 1_000
+
+    hours_marker = f"{hours:02d}:" if always_include_hours or hours > 0 else ""
+    return f"{hours_marker}{minutes:02d}:{seconds:02d}{fractionalSeperator}{milliseconds:03d}"
+
+
+def write_txt(transcript: Iterator[dict], file: TextIO):
+    for segment in transcript:
+        print(segment['text'].strip(), file=file, flush=True)
+
+
+def write_vtt(transcript: Iterator[dict], file: TextIO,
+              maxLineWidth=None, highlight_words: bool = False):
+    iterator = __subtitle_preprocessor_iterator(transcript, maxLineWidth, highlight_words)
+
+    print("WEBVTT\n", file=file)
+
+    for segment in iterator:
+        text = segment['text'].replace('-->', '->')
+
+        print(
+            f"{format_timestamp(segment['start'])} --> {format_timestamp(segment['end'])}\n"
+            f"{text}\n",
+            file=file,
+            flush=True,
+        )
+
+def write_srt(transcript: Iterator[dict], file: TextIO,
+              maxLineWidth=None, highlight_words: bool = False):
+    """
+    Write a transcript to a file in SRT format.
+    Example usage:
+        from pathlib import Path
+        from whisper.utils import write_srt
+        result = transcribe(model, audio_path, temperature=temperature, **args)
+        # save SRT
+        audio_basename = Path(audio_path).stem
+        with open(Path(output_dir) / (audio_basename + ".srt"), "w", encoding="utf-8") as srt:
+            write_srt(result["segments"], file=srt)
+    """
+    iterator = __subtitle_preprocessor_iterator(transcript, maxLineWidth, highlight_words)
+
+    for i, segment in enumerate(iterator, start=1):
+        text = segment['text'].replace('-->', '->')
+
+        # write srt lines
+        print(
+            f"{i}\n"
+            f"{format_timestamp(segment['start'], always_include_hours=True, fractionalSeperator=',')} --> "
+            f"{format_timestamp(segment['end'], always_include_hours=True, fractionalSeperator=',')}\n"
+            f"{text}\n",
+            file=file,
+            flush=True,
+        )
+
+def write_srt_original(transcript: Iterator[dict], file: TextIO,
+                       maxLineWidth=None, highlight_words: bool = False, bilingual: bool = False):
+    """
+    Write a transcript to a file in SRT format.
+    Example usage:
+        from pathlib import Path
+        from whisper.utils import write_srt
+        result = transcribe(model, audio_path, temperature=temperature, **args)
+        # save SRT
+        audio_basename = Path(audio_path).stem
+        with open(Path(output_dir) / (audio_basename + ".srt"), "w", encoding="utf-8") as srt:
+            write_srt(result["segments"], file=srt)
+    """
+    iterator = __subtitle_preprocessor_iterator(transcript, maxLineWidth, highlight_words)
+
+    for i, segment in enumerate(iterator, start=1):
+        if "original" not in segment:
+            continue
+
+        original = segment['original'].replace('-->', '->')
+
+        # write srt lines
+        print(
+            f"{i}\n"
+            f"{format_timestamp(segment['start'], always_include_hours=True, fractionalSeperator=',')} --> "
+            f"{format_timestamp(segment['end'], always_include_hours=True, fractionalSeperator=',')}",
+            file=file,
+            flush=True,
+        )
+
+        if original is not None: print(f"{original}\n",
+                                       file=file,
+                                       flush=True)
+
+        if bilingual:
+            text = segment['text'].replace('-->', '->')
+            print(f"{text}\n",
+                  file=file,
+                  flush=True)
+
+def __subtitle_preprocessor_iterator(transcript: Iterator[dict], maxLineWidth: int = None, highlight_words: bool = False):
+    for segment in transcript:
+        words: list = segment.get('words', [])
+
+        # Append longest speaker ID if available
+        segment_longest_speaker = segment.get('longest_speaker', None)
+
+        # Yield the segment as-is or processed
+        if len(words) == 0 and (maxLineWidth is None or maxLineWidth < 0) and segment_longest_speaker is None:
+            yield segment
+
+        if segment_longest_speaker is not None:
+            segment_longest_speaker = segment_longest_speaker.replace("SPEAKER", "S")
+
+        subtitle_start = segment['start']
+        subtitle_end = segment['end']
+        text = segment['text'].strip()
+        original_text = segment['original'].strip() if 'original' in segment else None
+
+        if len(words) == 0:
+            # Prepend the longest speaker ID if available
+            if segment_longest_speaker is not None:
+                text = f"({segment_longest_speaker}) {text}"
+
+            result = {
+                'start': subtitle_start,
+                'end'  : subtitle_end,
+                'text' : process_text(text, maxLineWidth)
+            }
+            if original_text is not None and len(original_text) > 0:
+                result.update({'original': process_text(original_text, maxLineWidth)})
+            yield result
+
+            # We are done
+            continue
+
+        if segment_longest_speaker is not None:
+            # Add the beginning
+            words.insert(0, {
+                'start': subtitle_start,
+                'end'  : subtitle_start,
+                'word' : f"({segment_longest_speaker})"
+            })
+
+        text_words = [text] if not highlight_words and original_text is not None and len(original_text) > 0 else [ this_word["word"] for this_word in words ]
+        subtitle_text = __join_words(text_words, maxLineWidth)
+
+        # Iterate over the words in the segment
+        if highlight_words:
+            last = subtitle_start
+
+            for i, this_word in enumerate(words):
+                start = this_word['start']
+                end = this_word['end']
+
+                if last != start:
+                    # Display the text up to this point
+                    yield {
+                        'start': last,
+                        'end'  : start,
+                        'text' : subtitle_text
+                    }
+
+                # Display the text with the current word highlighted
+                yield {
+                    'start': start,
+                    'end'  : end,
+                    'text' : __join_words(
+                        [
+                            {
+                                "word": re.sub(r"^(\s*)(.*)$", r"\1<u>\2</u>", word)
+                                if j == i
+                                else word,
+                                # The HTML tags <u> and </u> are not displayed,
+                                # so they should not be counted in the word length
+                                "length": len(word)
+                            } for j, word in enumerate(text_words)
+                        ], maxLineWidth)
+                }
+                last = end
+
+            if last != subtitle_end:
+                # Display the last part of the text
+                yield {
+                    'start': last,
+                    'end'  : subtitle_end,
+                    'text' : subtitle_text
+                }
+
+        # Just return the subtitle text
+        else:
+            result = {
+                'start': subtitle_start,
+                'end'  : subtitle_end,
+                'text' : subtitle_text
+            }
+            if original_text is not None and len(original_text) > 0:
+                result.update({'original': original_text})
+            yield result
+
+def __join_words(words: Iterator[Union[str, dict]], maxLineWidth: int = None):
+    if maxLineWidth is None or maxLineWidth < 0:
+        return " ".join(words)
+
+    lines = []
+    current_line = ""
+    current_length = 0
+
+    for entry in words:
+        # Either accept a string or a dict with a 'word' and 'length' field
+        if isinstance(entry, dict):
+            word = entry['word']
+            word_length = entry['length']
+        else:
+            word = entry
+            word_length = len(word)
+
+        if current_length > 0 and current_length + word_length > maxLineWidth:
+            lines.append(current_line)
+            current_line = ""
+            current_length = 0
+
+        current_length += word_length
+        # The word will be prefixed with a space by Whisper, so we don't need to add one here
+        current_line += word
+
+    if len(current_line) > 0:
+        lines.append(current_line)
+
+    return "\n".join(lines)
+
+def process_text(text: str, maxLineWidth=None):
+    if (maxLineWidth is None or maxLineWidth < 0):
+        return text
+
+    lines = textwrap.wrap(text, width=maxLineWidth, tabsize=4)
+    return '\n'.join(lines)
+
+def slugify(value, allow_unicode=False, is_lower=False):
+    """
+    Taken from https://github.com/django/django/blob/master/django/utils/text.py
+    Convert to ASCII if 'allow_unicode' is False. Convert spaces or repeated
+    dashes to single dashes. Remove characters that aren't alphanumerics,
+    underscores, or hyphens. Convert to lowercase. Also strip leading and
+    trailing whitespace, dashes, and underscores.
+    """
+    value = str(value)
+    if allow_unicode:
+        value = unicodedata.normalize('NFKC', value)
+    else:
+        value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode('ascii')
+    if is_lower:
+        value = value.lower()
+    value = re.sub(r'[^\w\s-]', '', value.replace("/","_").replace("⧸","_"))
+    return re.sub(r'[-\s]+', '-', value).strip('-_')
+
+def download_file(url: str, destination: str):
+    with urllib.request.urlopen(url) as source, open(destination, "wb") as output:
+        with tqdm(
+            total=int(source.info().get("Content-Length")),
+            ncols=80,
+            unit="iB",
+            unit_scale=True,
+            unit_divisor=1024,
+        ) as loop:
+            while True:
+                buffer = source.read(8192)
+                if not buffer:
+                    break
+
+                output.write(buffer)
+                loop.update(len(buffer))
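
A quick, self-contained check of format_timestamp and write_srt against an in-memory buffer (the src.utils import path is an assumption based on the repo's other imports):

# Sketch: exercising format_timestamp and write_srt without touching the file system.
import io
from src.utils import format_timestamp, write_srt  # assumed module path

print(format_timestamp(3661.5, always_include_hours=True, fractionalSeperator=','))
# -> 01:01:01,500

segments = [
    {'start': 0.0, 'end': 2.5, 'text': ' Hello there.'},
    {'start': 2.5, 'end': 5.0, 'text': ' This is a test.'},
]

buffer = io.StringIO()
write_srt(segments, file=buffer, maxLineWidth=40)
print(buffer.getvalue())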