avans06 committed
Commit d0c7a01
1 Parent(s): 4e2f72e

Merge branch 'main' of https://huggingface.co/spaces/SoybeanMilk/whisper-webui-translate


1. Thanks to SoybeanMilk for assisting in the development and integration of the excellent madlad400 translation model.

2. Added a translationTorchDtypeFloat16 checkbox that controls whether non-quantized translation models are loaded with torch dtype float16 (a minimal sketch follows this list).
- transformers: torch_dtype=torch.float16
Load the float32 translation model in float16 when the system supports a GPU (reducing VRAM usage; not applicable to quantized models such as CTranslate2, GPTQ, GGUF).

3. Added non-quantized ALMA and madlad400 model options to the translation tab menu. They are displayed only in environments with GPU support.
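A minimal sketch (not the app's exact code) of what the translationTorchDtypeFloat16 option does; the model name below is only an illustration of a float32 translation checkpoint:

```python
import torch
import transformers

# The option only matters with a GPU; the app forces it off on CPU-only systems.
use_float16 = torch.cuda.is_available()

# Loading a float32 checkpoint with torch_dtype=torch.float16 roughly halves its VRAM footprint.
# Quantized backends (CTranslate2, GPTQ, GGUF) already ship reduced-precision weights,
# so the flag does not apply to them.
model = transformers.AutoModelForSeq2SeqLM.from_pretrained(
    "facebook/nllb-200-distilled-600M",                    # illustrative float32 seq2seq checkpoint
    torch_dtype=torch.float16 if use_float16 else "auto",  # "auto" keeps the checkpoint's own dtype
    low_cpu_mem_usage=True,
)
```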

app.py CHANGED
@@ -236,9 +236,10 @@ class WhisperTranscriber:
236
  madlad400ModelName: str = decodeOptions.pop("madlad400ModelName")
237
  madlad400LangName: str = decodeOptions.pop("madlad400LangName")
238
 
239
- translationBatchSize: int = decodeOptions.pop("translationBatchSize")
240
- translationNoRepeatNgramSize: int = decodeOptions.pop("translationNoRepeatNgramSize")
241
- translationNumBeams: int = decodeOptions.pop("translationNumBeams")
 
242
 
243
  sourceInput: str = decodeOptions.pop("sourceInput")
244
  urlData: str = decodeOptions.pop("urlData")
@@ -367,16 +368,16 @@ class WhisperTranscriber:
367
  selectedModel = next((modelConfig for modelConfig in self.app_config.models["mt5"] if modelConfig.name == selectedModelName), None)
368
  translationLang = get_lang_from_m2m100_name(mt5LangName)
369
  elif translateInput == "ALMA" and ALMALangName is not None and len(ALMALangName) > 0:
370
- selectedModelName = ALMAModelName if ALMAModelName is not None and len(ALMAModelName) > 0 else "ALMA-13B-GPTQ/TheBloke"
371
  selectedModel = next((modelConfig for modelConfig in self.app_config.models["ALMA"] if modelConfig.name == selectedModelName), None)
372
  translationLang = get_lang_from_m2m100_name(ALMALangName)
373
  elif translateInput == "madlad400" and madlad400LangName is not None and len(madlad400LangName) > 0:
374
- selectedModelName = madlad400ModelName if madlad400ModelName is not None and len(madlad400ModelName) > 0 else "madlad400-10b-mt-ct2-int8_float16"
375
  selectedModel = next((modelConfig for modelConfig in self.app_config.models["madlad400"] if modelConfig.name == selectedModelName), None)
376
  translationLang = get_lang_from_m2m100_name(madlad400LangName)
377
 
378
  if translationLang is not None:
379
- translationModel = TranslationModel(modelConfig=selectedModel, whisperLang=whisperLang, translationLang=translationLang, batchSize=translationBatchSize, noRepeatNgramSize=translationNoRepeatNgramSize, numBeams=translationNumBeams)
380
 
381
  progress(0, desc="init transcribe")
382
  # Result
@@ -936,8 +937,9 @@ def create_ui(app_config: ApplicationConfig):
936
  mt5_models = app_config.get_model_names("mt5")
937
  ALMA_models = app_config.get_model_names("ALMA")
938
  madlad400_models = app_config.get_model_names("madlad400")
939
- if not torch.cuda.is_available(): #Due to the poor support of GPTQ for CPUs, the execution time per iteration exceeds a thousand seconds when operating on a CPU. Therefore, when the system does not support a GPU, the GPTQ model is removed from the list.
940
- ALMA_models = list(filter(lambda alma: "GPTQ" not in alma, ALMA_models))
 
941
 
942
  common_whisper_inputs = lambda : {
943
  gr.Dropdown(label="Whisper - Model (for audio)", choices=whisper_models, value=app_config.default_model_name, elem_id="whisperModelName"),
@@ -967,7 +969,8 @@ def create_ui(app_config: ApplicationConfig):
967
  common_translation_inputs = lambda : {
968
  gr.Number(label="Translation - Batch Size", precision=0, value=app_config.translation_batch_size, elem_id="translationBatchSize"),
969
  gr.Number(label="Translation - No Repeat Ngram Size", precision=0, value=app_config.translation_no_repeat_ngram_size, elem_id="translationNoRepeatNgramSize"),
970
- gr.Number(label="Translation - Num Beams", precision=0, value=app_config.translation_num_beams, elem_id="translationNumBeams")
 
971
  }
972
 
973
  common_vad_inputs = lambda : {
 
236
  madlad400ModelName: str = decodeOptions.pop("madlad400ModelName")
237
  madlad400LangName: str = decodeOptions.pop("madlad400LangName")
238
 
239
+ translationBatchSize: int = decodeOptions.pop("translationBatchSize")
240
+ translationNoRepeatNgramSize: int = decodeOptions.pop("translationNoRepeatNgramSize")
241
+ translationNumBeams: int = decodeOptions.pop("translationNumBeams")
242
+ translationTorchDtypeFloat16: bool = decodeOptions.pop("translationTorchDtypeFloat16")
243
 
244
  sourceInput: str = decodeOptions.pop("sourceInput")
245
  urlData: str = decodeOptions.pop("urlData")
 
368
  selectedModel = next((modelConfig for modelConfig in self.app_config.models["mt5"] if modelConfig.name == selectedModelName), None)
369
  translationLang = get_lang_from_m2m100_name(mt5LangName)
370
  elif translateInput == "ALMA" and ALMALangName is not None and len(ALMALangName) > 0:
371
+ selectedModelName = ALMAModelName if ALMAModelName is not None and len(ALMAModelName) > 0 else "ALMA-7B-ct2:int8_float16/avan"
372
  selectedModel = next((modelConfig for modelConfig in self.app_config.models["ALMA"] if modelConfig.name == selectedModelName), None)
373
  translationLang = get_lang_from_m2m100_name(ALMALangName)
374
  elif translateInput == "madlad400" and madlad400LangName is not None and len(madlad400LangName) > 0:
375
+ selectedModelName = madlad400ModelName if madlad400ModelName is not None and len(madlad400ModelName) > 0 else "madlad400-3b-mt-ct2-int8_float16/SoybeanMilk"
376
  selectedModel = next((modelConfig for modelConfig in self.app_config.models["madlad400"] if modelConfig.name == selectedModelName), None)
377
  translationLang = get_lang_from_m2m100_name(madlad400LangName)
378
 
379
  if translationLang is not None:
380
+ translationModel = TranslationModel(modelConfig=selectedModel, whisperLang=whisperLang, translationLang=translationLang, batchSize=translationBatchSize, noRepeatNgramSize=translationNoRepeatNgramSize, numBeams=translationNumBeams, torchDtypeFloat16=translationTorchDtypeFloat16)
381
 
382
  progress(0, desc="init transcribe")
383
  # Result
 
937
  mt5_models = app_config.get_model_names("mt5")
938
  ALMA_models = app_config.get_model_names("ALMA")
939
  madlad400_models = app_config.get_model_names("madlad400")
940
+ if not torch.cuda.is_available(): #Load only GGUF and CT2 translation models in pure CPU environments.
941
+ ALMA_models = list(filter(lambda alma: "GGUF" in alma or "ct2" in alma, ALMA_models))
942
+ madlad400_models = list(filter(lambda madlad400: "ct2" in madlad400, madlad400_models))
943
 
944
  common_whisper_inputs = lambda : {
945
  gr.Dropdown(label="Whisper - Model (for audio)", choices=whisper_models, value=app_config.default_model_name, elem_id="whisperModelName"),
 
969
  common_translation_inputs = lambda : {
970
  gr.Number(label="Translation - Batch Size", precision=0, value=app_config.translation_batch_size, elem_id="translationBatchSize"),
971
  gr.Number(label="Translation - No Repeat Ngram Size", precision=0, value=app_config.translation_no_repeat_ngram_size, elem_id="translationNoRepeatNgramSize"),
972
+ gr.Number(label="Translation - Num Beams", precision=0, value=app_config.translation_num_beams, elem_id="translationNumBeams"),
973
+ gr.Checkbox(label="Translation - Torch Dtype float16", info="Load the float32 translation model with float16 when the system supports GPU (reducing VRAM usage, not applicable to quantized models, such as Ctranslate2, GPTQ, GGUF)", value=app_config.translation_torch_dtype_float16, elem_id="translationTorchDtypeFloat16")
974
  }
975
 
976
  common_vad_inputs = lambda : {
config.json5 CHANGED
@@ -229,6 +229,16 @@
229
  "type": "huggingface",
230
  "tokenizer_url": "haoranxu/ALMA-13B"
231
  },
232
  ],
233
  "madlad400": [
234
  {
@@ -243,6 +253,21 @@
243
  "type": "huggingface",
244
  "tokenizer_url": "jbochi/madlad400-10b-mt"
245
  },
246
  ]
247
  },
248
  // Configuration options that will be used if they are not specified in the command line arguments.
@@ -373,4 +398,6 @@
373
  "translation_no_repeat_ngram_size": 3,
374
  // Translation - Beam size (1 for greedy search).
375
  "translation_num_beams": 2,
 
 
376
  }
 
229
  "type": "huggingface",
230
  "tokenizer_url": "haoranxu/ALMA-13B"
231
  },
232
+ {
233
+ "name": "ALMA-7B/haoranxu",
234
+ "url": "haoranxu/ALMA-7B",
235
+ "type": "huggingface"
236
+ },
237
+ {
238
+ "name": "ALMA-13B/haoranxu",
239
+ "url": "haoranxu/ALMA-13B",
240
+ "type": "huggingface"
241
+ },
242
  ],
243
  "madlad400": [
244
  {
 
253
  "type": "huggingface",
254
  "tokenizer_url": "jbochi/madlad400-10b-mt"
255
  },
256
+ {
257
+ "name": "madlad400-3b-mt/jbochi",
258
+ "url": "jbochi/madlad400-3b-mt",
259
+ "type": "huggingface",
260
+ },
261
+ {
262
+ "name": "madlad400-7b-mt-bt/jbochi",
263
+ "url": "jbochi/madlad400-7b-mt-bt",
264
+ "type": "huggingface",
265
+ },
266
+ {
267
+ "name": "madlad400-10b-mt/jbochi",
268
+ "url": "jbochi/madlad400-10b-mt",
269
+ "type": "huggingface",
270
+ },
271
  ]
272
  },
273
  // Configuration options that will be used if they are not specified in the command line arguments.
 
398
  "translation_no_repeat_ngram_size": 3,
399
  // Translation - Beam size (1 for greedy search).
400
  "translation_num_beams": 2,
401
+ // Translation - Torch Dtype float16. Load the float32 translation model in float16 when the system supports a GPU (reducing VRAM usage; not applicable to quantized models such as CTranslate2, GPTQ, GGUF).
402
+ "translation_torch_dtype_float16": true,
403
  }
docs/options.md CHANGED
@@ -200,4 +200,8 @@ Prevent repetitions of ngrams with this size (set 0 to disable).
200
  - transformers: num_beams
201
  Number of beams for beam search that will be used by default in the generate method of the model. 1 means no beam search.
202
  - ctranslate2: beam_size
203
- Beam size (1 for greedy search).
200
  - transformers: num_beams
201
  Number of beams for beam search that will be used by default in the generate method of the model. 1 means no beam search.
202
  - ctranslate2: beam_size
203
+ Beam size (1 for greedy search).
204
+
205
+ ## Translation - Torch Dtype float16
206
+ - transformers: torch_dtype=torch.float16
207
+ Load the float32 translation model in float16 when the system supports a GPU (reducing VRAM usage; not applicable to quantized models such as CTranslate2, GPTQ, GGUF).
docs/translateModel.md CHANGED
@@ -42,6 +42,7 @@ NLLB-200 is a multilingual translation model introduced by Meta AI in July 2022.
42
  | [facebook/nllb-200-distilled-1.3B](https://huggingface.co/facebook/nllb-200-distilled-1.3B) | 1.3B | 5.48 GB | float32 | ≈5.9 GB |
43
  | [facebook/nllb-200-1.3B](https://huggingface.co/facebook/nllb-200-1.3B) | 1.3B | 5.48 GB | float32 | ≈5.8 GB |
44
  | [facebook/nllb-200-3.3B](https://huggingface.co/facebook/nllb-200-3.3B) | 3.3B | 17.58 GB | float32 | ≈13.4 GB |
 
45
 
46
  ## NLLB-200-CTranslate2
47
 
@@ -78,8 +79,8 @@ The official support for ALMA currently includes 10 language directions: English
78
 
79
  | Name | Parameters | Size | type/quantize | Required VRAM |
80
  |------|------------|------|---------------|---------------|
81
- | [haoranxu/ALMA-7B](https://huggingface.co/haoranxu/ALMA-7B) | 7B | 26.95 GB | float32 | N/A |
82
- | [haoranxu/ALMA-13B](https://huggingface.co/haoranxu/ALMA-13B) | 13B | 52.07 GB | float32 | N/A |
83
 
84
  ## ALMA-GPTQ
85
 
@@ -111,6 +112,46 @@ GGUF is a file format for storing models for inference with GGML and executors b
111
  | [avans06/ALMA-7B-ct2-int8_float16](https://huggingface.co/avans06/ALMA-7B-ct2-int8_float16) | 7B | 6.74 GB | int8_float16 | ≈6.6 GB |
112
  | [avans06/ALMA-13B-ct2-int8_float16](https://huggingface.co/avans06/ALMA-13B-ct2-int8_float16) | 13B | 13 GB | int8_float16 | ≈12.6 GB |
113
114
 
115
  # Options
116
 
@@ -131,3 +172,7 @@ Prevent repetitions of ngrams with this size (set 0 to disable).
131
  Number of beams for beam search that will be used by default in the generate method of the model. 1 means no beam search.
132
  - ctranslate2: beam_size
133
  Beam size (1 for greedy search).
42
  | [facebook/nllb-200-distilled-1.3B](https://huggingface.co/facebook/nllb-200-distilled-1.3B) | 1.3B | 5.48 GB | float32 | ≈5.9 GB |
43
  | [facebook/nllb-200-1.3B](https://huggingface.co/facebook/nllb-200-1.3B) | 1.3B | 5.48 GB | float32 | ≈5.8 GB |
44
  | [facebook/nllb-200-3.3B](https://huggingface.co/facebook/nllb-200-3.3B) | 3.3B | 17.58 GB | float32 | ≈13.4 GB |
45
+ | [facebook/nllb-moe-54b](https://huggingface.co/facebook/nllb-moe-54b) | 54B | 220.2 GB | float32 | N/A |
46
 
47
  ## NLLB-200-CTranslate2
48
 
 
79
 
80
  | Name | Parameters | Size | type/quantize | Required VRAM |
81
  |------|------------|------|---------------|---------------|
82
+ | [haoranxu/ALMA-7B](https://huggingface.co/haoranxu/ALMA-7B) | 7B | 26.95 GB | float32 | ≈13.2 GB (torch dtype in float16) |
83
+ | [haoranxu/ALMA-13B](https://huggingface.co/haoranxu/ALMA-13B) | 13B | 52.07 GB | float32 | ≈25.4 GB (torch dtype in float16) |
84
 
85
  ## ALMA-GPTQ
86
 
 
112
  | [avans06/ALMA-7B-ct2-int8_float16](https://huggingface.co/avans06/ALMA-7B-ct2-int8_float16) | 7B | 6.74 GB | int8_float16 | ≈6.6 GB |
113
  | [avans06/ALMA-13B-ct2-int8_float16](https://huggingface.co/avans06/ALMA-13B-ct2-int8_float16) | 13B | 13 GB | int8_float16 | ≈12.6 GB |
114
 
115
+ ## madlad400
116
+
117
+ madlad400 is a multilingual machine translation model based on the T5 architecture, introduced by Google DeepMind and Google Research in September 2023. It was trained on 250 billion tokens covering over 450 languages using publicly available data. The paper is titled "`MADLAD-400: A Multilingual And Document-Level Large Audited Dataset`" ([arXiv:2309.04662](https://arxiv.org/abs/2309.04662)). A minimal usage sketch follows the table below.
118
+
119
+ | Name | Parameters | Size | type/quantize | Required VRAM |
120
+ |------|------------|------|---------------|---------------|
121
+ | [jbochi/madlad400-3b-mt](https://huggingface.co/jbochi/madlad400-3b-mt) | 3B | 11.8 GB | float32 | ≈12 GB |
122
+ | [jbochi/madlad400-7b-mt](https://huggingface.co/jbochi/madlad400-7b-mt) | 7.2B | 33.2 GB | float32 | ≈19.7 GB (torch dtype in float16) |
123
+ | [jbochi/madlad400-7b-mt-bt](https://huggingface.co/jbochi/madlad400-7b-mt-bt) | 7.2B | 33.2 GB | float32 (finetuned on backtranslated data) | ≈19.7 GB (torch dtype in float16) |
124
+ | [jbochi/madlad400-8b-lm](https://huggingface.co/jbochi/madlad400-8b-lm) | 8B | 34.52 GB | float32 | N/A |
125
+ | [jbochi/madlad400-10b-mt](https://huggingface.co/jbochi/madlad400-10b-mt) | 10.7B | 42.86 GB | float32 | ≈24.3 GB (torch dtype in float16) |
126
+
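A minimal, hypothetical usage sketch for these transformers-based madlad400 checkpoints, following the same pattern this commit uses in translationModel.py (T5 tokenizer and model, a text2text-generation pipeline, and a `<2xx>` target-language prefix); the model name and sentence are illustrative:

```python
import torch
import transformers

model_name = "jbochi/madlad400-3b-mt"
tokenizer = transformers.T5Tokenizer.from_pretrained(model_name, legacy=False)
model = transformers.T5ForConditionalGeneration.from_pretrained(
    model_name,
    torch_dtype=torch.float16 if torch.cuda.is_available() else "auto",  # the new float16 option
    low_cpu_mem_usage=True,
)
translator = transformers.pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
)

# "<2ja>" requests translation into Japanese; swap the code for other target languages.
print(translator("<2ja> How are you today?")[0]["generated_text"])
```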
127
+ ## madlad400-CTranslate2
128
+
129
+ | Name | Parameters | Size | type/quantize | Required VRAM |
130
+ |------|------------|------|---------------|---------------|
131
+ | [SoybeanMilk/madlad400-3b-mt-ct2-int8_float16](https://huggingface.co/SoybeanMilk/madlad400-3b-mt-ct2-int8_float16) | 3B | 2.95 GB | int8_float16 | ≈2.7 GB |
132
+ | [SoybeanMilk/madlad400-10b-mt-ct2-int8_float16](https://huggingface.co/SoybeanMilk/madlad400-10b-mt-ct2-int8_float16) | 10.7B | 10.7 GB | int8_float16 | ≈10 GB |
133
+
134
+ ## SeamlessM4T
135
+
136
+ SeamlessM4T is a collection of models designed to provide high-quality translation, allowing people from different linguistic communities to communicate effortlessly through speech and text.
137
+
138
+ It enables multiple tasks without relying on separate models:
139
+
140
+ Speech-to-speech translation (S2ST)
141
+ Speech-to-text translation (S2TT)
142
+ Text-to-speech translation (T2ST)
143
+ Text-to-text translation (T2TT)
144
+ Automatic speech recognition (ASR)
145
+
146
+ SeamlessM4T-v1 was introduced by the Seamless Communication team at Meta AI in August 2023. The paper is titled "`SeamlessM4T: Massively Multilingual & Multimodal Machine Translation`" ([arXiv:2308.11596](https://arxiv.org/abs/2308.11596)).
147
+ SeamlessM4T-v2 was introduced by the Seamless Communication team at Meta AI in December 2023. The paper is titled "`Seamless: Multilingual Expressive and Streaming Speech Translation`" ([arXiv:2312.05187](https://arxiv.org/abs/2312.05187)).
148
+
149
+ | Name | Parameters | Size | type/quantize | Required VRAM |
150
+ |------|------------|------|---------------|---------------|
151
+ | [facebook/hf-seamless-m4t-medium](https://huggingface.co/facebook/hf-seamless-m4t-medium) | 1.2B | 4.84 GB | float32 | N/A |
152
+ | [facebook/seamless-m4t-large](https://huggingface.co/facebook/seamless-m4t-large) | 2.3B | 11.4 GB | float32 | N/A |
153
+ | [facebook/seamless-m4t-v2-large](https://huggingface.co/facebook/seamless-m4t-v2-large) | 2.3B | 11.4 GB (safetensors:9.24 GB) | float32 | N/A |
154
+
155
 
156
  # Options
157
 
 
172
  Number of beams for beam search that will be used by default in the generate method of the model. 1 means no beam search.
173
  - ctranslate2: beam_size
174
  Beam size (1 for greedy search).
175
+
176
+ ## Translation - Torch Dtype float16
177
+ - transformers: torch_dtype=torch.float16
178
+ Load the float32 translation model in float16 when the system supports a GPU (reducing VRAM usage; not applicable to quantized models such as CTranslate2, GPTQ, GGUF).
src/config.py CHANGED
@@ -82,6 +82,7 @@ class ApplicationConfig:
82
  translation_batch_size: int = 2,
83
  translation_no_repeat_ngram_size: int = 3,
84
  translation_num_beams: int = 2,
 
85
  # Whisper Segments Filter
86
  whisper_segments_filter: bool = False,
87
  whisper_segments_filters: List[str] = [],
@@ -150,6 +151,7 @@ class ApplicationConfig:
150
  self.translation_batch_size = translation_batch_size
151
  self.translation_no_repeat_ngram_size = translation_no_repeat_ngram_size
152
  self.translation_num_beams = translation_num_beams
 
153
  # Whisper Segments Filter
154
  self.whisper_segments_filter = whisper_segments_filter
155
  self.whisper_segments_filters = whisper_segments_filters
 
82
  translation_batch_size: int = 2,
83
  translation_no_repeat_ngram_size: int = 3,
84
  translation_num_beams: int = 2,
85
+ translation_torch_dtype_float16: bool = True,
86
  # Whisper Segments Filter
87
  whisper_segments_filter: bool = False,
88
  whisper_segments_filters: List[str] = [],
 
151
  self.translation_batch_size = translation_batch_size
152
  self.translation_no_repeat_ngram_size = translation_no_repeat_ngram_size
153
  self.translation_num_beams = translation_num_beams
154
+ self.translation_torch_dtype_float16 = translation_torch_dtype_float16
155
  # Whisper Segments Filter
156
  self.whisper_segments_filter = whisper_segments_filter
157
  self.whisper_segments_filters = whisper_segments_filters
src/translation/translationModel.py CHANGED
@@ -21,11 +21,12 @@ class TranslationModel:
21
  batchSize: int = 2,
22
  noRepeatNgramSize: int = 3,
23
  numBeams: int = 2,
 
24
  downloadRoot: Optional[str] = None,
25
  localFilesOnly: bool = False,
26
  loadModel: bool = False,
27
  ):
28
- """Initializes the M2M100 / Nllb-200 / mt5 model.
29
 
30
  Args:
31
  modelConfig: Config of the model to use (distilled-600M, distilled-1.3B,
@@ -76,8 +77,10 @@ class TranslationModel:
76
  device = "cuda" if "ct2" in self.modelPath else "cuda:0"
77
  else:
78
  device = "cpu"
 
79
 
80
  self.device = device
 
81
 
82
  if loadModel:
83
  self.load_model()
@@ -85,8 +88,31 @@ class TranslationModel:
85
  def load_model(self):
86
  """
87
  [from_pretrained]
88
- low_cpu_mem_usage(bool, optional)
89
- Tries to not use more than 1x model size in CPU memory (including peak memory) while loading the model. This is an experimental feature and a subject to change at any moment.
90
 
91
  [transformers.AutoTokenizer.from_pretrained]
92
  use_fast (bool, optional, defaults to True):
@@ -166,7 +192,7 @@ class TranslationModel:
166
  elif "mt5" in self.modelPath:
167
  self.mt5Prefix = self.whisperLang.whisper.code + "2" + self.translationLang.whisper.code + ": "
168
  self.transTokenizer = transformers.T5Tokenizer.from_pretrained(self.modelPath, legacy=False) #requires spiece.model
169
- self.transModel = transformers.MT5ForConditionalGeneration.from_pretrained(self.modelPath, low_cpu_mem_usage=True)
170
  self.transTranslator = transformers.pipeline('text2text-generation', model=self.transModel, device=self.device, tokenizer=self.transTokenizer)
171
  elif "ALMA" in self.modelPath:
172
  self.ALMAPrefix = "Translate this from " + self.whisperLang.whisper.names[0] + " to " + self.translationLang.whisper.names[0] + ":\n" + self.whisperLang.whisper.names[0] + ": "
@@ -185,18 +211,25 @@ class TranslationModel:
185
  import ctransformers
186
  self.transTokenizer = transformers.AutoTokenizer.from_pretrained(self.modelConfig.tokenizer_url)
187
  if self.device == "cpu":
188
- self.transModel = ctransformers.AutoModelForCausalLM.from_pretrained(self.modelPath, hf=True, model_file=self.modelConfig.model_file)
189
  else:
190
- self.transModel = ctransformers.AutoModelForCausalLM.from_pretrained(self.modelPath, hf=True, model_file=self.modelConfig.model_file, gpu_layers=50)
191
- self.transTranslator = transformers.pipeline("text-generation", model=self.transModel, tokenizer=self.transTokenizer, do_sample=True, temperature=0.7, top_k=40, top_p=0.95, repetition_penalty=1.1)
192
  else:
193
  self.transTokenizer = transformers.AutoTokenizer.from_pretrained(self.modelPath)
194
- self.transModel = transformers.AutoModelForSeq2SeqLM.from_pretrained(self.modelPath)
195
  if "m2m100" in self.modelPath:
196
  self.transTranslator = transformers.pipeline('translation', model=self.transModel, device=self.device, tokenizer=self.transTokenizer, src_lang=self.whisperLang.m2m100.code, tgt_lang=self.translationLang.m2m100.code)
197
  else: #NLLB
198
  self.transTranslator = transformers.pipeline('translation', model=self.transModel, device=self.device, tokenizer=self.transTokenizer, src_lang=self.whisperLang.nllb.code, tgt_lang=self.translationLang.nllb.code)
199
-
200
  except Exception as e:
201
  self.release_vram()
202
  raise e
@@ -223,13 +256,13 @@ class TranslationModel:
223
  del self.transTokenizer
224
  if getattr(self, "transModel", None) is not None:
225
  del self.transModel
 
 
226
  try:
227
  torch.cuda.empty_cache()
228
  except Exception as e:
229
  print(traceback.format_exc())
230
  print("\tcuda empty cache, error: " + str(e))
231
- import gc
232
- gc.collect()
233
  print("release vram end.")
234
  except Exception as e:
235
  print(traceback.format_exc())
@@ -294,6 +327,12 @@ class TranslationModel:
294
  output = self.transTranslator(self.ALMAPrefix + text + "\n" + self.translationLang.whisper.names[0] + ": ", max_length=max_length, batch_size=self.batchSize, no_repeat_ngram_size=self.noRepeatNgramSize, num_beams=self.numBeams, return_full_text=False)
295
  elif "GGUF" in self.modelPath:
296
  output = self.transTranslator(self.ALMAPrefix + text + "\n" + self.translationLang.whisper.names[0] + ": ", max_length=max_length, batch_size=self.batchSize, no_repeat_ngram_size=self.noRepeatNgramSize, num_beams=self.numBeams, return_full_text=False)
297
  result = output[0]['generated_text']
298
  else: #M2M100 & NLLB
299
  output = self.transTranslator(text, max_length=max_length, batch_size=self.batchSize, no_repeat_ngram_size=self.noRepeatNgramSize, num_beams=self.numBeams)
@@ -356,9 +395,6 @@ def download_model(
356
  "pytorch_model.bin",
357
  "pytorch_model.bin.index.json",
358
  "pytorch_model-*.bin",
359
- "pytorch_model-00001-of-00003.bin",
360
- "pytorch_model-00002-of-00003.bin",
361
- "pytorch_model-00003-of-00003.bin",
362
  "sentencepiece.bpe.model",
363
  "tokenizer.json",
364
  "tokenizer_config.json",
@@ -368,6 +404,8 @@ def download_model(
368
  "spiece.model",
369
  "vocab.json", #m2m100
370
  "model.safetensors",
 
 
371
  "quantize_config.json",
372
  "tokenizer.model",
373
  "vocabulary.json"
 
21
  batchSize: int = 2,
22
  noRepeatNgramSize: int = 3,
23
  numBeams: int = 2,
24
+ torchDtypeFloat16: bool = True,
25
  downloadRoot: Optional[str] = None,
26
  localFilesOnly: bool = False,
27
  loadModel: bool = False,
28
  ):
29
+ """Initializes the M2M100 / Nllb-200 / mt5 / ALMA / madlad400 translation model.
30
 
31
  Args:
32
  modelConfig: Config of the model to use (distilled-600M, distilled-1.3B,
 
77
  device = "cuda" if "ct2" in self.modelPath else "cuda:0"
78
  else:
79
  device = "cpu"
80
+ torchDtypeFloat16 = False
81
 
82
  self.device = device
83
+ self.torchDtypeFloat16 = torchDtypeFloat16
84
 
85
  if loadModel:
86
  self.load_model()
 
88
  def load_model(self):
89
  """
90
  [from_pretrained]
91
+ low_cpu_mem_usage(bool, optional):
92
+ Tries to not use more than 1x model size in CPU memory (including peak memory) while loading the model. This is an experimental feature and a subject to change at any moment.
93
+
94
+ torch_dtype (str or torch.dtype, optional):
95
+ Override the default torch.dtype and load the model under a specific dtype. The different options are:
96
+ 1. torch.float16 or torch.bfloat16 or torch.float: load in a specified dtype, ignoring the model’s config.torch_dtype if one exists.
97
+ If not specified the model will get loaded in torch.float (fp32).
98
+ 2. "auto" - A torch_dtype entry in the config.json file of the model will be attempted to be used.
99
+ If this entry isn’t found then next check the dtype of the first weight in the checkpoint that’s of a floating point type and use that as dtype.
100
+ This will load the model using the dtype it was saved in at the end of the training. It can’t be used as an indicator of how the model was trained.
101
+ Since it could be trained in one of half precision dtypes, but saved in fp32.
102
+ For some models the dtype they were trained in is unknown - you may try to check the model’s paper or reach out to the authors and
103
+ ask them to add this information to the model’s card and to insert the torch_dtype entry in config.json on the hub.
104
+
105
+ device_map (str or Dict[str, Union[int, str, torch.device]] or int or torch.device, optional):
106
+ A map that specifies where each submodule should go. It doesn’t need to be refined to each parameter/buffer name,
107
+ once a given module name is inside, every submodule of it will be sent to the same device.
108
+ If we only pass the device (e.g., "cpu", "cuda:1", "mps", or a GPU ordinal rank like 1) on which the model will be allocated,
109
+ the device map will map the entire model to this device. Passing device_map = 0 means put the whole model on GPU 0.
110
+ To have Accelerate compute the most optimized device_map automatically, set device_map="auto". For more information about each option see designing a device map.
111
+
112
+ load_in_8bit (bool, optional, defaults to False)
113
+ If True, will convert the loaded model into mixed-8bit quantized model. To use this feature please install bitsandbytes (pip install -U bitsandbytes).
114
+ load_in_4bit (bool, optional, defaults to False)
115
+ If True, will convert the loaded model into 4bit precision quantized model. To use this feature install the latest version of bitsandbytes (pip install -U bitsandbytes).
116
 
117
  [transformers.AutoTokenizer.from_pretrained]
118
  use_fast (bool, optional, defaults to True):
 
192
  elif "mt5" in self.modelPath:
193
  self.mt5Prefix = self.whisperLang.whisper.code + "2" + self.translationLang.whisper.code + ": "
194
  self.transTokenizer = transformers.T5Tokenizer.from_pretrained(self.modelPath, legacy=False) #requires spiece.model
195
+ self.transModel = transformers.MT5ForConditionalGeneration.from_pretrained(self.modelPath, low_cpu_mem_usage=True, torch_dtype=torch.float16 if self.torchDtypeFloat16 else "auto")
196
  self.transTranslator = transformers.pipeline('text2text-generation', model=self.transModel, device=self.device, tokenizer=self.transTokenizer)
197
  elif "ALMA" in self.modelPath:
198
  self.ALMAPrefix = "Translate this from " + self.whisperLang.whisper.names[0] + " to " + self.translationLang.whisper.names[0] + ":\n" + self.whisperLang.whisper.names[0] + ": "
 
211
  import ctransformers
212
  self.transTokenizer = transformers.AutoTokenizer.from_pretrained(self.modelConfig.tokenizer_url)
213
  if self.device == "cpu":
214
+ self.transModel = ctransformers.AutoModelForCausalLM.from_pretrained(self.modelPath, hf=True, model_file=self.modelConfig.model_file, low_cpu_mem_usage=True)
215
  else:
216
+ self.transModel = ctransformers.AutoModelForCausalLM.from_pretrained(self.modelPath, hf=True, model_file=self.modelConfig.model_file, gpu_layers=50, low_cpu_mem_usage=True)
217
+ else:
218
+ self.transTokenizer = transformers.AutoTokenizer.from_pretrained(self.modelPath, use_fast=True)
219
+ self.transModel = transformers.AutoModelForCausalLM.from_pretrained(self.modelPath, torch_dtype=torch.float16 if self.torchDtypeFloat16 else "auto", low_cpu_mem_usage=True) #, device_map="auto"
220
+ self.transTranslator = transformers.pipeline("text-generation", model=self.transModel, device=self.device if "GPTQ" not in self.modelPath and "GGUF" not in self.modelPath else None, tokenizer=self.transTokenizer, do_sample=True, temperature=0.7, top_k=40, top_p=0.95, repetition_penalty=1.1)
221
+ elif "madlad400" in self.modelPath:
222
+ self.madlad400Prefix = "<2" + self.translationLang.whisper.code + "> "
223
+ self.transTokenizer = transformers.T5Tokenizer.from_pretrained(self.modelPath, legacy=False)
224
+ self.transModel = transformers.T5ForConditionalGeneration.from_pretrained(self.modelPath, torch_dtype=torch.float16 if self.torchDtypeFloat16 else "auto", low_cpu_mem_usage=True) #, device_map="auto"
225
+ self.transTranslator = transformers.pipeline('text2text-generation', model=self.transModel, device=self.device, tokenizer=self.transTokenizer)
226
  else:
227
  self.transTokenizer = transformers.AutoTokenizer.from_pretrained(self.modelPath)
228
+ self.transModel = transformers.AutoModelForSeq2SeqLM.from_pretrained(self.modelPath, torch_dtype=torch.float16 if self.torchDtypeFloat16 else "auto")
229
  if "m2m100" in self.modelPath:
230
  self.transTranslator = transformers.pipeline('translation', model=self.transModel, device=self.device, tokenizer=self.transTokenizer, src_lang=self.whisperLang.m2m100.code, tgt_lang=self.translationLang.m2m100.code)
231
  else: #NLLB
232
  self.transTranslator = transformers.pipeline('translation', model=self.transModel, device=self.device, tokenizer=self.transTokenizer, src_lang=self.whisperLang.nllb.code, tgt_lang=self.translationLang.nllb.code)
 
233
  except Exception as e:
234
  self.release_vram()
235
  raise e
 
256
  del self.transTokenizer
257
  if getattr(self, "transModel", None) is not None:
258
  del self.transModel
259
+ import gc
260
+ gc.collect()
261
  try:
262
  torch.cuda.empty_cache()
263
  except Exception as e:
264
  print(traceback.format_exc())
265
  print("\tcuda empty cache, error: " + str(e))
 
 
266
  print("release vram end.")
267
  except Exception as e:
268
  print(traceback.format_exc())
 
327
  output = self.transTranslator(self.ALMAPrefix + text + "\n" + self.translationLang.whisper.names[0] + ": ", max_length=max_length, batch_size=self.batchSize, no_repeat_ngram_size=self.noRepeatNgramSize, num_beams=self.numBeams, return_full_text=False)
328
  elif "GGUF" in self.modelPath:
329
  output = self.transTranslator(self.ALMAPrefix + text + "\n" + self.translationLang.whisper.names[0] + ": ", max_length=max_length, batch_size=self.batchSize, no_repeat_ngram_size=self.noRepeatNgramSize, num_beams=self.numBeams, return_full_text=False)
330
+ else:
331
+ output = self.transTranslator(self.ALMAPrefix + text + "\n" + self.translationLang.whisper.names[0] + ": ", max_length=max_length, batch_size=self.batchSize, no_repeat_ngram_size=self.noRepeatNgramSize, num_beams=self.numBeams, return_full_text=False)
332
+
333
+ result = output[0]['generated_text']
334
+ elif "madlad400" in self.modelPath:
335
+ output = self.transTranslator(self.madlad400Prefix + text, max_length=max_length, batch_size=self.batchSize, no_repeat_ngram_size=self.noRepeatNgramSize, num_beams=self.numBeams) #, num_return_sequences=2
336
  result = output[0]['generated_text']
337
  else: #M2M100 & NLLB
338
  output = self.transTranslator(text, max_length=max_length, batch_size=self.batchSize, no_repeat_ngram_size=self.noRepeatNgramSize, num_beams=self.numBeams)
 
395
  "pytorch_model.bin",
396
  "pytorch_model.bin.index.json",
397
  "pytorch_model-*.bin",
 
 
 
398
  "sentencepiece.bpe.model",
399
  "tokenizer.json",
400
  "tokenizer_config.json",
 
404
  "spiece.model",
405
  "vocab.json", #m2m100
406
  "model.safetensors",
407
+ "model-*.safetensors",
408
+ "model.safetensors.index.json",
409
  "quantize_config.json",
410
  "tokenizer.model",
411
  "vocabulary.json"