avans06 committed
Commit d0c7a01
1 Parent(s): 4e2f72e

Merge branch 'main' of https://huggingface.co/spaces/SoybeanMilk/whisper-webui-translate


1. Thanks to SoybeanMilk for assisting in the development and integration of the excellent madlad400 translation model.

2. Added a translationTorchDtypeFloat16 checkbox that controls whether non-quantized translation models are loaded with torch dtype float16 (a minimal sketch follows this list).
- transformers: torch_dtype=torch.float16
Load the float32 translation model in float16 when the system supports a GPU (reducing VRAM usage; not applicable to quantized models such as CTranslate2, GPTQ, GGUF).

3. Added non-quantized ALMA and madlad400 model options to the translation tab menu. They are displayed only in environments with GPU support.
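A minimal sketch (not the app's exact code) of what the translationTorchDtypeFloat16 option does; the model name below is only an illustration of a float32 translation checkpoint:

```python
import torch
import transformers

# The option only matters with a GPU; the app forces it off on CPU-only systems.
use_float16 = torch.cuda.is_available()

# Loading a float32 checkpoint with torch_dtype=torch.float16 roughly halves its VRAM footprint.
# Quantized backends (CTranslate2, GPTQ, GGUF) already ship reduced-precision weights,
# so the flag does not apply to them.
model = transformers.AutoModelForSeq2SeqLM.from_pretrained(
    "facebook/nllb-200-distilled-600M",                    # illustrative float32 seq2seq checkpoint
    torch_dtype=torch.float16 if use_float16 else "auto",  # "auto" keeps the checkpoint's own dtype
    low_cpu_mem_usage=True,
)
```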

app.py CHANGED
@@ -236,9 +236,10 @@ class WhisperTranscriber:
236
  madlad400ModelName: str = decodeOptions.pop("madlad400ModelName")
237
  madlad400LangName: str = decodeOptions.pop("madlad400LangName")
238
 
239
- translationBatchSize: int = decodeOptions.pop("translationBatchSize")
240
- translationNoRepeatNgramSize: int = decodeOptions.pop("translationNoRepeatNgramSize")
241
- translationNumBeams: int = decodeOptions.pop("translationNumBeams")
 
242
 
243
  sourceInput: str = decodeOptions.pop("sourceInput")
244
  urlData: str = decodeOptions.pop("urlData")
@@ -367,16 +368,16 @@ class WhisperTranscriber:
367
  selectedModel = next((modelConfig for modelConfig in self.app_config.models["mt5"] if modelConfig.name == selectedModelName), None)
368
  translationLang = get_lang_from_m2m100_name(mt5LangName)
369
  elif translateInput == "ALMA" and ALMALangName is not None and len(ALMALangName) > 0:
370
- selectedModelName = ALMAModelName if ALMAModelName is not None and len(ALMAModelName) > 0 else "ALMA-13B-GPTQ/TheBloke"
371
  selectedModel = next((modelConfig for modelConfig in self.app_config.models["ALMA"] if modelConfig.name == selectedModelName), None)
372
  translationLang = get_lang_from_m2m100_name(ALMALangName)
373
  elif translateInput == "madlad400" and madlad400LangName is not None and len(madlad400LangName) > 0:
374
- selectedModelName = madlad400ModelName if madlad400ModelName is not None and len(madlad400ModelName) > 0 else "madlad400-10b-mt-ct2-int8_float16"
375
  selectedModel = next((modelConfig for modelConfig in self.app_config.models["madlad400"] if modelConfig.name == selectedModelName), None)
376
  translationLang = get_lang_from_m2m100_name(madlad400LangName)
377
 
378
  if translationLang is not None:
379
- translationModel = TranslationModel(modelConfig=selectedModel, whisperLang=whisperLang, translationLang=translationLang, batchSize=translationBatchSize, noRepeatNgramSize=translationNoRepeatNgramSize, numBeams=translationNumBeams)
380
 
381
  progress(0, desc="init transcribe")
382
  # Result
@@ -936,8 +937,9 @@ def create_ui(app_config: ApplicationConfig):
936
  mt5_models = app_config.get_model_names("mt5")
937
  ALMA_models = app_config.get_model_names("ALMA")
938
  madlad400_models = app_config.get_model_names("madlad400")
939
- if not torch.cuda.is_available(): #Due to the poor support of GPTQ for CPUs, the execution time per iteration exceeds a thousand seconds when operating on a CPU. Therefore, when the system does not support a GPU, the GPTQ model is removed from the list.
940
- ALMA_models = list(filter(lambda alma: "GPTQ" not in alma, ALMA_models))
 
941
 
942
  common_whisper_inputs = lambda : {
943
  gr.Dropdown(label="Whisper - Model (for audio)", choices=whisper_models, value=app_config.default_model_name, elem_id="whisperModelName"),
@@ -967,7 +969,8 @@ def create_ui(app_config: ApplicationConfig):
967
  common_translation_inputs = lambda : {
968
  gr.Number(label="Translation - Batch Size", precision=0, value=app_config.translation_batch_size, elem_id="translationBatchSize"),
969
  gr.Number(label="Translation - No Repeat Ngram Size", precision=0, value=app_config.translation_no_repeat_ngram_size, elem_id="translationNoRepeatNgramSize"),
970
- gr.Number(label="Translation - Num Beams", precision=0, value=app_config.translation_num_beams, elem_id="translationNumBeams")
 
971
  }
972
 
973
  common_vad_inputs = lambda : {
 
236
  madlad400ModelName: str = decodeOptions.pop("madlad400ModelName")
237
  madlad400LangName: str = decodeOptions.pop("madlad400LangName")
238
 
239
+ translationBatchSize: int = decodeOptions.pop("translationBatchSize")
240
+ translationNoRepeatNgramSize: int = decodeOptions.pop("translationNoRepeatNgramSize")
241
+ translationNumBeams: int = decodeOptions.pop("translationNumBeams")
242
+ translationTorchDtypeFloat16: bool = decodeOptions.pop("translationTorchDtypeFloat16")
243
 
244
  sourceInput: str = decodeOptions.pop("sourceInput")
245
  urlData: str = decodeOptions.pop("urlData")
 
368
  selectedModel = next((modelConfig for modelConfig in self.app_config.models["mt5"] if modelConfig.name == selectedModelName), None)
369
  translationLang = get_lang_from_m2m100_name(mt5LangName)
370
  elif translateInput == "ALMA" and ALMALangName is not None and len(ALMALangName) > 0:
371
+ selectedModelName = ALMAModelName if ALMAModelName is not None and len(ALMAModelName) > 0 else "ALMA-7B-ct2:int8_float16/avan"
372
  selectedModel = next((modelConfig for modelConfig in self.app_config.models["ALMA"] if modelConfig.name == selectedModelName), None)
373
  translationLang = get_lang_from_m2m100_name(ALMALangName)
374
  elif translateInput == "madlad400" and madlad400LangName is not None and len(madlad400LangName) > 0:
375
+ selectedModelName = madlad400ModelName if madlad400ModelName is not None and len(madlad400ModelName) > 0 else "madlad400-3b-mt-ct2-int8_float16/SoybeanMilk"
376
  selectedModel = next((modelConfig for modelConfig in self.app_config.models["madlad400"] if modelConfig.name == selectedModelName), None)
377
  translationLang = get_lang_from_m2m100_name(madlad400LangName)
378
 
379
  if translationLang is not None:
380
+ translationModel = TranslationModel(modelConfig=selectedModel, whisperLang=whisperLang, translationLang=translationLang, batchSize=translationBatchSize, noRepeatNgramSize=translationNoRepeatNgramSize, numBeams=translationNumBeams, torchDtypeFloat16=translationTorchDtypeFloat16)
381
 
382
  progress(0, desc="init transcribe")
383
  # Result
 
937
  mt5_models = app_config.get_model_names("mt5")
938
  ALMA_models = app_config.get_model_names("ALMA")
939
  madlad400_models = app_config.get_model_names("madlad400")
940
+ if not torch.cuda.is_available(): #Load only GGUF and CT2 translation models in pure CPU environments.
941
+ ALMA_models = list(filter(lambda alma: "GGUF" in alma or "ct2" in alma, ALMA_models))
942
+ madlad400_models = list(filter(lambda madlad400: "ct2" in madlad400, madlad400_models))
943
 
944
  common_whisper_inputs = lambda : {
945
  gr.Dropdown(label="Whisper - Model (for audio)", choices=whisper_models, value=app_config.default_model_name, elem_id="whisperModelName"),
 
969
  common_translation_inputs = lambda : {
970
  gr.Number(label="Translation - Batch Size", precision=0, value=app_config.translation_batch_size, elem_id="translationBatchSize"),
971
  gr.Number(label="Translation - No Repeat Ngram Size", precision=0, value=app_config.translation_no_repeat_ngram_size, elem_id="translationNoRepeatNgramSize"),
972
+ gr.Number(label="Translation - Num Beams", precision=0, value=app_config.translation_num_beams, elem_id="translationNumBeams"),
973
+ gr.Checkbox(label="Translation - Torch Dtype float16", info="Load the float32 translation model with float16 when the system supports GPU (reducing VRAM usage, not applicable to quantized models, such as Ctranslate2, GPTQ, GGUF)", value=app_config.translation_torch_dtype_float16, elem_id="translationTorchDtypeFloat16")
974
  }
975
 
976
  common_vad_inputs = lambda : {
config.json5 CHANGED
@@ -229,6 +229,16 @@
229
  "type": "huggingface",
230
  "tokenizer_url": "haoranxu/ALMA-13B"
231
  },
232
  ],
233
  "madlad400": [
234
  {
@@ -243,6 +253,21 @@
243
  "type": "huggingface",
244
  "tokenizer_url": "jbochi/madlad400-10b-mt"
245
  },
246
  ]
247
  },
248
  // Configuration options that will be used if they are not specified in the command line arguments.
@@ -373,4 +398,6 @@
373
  "translation_no_repeat_ngram_size": 3,
374
  // Translation - Beam size (1 for greedy search).
375
  "translation_num_beams": 2,
 
 
376
  }
 
229
  "type": "huggingface",
230
  "tokenizer_url": "haoranxu/ALMA-13B"
231
  },
232
+ {
233
+ "name": "ALMA-7B/haoranxu",
234
+ "url": "haoranxu/ALMA-7B",
235
+ "type": "huggingface"
236
+ },
237
+ {
238
+ "name": "ALMA-13B/haoranxu",
239
+ "url": "haoranxu/ALMA-13B",
240
+ "type": "huggingface"
241
+ },
242
  ],
243
  "madlad400": [
244
  {
 
253
  "type": "huggingface",
254
  "tokenizer_url": "jbochi/madlad400-10b-mt"
255
  },
256
+ {
257
+ "name": "madlad400-3b-mt/jbochi",
258
+ "url": "jbochi/madlad400-3b-mt",
259
+ "type": "huggingface",
260
+ },
261
+ {
262
+ "name": "madlad400-7b-mt-bt/jbochi",
263
+ "url": "jbochi/madlad400-7b-mt-bt",
264
+ "type": "huggingface",
265
+ },
266
+ {
267
+ "name": "madlad400-10b-mt/jbochi",
268
+ "url": "jbochi/madlad400-10b-mt",
269
+ "type": "huggingface",
270
+ },
271
  ]
272
  },
273
  // Configuration options that will be used if they are not specified in the command line arguments.
 
398
  "translation_no_repeat_ngram_size": 3,
399
  // Translation - Beam size (1 for greedy search).
400
  "translation_num_beams": 2,
401
+ // Translation - Torch Dtype float16. Load the float32 translation model in float16 when the system supports a GPU (reducing VRAM usage; not applicable to quantized models such as CTranslate2, GPTQ, GGUF).
402
+ "translation_torch_dtype_float16": true,
403
  }
docs/options.md CHANGED
@@ -200,4 +200,8 @@ Prevent repetitions of ngrams with this size (set 0 to disable).
200
  - transformers: num_beams
201
  Number of beams for beam search that will be used by default in the generate method of the model. 1 means no beam search.
202
  - ctranslate2: beam_size
203
- Beam size (1 for greedy search).
200
  - transformers: num_beams
201
  Number of beams for beam search that will be used by default in the generate method of the model. 1 means no beam search.
202
  - ctranslate2: beam_size
203
+ Beam size (1 for greedy search).
204
+
205
+ ## Translation - Torch Dtype float16
206
+ - transformers: torch_dtype=torch.float16
207
+ Load the float32 translation model in float16 when the system supports a GPU (reducing VRAM usage; not applicable to quantized models such as CTranslate2, GPTQ, GGUF).
docs/translateModel.md CHANGED
@@ -42,6 +42,7 @@ NLLB-200 is a multilingual translation model introduced by Meta AI in July 2022.
42
  | [facebook/nllb-200-distilled-1.3B](https://huggingface.co/facebook/nllb-200-distilled-1.3B) | 1.3B | 5.48 GB | float32 | ≈5.9 GB |
43
  | [facebook/nllb-200-1.3B](https://huggingface.co/facebook/nllb-200-1.3B) | 1.3B | 5.48 GB | float32 | ≈5.8 GB |
44
  | [facebook/nllb-200-3.3B](https://huggingface.co/facebook/nllb-200-3.3B) | 3.3B | 17.58 GB | float32 | ≈13.4 GB |
 
45
 
46
  ## NLLB-200-CTranslate2
47
 
@@ -78,8 +79,8 @@ The official support for ALMA currently includes 10 language directions: English
78
 
79
  | Name | Parameters | Size | type/quantize | Required VRAM |
80
  |------|------------|------|---------------|---------------|
81
- | [haoranxu/ALMA-7B](https://huggingface.co/haoranxu/ALMA-7B) | 7B | 26.95 GB | float32 | N/A |
82
- | [haoranxu/ALMA-13B](https://huggingface.co/haoranxu/ALMA-13B) | 13B | 52.07 GB | float32 | N/A |
83
 
84
  ## ALMA-GPTQ
85
 
@@ -111,6 +112,46 @@ GGUF is a file format for storing models for inference with GGML and executors b
111
  | [avans06/ALMA-7B-ct2-int8_float16](https://huggingface.co/avans06/ALMA-7B-ct2-int8_float16) | 7B | 6.74 GB | int8_float16 | ≈6.6 GB |
112
  | [avans06/ALMA-13B-ct2-int8_float16](https://huggingface.co/avans06/ALMA-13B-ct2-int8_float16) | 13B | 13 GB | int8_float16 | ≈12.6 GB |
113
114
 
115
  # Options
116
 
@@ -131,3 +172,7 @@ Prevent repetitions of ngrams with this size (set 0 to disable).
131
  Number of beams for beam search that will be used by default in the generate method of the model. 1 means no beam search.
132
  - ctranslate2: beam_size
133
  Beam size (1 for greedy search).
42
  | [facebook/nllb-200-distilled-1.3B](https://huggingface.co/facebook/nllb-200-distilled-1.3B) | 1.3B | 5.48 GB | float32 | ≈5.9 GB |
43
  | [facebook/nllb-200-1.3B](https://huggingface.co/facebook/nllb-200-1.3B) | 1.3B | 5.48 GB | float32 | ≈5.8 GB |
44
  | [facebook/nllb-200-3.3B](https://huggingface.co/facebook/nllb-200-3.3B) | 3.3B | 17.58 GB | float32 | ≈13.4 GB |
45
+ | [facebook/nllb-moe-54b](https://huggingface.co/facebook/nllb-moe-54b) | 54B | 220.2 GB | float32 | N/A |
46
 
47
  ## NLLB-200-CTranslate2
48
 
 
79
 
80
  | Name | Parameters | Size | type/quantize | Required VRAM |
81
  |------|------------|------|---------------|---------------|
82
+ | [haoranxu/ALMA-7B](https://huggingface.co/haoranxu/ALMA-7B) | 7B | 26.95 GB | float32 | ≈13.2 GB (torch dtype in float16) |
83
+ | [haoranxu/ALMA-13B](https://huggingface.co/haoranxu/ALMA-13B) | 13B | 52.07 GB | float32 | ≈25.4 GB (torch dtype in float16) |
84
 
85
  ## ALMA-GPTQ
86
 
 
112
  | [avans06/ALMA-7B-ct2-int8_float16](https://huggingface.co/avans06/ALMA-7B-ct2-int8_float16) | 7B | 6.74 GB | int8_float16 | ≈6.6 GB |
113
  | [avans06/ALMA-13B-ct2-int8_float16](https://huggingface.co/avans06/ALMA-13B-ct2-int8_float16) | 13B | 13 GB | int8_float16 | ≈12.6 GB |
114
 
115
+ ## madlad400
116
+
117
+ madlad400 is a multilingual machine translation model based on the T5 architecture, introduced by Google DeepMind and Google Research in September 2023. It was trained on 250 billion tokens covering over 450 languages using publicly available data. The paper is titled "`MADLAD-400: A Multilingual And Document-Level Large Audited Dataset`" ([arXiv:2309.04662](https://arxiv.org/abs/2309.04662)). A minimal usage sketch follows the table below.
118
+
119
+ | Name | Parameters | Size | type/quantize | Required VRAM |
120
+ |------|------------|------|---------------|---------------|
121
+ | [jbochi/madlad400-3b-mt](https://huggingface.co/jbochi/madlad400-3b-mt) | 3B | 11.8 GB | float32 | ≈12 GB |
122
+ | [jbochi/madlad400-7b-mt](https://huggingface.co/jbochi/madlad400-7b-mt) | 7.2B | 33.2 GB | float32 | ≈19.7 GB (torch dtype in float16) |
123
+ | [jbochi/madlad400-7b-mt-bt](https://huggingface.co/jbochi/madlad400-7b-mt-bt) | 7.2B | 33.2 GB | float32 (finetuned on backtranslated data) | ≈19.7 GB (torch dtype in float16) |
124
+ | [jbochi/madlad400-8b-lm](https://huggingface.co/jbochi/madlad400-8b-lm) | 8B | 34.52 GB | float32 | N/A |
125
+ | [jbochi/madlad400-10b-mt](https://huggingface.co/jbochi/madlad400-10b-mt) | 10.7B | 42.86 GB | float32 | ≈24.3 GB (torch dtype in float16) |
126
+
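A minimal, hypothetical usage sketch for these transformers-based madlad400 checkpoints, following the same pattern this commit uses in translationModel.py (T5 tokenizer and model, a text2text-generation pipeline, and a `<2xx>` target-language prefix); the model name and sentence are illustrative:

```python
import torch
import transformers

model_name = "jbochi/madlad400-3b-mt"
tokenizer = transformers.T5Tokenizer.from_pretrained(model_name, legacy=False)
model = transformers.T5ForConditionalGeneration.from_pretrained(
    model_name,
    torch_dtype=torch.float16 if torch.cuda.is_available() else "auto",  # the new float16 option
    low_cpu_mem_usage=True,
)
translator = transformers.pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
)

# "<2ja>" requests translation into Japanese; swap the code for other target languages.
print(translator("<2ja> How are you today?")[0]["generated_text"])
```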
127
+ ## madlad400-CTranslate2
128
+
129
+ | Name | Parameters | Size | type/quantize | Required VRAM |
130
+ |------|------------|------|---------------|---------------|
131
+ | [SoybeanMilk/madlad400-3b-mt-ct2-int8_float16](https://huggingface.co/SoybeanMilk/madlad400-3b-mt-ct2-int8_float16) | 3B | 2.95 GB | int8_float16 | ≈2.7 GB |
132
+ | [SoybeanMilk/madlad400-10b-mt-ct2-int8_float16](https://huggingface.co/SoybeanMilk/madlad400-10b-mt-ct2-int8_float16) | 10.7B | 10.7 GB | int8_float16 | ≈10 GB |
133
+
134
+ ## SeamlessM4T
135
+
136
+ SeamlessM4T is a collection of models designed to provide high-quality translation, allowing people from different linguistic communities to communicate effortlessly through speech and text.
137
+
138
+ It enables multiple tasks without relying on separate models:
139
+
140
+ Speech-to-speech translation (S2ST)
141
+ Speech-to-text translation (S2TT)
142
+ Text-to-speech translation (T2ST)
143
+ Text-to-text translation (T2TT)
144
+ Automatic speech recognition (ASR)
145
+
146
+ SeamlessM4T-v1 was introduced by the Seamless Communication team at Meta AI in August 2023. The paper is titled "`SeamlessM4T: Massively Multilingual & Multimodal Machine Translation`" ([arXiv:2308.11596](https://arxiv.org/abs/2308.11596)).
147
+ SeamlessM4T-v2 was introduced by the Seamless Communication team at Meta AI in December 2023. The paper is titled "`Seamless: Multilingual Expressive and Streaming Speech Translation`" ([arXiv:2312.05187](https://arxiv.org/abs/2312.05187)).
148
+
149
+ | Name | Parameters | Size | type/quantize | Required VRAM |
150
+ |------|------------|------|---------------|---------------|
151
+ | [facebook/hf-seamless-m4t-medium](https://huggingface.co/facebook/hf-seamless-m4t-medium) | 1.2B | 4.84 GB | float32 | N/A |
152
+ | [facebook/seamless-m4t-large](https://huggingface.co/facebook/seamless-m4t-large) | 2.3B | 11.4 GB | float32 | N/A |
153
+ | [facebook/seamless-m4t-v2-large](https://huggingface.co/facebook/seamless-m4t-v2-large) | 2.3B | 11.4 GB (safetensors:9.24 GB) | float32 | N/A |
154
+
155
 
156
  # Options
157
 
 
172
  Number of beams for beam search that will be used by default in the generate method of the model. 1 means no beam search.
173
  - ctranslate2: beam_size
174
  Beam size (1 for greedy search).
175
+
176
+ ## Translation - Torch Dtype float16
177
+ - transformers: torch_dtype=torch.float16
178
+ Load the float32 translation model in float16 when the system supports a GPU (reducing VRAM usage; not applicable to quantized models such as CTranslate2, GPTQ, GGUF).
src/config.py CHANGED
@@ -82,6 +82,7 @@ class ApplicationConfig:
82
  translation_batch_size: int = 2,
83
  translation_no_repeat_ngram_size: int = 3,
84
  translation_num_beams: int = 2,
 
85
  # Whisper Segments Filter
86
  whisper_segments_filter: bool = False,
87
  whisper_segments_filters: List[str] = [],
@@ -150,6 +151,7 @@ class ApplicationConfig:
150
  self.translation_batch_size = translation_batch_size
151
  self.translation_no_repeat_ngram_size = translation_no_repeat_ngram_size
152
  self.translation_num_beams = translation_num_beams
 
153
  # Whisper Segments Filter
154
  self.whisper_segments_filter = whisper_segments_filter
155
  self.whisper_segments_filters = whisper_segments_filters
 
82
  translation_batch_size: int = 2,
83
  translation_no_repeat_ngram_size: int = 3,
84
  translation_num_beams: int = 2,
85
+ translation_torch_dtype_float16: bool = True,
86
  # Whisper Segments Filter
87
  whisper_segments_filter: bool = False,
88
  whisper_segments_filters: List[str] = [],
 
151
  self.translation_batch_size = translation_batch_size
152
  self.translation_no_repeat_ngram_size = translation_no_repeat_ngram_size
153
  self.translation_num_beams = translation_num_beams
154
+ self.translation_torch_dtype_float16 = translation_torch_dtype_float16
155
  # Whisper Segments Filter
156
  self.whisper_segments_filter = whisper_segments_filter
157
  self.whisper_segments_filters = whisper_segments_filters
src/translation/translationModel.py CHANGED
@@ -21,11 +21,12 @@ class TranslationModel:
21
  batchSize: int = 2,
22
  noRepeatNgramSize: int = 3,
23
  numBeams: int = 2,
 
24
  downloadRoot: Optional[str] = None,
25
  localFilesOnly: bool = False,
26
  loadModel: bool = False,
27
  ):
28
- """Initializes the M2M100 / Nllb-200 / mt5 model.
29
 
30
  Args:
31
  modelConfig: Config of the model to use (distilled-600M, distilled-1.3B,
@@ -76,8 +77,10 @@ class TranslationModel:
76
  device = "cuda" if "ct2" in self.modelPath else "cuda:0"
77
  else:
78
  device = "cpu"
 
79
 
80
  self.device = device
 
81
 
82
  if loadModel:
83
  self.load_model()
@@ -85,8 +88,31 @@ class TranslationModel:
85
  def load_model(self):
86
  """
87
  [from_pretrained]
88
- low_cpu_mem_usage(bool, optional)
89
- Tries to not use more than 1x model size in CPU memory (including peak memory) while loading the model. This is an experimental feature and a subject to change at any moment.
90
 
91
  [transformers.AutoTokenizer.from_pretrained]
92
  use_fast (bool, optional, defaults to True):
@@ -166,7 +192,7 @@ class TranslationModel:
166
  elif "mt5" in self.modelPath:
167
  self.mt5Prefix = self.whisperLang.whisper.code + "2" + self.translationLang.whisper.code + ": "
168
  self.transTokenizer = transformers.T5Tokenizer.from_pretrained(self.modelPath, legacy=False) #requires spiece.model
169
- self.transModel = transformers.MT5ForConditionalGeneration.from_pretrained(self.modelPath, low_cpu_mem_usage=True)
170
  self.transTranslator = transformers.pipeline('text2text-generation', model=self.transModel, device=self.device, tokenizer=self.transTokenizer)
171
  elif "ALMA" in self.modelPath:
172
  self.ALMAPrefix = "Translate this from " + self.whisperLang.whisper.names[0] + " to " + self.translationLang.whisper.names[0] + ":\n" + self.whisperLang.whisper.names[0] + ": "
@@ -185,18 +211,25 @@ class TranslationModel:
185
  import ctransformers
186
  self.transTokenizer = transformers.AutoTokenizer.from_pretrained(self.modelConfig.tokenizer_url)
187
  if self.device == "cpu":
188
- self.transModel = ctransformers.AutoModelForCausalLM.from_pretrained(self.modelPath, hf=True, model_file=self.modelConfig.model_file)
189
  else:
190
- self.transModel = ctransformers.AutoModelForCausalLM.from_pretrained(self.modelPath, hf=True, model_file=self.modelConfig.model_file, gpu_layers=50)
191
- self.transTranslator = transformers.pipeline("text-generation", model=self.transModel, tokenizer=self.transTokenizer, do_sample=True, temperature=0.7, top_k=40, top_p=0.95, repetition_penalty=1.1)
192
  else:
193
  self.transTokenizer = transformers.AutoTokenizer.from_pretrained(self.modelPath)
194
- self.transModel = transformers.AutoModelForSeq2SeqLM.from_pretrained(self.modelPath)
195
  if "m2m100" in self.modelPath:
196
  self.transTranslator = transformers.pipeline('translation', model=self.transModel, device=self.device, tokenizer=self.transTokenizer, src_lang=self.whisperLang.m2m100.code, tgt_lang=self.translationLang.m2m100.code)
197
  else: #NLLB
198
  self.transTranslator = transformers.pipeline('translation', model=self.transModel, device=self.device, tokenizer=self.transTokenizer, src_lang=self.whisperLang.nllb.code, tgt_lang=self.translationLang.nllb.code)
199
-
200
  except Exception as e:
201
  self.release_vram()
202
  raise e
@@ -223,13 +256,13 @@ class TranslationModel:
223
  del self.transTokenizer
224
  if getattr(self, "transModel", None) is not None:
225
  del self.transModel
 
 
226
  try:
227
  torch.cuda.empty_cache()
228
  except Exception as e:
229
  print(traceback.format_exc())
230
  print("\tcuda empty cache, error: " + str(e))
231
- import gc
232
- gc.collect()
233
  print("release vram end.")
234
  except Exception as e:
235
  print(traceback.format_exc())
@@ -294,6 +327,12 @@ class TranslationModel:
294
  output = self.transTranslator(self.ALMAPrefix + text + "\n" + self.translationLang.whisper.names[0] + ": ", max_length=max_length, batch_size=self.batchSize, no_repeat_ngram_size=self.noRepeatNgramSize, num_beams=self.numBeams, return_full_text=False)
295
  elif "GGUF" in self.modelPath:
296
  output = self.transTranslator(self.ALMAPrefix + text + "\n" + self.translationLang.whisper.names[0] + ": ", max_length=max_length, batch_size=self.batchSize, no_repeat_ngram_size=self.noRepeatNgramSize, num_beams=self.numBeams, return_full_text=False)
297
  result = output[0]['generated_text']
298
  else: #M2M100 & NLLB
299
  output = self.transTranslator(text, max_length=max_length, batch_size=self.batchSize, no_repeat_ngram_size=self.noRepeatNgramSize, num_beams=self.numBeams)
@@ -356,9 +395,6 @@ def download_model(
356
  "pytorch_model.bin",
357
  "pytorch_model.bin.index.json",
358
  "pytorch_model-*.bin",
359
- "pytorch_model-00001-of-00003.bin",
360
- "pytorch_model-00002-of-00003.bin",
361
- "pytorch_model-00003-of-00003.bin",
362
  "sentencepiece.bpe.model",
363
  "tokenizer.json",
364
  "tokenizer_config.json",
@@ -368,6 +404,8 @@ def download_model(
368
  "spiece.model",
369
  "vocab.json", #m2m100
370
  "model.safetensors",
 
 
371
  "quantize_config.json",
372
  "tokenizer.model",
373
  "vocabulary.json"
 
21
  batchSize: int = 2,
22
  noRepeatNgramSize: int = 3,
23
  numBeams: int = 2,
24
+ torchDtypeFloat16: bool = True,
25
  downloadRoot: Optional[str] = None,
26
  localFilesOnly: bool = False,
27
  loadModel: bool = False,
28
  ):
29
+ """Initializes the M2M100 / Nllb-200 / mt5 / ALMA / madlad400 translation model.
30
 
31
  Args:
32
  modelConfig: Config of the model to use (distilled-600M, distilled-1.3B,
 
77
  device = "cuda" if "ct2" in self.modelPath else "cuda:0"
78
  else:
79
  device = "cpu"
80
+ torchDtypeFloat16 = False
81
 
82
  self.device = device
83
+ self.torchDtypeFloat16 = torchDtypeFloat16
84
 
85
  if loadModel:
86
  self.load_model()
 
88
  def load_model(self):
89
  """
90
  [from_pretrained]
91
+ low_cpu_mem_usage(bool, optional):
92
+ Tries to not use more than 1x model size in CPU memory (including peak memory) while loading the model. This is an experimental feature and a subject to change at any moment.
93
+
94
+ torch_dtype (str or torch.dtype, optional):
95
+ Override the default torch.dtype and load the model under a specific dtype. The different options are:
96
+ 1. torch.float16 or torch.bfloat16 or torch.float: load in a specified dtype, ignoring the model’s config.torch_dtype if one exists.
97
+ If not specified the model will get loaded in torch.float (fp32).
98
+ 2. "auto" - A torch_dtype entry in the config.json file of the model will be attempted to be used.
99
+ If this entry isn’t found then next check the dtype of the first weight in the checkpoint that’s of a floating point type and use that as dtype.
100
+ This will load the model using the dtype it was saved in at the end of the training. It can’t be used as an indicator of how the model was trained.
101
+ Since it could be trained in one of half precision dtypes, but saved in fp32.
102
+ For some models the dtype they were trained in is unknown - you may try to check the model’s paper or reach out to the authors and
103
+ ask them to add this information to the model’s card and to insert the torch_dtype entry in config.json on the hub.
104
+
105
+ device_map (str or Dict[str, Union[int, str, torch.device]] or int or torch.device, optional):
106
+ A map that specifies where each submodule should go. It doesn’t need to be refined to each parameter/buffer name,
107
+ once a given module name is inside, every submodule of it will be sent to the same device.
108
+ If we only pass the device (e.g., "cpu", "cuda:1", "mps", or a GPU ordinal rank like 1) on which the model will be allocated,
109
+ the device map will map the entire model to this device. Passing device_map = 0 means put the whole model on GPU 0.
110
+ To have Accelerate compute the most optimized device_map automatically, set device_map="auto". For more information about each option see designing a device map.
111
+
112
+ load_in_8bit (bool, optional, defaults to False)
113
+ If True, will convert the loaded model into mixed-8bit quantized model. To use this feature please install bitsandbytes (pip install -U bitsandbytes).
114
+ load_in_4bit (bool, optional, defaults to False)
115
+ If True, will convert the loaded model into 4bit precision quantized model. To use this feature install the latest version of bitsandbytes (pip install -U bitsandbytes).
116
 
117
  [transformers.AutoTokenizer.from_pretrained]
118
  use_fast (bool, optional, defaults to True):
 
192
  elif "mt5" in self.modelPath:
193
  self.mt5Prefix = self.whisperLang.whisper.code + "2" + self.translationLang.whisper.code + ": "
194
  self.transTokenizer = transformers.T5Tokenizer.from_pretrained(self.modelPath, legacy=False) #requires spiece.model
195
+ self.transModel = transformers.MT5ForConditionalGeneration.from_pretrained(self.modelPath, low_cpu_mem_usage=True, torch_dtype=torch.float16 if self.torchDtypeFloat16 else "auto")
196
  self.transTranslator = transformers.pipeline('text2text-generation', model=self.transModel, device=self.device, tokenizer=self.transTokenizer)
197
  elif "ALMA" in self.modelPath:
198
  self.ALMAPrefix = "Translate this from " + self.whisperLang.whisper.names[0] + " to " + self.translationLang.whisper.names[0] + ":\n" + self.whisperLang.whisper.names[0] + ": "
 
211
  import ctransformers
212
  self.transTokenizer = transformers.AutoTokenizer.from_pretrained(self.modelConfig.tokenizer_url)
213
  if self.device == "cpu":
214
+ self.transModel = ctransformers.AutoModelForCausalLM.from_pretrained(self.modelPath, hf=True, model_file=self.modelConfig.model_file, low_cpu_mem_usage=True)
215
  else:
216
+ self.transModel = ctransformers.AutoModelForCausalLM.from_pretrained(self.modelPath, hf=True, model_file=self.modelConfig.model_file, gpu_layers=50, low_cpu_mem_usage=True)
217
+ else:
218
+ self.transTokenizer = transformers.AutoTokenizer.from_pretrained(self.modelPath, use_fast=True)
219
+ self.transModel = transformers.AutoModelForCausalLM.from_pretrained(self.modelPath, torch_dtype=torch.float16 if self.torchDtypeFloat16 else "auto", low_cpu_mem_usage=True) #, device_map="auto"
220
+ self.transTranslator = transformers.pipeline("text-generation", model=self.transModel, device=self.device if "GPTQ" not in self.modelPath and "GGUF" not in self.modelPath else None, tokenizer=self.transTokenizer, do_sample=True, temperature=0.7, top_k=40, top_p=0.95, repetition_penalty=1.1)
221
+ elif "madlad400" in self.modelPath:
222
+ self.madlad400Prefix = "<2" + self.translationLang.whisper.code + "> "
223
+ self.transTokenizer = transformers.T5Tokenizer.from_pretrained(self.modelPath, legacy=False)
224
+ self.transModel = transformers.T5ForConditionalGeneration.from_pretrained(self.modelPath, torch_dtype=torch.float16 if self.torchDtypeFloat16 else "auto", low_cpu_mem_usage=True) #, device_map="auto"
225
+ self.transTranslator = transformers.pipeline('text2text-generation', model=self.transModel, device=self.device, tokenizer=self.transTokenizer)
226
  else:
227
  self.transTokenizer = transformers.AutoTokenizer.from_pretrained(self.modelPath)
228
+ self.transModel = transformers.AutoModelForSeq2SeqLM.from_pretrained(self.modelPath, torch_dtype=torch.float16 if self.torchDtypeFloat16 else "auto")
229
  if "m2m100" in self.modelPath:
230
  self.transTranslator = transformers.pipeline('translation', model=self.transModel, device=self.device, tokenizer=self.transTokenizer, src_lang=self.whisperLang.m2m100.code, tgt_lang=self.translationLang.m2m100.code)
231
  else: #NLLB
232
  self.transTranslator = transformers.pipeline('translation', model=self.transModel, device=self.device, tokenizer=self.transTokenizer, src_lang=self.whisperLang.nllb.code, tgt_lang=self.translationLang.nllb.code)
 
233
  except Exception as e:
234
  self.release_vram()
235
  raise e
 
256
  del self.transTokenizer
257
  if getattr(self, "transModel", None) is not None:
258
  del self.transModel
259
+ import gc
260
+ gc.collect()
261
  try:
262
  torch.cuda.empty_cache()
263
  except Exception as e:
264
  print(traceback.format_exc())
265
  print("\tcuda empty cache, error: " + str(e))
 
 
266
  print("release vram end.")
267
  except Exception as e:
268
  print(traceback.format_exc())
 
327
  output = self.transTranslator(self.ALMAPrefix + text + "\n" + self.translationLang.whisper.names[0] + ": ", max_length=max_length, batch_size=self.batchSize, no_repeat_ngram_size=self.noRepeatNgramSize, num_beams=self.numBeams, return_full_text=False)
328
  elif "GGUF" in self.modelPath:
329
  output = self.transTranslator(self.ALMAPrefix + text + "\n" + self.translationLang.whisper.names[0] + ": ", max_length=max_length, batch_size=self.batchSize, no_repeat_ngram_size=self.noRepeatNgramSize, num_beams=self.numBeams, return_full_text=False)
330
+ else:
331
+ output = self.transTranslator(self.ALMAPrefix + text + "\n" + self.translationLang.whisper.names[0] + ": ", max_length=max_length, batch_size=self.batchSize, no_repeat_ngram_size=self.noRepeatNgramSize, num_beams=self.numBeams, return_full_text=False)
332
+
333
+ result = output[0]['generated_text']
334
+ elif "madlad400" in self.modelPath:
335
+ output = self.transTranslator(self.madlad400Prefix + text, max_length=max_length, batch_size=self.batchSize, no_repeat_ngram_size=self.noRepeatNgramSize, num_beams=self.numBeams) #, num_return_sequences=2
336
  result = output[0]['generated_text']
337
  else: #M2M100 & NLLB
338
  output = self.transTranslator(text, max_length=max_length, batch_size=self.batchSize, no_repeat_ngram_size=self.noRepeatNgramSize, num_beams=self.numBeams)
 
395
  "pytorch_model.bin",
396
  "pytorch_model.bin.index.json",
397
  "pytorch_model-*.bin",
 
 
 
398
  "sentencepiece.bpe.model",
399
  "tokenizer.json",
400
  "tokenizer_config.json",
 
404
  "spiece.model",
405
  "vocab.json", #m2m100
406
  "model.safetensors",
407
+ "model-*.safetensors",
408
+ "model.safetensors.index.json",
409
  "quantize_config.json",
410
  "tokenizer.model",
411
  "vocabulary.json"