my_whisper_demo

Sleeping

App Files Files Community

SoybeanMilk committed on Dec 12, 2023

Commit

e8762f9

1 Parent(s): 50167d4

Add support for the ALMA model.

Browse files

Files changed (5) hide show

app.py +19 -0
config.json5 +7 -0
src/config.py +2 -2
src/translation/translationModel.py +18 -1
src/utils.py +1 -1

app.py CHANGED Viewed

@@ -231,6 +231,8 @@ class WhisperTranscriber:
             nllbLangName:     str = decodeOptions.pop("nllbLangName")
             mt5ModelName:     str = decodeOptions.pop("mt5ModelName")
             mt5LangName:      str = decodeOptions.pop("mt5LangName")
             translationBatchSize:         int = decodeOptions.pop("translationBatchSize")
             translationNoRepeatNgramSize: int = decodeOptions.pop("translationNoRepeatNgramSize")
@@ -337,6 +339,10 @@ class WhisperTranscriber:
                     selectedModelName = mt5ModelName if mt5ModelName is not None and len(mt5ModelName) > 0 else "mt5-zh-ja-en-trimmed/K024"
                     selectedModel = next((modelConfig for modelConfig in self.app_config.models["mt5"] if modelConfig.name == selectedModelName), None)
                     translationLang = get_lang_from_m2m100_name(mt5LangName)
                 if translationLang is not None:
                     translationModel = TranslationModel(modelConfig=selectedModel, whisperLang=whisperLang, translationLang=translationLang, batchSize=translationBatchSize, noRepeatNgramSize=translationNoRepeatNgramSize, numBeams=translationNumBeams)
@@ -828,6 +834,7 @@ def create_ui(app_config: ApplicationConfig):
     nllb_models = app_config.get_model_names("nllb")
     m2m100_models = app_config.get_model_names("m2m100")
     mt5_models = app_config.get_model_names("mt5")
     common_whisper_inputs = lambda : {
         gr.Dropdown(label="Whisper - Model (for audio)", choices=whisper_models, value=app_config.default_model_name, elem_id="whisperModelName"),
@@ -845,6 +852,10 @@ def create_ui(app_config: ApplicationConfig):
         gr.Dropdown(label="MT5 - Model (for translate)", choices=mt5_models, elem_id="mt5ModelName"),
         gr.Dropdown(label="MT5 - Language", choices=sorted(get_lang_m2m100_names(["en", "ja", "zh"])), elem_id="mt5LangName"),
     }
     common_translation_inputs = lambda : {
         gr.Number(label="Translation - Batch Size", precision=0, value=app_config.translation_batch_size, elem_id="translationBatchSize"),
@@ -905,9 +916,13 @@ def create_ui(app_config: ApplicationConfig):
                     with gr.Tab(label="MT5") as simpleMT5Tab:
                         with gr.Row():
                             simpleInputDict.update(common_mt5_inputs())
                     simpleM2M100Tab.select(fn=lambda: "m2m100", inputs = [], outputs= [simpleTranslateInput] )
                     simpleNllbTab.select(fn=lambda: "nllb", inputs = [], outputs= [simpleTranslateInput] )
                     simpleMT5Tab.select(fn=lambda: "mt5", inputs = [], outputs= [simpleTranslateInput] )
                 with gr.Column():
                     with gr.Tab(label="URL") as simpleUrlTab:
                         simpleInputDict.update({gr.Text(label="URL (YouTube, etc.)", elem_id = "urlData")})
@@ -964,9 +979,13 @@ def create_ui(app_config: ApplicationConfig):
                     with gr.Tab(label="MT5") as fullMT5Tab:
                         with gr.Row():
                             fullInputDict.update(common_mt5_inputs())
                     fullM2M100Tab.select(fn=lambda: "m2m100", inputs = [], outputs= [fullTranslateInput] )
                     fullNllbTab.select(fn=lambda: "nllb", inputs = [], outputs= [fullTranslateInput] )
                     fullMT5Tab.select(fn=lambda: "mt5", inputs = [], outputs= [fullTranslateInput] )
                 with gr.Column():
                     with gr.Tab(label="URL") as fullUrlTab:
                         fullInputDict.update({gr.Text(label="URL (YouTube, etc.)", elem_id = "urlData")})

             nllbLangName:     str = decodeOptions.pop("nllbLangName")
             mt5ModelName:     str = decodeOptions.pop("mt5ModelName")
             mt5LangName:      str = decodeOptions.pop("mt5LangName")
+            ALMAModelName:    str = decodeOptions.pop("ALMAModelName")
+            ALMALangName:     str = decodeOptions.pop("ALMALangName")
             translationBatchSize:         int = decodeOptions.pop("translationBatchSize")
             translationNoRepeatNgramSize: int = decodeOptions.pop("translationNoRepeatNgramSize")
                     selectedModelName = mt5ModelName if mt5ModelName is not None and len(mt5ModelName) > 0 else "mt5-zh-ja-en-trimmed/K024"
                     selectedModel = next((modelConfig for modelConfig in self.app_config.models["mt5"] if modelConfig.name == selectedModelName), None)
                     translationLang = get_lang_from_m2m100_name(mt5LangName)
+                elif translateInput == "ALMA" and ALMALangName is not None and len(ALMALangName) > 0:
+                    selectedModelName = ALMAModelName if ALMAModelName is not None and len(ALMAModelName) > 0 else "ALMA-13B-GPTQ/TheBloke"
+                    selectedModel = next((modelConfig for modelConfig in self.app_config.models["ALMA"] if modelConfig.name == selectedModelName), None)
+                    translationLang = get_lang_from_m2m100_name(ALMALangName)
                 if translationLang is not None:
                     translationModel = TranslationModel(modelConfig=selectedModel, whisperLang=whisperLang, translationLang=translationLang, batchSize=translationBatchSize, noRepeatNgramSize=translationNoRepeatNgramSize, numBeams=translationNumBeams)
     nllb_models = app_config.get_model_names("nllb")
     m2m100_models = app_config.get_model_names("m2m100")
     mt5_models = app_config.get_model_names("mt5")
+    ALMA_models = app_config.get_model_names("ALMA")
     common_whisper_inputs = lambda : {
         gr.Dropdown(label="Whisper - Model (for audio)", choices=whisper_models, value=app_config.default_model_name, elem_id="whisperModelName"),
         gr.Dropdown(label="MT5 - Model (for translate)", choices=mt5_models, elem_id="mt5ModelName"),
         gr.Dropdown(label="MT5 - Language", choices=sorted(get_lang_m2m100_names(["en", "ja", "zh"])), elem_id="mt5LangName"),
     }
+    common_ALMA_inputs = lambda : {
+        gr.Dropdown(label="ALMA - Model (for translate)", choices=ALMA_models, elem_id="ALMAModelName"),
+        gr.Dropdown(label="ALMA - Language", choices=sorted(get_lang_m2m100_names(["en", "ja", "zh"])), elem_id="ALMALangName"),
+    }
     common_translation_inputs = lambda : {
         gr.Number(label="Translation - Batch Size", precision=0, value=app_config.translation_batch_size, elem_id="translationBatchSize"),
                     with gr.Tab(label="MT5") as simpleMT5Tab:
                         with gr.Row():
                             simpleInputDict.update(common_mt5_inputs())
+                    with gr.Tab(label="ALMA") as simpleALMATab:
+                        with gr.Row():
+                            simpleInputDict.update(common_ALMA_inputs())
                     simpleM2M100Tab.select(fn=lambda: "m2m100", inputs = [], outputs= [simpleTranslateInput] )
                     simpleNllbTab.select(fn=lambda: "nllb", inputs = [], outputs= [simpleTranslateInput] )
                     simpleMT5Tab.select(fn=lambda: "mt5", inputs = [], outputs= [simpleTranslateInput] )
+                    simpleALMATab.select(fn=lambda: "ALMA", inputs = [], outputs= [simpleTranslateInput] )
                 with gr.Column():
                     with gr.Tab(label="URL") as simpleUrlTab:
                         simpleInputDict.update({gr.Text(label="URL (YouTube, etc.)", elem_id = "urlData")})
                     with gr.Tab(label="MT5") as fullMT5Tab:
                         with gr.Row():
                             fullInputDict.update(common_mt5_inputs())
+                    with gr.Tab(label="ALMA") as fullALMATab:
+                        with gr.Row():
+                            fullInputDict.update(common_ALMA_inputs())
                     fullM2M100Tab.select(fn=lambda: "m2m100", inputs = [], outputs= [fullTranslateInput] )
                     fullNllbTab.select(fn=lambda: "nllb", inputs = [], outputs= [fullTranslateInput] )
                     fullMT5Tab.select(fn=lambda: "mt5", inputs = [], outputs= [fullTranslateInput] )
+                    fullALMATab.select(fn=lambda: "ALMA", inputs = [], outputs= [fullTranslateInput] )
                 with gr.Column():
                     with gr.Tab(label="URL") as fullUrlTab:
                         fullInputDict.update({gr.Text(label="URL (YouTube, etc.)", elem_id = "urlData")})

config.json5 CHANGED Viewed

@@ -191,6 +191,13 @@
         "url": "engmatic-earth/mt5-zh-ja-en-trimmed-fine-tuned-v1",
         "type": "huggingface"
       }
     ]
   },
   // Configuration options that will be used if they are not specified in the command line arguments.

         "url": "engmatic-earth/mt5-zh-ja-en-trimmed-fine-tuned-v1",
         "type": "huggingface"
       }
+    ],
+    "ALMA": [
+      {
+        "name": "ALMA-13B-GPTQ/TheBloke",
+        "url": "TheBloke/ALMA-13B-GPTQ",
+        "type": "huggingface",
+      },
     ]
   },
   // Configuration options that will be used if they are not specified in the command line arguments.

src/config.py CHANGED Viewed

@@ -43,7 +43,7 @@ class VadInitialPromptMode(Enum):
             return None
 class ApplicationConfig:
-    def __init__(self, models: Dict[Literal["whisper", "m2m100", "nllb", "mt5"], List[ModelConfig]],
                  input_audio_max_duration: int = 600, share: bool = False, server_name: str = None, server_port: int = 7860,
                  queue_concurrency_count: int = 1, delete_uploaded_files: bool = True,
                  whisper_implementation: str = "whisper", default_model_name: str = "medium",
@@ -169,7 +169,7 @@ class ApplicationConfig:
             # Load using json5
             data = json5.load(f)
             data_models = data.pop("models", [])
-            models: Dict[Literal["whisper", "m2m100", "nllb", "mt5"], List[ModelConfig]] = {
                 key: [ModelConfig(**item) for item in value]
                 for key, value in data_models.items()
             }

             return None
 class ApplicationConfig:
+    def __init__(self, models: Dict[Literal["whisper", "m2m100", "nllb", "mt5", "ALMA"], List[ModelConfig]],
                  input_audio_max_duration: int = 600, share: bool = False, server_name: str = None, server_port: int = 7860,
                  queue_concurrency_count: int = 1, delete_uploaded_files: bool = True,
                  whisper_implementation: str = "whisper", default_model_name: str = "medium",
             # Load using json5
             data = json5.load(f)
             data_models = data.pop("models", [])
+            models: Dict[Literal["whisper", "m2m100", "nllb", "mt5", "ALMA"], List[ModelConfig]] = {
                 key: [ModelConfig(**item) for item in value]
                 for key, value in data_models.items()
             }

src/translation/translationModel.py CHANGED Viewed

@@ -7,6 +7,8 @@ import torch
 import ctranslate2
 import transformers
 from typing import Optional
 from src.config import ModelConfig
 from src.translation.translationLangs import TranslationLang, get_lang_from_whisper_code
@@ -97,6 +99,11 @@ class TranslationModel:
             self.transTokenizer = transformers.T5Tokenizer.from_pretrained(self.modelPath, legacy=False) #requires spiece.model
             self.transModel = transformers.MT5ForConditionalGeneration.from_pretrained(self.modelPath)
             self.transTranslator = transformers.pipeline('text2text-generation', model=self.transModel, device=self.device, tokenizer=self.transTokenizer)
         else:
             self.transTokenizer = transformers.AutoTokenizer.from_pretrained(self.modelPath)
             self.transModel = transformers.AutoModelForSeq2SeqLM.from_pretrained(self.modelPath)
@@ -130,6 +137,12 @@ class TranslationModel:
             elif "mt5" in self.modelPath:
                 output = self.transTranslator(self.mt5Prefix + text, max_length=max_length, batch_size=self.batchSize, no_repeat_ngram_size=self.noRepeatNgramSize, num_beams=self.numBeams) #, num_return_sequences=2
                 result = output[0]['generated_text']
             else: #M2M100 & NLLB
                 output = self.transTranslator(text, max_length=max_length, batch_size=self.batchSize, no_repeat_ngram_size=self.noRepeatNgramSize, num_beams=self.numBeams)
                 result = output[0]['translation_text']
@@ -148,7 +161,8 @@ _MODELS = ["distilled-600M", "distilled-1.3B", "1.3B", "3.3B",
            "m2m100_1.2B-ct2", "m2m100_418M-ct2", "m2m100-12B-ct2",
            "m2m100_1.2B", "m2m100_418M",
            "mt5-zh-ja-en-trimmed",
-           "mt5-zh-ja-en-trimmed-fine-tuned-v1"]
 def check_model_name(name):
     return any(allowed_name in name for allowed_name in _MODELS)
@@ -206,6 +220,9 @@ def download_model(
         "special_tokens_map.json",
         "spiece.model",
         "vocab.json", #m2m100
     ]
     kwargs = {

 import ctranslate2
 import transformers
+import re
 from typing import Optional
 from src.config import ModelConfig
 from src.translation.translationLangs import TranslationLang, get_lang_from_whisper_code
             self.transTokenizer = transformers.T5Tokenizer.from_pretrained(self.modelPath, legacy=False) #requires spiece.model
             self.transModel = transformers.MT5ForConditionalGeneration.from_pretrained(self.modelPath)
             self.transTranslator = transformers.pipeline('text2text-generation', model=self.transModel, device=self.device, tokenizer=self.transTokenizer)
+        elif "ALMA" in self.modelPath:
+            self.ALMAPrefix = "Translate this from " + self.whisperLang.whisper.code + " to " + self.translationLang.whisper.code + ":" + self.whisperLang.whisper.code + ":"
+            self.transTokenizer = transformers.AutoTokenizer.from_pretrained(self.modelPath, use_fast=True)
+            self.transModel = transformers.AutoModelForCausalLM.from_pretrained(self.modelPath, device_map="auto", trust_remote_code=False, revision="main")
+            self.transTranslator = transformers.pipeline("text-generation", model=self.transModel, tokenizer=self.transTokenizer, batch_size=2, do_sample=True, temperature=0.7, top_p=0.95, top_k=40, repetition_penalty=1.1)
         else:
             self.transTokenizer = transformers.AutoTokenizer.from_pretrained(self.modelPath)
             self.transModel = transformers.AutoModelForSeq2SeqLM.from_pretrained(self.modelPath)
             elif "mt5" in self.modelPath:
                 output = self.transTranslator(self.mt5Prefix + text, max_length=max_length, batch_size=self.batchSize, no_repeat_ngram_size=self.noRepeatNgramSize, num_beams=self.numBeams) #, num_return_sequences=2
                 result = output[0]['generated_text']
+            elif "ALMA" in self.modelPath:
+                output = self.transTranslator(self.ALMAPrefix + text + self.translationLang.whisper.code + ":", max_length=max_length, batch_size=self.batchSize, no_repeat_ngram_size=self.noRepeatNgramSize, num_beams=self.numBeams)
+                result = output[0]['generated_text']
+                result = re.sub(rf'^(.*{self.translationLang.whisper.code}: )', '', result)  # Remove the prompt from the result
+                result = re.sub(rf'^(Translate this from .* to .*:)', '', result)  # Remove the translation instruction
+                return result.strip()
             else: #M2M100 & NLLB
                 output = self.transTranslator(text, max_length=max_length, batch_size=self.batchSize, no_repeat_ngram_size=self.noRepeatNgramSize, num_beams=self.numBeams)
                 result = output[0]['translation_text']
            "m2m100_1.2B-ct2", "m2m100_418M-ct2", "m2m100-12B-ct2",
            "m2m100_1.2B", "m2m100_418M",
            "mt5-zh-ja-en-trimmed",
+           "mt5-zh-ja-en-trimmed-fine-tuned-v1",
+           "ALMA-13B-GPTQ"]
 def check_model_name(name):
     return any(allowed_name in name for allowed_name in _MODELS)
         "special_tokens_map.json",
         "spiece.model",
         "vocab.json", #m2m100
+        "model.safetensors",
+        "quantize_config.json",
+        "tokenizer.model"
     ]
     kwargs = {

src/utils.py CHANGED Viewed

@@ -130,7 +130,7 @@ def write_srt_original(transcript: Iterator[dict], file: TextIO,
             flush=True,
         )
-        if original is not None: print(f"{original}",
             file=file,
             flush=True)

             flush=True,
         )
+        if original is not None: print(f"{original}\n",
             file=file,
             flush=True)