Spaces:

mms-meta
/

mms-zeroshot

Running

App Files Files Community

Vineel Pratap committed on Jul 3

Commit

a4107b1

•

1 Parent(s): 9f2bd1d

update_model

Browse files

Files changed (10) hide show

app.py +48 -7
style.css +6 -0
upload/mms_zs/config.json +108 -0
upload/mms_zs/model.safetensors +3 -0
upload/mms_zs/preprocessor_config.json +10 -0
upload/mms_zs/special_tokens_map.json +6 -0
upload/mms_zs/tokenizer_config.json +48 -0
upload/mms_zs/tokens.txt +32 -0
upload/mms_zs/vocab.json +34 -0
zeroshot.py +51 -46

app.py CHANGED Viewed

@@ -1,24 +1,65 @@
 import gradio as gr
 from zeroshot import process, ZS_EXAMPLES
-with gr.Blocks() as demo:
-    gr.Markdown("")
     gr.Markdown(
         "<p align='center' style='font-size: 20px;'>MMS Zero-shot ASR Demo. See our arXiV <a href='https://arxiv.org/'>paper</a> for model details.</p>"
     )
     gr.HTML(
-        """<center>The demo works on input audio in any language, as long as you provide a list of words for that language and an optional n-gram language model (even a simple 1-gram model will work!) to help with accuracy.</center>"""
     )
     with gr.Row():
         with gr.Column():
             audio = gr.Audio(label="Audio Input\n(use microphone or upload a file)")
             with gr.Row():
-                words_file = gr.File(label="Words File\n(one word per line)")
                 lm_file = gr.File(label="Language Model\n(optional)")
-            btn = gr.Button("Submit")
         with gr.Column():
             text = gr.Textbox(label="Transcript")
-    btn.click(process, inputs=[audio, words_file, lm_file], outputs=text)
     examples = gr.Examples(examples=ZS_EXAMPLES, inputs=[audio, words_file])
-demo.launch(share=True)

 import gradio as gr
 from zeroshot import process, ZS_EXAMPLES
+with gr.Blocks(css="style.css") as demo:
     gr.Markdown(
         "<p align='center' style='font-size: 20px;'>MMS Zero-shot ASR Demo. See our arXiV <a href='https://arxiv.org/'>paper</a> for model details.</p>"
     )
     gr.HTML(
+        """<center>The demo works on input audio in any language, as long as you provide a list of words or sentences for that language and an optional n-gram language model (even a simple 1-gram model will work!) to help with accuracy.<br>We recommend having a minimum of 5000 distinct words in the textfile to acheive a good performance.</center>"""
     )
     with gr.Row():
         with gr.Column():
             audio = gr.Audio(label="Audio Input\n(use microphone or upload a file)")
             with gr.Row():
+                words_file = gr.File(label="Text Data")
                 lm_file = gr.File(label="Language Model\n(optional)")
+            with gr.Accordion("Advanced Settings", open=False):
+                gr.Markdown(
+                    "The following parameters are used for beam-search decoding. Use the default values if you are not sure."
+                )
+                with gr.Row():
+                    wscore = gr.Slider(
+                        minimum=-10.0,
+                        maximum=10.0,
+                        value=0,
+                        step=0.1,
+                        interactive=True,
+                        label="Word Insertion Score",
+                    )
+                    lmscore = gr.Slider(
+                        minimum=-10.0,
+                        maximum=10.0,
+                        value=0,
+                        step=0.1,
+                        interactive=True,
+                        label="Language Model Score",
+                    )
+                with gr.Row():
+                    wscore_usedefault = gr.Checkbox(
+                        label="Use Default Word Insertion Score", value=True
+                    )
+                    lmscore_usedefault = gr.Checkbox(
+                        label="Use Default Language Model Score", value=True
+                    )
+            btn = gr.Button("Submit", elem_id="submit")
         with gr.Column():
             text = gr.Textbox(label="Transcript")
+    btn.click(
+        process,
+        inputs=[
+            audio,
+            words_file,
+            lm_file,
+            wscore,
+            lmscore,
+            wscore_usedefault,
+            lmscore_usedefault,
+        ],
+        outputs=text,
+    )
     examples = gr.Examples(examples=ZS_EXAMPLES, inputs=[audio, words_file])
+demo.launch()

style.css ADDED Viewed

	@@ -0,0 +1,6 @@

+#submit {
+    margin: auto;
+    color: #fff;
+    background: #1565c0;
+    border-radius: 100vh;
+  }

upload/mms_zs/config.json ADDED Viewed

	@@ -0,0 +1,108 @@

+{
+  "activation_dropout": 0.0,
+  "adapter_attn_dim": null,
+  "adapter_kernel_size": 3,
+  "adapter_stride": 2,
+  "add_adapter": false,
+  "apply_spec_augment": true,
+  "architectures": [
+    "Wav2Vec2ForCTC"
+  ],
+  "attention_dropout": 0.1,
+  "bos_token_id": 1,
+  "classifier_proj_size": 256,
+  "codevector_dim": 768,
+  "contrastive_logits_temperature": 0.1,
+  "conv_bias": true,
+  "conv_dim": [
+    512,
+    512,
+    512,
+    512,
+    512,
+    512,
+    512
+  ],
+  "conv_kernel": [
+    10,
+    3,
+    3,
+    3,
+    3,
+    2,
+    2
+  ],
+  "conv_stride": [
+    5,
+    2,
+    2,
+    2,
+    2,
+    2,
+    2
+  ],
+  "ctc_loss_reduction": "sum",
+  "ctc_zero_infinity": false,
+  "diversity_loss_weight": 0.1,
+  "do_stable_layer_norm": true,
+  "eos_token_id": 2,
+  "feat_extract_activation": "gelu",
+  "feat_extract_dropout": 0.0,
+  "feat_extract_norm": "layer",
+  "feat_proj_dropout": 0.1,
+  "feat_quantizer_dropout": 0.0,
+  "final_dropout": 0.0,
+  "gradient_checkpointing": false,
+  "hidden_act": "gelu",
+  "hidden_dropout": 0.1,
+  "hidden_size": 1024,
+  "initializer_range": 0.02,
+  "intermediate_size": 4096,
+  "layer_norm_eps": 1e-05,
+  "layerdrop": 0.1,
+  "mask_feature_length": 10,
+  "mask_feature_min_masks": 0,
+  "mask_feature_prob": 0.0,
+  "mask_time_length": 10,
+  "mask_time_min_masks": 2,
+  "mask_time_prob": 0.075,
+  "model_type": "wav2vec2",
+  "num_adapter_layers": 3,
+  "num_attention_heads": 16,
+  "num_codevector_groups": 2,
+  "num_codevectors_per_group": 320,
+  "num_conv_pos_embedding_groups": 16,
+  "num_conv_pos_embeddings": 128,
+  "num_feat_extract_layers": 7,
+  "num_hidden_layers": 24,
+  "num_negatives": 100,
+  "output_hidden_size": 1024,
+  "pad_token_id": 0,
+  "proj_codevector_dim": 768,
+  "tdnn_dilation": [
+    1,
+    2,
+    3,
+    1,
+    1
+  ],
+  "tdnn_dim": [
+    512,
+    512,
+    512,
+    512,
+    1500
+  ],
+  "tdnn_kernel": [
+    5,
+    3,
+    3,
+    1,
+    1
+  ],
+  "torch_dtype": "float32",
+  "transformers_version": "4.42.1",
+  "use_weighted_layer_sum": false,
+  "vocab_size": 32,
+  "xvector_output_dim": 512
+}

upload/mms_zs/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:39baa2c87b9abd9910c1982bf82aabda3dbe3ba615e20d5ee0be1026975dcb8c
+size 1261938632

upload/mms_zs/preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+  "do_normalize": true,
+  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
+  "feature_size": 1,
+  "padding_side": "right",
+  "padding_value": 0,
+  "processor_class": "Wav2Vec2Processor",
+  "return_attention_mask": true,
+  "sampling_rate": 16000
+}

upload/mms_zs/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "bos_token": "<s>",
+  "eos_token": "</s>",
+  "pad_token": "<pad>",
+  "unk_token": "<unk>"
+}

upload/mms_zs/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,48 @@

+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<pad>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": false
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": false
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": false
+    },
+    "3": {
+      "content": "<unk>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": true,
+  "do_lower_case": false,
+  "eos_token": "</s>",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "<pad>",
+  "processor_class": "Wav2Vec2Processor",
+  "replace_word_delimiter_char": " ",
+  "target_lang": null,
+  "tokenizer_class": "Wav2Vec2CTCTokenizer",
+  "unk_token": "<unk>",
+  "word_delimiter_token": "|"
+}

upload/mms_zs/tokens.txt ADDED Viewed

	@@ -0,0 +1,32 @@

+<s>
+<pad>
+</s>
+<unk>
+|
+a
+i
+e
+n
+o
+u
+t
+k
+m
+s
+r
+l
+h
+g
+d
+y
+b
+p
+c
+w
+j
+'
+v
+z
+f
+q
+x

upload/mms_zs/vocab.json ADDED Viewed

	@@ -0,0 +1,34 @@

+{
+  "'": 26,
+  "</s>": 2,
+  "<pad>": 0,
+  "<s>": 1,
+  "<unk>": 3,
+  "a": 5,
+  "b": 21,
+  "c": 23,
+  "d": 19,
+  "e": 7,
+  "f": 29,
+  "g": 18,
+  "h": 17,
+  "i": 6,
+  "j": 25,
+  "k": 12,
+  "l": 16,
+  "m": 13,
+  "n": 8,
+  "o": 9,
+  "p": 22,
+  "q": 30,
+  "r": 15,
+  "s": 14,
+  "t": 11,
+  "u": 10,
+  "v": 27,
+  "w": 24,
+  "x": 31,
+  "y": 20,
+  "z": 28,
+  "|": 4
+}

zeroshot.py CHANGED Viewed

@@ -16,34 +16,17 @@ UROMAN_PL = os.path.join(uroman_dir, "bin", "uroman.pl")
 ASR_SAMPLING_RATE = 16_000
-MODEL_ID = "facebook/mms-1b-all"
 processor = AutoProcessor.from_pretrained(MODEL_ID)
 model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)
-lm_decoding_config = {}
-lm_decoding_configfile = hf_hub_download(
-    repo_id="facebook/mms-cclms",
-    filename="decoding_config.json",
-    subfolder="mms-1b-all",
-)
-with open(lm_decoding_configfile) as f:
-    lm_decoding_config = json.loads(f.read())
-decoding_config = lm_decoding_config["eng"]
-lm_file = hf_hub_download(
-    repo_id="facebook/mms-cclms",
-    filename=decoding_config["lmfile"].rsplit("/", 1)[1],
-    subfolder=decoding_config["lmfile"].rsplit("/", 1)[0],
-)
-token_file = hf_hub_download(
-    repo_id="facebook/mms-cclms",
-    filename=decoding_config["tokensfile"].rsplit("/", 1)[1],
-    subfolder=decoding_config["tokensfile"].rsplit("/", 1)[0],
-)
 def error_check_file(filepath):
     if not isinstance(filepath, str):
@@ -53,13 +36,15 @@ def error_check_file(filepath):
     if not os.path.exists(filepath):
         return "Input file '{}' doesn't exists".format(type(filepath))
 def norm_uroman(text):
     text = text.lower()
     text = text.replace("’", "'")
     text = re.sub("([^a-z' ])", " ", text)
-    text = re.sub(' +', ' ', text)
     return text.strip()
 def uromanize(words):
     iso = "xxx"
     with tempfile.NamedTemporaryFile() as tf, tempfile.NamedTemporaryFile() as tf2:
@@ -72,24 +57,35 @@ def uromanize(words):
         lexicon = {}
         with open(tf2.name) as f:
             for idx, line in enumerate(f):
                 line = re.sub(r"\s+", " ", norm_uroman(line)).strip()
                 lexicon[words[idx]] = " ".join(line) + " |"
     return lexicon
 def load_lexicon(filepath):
-    words = []
     with open(filepath) as f:
         for line in f:
             line = line.strip()
             # ignore invalid words.
             if not line or " " in line or len(line) > 50:
                 continue
-            words.append(line)
-    return uromanize(words)
-def process(audio_data, words_file, lm_path=None):
     if isinstance(audio_data, tuple):
         # microphone
         sr, audio_samples = audio_data
@@ -101,17 +97,18 @@ def process(audio_data, words_file, lm_path=None):
         audio_samples = librosa.load(audio_data, sr=ASR_SAMPLING_RATE, mono=True)[0]
     # print(audio_samples[:10])
     # print("I'm here 102")
-    # print("len audio_samples", len(audio_samples))
     lang_code = "eng"
-    processor.tokenizer.set_target_lang(lang_code)
     # print("I'm here 107")
-    model.load_adapter(lang_code)
     # print("I'm here 109")
     inputs = processor(
         audio_samples, sampling_rate=ASR_SAMPLING_RATE, return_tensors="pt"
     )
     # print("I'm here 106")
     # set device
     if torch.cuda.is_available():
         device = torch.device("cuda")
@@ -123,27 +120,37 @@ def process(audio_data, words_file, lm_path=None):
         device = torch.device("mps")
     else:
         device = torch.device("cpu")
     model.to(device)
     inputs = inputs.to(device)
     # print("I'm here 122")
     with torch.no_grad():
         outputs = model(**inputs).logits
-    # Setup lexicon and decoder
     # print("before uroman")
     lexicon = load_lexicon(words_file)
     # print("after uroman")
     # print("len lexicon", len(lexicon))
     with tempfile.NamedTemporaryFile() as lexicon_file:
         with open(lexicon_file.name, "w") as f:
             idx = 10
             for word, spelling in lexicon.items():
                 f.write(word + " " + spelling + "\n")
-                if idx%100 == 0:
                     print(word, spelling, flush=True)
-                idx+=1
         beam_search_decoder = ctc_decoder(
             lexicon=lexicon_file.name,
             tokens=token_file,
@@ -151,9 +158,9 @@ def process(audio_data, words_file, lm_path=None):
             nbest=1,
             beam_size=500,
             beam_size_token=50,
-            lm_weight=float(decoding_config["lmweight"]),
-            word_score=float(decoding_config["wordscore"]),
-            sil_score=float(decoding_config["silweight"]),
             blank_token="<s>",
         )
@@ -163,8 +170,6 @@ def process(audio_data, words_file, lm_path=None):
     return transcription
-ZS_EXAMPLES = [
-    ["upload/english.mp3", "upload/words_top10k.txt"]
-]
-# print(process("upload/english.mp3", "upload/words_top10k.txt"))

 # Sampling rate (Hz) that audio is resampled to before it is fed to the model.
 ASR_SAMPLING_RATE = 16_000
 # Decoder defaults applied by process() when the "use default" flags are set.
 WORD_SCORE_DEFAULT_IF_LM = -0.18
 WORD_SCORE_DEFAULT_IF_NOLM = -3.5
 LM_SCORE_DEFAULT = 1.48
 # The original names misspell "DEFAULT"; keep them as aliases so existing
 # references to the old spelling continue to resolve.
 WORD_SCORE_DEAULT_IF_LM = WORD_SCORE_DEFAULT_IF_LM
 WORD_SCORE_DEAULT_IF_NOLM = WORD_SCORE_DEFAULT_IF_NOLM
 LM_SCORE_DEAULT = LM_SCORE_DEFAULT
 # Model, processor and decoder token list are loaded from the local checkpoint directory.
 MODEL_ID = "upload/mms_zs"
 processor = AutoProcessor.from_pretrained(MODEL_ID)
 model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)
 token_file = "upload/mms_zs/tokens.txt"
 def error_check_file(filepath):
     if not isinstance(filepath, str):
     if not os.path.exists(filepath):
         return "Input file '{}' doesn't exists".format(type(filepath))
 def norm_uroman(text):
     """Reduce romanized text to lowercase a-z, apostrophes and single spaces.

     Curly apostrophes become straight ones, every other character outside
     the allowed set becomes a space, runs of spaces are collapsed, and the
     result is stripped of leading/trailing whitespace.
     """
     lowered = text.lower().replace("’", "'")
     # Anything that is not a-z, apostrophe or space is blanked out.
     blanked = re.sub("([^a-z' ])", " ", lowered)
     return re.sub(" +", " ", blanked).strip()
 def uromanize(words):
     iso = "xxx"
     with tempfile.NamedTemporaryFile() as tf, tempfile.NamedTemporaryFile() as tf2:
         lexicon = {}
         with open(tf2.name) as f:
             for idx, line in enumerate(f):
+                if not line.strip():
+                    continue
                 line = re.sub(r"\s+", " ", norm_uroman(line)).strip()
                 lexicon[words[idx]] = " ".join(line) + " |"
     return lexicon
 def load_lexicon(filepath):
     """Build a lexicon from a text file of words or sentences.

     Each line is split on whitespace, so the file may hold one word per
     line or whole sentences. Tokens are lower-cased and de-duplicated in
     first-seen order; tokens longer than 50 characters are ignored.

     Args:
         filepath: Path to the text file (read as UTF-8).

     Returns:
         Whatever ``uromanize`` returns for the collected word list.
     """
     # A dict is used as an insertion-ordered set.
     words = {}
     with open(filepath, encoding="utf-8") as f:
         for line in f:
             # Previously any line containing a space was skipped outright,
             # so sentences never contributed words; filter per token instead.
             for w in line.split():
                 # ignore invalid words.
                 if len(w) > 50:
                     continue
                 words[w.lower()] = True
     return uromanize(list(words))
+def process(
+    audio_data,
+    words_file,
+    lm_path=None,
+    wscore=None,
+    lmscore=None,
+    wscore_usedefault=True,
+    lmscore_usedefault=True,
+):
     if isinstance(audio_data, tuple):
         # microphone
         sr, audio_samples = audio_data
         audio_samples = librosa.load(audio_data, sr=ASR_SAMPLING_RATE, mono=True)[0]
     # print(audio_samples[:10])
     # print("I'm here 102")
+    print("len audio_samples", len(audio_samples))
     lang_code = "eng"
+    # processor.tokenizer.set_target_lang(lang_code)
     # print("I'm here 107")
+    # model.load_adapter(lang_code)
     # print("I'm here 109")
     inputs = processor(
         audio_samples, sampling_rate=ASR_SAMPLING_RATE, return_tensors="pt"
     )
     # print("I'm here 106")
+    print("inputs type", type(inputs))
+    # print("inputs size", inputs.size)
     # set device
     if torch.cuda.is_available():
         device = torch.device("cuda")
         device = torch.device("mps")
     else:
         device = torch.device("cpu")
+    device = torch.device("cpu")
     model.to(device)
     inputs = inputs.to(device)
     # print("I'm here 122")
     with torch.no_grad():
         outputs = model(**inputs).logits
+    # Setup lexicon and decoder
     # print("before uroman")
     lexicon = load_lexicon(words_file)
     # print("after uroman")
     # print("len lexicon", len(lexicon))
     with tempfile.NamedTemporaryFile() as lexicon_file:
         with open(lexicon_file.name, "w") as f:
             idx = 10
             for word, spelling in lexicon.items():
                 f.write(word + " " + spelling + "\n")
+                if idx % 100 == 0:
                     print(word, spelling, flush=True)
+                idx += 1
+        if wscore_usedefault:
+            wscore = (
+                WORD_SCORE_DEAULT_IF_LM
+                if lm_path is not None
+                else WORD_SCORE_DEAULT_IF_NOLM
+            )
+        if lmscore_usedefault:
+            lmscore = LM_SCORE_DEAULT if lm_path is not None else 0
         beam_search_decoder = ctc_decoder(
             lexicon=lexicon_file.name,
             tokens=token_file,
             nbest=1,
             beam_size=500,
             beam_size_token=50,
+            lm_weight=lmscore,
+            word_score=wscore,
+            sil_score=0,
             blank_token="<s>",
         )
     return transcription
+ZS_EXAMPLES = [["upload/english.mp3", "upload/words_top10k.txt"]]
+print(process("upload/english.mp3", "upload/words_top10k.txt"))