cmagui committed
Commit 740d8bb
1 Parent(s): b23186b

First model version (wiki-gl LM)
README.md ADDED
@@ -0,0 +1,64 @@
+
+ ---
+ language: gl
+ datasets:
+ - OpenSLR 77
+ metrics:
+ - wer
+ - cer
+ tags:
+ - audio
+ - automatic-speech-recognition
+ - gl
+ license:
+ model-index:
+ - name: Wav2Vec2-Large-XLSR-53-Galician-With-LM
+   results:
+   - task:
+       name: Automatic Speech Recognition
+       type: automatic-speech-recognition
+     dataset:
+       name: OpenSLR
+       type: openslr
+       args: gl
+     metrics:
+     - name: Test WER
+       type: wer
+       value: 9.10
+     - name: Test CER
+       type: cer
+       value: 3.94
+     - name: Test WER (+LM)
+       type: wer
+       value: 6.86
+     - name: Test CER (+LM)
+       type: cer
+       value: 2.20
+   - task:
+       name: Automatic Speech Recognition
+       type: automatic-speech-recognition
+     dataset:
+       name: Common Voice 7.0
+       type: mozilla-foundation/common_voice_7_0
+       args: gl
+     metrics:
+     - name: Test WER
+       type: wer
+       value: 22.12
+     - name: Test CER
+       type: cer
+       value: 5.09
+     - name: Test WER (+LM)
+       type: wer
+       value: 15.20
+     - name: Test CER (+LM)
+       type: cer
+       value: 3.87
+
+
+ ---
+
+ ## Wav2Vec2-Large-XLSR-53-Galician-With-LM
+
+ This is a copy of the model [diego-fustes/wav2vec2-large-xlsr-gl](https://huggingface.co/diego-fustes/wav2vec2-large-xlsr-gl) with an integrated language model.
+
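For reference, a minimal inference sketch (not part of the committed README) showing how the processor-with-LM shipped in this commit can be used; the checkout path `./` and the file name `example.wav` are placeholders:

```python
import torch
import torchaudio
from transformers import AutoModelForCTC, Wav2Vec2ProcessorWithLM

model_path = "./"  # local clone of this repository (or the hub id of this model)
processor = Wav2Vec2ProcessorWithLM.from_pretrained(model_path, eos_token=None, bos_token=None)
model = AutoModelForCTC.from_pretrained(model_path)

# load a speech file and resample it to the 16 kHz rate the model expects
speech, sr = torchaudio.load("example.wav")  # placeholder file name
speech = torchaudio.transforms.Resample(sr, 16_000)(speech).squeeze().numpy()

inputs = processor(speech, sampling_rate=16_000, return_tensors="pt", padding=True)
with torch.no_grad():
    logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits

# beam-search decoding through the bundled KenLM language model
print(processor.batch_decode(logits.numpy()).text[0])
```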
added_tokens.json ADDED
@@ -0,0 +1 @@
+ {}
alphabet.json ADDED
@@ -0,0 +1 @@
+ {"labels": ["g", "m", "n", "y", "x", "s", "e", "\u00e1", " ", "\u00f3", "w", "\u00ed", "i", "\u00f1", "q", "c", "j", "h", "p", "l", "u", "d", "\u00e9", "z", "o", "\u00fa", "r", "b", "f", "k", "v", "t", "a", "\u2047", ""], "is_bpe": false}
config.json ADDED
@@ -0,0 +1,76 @@
+ {
+   "_name_or_path": "facebook/wav2vec2-large-xlsr-53",
+   "activation_dropout": 0.0,
+   "apply_spec_augment": true,
+   "architectures": [
+     "Wav2Vec2ForCTC"
+   ],
+   "attention_dropout": 0.1,
+   "bos_token_id": 1,
+   "conv_bias": true,
+   "conv_dim": [
+     512,
+     512,
+     512,
+     512,
+     512,
+     512,
+     512
+   ],
+   "conv_kernel": [
+     10,
+     3,
+     3,
+     3,
+     3,
+     2,
+     2
+   ],
+   "conv_stride": [
+     5,
+     2,
+     2,
+     2,
+     2,
+     2,
+     2
+   ],
+   "ctc_loss_reduction": "mean",
+   "ctc_zero_infinity": false,
+   "do_stable_layer_norm": true,
+   "eos_token_id": 2,
+   "feat_extract_activation": "gelu",
+   "feat_extract_dropout": 0.0,
+   "feat_extract_norm": "layer",
+   "feat_proj_dropout": 0.0,
+   "final_dropout": 0.0,
+   "gradient_checkpointing": true,
+   "hidden_act": "gelu",
+   "hidden_dropout": 0.1,
+   "hidden_size": 1024,
+   "initializer_range": 0.02,
+   "intermediate_size": 4096,
+   "layer_norm_eps": 1e-05,
+   "layerdrop": 0.1,
+   "mask_channel_length": 10,
+   "mask_channel_min_space": 1,
+   "mask_channel_other": 0.0,
+   "mask_channel_prob": 0.0,
+   "mask_channel_selection": "static",
+   "mask_feature_length": 10,
+   "mask_feature_prob": 0.0,
+   "mask_time_length": 10,
+   "mask_time_min_space": 1,
+   "mask_time_other": 0.0,
+   "mask_time_prob": 0.05,
+   "mask_time_selection": "static",
+   "model_type": "wav2vec2",
+   "num_attention_heads": 16,
+   "num_conv_pos_embedding_groups": 16,
+   "num_conv_pos_embeddings": 128,
+   "num_feat_extract_layers": 7,
+   "num_hidden_layers": 24,
+   "pad_token_id": 34,
+   "transformers_version": "4.4.0",
+   "vocab_size": 35
+ }
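As a quick sanity check (an illustration, not repository code), the convolutional feature-extractor settings above fix the model's frame rate: with strides 5·2·2·2·2·2·2 the raw waveform is downsampled by a factor of 320, i.e. one CTC frame every 20 ms at 16 kHz.

```python
# Illustrative only: derive the CTC frame rate from the conv_stride values in config.json.
from math import prod

conv_stride = [5, 2, 2, 2, 2, 2, 2]
samples_per_frame = prod(conv_stride)                   # 320 samples per output frame
frame_duration_ms = 1000 * samples_per_frame / 16_000   # 20.0 ms per CTC frame
print(samples_per_frame, frame_duration_ms)
```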
flax_model.msgpack ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6aac96ca5e6a849b3f26720862aa09b4afe1512adf00e44e5e8a8a99de1e4147
+ size 1261913772
language_model/attrs.json ADDED
@@ -0,0 +1 @@
+ {"alpha": 0.5, "beta": 1.5, "unk_score_offset": -10.0, "score_boundary": true}
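These are the decoder hyperparameters that Wav2Vec2ProcessorWithLM hands to pyctcdecode: alpha weights the language-model score against the acoustic model, beta is the word-insertion bonus, and unk_score_offset penalises out-of-vocabulary tokens. A rough sketch of the equivalent manual setup (assuming pyctcdecode and kenlm are installed; the file paths refer to this repository):

```python
import json
from pyctcdecode import build_ctcdecoder  # requires the pyctcdecode and kenlm packages

with open("alphabet.json") as f:
    labels = json.load(f)["labels"]

decoder = build_ctcdecoder(
    labels,
    kenlm_model_path="language_model/wiki-gl.arpa.bin",
    alpha=0.5,                # language-model weight
    beta=1.5,                 # word-insertion bonus
    unk_score_offset=-10.0,   # penalty for unknown words
    lm_score_boundary=True,   # "score_boundary" in attrs.json
)
# decoder.decode(logits) then decodes a (time, vocab_size) numpy array of CTC logits.
```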
language_model/unigrams.txt ADDED
The diff for this file is too large to render. See raw diff
 
language_model/wiki-gl.arpa.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bc855b7e4fd8f2cc980fed3baddeb2545b32e6363b2edf5fe4c6a23c1e878b71
+ size 353924525
preprocessor_config.json ADDED
@@ -0,0 +1,8 @@
+ {
+   "do_normalize": true,
+   "feature_size": 1,
+   "padding_side": "right",
+   "padding_value": 0.0,
+   "return_attention_mask": true,
+   "sampling_rate": 16000
+ }
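These settings describe the Wav2Vec2 feature extractor: mono 16 kHz input (feature_size 1), zero-mean/unit-variance normalisation, right padding with 0.0, and an attention mask so padded samples are ignored. A small sketch of applying them (illustrative; the array is a dummy signal):

```python
import numpy as np
from transformers import Wav2Vec2FeatureExtractor

feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("./")  # reads preprocessor_config.json

dummy = np.random.randn(16_000).astype(np.float32)  # one second of fake 16 kHz audio
features = feature_extractor(dummy, sampling_rate=16_000, return_tensors="pt", padding=True)
print(features.input_values.shape, features.attention_mask.shape)  # normalised input + padding mask
```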
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e56c670e29cbd696e26a5006d4c6126a6a08ca82fec4fc226ad47b2cf650623e
+ size 1262077335
special_tokens_map.json ADDED
@@ -0,0 +1 @@
+ {"bos_token": null, "eos_token": null, "unk_token": "[UNK]", "pad_token": "[PAD]"}
tokenizer_config.json ADDED
@@ -0,0 +1 @@
+ {"unk_token": "[UNK]", "bos_token": null, "eos_token": null, "pad_token": "[PAD]", "do_lower_case": false, "word_delimiter_token": "|"}
vocab.json ADDED
@@ -0,0 +1 @@
+ {"g": 0, "m": 1, "n": 2, "y": 3, "x": 4, "s": 5, "e": 6, "á": 7, "ó": 9, "w": 10, "í": 11, "i": 12, "ñ": 13, "q": 14, "c": 15, "j": 16, "h": 17, "p": 18, "l": 19, "u": 20, "d": 21, "é": 22, "z": 23, "o": 24, "ú": 25, "r": 26, "b": 27, "f": 28, "k": 29, "v": 30, "t": 31, "a": 32, "|": 8, "[UNK]": 33, "[PAD]": 34}
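The vocabulary is character level, 35 entries in total (matching vocab_size in config.json): 32 Galician letters plus the word delimiter "|" (id 8), "[UNK]" (id 33), and "[PAD]" (id 34), which also serves as the CTC blank (pad_token_id 34 in config.json). An illustrative sketch of loading it directly:

```python
from transformers import Wav2Vec2CTCTokenizer

# Build the character-level tokenizer from this repository's vocab.json.
tokenizer = Wav2Vec2CTCTokenizer(
    "vocab.json",
    unk_token="[UNK]",
    pad_token="[PAD]",
    word_delimiter_token="|",
)
print(tokenizer.vocab_size)                         # 35
print(tokenizer("un exemplo en galego").input_ids)  # character ids; spaces map to "|"
```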
wav2vec_gl_wer_cv.py ADDED
@@ -0,0 +1,123 @@
+ import torch
+ import torchaudio
+ from datasets import load_dataset, load_metric, Audio
+ from transformers import Wav2Vec2Processor, AutoModelForCTC, Wav2Vec2ProcessorWithLM
+ import numpy
+ import re
+ import sys
+ import random
+
+
+ # decide via the command line whether the language model should be used for decoding
+ do_lm = bool(int(sys.argv[1]))
+ # set the number of random examples to be shown via the command line
+ n_elements = int(sys.argv[2])
+ #eval_size = int(sys.argv[3])
+ print("\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n")
+ print("Decoding with language model\n") if do_lm else print("Decoding without language model\n")
+ print("\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n")
+
+ # Empty cache
+ torch.cuda.empty_cache()
+
+ # set device
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+
+ # load dataset
+ common_voice_test = load_dataset("mozilla-foundation/common_voice_7_0", "gl", split="test")
+ #common_voice_test = load_dataset("mozilla-foundation/common_voice_7_0", "gl", split="test[:1%]")
+ print("\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n")
+ print("Common Voice test dataset:\n")
+ print(common_voice_test)
+ print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n")
+ print("Number of elements in Common Voice test dataset:", common_voice_test.num_rows, "\n")
+
+ # load metrics
+ # the predominant metric in ASR is the word error rate (WER)
+ wer = load_metric("wer")
+ cer = load_metric("cer")
+
+ # Chars to be removed
+ chars_to_remove_regex = '[^A-Za-záéíóúñüÁÉÍÓÚÑÜ\- ]'
+ #chars_to_remove_regex = '[\,\¿\?\.\¡\!\;\:\"\n\t()\{\}\[\]]'
+
+ # load model and processor
+ model_path = "./"
+ processor = Wav2Vec2ProcessorWithLM.from_pretrained(model_path, eos_token=None, bos_token=None) if do_lm else Wav2Vec2Processor.from_pretrained(model_path)
+ model = AutoModelForCTC.from_pretrained(model_path).to(device)
+
+ # Remove special characters and apply lowercase normalization
+ def remove_special_characters(batch):
+     batch["sentence"] = re.sub(chars_to_remove_regex, '', batch["sentence"]).lower()
+     return batch
+
+ # Preprocessing the dataset
+ def prepare_dataset(batch):
+     # batched output is "un-batched"
+     audio = batch["audio"]
+     batch["input_values"] = processor(audio["array"], sampling_rate=audio["sampling_rate"]).input_values[0]
+     batch["input_length"] = len(batch["input_values"])
+
+     with processor.as_target_processor():
+         batch["labels"] = processor(batch["sentence"]).input_ids
+     return batch
+
+ # Evaluation of the model
+ def evaluate(batch):
+     inputs = processor(batch["input_values"], sampling_rate=16_000, return_tensors="pt", padding=True).to(device)
+     with torch.no_grad():
+         #logits = model(inputs.input_values.to(device), attention_mask=inputs.attention_mask.to(device)).logits
+         logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
+
+     if do_lm:
+         # beam-search decoding with the KenLM language model
+         # batch["pred_strings"] = processor.batch_decode(logits.detach().numpy()).text
+         batch["pred_strings"] = processor.batch_decode(logits.cpu().numpy()).text
+     else:
+         # greedy (argmax) CTC decoding
+         pred_ids = torch.argmax(logits, dim=-1)
+         batch["pred_strings"] = processor.batch_decode(pred_ids)
+
+     return batch
+
+ # Show N random elements of the dataset
+ def show_random_elements(dataset, num_examples):
+     assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
+     picks = []
+     for _ in range(num_examples):
+         pick = random.randint(0, len(dataset)-1)
+         while pick in picks:
+             pick = random.randint(0, len(dataset)-1)
+         picks.append(pick)
+
+     # Print headings
+     print(f"\n{'Id':<4}{'File':<14}{'P':<3}{'N':<3}{'Sentence':<95}{'Prediction':<95}\n")
+     # Print data
+     for i in range(0, num_examples):
+         row = picks[i]
+         path = dataset[row]["path"][-12:]
+         up_votes = dataset[row]["up_votes"]
+         down_votes = dataset[row]["down_votes"]
+         reference = dataset[row]["sentence"]
+         prediction = dataset[row]["pred_strings"]
+         print(f"{i:<4}{path:<14}{up_votes:<3}{down_votes:<3}{reference:<95}{prediction:<95}")
+
+ # Remove special characters and apply lowercase normalization
+ test_dataset = common_voice_test.map(remove_special_characters)
+
+ # resample to 16 kHz
+ test_dataset = test_dataset.cast_column("audio", Audio(sampling_rate=16_000))
+
+ # Prepare dataset
+ test_dataset = test_dataset.map(prepare_dataset)
+
+ # Evaluate dataset
+ result = test_dataset.map(evaluate, batched=True, batch_size=8)
+
+ print("\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n")
+ print(f"Showing {n_elements} random elements:\n")
+ show_random_elements(result, n_elements)
+
+
+ print("\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
+ print("WER: {:.2f}".format(100 * wer.compute(references=result["sentence"], predictions=result["pred_strings"])))
+ print("CER: {:.2f}".format(100 * cer.compute(references=result["sentence"], predictions=result["pred_strings"])))
+ print("\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
wav2vec_gl_wer_slr77.py ADDED
@@ -0,0 +1,118 @@
+ import torch
+ import torchaudio
+ from datasets import load_dataset, load_metric, Audio
+ from transformers import Wav2Vec2Processor, AutoModelForCTC, Wav2Vec2ProcessorWithLM
+ import numpy
+ import re
+ import sys
+ import random
+ import pandas as pd
+
+ # decide via the command line whether the language model should be used for decoding
+ do_lm = bool(int(sys.argv[1]))
+ # set the number of random examples to be shown via the command line
+ n_elements = int(sys.argv[2])
+ #eval_size = int(sys.argv[2])
+ print("\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n")
+ print("Decoding with language model\n") if do_lm else print("Decoding without language model\n")
+ print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n")
+
+ # Empty cache
+ torch.cuda.empty_cache()
+
+ # set device
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+
+ # load dataset
+ #test_dataset = load_dataset("openslr", "SLR77", split="train[:1%]")
+ slr77_test = load_dataset("json", data_files='../xlsr-fine-tuning-gl/elra_test_manifest2.json')
+ print("\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n")
+ print("SLR77 test:\n")
+ print(slr77_test)
+ print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n")
+ print("Number of elements in SLR77 test dataset:", slr77_test["train"].num_rows, "\n")
+
+ # load metrics
+ # the predominant metric in ASR is the word error rate (WER)
+ wer = load_metric("wer")
+ cer = load_metric("cer")
+
+ # Chars to be removed
+ chars_to_remove_regex = '[^A-Za-záéíóúñüÁÉÍÓÚÑÜ\- ]'
+ #chars_to_remove_regex = '[\,\¿\?\.\¡\!\;\:\"\n\t()\{\}\[\]]'
+
+ # load model and processor
+ model_path = "./"
+ processor = Wav2Vec2ProcessorWithLM.from_pretrained(model_path, eos_token=None, bos_token=None) if do_lm else Wav2Vec2Processor.from_pretrained(model_path)
+ model = AutoModelForCTC.from_pretrained(model_path).to(device)
+
+ # the SLR77 recordings are 48 kHz; the model expects 16 kHz
+ resampler = torchaudio.transforms.Resample(48_000, 16_000)
+
+ # Remove special characters and apply lowercase normalization
+ def remove_special_characters(batch):
+     batch["text"] = re.sub(chars_to_remove_regex, '', batch["text"]).lower()
+     return batch
+
+ # Preprocessing the datasets.
+ # We need to read the audio files as arrays
+ def prepare_dataset(batch):
+     # batched output is "un-batched"
+     speech_array, sampling_rate = torchaudio.load(batch["audio_filepath"])
+     # resample to 16 kHz
+     batch["speech"] = resampler(speech_array).squeeze().numpy()
+     return batch
+
+ # Evaluation of the model.
+ def evaluate(batch):
+     inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True).to(device)
+     with torch.no_grad():
+         logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
+
+     if do_lm:
+         # beam-search decoding with the KenLM language model
+         # batch["pred_strings"] = processor.batch_decode(logits.detach().numpy())
+         batch["pred_strings"] = processor.batch_decode(logits.cpu().numpy()).text
+     else:
+         # greedy (argmax) CTC decoding
+         pred_ids = torch.argmax(logits, dim=-1)
+         batch["pred_strings"] = processor.batch_decode(pred_ids)
+
+     return batch
+
+ # Show N random elements of the dataset
+ def show_random_elements(dataset, num_examples):
+     assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
+     picks = []
+     for _ in range(num_examples):
+         pick = random.randint(0, len(dataset)-1)
+         while pick in picks:
+             pick = random.randint(0, len(dataset)-1)
+         picks.append(pick)
+     #picks = [74, 77, 66, 682, 556, 603, 394, 420, 384, 789, 735, 696, 6, 294, 497, 421]
+
+     # Print headings
+     print(f"\n{'Row':<4}{'File':<28}{'Sentence':<105}{'Prediction':<105}\n")
+     # Print data
+     for i in range(0, num_examples):
+         row = picks[i]
+         path = dataset[row]["audio_filepath"][-25:]
+         reference = dataset[row]["text"]
+         prediction = dataset[row]["pred_strings"]
+         print(f"{row:<4}{path:<28}{reference:<105}{prediction:<105}")
+
+
+ # Remove special characters and apply lowercase normalization
+ test_dataset = slr77_test.map(remove_special_characters)
+
+ # Prepare dataset
+ test_dataset = test_dataset.map(prepare_dataset)
+
+ # Evaluate dataset
+ result = test_dataset.map(evaluate, batched=True, batch_size=8)
+
+ print("\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n")
+ print(f"Showing {n_elements} random elements:\n")
+ show_random_elements(result["train"], n_elements)
+
+
+ print("\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
+ print("WER: {:.2f}".format(100 * wer.compute(references=result["train"]["text"], predictions=result["train"]["pred_strings"])))
+ print("CER: {:.2f}".format(100 * cer.compute(references=result["train"]["text"], predictions=result["train"]["pred_strings"])))
+ print("\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")