cmagui committed
Commit 740d8bb
1 Parent(s): b23186b

First model version (wiki-gl LM)
README.md ADDED
@@ -0,0 +1,64 @@
+
+ ---
+ language: gl
+ datasets:
+ - OpenSLR 77
+ metrics:
+ - wer
+ - cer
+ tags:
+ - audio
+ - automatic-speech-recognition
+ - gl
+ license:
+ model-index:
+ - name: Wav2Vec2-Large-XLSR-53-Galician-With-LM
+   results:
+   - task:
+       name: Automatic Speech Recognition
+       type: automatic-speech-recognition
+     dataset:
+       name: OpenSLR
+       type: openslr
+       args: gl
+     metrics:
+     - name: Test WER
+       type: wer
+       value: 9.10
+     - name: Test CER
+       type: cer
+       value: 3.94
+     - name: Test WER (+LM)
+       type: wer
+       value: 6.86
+     - name: Test CER (+LM)
+       type: cer
+       value: 2.20
+   - task:
+       name: Automatic Speech Recognition
+       type: automatic-speech-recognition
+     dataset:
+       name: Common Voice 7.0
+       type: mozilla-foundation/common_voice_7_0
+       args: gl
+     metrics:
+     - name: Test WER
+       type: wer
+       value: 22.12
+     - name: Test CER
+       type: cer
+       value: 5.09
+     - name: Test WER (+LM)
+       type: wer
+       value: 15.20
+     - name: Test CER (+LM)
+       type: cer
+       value: 3.87
+
+
+ ---
+
+ ## Wav2Vec2-Large-XLSR-53-Galician-With-LM
+
+ This is a copy of the model [diego-fustes/wav2vec2-large-xlsr-gl](https://huggingface.co/diego-fustes/wav2vec2-large-xlsr-gl) with an integrated language model.
+
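For reference, a minimal inference sketch (not part of the committed README) showing how the processor-with-LM shipped in this commit can be used; the checkout path `./` and the file name `example.wav` are placeholders:

```python
import torch
import torchaudio
from transformers import AutoModelForCTC, Wav2Vec2ProcessorWithLM

model_path = "./"  # local clone of this repository (or the hub id of this model)
processor = Wav2Vec2ProcessorWithLM.from_pretrained(model_path, eos_token=None, bos_token=None)
model = AutoModelForCTC.from_pretrained(model_path)

# load a speech file and resample it to the 16 kHz rate the model expects
speech, sr = torchaudio.load("example.wav")  # placeholder file name
speech = torchaudio.transforms.Resample(sr, 16_000)(speech).squeeze().numpy()

inputs = processor(speech, sampling_rate=16_000, return_tensors="pt", padding=True)
with torch.no_grad():
    logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits

# beam-search decoding through the bundled KenLM language model
print(processor.batch_decode(logits.numpy()).text[0])
```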
added_tokens.json ADDED
@@ -0,0 +1 @@
+ {}
alphabet.json ADDED
@@ -0,0 +1 @@
+ {"labels": ["g", "m", "n", "y", "x", "s", "e", "\u00e1", " ", "\u00f3", "w", "\u00ed", "i", "\u00f1", "q", "c", "j", "h", "p", "l", "u", "d", "\u00e9", "z", "o", "\u00fa", "r", "b", "f", "k", "v", "t", "a", "\u2047", ""], "is_bpe": false}
config.json ADDED
@@ -0,0 +1,76 @@
+ {
+   "_name_or_path": "facebook/wav2vec2-large-xlsr-53",
+   "activation_dropout": 0.0,
+   "apply_spec_augment": true,
+   "architectures": [
+     "Wav2Vec2ForCTC"
+   ],
+   "attention_dropout": 0.1,
+   "bos_token_id": 1,
+   "conv_bias": true,
+   "conv_dim": [
+     512,
+     512,
+     512,
+     512,
+     512,
+     512,
+     512
+   ],
+   "conv_kernel": [
+     10,
+     3,
+     3,
+     3,
+     3,
+     2,
+     2
+   ],
+   "conv_stride": [
+     5,
+     2,
+     2,
+     2,
+     2,
+     2,
+     2
+   ],
+   "ctc_loss_reduction": "mean",
+   "ctc_zero_infinity": false,
+   "do_stable_layer_norm": true,
+   "eos_token_id": 2,
+   "feat_extract_activation": "gelu",
+   "feat_extract_dropout": 0.0,
+   "feat_extract_norm": "layer",
+   "feat_proj_dropout": 0.0,
+   "final_dropout": 0.0,
+   "gradient_checkpointing": true,
+   "hidden_act": "gelu",
+   "hidden_dropout": 0.1,
+   "hidden_size": 1024,
+   "initializer_range": 0.02,
+   "intermediate_size": 4096,
+   "layer_norm_eps": 1e-05,
+   "layerdrop": 0.1,
+   "mask_channel_length": 10,
+   "mask_channel_min_space": 1,
+   "mask_channel_other": 0.0,
+   "mask_channel_prob": 0.0,
+   "mask_channel_selection": "static",
+   "mask_feature_length": 10,
+   "mask_feature_prob": 0.0,
+   "mask_time_length": 10,
+   "mask_time_min_space": 1,
+   "mask_time_other": 0.0,
+   "mask_time_prob": 0.05,
+   "mask_time_selection": "static",
+   "model_type": "wav2vec2",
+   "num_attention_heads": 16,
+   "num_conv_pos_embedding_groups": 16,
+   "num_conv_pos_embeddings": 128,
+   "num_feat_extract_layers": 7,
+   "num_hidden_layers": 24,
+   "pad_token_id": 34,
+   "transformers_version": "4.4.0",
+   "vocab_size": 35
+ }
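As a quick sanity check (an illustration, not repository code), the convolutional feature-extractor settings above fix the model's frame rate: with strides 5·2·2·2·2·2·2 the raw waveform is downsampled by a factor of 320, i.e. one CTC frame every 20 ms at 16 kHz.

```python
# Illustrative only: derive the CTC frame rate from the conv_stride values in config.json.
from math import prod

conv_stride = [5, 2, 2, 2, 2, 2, 2]
samples_per_frame = prod(conv_stride)                   # 320 samples per output frame
frame_duration_ms = 1000 * samples_per_frame / 16_000   # 20.0 ms per CTC frame
print(samples_per_frame, frame_duration_ms)
```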
flax_model.msgpack ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6aac96ca5e6a849b3f26720862aa09b4afe1512adf00e44e5e8a8a99de1e4147
+ size 1261913772
language_model/attrs.json ADDED
@@ -0,0 +1 @@
+ {"alpha": 0.5, "beta": 1.5, "unk_score_offset": -10.0, "score_boundary": true}
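These are the decoder hyperparameters that Wav2Vec2ProcessorWithLM hands to pyctcdecode: alpha weights the language-model score against the acoustic model, beta is the word-insertion bonus, and unk_score_offset penalises out-of-vocabulary tokens. A rough sketch of the equivalent manual setup (assuming pyctcdecode and kenlm are installed; the file paths refer to this repository):

```python
import json
from pyctcdecode import build_ctcdecoder  # requires the pyctcdecode and kenlm packages

with open("alphabet.json") as f:
    labels = json.load(f)["labels"]

decoder = build_ctcdecoder(
    labels,
    kenlm_model_path="language_model/wiki-gl.arpa.bin",
    alpha=0.5,                # language-model weight
    beta=1.5,                 # word-insertion bonus
    unk_score_offset=-10.0,   # penalty for unknown words
    lm_score_boundary=True,   # "score_boundary" in attrs.json
)
# decoder.decode(logits) then decodes a (time, vocab_size) numpy array of CTC logits.
```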
language_model/unigrams.txt ADDED
The diff for this file is too large to render. See raw diff
 
language_model/wiki-gl.arpa.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bc855b7e4fd8f2cc980fed3baddeb2545b32e6363b2edf5fe4c6a23c1e878b71
+ size 353924525
preprocessor_config.json ADDED
@@ -0,0 +1,8 @@
+ {
+   "do_normalize": true,
+   "feature_size": 1,
+   "padding_side": "right",
+   "padding_value": 0.0,
+   "return_attention_mask": true,
+   "sampling_rate": 16000
+ }
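These settings describe the Wav2Vec2 feature extractor: mono 16 kHz input (feature_size 1), zero-mean/unit-variance normalisation, right padding with 0.0, and an attention mask so padded samples are ignored. A small sketch of applying them (illustrative; the array is a dummy signal):

```python
import numpy as np
from transformers import Wav2Vec2FeatureExtractor

feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("./")  # reads preprocessor_config.json

dummy = np.random.randn(16_000).astype(np.float32)  # one second of fake 16 kHz audio
features = feature_extractor(dummy, sampling_rate=16_000, return_tensors="pt", padding=True)
print(features.input_values.shape, features.attention_mask.shape)  # normalised input + padding mask
```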
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e56c670e29cbd696e26a5006d4c6126a6a08ca82fec4fc226ad47b2cf650623e
+ size 1262077335
special_tokens_map.json ADDED
@@ -0,0 +1 @@
+ {"bos_token": null, "eos_token": null, "unk_token": "[UNK]", "pad_token": "[PAD]"}
tokenizer_config.json ADDED
@@ -0,0 +1 @@
+ {"unk_token": "[UNK]", "bos_token": null, "eos_token": null, "pad_token": "[PAD]", "do_lower_case": false, "word_delimiter_token": "|"}
vocab.json ADDED
@@ -0,0 +1 @@
+ {"g": 0, "m": 1, "n": 2, "y": 3, "x": 4, "s": 5, "e": 6, "á": 7, "ó": 9, "w": 10, "í": 11, "i": 12, "ñ": 13, "q": 14, "c": 15, "j": 16, "h": 17, "p": 18, "l": 19, "u": 20, "d": 21, "é": 22, "z": 23, "o": 24, "ú": 25, "r": 26, "b": 27, "f": 28, "k": 29, "v": 30, "t": 31, "a": 32, "|": 8, "[UNK]": 33, "[PAD]": 34}
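The vocabulary is character level, 35 entries in total (matching vocab_size in config.json): 32 Galician letters plus the word delimiter "|" (id 8), "[UNK]" (id 33), and "[PAD]" (id 34), which also serves as the CTC blank (pad_token_id 34 in config.json). An illustrative sketch of loading it directly:

```python
from transformers import Wav2Vec2CTCTokenizer

# Build the character-level tokenizer from this repository's vocab.json.
tokenizer = Wav2Vec2CTCTokenizer(
    "vocab.json",
    unk_token="[UNK]",
    pad_token="[PAD]",
    word_delimiter_token="|",
)
print(tokenizer.vocab_size)                         # 35
print(tokenizer("un exemplo en galego").input_ids)  # character ids; spaces map to "|"
```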
wav2vec_gl_wer_cv.py ADDED
@@ -0,0 +1,123 @@
+ import torch
+ import torchaudio
+ from datasets import load_dataset, load_metric, Audio
+ from transformers import Wav2Vec2Processor, AutoModelForCTC, Wav2Vec2ProcessorWithLM
+ import numpy
+ import re
+ import sys
+ import random
+
+
+ # decide via the command line whether the language model should be used for decoding
+ do_lm = bool(int(sys.argv[1]))
+ # set the number of random examples to be shown via the command line
+ n_elements = int(sys.argv[2])
+ #eval_size = int(sys.argv[3])
+ print("\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n")
+ print("Decoding with language model\n") if do_lm else print("Decoding without language model\n")
+ print("\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n")
+
+ # Empty cache
+ torch.cuda.empty_cache()
+
+ # set device
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+
+ # load dataset
+ common_voice_test = load_dataset("mozilla-foundation/common_voice_7_0", "gl", split="test")
+ #common_voice_test = load_dataset("mozilla-foundation/common_voice_7_0", "gl", split="test[:1%]")
+ print("\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n")
+ print("Common Voice test dataset:\n")
+ print(common_voice_test)
+ print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n")
+ print("Number of elements in Common Voice test dataset:", common_voice_test.num_rows, "\n")
+
+ # load metrics
+ # the predominant metric in ASR is the word error rate (WER)
+ wer = load_metric("wer")
+ cer = load_metric("cer")
+
+ # Chars to be removed
+ chars_to_remove_regex = '[^A-Za-záéíóúñüÁÉÍÓÚÑÜ\- ]'
+ #chars_to_remove_regex = '[\,\¿\?\.\¡\!\;\:\"\n\t()\{\}\[\]]'
+
+ # load model and processor
+ model_path = "./"
+ processor = Wav2Vec2ProcessorWithLM.from_pretrained(model_path, eos_token=None, bos_token=None) if do_lm else Wav2Vec2Processor.from_pretrained(model_path)
+ model = AutoModelForCTC.from_pretrained(model_path).to(device)
+
+ # Remove special characters and apply lowercase normalization
+ def remove_special_characters(batch):
+     batch["sentence"] = re.sub(chars_to_remove_regex, '', batch["sentence"]).lower()
+     return batch
+
+ # Preprocessing the dataset
+ def prepare_dataset(batch):
+     # batched output is "un-batched"
+     audio = batch["audio"]
+     batch["input_values"] = processor(audio["array"], sampling_rate=audio["sampling_rate"]).input_values[0]
+     batch["input_length"] = len(batch["input_values"])
+
+     with processor.as_target_processor():
+         batch["labels"] = processor(batch["sentence"]).input_ids
+     return batch
+
+ # Evaluation of the model
+ def evaluate(batch):
+     inputs = processor(batch["input_values"], sampling_rate=16_000, return_tensors="pt", padding=True).to(device)
+     with torch.no_grad():
+         #logits = model(inputs.input_values.to(device), attention_mask=inputs.attention_mask.to(device)).logits
+         logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
+
+     if do_lm:
+         # beam-search decoding with the KenLM language model
+         # batch["pred_strings"] = processor.batch_decode(logits.detach().numpy()).text
+         batch["pred_strings"] = processor.batch_decode(logits.cpu().numpy()).text
+     else:
+         # greedy (argmax) CTC decoding
+         pred_ids = torch.argmax(logits, dim=-1)
+         batch["pred_strings"] = processor.batch_decode(pred_ids)
+
+     return batch
+
+ # Show N random elements of the dataset
+ def show_random_elements(dataset, num_examples):
+     assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
+     picks = []
+     for _ in range(num_examples):
+         pick = random.randint(0, len(dataset)-1)
+         while pick in picks:
+             pick = random.randint(0, len(dataset)-1)
+         picks.append(pick)
+
+     # Print headings
+     print(f"\n{'Id':<4}{'File':<14}{'P':<3}{'N':<3}{'Sentence':<95}{'Prediction':<95}\n")
+     # Print data
+     for i in range(0, num_examples):
+         row = picks[i]
+         path = dataset[row]["path"][-12:]
+         up_votes = dataset[row]["up_votes"]
+         down_votes = dataset[row]["down_votes"]
+         reference = dataset[row]["sentence"]
+         prediction = dataset[row]["pred_strings"]
+         print(f"{i:<4}{path:<14}{up_votes:<3}{down_votes:<3}{reference:<95}{prediction:<95}")
+
+ # Remove special characters and apply lowercase normalization
+ test_dataset = common_voice_test.map(remove_special_characters)
+
+ # resample to 16 kHz
+ test_dataset = test_dataset.cast_column("audio", Audio(sampling_rate=16_000))
+
+ # Prepare dataset
+ test_dataset = test_dataset.map(prepare_dataset)
+
+ # Evaluate dataset
+ result = test_dataset.map(evaluate, batched=True, batch_size=8)
+
+ print("\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n")
+ print(f"Showing {n_elements} random elements:\n")
+ show_random_elements(result, n_elements)
+
+
+ print("\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
+ print("WER: {:.2f}".format(100 * wer.compute(references=result["sentence"], predictions=result["pred_strings"])))
+ print("CER: {:.2f}".format(100 * cer.compute(references=result["sentence"], predictions=result["pred_strings"])))
+ print("\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
wav2vec_gl_wer_slr77.py ADDED
@@ -0,0 +1,118 @@
+ import torch
+ import torchaudio
+ from datasets import load_dataset, load_metric, Audio
+ from transformers import Wav2Vec2Processor, AutoModelForCTC, Wav2Vec2ProcessorWithLM
+ import numpy
+ import re
+ import sys
+ import random
+ import pandas as pd
+
+ # decide via the command line whether the language model should be used for decoding
+ do_lm = bool(int(sys.argv[1]))
+ # set the number of random examples to be shown via the command line
+ n_elements = int(sys.argv[2])
+ #eval_size = int(sys.argv[2])
+ print("\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n")
+ print("Decoding with language model\n") if do_lm else print("Decoding without language model\n")
+ print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n")
+
+ # Empty cache
+ torch.cuda.empty_cache()
+
+ # set device
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+
+ # load dataset
+ #test_dataset = load_dataset("openslr", "SLR77", split="train[:1%]")
+ slr77_test = load_dataset("json", data_files='../xlsr-fine-tuning-gl/elra_test_manifest2.json')
+ print("\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n")
+ print("SLR77 test:\n")
+ print(slr77_test)
+ print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n")
+ print("Number of elements in SLR77 test dataset:", slr77_test["train"].num_rows, "\n")
+
+ # load metrics
+ # the predominant metric in ASR is the word error rate (WER)
+ wer = load_metric("wer")
+ cer = load_metric("cer")
+
+ # Chars to be removed
+ chars_to_remove_regex = '[^A-Za-záéíóúñüÁÉÍÓÚÑÜ\- ]'
+ #chars_to_remove_regex = '[\,\¿\?\.\¡\!\;\:\"\n\t()\{\}\[\]]'
+
+ # load model and processor
+ model_path = "./"
+ processor = Wav2Vec2ProcessorWithLM.from_pretrained(model_path, eos_token=None, bos_token=None) if do_lm else Wav2Vec2Processor.from_pretrained(model_path)
+ model = AutoModelForCTC.from_pretrained(model_path).to(device)
+
+ # the SLR77 recordings are 48 kHz; the model expects 16 kHz
+ resampler = torchaudio.transforms.Resample(48_000, 16_000)
+
+ # Remove special characters and apply lowercase normalization
+ def remove_special_characters(batch):
+     batch["text"] = re.sub(chars_to_remove_regex, '', batch["text"]).lower()
+     return batch
+
+ # Preprocessing the datasets.
+ # We need to read the audio files as arrays
+ def prepare_dataset(batch):
+     # batched output is "un-batched"
+     speech_array, sampling_rate = torchaudio.load(batch["audio_filepath"])
+     # resample to 16 kHz
+     batch["speech"] = resampler(speech_array).squeeze().numpy()
+     return batch
+
+ # Evaluation of the model.
+ def evaluate(batch):
+     inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True).to(device)
+     with torch.no_grad():
+         logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
+
+     if do_lm:
+         # beam-search decoding with the KenLM language model
+         # batch["pred_strings"] = processor.batch_decode(logits.detach().numpy())
+         batch["pred_strings"] = processor.batch_decode(logits.cpu().numpy()).text
+     else:
+         # greedy (argmax) CTC decoding
+         pred_ids = torch.argmax(logits, dim=-1)
+         batch["pred_strings"] = processor.batch_decode(pred_ids)
+
+     return batch
+
+ # Show N random elements of the dataset
+ def show_random_elements(dataset, num_examples):
+     assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
+     picks = []
+     for _ in range(num_examples):
+         pick = random.randint(0, len(dataset)-1)
+         while pick in picks:
+             pick = random.randint(0, len(dataset)-1)
+         picks.append(pick)
+     #picks = [74, 77, 66, 682, 556, 603, 394, 420, 384, 789, 735, 696, 6, 294, 497, 421]
+
+     # Print headings
+     print(f"\n{'Row':<4}{'File':<28}{'Sentence':<105}{'Prediction':<105}\n")
+     # Print data
+     for i in range(0, num_examples):
+         row = picks[i]
+         path = dataset[row]["audio_filepath"][-25:]
+         reference = dataset[row]["text"]
+         prediction = dataset[row]["pred_strings"]
+         print(f"{row:<4}{path:<28}{reference:<105}{prediction:<105}")
+
+
+ # Remove special characters and apply lowercase normalization
+ test_dataset = slr77_test.map(remove_special_characters)
+
+ # Prepare dataset
+ test_dataset = test_dataset.map(prepare_dataset)
+
+ # Evaluate dataset
+ result = test_dataset.map(evaluate, batched=True, batch_size=8)
+
+ print("\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n")
+ print(f"Showing {n_elements} random elements:\n")
+ show_random_elements(result["train"], n_elements)
+
+
+ print("\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
+ print("WER: {:.2f}".format(100 * wer.compute(references=result["train"]["text"], predictions=result["train"]["pred_strings"])))
+ print("CER: {:.2f}".format(100 * cer.compute(references=result["train"]["text"], predictions=result["train"]["pred_strings"])))
+ print("\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")