# PereLluis13/Wav2Vec2-Large-XLSR-53-catalan

---
language: ca
datasets:
- common_voice
metrics:
- wer
tags:
- audio
- automatic-speech-recognition
- speech
- xlsr-fine-tuning-week
license: apache-2.0
model-index:
- name: Catalan XLSR Wav2Vec Large 53 #TODO: replace {human_readable_name} with a name of your model as it should appear on the leaderboard. It could be something like Elgeish XLSR Wav2Vec2 Large 53
  results:
  - task:
      name: Speech Recognition
      type: automatic-speech-recognition
    dataset:
      name: Common Voice ca
      type: common_voice
      args: ca #TODO:
    metrics:
    - name: Test WER
      type: wer
      value: 8.11
---

# Wav2Vec2-Large-XLSR-53-ca

Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) on Catalan using the [Common Voice](https://huggingface.co/datasets/common_voice) dataset.
When using this model, make sure that your speech input is sampled at 16kHz.

## Usage

The model can be used directly (without a language model) as follows:

```python
import torch
import torchaudio
from datasets import load_dataset
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

test_dataset = load_dataset("common_voice", "ca", split="test[:2%]")

processor = Wav2Vec2Processor.from_pretrained("PereLluis13/Wav2Vec2-Large-XLSR-53-catalan")
model = Wav2Vec2ForCTC.from_pretrained("PereLluis13/Wav2Vec2-Large-XLSR-53-catalan")

resampler = torchaudio.transforms.Resample(48_000, 16_000)

# Preprocessing the datasets.
# We need to read the audio files as arrays
def speech_file_to_array_fn(batch):
    speech_array, sampling_rate = torchaudio.load(batch["path"])
    batch["speech"] = resampler(speech_array).squeeze().numpy()
    return batch

test_dataset = test_dataset.map(speech_file_to_array_fn)
inputs = processor(test_dataset["speech"][:2], sampling_rate=16_000, return_tensors="pt", padding=True)

with torch.no_grad():
    logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits

predicted_ids = torch.argmax(logits, dim=-1)

print("Prediction:", processor.batch_decode(predicted_ids))
print("Reference:", test_dataset["sentence"][:2])
```


## Evaluation

The model can be evaluated as follows on the Catalan test data of Common Voice.

```python
import torch
import torchaudio
from datasets import load_dataset, load_metric
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import re

test_dataset = load_dataset("common_voice", "ca", split="test")
wer = load_metric("wer")

processor = Wav2Vec2Processor.from_pretrained("PereLluis13/Wav2Vec2-Large-XLSR-53-catalan")
model = Wav2Vec2ForCTC.from_pretrained("PereLluis13/Wav2Vec2-Large-XLSR-53-catalan")
model.to("cuda")

chars_to_ignore_regex = '[\,\?\.\!\;\:\"\“]'
resampler = torchaudio.transforms.Resample(48_000, 16_000)

# Preprocessing the datasets.
# We need to read the audio files as arrays
def speech_file_to_array_fn(batch):
    batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower()
    speech_array, sampling_rate = torchaudio.load(batch["path"])
    batch["speech"] = resampler(speech_array).squeeze().numpy()
    return batch

test_dataset = test_dataset.map(speech_file_to_array_fn)

# Preprocessing the datasets.
# We need to read the audio files as arrays
def evaluate(batch):
    inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)

    with torch.no_grad():
        logits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits

    pred_ids = torch.argmax(logits, dim=-1)
    batch["pred_strings"] = processor.batch_decode(pred_ids)
    return batch

result = test_dataset.map(evaluate, batched=True, batch_size=8)
import jiwer

# Chunk WER computation due to memory issues, taken from https://huggingface.co/pcuenq/wav2vec2-large-xlsr-53-es
def chunked_wer(targets, predictions, chunk_size=None):
    if chunk_size is None: return jiwer.wer(targets, predictions)
    start = 0
    end = chunk_size
    H, S, D, I = 0, 0, 0, 0
    while start < len(targets):
        chunk_metrics = jiwer.compute_measures(targets[start:end], predictions[start:end])
        H = H + chunk_metrics["hits"]
        S = S + chunk_metrics["substitutions"]
        D = D + chunk_metrics["deletions"]
        I = I + chunk_metrics["insertions"]
        start += chunk_size
        end += chunk_size
    return float(S + D + I) / float(H + S + D)

print("WER: {:2f}".format(100 * chunked_wer(result["sentence"], result["pred_strings"], chunk_size=4000)))
```

**Test Result**: 8.11 %

## Training

The Common Voice train and validation datasets were used for training. During the second epoch, training was halted due to a memory issue and was continued with a lower batch size; the gradient accumulation steps were scaled to keep the effective batch size at 32 throughout training. Then the model was trained for an additional 10 epochs where half the male samples were pitched up.

The script used for training can be found [here](https://github.com/huggingface/transformers/blob/master/examples/research_projects/wav2vec2/run_common_voice.py).
Slight modifications were made in order to speed up the ordering by length during training, which can be found [here](https://discuss.huggingface.co/t/spanish-asr-fine-tuning-wav2vec2/4586/6). Another version trained for Catalan can be found [here](https://huggingface.co/ccoreilly/wav2vec2-large-xlsr-catala), which may be better than this one since it was trained with extra data and for a longer time. However, since that version used different splits that include part of the Common Voice test set, this version can be used to get a baseline on the Common Voice dataset.