---
language: de
datasets:
- common_voice
metrics:
- wer
tags:
- audio
- automatic-speech-recognition
- speech
- xlsr-fine-tuning-week
license: apache-2.0
model-index:
- name: XLSR Wav2Vec2 Large 53
  results:
  - task:
      name: Speech Recognition
      type: automatic-speech-recognition
    dataset:
      name: Common Voice de
      type: common_voice
      args: de
    metrics:
    - name: Test WER
      type: wer
      value: 15.80
---

# Wav2Vec2-Large-XLSR-53-German

Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) on German using the [Common Voice](https://huggingface.co/datasets/common_voice) dataset.
When using this model, make sure that your speech input is sampled at 16 kHz.
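
If your audio is stored at a different sampling rate, it has to be resampled first. A minimal sketch using torchaudio (the file path is hypothetical):

```python
import torchaudio

# Load a local file; torchaudio returns the waveform and its native sampling rate.
speech_array, sampling_rate = torchaudio.load("my_audio.wav")  # hypothetical path

# Resample to the 16 kHz the model expects, if necessary.
if sampling_rate != 16_000:
    speech_array = torchaudio.transforms.Resample(sampling_rate, 16_000)(speech_array)
```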

## Usage

The model can be used directly (without a language model) as follows:

```python
import torch
import torchaudio
from datasets import load_dataset
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

test_dataset = load_dataset("common_voice", "de", split="test[:2%]")

processor = Wav2Vec2Processor.from_pretrained("marcel/wav2vec2-large-xlsr-53-german")
model = Wav2Vec2ForCTC.from_pretrained("marcel/wav2vec2-large-xlsr-53-german")

resampler = torchaudio.transforms.Resample(48_000, 16_000)

# Preprocessing the datasets.
# We need to read the audio files as arrays.
def speech_file_to_array_fn(batch):
    speech_array, sampling_rate = torchaudio.load(batch["path"])
    batch["speech"] = resampler(speech_array).squeeze().numpy()
    return batch

test_dataset = test_dataset.map(speech_file_to_array_fn)
inputs = processor(test_dataset["speech"][:2], sampling_rate=16_000, return_tensors="pt", padding=True)

with torch.no_grad():
    logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits

predicted_ids = torch.argmax(logits, dim=-1)

print("Prediction:", processor.batch_decode(predicted_ids))
print("Reference:", test_dataset["sentence"][:2])
```
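
Note that decoding here is plain greedy (argmax) CTC decoding; no language model is involved, so combining the model with a beam-search decoder and a language model could likely lower the error rate further.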


## Evaluation

The model can be evaluated as follows on the German test data of Common Voice.

```python
import torch
import torchaudio
from datasets import load_dataset, load_metric
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import re

test_dataset = load_dataset("common_voice", "de", split="test")
wer = load_metric("wer")

processor = Wav2Vec2Processor.from_pretrained("marcel/wav2vec2-large-xlsr-53-german")
model = Wav2Vec2ForCTC.from_pretrained("marcel/wav2vec2-large-xlsr-53-german")
model.to("cuda")

chars_to_ignore_regex = r'[\,\?\.\!\-\;\:\"\“\%\”\�\カ\æ\無\ན\カ\臣\ѹ\…\«\»\ð\ı\„\幺\א\ב\比\ш\ע\)\ứ\в\œ\ч\+\—\ш\‚\נ\м\ń\乡\$\=\ש\ф\支\(\°\и\к\̇]'
substitutions = {
    'e': r'[\ə\é\ě\ę\ê\ế\ế\ë\ė\е]',
    'o': r'[\ō\ô\ô\ó\ò\ø\ọ\ŏ\õ\ő\о]',
    'a': r'[\á\ā\ā\ă\ã\å\â\à\ą\а]',
    'c': r'[\č\ć\ç\с]',
    'l': r'[\ł]',
    'u': r'[\ú\ū\ứ\ů]',
    'und': r'[\&]',
    'r': r'[\ř]',
    'y': r'[\ý]',
    's': r'[\ś\š\ș\ş]',
    'i': r'[\ī\ǐ\í\ï\î\ï]',
    'z': r'[\ź\ž\ź\ż]',
    'n': r'[\ñ\ń\ņ]',
    'g': r'[\ğ]',
    'ss': r'[\ß]',
    't': r'[\ț\ť]',
    'd': r'[\ď\đ]',
    "'": r'[\ʿ\་\’\`\´\ʻ\`\‘]',
    'p': r'\р',
}
resampler = torchaudio.transforms.Resample(48_000, 16_000)

# Preprocessing the datasets.
# We need to normalize the transcripts and read the audio files as arrays.
def speech_file_to_array_fn(batch):
    batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower()
    for x in substitutions:
        batch["sentence"] = re.sub(substitutions[x], x, batch["sentence"])
    speech_array, sampling_rate = torchaudio.load(batch["path"])
    batch["speech"] = resampler(speech_array).squeeze().numpy()
    return batch

test_dataset = test_dataset.map(speech_file_to_array_fn)

# Run batched greedy (argmax) decoding over the test set.
def evaluate(batch):
    inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)

    with torch.no_grad():
        logits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits

    pred_ids = torch.argmax(logits, dim=-1)
    batch["pred_strings"] = processor.batch_decode(pred_ids)
    return batch

result = test_dataset.map(evaluate, batched=True, batch_size=8)

print("WER: {:.2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"])))
```

The model can also be evaluated in 10% chunks, which requires fewer resources (to be tested).

```python
import torch
import torchaudio
from datasets import load_dataset
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import re
import jiwer

lang_id = "de"

processor = Wav2Vec2Processor.from_pretrained("marcel/wav2vec2-large-xlsr-53-german")
model = Wav2Vec2ForCTC.from_pretrained("marcel/wav2vec2-large-xlsr-53-german")
model.to("cuda")

chars_to_ignore_regex = r'[\,\?\.\!\-\;\:\"\“\%\”\�\カ\æ\無\ན\カ\臣\ѹ\…\«\»\ð\ı\„\幺\א\ב\比\ш\ע\)\ứ\в\œ\ч\+\—\ш\‚\נ\м\ń\乡\$\=\ש\ф\支\(\°\и\к\̇]'
substitutions = {
    'e': r'[\ə\é\ě\ę\ê\ế\ế\ë\ė\е]',
    'o': r'[\ō\ô\ô\ó\ò\ø\ọ\ŏ\õ\ő\о]',
    'a': r'[\á\ā\ā\ă\ã\å\â\à\ą\а]',
    'c': r'[\č\ć\ç\с]',
    'l': r'[\ł]',
    'u': r'[\ú\ū\ứ\ů]',
    'und': r'[\&]',
    'r': r'[\ř]',
    'y': r'[\ý]',
    's': r'[\ś\š\ș\ş]',
    'i': r'[\ī\ǐ\í\ï\î\ï]',
    'z': r'[\ź\ž\ź\ż]',
    'n': r'[\ñ\ń\ņ]',
    'g': r'[\ğ]',
    'ss': r'[\ß]',
    't': r'[\ț\ť]',
    'd': r'[\ď\đ]',
    "'": r'[\ʿ\་\’\`\´\ʻ\`\‘]',
    'p': r'\р',
}
resampler = torchaudio.transforms.Resample(48_000, 16_000)

# Preprocessing the datasets.
# We need to normalize the transcripts and read the audio files as arrays.
def speech_file_to_array_fn(batch):
    batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower()
    for x in substitutions:
        batch["sentence"] = re.sub(substitutions[x], x, batch["sentence"])
    speech_array, sampling_rate = torchaudio.load(batch["path"])
    batch["speech"] = resampler(speech_array).squeeze().numpy()
    return batch

# Run batched greedy (argmax) decoding over a chunk of the test set.
def evaluate(batch):
    inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)

    with torch.no_grad():
        logits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits

    pred_ids = torch.argmax(logits, dim=-1)
    batch["pred_strings"] = processor.batch_decode(pred_ids)
    return batch

# Accumulate hits (H), substitutions (S), deletions (D) and insertions (I)
# over all ten chunks, then compute the WER once from the pooled counts.
H, S, D, I = 0, 0, 0, 0
for i in range(10):
    split = "test[{}%:{}%]".format(10 * i, 10 * (i + 1))
    print(split)
    test_dataset = load_dataset("common_voice", lang_id, split=split)
    test_dataset = test_dataset.map(speech_file_to_array_fn)
    result = test_dataset.map(evaluate, batched=True, batch_size=8)
    predictions = result["pred_strings"]
    targets = result["sentence"]
    chunk_metrics = jiwer.compute_measures(targets, predictions)
    H = H + chunk_metrics["hits"]
    S = S + chunk_metrics["substitutions"]
    D = D + chunk_metrics["deletions"]
    I = I + chunk_metrics["insertions"]

WER = float(S + D + I) / float(H + S + D)
print("WER: {:.2f}".format(WER * 100))
```
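
The final score is computed from the pooled counts as WER = (S + D + I) / (H + S + D), where H + S + D equals the total number of reference words. Summing the counts across chunks and dividing once reproduces the WER over the full test set, whereas averaging the ten per-chunk WERs would not.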

**Test Result**: 15.80 %


## Training

The first 50% of the Common Voice `train` split and 12% of the `validation` split were used for training (30 epochs on the first 12% of the data and 3 epochs on the remainder).
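
As an illustration (this is not the original training script, just a sketch of the `datasets` split syntax), such slices can be selected like this:

```python
from datasets import load_dataset

# Hypothetical reconstruction of the data selection:
# the first 50% of `train` plus the first 12% of `validation`.
train_dataset = load_dataset("common_voice", "de", split="train[:50%]+validation[:12%]")
```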