Commit
•
03930f8
1
Parent(s):
0fb6288
Create README.md
Browse files
README.md
ADDED
@@ -0,0 +1,130 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
language: de
|
3 |
+
datasets:
|
4 |
+
- common_voice
|
5 |
+
metrics:
|
6 |
+
- wer
|
7 |
+
- cer
|
8 |
+
tags:
|
9 |
+
- audio
|
10 |
+
- automatic-speech-recognition
|
11 |
+
- speech
|
12 |
+
- hf-asr-leaderboard
|
13 |
+
license: apache-2.0
|
14 |
+
model-index:
|
15 |
+
- name: wav2vec 2.0 XLS-R 1B + TEVR tokens + 5-gram LM by Hajo Nils Krabbenhöft
|
16 |
+
results:
|
17 |
+
- task:
|
18 |
+
name: Speech Recognition
|
19 |
+
type: automatic-speech-recognition
|
20 |
+
dataset:
|
21 |
+
name: Common Voice de
|
22 |
+
type: common_voice
|
23 |
+
args: de
|
24 |
+
metrics:
|
25 |
+
- name: Test WER
|
26 |
+
type: wer
|
27 |
+
value: 3.6433399042523233
|
28 |
+
- name: Test CER
|
29 |
+
type: cer
|
30 |
+
value: 1.5398893560981173
|
31 |
+
---
|
32 |
+
|
33 |
+
|
34 |
+
## Overview
|
35 |
+
|
36 |
+
This folder contains a fully trained German speech recognition pipeline
|
37 |
+
consisting of an acoustic model using the new wav2vec 2.0 XLS-R 1B TEVR architecture
|
38 |
+
and a 5-gram KenLM language model.
|
39 |
+
For an explanation of the TEVR enhancements and their motivation, please see our paper:
|
40 |
+
TEVR: Improving XLS-R for German ASR through Token Entropy Variance Reduction
|
41 |
+
(Krabbenhöft et al., 2022).
|
42 |
+
|
43 |
+
|
44 |
+
This pipeline scores a very competitive (as of June 2022) **word error rate of 3.64%** on CommonVoice German.
|
45 |
+
|
46 |
+
To evaluate this pipeline yourself and/or on your own data, see the `HF Eval Script.ipynb` Jupyter Notebook
|
47 |
+
or use the following python script:
|
48 |
+
|
49 |
+
## Evaluation
|
50 |
+
|
51 |
+
```python
|
52 |
+
!pip install --quiet --root-user-action=ignore --upgrade pip
|
53 |
+
!pip install --quiet --root-user-action=ignore "datasets>=1.18.3" "transformers==4.11.3" librosa jiwer huggingface_hub
|
54 |
+
!pip install --quiet --root-user-action=ignore https://github.com/kpu/kenlm/archive/master.zip pyctcdecode
|
55 |
+
!pip install --quiet --root-user-action=ignore --upgrade transformers
|
56 |
+
!pip install --quiet --root-user-action=ignore torch_audiomentations audiomentations
|
57 |
+
```
|
58 |
+
|
59 |
+
|
60 |
+
```python
|
61 |
+
from datasets import load_dataset, Audio, load_metric
|
62 |
+
from transformers import AutoModelForCTC, Wav2Vec2ProcessorWithLM
|
63 |
+
import torchaudio.transforms as T
|
64 |
+
import torch
|
65 |
+
import unicodedata
|
66 |
+
import numpy as np
|
67 |
+
import re
|
68 |
+
|
69 |
+
# load testing dataset
testing_dataset = load_dataset("common_voice", "de", split="test")

# replace invisible characters with space
# collect every distinct character that appears in any test sentence
allchars = list(set([c for t in testing_dataset['sentence'] for c in list(t)]))
# characters whose Unicode category starts with P (punctuation), S (symbol)
# or Z (separator) will be mapped to a space — except ʻ and '-' which get
# special treatment (see text_fix / the deletion set below)
map_to_space = [c for c in allchars if unicodedata.category(c)[0] in 'PSZ' and c not in 'ʻ-']
# translation table: each map_to_space character -> ' ', and the third
# argument deletes the apostrophe and ʻ entirely
replacements = ''.maketrans(''.join(map_to_space), ''.join(' ' for i in range(len(map_to_space))), '\'ʻ')
|
76 |
+
|
77 |
+
def text_fix(text):
    """Normalize a transcript for WER/CER scoring.

    Maps ß to ss, converts dashes to spaces, lowercases, translates
    punctuation/symbol/separator characters to spaces (via the module-level
    `replacements` table), replaces characters the model cannot represent
    with '?', and collapses all runs of whitespace to single spaces.

    Args:
        text: raw ground-truth sentence from the dataset.

    Returns:
        The normalized string.
    """
    # change ß to ss
    text = text.replace('ß', 'ss')
    # convert dash to space and remove double-space
    # (fix: the previous version called replace(' ', ' ') — a no-op that
    # replaced a single space with itself; the intent, per the comment
    # above, is to collapse two spaces into one)
    text = text.replace('-', ' ').replace('  ', ' ').replace('  ', ' ')
    # make lowercase
    text = text.lower()
    # remap all invisible characters to space
    text = text.translate(replacements).strip()
    # for easier comparison to Zimmermeister, replace unrepresentable characters with ?
    text = re.sub("[âşěýňעảנźțãòàǔł̇æồאắîשðșęūāñë生בøúıśžçćńřğ]+", "?", text)
    # remove multiple spaces (again)
    text = ' '.join([w for w in text.split(' ') if w != ''])
    return text
|
91 |
+
|
92 |
+
# load model
model = AutoModelForCTC.from_pretrained("fxtentacle/wav2vec2-xls-r-1b-tevr")
model.to('cuda')

# load processor
class HajoProcessor(Wav2Vec2ProcessorWithLM):
    # Override that reports no "missing" alphabet tokens, so the decoder
    # alphabet is used exactly as shipped with the model checkpoint.
    # NOTE(review): presumably required because the TEVR token set is
    # non-standard — confirm against transformers' Wav2Vec2ProcessorWithLM,
    # which otherwise pads the pyctcdecode alphabet with extra tokens.
    @staticmethod
    def get_missing_alphabet_tokens(decoder, tokenizer):
        return []
processor = HajoProcessor.from_pretrained("fxtentacle/wav2vec2-xls-r-1b-tevr")
|
101 |
+
|
102 |
+
# this function will be called for each WAV file
def predict_single_audio(batch, image=False):
    """Transcribe one dataset row and pair it with its normalized ground truth.

    Args:
        batch: one `datasets` row providing batch['audio']['array'] (numpy
            samples), batch['audio']['sampling_rate'], and batch['sentence'].
        image: unused here — NOTE(review): dead parameter, kept as-is so the
            signature stays compatible with existing callers.

    Returns:
        dict with 'groundtruth' (normalized reference text) and
        'prediction' (decoded transcript) as plain strings.
    """
    audio = batch['audio']['array']
    # resample, if needed — the model input below is prepared at 16 kHz
    if batch['audio']['sampling_rate'] != 16000:
        audio = T.Resample(orig_freq=batch['audio']['sampling_rate'], new_freq=16000)(torch.from_numpy(audio)).numpy()
    # normalize to zero mean, unit variance (1e-7 guards against a
    # division by zero on silent clips)
    audio = (audio - audio.mean()) / np.sqrt(audio.var() + 1e-7)
    # ask HF processor to prepare audio for GPU eval
    input_values = processor(audio, return_tensors="pt", sampling_rate=16_000).input_values
    # call model on GPU; no_grad since this is inference only
    with torch.no_grad():
        logits = model(input_values.to('cuda')).logits.cpu().numpy()[0]
    # ask HF processor to decode logits (beam search, width 500)
    decoded = processor.decode(logits, beam_width=500)
    # return as dictionary
    return { 'groundtruth': text_fix(batch['sentence']), 'prediction': decoded.text }
|
119 |
+
|
120 |
+
# process all audio files; remove_columns drops the original dataset columns
# so the mapped dataset contains only 'groundtruth' and 'prediction'
all_predictions = testing_dataset.map(predict_single_audio, remove_columns=testing_dataset.column_names)

# print results — metrics are scaled by 100 to report percentages
print('WER', load_metric("wer").compute(predictions=all_predictions['prediction'], references=all_predictions['groundtruth'])*100.0, '%')
print('CER', load_metric("cer").compute(predictions=all_predictions['prediction'], references=all_predictions['groundtruth'])*100.0, '%')
|
126 |
+
```
|
127 |
+
|
128 |
+
WER 3.6433399042523233 %
|
129 |
+
CER 1.5398893560981173 %
|
130 |
+
|