flozi00 committed on
Commit
7a8d542
1 Parent(s): 2dbba5f

Create README.md

Browse files
Files changed (1) hide show
  1. README.md +101 -0
README.md ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language: de
3
+ datasets:
4
+ - common_voice
5
+ metrics:
6
+ - wer
7
+ - cer
8
+ tags:
9
+ - audio
10
+ - automatic-speech-recognition
11
+ - speech
12
+ - xlsr-fine-tuning-week
13
+ license: apache-2.0
14
+ model-index:
15
+ - name: XLSR Wav2Vec2 German with LM by Florian Zimmermeister
16
+ results:
17
+ - task:
18
+ name: Speech Recognition
19
+ type: automatic-speech-recognition
20
+ dataset:
21
+ name: Common Voice de
22
+ type: common_voice
23
+ args: de
24
+ metrics:
25
+ - name: Test WER
26
+ type: wer
27
+ value: 5.7467896819046755
28
+ - name: Test CER
29
+ type: cer
30
+ value: 1.8980142607670552
31
+ ---
32
+
33
+ **Test Result**
34
+
35
+ | Model | WER | CER |
36
+ | ------------- | ------------- | ------------- |
37
+ | flozi00/wav2vec2-large-xlsr-53-german-with-lm | **5.7467896819046755%** | **1.8980142607670552%** |
38
+
39
+ ## Evaluation
40
+ The model can be evaluated as follows on the German test data of Common Voice.
41
+
42
+ ```python
"""Evaluate the German wav2vec2 CTC checkpoint on the Common Voice German test split.

For every test utterance the script resamples the audio to 16 kHz, decodes the
model logits with the checkpoint's processor, and prints the running mean of
the per-utterance WER and CER (in percent).
"""
import re

import torch
import torchaudio.functional as F
from datasets import load_dataset, load_metric
from transformers import AutoModelForCTC, AutoProcessor

# Hub id of the evaluated checkpoint, including the owner namespace so that
# `from_pretrained` resolves it without a local copy of the repository.
MODEL_ID = "flozi00/wav2vec2-large-xlsr-53-german-with-lm"

# Sampling rate the audio is resampled to and that is reported to the processor.
TARGET_SAMPLING_RATE = 16_000

CHARS_TO_IGNORE = [",", "?", "¿", ".", "!", "¡", ";", ";", ":", '""', "%", '"', "�", "ʿ", "·", "჻", "~", "՞",
                   "؟", "،", "।", "॥", "«", "»", "„", "“", "”", "「", "」", "‘", "’", "《", "》", "(", ")", "[", "]",
                   "{", "}", "=", "`", "_", "+", "<", ">", "…", "–", "°", "´", "ʾ", "‹", "›", "©", "®", "—", "→", "。",
                   "、", "﹂", "﹁", "‧", "~", "﹏", ",", "{", "}", "(", ")", "[", "]", "【", "】", "‥", "〽",
                   "『", "』", "〝", "〟", "⟨", "⟩", "〜", ":", "!", "?", "♪", "؛", "/", "\\", "º", "−", "^", "ʻ", "ˆ"]

# Character class matching any single character listed in CHARS_TO_IGNORE.
chars_to_ignore_regex = f"[{re.escape(''.join(CHARS_TO_IGNORE))}]"


def main():
    """Run the evaluation and print running WER/CER after every utterance.

    The printed values are the mean of the per-utterance scores seen so far,
    multiplied by 100. The totals are local to this call, so calling `main()`
    again starts from zero.
    """
    model = AutoModelForCTC.from_pretrained(MODEL_ID)
    processor = AutoProcessor.from_pretrained(MODEL_ID)

    wer = load_metric("wer")
    cer = load_metric("cer")

    ds = load_dataset("common_voice", "de", split="test")
    # For a quick smoke test, restrict the run to a small subset:
    # ds = ds.select(range(100))

    counter = 0
    wer_counter = 0
    cer_counter = 0

    # A plain loop is used instead of `ds.map`: nothing is written back to the
    # dataset, and every utterance is scored on every run.
    for sample in ds:
        audio = sample["audio"]

        # Resample from the rate reported by the dataset rather than assuming
        # a fixed source rate.
        resampled_audio = F.resample(
            torch.tensor(audio["array"]),
            audio["sampling_rate"],
            TARGET_SAMPLING_RATE,
        ).numpy()

        input_values = processor(
            resampled_audio,
            return_tensors="pt",
            sampling_rate=TARGET_SAMPLING_RATE,
        ).input_values

        with torch.no_grad():
            # Single utterance per forward pass, so take the first batch entry.
            logits = model(input_values).logits.numpy()[0]

        decoded = processor.decode(logits)
        pred = decoded.text

        # Strip the ignored characters from the reference and upper-case it
        # before scoring against the prediction.
        ref = re.sub(chars_to_ignore_regex, "", sample["sentence"]).upper()

        wer_result = wer.compute(predictions=[pred], references=[ref])
        cer_result = cer.compute(predictions=[pred], references=[ref])

        counter += 1
        wer_counter += wer_result
        cer_counter += cer_result

        print(f"WER: {(wer_counter/counter)*100} | CER: {(cer_counter/counter)*100}")


if __name__ == "__main__":
    main()
101
+ ```