patrickvonplaten committed on
Commit
76c4d1f
1 Parent(s): 5724a7f

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +61 -10
README.md CHANGED
@@ -21,7 +21,7 @@ model-index:
21
  metrics:
22
  - name: Test WER
23
  type: wer
24
- value: 39.60
25
  ---
26
 
27
  # Greek (el) version of the XLSR-Wav2Vec2 automatic speech recognition (ASR) model
@@ -33,7 +33,7 @@ model-index:
33
  * model: XLSR-Wav2Vec2, trained for 60 epochs
34
  * metrics: Word Error Rate (WER)
35
 
36
- ### Model description
37
 
38
  Wav2Vec2 is a pretrained model for Automatic Speech Recognition (ASR) and was released in September 2020 by Alexei Baevski, Michael Auli, and Alex Conneau. Soon after the superior performance of Wav2Vec2 was demonstrated on the English ASR dataset LibriSpeech, Facebook AI presented XLSR-Wav2Vec2. XLSR stands for cross-lingual speech representations and refers to XLSR-Wav2Vec2's ability to learn speech representations that are useful across multiple languages.
39
 
@@ -41,7 +41,7 @@ Similar to Wav2Vec2, XLSR-Wav2Vec2 learns powerful speech representations from h
41
 
42
  This model was trained on Greek CommonVoice speech data (364MB) for 60 epochs on a single NVIDIA RTX 3080, for approx. 8 hrs.
43
 
44
- ### How to use for inference:
45
 
46
  For live demo, make sure that speech files are sampled at 16kHz.
47
 
@@ -63,7 +63,7 @@ import numpy as np
63
  from datasets import load_dataset, load_metric
64
  import torch
65
 
66
- chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\”\�]'
67
 
68
  def remove_special_characters(batch):
69
  batch["text"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower() + " "
@@ -96,13 +96,13 @@ def prepare_dataset(batch):
96
 
97
  # Loading model and dataset processor
98
 
99
- model = Wav2Vec2ForCTC.from_pretrained(".").to("cuda")
100
- processor = Wav2Vec2Processor.from_pretrained(".")
101
 
102
 
103
  # Preparing speech dataset to be suitable for inference
104
 
105
- common_voice_test = load_dataset("common_voice", "el", data_dir="cv-corpus-6.1-2020-12-11", split="test")
106
 
107
  common_voice_test = common_voice_test.remove_columns(["accent", "age", "client_id", "down_votes", "gender", "locale", "segment", "up_votes"])
108
 
@@ -117,7 +117,7 @@ common_voice_test = common_voice_test.map(prepare_dataset, remove_columns=common
117
 
118
  # Loading test dataset
119
 
120
- common_voice_test_transcription = load_dataset("common_voice", "el", data_dir="./cv-corpus-6.1-2020-12-11", split="test")
121
 
122
 
123
  #Performing inference on a random sample. Change the "example" value to try inference on different CommonVoice extracts
@@ -134,12 +134,63 @@ print("Prediction:")
134
  print(processor.decode(pred_ids[0]))
135
  # πού θέλεις να πάμε ρώτησε φοβισμένα ο βασιλιάς
136
 
137
- print("\nReference:")
 
138
  print(common_voice_test_transcription["sentence"][example].lower())
139
  # πού θέλεις να πάμε; ρώτησε φοβισμένα ο βασιλιάς.
140
 
141
  ```
142
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
143
  ### How to use for training:
144
 
145
  Instructions and code to replicate the process are provided in the Fine_Tune_XLSR_Wav2Vec2_on_Greek_ASR_with_🤗_Transformers.ipynb notebook.
@@ -151,7 +202,7 @@ Instructions and code to replicate the process are provided in the Fine_Tune_XLS
151
  | ----------- | ----------- |
152
  | Training Loss | 0.0287 |
153
  | Validation Loss | 0.6062 |
154
- | WER on CommonVoice Test *| 0.3960 |
155
  * Reference transcripts were lower-cased and stripped of punctuation and special characters.
156
 
157
  Full metrics log here:
 
21
  metrics:
22
  - name: Test WER
23
  type: wer
24
+ value: 30.92
25
  ---
26
 
27
  # Greek (el) version of the XLSR-Wav2Vec2 automatic speech recognition (ASR) model
 
33
  * model: XLSR-Wav2Vec2, trained for 60 epochs
34
  * metrics: Word Error Rate (WER)
35
 
36
+ ## Model description
37
 
38
  Wav2Vec2 is a pretrained model for Automatic Speech Recognition (ASR) and was released in September 2020 by Alexei Baevski, Michael Auli, and Alex Conneau. Soon after the superior performance of Wav2Vec2 was demonstrated on the English ASR dataset LibriSpeech, Facebook AI presented XLSR-Wav2Vec2. XLSR stands for cross-lingual speech representations and refers to XLSR-Wav2Vec2's ability to learn speech representations that are useful across multiple languages.
39
 
 
41
 
42
  This model was trained on Greek CommonVoice speech data (364MB) for 60 epochs on a single NVIDIA RTX 3080, for approx. 8 hrs.
43
 
44
+ ## How to use for inference:
45
 
46
  For live demo, make sure that speech files are sampled at 16kHz.
47
 
 
63
  from datasets import load_dataset, load_metric
64
  import torch
65
 
66
+ chars_to_ignore_regex = '[\\,\\?\\.\\!\\-\\;\\:\\"\\“\\%\\‘\\”\\�]'
67
 
68
  def remove_special_characters(batch):
69
  batch["text"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower() + " "
 
96
 
97
  # Loading model and dataset processor
98
 
99
+ model = Wav2Vec2ForCTC.from_pretrained("lighteternal/wav2vec2-large-xlsr-53-greek").to("cuda")
100
+ processor = Wav2Vec2Processor.from_pretrained("lighteternal/wav2vec2-large-xlsr-53-greek")
101
 
102
 
103
  # Preparing speech dataset to be suitable for inference
104
 
105
+ common_voice_test = load_dataset("common_voice", "el", split="test")
106
 
107
  common_voice_test = common_voice_test.remove_columns(["accent", "age", "client_id", "down_votes", "gender", "locale", "segment", "up_votes"])
108
 
 
117
 
118
  # Loading test dataset
119
 
120
+ common_voice_test_transcription = load_dataset("common_voice", "el", split="test")
121
 
122
 
123
  #Performing inference on a random sample. Change the "example" value to try inference on different CommonVoice extracts
 
134
  print(processor.decode(pred_ids[0]))
135
  # πού θέλεις να πάμε ρώτησε φοβισμένα ο βασιλιάς
136
 
137
+ print("\
138
+ Reference:")
139
  print(common_voice_test_transcription["sentence"][example].lower())
140
  # πού θέλεις να πάμε; ρώτησε φοβισμένα ο βασιλιάς.
141
 
142
  ```
143
 
144
+ ## Evaluation
145
+
146
+ The model can be evaluated as follows on the Greek test data of Common Voice.
147
+
148
+
149
+ ```python
150
+ import torch
151
+ import torchaudio
152
+ from datasets import load_dataset, load_metric
153
+ from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
154
+ import re
155
+
156
+ test_dataset = load_dataset("common_voice", "el", split="test")
157
+ wer = load_metric("wer")
158
+
159
+ processor = Wav2Vec2Processor.from_pretrained("lighteternal/wav2vec2-large-xlsr-53-greek")
160
+ model = Wav2Vec2ForCTC.from_pretrained("lighteternal/wav2vec2-large-xlsr-53-greek")
161
+ model.to("cuda")
162
+
163
+ chars_to_ignore_regex = '[\\,\\?\\.\\!\\-\\;\\:\\"\\“\\%\\‘\\”\\�]'
164
+ resampler = torchaudio.transforms.Resample(48_000, 16_000)
165
+
166
+ # Preprocessing the datasets.
167
+ # We need to read the audio files as arrays
168
+
169
+ def speech_file_to_array_fn(batch):
170
+ batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower()
171
+ speech_array, sampling_rate = torchaudio.load(batch["path"])
172
+ batch["speech"] = resampler(speech_array).squeeze().numpy()
173
+ return batch
174
+
175
+ test_dataset = test_dataset.map(speech_file_to_array_fn)
176
+
177
+ # Preprocessing the datasets.
178
+ # We need to read the audio files as arrays
179
+
180
+ def evaluate(batch):
181
+ inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
182
+ with torch.no_grad():
183
+ logits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits
184
+ pred_ids = torch.argmax(logits, dim=-1)
185
+ batch["pred_strings"] = processor.batch_decode(pred_ids)
186
+ return batch
187
+
188
+ result = test_dataset.map(evaluate, batched=True, batch_size=8)
189
+ print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"])))
190
+ ```
191
+
192
+ **Test Result**: 30.92 %
193
+
194
  ### How to use for training:
195
 
196
  Instructions and code to replicate the process are provided in the Fine_Tune_XLSR_Wav2Vec2_on_Greek_ASR_with_🤗_Transformers.ipynb notebook.
 
202
  | ----------- | ----------- |
203
  | Training Loss | 0.0287 |
204
  | Validation Loss | 0.6062 |
205
+ | WER on CommonVoice Test *| 0.3092 |
206
  * Reference transcripts were lower-cased and stripped of punctuation and special characters.
207
 
208
  Full metrics log here: