Nhut committed on
Commit 2e04e40
1 Parent(s): cbbf4b8

Update README.md

Files changed (1): README.md (+13 -34)
README.md

---
language: vi
datasets:
- common_voice
- FOSD: https://data.mendeley.com/datasets/k9sxg2twv4/4
- VIVOS: https://ailab.hcmus.edu.vn/vivos
metrics:
- wer
tags:
...
model-index:
...
      type: wer
      value: 52.48
---
# Wav2Vec2-Large-XLSR-53-Vietnamese

Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) on Vietnamese using the [Common Voice](https://huggingface.co/datasets/common_voice), [FOSD](https://data.mendeley.com/datasets/k9sxg2twv4/4) and [VIVOS](https://ailab.hcmus.edu.vn/vivos) datasets.

When using this model, make sure that your speech input is sampled at 16 kHz.
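If your recordings are at a different rate, resample them before running the model. A minimal sketch with torchaudio, assuming a hypothetical local file `my_recording.wav`:

```python
import torchaudio

# Load the waveform together with its native sampling rate.
speech_array, sampling_rate = torchaudio.load("my_recording.wav")  # hypothetical path

# Resample from whatever the source rate is to the 16 kHz the model expects.
resampler = torchaudio.transforms.Resample(orig_freq=sampling_rate, new_freq=16_000)
speech_16k = resampler(speech_array).squeeze().numpy()
```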
## Usage

The model can be used directly (without a language model) as follows:

```python
import torch
import torchaudio
from datasets import load_dataset
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# Character-sequence mapping; decode_string (defined below) reverses these
# replacements on the model output.
ENCODER = {
    "ia ": "iê ",
    "ìa ": "iề ",
    # ... (remaining ENCODER entries elided)
}

def decode_string(x):
    for k, v in list(reversed(list(ENCODER.items()))):
        x = x.replace(v, k)
    return x

test_dataset = load_dataset("common_voice", "vi", split="test[:2%]")

processor = Wav2Vec2Processor.from_pretrained("Nhut/wav2vec2-large-xlsr-vietnamese")
model = Wav2Vec2ForCTC.from_pretrained("Nhut/wav2vec2-large-xlsr-vietnamese")
resampler = torchaudio.transforms.Resample(48_000, 16_000)

# Preprocessing the datasets.
# We need to read the audio files as arrays.
def speech_file_to_array_fn(batch):
    speech_array, sampling_rate = torchaudio.load(batch["path"])
    batch["speech"] = resampler(speech_array).squeeze().numpy()
    return batch

test_dataset = test_dataset.map(speech_file_to_array_fn)
inputs = processor(test_dataset["speech"][:2], sampling_rate=16_000, return_tensors="pt", padding=True)

with torch.no_grad():
    logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits

predicted_ids = torch.argmax(logits, dim=-1)

print("Prediction:", [decode_string(x) for x in processor.batch_decode(predicted_ids)])
print("Reference:", test_dataset["sentence"][:2])
```
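Alternatively, depending on your transformers version, the same checkpoint can be wrapped in an automatic-speech-recognition pipeline. Note that the pipeline does not apply the `decode_string` post-processing above, so you would still call it on the returned text. A sketch, assuming a hypothetical local file `my_recording.wav`:

```python
from transformers import pipeline

# The pipeline bundles feature extraction, the model, and CTC decoding.
asr = pipeline("automatic-speech-recognition", model="Nhut/wav2vec2-large-xlsr-vietnamese")

output = asr("my_recording.wav")  # hypothetical path
print(decode_string(output["text"]))  # decode_string as defined in the snippet above
```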
## Evaluation

The model can be evaluated as follows on the Vietnamese test data of Common Voice.

```python
import torch
import torchaudio
from datasets import load_dataset, load_metric
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import re

ENCODER = {
    "ia ": "iê ",
    "ìa ": "iề ",
    # ... (remaining ENCODER entries elided)
}

def decode_string(x):
    for k, v in list(reversed(list(ENCODER.items()))):
        x = x.replace(v, k)
    return x

test_dataset = load_dataset("common_voice", "vi", split="test")
wer = load_metric("wer")

MODEL = "Nhut/wav2vec2-large-xlsr-vietnamese"
processor = Wav2Vec2Processor.from_pretrained(MODEL)
model = Wav2Vec2ForCTC.from_pretrained(MODEL)
model.to("cuda")

chars_to_ignore_regex = '[\\\+\@\ǀ\,\?\.\!\-\;\:\"\“\%\‘\”\�]'
resampler = torchaudio.transforms.Resample(48_000, 16_000)

# Preprocessing the datasets.
# We need to read the audio files as arrays and normalize the reference sentences.
def speech_file_to_array_fn(batch):
    batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower()
    speech_array, sampling_rate = torchaudio.load(batch["path"])
    batch["speech"] = resampler(speech_array).squeeze().numpy()
    return batch

test_dataset = test_dataset.map(speech_file_to_array_fn)

# Run batched inference over the test set.
def evaluate(batch):
    inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)

    with torch.no_grad():
        logits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits

    pred_ids = torch.argmax(logits, dim=-1)
    batch["pred_strings"] = processor.batch_decode(pred_ids)
    # decode_string: replace the encoded letters with the initial letters.
    batch["pred_strings"] = [decode_string(x) for x in batch["pred_strings"]]
    return batch

result = test_dataset.map(evaluate, batched=True, batch_size=8)
print("WER: {:.2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"])))
```
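For intuition about the metric, `load_metric("wer")` returns the word error rate as a fraction of reference words. A minimal toy example (illustrative strings only):

```python
from datasets import load_metric

wer = load_metric("wer")
# One word missing from a three-word reference -> WER = 1/3.
print(wer.compute(predictions=["xin chào"], references=["xin chào bạn"]))  # ~0.33
```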
**Test Result**: 52.48 %

## Training

The Common Voice `train` and `validation` splits, together with the FOSD and VIVOS datasets, were used for training.

The script used for training can be found [here](https://colab.research.google.com/drive/11pP4uVJj4SYZTzGjlCUtOHywlhYqs0cPx).