lucio committed on
Commit b1be9a4
1 Parent(s): ab6470e

Update README.md

Files changed (1):
  1. README.md +24 -7
README.md CHANGED
@@ -23,7 +23,7 @@ model-index:
     metrics:
     - name: Test WER
       type: wer
-      value: ??
+      value: 47.99
 ---
 
 # Wav2Vec2-Large-XLSR-53-rw
@@ -41,6 +41,7 @@ import torchaudio
 from datasets import load_dataset
 from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
 
+# WARNING! This will download and extract to use about 80GB on disk.
 test_dataset = load_dataset("common_voice", "rw", split="test[:2%]")
 
 processor = Wav2Vec2Processor.from_pretrained("lucio/wav2vec2-large-xlsr-kinyarwanda")
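The new warning is easy to skim past: between the compressed archive and the extracted audio, the Common Voice Kinyarwanda data needs roughly 80GB of disk. A small optional pre-flight check along these lines can save a failed download; it is not part of the model card, and it assumes the Hugging Face datasets cache lives under the home directory (the default).

```python
# Optional pre-flight check (illustrative, not from the commit): confirm the drive holding
# the Hugging Face datasets cache (~/.cache/huggingface by default) has room for the ~80GB
# that the Common Voice "rw" download plus extraction needs.
import os
import shutil

free_gb = shutil.disk_usage(os.path.expanduser("~")).free / 1e9
print(f"Free space on the cache drive: {free_gb:.0f} GB")
if free_gb < 80:
    print("Probably not enough space; free some up or point HF_DATASETS_CACHE at a larger disk.")
```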
@@ -70,10 +71,11 @@ print("Reference:", test_dataset["sentence"][:2])
 
 ## Evaluation
 
-The model can be evaluated as follows on the Kinyarwanda test data of Common Voice.
+The model can be evaluated as follows on the Kinyarwanda test data of Common Voice. Note that to even load the test data, the whole 40GB Kinyarwanda dataset will be downloaded and extracted into another 40GB directory, so you will need that space available on disk (e.g. not possible in the free tier of Google Colab). This script uses the `chunked_wer` function from [pcuenq](https://huggingface.co/pcuenq/wav2vec2-large-xlsr-53-es).
 
 
 ```python
+import jiwer
 import torch
 import torchaudio
 from datasets import load_dataset, load_metric
@@ -87,7 +89,7 @@ processor = Wav2Vec2Processor.from_pretrained("lucio/wav2vec2-large-xlsr-kinyarwanda")
 model = Wav2Vec2ForCTC.from_pretrained("lucio/wav2vec2-large-xlsr-kinyarwanda")
 model.to("cuda")
 
-chars_to_ignore_regex = '[\[\],?.!;:%\'"‘’“”(){}‟ˮ´ʺ″«»/…‽�–-]'
+chars_to_ignore_regex = '[\\[\\],?.!;:%\\'"‘’“”(){}‟ˮ´ʺ″«»/…‽�–-]'
 resampler = torchaudio.transforms.Resample(48_000, 16_000)
 
 # Preprocessing the datasets.
@@ -98,7 +100,7 @@ def speech_file_to_array_fn(batch):
     batch["speech"] = resampler(speech_array).squeeze().numpy()
     return batch
 
-test_dataset = test_dataset.map(speech_file_to_array_fn)
+test_dataset = test_dataset.map(speech_file_to_array_fn, remove_columns=['path'])
 
 # Preprocessing the datasets.
 # We need to read the audio files as arrays
@@ -114,13 +116,28 @@ def evaluate(batch):
 
 result = test_dataset.map(evaluate, batched=True, batch_size=8)
 
-print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"])))
+def chunked_wer(targets, predictions, chunk_size=None):
+    if chunk_size is None: return jiwer.wer(targets, predictions)
+    start = 0
+    end = chunk_size
+    H, S, D, I = 0, 0, 0, 0
+    while start < len(targets):
+        chunk_metrics = jiwer.compute_measures(targets[start:end], predictions[start:end])
+        H = H + chunk_metrics["hits"]
+        S = S + chunk_metrics["substitutions"]
+        D = D + chunk_metrics["deletions"]
+        I = I + chunk_metrics["insertions"]
+        start += chunk_size
+        end += chunk_size
+    return float(S + D + I) / float(H + S + D)
+
+print("WER: {:2f}".format(100 * chunked_wer(result["sentence"], result["pred_strings"], chunk_size=4000)))
 ```
 
-**Test Result**: ?? %
+**Test Result**: 47.99 %
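To make the added `chunked_wer` helper above concrete: it accumulates jiwer's hit, substitution, deletion and insertion counts chunk by chunk instead of aligning the whole test set in one call, and on a small sample it agrees with a direct `jiwer.wer` call. The toy strings below are illustrative only, and `chunked_wer` is assumed to be defined as in the evaluation snippet above.

```python
# Illustrative sanity check (not part of the commit). Assumes chunked_wer() from the
# evaluation snippet above has already been defined in the session.
import jiwer

references = ["one two three four", "five six seven", "eight nine"]
predictions = ["one too three four", "five six seven", "eight nine ten"]

# Direct WER over the whole toy set...
print("jiwer.wer   : {:.4f}".format(jiwer.wer(references, predictions)))
# ...should match the chunked accumulation (here with an artificially small chunk size).
print("chunked_wer : {:.4f}".format(chunked_wer(references, predictions, chunk_size=2)))
```

The chunking only matters for the full Kinyarwanda test set, where computing one global alignment is much heavier than accumulating the counts in slices.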
 
 ## Training
 
 The Common Voice `validation` dataset was used for training, with 12% of the test dataset used for validation, trained on 1 V100 GPU for 48 hours (20 epochs).
 
-The script used for training was just the `run_finetuning.py` script provided in OVHcloud's databuzzword/hf-wav2vec image.
+The script used for training was just the `run_finetuning.py` script provided in OVHcloud's `databuzzword/hf-wav2vec` image.
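The training paragraph is terse about how the data was split, so here is one way the described setup could be expressed with the `datasets` slicing API. This is a sketch of the split only, not the actual `run_finetuning.py` from the `databuzzword/hf-wav2vec` image (which is not shown in this commit), and reading "12% of the test dataset" as the first 12% of the split is an assumption.

```python
# Sketch only (not the training script): reproduce the data split described above,
# using the Common Voice "rw" validation split as training data and 12% of the test
# split as a held-out validation set. Beware: this triggers the same ~80GB download.
from datasets import load_dataset

train_data = load_dataset("common_voice", "rw", split="validation")
eval_data = load_dataset("common_voice", "rw", split="test[:12%]")

print(len(train_data), "training examples,", len(eval_data), "validation examples")
```

Whether the original script took the first 12% or a random 12% is not stated; `train_test_split(test_size=0.12)` on the test split would be the shuffled alternative.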