lucio commited on
Commit
1edb526
1 Parent(s): f8903fb

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +17 -18
README.md CHANGED
@@ -23,7 +23,7 @@ model-index:
23
  metrics:
24
  - name: Test WER
25
  type: wer
26
- value: 48.47 #TODO (IMPORTANT): replace {wer_result_on_test} with the WER error rate you achieved on the common_voice test set. It should be in the format XX.XX (don't add the % sign here). **Please** remember to fill out this value after you evaluated your model, so that your model appears on the leaderboard. If you fill out this model card before evaluating your model, please remember to edit the model card afterward to fill in your value
27
  ---
28
 
29
  # Wav2Vec2-Large-XLSR-53-lg
@@ -51,15 +51,15 @@ resampler = torchaudio.transforms.Resample(48_000, 16_000)
51
  # Preprocessing the datasets.
52
  # We need to read the audio files as arrays
53
  def speech_file_to_array_fn(batch):
54
- speech_array, sampling_rate = torchaudio.load(batch["path"])
55
- batch["speech"] = resampler(speech_array).squeeze().numpy()
56
- return batch
57
 
58
  test_dataset = test_dataset.map(speech_file_to_array_fn)
59
  inputs = processor(test_dataset["speech"][:2], sampling_rate=16_000, return_tensors="pt", padding=True)
60
 
61
  with torch.no_grad():
62
- logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
63
 
64
  predicted_ids = torch.argmax(logits, dim=-1)
65
 
@@ -87,38 +87,37 @@ processor = Wav2Vec2Processor.from_pretrained("lucio/wav2vec2-large-xlsr-luganda
87
  model = Wav2Vec2ForCTC.from_pretrained("lucio/wav2vec2-large-xlsr-luganda")
88
  model.to("cuda")
89
 
90
- chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“]'
91
  resampler = torchaudio.transforms.Resample(48_000, 16_000)
92
 
93
  # Preprocessing the datasets.
94
  # We need to read the audio files as arrays
95
  def speech_file_to_array_fn(batch):
96
- batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower()
97
- speech_array, sampling_rate = torchaudio.load(batch["path"])
98
- batch["speech"] = resampler(speech_array).squeeze().numpy()
99
- return batch
100
 
101
  test_dataset = test_dataset.map(speech_file_to_array_fn)
102
 
103
  # Preprocessing the datasets.
104
  # We need to read the audio files as arrays
105
  def evaluate(batch):
106
- inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
107
 
108
- with torch.no_grad():
109
- logits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits
110
 
111
- pred_ids = torch.argmax(logits, dim=-1)
112
- batch["pred_strings"] = processor.batch_decode(pred_ids)
113
- return batch
114
 
115
  result = test_dataset.map(evaluate, batched=True, batch_size=8)
116
 
117
  print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"])))
118
  ```
119
 
120
- **Test Result**: 48.47 % # TODO: write output of print here. IMPORTANT: Please remember to also replace {wer_result_on_test} at the top of with this value here. tags.
121
-
122
 
123
  ## Training
124
 
 
23
  metrics:
24
  - name: Test WER
25
  type: wer
26
+ value: 48.47
27
  ---
28
 
29
  # Wav2Vec2-Large-XLSR-53-lg
 
51
  # Preprocessing the datasets.
52
  # We need to read the audio files as arrays
53
  def speech_file_to_array_fn(batch):
54
+ \tspeech_array, sampling_rate = torchaudio.load(batch["path"])
55
+ \tbatch["speech"] = resampler(speech_array).squeeze().numpy()
56
+ \treturn batch
57
 
58
  test_dataset = test_dataset.map(speech_file_to_array_fn)
59
  inputs = processor(test_dataset["speech"][:2], sampling_rate=16_000, return_tensors="pt", padding=True)
60
 
61
  with torch.no_grad():
62
+ \tlogits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
63
 
64
  predicted_ids = torch.argmax(logits, dim=-1)
65
 
 
87
  model = Wav2Vec2ForCTC.from_pretrained("lucio/wav2vec2-large-xlsr-luganda")
88
  model.to("cuda")
89
 
90
+ chars_to_ignore_regex = '[\\,\\?\\.\\!\\-\\;\\:\\"\\“]'
91
  resampler = torchaudio.transforms.Resample(48_000, 16_000)
92
 
93
  # Preprocessing the datasets.
94
  # We need to read the audio files as arrays
95
  def speech_file_to_array_fn(batch):
96
+ \tbatch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower()
97
+ \tspeech_array, sampling_rate = torchaudio.load(batch["path"])
98
+ \tbatch["speech"] = resampler(speech_array).squeeze().numpy()
99
+ \treturn batch
100
 
101
  test_dataset = test_dataset.map(speech_file_to_array_fn)
102
 
103
  # Preprocessing the datasets.
104
  # We need to read the audio files as arrays
105
  def evaluate(batch):
106
+ \tinputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
107
 
108
+ \twith torch.no_grad():
109
+ \t\tlogits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits
110
 
111
+ \tpred_ids = torch.argmax(logits, dim=-1)
112
+ \tbatch["pred_strings"] = processor.batch_decode(pred_ids)
113
+ \treturn batch
114
 
115
  result = test_dataset.map(evaluate, batched=True, batch_size=8)
116
 
117
  print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"])))
118
  ```
119
 
120
+ **Test Result**: 48.47 %
 
121
 
122
  ## Training
123