wietsedv committed on
Commit
6e8c169
•
1 Parent(s): f6127ab

Fix character whitelist

Browse files
Files changed (2) hide show
  1. README.md +3 -3
  2. tokenizer_config.json +8 -1
README.md CHANGED
@@ -23,7 +23,7 @@ model-index:
23
  metrics:
24
  - name: Test WER
25
  type: wer
26
- value: 17.47
27
  ---
28
 
29
  # Wav2Vec2-Large-XLSR-53-Dutch
@@ -87,7 +87,7 @@ processor = Wav2Vec2Processor.from_pretrained("wietsedv/wav2vec2-large-xlsr-53-f
87
  model = Wav2Vec2ForCTC.from_pretrained("wietsedv/wav2vec2-large-xlsr-53-frisian")
88
  model.to("cuda")
89
 
90
- chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\”]'
91
  resampler = torchaudio.transforms.Resample(48_000, 16_000)
92
 
93
  # Preprocessing the datasets.
@@ -117,7 +117,7 @@ result = test_dataset.map(evaluate, batched=True, batch_size=8)
117
  print("WER: {:.2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"])))
118
  ```
119
 
120
- **Test Result**: 17.47 %
121
 
122
 
123
  ## Training
23
  metrics:
24
  - name: Test WER
25
  type: wer
26
+ value: 16.25
27
  ---
28
 
29
  # Wav2Vec2-Large-XLSR-53-Dutch
87
  model = Wav2Vec2ForCTC.from_pretrained("wietsedv/wav2vec2-large-xlsr-53-frisian")
88
  model.to("cuda")
89
 
90
+ chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\'\“\%\‘\”]'
91
  resampler = torchaudio.transforms.Resample(48_000, 16_000)
92
 
93
  # Preprocessing the datasets.
117
  print("WER: {:.2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"])))
118
  ```
119
 
120
+ **Test Result**: 16.25 %
121
 
122
 
123
  ## Training
tokenizer_config.json CHANGED
@@ -1 +1,8 @@
1
- {"unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "<pad>", "do_lower_case": false, "word_delimiter_token": "|"}
 
 
 
 
 
 
 
1
+ {
2
+ "unk_token": "<unk>",
3
+ "bos_token": "<s>",
4
+ "eos_token": "</s>",
5
+ "pad_token": "<pad>",
6
+ "do_lower_case": true,
7
+ "word_delimiter_token": "|"
8
+ }