marcel committed on
Commit
48df8f4
1 Parent(s): 032618e

added character substitution

Browse files
Files changed (1) hide show
  1. README.md +27 -4
README.md CHANGED
@@ -22,7 +22,7 @@ model-index:
22
  metrics:
23
  - name: Test WER
24
  type: wer
25
- value: 29.48
26
  ---
27
 
28
  # Wav2Vec2-Large-XLSR-53-German
@@ -88,14 +88,37 @@ model = Wav2Vec2ForCTC.from_pretrained("de")
88
  `elgeish/wav2vec2-large-xlsr-53-arabic`
89
  model.to("cuda")
90
 
91
- chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\%\”\�\カ\æ\無\ན\カ\臣\ѹ\…\«\»\ð\ı\„\幺\א\ב\比\ш\ע\)\ứ\в\œ\ч\+\—\ш\‚\נ\м\ń\乡\$\=\ש\ф\支\(\°\и\к\̇]'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
  resampler = torchaudio.transforms.Resample(48_000, 16_000)
93
 
94
  # Preprocessing the datasets.
95
  # We need to read the audio files as arrays
96
  def speech_file_to_array_fn(batch):
97
  \tbatch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower()
98
- \tbatch["sentence"] = re.sub('\ß', 'ss', batch["sentence"])
 
 
99
  \tspeech_array, sampling_rate = torchaudio.load(batch["path"])
100
  \tbatch["speech"] = resampler(speech_array).squeeze().numpy()
101
  \treturn batch
@@ -119,7 +142,7 @@ result = test_dataset.map(evaluate, batched=True, batch_size=8)
119
  print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"])))
120
  ```
121
 
122
- **Test Result**: 29.48 %
123
 
124
 
125
  ## Training
 
22
  metrics:
23
  - name: Test WER
24
  type: wer
25
+ value: 29.35
26
  ---
27
 
28
  # Wav2Vec2-Large-XLSR-53-German
 
88
  `elgeish/wav2vec2-large-xlsr-53-arabic`
89
  model.to("cuda")
90
 
91
+ chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\%\”\�\カ\æ\無\ན\カ\臣\ѹ\…\«\»\ð\ı\„\幺\א\ב\比\ш\ע\)\ứ\в\œ\ч\+\—\ш\‚\נ\м\ń\乡\$\=\ש\ф\支\(\°\и\к\̇]'
92
+ substitutions = {
93
+ \t'e' : '[\ə\é\ě\ę\ê\ế\ế\ë\ė\е]',
94
+ \t'o' : '[\ō\ô\ô\ó\ò\ø\ọ\ŏ\õ\ő\о]',
95
+ \t'a' : '[\á\ā\ā\ă\ã\å\â\à\ą\а]',
96
+ \t'c' : '[\č\ć\ç\с]',
97
+ \t'l' : '[\ł]',
98
+ \t'u' : '[\ú\ū\ứ\ů]',
99
+ \t'und' : '[\&]',
100
+ \t'r' : '[\ř]',
101
+ \t'y' : '[\ý]',
102
+ \t's' : '[\ś\š\ș\ş]',
103
+ \t'i' : '[\ī\ǐ\í\ï\î\ï]',
104
+ \t'z' : '[\ź\ž\ź\ż]',
105
+ \t'n' : '[\ñ\ń\ņ]',
106
+ \t'g' : '[\ğ]',
107
+ \t'ss' : '[\ß]',
108
+ \t't' : '[\ț\ť]',
109
+ \t'd' : '[\ď\đ]',
110
+ \t"'": '[\ʿ\་\’\`\´\ʻ\`\‘]',
111
+ \t'p': '\р'
112
+ }
113
  resampler = torchaudio.transforms.Resample(48_000, 16_000)
114
 
115
  # Preprocessing the datasets.
116
  # We need to read the audio files as arrays
117
  def speech_file_to_array_fn(batch):
118
  \tbatch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower()
119
+ \tfor x in substitutions:
120
+ \t\tbatch["sentence"] = re.sub(substitutions[x], x, batch["sentence"])
121
+ \t\tspeech_array, sampling_rate = torchaudio.load(batch["path"])
122
  \tspeech_array, sampling_rate = torchaudio.load(batch["path"])
123
  \tbatch["speech"] = resampler(speech_array).squeeze().numpy()
124
  \treturn batch
 
142
  print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"])))
143
  ```
144
 
145
+ **Test Result**: 29.35 %
146
 
147
 
148
  ## Training