arampacha
/

wav2vec2-large-xlsr-ukrainian

@@ -45,14 +45,13 @@ test_dataset = load_dataset("common_voice", "uk", split="test[:2%]")
 processor = Wav2Vec2Processor.from_pretrained("arampacha/wav2vec2-large-xlsr-ukrainian")
 model = Wav2Vec2ForCTC.from_pretrained("arampacha/wav2vec2-large-xlsr-ukrainian")
-resampler = torchaudio.transforms.Resample(48_000, 16_000)
 # Preprocessing the datasets.
 # We need to read the audio files as arrays
 def speech_file_to_array_fn(batch):
     speech_array, sampling_rate = torchaudio.load(batch["path"])
-    batch["speech"] = resampler(speech_array).squeeze().numpy()
     return batch
 test_dataset = test_dataset.map(speech_file_to_array_fn)
@@ -92,13 +91,18 @@ chars_to_ignore_regex = f'[{"".join(chars_to_ignore)}]'
 resampler = torchaudio.transforms.Resample(48_000, 16_000)
 # Preprocessing the datasets.
-# We need to read the aduio files as arrays
 def speech_file_to_array_fn(batch):
-    batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower().strip()
     batch["sentence"] = re.sub(re.compile('i'), 'і', batch['sentence'])
     batch['sentence'] = re.sub('  ', ' ', batch['sentence'])
     speech_array, sampling_rate = torchaudio.load(batch["path"])
-    batch["speech"] = resampler(speech_array).squeeze().numpy()
     return batch
 test_dataset = test_dataset.map(speech_file_to_array_fn)

 processor = Wav2Vec2Processor.from_pretrained("arampacha/wav2vec2-large-xlsr-ukrainian")
 model = Wav2Vec2ForCTC.from_pretrained("arampacha/wav2vec2-large-xlsr-ukrainian")
 # Preprocessing the datasets.
 # We need to read the audio files as arrays
 def speech_file_to_array_fn(batch):
     speech_array, sampling_rate = torchaudio.load(batch["path"])
+    batch["speech"] = torchaudio.transforms.Resample(sampling_rate, 16_000)(speech_array).squeeze().numpy()
     return batch
 test_dataset = test_dataset.map(speech_file_to_array_fn)
 resampler = torchaudio.transforms.Resample(48_000, 16_000)
 # Preprocessing the datasets.
+# We need to read the audio files as arrays and normalize characters
 def speech_file_to_array_fn(batch):
+    batch["sentence"] = re.sub(re.compile(chars_to_ignore_regex), '', batch["sentence"]).lower().strip()
     batch["sentence"] = re.sub(re.compile('i'), 'і', batch['sentence'])
+    batch["sentence"] = re.sub(re.compile('o'), 'о', batch['sentence'])
+    batch["sentence"] = re.sub(re.compile('a'), 'а', batch['sentence'])
+    batch["sentence"] = re.sub(re.compile('ы'), 'и', batch['sentence'])
+    batch["sentence"] = re.sub(re.compile("['`]"), '’', batch['sentence'])
+    batch["sentence"] = re.sub(re.compile("–"), '', batch['sentence'])
     batch['sentence'] = re.sub('  ', ' ', batch['sentence'])
     speech_array, sampling_rate = torchaudio.load(batch["path"])
+    batch["speech"] = torchaudio.transforms.Resample(sampling_rate, 16_000)(speech_array).squeeze().numpy()
     return batch
 test_dataset = test_dataset.map(speech_file_to_array_fn)

pytorch_model.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:797a5b497ee33bf44024d2596e6bebf6591ba15e6a3d18df7e147bb1ed92d1be
 size 1262118359

 version https://git-lfs.github.com/spec/v1
+oid sha256:045342a8ae4fc38f2e579a7d22a977235dc905eecb3eae4004b38d1a45835fbb
 size 1262118359