gorkemgoknar
/

wav2vec2-large-xlsr-53-turkish

Automatic Speech Recognition

xlsr-fine-tuning-week

Inference Endpoints

Model card Files Files and versions Community

gorkemgoknar committed on Mar 28, 2021

Commit

0dd7f50

•

1 Parent(s): 96e034b

Update README.md

Files changed (1) hide show

README.md +6 -6

README.md CHANGED Viewed

@@ -120,7 +120,8 @@ model.to("cuda")
 #Note: Not ignoring "'"  on this one
 #Note: Not ignoring "'"  on this one
-chars_to_ignore_regex = """[\\,\\?\\.\\!\\-\\;\\:\\"\\“\\%\\‘\\”\\�\\#\\>\\<\\_\\’\\[\\]\\{\\}]"""
 #resampler = torchaudio.transforms.Resample(48_000, 16_000)
 #using custom load and transformer for audio  -> see audio_resampler
@@ -151,23 +152,22 @@ def audio_resampler(batch, new_sample_rate = 16000):
     return batch
 def remove_special_characters(batch):
     ##this one comes from subtitles if additional timestamps not processed  -> 00:01:01   00:01:01,33
-    batch["sentence"] = re.sub('\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\b\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\d{2}:\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\d{2}:\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\d{2}(,+\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\d{2})?\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\b', ' ', batch["sentence"])
     ##remove all caps in text [AÇIKLAMA] etc, do it before..
-    batch["sentence"] = re.sub('\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\[(\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\b[A-Z]+\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\])', '', batch["sentence"])
     ##replace three dots (that follow a word inside the string) with a single dot
-    batch["sentence"] = re.sub("([a-zA-Z]+)\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\.\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\.\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\.", r"\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\1.", batch["sentence"])
     #standard ignore list
     batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower() + " "
     return batch
 # Preprocessing the datasets.
 # We need to read the audio files as arrays

 #Note: Not ignoring "'"  on this one
 #Note: Not ignoring "'"  on this one
+chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\”\�\#\>\<\_\’\[\]\{\}]'
 #resampler = torchaudio.transforms.Resample(48_000, 16_000)
 #using custom load and transformer for audio  -> see audio_resampler
     return batch
 def remove_special_characters(batch):
     ##this one comes from subtitles if additional timestamps not processed  -> 00:01:01   00:01:01,33
+    batch["sentence"] = re.sub('\b\d{2}:\d{2}:\d{2}(,+\d{2})?\b', ' ', batch["sentence"])
     ##remove all caps in text [AÇIKLAMA] etc, do it before..
+    batch["sentence"] = re.sub('\[(\b[A-Z]+\])', '', batch["sentence"])
     ##replace three dots (that follow a word inside the string) with a single dot
+    batch["sentence"] = re.sub("([a-zA-Z]+)\.\.\.", r"\1.", batch["sentence"])
     #standard ignore list
     batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower() + " "
     return batch
 # Preprocessing the datasets.
 # We need to read the audio files as arrays