gorkemgoknar committed on
Commit
0dd7f50
1 Parent(s): 96e034b

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +6 -6
README.md CHANGED
@@ -120,7 +120,8 @@ model.to("cuda")
120
 
121
  #Note: Not ignoring "'" on this one
122
  #Note: Not ignoring "'" on this one
123
- chars_to_ignore_regex = """[\\,\\?\\.\\!\\-\\;\\:\\"\\“\\%\\‘\\”\\�\\#\\>\\<\\_\\’\\[\\]\\{\\}]"""
 
124
 
125
  #resampler = torchaudio.transforms.Resample(48_000, 16_000)
126
  #using custom load and transformer for audio -> see audio_resampler
@@ -151,23 +152,22 @@ def audio_resampler(batch, new_sample_rate = 16000):
151
 
152
  return batch
153
 
154
-
155
  def remove_special_characters(batch):
156
 
157
  ##this one comes from subtitles if additional timestamps not processed -> 00:01:01 00:01:01,33
158
- batch["sentence"] = re.sub('\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\b\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\d{2}:\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\d{2}:\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\d{2}(,+\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\d{2})?\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\b', ' ', batch["sentence"])
159
 
160
  ##remove all caps in text [AÇIKLAMA] etc, do it before..
161
- batch["sentence"] = re.sub('\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\[(\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\b[A-Z]+\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\])', '', batch["sentence"])
162
 
163
  ##replace three dots (that are inside a string) with a single dot
164
- batch["sentence"] = re.sub("([a-zA-Z]+)\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\.\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\.\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\.", r"\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\1.", batch["sentence"])
165
 
166
  #standard ignore list
167
  batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower() + " "
168
-
169
 
170
  return batch
 
171
 
172
  # Preprocessing the datasets.
173
  # We need to read the audio files as arrays
 
120
 
121
  #Note: Not ignoring "'" on this one
122
  #Note: Not ignoring "'" on this one
123
+ chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\”\�\#\>\<\_\’\[\]\{\}]'
124
+
125
 
126
  #resampler = torchaudio.transforms.Resample(48_000, 16_000)
127
  #using custom load and transformer for audio -> see audio_resampler
 
152
 
153
  return batch
154
 
 
155
  def remove_special_characters(batch):
156
 
157
  ##this one comes from subtitles if additional timestamps not processed -> 00:01:01 00:01:01,33
158
+ batch["sentence"] = re.sub('\b\d{2}:\d{2}:\d{2}(,+\d{2})?\b', ' ', batch["sentence"])
159
 
160
  ##remove all caps in text [AÇIKLAMA] etc, do it before..
161
+ batch["sentence"] = re.sub('\[(\b[A-Z]+\])', '', batch["sentence"])
162
 
163
  ##replace three dots (that are inside a string) with a single dot
164
+ batch["sentence"] = re.sub("([a-zA-Z]+)\.\.\.", r"\1.", batch["sentence"])
165
 
166
  #standard ignore list
167
  batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower() + " "
 
168
 
169
  return batch
170
+
171
 
172
  # Preprocessing the datasets.
173
  # We need to read the audio files as arrays