gorkemgoknar committed
Commit
eefd788
1 Parent(s): c53aa1d

Update README.md

Files changed (1):
  1. README.md +8 -8
README.md CHANGED
@@ -30,7 +30,7 @@ model-index:
 # Wav2Vec2-Large-XLSR-53-Turkish
 
 Note: This model was trained with 5 Turkish movies in addition to the Common Voice dataset.
-Although WER is high (50%) per common voice test dataset, its recognition (with some letter errors) seems better.
+Although WER is high (50%) on the Common Voice test set, in browser tests with speech over background noise the transcribed letters come out quite close.
 
 Please try speech yourself on the right side to see its performance.
 
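For reference, a WER figure like the 50% quoted above is typically produced with the standard XLSR evaluation recipe. A minimal sketch, assuming this repo's model id and the usual Common Voice test split (not necessarily the exact script behind the reported number):

```python
# Sketch of a standard WER evaluation; assumed setup, mirrors the common
# XLSR fine-tuning week recipe rather than this card's exact script.
import torch
import torchaudio
from datasets import load_dataset, load_metric
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

model_id = "gorkemgoknar/wav2vec2-large-xlsr-53-turkish"
processor = Wav2Vec2Processor.from_pretrained(model_id)
model = Wav2Vec2ForCTC.from_pretrained(model_id).to("cuda")

test_set = load_dataset("common_voice", "tr", split="test")
resampler = torchaudio.transforms.Resample(48_000, 16_000)  # Common Voice audio is 48 kHz

def predict(batch):
    speech, _ = torchaudio.load(batch["path"])
    inputs = processor(resampler(speech).squeeze().numpy(), sampling_rate=16_000,
                       return_tensors="pt", padding=True)
    with torch.no_grad():
        logits = model(inputs.input_values.to("cuda")).logits
    batch["pred"] = processor.batch_decode(torch.argmax(logits, dim=-1))[0]
    return batch

test_set = test_set.map(predict)
wer = load_metric("wer")
print("WER:", wer.compute(predictions=test_set["pred"], references=test_set["sentence"]))
```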
@@ -120,7 +120,7 @@ model.to("cuda")
 
 #Note: Not ignoring "'" on this one
-chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\”\�\#\>\<\_\’\[\]\{\}]'
+chars_to_ignore_regex = '[\\,\\?\\.\\!\\-\\;\\:\\"\\“\\%\\‘\\”\\�\\#\\>\\<\\_\\’\\[\\]\\{\\}]'
 
 
 #resampler = torchaudio.transforms.Resample(48_000, 16_000)
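Doubling the backslashes above mainly affects how the character class renders in markdown; functionally it is still the usual punctuation-stripping regex, and as the comment notes the straight apostrophe is deliberately absent from it. A quick illustration on a hypothetical sentence:

```python
import re

chars_to_ignore_regex = '[\\,\\?\\.\\!\\-\\;\\:\\"\\“\\%\\‘\\”\\�\\#\\>\\<\\_\\’\\[\\]\\{\\}]'

# hypothetical Turkish sample; punctuation goes, the apostrophe stays
print(re.sub(chars_to_ignore_regex, '', 'Hayır, bu olamaz! Ali\'nin "sesi" mi?').lower())
# -> hayır bu olamaz ali'nin sesi mi
```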
@@ -151,11 +151,11 @@ def audio_resampler(batch, new_sample_rate = 16000):
 def remove_special_characters(batch):
 
     ##timestamps left over from subtitles when not processed beforehand -> 00:01:01 00:01:01,33
-    batch["sentence"] = re.sub('\b\d{2}:\d{2}:\d{2}(,+\d{2})?\b', ' ', batch["sentence"])
+    batch["sentence"] = re.sub('\\b\\d{2}:\\d{2}:\\d{2}(,+\\d{2})?\\b', ' ', batch["sentence"])
     ##remove all-caps bracketed text, [AÇIKLAMA] etc.; do it before the generic ignore list
-    batch["sentence"] = re.sub('\[(\b[A-Z]+\])', '', batch["sentence"])
+    batch["sentence"] = re.sub('\\[(\\b[A-Z]+\\])', '', batch["sentence"])
     ##replace three dots inside a string with a single dot
-    batch["sentence"] = re.sub("([a-zA-Z]+)\.\.\.", r"\1.", batch["sentence"])
+    batch["sentence"] = re.sub("([a-zA-Z]+)\\.\\.\\.", r"\\1.", batch["sentence"])
     #standard ignore list
     batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower() + " "
 
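A worked example of what this cleaner does to one raw subtitle line. The sample line is hypothetical, and the backreference is written in its plain form `r"\1."` here so the snippet runs as-is:

```python
import re

chars_to_ignore_regex = '[\\,\\?\\.\\!\\-\\;\\:\\"\\“\\%\\‘\\”\\�\\#\\>\\<\\_\\’\\[\\]\\{\\}]'

batch = {"sentence": '00:01:01,33 [ALARM] Bekle... Geliyorum!'}  # hypothetical raw line

# leftover subtitle timestamp -> replaced by a space
batch["sentence"] = re.sub('\\b\\d{2}:\\d{2}:\\d{2}(,+\\d{2})?\\b', ' ', batch["sentence"])
# bracketed all-caps caption -> removed
batch["sentence"] = re.sub('\\[(\\b[A-Z]+\\])', '', batch["sentence"])
# trailing ellipsis after a word -> single dot
batch["sentence"] = re.sub("([a-zA-Z]+)\\.\\.\\.", r"\1.", batch["sentence"])
# generic punctuation strip + lowercase + trailing space
batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower() + " "

print(repr(batch["sentence"]))  # '   bekle geliyorum '
```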
@@ -218,7 +218,7 @@ from datasets import Dataset
 import csv
 
 #Walk all subdirectories of base_set_path and find csv files
-base_set_path = r'C:\dataset_extracts'
+base_set_path = r'C:\\dataset_extracts'
 csv_files = []
 for path, subdirs, files in os.walk(base_set_path):
     for name in files:
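The hunk cuts off at the inner loop; a minimal, assumed completion that collects the full csv paths would be:

```python
import os

base_set_path = r'C:\dataset_extracts'  # single-backslash form; the README doubles it for display
csv_files = []
for path, subdirs, files in os.walk(base_set_path):
    for name in files:
        # assumed loop body: keep only .csv files, with their full paths
        if name.lower().endswith('.csv'):
            csv_files.append(os.path.join(path, name))
```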
@@ -228,7 +228,7 @@ for path, subdirs, files in os.walk(base_set_path):
 
 def get_dataset_from_csv_file(csvfilename, names=['sentence', 'path']):
     path = Path(csvfilename)
-    csv_delimiter="\t" ##tab separated, change if something else
+    csv_delimiter="\\t" ##tab separated, change if something else
 
     ##Pandas has a bug reading non-ascii file names; make sure to use open with encoding
     df=pd.read_csv(open(path, 'r', encoding='utf-8'), delimiter=csv_delimiter, header=None, names=names, encoding='utf8')
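The helper is truncated by the hunk; assuming it ends by wrapping the DataFrame as a `datasets.Dataset`, a self-contained sketch (the redundant second `encoding` argument is dropped, and a real tab is passed as the delimiter):

```python
import pandas as pd
from pathlib import Path
from datasets import Dataset

def get_dataset_from_csv_file(csvfilename, names=['sentence', 'path']):
    path = Path(csvfilename)
    csv_delimiter = "\t"  # tab separated, change if something else

    # pandas can choke on non-ascii file names, so hand it an opened handle
    df = pd.read_csv(open(path, 'r', encoding='utf-8'),
                     delimiter=csv_delimiter, header=None, names=names)
    # assumed final step: wrap the frame as a Hugging Face Dataset
    return Dataset.from_pandas(df)
```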
@@ -246,7 +246,7 @@ from datasets import concatenate_datasets, load_dataset
 from datasets import load_from_disk
 
 # Merge datasets together (from csv files)
-dataset_file_path = ".\dataset_file"
+dataset_file_path = ".\\dataset_file"
 custom_datasets_concat = concatenate_datasets( [dset for dset in custom_datasets] )
 
 #save this one to disk
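For completeness, the merge-and-save step the hunk trails into, sketched end to end (the `custom_datasets` list is assumed to come from the helper above):

```python
from datasets import concatenate_datasets, load_from_disk

dataset_file_path = ".\\dataset_file"  # Windows-style relative path, as in the README

# assumed: one Dataset per csv file, built with the helper sketched earlier
custom_datasets = [get_dataset_from_csv_file(f) for f in csv_files]
custom_datasets_concat = concatenate_datasets(custom_datasets)

# save to disk so later runs can skip the csv parsing
custom_datasets_concat.save_to_disk(dataset_file_path)

# reload later with:
# custom_dataset = load_from_disk(dataset_file_path)
```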