gorkemgoknar committed on
Commit
329f311
1 Parent(s): eefd788

Update README.md

Files changed (1)
  1. README.md +12 -10
README.md CHANGED
@@ -30,7 +30,9 @@ model-index:
# Wav2Vec2-Large-XLSR-53-Turkish

Note: This model was trained with 5 Turkish movies in addition to the Common Voice dataset.
- Although WER is high (50%) per common voice test dataset, testing with voice with background noise and on browser, derived letters are pretty close.
+ Although WER is high (50%) on the Common Voice test dataset, performance on audio from "other sources" seems pretty good.
+
+ The dataset building from csv and merging code can be found at the bottom of this README.

Please try it with your own speech on the right side to see its performance.

@@ -120,7 +122,7 @@ model.to("cuda")

#Note: Not ignoring "'" on this one
#Note: Not ignoring "'" on this one
- chars_to_ignore_regex = '[\\,\\?\\.\\!\\-\\;\\:\\"\\“\\%\\‘\\”\\�\\#\\>\\<\\_\\’\\[\\]\\{\\}]'
+ chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\”\�\#\>\<\_\’\[\]\{\}]'


#resampler = torchaudio.transforms.Resample(48_000, 16_000)
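The torchaudio resampler is left commented out here, and the next hunk's context shows it replaced by an `audio_resampler(batch, new_sample_rate = 16000)` helper whose body is not part of this diff. Below is a minimal sketch of what such a helper typically looks like for XLSR models; the `"path"` column name and the exact body are assumptions, not the author's code:

```python
import torchaudio

# Hedged sketch: the real audio_resampler body is not shown in this diff.
# Assumes each batch row has a "path" column pointing to an audio file.
def audio_resampler(batch, new_sample_rate=16000):
    speech_array, sampling_rate = torchaudio.load(batch["path"])
    # Resample from the file's native rate to 16 kHz, as XLSR expects
    resampler = torchaudio.transforms.Resample(sampling_rate, new_sample_rate)
    batch["speech"] = resampler(speech_array).squeeze().numpy()
    batch["sampling_rate"] = new_sample_rate
    return batch
```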
@@ -151,11 +153,11 @@ def audio_resampler(batch, new_sample_rate = 16000):
def remove_special_characters(batch):

    ##this one comes from subtitles if additional timestamps not processed -> 00:01:01 00:01:01,33
-   batch["sentence"] = re.sub('\\b\\d{2}:\\d{2}:\\d{2}(,+\\d{2})?\\b', ' ', batch["sentence"])
+   batch["sentence"] = re.sub('\b\d{2}:\d{2}:\d{2}(,+\d{2})?\b', ' ', batch["sentence"])
    ##remove all-caps tags in brackets, [AÇIKLAMA] etc; do it before the generic cleanup
-   batch["sentence"] = re.sub('\\[(\\b[A-Z]+\\])', '', batch["sentence"])
+   batch["sentence"] = re.sub('\[(\b[A-Z]+\])', '', batch["sentence"])
    ##replace three dots inside the string with a single dot
-   batch["sentence"] = re.sub("([a-zA-Z]+)\\.\\.\\.", r"\\1.", batch["sentence"])
+   batch["sentence"] = re.sub("([a-zA-Z]+)\.\.\.", r"\1.", batch["sentence"])
    #standard ignore list
    batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower() + " "

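To see what the fixed cleanup steps do end to end, here is a small standalone check; the sample sentence is made up, and the regexes are written as raw strings, which is equivalent to the single-backslash form the commit introduces. Note that `[A-Z]` is ASCII-only, so a bracketed tag containing a letter such as `Ç` (as in `[AÇIKLAMA]`) would not actually match this pattern:

```python
import re

chars_to_ignore_regex = r'[\,\?\.\!\-\;\:\"\“\%\‘\”\�\#\>\<\_\’\[\]\{\}]'

sentence = '00:01:01 00:01:01,33 [MUSIC] Gidiyorum... Tamam!'
# strip subtitle timestamps
sentence = re.sub(r'\b\d{2}:\d{2}:\d{2}(,+\d{2})?\b', ' ', sentence)
# strip bracketed all-caps tags (ASCII caps only)
sentence = re.sub(r'\[(\b[A-Z]+\])', '', sentence)
# collapse a trailing "..." after a word to a single dot
sentence = re.sub(r"([a-zA-Z]+)\.\.\.", r"\1.", sentence)
# apply the generic ignore list and lowercase
sentence = re.sub(chars_to_ignore_regex, '', sentence).lower() + " "
print(sentence)  # -> 'gidiyorum tamam' plus leftover whitespace
```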
@@ -202,10 +204,10 @@ print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"],
## Training


- The Common Voice `train` and `validation` datasets were used for training. `Additional 5 Turkish movies with subtitles` also used for training.
+ The Common Voice `train` and `validation` datasets were used for training. An additional 5 Turkish movies with subtitles were also used for training.
A similar model was used as the base for fine-tuning; the additional audio resampler is in the code above.

- Putting Dataset building from csv and merging/saving as Dataset code below for reference
+ The dataset building from csv and merging/saving code is included below for reference.


```python
@@ -218,7 +220,7 @@ from datasets import Dataset
import csv

#Walk all subdirectories of base_set_path and find csv files
- base_set_path = r'C:\\dataset_extracts'
+ base_set_path = r'C:\dataset_extracts'
csv_files = []
for path, subdirs, files in os.walk(base_set_path):
    for name in files:
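The walk loop is truncated by the diff context right after `for name in files:`. A plausible completion, assuming the intent stated in the comment is simply to collect every `.csv` path (the `endswith` filter is an assumption):

```python
import os

# Hedged completion of the truncated loop above; the filter condition is assumed.
csv_files = []
for path, subdirs, files in os.walk(base_set_path):
    for name in files:
        if name.endswith('.csv'):
            csv_files.append(os.path.join(path, name))
```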
@@ -228,7 +230,7 @@ for path, subdirs, files in os.walk(base_set_path):

def get_dataset_from_csv_file(csvfilename, names=['sentence', 'path']):
    path = Path(csvfilename)
-   csv_delimiter="\\t" ##tab separated, change if something else
+   csv_delimiter="\t" ##tab separated, change if something else

    ##Pandas has a bug reading non-ascii file names, so open with an explicit encoding
    df = pd.read_csv(open(path, 'r', encoding='utf-8'), delimiter=csv_delimiter, header=None, names=names, encoding='utf8')
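`get_dataset_from_csv_file` is also cut off after the `pd.read_csv` call. Given the `from datasets import Dataset` import shown in an earlier hunk header, the function presumably finishes by converting the DataFrame; a minimal sketch of that step, not the author's exact code:

```python
from datasets import Dataset
import pandas as pd

# Hedged sketch: convert the pandas DataFrame read above into a datasets.Dataset.
# The original function body is truncated in this diff, so this is an assumption.
def dataframe_to_dataset(df: pd.DataFrame) -> Dataset:
    return Dataset.from_pandas(df)
```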
@@ -246,7 +248,7 @@ from datasets import concatenate_datasets, load_dataset
from datasets import load_from_disk

# Merge datasets together (from csv files)
- dataset_file_path = ".\\dataset_file"
+ dataset_file_path = ".\dataset_file"
custom_datasets_concat = concatenate_datasets([dset for dset in custom_datasets])

#save this one to disk
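The hunk ends at the `#save this one to disk` comment; the actual save call is outside the diff context. With the `load_from_disk` import shown above, the save/reload pair presumably looks like this sketch:

```python
# Hedged sketch of the save/reload step the comment refers to
# (dataset_file_path as defined above; exact original code not shown in the diff).
custom_datasets_concat.save_to_disk(dataset_file_path)
custom_datasets_concat = load_from_disk(dataset_file_path)
```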
 