gorkemgoknar committed on
Commit
329f311
1 Parent(s): eefd788

Update README.md

Files changed (1)
  1. README.md +12 -10
README.md CHANGED
@@ -30,7 +30,9 @@ model-index:
# Wav2Vec2-Large-XLSR-53-Turkish

Note: This model was trained with 5 Turkish movies in addition to the Common Voice dataset.
- Although WER is high (50%) per common voice test dataset, testing with voice with background noise and on browser, derived letters are pretty close.
+ Although WER is high (50%) on the Common Voice test dataset, performance on audio from "other sources" seems pretty good.
+
+ The dataset building from csv and merging code can be found at the bottom of this README.

Please try it with your own speech on the right side to see its performance.

@@ -120,7 +122,7 @@ model.to("cuda")

#Note: Not ignoring "'" on this one
#Note: Not ignoring "'" on this one
- chars_to_ignore_regex = '[\\,\\?\\.\\!\\-\\;\\:\\"\\“\\%\\‘\\”\\�\\#\\>\\<\\_\\’\\[\\]\\{\\}]'
+ chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\”\�\#\>\<\_\’\[\]\{\}]'


#resampler = torchaudio.transforms.Resample(48_000, 16_000)
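The torchaudio resampler is left commented out here, and the next hunk's context shows it replaced by an `audio_resampler(batch, new_sample_rate = 16000)` helper whose body is not part of this diff. Below is a minimal sketch of what such a helper typically looks like for XLSR models; the `"path"` column name and the exact body are assumptions, not the author's code:

```python
import torchaudio

# Hedged sketch: the real audio_resampler body is not shown in this diff.
# Assumes each batch row has a "path" column pointing to an audio file.
def audio_resampler(batch, new_sample_rate=16000):
    speech_array, sampling_rate = torchaudio.load(batch["path"])
    # Resample from the file's native rate to 16 kHz, as XLSR expects
    resampler = torchaudio.transforms.Resample(sampling_rate, new_sample_rate)
    batch["speech"] = resampler(speech_array).squeeze().numpy()
    batch["sampling_rate"] = new_sample_rate
    return batch
```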
@@ -151,11 +153,11 @@ def audio_resampler(batch, new_sample_rate = 16000):
def remove_special_characters(batch):

    ##this one comes from subtitles if additional timestamps not processed -> 00:01:01 00:01:01,33
-   batch["sentence"] = re.sub('\\b\\d{2}:\\d{2}:\\d{2}(,+\\d{2})?\\b', ' ', batch["sentence"])
+   batch["sentence"] = re.sub('\b\d{2}:\d{2}:\d{2}(,+\d{2})?\b', ' ', batch["sentence"])
    ##remove all-caps tags in brackets, [AÇIKLAMA] etc; do it before the generic cleanup
-   batch["sentence"] = re.sub('\\[(\\b[A-Z]+\\])', '', batch["sentence"])
+   batch["sentence"] = re.sub('\[(\b[A-Z]+\])', '', batch["sentence"])
    ##replace three dots inside the string with a single dot
-   batch["sentence"] = re.sub("([a-zA-Z]+)\\.\\.\\.", r"\\1.", batch["sentence"])
+   batch["sentence"] = re.sub("([a-zA-Z]+)\.\.\.", r"\1.", batch["sentence"])
    #standard ignore list
    batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower() + " "

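To see what the fixed cleanup steps do end to end, here is a small standalone check; the sample sentence is made up, and the regexes are written as raw strings, which is equivalent to the single-backslash form the commit introduces. Note that `[A-Z]` is ASCII-only, so a bracketed tag containing a letter such as `Ç` (as in `[AÇIKLAMA]`) would not actually match this pattern:

```python
import re

chars_to_ignore_regex = r'[\,\?\.\!\-\;\:\"\“\%\‘\”\�\#\>\<\_\’\[\]\{\}]'

sentence = '00:01:01 00:01:01,33 [MUSIC] Gidiyorum... Tamam!'
# strip subtitle timestamps
sentence = re.sub(r'\b\d{2}:\d{2}:\d{2}(,+\d{2})?\b', ' ', sentence)
# strip bracketed all-caps tags (ASCII caps only)
sentence = re.sub(r'\[(\b[A-Z]+\])', '', sentence)
# collapse a trailing "..." after a word to a single dot
sentence = re.sub(r"([a-zA-Z]+)\.\.\.", r"\1.", sentence)
# apply the generic ignore list and lowercase
sentence = re.sub(chars_to_ignore_regex, '', sentence).lower() + " "
print(sentence)  # -> 'gidiyorum tamam' plus leftover whitespace
```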
@@ -202,10 +204,10 @@ print("WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"],
## Training


- The Common Voice `train` and `validation` datasets were used for training. `Additional 5 Turkish movies with subtitles` also used for training.
+ The Common Voice `train` and `validation` datasets were used for training. An additional 5 Turkish movies with subtitles were also used for training.
A similar model was used as the base for fine-tuning; the additional audio resampler is in the code above.

- Putting Dataset building from csv and merging/saving as Dataset code below for reference
+ The dataset building from csv and merging/saving code is included below for reference.


```python
@@ -218,7 +220,7 @@ from datasets import Dataset
import csv

#Walk all subdirectories of base_set_path and find csv files
- base_set_path = r'C:\\dataset_extracts'
+ base_set_path = r'C:\dataset_extracts'
csv_files = []
for path, subdirs, files in os.walk(base_set_path):
    for name in files:
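The walk loop is truncated by the diff context right after `for name in files:`. A plausible completion, assuming the intent stated in the comment is simply to collect every `.csv` path (the `endswith` filter is an assumption):

```python
import os

# Hedged completion of the truncated loop above; the filter condition is assumed.
csv_files = []
for path, subdirs, files in os.walk(base_set_path):
    for name in files:
        if name.endswith('.csv'):
            csv_files.append(os.path.join(path, name))
```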
@@ -228,7 +230,7 @@ for path, subdirs, files in os.walk(base_set_path):

def get_dataset_from_csv_file(csvfilename, names=['sentence', 'path']):
    path = Path(csvfilename)
-   csv_delimiter="\\t" ##tab separated, change if something else
+   csv_delimiter="\t" ##tab separated, change if something else

    ##Pandas has a bug reading non-ascii file names, so open with an explicit encoding
    df = pd.read_csv(open(path, 'r', encoding='utf-8'), delimiter=csv_delimiter, header=None, names=names, encoding='utf8')
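`get_dataset_from_csv_file` is also cut off after the `pd.read_csv` call. Given the `from datasets import Dataset` import shown in an earlier hunk header, the function presumably finishes by converting the DataFrame; a minimal sketch of that step, not the author's exact code:

```python
from datasets import Dataset
import pandas as pd

# Hedged sketch: convert the pandas DataFrame read above into a datasets.Dataset.
# The original function body is truncated in this diff, so this is an assumption.
def dataframe_to_dataset(df: pd.DataFrame) -> Dataset:
    return Dataset.from_pandas(df)
```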
@@ -246,7 +248,7 @@ from datasets import concatenate_datasets, load_dataset
from datasets import load_from_disk

# Merge datasets together (from csv files)
- dataset_file_path = ".\\dataset_file"
+ dataset_file_path = ".\dataset_file"
custom_datasets_concat = concatenate_datasets([dset for dset in custom_datasets])

#save this one to disk
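The hunk ends at the `#save this one to disk` comment; the actual save call is outside the diff context. With the `load_from_disk` import shown above, the save/reload pair presumably looks like this sketch:

```python
# Hedged sketch of the save/reload step the comment refers to
# (dataset_file_path as defined above; exact original code not shown in the diff).
custom_datasets_concat.save_to_disk(dataset_file_path)
custom_datasets_concat = load_from_disk(dataset_file_path)
```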
 