import datasets
import torchaudio
import re


def get_test_dataset(data_path='ISSAI_KSC_335RS_v1.1'):
    """Build the test split with transcripts, normalised text and audio.

    The split is described by ``{data_path}/Meta/test.csv`` (loaded with a
    single space as the column delimiter). Each row must provide an
    ``uttID`` column; it is used to locate the transcript under
    ``{data_path}/Transcriptions/{uttID}.txt`` and the recording under
    ``{data_path}/Audios_flac/{uttID}.flac``.

    Args:
        data_path: Root directory of the corpus.

    Returns:
        The dataset produced by ``datasets.load_dataset`` with four columns
        added by successive ``map`` passes:

        * ``sentence`` -- raw transcript, lines joined with a space.
        * ``text`` -- ``sentence`` with the ignored characters removed,
          lower-cased, suffixed with one space, and with three
          Latin/IPA look-alike letters replaced by their Cyrillic forms.
        * ``speech`` -- first value returned by ``torchaudio.load``.
        * ``sampling_rate`` -- second value returned by ``torchaudio.load``.
    """

    def read_sentence(idx):
        # Decode explicitly instead of relying on the platform default
        # encoding, which is not UTF-8 everywhere (e.g. Windows) and would
        # corrupt or reject non-ASCII transcripts.
        # NOTE(review): assumes the transcript files are UTF-8 -- confirm.
        with open(f"{data_path}/Transcriptions/{idx}.txt", 'r',
                  encoding="utf-8") as f:
            # Lines keep their trailing newline; they are joined by a space.
            text = ' '.join(f.readlines())
        return text

    def read_text(batch):
        batch["sentence"] = read_sentence(batch['uttID'])
        return batch

    # NOTE(review): only lowercase Latin "f" and "m" are stripped, and the
    # stripping runs before lower-casing, so uppercase "F"/"M" survive as
    # "f"/"m" in the output -- confirm this is intended.
    chars_to_ignore = ["f", "m"]
    chars_to_ignore_regex = f'[{"".join(chars_to_ignore)}]'

    def process_text(batch):
        batch["text"] = re.sub(chars_to_ignore_regex, "", batch["sentence"]).lower() + " "
        # Swap look-alike letters for their Cyrillic counterparts: in each
        # pair the first literal is the Latin/IPA character and the second
        # is the Cyrillic one, even though they render almost identically.
        batch['text'] = batch['text'].replace('a', 'а').replace('ə', 'ә').replace('ɵ', 'ө')
        return batch

    def load_audio(batch):
        path = f"{data_path}/Audios_flac/{batch['uttID']}.flac"
        speech_array, sr = torchaudio.load(path)
        batch["speech"] = speech_array
        batch["sampling_rate"] = sr
        return batch

    # A single data file passed to the csv loader is exposed under the
    # 'train' split name, hence split='train' for what is the test set.
    test_dataset = datasets.load_dataset(
        'csv',
        data_files=f"{data_path}/Meta/test.csv",
        delimiter=' ',
        split='train'
    )
    test_dataset = test_dataset.map(read_text)
    test_dataset = test_dataset.map(process_text)
    test_dataset = test_dataset.map(load_audio, num_proc=1)
    return test_dataset