|
import datasets |
|
import torchaudio |
|
import re |
|
|
|
|
|
def get_test_dataset(data_path='ISSAI_KSC_335RS_v1.1'):
    """Load the test split described by ``{data_path}/Meta/test.csv``.

    The CSV is read with a single space as delimiter and every row must
    provide an ``uttID`` column. Three passes then add columns to each row:

    * ``sentence`` -- contents of ``{data_path}/Transcriptions/{uttID}.txt``,
      with the file's lines joined by a space.
    * ``text`` -- ``sentence`` with the ignored characters removed,
      lower-cased, a trailing space appended, and a small set of
      characters substituted (see ``char_substitutions`` below).
    * ``speech`` / ``sampling_rate`` -- the two values returned by
      ``torchaudio.load`` for ``{data_path}/Audios_flac/{uttID}.flac``.

    Args:
        data_path: Root directory of the corpus.

    Returns:
        The dataset produced by ``datasets.load_dataset`` after the three
        ``map`` passes.
    """
    chars_to_ignore = ["f", "m"]
    # Compiled once here so the per-row callback does not rebuild or look
    # up the pattern for every utterance.
    chars_to_ignore_regex = re.compile(f'[{"".join(chars_to_ignore)}]')

    # One translate() pass instead of three chained replace() calls. The
    # result is identical because no replacement output is also an input.
    # NOTE(review): these look like Latin/IPA look-alikes being mapped to
    # their Cyrillic counterparts -- confirm against the corpus.
    char_substitutions = str.maketrans({'a': 'а', 'ə': 'ә', 'ɵ': 'ө'})

    def read_sentence(idx):
        """Return the transcript for utterance ``idx`` as one string."""
        # Explicit UTF-8: the transcripts contain non-ASCII text, so relying
        # on the platform's default encoding is not portable.
        with open(f"{data_path}/Transcriptions/{idx}.txt", 'r',
                  encoding="utf-8") as f:
            # Each line keeps its own line terminator; lines are joined
            # with a single space, exactly as before.
            return ' '.join(f.readlines())

    def read_text(batch):
        """Attach the raw transcript to the row under ``sentence``."""
        batch["sentence"] = read_sentence(batch['uttID'])
        return batch

    def process_text(batch):
        """Derive the normalised ``text`` column from ``sentence``."""
        # Order matters: characters are stripped BEFORE lower-casing, so
        # only the lower-case forms listed in chars_to_ignore are removed.
        stripped = chars_to_ignore_regex.sub("", batch["sentence"])
        batch["text"] = (stripped.lower() + " ").translate(char_substitutions)
        return batch

    def load_audio(batch):
        """Load the row's FLAC file into ``speech`` and ``sampling_rate``."""
        path = f"{data_path}/Audios_flac/{batch['uttID']}.flac"
        speech_array, sr = torchaudio.load(path)
        batch["speech"] = speech_array
        batch["sampling_rate"] = sr
        return batch

    # A single CSV file is exposed by the loader under the 'train' split
    # name, hence split='train' even though this is the test metadata.
    test_dataset = datasets.load_dataset(
        'csv',
        data_files=f"{data_path}/Meta/test.csv",
        delimiter=' ',
        split='train',
    )

    test_dataset = test_dataset.map(read_text)
    test_dataset = test_dataset.map(process_text)
    test_dataset = test_dataset.map(load_audio, num_proc=1)

    return test_dataset
|
|