# File size: 1,290 Bytes

# 5e7d20b

import datasets
import torchaudio
import re


def get_test_dataset(data_path: str = 'ISSAI_KSC_335RS_v1.1'):
    """Load the test set under *data_path* with transcripts and audio attached.

    Reads ``{data_path}/Meta/test.csv`` (space-delimited) and, for every row,
    uses the row's ``uttID`` value to locate
    ``{data_path}/Transcriptions/{uttID}.txt`` and
    ``{data_path}/Audios_flac/{uttID}.flac``.

    Args:
        data_path: Root directory of the corpus.

    Returns:
        The dataset returned by ``datasets.load_dataset`` with four columns
        added by successive ``map`` calls: ``sentence`` (raw transcript),
        ``text`` (normalised transcript), ``speech`` (the waveform value
        returned by ``torchaudio.load``) and ``sampling_rate``.
    """
    chars_to_ignore = ["f", "m"]
    # Compiled once here instead of being re-resolved for every row.
    ignore_pattern = re.compile(f'[{"".join(chars_to_ignore)}]')

    # Single-pass equivalent of three chained ``str.replace`` calls; none of
    # the replacement characters is itself a key, so the result is the same.
    # NOTE(review): looks like this maps Latin/IPA look-alikes onto Cyrillic
    # letters -- confirm against the corpus alphabet.
    char_map = str.maketrans({'a': 'а', 'ə': 'ә', 'ɵ': 'ө'})

    def read_sentence(idx):
        """Return the transcript file for utterance *idx* as one string."""
        # Explicit encoding so decoding does not depend on the platform
        # locale. Assumes the transcript files are UTF-8 -- TODO confirm.
        with open(f"{data_path}/Transcriptions/{idx}.txt", 'r',
                  encoding='utf-8') as f:
            # Each line keeps its own trailing newline; lines are
            # additionally separated by a single space.
            return ' '.join(f)

    def read_text(batch):
        """Attach the raw transcript to the row as ``sentence``."""
        batch["sentence"] = read_sentence(batch['uttID'])
        return batch

    def process_text(batch):
        """Derive the normalised ``text`` column from ``sentence``."""
        # Ignored characters are removed *before* lower-casing, so only their
        # lowercase forms are stripped; a trailing space is then appended.
        cleaned = ignore_pattern.sub("", batch["sentence"]).lower() + " "
        batch["text"] = cleaned.translate(char_map)
        return batch

    def load_audio(batch):
        """Attach the decoded audio and its sampling rate to the row."""
        path = f"{data_path}/Audios_flac/{batch['uttID']}.flac"
        speech_array, sr = torchaudio.load(path)
        batch["speech"] = speech_array
        batch["sampling_rate"] = sr
        return batch

    test_dataset = datasets.load_dataset(
        'csv',
        data_files=f"{data_path}/Meta/test.csv",
        delimiter=' ',
        # The data file is requested via the 'train' split name even though
        # it holds the test metadata.
        split='train',
    )

    test_dataset = test_dataset.map(read_text)
    test_dataset = test_dataset.map(process_text)
    test_dataset = test_dataset.map(load_audio, num_proc=1)

    return test_dataset