import argparse
import os
import pathlib
import time
from concurrent.futures import ProcessPoolExecutor
from typing import List, NoReturn, Tuple

import h5py
import numpy as np

from bytesep.utils import float32_to_int16, load_audio


def pack_audios_to_hdf5s(args) -> None:
    r"""Pack (resampled) audio files into hdf5 files to speed up loading.

    One hdf5 file is written per audio file found in the clean speech
    directory. The files are written in parallel by a process pool.

    Args:
        args: parsed arguments with the following attributes:
            dataset_dir: str, directory containing the
                ``clean_{split}set_wav`` and ``noisy_{split}set_wav`` folders
            split: str, only 'train' is accepted
            hdf5s_dir: str, directory to write out hdf5 files
            sample_rate: int
            channels: int, 1 selects mono loading, any other value does not

    Returns:
        None

    Raises:
        AssertionError: if ``split`` is not 'train'.
        RuntimeError: if one or more audio files could not be packed. All
            files are attempted before this is raised.
    """

    # arguments & parameters
    dataset_dir = args.dataset_dir
    split = args.split
    hdf5s_dir = args.hdf5s_dir
    sample_rate = args.sample_rate
    channels = args.channels

    mono = channels == 1

    # Only pack data for training data.
    assert split == "train", "Only the 'train' split can be packed."

    speech_dir = os.path.join(dataset_dir, "clean_{}set_wav".format(split))
    mixture_dir = os.path.join(dataset_dir, "noisy_{}set_wav".format(split))

    os.makedirs(hdf5s_dir, exist_ok=True)

    # Read names. The mixture file is expected to share the speech file name.
    audio_names = sorted(os.listdir(speech_dir))

    params = []

    for audio_index, audio_name in enumerate(audio_names):

        speech_path = os.path.join(speech_dir, audio_name)
        mixture_path = os.path.join(mixture_dir, audio_name)

        hdf5_path = os.path.join(
            hdf5s_dir, "{}.h5".format(pathlib.Path(audio_name).stem)
        )

        param = (
            audio_index,
            audio_name,
            speech_path,
            mixture_path,
            mono,
            sample_rate,
            hdf5_path,
        )
        params.append(param)

    # For debugging, run a single job in the current process instead of the
    # pool, e.g. write_single_audio_to_hdf5(params[0]).

    pack_hdf5s_time = time.time()

    failures = []

    # max_workers=None lets the executor pick the worker count for the machine.
    with ProcessPoolExecutor(max_workers=None) as pool:
        jobs = [
            (param[1], pool.submit(write_single_audio_to_hdf5, param))
            for param in params
        ]

        # Inspect every future: an exception raised inside a worker is only
        # visible through the future, so ignoring the results would hide it.
        for audio_name, job in jobs:
            error = job.exception()
            if error is not None:
                failures.append((audio_name, error))

    print("Pack hdf5 time: {:.3f} s".format(time.time() - pack_hdf5s_time))

    if failures:
        for audio_name, error in failures:
            print("Failed to pack {}: {!r}".format(audio_name, error))

        raise RuntimeError(
            "{} of {} audio files could not be packed.".format(
                len(failures), len(params)
            )
        )


def write_single_audio_to_hdf5(
    param: Tuple[int, str, str, str, bool, int, str]
) -> None:
    r"""Write single audio into hdf5 file.

    The noise signal is computed as ``mixture - speech``. The hdf5 file holds
    the attributes ``audio_name`` and ``sample_rate`` and the int16 datasets
    ``speech`` and ``noise``.

    Args:
        param: tuple of (audio_index, audio_name, speech_path, mixture_path,
            mono, sample_rate, hdf5_path)

    Returns:
        None
    """

    (
        audio_index,
        audio_name,
        speech_path,
        mixture_path,
        mono,
        sample_rate,
        hdf5_path,
    ) = param

    # Load and compute everything before opening the output file, so that a
    # failure here does not leave an empty or truncated hdf5 file behind.
    speech = load_audio(audio_path=speech_path, mono=mono, sample_rate=sample_rate)
    # speech: (channels_num, audio_samples)

    mixture = load_audio(audio_path=mixture_path, mono=mono, sample_rate=sample_rate)
    # mixture: (channels_num, audio_samples)

    noise = mixture - speech
    # noise: (channels_num, audio_samples)

    with h5py.File(hdf5_path, "w") as hf:

        hf.attrs.create("audio_name", data=audio_name, dtype="S100")
        hf.attrs.create("sample_rate", data=sample_rate, dtype=np.int32)

        hf.create_dataset(name='speech', data=float32_to_int16(speech), dtype=np.int16)
        hf.create_dataset(name='noise', data=float32_to_int16(noise), dtype=np.int16)

    print('{} Write hdf5 to {}'.format(audio_index, hdf5_path))


if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--dataset_dir",
        type=str,
        required=True,
        help="Directory of the Voicebank-Demand dataset.",
    )
    parser.add_argument("--split", type=str, required=True, choices=["train", "test"])
    parser.add_argument(
        "--hdf5s_dir",
        type=str,
        required=True,
        help="Directory to write out hdf5 files.",
    )
    parser.add_argument("--sample_rate", type=int, required=True, help="Sample rate.")
    parser.add_argument(
        "--channels", type=int, required=True, help="Use 1 for mono, 2 for stereo."
    )

    # Parse arguments.
    args = parser.parse_args()

    # Pack audios into hdf5 files.
    pack_audios_to_hdf5s(args)