"""Write paired source/target speaker TSV manifests from CMU ARCTIC."""

import argparse
import os
from contextlib import ExitStack

from torchaudio.datasets import CMUARCTIC
from tqdm import tqdm

# Default utterance-index partition; equals what --splits=932,100,100 yields.
SPLITS = {
    "train": list(range(0, 932)),
    "valid": list(range(932, 1032)),
    "test": list(range(1032, 1132)),
}

# Total number of utterance indices the split sizes must add up to.
_NUM_UTTERANCES = 1132


def get_parser():
    """Return the command-line argument parser for this script."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "root", metavar="DIR", help="root directory containing wav files to index"
    )
    parser.add_argument(
        "--dest", default=".", type=str, metavar="DIR", help="output directory"
    )
    parser.add_argument(
        "--source",
        default="bdl,clb,slt,rms",
        type=str,
        help="Source voice from slt, clb, bdl, rms.",
    )
    parser.add_argument(
        "--target",
        default="bdl,clb,slt,rms",
        type=str,
        help="Target voice from slt, clb, bdl, rms.",
    )
    parser.add_argument(
        "--splits",
        default="932,100,100",
        type=str,
        help="Split of train,valid,test seperate by comma.",
    )
    parser.add_argument(
        "--wav-root",
        default=None,
        type=str,
        metavar="DIR",
        help="saved waveform root directory for tsv",
    )
    parser.add_argument(
        "--spkemb-npy-dir",
        required=True,
        type=str,
        help="speaker embedding directory",
    )
    return parser


def _build_splits(spec):
    """Convert a "train,valid,test" size string into per-split index lists.

    Sizes are consumed in the key order of ``SPLITS`` and map to consecutive
    index ranges starting at 0.

    Raises:
        ValueError: if ``spec`` is not one non-negative integer per split, or
            the sizes do not add up to the expected utterance count.
    """
    try:
        sizes = [int(size) for size in spec.split(",")]
    except ValueError as err:
        raise ValueError(
            f"--splits must be comma-separated integers, got {spec!r}"
        ) from err
    if len(sizes) != len(SPLITS):
        raise ValueError(
            f"--splits needs {len(SPLITS)} values ({','.join(SPLITS)}), got {spec!r}"
        )
    if any(size < 0 for size in sizes):
        raise ValueError(f"--splits sizes must be non-negative, got {spec!r}")
    if sum(sizes) != _NUM_UTTERANCES:
        raise ValueError(f"Missing utterances: {sum(sizes)} != {_NUM_UTTERANCES}")

    splits = {}
    start = 0
    for name, size in zip(SPLITS, sizes):
        splits[name] = list(range(start, start + size))
        start += size
    return splits


def _load_utterance(dataset, index):
    """Load one dataset item and derive the fields a manifest row needs.

    The item is read positionally as (wav, sample_rate, utterance, utt_no).
    """
    item = dataset[index]
    utt_no = item[3]
    speaker_dir = os.path.basename(dataset._path)
    audio_dir = dataset._folder_audio
    return {
        "sample_rate": item[1],
        "utt_no": utt_no,
        "wav": os.path.join(speaker_dir, audio_dir, f"arctic_{utt_no}.wav"),
        # Size of the waveform's last dimension (presumably the sample count).
        "nframes": item[0].shape[-1],
        "spkemb_name": f"{speaker_dir}-{audio_dir}-arctic_{utt_no}.npy",
    }


def main(args):
    """Write one ``<split>.tsv`` manifest per split into ``args.dest``.

    Each file starts with a header line holding ``args.wav_root`` and is
    followed by one tab-separated row per (source, target) speaker pair and
    utterance index: source wav, source frame count, target wav, target frame
    count, and the target speaker-embedding path.

    Args:
        args: parsed namespace from ``get_parser()``.

    Raises:
        ValueError: if ``--splits`` is malformed, or a source/target pair
            disagrees on sample rate or utterance number for the same index.
    """
    dest_dir = args.dest
    wav_root = args.wav_root
    os.makedirs(dest_dir, exist_ok=True)

    source = args.source.split(",")
    target = args.target.split(",")
    spks = sorted(set(source + target))

    # Validate before touching the datasets so bad input fails fast.
    splits = _build_splits(args.splits)

    # "slt" is always loaded because its file list is the reference walker
    # shared by every other speaker (some text sentences are missing).
    reference = CMUARCTIC(
        args.root, url="slt", folder_in_archive="ARCTIC", download=False
    )
    datasets = {}
    for spk in spks:
        if spk == "slt":
            datasets[spk] = reference
            continue
        datasets[spk] = CMUARCTIC(
            args.root, url=spk, folder_in_archive="ARCTIC", download=False
        )
        datasets[spk]._walker = list(reference._walker)

    progress_desc = f"[{'-'.join(spks)}]tsv/wav/spk"

    # ExitStack closes every manifest even if a row fails midway.
    with ExitStack() as stack:
        tsv = {}
        for split in splits:
            tsv[split] = stack.enter_context(
                open(os.path.join(dest_dir, f"{split}.tsv"), "w")
            )
            # NOTE: when --wav-root is omitted this writes the literal "None".
            print(wav_root, file=tsv[split])

        for split, indices in splits.items():
            for i in tqdm(indices, desc=progress_desc):
                # Load each speaker's utterance at most once per index instead
                # of once per (source, target) pair.
                loaded = {}
                for src_spk in source:
                    for tgt_spk in target:
                        if src_spk == tgt_spk:
                            continue
                        for spk in (src_spk, tgt_spk):
                            if spk not in loaded:
                                loaded[spk] = _load_utterance(datasets[spk], i)
                        src = loaded[src_spk]
                        tgt = loaded[tgt_spk]
                        if src["sample_rate"] != tgt["sample_rate"]:
                            raise ValueError(
                                f"{src['sample_rate']}-{tgt['sample_rate']}"
                            )
                        if src["utt_no"] != tgt["utt_no"]:
                            raise ValueError(f"{src['utt_no']}-{tgt['utt_no']}")
                        tgt_spkemb = os.path.join(
                            args.spkemb_npy_dir, tgt["spkemb_name"]
                        )
                        print(
                            f"{src['wav']}\t{src['nframes']}\t"
                            f"{tgt['wav']}\t{tgt['nframes']}\t{tgt_spkemb}",
                            file=tsv[split],
                        )


if __name__ == "__main__":
    parser = get_parser()
    args = parser.parse_args()
    main(args)