speecht5-vc/manifest/utils/cmu_arctic_manifest.py
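"""Build voice-conversion TSV manifests from the CMU ARCTIC corpus.

For every (source speaker, target speaker) pair with source != target, each
manifest line records the source wav path, its frame count, the parallel
target wav path, its frame count, and the path to the target speaker's
per-utterance speaker-embedding .npy file.
"""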
import argparse
import os
from torchaudio.datasets import CMUARCTIC
from tqdm import tqdm
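
# Utterance indices for each split; the script assumes 1132 parallel CMU ARCTIC
# utterances per speaker (932 train / 100 valid / 100 test).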
SPLITS = {
    "train": list(range(0, 932)),
    "valid": list(range(932, 1032)),
    "test": list(range(1032, 1132)),
}


def get_parser():
parser = argparse.ArgumentParser()
parser.add_argument(
"root", metavar="DIR", help="root directory containing wav files to index"
)
parser.add_argument(
"--dest", default=".", type=str, metavar="DIR", help="output directory"
)
parser.add_argument(
"--source", default="bdl,clb,slt,rms", type=str, help="Source voice from slt, clb, bdl, rms."
)
parser.add_argument(
"--target", default="bdl,clb,slt,rms", type=str, help="Target voice from slt, clb, bdl, rms."
)
parser.add_argument(
"--splits", default="932,100,100", type=str, help="Split of train,valid,test seperate by comma."
)
parser.add_argument(
"--wav-root", default=None, type=str, metavar="DIR", help="saved waveform root directory for tsv"
)
parser.add_argument(
"--spkemb-npy-dir", required=True, type=str, help="speaker embedding directory"
)
    return parser


def main(args):
dest_dir = args.dest
wav_root = args.wav_root
if not os.path.exists(dest_dir):
os.makedirs(dest_dir)
source = args.source.split(",")
target = args.target.split(",")
spks = sorted(list(set(source + target)))
datasets = {}
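    # slt is loaded first; its utterance list serves as the reference ordering for all speakers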
datasets["slt"] = CMUARCTIC(args.root, url="slt", folder_in_archive="ARCTIC", download=False)
for spk in spks:
if spk != "slt":
datasets[spk] = CMUARCTIC(args.root, url=spk, folder_in_archive="ARCTIC", download=False)
            datasets[spk]._walker = list(datasets["slt"]._walker)  # some speakers' text lists are missing sentences; reuse slt's utterance list so indices stay parallel
if "slt" not in spks:
del datasets["slt"]
num_splits = [int(n_split) for n_split in args.splits.split(',')]
    assert sum(num_splits) == 1132, f"split sizes must sum to 1132 utterances, got {sum(num_splits)}"
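    # one TSV per split; the first line of each TSV is the waveform root directory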
tsv = {}
for split in SPLITS.keys():
tsv[split] = open(os.path.join(dest_dir, f"{split}.tsv"), "w")
print(wav_root, file=tsv[split])
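    # emit one manifest line per (source, target) speaker pair for every utterance index in the split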
for split, indices in SPLITS.items():
for i in tqdm(indices, desc=f"[{'-'.join(spks)}]tsv/wav/spk"):
for src_spk in source:
for tgt_spk in target:
                    if src_spk == tgt_spk:
                        continue
# wav, sample_rate, utterance, utt_no
src_i = datasets[src_spk][i]
tgt_i = datasets[tgt_spk][i]
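                    # sanity check: the parallel items must share the same sample rate and utterance id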
assert src_i[1] == tgt_i[1], f"{src_i[1]}-{tgt_i[1]}"
assert src_i[3] == tgt_i[3], f"{src_i[3]}-{tgt_i[3]}"
src_wav = os.path.join(os.path.basename(datasets[src_spk]._path), datasets[src_spk]._folder_audio, f"arctic_{src_i[3]}.wav")
src_nframes = src_i[0].shape[-1]
tgt_wav = os.path.join(os.path.basename(datasets[tgt_spk]._path), datasets[tgt_spk]._folder_audio, f"arctic_{tgt_i[3]}.wav")
tgt_nframes = tgt_i[0].shape[-1]
tgt_spkemb = os.path.join(args.spkemb_npy_dir, f"{os.path.basename(datasets[tgt_spk]._path)}-{datasets[tgt_spk]._folder_audio}-arctic_{tgt_i[3]}.npy")
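                    # columns: source wav, source frames, target wav, target frames, target speaker embedding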
print(f"{src_wav}\t{src_nframes}\t{tgt_wav}\t{tgt_nframes}\t{tgt_spkemb}", file=tsv[split])
for split in tsv.keys():
        tsv[split].close()


if __name__ == "__main__":
parser = get_parser()
args = parser.parse_args()
main(args)
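
# Example invocation (paths below are placeholders, not taken from the repo):
#   python cmu_arctic_manifest.py /data/CMU_ARCTIC \
#       --dest manifest/cmu_arctic \
#       --wav-root /data/CMU_ARCTIC/ARCTIC \
#       --spkemb-npy-dir /data/cmu_arctic_spkemb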