# Source: speecht5-sid/manifest/utils/voxceleb1_manifest.py
# Uploaded by mechanicalsea; commit 90556f9 ("add log, results, and manifest"); 2.79 kB.
"""
Generate VoxCeleb1 SID manifest for SpeechT5.
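Expected format of iden_split.txt: one "<split> <relative wav path>" pair per
line, where split is 1 (train), 2 (valid) or 3 (test) and the first path
component is the speaker id. Example: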
1 id11251/s4R4hvqrhFw/00002.wav
1 id11251/gFfcgOVmiO0/00006.wav
3 id11251/7GtZpUtReJ8/00001.wav
2 id11251/5-6lI5JQtb8/00001.wav
3 id11251/7GtZpUtReJ8/00006.wav
"""
import logging
import argparse
import os
from scipy.io import wavfile
from tqdm import tqdm
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Split name accepted on the command line -> integer code stored in the first
# column of the identification split file (train=1, valid=2, test=3).
SPLITS = {
    name: code for code, name in enumerate(("train", "valid", "test"), start=1)
}
class VoxCeleb1SID:
    """Sequence-like index of the VoxCeleb1 utterances that belong to one split.

    The split file is parsed once in the constructor; audio files are only
    opened when an item is requested. Indexing past the end raises
    ``IndexError``, so instances can be iterated with a plain ``for`` loop.
    """

    def __init__(self, root, split, iden_path):
        """Collect the utterances of ``split`` listed in ``iden_path``.

        Args:
            root: Directory that the relative wav paths are joined onto.
            split: Integer split code to keep (the first column of the file).
            iden_path: Path to the split file, one ``<code> <wav path>`` pair
                per line.

        Raises:
            ValueError: If the first column of a line is not an integer.
            IndexError: If a non-blank line has no path column.
        """
        self.root = root
        self.speakers = []
        self.paths = []
        with open(iden_path, "r") as f:
            for line in f:
                # Split on any run of whitespace so tabs or repeated spaces
                # between the columns do not produce an empty path.
                items = line.split()
                if not items:
                    # Blank line (e.g. trailing empty line): nothing to index.
                    continue
                split_type = int(items[0])
                wav_path = items[1]
                if split == split_type:
                    # The speaker id is the first component of the path.
                    self.speakers.append(wav_path.split("/")[0])
                    self.paths.append(wav_path)

    def __len__(self):
        """Return the number of utterances kept for the split."""
        return len(self.paths)

    def __getitem__(self, index):
        """Read one utterance and describe it.

        Args:
            index: Position of the utterance in the split.

        Returns:
            Tuple ``(n_frames, sample_rate, speaker, relative_path)`` where
            ``n_frames`` is the length of the first axis of the decoded audio.

        Raises:
            IndexError: If ``index`` is out of range.
        """
        speaker = self.speakers[index]
        file_audio = os.path.join(self.root, self.paths[index])
        sample_rate, wav = wavfile.read(file_audio)
        n_frames = wav.shape[0]
        return n_frames, sample_rate, speaker, self.paths[index]
def get_parser():
    """Build the command-line parser of the manifest generator.

    Returns:
        An ``argparse.ArgumentParser`` with the positional ``root`` argument
        and the ``--output``, ``--split``, ``--wav-root`` and ``--iden-split``
        options (``--split`` and ``--iden-split`` are required).
    """
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "root", metavar="DIR", help="root directory containing wav files to index"
    )

    # Every option is a string; only the remaining keyword arguments differ.
    option_specs = (
        ("--output", dict(default=".", metavar="DIR", help="output directory of manifest")),
        ("--split", dict(required=True, choices=["train", "valid", "test"], help="dataset splits")),
        ("--wav-root", dict(default=None, metavar="DIR", help="saved waveform root directory for tsv")),
        ("--iden-split", dict(required=True, help="officially released split for identification")),
    )
    for flag, spec in option_specs:
        cli.add_argument(flag, type=str, **spec)
    return cli
def main(args):
    """Write ``<output>/<split>.tsv`` for the requested VoxCeleb1 split.

    The first line of the manifest is the waveform root directory; every
    following line is ``<relative wav path>\\t<n_frames>\\t<speaker id>``.

    Args:
        args: Parsed command line (see ``get_parser``). Reads ``root``,
            ``output``, ``split``, ``wav_root`` and ``iden_split``. When
            ``wav_root`` is not given, ``root`` is written as the header so
            the manifest never starts with the literal text "None".

    Raises:
        FileNotFoundError: If ``args.iden_split`` does not exist.
        ValueError: If an utterance is not sampled at 16 kHz.
    """
    dest_dir = args.output
    wav_root = args.wav_root if args.wav_root is not None else args.root

    # Fail before touching the output directory instead of logging and
    # crashing later when the split file is opened.
    if not os.path.exists(args.iden_split):
        logger.error("split %s does not exist", args.iden_split)
        raise FileNotFoundError(f"split {args.iden_split} does not exist")

    os.makedirs(dest_dir, exist_ok=True)

    dataset = VoxCeleb1SID(args.root, SPLITS[args.split], args.iden_split)

    # The context manager closes the manifest even if an utterance is rejected.
    with open(os.path.join(dest_dir, f"{args.split}.tsv"), "w") as tsv:
        print(wav_root, file=tsv)
        for n_frames, sr, spk_id, wav_path in tqdm(dataset, desc="tsv/txt/wav"):
            # Explicit raise: an assert would be stripped under ``python -O``.
            if sr != 16000:
                raise ValueError(f"sampling rate {sr} != 16000")
            # No existence check needed here: the dataset has just read the
            # file from disk to produce this item.
            print(f"{wav_path}\t{n_frames}\t{spk_id}", file=tsv)
if __name__ == "__main__":
    # Script entry point: parse the command line and generate the manifest.
    main(get_parser().parse_args())