# Source: promptttspp/data_prep/prepare_mfa.py
# Last commit: "Initial commit" (82334b0) by MasayaKawamura
# Copyright 2024 LY Corporation
# LY Corporation licenses this file to you under the Apache License,
# version 2.0 (the "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at:
# https://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import argparse
import shutil
import sys
from pathlib import Path
from common import load_libritts_spk_metadata
from joblib import Parallel, delayed
from promptttspp.utils.joblib import tqdm_joblib
from tqdm.auto import tqdm
def get_parser():
    """Build the command-line parser for this script.

    Returns:
        argparse.ArgumentParser: parser accepting two positional string
        arguments (``in_dir``, ``out_dir``), an integer ``--n_jobs`` option
        (default 8) and a boolean ``--debug`` flag.
    """
    cli = argparse.ArgumentParser(
        description="Restructure the LibriTTS-R dataset for convenience"
    )

    # Both positional arguments are plain strings; only name and help differ.
    positionals = (
        ("in_dir", "LibriTTS original data root"),
        ("out_dir", "Output directory"),
    )
    for arg_name, help_text in positionals:
        cli.add_argument(arg_name, type=str, help=help_text)

    cli.add_argument("--n_jobs", default=8, type=int, help="Number of jobs")
    cli.add_argument("--debug", help="Debug", action="store_true")
    return cli
def process_spk(spk, meta, in_dir, out_dir):
    """Copy one speaker's utterances into a flat per-speaker directory.

    Every wav file matching ``<in_dir>/<subset>/*/*/<spk>_*.wav`` is copied to
    ``<out_dir>/<spk>/<utt_id>.wav``, and its sibling transcript
    ``<utt_id>.normalized.txt`` is copied to ``<out_dir>/<spk>/<utt_id>.lab``.
    Utterances without a transcript are reported and skipped.

    Args:
        spk: Speaker ID string; used as the wav file-name prefix and as the
            name of the output sub-directory.
        meta: Speaker metadata mapping; only ``meta["subset"]`` is read here.
        in_dir: ``pathlib.Path`` to the dataset root containing the subsets.
        out_dir: ``pathlib.Path`` to the output root.

    Returns:
        None. If no wav file is found for the speaker, a message is printed
        and nothing is created.
    """
    subset = meta["subset"]
    wav_files = sorted((in_dir / subset).glob(f"*/*/{spk}_*.wav"))
    if not wav_files:
        print(f"No wav files found for {spk}", meta)
        return

    spk_out_dir = out_dir / spk
    spk_out_dir.mkdir(exist_ok=True, parents=True)

    # copy to spk_out_dir/filename
    for wav_file in tqdm(wav_files, leave=False):
        # Use the stem so only the trailing ".wav" extension is dropped;
        # str.replace(".wav", "") would also strip any ".wav" occurring
        # inside the name and point at the wrong transcript/label file.
        utt_id = wav_file.stem
        text_file = wav_file.parent / f"{utt_id}.normalized.txt"

        # Sadly, some text transcriptions are missing
        # train-clean-360/1382/130492/1382_130492_000049_000000.normalized.txt
        if not text_file.exists():
            print(f"Text file not found for {wav_file}")
            continue

        out_wav_file = spk_out_dir / wav_file.name
        out_lab_file = spk_out_dir / f"{utt_id}.lab"
        shutil.copy2(wav_file, out_wav_file)
        shutil.copy2(text_file, out_lab_file)
if __name__ == "__main__":
    # Parse the command line and resolve the input/output roots.
    cli_args = get_parser().parse_args(sys.argv[1:])
    src_root, dst_root = Path(cli_args.in_dir), Path(cli_args.out_dir)
    dst_root.mkdir(parents=True, exist_ok=True)

    # Speaker ID -> metadata mapping; each entry becomes one parallel task.
    speakers = load_libritts_spk_metadata(debug=cli_args.debug)
    tasks = (
        delayed(process_spk)(spk_id, spk_meta, src_root, dst_root)
        for spk_id, spk_meta in speakers.items()
    )

    # tqdm_joblib is given the task count up front (presumably to size a
    # progress bar around the joblib run -- confirm in promptttspp.utils.joblib).
    with tqdm_joblib(len(speakers)):
        Parallel(n_jobs=cli_args.n_jobs)(tasks)