""" |
|
This script is used to generate JSON manifests for mel-generator model training. The usage is below. |
|
|
|
$ python scripts/dataset_processing/tts/thorsten_neutral/get_data.py \ |
|
--data-root ~/experiments/thorsten_neutral \ |
|
--manifests-root ~/experiments/thorsten_neutral \ |
|
--data-version "22_10" \ |
|
--min-duration 0.1 \ |
|
--normalize-text |
|
""" |
|
|
|
import argparse
import json
import random
import shutil
import subprocess
import urllib.request
from pathlib import Path

from joblib import Parallel, delayed
from nemo_text_processing.text_normalization.normalize import Normalizer
from tqdm import tqdm

from nemo.utils import logging
|
|
|
|
|
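# Download URL, extracted directory name, and metadata files for each published release of
# the Thorsten neutral voice dataset.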
THORSTEN_NEUTRAL = {
    "21_02": {
        "url": "https://zenodo.org/record/5525342/files/thorsten-neutral_v03.tgz?download=1",
        "dir_name": "thorsten-de_v03",
        "metadata": ["metadata.csv"],
    },
    "22_10": {
        "url": "https://zenodo.org/record/7265581/files/ThorstenVoice-Dataset_2022.10.zip?download=1",
        "dir_name": "ThorstenVoice-Dataset_2022.10",
        "metadata": ["metadata_train.csv", "metadata_dev.csv", "metadata_test.csv"],
    },
}
|
|
|
|
|
def get_args():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Download Thorsten Müller's neutral voice dataset and create manifests with a predefined split. "
        "Thorsten Müller published two neutral voice datasets, 21.02 and 22.10, where 22.10 provides better "
        "audio quality. Please choose one of the two for your TTS models. Details about the datasets are "
        "available at https://github.com/thorstenMueller/Thorsten-Voice.",
    )
    parser.add_argument("--data-root", required=True, type=Path, help="Where the resulting dataset will reside.")
    parser.add_argument("--manifests-root", required=True, type=Path, help="Where the manifest files will reside.")
    parser.add_argument(
        "--data-version", default="22_10", choices=["21_02", "22_10"], type=str, help="Which dataset release to use."
    )
    parser.add_argument("--min-duration", default=0.1, type=float)
    parser.add_argument("--max-duration", default=float('inf'), type=float)
    parser.add_argument("--val-size", default=100, type=int)
    parser.add_argument("--test-size", default=100, type=int)
    parser.add_argument(
        "--num-workers",
        default=-1,
        type=int,
        help="Specify the maximum number of concurrent Python worker processes. "
        "If -1, all CPUs are used; if 1, no parallel computing is used.",
    )
    parser.add_argument(
        "--normalize-text",
        default=False,
        action='store_true',
        help="If set, normalize the original text and add a 'normalized_text' entry to each manifest entry.",
    )
    parser.add_argument(
        "--seed-for-ds-split",
        default=100,
        type=int,
        help="Seed for the deterministic train/val/test split.",
    )
    args = parser.parse_args()
    return args
|
|
|
|
|
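# Download `source_url` to `destination_path` unless it already exists. Downloading to a
# temporary file first ensures an interrupted run cannot leave behind a partial archive
# that would later be mistaken for a complete one.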
def __maybe_download_file(source_url, destination_path):
    if not destination_path.exists():
        logging.info(f"Downloading data: {source_url} --> {destination_path}")
        tmp_file_path = destination_path.with_suffix(".tmp")
        urllib.request.urlretrieve(source_url, filename=tmp_file_path)
        tmp_file_path.rename(destination_path)
    else:
        logging.info(f"Skipped downloading data because it exists: {destination_path}")
|
|
|
|
|
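# shutil.unpack_archive dispatches on the file extension, so this handles both the
# .tgz archive of release 21_02 and the .zip archive of release 22_10.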
def __extract_file(filepath, data_dir):
    logging.info(f"Unzipping data: {filepath} --> {data_dir}")
    shutil.unpack_archive(filepath, data_dir)
    logging.info(f"Unzipping data is complete: {filepath}.")
|
|
|
|
|
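# Write one JSON object per line (the JSON-lines manifest format NeMo expects).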
def __save_json(json_file, dict_list):
    logging.info(f"Saving JSON split to {json_file}.")
    with open(json_file, "w") as f:
        for d in dict_list:
            f.write(json.dumps(d) + "\n")
|
|
|
|
|
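# Add a "normalized_text" field to every entry of `json_file` using NeMo's German text
# normalizer, writing the result to a *_text_normed.json sibling file. Normalization spells
# out non-verbal tokens; e.g., a number like "22" would typically become "zweiundzwanzig"
# (illustrative example).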
def __text_normalization(json_file, num_workers=-1):
    text_normalizer_call_kwargs = {
        "punct_pre_process": True,
        "punct_post_process": True,
    }
    text_normalizer = Normalizer(
        lang="de", input_case="cased", overwrite_cache=True, cache_dir=str(json_file.parent / "cache_dir"),
    )

    def normalizer_call(x):
        return text_normalizer.normalize(x, **text_normalizer_call_kwargs)

    def add_normalized_text(line_dict):
        normalized_text = normalizer_call(line_dict["text"])
        line_dict.update({"normalized_text": normalized_text})
        return line_dict

    logging.info(f"Normalizing text for {json_file}.")
    with open(json_file, 'r', encoding='utf-8') as fjson:
        lines = fjson.readlines()

    dict_list = Parallel(n_jobs=num_workers)(
        delayed(add_normalized_text)(json.loads(line)) for line in tqdm(lines)
    )

    json_file_text_normed = json_file.parent / f"{json_file.stem}_text_normed{json_file.suffix}"
    with open(json_file_text_normed, 'w', encoding="utf-8") as fjson_norm:
        for dct in dict_list:
            fjson_norm.write(json.dumps(dct) + "\n")
    logging.info(f"Normalizing text is complete: {json_file} --> {json_file_text_normed}")
|
|
|
|
|
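# Parse the pipe-separated metadata files, skip wavs that are missing or whose duration falls
# outside [min_duration, max_duration], then shuffle deterministically and cut the val/test
# splits off the end. Durations are measured with `soxi -D`, so SoX must be installed and on PATH.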
def __process_data(
    unzipped_dataset_path, metadata, min_duration, max_duration, val_size, test_size, seed_for_ds_split
):
    logging.info("Preparing JSON train/val/test splits.")

    entries = list()
    not_found_wavs = list()
    wrong_duration_wavs = list()

    for metadata_fname in metadata:
        meta_file = unzipped_dataset_path / metadata_fname
        with open(meta_file, 'r') as fmeta:
            for line in tqdm(fmeta):
                # Metadata lines are pipe-separated: <wav file stem>|<transcript>[|...].
                items = line.strip().split('|')
                wav_file_stem, text = items[0], items[1]
                wav_file = unzipped_dataset_path / "wavs" / f"{wav_file_stem}.wav"

                if not wav_file.exists():
                    not_found_wavs.append(wav_file)
                    logging.warning(f"Skipping {wav_file}: it is not found.")
                    continue

                # Query the duration in seconds with soxi; passing an argument list avoids
                # shell quoting issues with unusual file paths.
                duration = float(subprocess.check_output(["soxi", "-D", str(wav_file)]))
                if min_duration <= duration <= max_duration:
                    entry = {
                        'audio_filepath': str(wav_file),
                        'duration': duration,
                        'text': text,
                    }
                    entries.append(entry)
                elif duration < min_duration:
                    wrong_duration_wavs.append(wav_file)
                    logging.warning(f"Skipping {wav_file}: it is too short, less than {min_duration} seconds.")
                else:
                    wrong_duration_wavs.append(wav_file)
                    logging.warning(f"Skipping {wav_file}: it is too long, greater than {max_duration} seconds.")

    random.Random(seed_for_ds_split).shuffle(entries)
    train_size = len(entries) - val_size - test_size
    if train_size <= 0:
        raise ValueError("Not enough data for the train split.")

    logging.info("Preparing JSON train/val/test splits is complete.")
    train, val, test = (
        entries[:train_size],
        entries[train_size : train_size + val_size],
        entries[train_size + val_size :],
    )

    return train, val, test, not_found_wavs, wrong_duration_wavs
|
|
|
|
|
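# End-to-end pipeline: download and extract the chosen release, build the train/val/test
# splits, write the manifests plus lists of any skipped wavs, and optionally normalize text.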
def main():
    args = get_args()
    data_root = args.data_root
    manifests_root = args.manifests_root
    data_version = args.data_version

    dataset_root = data_root / f"ThorstenVoice-Dataset-{data_version}"
    dataset_root.mkdir(parents=True, exist_ok=True)
    # Ensure the manifests directory exists before any files are written into it.
    manifests_root.mkdir(parents=True, exist_ok=True)

    dataset_url = THORSTEN_NEUTRAL[data_version]["url"]
    zipped_dataset_path = dataset_root / Path(dataset_url).name.split("?")[0]
    __maybe_download_file(dataset_url, zipped_dataset_path)
    __extract_file(zipped_dataset_path, dataset_root)

    unzipped_dataset_path = dataset_root / THORSTEN_NEUTRAL[data_version]["dir_name"]
    entries_train, entries_val, entries_test, not_found_wavs, wrong_duration_wavs = __process_data(
        unzipped_dataset_path=unzipped_dataset_path,
        metadata=THORSTEN_NEUTRAL[data_version]["metadata"],
        min_duration=args.min_duration,
        max_duration=args.max_duration,
        val_size=args.val_size,
        test_size=args.test_size,
        seed_for_ds_split=args.seed_for_ds_split,
    )

    train_json = manifests_root / "train_manifest.json"
    val_json = manifests_root / "val_manifest.json"
    test_json = manifests_root / "test_manifest.json"
    __save_json(train_json, entries_train)
    __save_json(val_json, entries_val)
    __save_json(test_json, entries_test)

    if len(not_found_wavs) > 0:
        skipped_not_found_file = manifests_root / "skipped_not_found_wavs.list"
        with open(skipped_not_found_file, "w") as f_notfound:
            for line in not_found_wavs:
                f_notfound.write(f"{line}\n")

    if len(wrong_duration_wavs) > 0:
        skipped_wrong_duration_file = manifests_root / "skipped_wrong_duration_wavs.list"
        with open(skipped_wrong_duration_file, "w") as f_wrong_dur:
            for line in wrong_duration_wavs:
                f_wrong_dur.write(f"{line}\n")

    if args.normalize_text:
        __text_normalization(train_json, args.num_workers)
        __text_normalization(val_json, args.num_workers)
        __text_normalization(test_json, args.num_workers)
|
|
|
|
|
if __name__ == "__main__":
    main()
|
|