| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| import argparse |
| import glob |
| import json |
| import re |
| import tarfile |
| import urllib.request |
| from pathlib import Path |
|
|
| from tqdm import tqdm |
|
|
|
|
def get_args():
    """Parse the command-line arguments of the HiFiTTS preparation script.

    Returns:
        argparse.Namespace: with the attributes
            ``data_root`` (Path) - directory into which the dataset is downloaded and extracted;
            ``split`` (str) - ``'all'`` (default) or one of ``'train'``, ``'dev'``, ``'test'``.
    """
    parser = argparse.ArgumentParser(description='Download HiFiTTS and create manifests with predefined split')
    parser.add_argument(
        "--data-root",
        required=True,
        type=Path,
        # Plain braces: a backslash before "{" is an invalid escape sequence and would be printed verbatim.
        help='Directory into which to download and extract dataset. {data-root}/hi_fi_tts_v0 will be created.',
    )
    parser.add_argument(
        '--split',
        type=str,
        default='all',
        # Reject unknown names up front instead of silently producing an empty manifest later.
        choices=['all', 'train', 'dev', 'test'],
        help='Choose to generate manifest for all or one of (train, dev, test), note that this will still download the full dataset.',
    )

    return parser.parse_args()
|
|
|
|
# Archive that the script downloads; main() stores it as hi_fi_tts_v0.tar.gz under --data-root.
URL = "https://us.openslr.org/resources/109/hi_fi_tts_v0.tar.gz"
|
|
|
|
def __maybe_download_file(source_url, destination_path):
    """Download ``source_url`` to ``destination_path`` unless that file already exists.

    The payload is written to a sibling ``.tmp`` file first and renamed to
    ``destination_path`` only after the transfer has completed, so an interrupted
    download never leaves a truncated file under the final name.

    Args:
        source_url (str): URL to fetch.
        destination_path (Path): location the downloaded file should end up at.

    Raises:
        Whatever ``urllib.request.urlretrieve`` raises; the temporary file is removed first.
    """
    if destination_path.exists():
        return

    # The target directory may not exist yet (e.g. a fresh --data-root).
    destination_path.parent.mkdir(parents=True, exist_ok=True)

    tmp_file_path = destination_path.with_suffix('.tmp')
    try:
        urllib.request.urlretrieve(source_url, filename=str(tmp_file_path))
    except BaseException:
        # Do not leave a partial download behind; the original error is propagated unchanged.
        if tmp_file_path.exists():
            tmp_file_path.unlink()
        raise
    tmp_file_path.rename(destination_path)
|
|
|
|
def __extract_file(filepath, data_dir):
    """Unpack the tar archive ``filepath`` into ``data_dir``.

    Extraction is best-effort: any failure is reported on stdout, together with its
    cause, and the function returns normally instead of raising.

    Args:
        filepath (str): path of the tar archive to open with ``tarfile.open``.
        data_dir (str): directory the archive members are extracted into.
    """
    try:
        # The context manager closes the archive even if extraction fails midway.
        with tarfile.open(filepath) as tar:
            if hasattr(tarfile, 'data_filter'):
                # Extraction filters are available: refuse members that would land outside data_dir.
                tar.extractall(data_dir, filter='data')
            else:
                tar.extractall(data_dir)
    except Exception as err:
        # Deliberately broad: the caller carries on and works with whatever is on disk.
        print(f"Error while extracting {filepath}: {err}. Already extracted?")
|
|
|
|
def __process_data(data_root, filelists):
    """Merge the per-speaker source manifests into one manifest per split.

    For every split name in ``filelists``, all files matching ``{data_root}/*_{split}.json``
    are read line by line as JSON. Each entry is copied, tagged with the speaker id and
    audio quality parsed from the source file name, and the collected entries are written
    as JSON lines to ``{data_root}/{split}_manifest.json``.

    Args:
        data_root (Path): directory holding the source manifests; ``audio_filepath`` values
            are resolved relative to it.
        filelists (list of str): split names to process.

    Raises:
        FileNotFoundError: if an entry references an audio file that does not exist.
        KeyError: if an entry lacks one of the copied fields.
    """
    for split in tqdm(filelists):
        manifest_target = data_root / f"{split}_manifest.json"
        print(f"Creating manifest for {split}.")

        entries = []
        # glob order depends on the filesystem; sort so the generated manifest is reproducible.
        for manifest_src in sorted(glob.glob(str(data_root / f"*_{split}.json"))):
            # Match on the bare file name so the pattern does not depend on the path separator.
            search_res = re.fullmatch(r'([0-9]+)_manifest_([a-z]+)_.*\.json', Path(manifest_src).name)
            if search_res is None:
                print(f"Failed to find speaker id or audio quality for {manifest_src}, check formatting.")
                continue
            speaker_id = int(search_res.group(1))
            audio_quality = search_res.group(2)

            with open(manifest_src, 'r', encoding='utf-8') as f_in:
                for input_json_entry in f_in:
                    # Tolerate blank lines (e.g. a trailing newline at the end of the file).
                    if not input_json_entry.strip():
                        continue
                    data = json.loads(input_json_entry)

                    # Explicit check rather than `assert`, which is stripped under `python -O`.
                    wav_path = data_root / data['audio_filepath']
                    if not wav_path.exists():
                        raise FileNotFoundError(f"{wav_path} does not exist!")

                    entries.append(
                        {
                            'audio_filepath': data['audio_filepath'],
                            'duration': data['duration'],
                            'text': data['text'],
                            'normalized_text': data['text_normalized'],
                            'speaker': speaker_id,
                            'audio_quality': audio_quality,
                        }
                    )

        # Written only after every source manifest was read, so a failure above never
        # leaves a half-written target behind.
        with open(manifest_target, 'w', encoding='utf-8') as f_out:
            f_out.writelines(json.dumps(m) + '\n' for m in entries)
|
|
|
|
def main():
    """Download the HiFiTTS archive, extract it and build the requested manifest(s).

    The archive is stored as ``{data-root}/hi_fi_tts_v0.tar.gz`` and extracted into
    ``{data-root}``; manifests are then generated inside ``{data-root}/hi_fi_tts_v0``.
    """
    args = get_args()

    # Wrap a single split name in a list; list('train') would break it up into
    # ['t', 'r', 'a', 'i', 'n'] and no manifest would ever match.
    split = ['train', 'dev', 'test'] if args.split == 'all' else [args.split]

    tarred_data_path = args.data_root / "hi_fi_tts_v0.tar.gz"

    __maybe_download_file(URL, tarred_data_path)
    __extract_file(str(tarred_data_path), str(args.data_root))

    data_root = args.data_root / "hi_fi_tts_v0"
    __process_data(data_root, split)
|
|
|
|
# Run the pipeline only when the file is executed as a script, not when it is imported.
if __name__ == '__main__':
    main()
|
|