# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import os
import json

from tqdm import tqdm
def cal_metadata(cfg):
    """
    Dump metadata (singers.json, meta_info.json, utt2singer) for singer dataset or multi-datasets.

    For every dataset listed in ``cfg.dataset`` this:
      1. re-sorts its ``train.json``/``test.json`` by utterance duration and writes them back,
      2. writes an ``utt2singer`` mapping file (one "<Dataset>_<Uid>\\t<Dataset>_<Singer>" line per utterance),
      3. writes a singer-name -> integer-id lookup (``cfg.preprocess.spk2id``, e.g. singers.json),
      4. writes ``meta_info.json`` with size/hours statistics and per-singer minutes.

    Args:
        cfg: project config; reads ``cfg.dataset`` (list of dataset names) and
            ``cfg.preprocess.processed_dir`` / ``cfg.preprocess.spk2id`` /
            ``cfg.preprocess.utt2spk``. (Assumed attribute-style config object —
            TODO confirm against caller.)

    Raises:
        AssertionError: if a dataset's processed directory does not exist.
    """
    from collections import Counter

    # sorted() copy: avoid mutating cfg.dataset in place as a side effect.
    datasets = sorted(cfg.dataset)
    print("-" * 10)
    print("Preparing metadata...")
    print("Including: \n{}\n".format("\n".join(datasets)))

    for dataset in tqdm(datasets):
        save_dir = os.path.join(cfg.preprocess.processed_dir, dataset)
        assert os.path.exists(save_dir)

        # 'train.json' and 'test.json' of target dataset
        train_metadata = os.path.join(save_dir, "train.json")
        test_metadata = os.path.join(save_dir, "test.json")

        # Sort the metadata in duration order
        with open(train_metadata, "r", encoding="utf-8") as f:
            train_utterances = json.load(f)
        with open(test_metadata, "r", encoding="utf-8") as f:
            test_utterances = json.load(f)
        train_utterances = sorted(train_utterances, key=lambda x: x["Duration"])
        test_utterances = sorted(test_utterances, key=lambda x: x["Duration"])

        # Write back the sorted metadata.
        # encoding="utf-8" is required: ensure_ascii=False emits raw non-ASCII
        # characters, which fail on platforms with a non-UTF-8 default encoding.
        with open(train_metadata, "w", encoding="utf-8") as f:
            json.dump(train_utterances, f, indent=4, ensure_ascii=False)
        with open(test_metadata, "w", encoding="utf-8") as f:
            json.dump(test_utterances, f, indent=4, ensure_ascii=False)

        # Paths of metadata needed to be generated
        singer_dict_file = os.path.join(save_dir, cfg.preprocess.spk2id)
        utt2singer_file = os.path.join(save_dir, cfg.preprocess.utt2spk)

        # Get the total duration and singer names for train and test utterances
        train_total_duration = sum(utt["Duration"] for utt in train_utterances)
        test_total_duration = sum(utt["Duration"] for utt in test_utterances)

        # Singer names are keyed on the *original* (un-augmented) dataset name
        # so augmented copies of an utterance map to the same singer.
        singer_names = set(
            f"{replace_augment_name(utt['Dataset'])}_{utt['Singer']}"
            for utt in train_utterances + test_utterances
        )

        # Write the utt2singer file and sort the singer names
        with open(utt2singer_file, "w", encoding="utf-8") as f:
            for utt in train_utterances + test_utterances:
                f.write(
                    f"{utt['Dataset']}_{utt['Uid']}\t{replace_augment_name(utt['Dataset'])}_{utt['Singer']}\n"
                )
        singer_names = sorted(singer_names)
        singer_lut = {name: i for i, name in enumerate(singer_names)}

        # dump singers.json
        with open(singer_dict_file, "w", encoding="utf-8") as f:
            json.dump(singer_lut, f, indent=4, ensure_ascii=False)

        meta_info = {
            "dataset": dataset,
            "statistics": {
                "size": len(train_utterances) + len(test_utterances),
                # Round the true total, not the sum of two rounded halves,
                # so "hours" is consistent with train.hours + test.hours.
                "hours": round(
                    (train_total_duration + test_total_duration) / 3600, 4
                ),
            },
            "train": {
                "size": len(train_utterances),
                "hours": round(train_total_duration / 3600, 4),
            },
            "test": {
                "size": len(test_utterances),
                "hours": round(test_total_duration / 3600, 4),
            },
            "singers": {"size": len(singer_lut)},
        }

        # Use Counter to accumulate per-singer minutes
        total_singer2mins = Counter()
        training_singer2mins = Counter()
        for utt in train_utterances:
            k = f"{replace_augment_name(utt['Dataset'])}_{utt['Singer']}"
            training_singer2mins[k] += utt["Duration"] / 60
            total_singer2mins[k] += utt["Duration"] / 60
        for utt in test_utterances:
            k = f"{replace_augment_name(utt['Dataset'])}_{utt['Singer']}"
            total_singer2mins[k] += utt["Duration"] / 60

        # Sort singers by minutes (descending) and round for readability.
        training_singer2mins = dict(
            sorted(training_singer2mins.items(), key=lambda x: x[1], reverse=True)
        )
        training_singer2mins = {k: round(v, 2) for k, v in training_singer2mins.items()}
        meta_info["singers"]["training_minutes"] = training_singer2mins

        total_singer2mins = dict(
            sorted(total_singer2mins.items(), key=lambda x: x[1], reverse=True)
        )
        total_singer2mins = {k: round(v, 2) for k, v in total_singer2mins.items()}
        meta_info["singers"]["minutes"] = total_singer2mins

        with open(os.path.join(save_dir, "meta_info.json"), "w", encoding="utf-8") as f:
            json.dump(meta_info, f, indent=4, ensure_ascii=False)

        # "minutes" instead of "min": don't shadow the builtin min().
        for singer, minutes in training_singer2mins.items():
            print(f"Singer {singer}: {minutes} mins for training")

        print("-" * 10, "\n")
def replace_augment_name(dataset: str) -> str:
    """Replace the augmented dataset name with the original dataset name.

    >>> print(replace_augment_name("dataset_equalizer"))
    dataset
    """
    # First matching augmentation marker wins, mirroring the original
    # elif-chain order; names without a marker pass through untouched.
    for marker in ("equalizer", "formant_shift", "pitch_shift", "time_stretch"):
        if marker in dataset:
            return dataset.replace("_" + marker, "")
    return dataset