Spaces:

amphion
/

NaturalSpeech2

Running on T4

NaturalSpeech2 / preprocessors /__init__.py

yuancwang

init

b725c5a 11 months ago

4.79 kB

	# Copyright (c) 2023 Amphion.
	#
	# This source code is licensed under the MIT license found in the
	# LICENSE file in the root directory of this source tree.

	"""
	Golden ("standard") sample IDs of the source datasets, plus helpers that
	locate those samples inside the preprocessed split files.
	"""

	from collections import defaultdict
	import os
	import json

	# Dataset names treated as speech datasets.
	SPEECH_DATASETS = ["vctk", "vctksample"]

	# Golden test samples, keyed by dataset name. Each value lists the utterance
	# ids that get_golden_samples_indexes() compares against an utterance's "Uid".
	# A defaultdict is used so that an unknown dataset yields an empty list.
	GOLDEN_TEST_SAMPLES = defaultdict(
	    list,
	    {
	        "m4singer": [
	            "Alto-1_美错_0014",
	            "Bass-1_十年_0008",
	            "Soprano-2_同桌的你_0018",
	            "Tenor-5_爱笑的眼睛_0010",
	        ],
	        "svcc": [
	            # IDF1
	            "IDF1_10030",
	            "IDF1_10120",
	            "IDF1_10140",
	            # IDM1
	            "IDM1_10001",
	            "IDM1_10030",
	            "IDM1_10120",
	            # CDF1
	            "CDF1_10030",
	            "CDF1_10120",
	            "CDF1_10140",
	            # CDM1
	            "CDM1_10001",
	            "CDM1_10030",
	            "CDM1_10120",
	        ],
	        "svcceval": [
	            # SF1
	            "SF1_30001",
	            "SF1_30002",
	            "SF1_30003",
	            # SM1
	            "SM1_30001",
	            "SM1_30002",
	            "SM1_30003",
	        ],
	        "popbutfy": [
	            "Female1#you_are_my_sunshine_Professional#0",
	            "Female4#Someone_Like_You_Professional#10",
	            "Male2#Lemon_Tree_Professional#12",
	            "Male5#can_you_feel_the_love_tonight_Professional#20",
	        ],
	        "opensinger": [
	            "Man_0_大鱼_10",
	            "Man_21_丑八怪_14",
	            "Woman_39_mojito_22",
	            "Woman_40_易燃易爆炸_12",
	        ],
	        "nus48e": [
	            "ADIZ_read#01#0000",
	            "MCUR_sing#10#0000",
	            "JLEE_read#08#0001",
	            "SAMF_sing#18#0001",
	        ],
	        "popcs": [
	            "明天会更好_0004",
	            "欧若拉_0005",
	            "虫儿飞_0006",
	            "隐形的翅膀_0008",
	        ],
	        "kising": [
	            "421_0040",
	            "424_0013",
	            "431_0026",
	        ],
	        "csd": [
	            "en_004a_0001",
	            "en_042b_0006",
	            "kr_013a_0006",
	            "kr_045b_0004",
	        ],
	        "opera": [
	            "fem_01#neg_1#0000",
	            "fem_12#pos_3#0003",
	            "male_02#neg_1#0002",
	            "male_11#pos_2#0001",
	        ],
	        "lijian": [
	            "058矜持_0000",
	            "079绒花_0000",
	            "120遥远的天空底下_0000",
	        ],
	        "cdmusiceval": ["陶喆_普通朋友", "蔡琴_给电影人的情书"],
	    },
	)

	# Golden train samples, keyed by dataset name; no dataset defines any here.
	GOLDEN_TRAIN_SAMPLES = defaultdict(list)


	def get_golden_samples_indexes(
	    dataset_name,
	    dataset_dir=None,
	    cfg=None,
	    split=None,
	    min_samples=5,
	):
	    """Return the indexes of the golden (standard) samples inside a split.

	    The split's utterance list is loaded from ``<dataset_dir>/<split>.json``
	    and every utterance whose ``"Uid"`` is registered as a golden sample for
	    ``dataset_name`` contributes its position in that list.

	    Args:
	        dataset_name: Key used to look up the golden sample ids.
	        dataset_dir: Directory holding the split JSON files. When ``None`` it
	            is built from ``cfg.OUTPUT_PATH``, ``cfg.PREPROCESS_VERSION`` and
	            ``dataset_name``.
	        cfg: Configuration object; only required when ``dataset_dir`` is
	            ``None``.
	        split: Split name (must not be ``None``). A name containing "train"
	            selects the golden train samples, one containing "test" selects
	            the golden test samples.
	        min_samples: Number of leading utterances to return when no golden
	            sample is found.

	    Returns:
	        A list of integer indexes into the split's utterance list. If no
	        golden sample matches, the first ``min_samples`` indexes are returned,
	        never more than the split actually contains.
	    """
	    if dataset_dir is None:
	        assert cfg is not None
	        dataset_dir = os.path.join(
	            cfg.OUTPUT_PATH,
	            "preprocess/{}_version".format(cfg.PREPROCESS_VERSION),
	            dataset_name,
	        )

	    assert split is not None
	    utt_file = os.path.join(dataset_dir, "{}.json".format(split))
	    with open(utt_file, "r", encoding="utf-8") as f:
	        samples = json.load(f)

	    # Start from "no golden samples" so that a split which is neither a train
	    # nor a test split uses the fallback below instead of hitting an unbound
	    # local variable.
	    golden_samples = []
	    if "train" in split:
	        golden_samples = GOLDEN_TRAIN_SAMPLES[dataset_name]
	    if "test" in split:
	        golden_samples = GOLDEN_TEST_SAMPLES[dataset_name]

	    res = []
	    for idx, utt in enumerate(samples):
	        uid = utt["Uid"]
	        if uid in golden_samples:
	            res.append(idx)
	        elif (
	            dataset_name == "cdmusiceval"
	            and "_".join(uid.split("_")[:2]) in golden_samples
	        ):
	            # cdmusiceval golden ids only cover the first two "_"-separated
	            # fields of a Uid. The elif keeps an utterance that also matched
	            # exactly from being listed twice.
	            res.append(idx)

	    if not res:
	        # Fall back to the leading utterances, clamped to the split size so
	        # callers never receive an out-of-range index.
	        res = list(range(min(min_samples, len(samples))))

	    return res


	def get_specific_singer_indexes(dataset_dir, singer_name, split):
	    """Return the positions of one singer's utterances inside a split.

	    Loads ``<dataset_dir>/<split>.json`` and collects the index of every
	    utterance whose ``"Singer"`` field equals ``singer_name``. An
	    ``AssertionError`` is raised when the singer has no utterance at all.
	    """
	    metadata_path = os.path.join(dataset_dir, f"{split}.json")
	    with open(metadata_path, "r", encoding="utf-8") as fp:
	        entries = json.load(fp)

	    matched = [
	        position
	        for position, entry in enumerate(entries)
	        if entry["Singer"] == singer_name
	    ]

	    # An empty selection means the singer is absent from this split.
	    assert len(matched) != 0
	    return matched


	def get_uids_and_wav_paths(
	    cfg, dataset, dataset_type="train", only_specific_singer=None, return_singers=False
	):
	    """Collect utterance ids and wav paths (optionally singers) of a split.

	    The utterance list is read from
	    ``<cfg.OUTPUT_PATH>/preprocess/<cfg.PREPROCESS_VERSION>_version/<dataset>/<split>.json``
	    where ``split`` is the part of ``dataset_type`` after the last ``"_"``
	    (so ``"golden_test"`` reads ``test.json``).

	    Args:
	        cfg: Configuration object providing ``OUTPUT_PATH`` and
	            ``PREPROCESS_VERSION``.
	        dataset: Dataset name (sub-directory of the preprocess directory).
	        dataset_type: Split selector. If it contains "golden", only the golden
	            samples of the split are used.
	        only_specific_singer: When not ``None``, only that singer's utterances
	            are used; this selection replaces any golden selection.
	        return_singers: Whether to also return each utterance's singer.

	    Returns:
	        ``(uids, wav_paths)``, or ``(uids, wav_paths, singers)`` when
	        ``return_singers`` is true.
	    """
	    # The metadata files are named after the bare split, without a "golden_"
	    # prefix; the same name is passed to both index helpers.
	    split = dataset_type.split("_")[-1]

	    dataset_dir = os.path.join(
	        cfg.OUTPUT_PATH, "preprocess/{}_version".format(cfg.PREPROCESS_VERSION), dataset
	    )
	    dataset_file = os.path.join(dataset_dir, "{}.json".format(split))
	    # Read explicitly as UTF-8, like the other loaders in this module, because
	    # utterance ids can contain non-ASCII text.
	    with open(dataset_file, "r", encoding="utf-8") as f:
	        utterances = json.load(f)

	    indexes = range(len(utterances))
	    if "golden" in dataset_type:
	        # golden_train or golden_test
	        indexes = get_golden_samples_indexes(dataset, dataset_dir, split=split)
	    if only_specific_singer is not None:
	        # Look the singer up in the same split file that was loaded above.
	        indexes = get_specific_singer_indexes(
	            dataset_dir, only_specific_singer, split
	        )

	    uids = [utterances[i]["Uid"] for i in indexes]
	    wav_paths = [utterances[i]["Path"] for i in indexes]

	    if not return_singers:
	        return uids, wav_paths

	    # "Singer" is only read when requested, so metadata without that field
	    # still works for callers that do not need it.
	    singers = [utterances[i]["Singer"] for i in indexes]
	    return uids, wav_paths, singers