Text-to-Speech / processors /acoustic_extractor.py
zyingt's picture
Upload 685 files
0d80816
raw
history blame
42.3 kB
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import os
import torch
import numpy as np
import torchaudio
import json
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
from utils.io import save_feature, save_txt, save_torch_audio
from utils.util import has_existed
from utils.tokenizer import extract_encodec_token
from utils.stft import TacotronSTFT
from utils.dsp import compress, audio_to_label
from utils.data_utils import remove_outlier
from preprocessors.metadata import replace_augment_name
from scipy.interpolate import interp1d
from concurrent.futures import ProcessPoolExecutor
from functools import partial
ZERO = 1e-12
def extract_utt_acoustic_features_parallel(metadata, dataset_output, cfg, num_workers=1):
    """Extract acoustic features from utterances using multiple processes.

    Args:
        metadata (iterable): utterance entries, as stored in the train.json and
            test.json files
        dataset_output (str): directory to store acoustic features
        cfg (dict): dictionary that stores configurations
        num_workers (int, optional): number of processes used to extract
            features in parallel. Defaults to 1.

    Returns:
        list: one entry per submitted utterance, in ``metadata`` order, holding
            whatever the task-specific extractor returned. Empty when
            ``cfg.task_type`` is not one of "tts", "svc", "vocoder" or "tta".
    """
    extractors = {
        "tts": extract_utt_acoustic_features_tts,
        "svc": extract_utt_acoustic_features_svc,
        "vocoder": extract_utt_acoustic_features_vocoder,
        "tta": extract_utt_acoustic_features_tta,
    }
    # An unrecognised task type submits no work.
    extractor = extractors.get(cfg.task_type)

    # The context manager shuts the pool down and releases the worker
    # processes, including when one of the jobs raises.
    with ProcessPoolExecutor(max_workers=num_workers) as executor:
        futures = []
        if extractor is not None:
            futures = [
                executor.submit(extractor, dataset_output, cfg, utt)
                for utt in metadata
            ]
        # Collect inside the ``with`` so the progress bar tracks completion.
        return [future.result() for future in tqdm(futures)]
# def extract_utt_acoustic_features_parallel(metadata, dataset_output, cfg, num_workers=1):
# """Extract acoustic features from utterances using muliprocess
# Args:
# metadata (dict): dictionary that stores data in train.json and test.json files
# dataset_output (str): directory to store acoustic features
# cfg (dict): dictionary that stores configurations
# num_workers (int, optional): num of processes to extract features in parallel. Defaults to 1.
# Returns:
# list: acoustic features
# """
# for utt in tqdm(metadata):
# if cfg.task_type == "tts":
# extract_utt_acoustic_features_tts(dataset_output, cfg, utt)
# if cfg.task_type == "svc":
# extract_utt_acoustic_features_svc(dataset_output, cfg, utt)
# if cfg.task_type == "vocoder":
# extract_utt_acoustic_features_vocoder(dataset_output, cfg, utt)
# if cfg.task_type == "tta":
# extract_utt_acoustic_features_tta(dataset_output, cfg, utt)
def avg_phone_feature(feature, duration, interpolation=False):
    """Average a frame-level feature over the frames of each phone.

    The input array is never modified; the phone-level values are written to a
    newly allocated array.

    Args:
        feature (np.ndarray): frame-level feature values (assumed 1-D, e.g. a
            pitch or energy contour).
        duration (sequence of int): number of frames belonging to each phone.
        interpolation (bool, optional): if True, zero-valued frames are first
            replaced by linear interpolation between the surrounding non-zero
            frames (edges are filled with the first/last non-zero value).
            Skipped when every frame is zero. Defaults to False.

    Returns:
        np.ndarray: array of ``len(duration)`` values; the mean of each phone's
            frames, or 0 for phones with a non-positive duration.
    """
    # Slicing gives a view of the caller's data; it is only read from here on.
    frames = np.asarray(feature)[: sum(duration)]

    if interpolation:
        nonzero_ids = np.where(frames != 0)[0]
        # With no non-zero frame there is nothing to interpolate from.
        if len(nonzero_ids) > 0:
            interp_fn = interp1d(
                nonzero_ids,
                frames[nonzero_ids],
                fill_value=(frames[nonzero_ids[0]], frames[nonzero_ids[-1]]),
                bounds_error=False,
            )
            frames = interp_fn(np.arange(0, len(frames)))

    # Phoneme-level average, accumulated in a separate buffer so that frames
    # that have not been read yet are never overwritten.
    phone_feature = np.zeros(len(duration), dtype=frames.dtype)
    pos = 0
    for i, d in enumerate(duration):
        if d > 0:
            phone_feature[i] = np.mean(frames[pos : pos + d])
        pos += d
    return phone_feature
def extract_utt_acoustic_features_serial(metadata, dataset_output, cfg):
    """Extract acoustic features utterance by utterance in the current process.

    Args:
        metadata (iterable): utterance entries, as stored in the train.json and
            test.json files
        dataset_output (str): directory to store acoustic features
        cfg (dict): dictionary that stores configurations; ``cfg.task_type``
            selects the extractor ("tts", "svc", "vocoder" or "tta")
    """
    task_extractors = (
        ("tts", extract_utt_acoustic_features_tts),
        ("svc", extract_utt_acoustic_features_svc),
        ("vocoder", extract_utt_acoustic_features_vocoder),
        ("tta", extract_utt_acoustic_features_tta),
    )
    for utt in tqdm(metadata):
        for task_type, extractor in task_extractors:
            if cfg.task_type == task_type:
                extractor(dataset_output, cfg, utt)
def __extract_utt_acoustic_features(dataset_output, cfg, utt):
    """Extract and save the acoustic features of a single utterance.

    Each feature is computed only when its ``cfg.preprocess.extract_*`` switch
    is enabled, and is written below ``dataset_output`` with ``save_feature``
    (``save_txt`` for the phone labels).

    Args:
        dataset_output (str): directory to store acoustic features
        cfg (dict): dictionary that stores configurations
        utt (dict): utterance info including dataset, singer, uid:{singer}_{song}_{index},
                    path to utterance, duration, utterance index

    Raises:
        ValueError: if ``cfg.preprocess.energy_extract_mode`` is not supported,
            or is "from_mel" while mel extraction is disabled.
    """
    from utils import audio, f0, world, duration

    def build_tacotron_stft():
        # Built on demand: only the Tacotron-style mel/energy modes need it.
        return TacotronSTFT(
            sampling_rate=cfg.preprocess.sample_rate,
            win_length=cfg.preprocess.win_size,
            hop_length=cfg.preprocess.hop_size,
            filter_length=cfg.preprocess.n_fft,
            n_mel_channels=cfg.preprocess.n_mel,
            mel_fmin=cfg.preprocess.fmin,
            mel_fmax=cfg.preprocess.fmax,
        )

    uid = utt["Uid"]
    wav_path = utt["Path"]
    # Prefer the copy under the dataset's raw-data directory when it exists.
    if os.path.exists(os.path.join(dataset_output, cfg.preprocess.raw_data)):
        wav_path = os.path.join(
            dataset_output, cfg.preprocess.raw_data, utt["Singer"], uid + ".wav"
        )

    with torch.no_grad():
        # Load audio data into tensor with sample rate of the config file
        wav_torch, _ = audio.load_audio_torch(wav_path, cfg.preprocess.sample_rate)
        wav = wav_torch.cpu().numpy()

        # extract features
        if cfg.preprocess.extract_duration:
            durations, phones, start, end = duration.get_duration(
                utt, wav, cfg.preprocess
            )
            save_feature(dataset_output, cfg.preprocess.duration_dir, uid, durations)
            save_txt(dataset_output, cfg.preprocess.lab_dir, uid, phones)
            # All later features are computed on the [start:end] excerpt.
            wav = wav[start:end].astype(np.float32)
            wav_torch = torch.from_numpy(wav).to(wav_torch.device)
            total_frames = sum(durations)

        if cfg.preprocess.extract_linear_spec:
            from utils.mel import extract_linear_features

            linear = extract_linear_features(wav_torch.unsqueeze(0), cfg.preprocess)
            save_feature(
                dataset_output, cfg.preprocess.linear_dir, uid, linear.cpu().numpy()
            )

        if cfg.preprocess.extract_mel:
            from utils.mel import extract_mel_features

            if cfg.preprocess.mel_extract_mode == "taco":
                mel = extract_mel_features(
                    wav_torch.unsqueeze(0),
                    cfg.preprocess,
                    taco=True,
                    _stft=build_tacotron_stft(),
                )
                if cfg.preprocess.extract_duration:
                    mel = mel[:, :total_frames]
            else:
                mel = extract_mel_features(wav_torch.unsqueeze(0), cfg.preprocess)
            save_feature(dataset_output, cfg.preprocess.mel_dir, uid, mel.cpu().numpy())

        if cfg.preprocess.extract_energy:
            energy_mode = cfg.preprocess.energy_extract_mode
            if energy_mode == "from_mel":
                # The mel computed above is reused, so it must have been extracted.
                if not cfg.preprocess.extract_mel:
                    raise ValueError(
                        "energy_extract_mode 'from_mel' requires "
                        "cfg.preprocess.extract_mel to be enabled"
                    )
                energy = (mel.exp() ** 2).sum(0).sqrt().cpu().numpy()
            elif energy_mode == "from_waveform":
                energy = audio.energy(wav, cfg.preprocess)
            elif energy_mode == "from_tacotron_stft":
                _, energy = audio.get_energy_from_tacotron(wav, build_tacotron_stft())
            else:
                raise ValueError(
                    f"{energy_mode} not in supported energy_extract_mode [from_mel, from_waveform, from_tacotron_stft]"
                )

            if cfg.preprocess.extract_duration:
                energy = energy[:total_frames]
                phone_energy = avg_phone_feature(energy, durations)
                save_feature(
                    dataset_output, cfg.preprocess.phone_energy_dir, uid, phone_energy
                )

            save_feature(dataset_output, cfg.preprocess.energy_dir, uid, energy)

        if cfg.preprocess.extract_pitch:
            pitch = f0.get_f0(wav, cfg.preprocess)
            if cfg.preprocess.extract_duration:
                pitch = pitch[:total_frames]
                phone_pitch = avg_phone_feature(pitch, durations, interpolation=True)
                save_feature(
                    dataset_output, cfg.preprocess.phone_pitch_dir, uid, phone_pitch
                )
            save_feature(dataset_output, cfg.preprocess.pitch_dir, uid, pitch)

            if cfg.preprocess.extract_uv:
                # Voiced/unvoiced mask: True wherever the pitch is non-zero.
                assert isinstance(pitch, np.ndarray)
                uv = pitch != 0
                save_feature(dataset_output, cfg.preprocess.uv_dir, uid, uv)

        if cfg.preprocess.extract_audio:
            save_feature(dataset_output, cfg.preprocess.audio_dir, uid, wav)

        if cfg.preprocess.extract_label:
            if cfg.preprocess.is_mu_law:
                # compress audio
                wav = compress(wav, cfg.preprocess.bits)
            label = audio_to_label(wav, cfg.preprocess.bits)
            save_feature(dataset_output, cfg.preprocess.label_dir, uid, label)

        if cfg.preprocess.extract_acoustic_token:
            if cfg.preprocess.acoustic_token_extractor == "Encodec":
                # Tokens are extracted from the file on disk, not from ``wav``.
                codes = extract_encodec_token(wav_path)
                save_feature(
                    dataset_output, cfg.preprocess.acoustic_token_dir, uid, codes
                )
# TODO: refactor extract_utt_acoustic_features_task function due to many duplicated code
def extract_utt_acoustic_features_tts(dataset_output, cfg, utt):
    """Extract and save the acoustic features of a single utterance for TTS.

    Each feature is computed only when its ``cfg.preprocess.extract_*`` switch
    is enabled, and is written below ``dataset_output``. The audio itself is
    stored with ``save_torch_audio``; every other feature with ``save_feature``
    (``save_txt`` for the phone labels).

    Args:
        dataset_output (str): directory to store acoustic features
        cfg (dict): dictionary that stores configurations
        utt (dict): utterance info including dataset, singer, uid:{singer}_{song}_{index},
                    path to utterance, duration, utterance index

    Raises:
        AssertionError: if the raw-data directory exists but contains neither
            a ``.wav`` nor a ``.flac`` file for the utterance.
        ValueError: if ``cfg.preprocess.energy_extract_mode`` is not supported,
            or is "from_mel" while mel extraction is disabled.
    """
    from utils import audio, f0, world, duration

    def build_tacotron_stft():
        # Built on demand: only the Tacotron-style mel/energy modes need it.
        return TacotronSTFT(
            sampling_rate=cfg.preprocess.sample_rate,
            win_length=cfg.preprocess.win_size,
            hop_length=cfg.preprocess.hop_size,
            filter_length=cfg.preprocess.n_fft,
            n_mel_channels=cfg.preprocess.n_mel,
            mel_fmin=cfg.preprocess.fmin,
            mel_fmax=cfg.preprocess.fmax,
        )

    uid = utt["Uid"]
    wav_path = utt["Path"]
    # Prefer the copy under the dataset's raw-data directory when it exists,
    # trying ``.wav`` first and falling back to ``.flac``.
    if os.path.exists(os.path.join(dataset_output, cfg.preprocess.raw_data)):
        wav_path = os.path.join(
            dataset_output, cfg.preprocess.raw_data, utt["Singer"], uid + ".wav"
        )
        if not os.path.exists(wav_path):
            wav_path = os.path.join(
                dataset_output, cfg.preprocess.raw_data, utt["Singer"], uid + ".flac"
            )

        assert os.path.exists(wav_path), f"audio file not found: {wav_path}"

    with torch.no_grad():
        # Load audio data into tensor with sample rate of the config file
        wav_torch, _ = audio.load_audio_torch(wav_path, cfg.preprocess.sample_rate)
        wav = wav_torch.cpu().numpy()

        # extract features
        if cfg.preprocess.extract_duration:
            durations, phones, start, end = duration.get_duration(
                utt, wav, cfg.preprocess
            )
            save_feature(dataset_output, cfg.preprocess.duration_dir, uid, durations)
            save_txt(dataset_output, cfg.preprocess.lab_dir, uid, phones)
            # All later features are computed on the [start:end] excerpt.
            wav = wav[start:end].astype(np.float32)
            wav_torch = torch.from_numpy(wav).to(wav_torch.device)
            total_frames = sum(durations)

        if cfg.preprocess.extract_linear_spec:
            from utils.mel import extract_linear_features

            linear = extract_linear_features(wav_torch.unsqueeze(0), cfg.preprocess)
            save_feature(
                dataset_output, cfg.preprocess.linear_dir, uid, linear.cpu().numpy()
            )

        if cfg.preprocess.extract_mel:
            from utils.mel import extract_mel_features

            if cfg.preprocess.mel_extract_mode == "taco":
                mel = extract_mel_features(
                    wav_torch.unsqueeze(0),
                    cfg.preprocess,
                    taco=True,
                    _stft=build_tacotron_stft(),
                )
                if cfg.preprocess.extract_duration:
                    mel = mel[:, :total_frames]
            else:
                mel = extract_mel_features(wav_torch.unsqueeze(0), cfg.preprocess)
            save_feature(dataset_output, cfg.preprocess.mel_dir, uid, mel.cpu().numpy())

        if cfg.preprocess.extract_energy:
            energy_mode = cfg.preprocess.energy_extract_mode
            if energy_mode == "from_mel":
                # The mel computed above is reused, so it must have been extracted.
                if not cfg.preprocess.extract_mel:
                    raise ValueError(
                        "energy_extract_mode 'from_mel' requires "
                        "cfg.preprocess.extract_mel to be enabled"
                    )
                energy = (mel.exp() ** 2).sum(0).sqrt().cpu().numpy()
            elif energy_mode == "from_waveform":
                energy = audio.energy(wav, cfg.preprocess)
            elif energy_mode == "from_tacotron_stft":
                _, energy = audio.get_energy_from_tacotron(wav, build_tacotron_stft())
            else:
                raise ValueError(
                    f"{energy_mode} not in supported energy_extract_mode [from_mel, from_waveform, from_tacotron_stft]"
                )

            if cfg.preprocess.extract_duration:
                energy = energy[:total_frames]
                phone_energy = avg_phone_feature(energy, durations)
                save_feature(
                    dataset_output, cfg.preprocess.phone_energy_dir, uid, phone_energy
                )

            save_feature(dataset_output, cfg.preprocess.energy_dir, uid, energy)

        if cfg.preprocess.extract_pitch:
            pitch = f0.get_f0(wav, cfg.preprocess)
            if cfg.preprocess.extract_duration:
                pitch = pitch[:total_frames]
                phone_pitch = avg_phone_feature(pitch, durations, interpolation=True)
                save_feature(
                    dataset_output, cfg.preprocess.phone_pitch_dir, uid, phone_pitch
                )
            save_feature(dataset_output, cfg.preprocess.pitch_dir, uid, pitch)

            if cfg.preprocess.extract_uv:
                # Voiced/unvoiced mask: True wherever the pitch is non-zero.
                assert isinstance(pitch, np.ndarray)
                uv = pitch != 0
                save_feature(dataset_output, cfg.preprocess.uv_dir, uid, uv)

        if cfg.preprocess.extract_audio:
            save_torch_audio(
                dataset_output,
                cfg.preprocess.audio_dir,
                uid,
                wav_torch,
                cfg.preprocess.sample_rate,
            )

        if cfg.preprocess.extract_label:
            if cfg.preprocess.is_mu_law:
                # compress audio
                wav = compress(wav, cfg.preprocess.bits)
            label = audio_to_label(wav, cfg.preprocess.bits)
            save_feature(dataset_output, cfg.preprocess.label_dir, uid, label)

        if cfg.preprocess.extract_acoustic_token:
            if cfg.preprocess.acoustic_token_extractor == "Encodec":
                # Tokens are extracted from the file on disk, not from ``wav``.
                codes = extract_encodec_token(wav_path)
                save_feature(
                    dataset_output, cfg.preprocess.acoustic_token_dir, uid, codes
                )
def extract_utt_acoustic_features_svc(dataset_output, cfg, utt):
    """Extract acoustic features of one utterance for the "svc" task.

    Delegates to the shared ``__extract_utt_acoustic_features`` routine.

    Args:
        dataset_output (str): directory to store acoustic features
        cfg (dict): dictionary that stores configurations
        utt (dict): utterance info
    """
    __extract_utt_acoustic_features(dataset_output=dataset_output, cfg=cfg, utt=utt)
def extract_utt_acoustic_features_tta(dataset_output, cfg, utt):
    """Extract acoustic features of one utterance for the "tta" task.

    Delegates to the shared ``__extract_utt_acoustic_features`` routine.

    Args:
        dataset_output (str): directory to store acoustic features
        cfg (dict): dictionary that stores configurations
        utt (dict): utterance info
    """
    __extract_utt_acoustic_features(dataset_output=dataset_output, cfg=cfg, utt=utt)
def extract_utt_acoustic_features_vocoder(dataset_output, cfg, utt):
    """Extract and save the acoustic features of a single utterance for vocoders.

    Each feature is computed only when its ``cfg.preprocess.extract_*`` switch
    is enabled, and is written below ``dataset_output`` with ``save_feature``.
    The audio is always loaded from ``utt["Path"]``.

    Args:
        dataset_output (str): directory to store acoustic features
        cfg (dict): dictionary that stores configurations
        utt (dict): utterance info including dataset, singer, uid:{singer}_{song}_{index},
                    path to utterance, duration, utterance index

    Raises:
        ValueError: if ``cfg.preprocess.energy_extract_mode`` is not
            "from_mel" or "from_waveform", or is "from_mel" while mel
            extraction is disabled.
    """
    from utils import audio, f0, world, duration

    uid = utt["Uid"]
    wav_path = utt["Path"]

    with torch.no_grad():
        # Load audio data into tensor with sample rate of the config file
        wav_torch, _ = audio.load_audio_torch(wav_path, cfg.preprocess.sample_rate)
        wav = wav_torch.cpu().numpy()

        # extract features
        if cfg.preprocess.extract_mel:
            from utils.mel import extract_mel_features

            mel = extract_mel_features(wav_torch.unsqueeze(0), cfg.preprocess)
            save_feature(dataset_output, cfg.preprocess.mel_dir, uid, mel.cpu().numpy())

        if cfg.preprocess.extract_energy:
            energy_mode = cfg.preprocess.energy_extract_mode
            if energy_mode == "from_mel":
                # The mel computed above is reused, so it must have been extracted.
                if not cfg.preprocess.extract_mel:
                    raise ValueError(
                        "energy_extract_mode 'from_mel' requires "
                        "cfg.preprocess.extract_mel to be enabled"
                    )
                energy = (mel.exp() ** 2).sum(0).sqrt().cpu().numpy()
            elif energy_mode == "from_waveform":
                energy = audio.energy(wav, cfg.preprocess)
            else:
                raise ValueError(
                    f"{energy_mode} not in supported energy_extract_mode [from_mel, from_waveform]"
                )

            save_feature(dataset_output, cfg.preprocess.energy_dir, uid, energy)

        if cfg.preprocess.extract_pitch:
            pitch = f0.get_f0(wav, cfg.preprocess)
            save_feature(dataset_output, cfg.preprocess.pitch_dir, uid, pitch)

            if cfg.preprocess.extract_uv:
                # Voiced/unvoiced mask: True wherever the pitch is non-zero.
                assert isinstance(pitch, np.ndarray)
                uv = pitch != 0
                save_feature(dataset_output, cfg.preprocess.uv_dir, uid, uv)

        if cfg.preprocess.extract_audio:
            save_feature(dataset_output, cfg.preprocess.audio_dir, uid, wav)

        if cfg.preprocess.extract_label:
            if cfg.preprocess.is_mu_law:
                # compress audio
                wav = compress(wav, cfg.preprocess.bits)
            label = audio_to_label(wav, cfg.preprocess.bits)
            save_feature(dataset_output, cfg.preprocess.label_dir, uid, label)
def cal_normalized_mel(mel, dataset_name, cfg):
    """Normalize a mel with the stored per-channel extrema of a dataset.

    Args:
        mel: mel spectrogram to normalize
        dataset_name (str): dataset whose saved mel min/max statistics are used
        cfg: configuration passed unchanged to ``load_mel_extrema``, which
            reads ``processed_dir`` and ``mel_min_max_stats_dir`` from it
            (presumably the ``preprocess`` sub-config; confirm against callers)

    Returns:
        The result of ``normalize_mel_channel`` for ``mel``.
    """
    extrema = load_mel_extrema(cfg, dataset_name)
    return normalize_mel_channel(mel, *extrema)
def cal_mel_min_max(dataset, output_path, cfg, metadata=None):
    """Compute and save the per-channel minimum and maximum over a dataset's mels.

    The results are stored as ``mel_min.npy`` and ``mel_max.npy`` inside
    ``<output_path>/<dataset>/<cfg.preprocess.mel_min_max_stats_dir>``.

    Args:
        dataset (str): dataset name (sub-directory of ``output_path``)
        output_path (str): root directory of the processed datasets
        cfg (dict): dictionary that stores configurations
        metadata (list, optional): utterance entries to scan. When None, the
            dataset's train.json and test.json are loaded (only test.json if
            the dataset name contains "eval").
    """
    dataset_output = os.path.join(output_path, dataset)

    if metadata is None:
        metadata = []
        split_names = ["test"] if "eval" in dataset else ["train", "test"]
        for split_name in split_names:
            split_file = os.path.join(dataset_output, "{}.json".format(split_name))
            with open(split_file, "r") as f:
                metadata.extend(json.load(f))

    per_utt_min = []
    per_utt_max = []
    for item in metadata:
        mel_path = os.path.join(
            dataset_output, cfg.preprocess.mel_dir, item["Uid"] + ".npy"
        )
        # Utterances without an extracted mel are ignored.
        if os.path.exists(mel_path):
            mel = np.load(mel_path)
            # Bring the mel to (n_mels, T) when it is stored the other way round.
            if mel.shape[0] != cfg.preprocess.n_mel:
                mel = mel.T
            assert mel.shape[0] == cfg.preprocess.n_mel

            per_utt_min.append(mel.min(axis=-1))
            per_utt_max.append(mel.max(axis=-1))

    mel_min = np.asarray(per_utt_min).min(axis=0)
    mel_max = np.asarray(per_utt_max).max(axis=0)

    # Save mel min/max data
    stats_dir = os.path.join(dataset_output, cfg.preprocess.mel_min_max_stats_dir)
    os.makedirs(stats_dir, exist_ok=True)
    for file_name, values in (("mel_min.npy", mel_min), ("mel_max.npy", mel_max)):
        np.save(os.path.join(stats_dir, file_name), values)
def denorm_for_pred_mels(cfg, dataset_name, split, pred):
    """Undo the per-channel mel normalization for a list of predicted mels.

    Args:
        cfg: configuration; ``cfg.preprocess`` locates the saved mel extrema
        dataset_name (str): dataset whose mel min/max statistics are used
        split: not used by this function
        pred: a list whose every element is (frame_len, n_mels)

    Return:
        A list with the same layout as ``pred``, holding the denormalized mels.
    """
    mel_min, mel_max = load_mel_extrema(cfg.preprocess, dataset_name)

    recovered_mels = []
    for mel in pred:
        # The channel-wise helper works on the transposed layout.
        restored = denormalize_mel_channel(mel.T, mel_min, mel_max)
        recovered_mels.append(restored.T)
    return recovered_mels
def load_mel_extrema(cfg, dataset_name):
    """Load the saved mel minimum and maximum arrays of a dataset.

    Args:
        cfg: configuration providing ``processed_dir`` and
            ``mel_min_max_stats_dir``
        dataset_name (str): dataset whose statistics are loaded

    Returns:
        tuple: ``(mel_min, mel_max)`` as loaded from ``mel_min.npy`` and
            ``mel_max.npy``.
    """
    stats_dir = os.path.join(
        cfg.processed_dir, dataset_name, cfg.mel_min_max_stats_dir
    )
    mel_min, mel_max = (
        np.load(os.path.join(stats_dir, file_name))
        for file_name in ("mel_min.npy", "mel_max.npy")
    )
    return mel_min, mel_max
def denormalize_mel_channel(mel, mel_min, mel_max):
    """Map a mel from the [-1, 1] range back to its per-channel value range.

    Inverse of ``normalize_mel_channel``.

    Args:
        mel: normalized mel; the extrema are broadcast along its last axis
        mel_min: per-channel minimum
        mel_max: per-channel maximum

    Returns:
        The denormalized mel.
    """
    lower = np.expand_dims(mel_min, -1)
    upper = np.expand_dims(mel_max, -1)
    # ZERO is the same small offset the normalization adds to the range.
    channel_range = upper - lower + ZERO
    unit_scaled = (mel + 1) / 2
    return unit_scaled * channel_range + lower
def normalize_mel_channel(mel, mel_min, mel_max):
    """Scale a mel per channel so that [mel_min, mel_max] maps to [-1, 1].

    Args:
        mel: mel to normalize; the extrema are broadcast along its last axis
        mel_min: per-channel minimum
        mel_max: per-channel maximum

    Returns:
        The normalized mel.
    """
    lower = np.expand_dims(mel_min, -1)
    upper = np.expand_dims(mel_max, -1)
    # ZERO keeps the division defined when a channel has max == min.
    channel_range = upper - lower + ZERO
    shifted = mel - lower
    return shifted / channel_range * 2 - 1
def normalize(dataset, feat_dir, cfg):
    """Compute global statistics of a feature and save them next to it.

    Every ``.npy`` file in ``<processed_dir>/<dataset>/<feat_dir>`` is scanned.
    The array ``[min, max, mean, std]`` is saved to
    ``<processed_dir>/<dataset>/<feat_dir>_stat.npy``.

    Args:
        dataset (str): dataset name
        feat_dir (str): name of the feature directory inside the dataset
        cfg (dict): dictionary that stores configurations

    Returns:
        tuple: ``(mean, std, min_value, max_value)`` over all feature values.

    Raises:
        ValueError: if the feature directory contains no ``.npy`` file.
    """
    dataset_output = os.path.join(cfg.preprocess.processed_dir, dataset)
    print(f"normalize {feat_dir}")

    max_value = np.finfo(np.float64).min
    min_value = np.finfo(np.float64).max

    scaler = StandardScaler()
    feat_root = os.path.join(dataset_output, feat_dir)
    num_fitted = 0
    for feat_file in tqdm(os.listdir(feat_root)):
        if not feat_file.endswith(".npy"):
            continue
        feat = np.load(os.path.join(feat_root, feat_file))
        # Array reductions run in native code and accept any number of
        # dimensions, unlike the builtin max()/min() over the array.
        max_value = max(max_value, feat.max())
        min_value = min(min_value, feat.min())
        scaler.partial_fit(feat.reshape((-1, 1)))
        num_fitted += 1

    if num_fitted == 0:
        # The scaler has no fitted attributes to read in this case.
        raise ValueError(f"no .npy feature files found in {feat_root}")

    mean = scaler.mean_[0]
    std = scaler.scale_[0]
    stat = np.array([min_value, max_value, mean, std])
    stat_npy = os.path.join(dataset_output, f"{feat_dir}_stat.npy")
    np.save(stat_npy, stat)
    return mean, std, min_value, max_value
def load_normalized(feat_dir, dataset_name, cfg):
    """Load the statistics saved as ``<feat_dir>_stat.npy`` for a dataset.

    Args:
        feat_dir (str): name of the feature directory the statistics belong to
        dataset_name (str): dataset name
        cfg (dict): dictionary that stores configurations

    Returns:
        tuple: ``(mean, std, min_value, max_value)``.
    """
    stat_path = os.path.join(
        cfg.preprocess.processed_dir, dataset_name, f"{feat_dir}_stat.npy"
    )
    # The file stores the values in the order [min, max, mean, std].
    stats = np.load(stat_path)
    min_value, max_value, mean, std = stats
    return mean, std, min_value, max_value
def cal_pitch_statistics_svc(dataset, output_path, cfg, metadata=None):
    """Compute per-singer pitch statistics and save them as statistics.json.

    For every singer, mean/std/median/min/max are computed twice: over the
    non-zero pitch values only ("voiced_positions") and over all values
    ("total_positions"). Nothing is done when ``has_existed`` reports that the
    statistics file is already there.

    Args:
        dataset (str): dataset name (sub-directory of ``output_path``)
        output_path (str): root directory of the processed datasets
        cfg (dict): dictionary that stores configurations
        metadata (list, optional): utterance entries to use. When None,
            singers.json and the train.json/test.json files of the dataset are
            loaded (only test.json if the dataset name contains "eval").
    """

    def summarize(values):
        # Same keys and order as stored in statistics.json.
        return {
            "mean": np.mean(values),
            "std": np.std(values),
            "median": np.median(values),
            "min": np.min(values),
            "max": np.max(values),
        }

    # path of dataset
    dataset_dir = os.path.join(output_path, dataset)
    save_dir = os.path.join(dataset_dir, cfg.preprocess.pitch_dir)
    os.makedirs(save_dir, exist_ok=True)
    statistics_path = os.path.join(save_dir, "statistics.json")
    if has_existed(statistics_path):
        return

    if metadata is None:
        # load singers and ids
        with open(os.path.join(dataset_dir, "singers.json"), "r") as f:
            singers = json.load(f)

        # combine train and test metadata
        metadata = []
        for dataset_type in ["train", "test"] if "eval" not in dataset else ["test"]:
            dataset_file = os.path.join(dataset_dir, "{}.json".format(dataset_type))
            with open(dataset_file, "r") as f:
                metadata.extend(json.load(f))
    else:
        # Sorted so that the singer -> id mapping is the same on every run.
        singer_names = sorted({item["Singer"] for item in metadata})
        singers = {
            "{}_{}".format(dataset, name): idx
            for idx, name in enumerate(singer_names)
        }

    # one list of values per singer id
    pitch_scalers = [[] for _ in range(len(singers))]
    total_pitch_scalers = [[] for _ in range(len(singers))]

    for utt_info in tqdm(metadata, desc="Loading F0..."):
        singer = utt_info["Singer"]
        pitch_path = os.path.join(
            dataset_dir, cfg.preprocess.pitch_dir, utt_info["Uid"] + ".npy"
        )
        # utterances without an extracted pitch file are skipped
        if not os.path.exists(pitch_path):
            continue
        # total_pitch contains all pitch including unvoiced frames
        total_pitch = np.load(pitch_path)
        assert len(total_pitch) > 0
        # pitch contains only voiced frames
        pitch = total_pitch[total_pitch != 0]
        spkid = singers[f"{replace_augment_name(dataset)}_{singer}"]

        # update pitch scalers
        pitch_scalers[spkid].extend(pitch.tolist())
        # update total pitch scalers
        total_pitch_scalers[spkid].extend(total_pitch.tolist())

    # save pitch statistics for each singer in dict
    sta_dict = {}
    for singer in tqdm(singers, desc="Singers statistics"):
        spkid = singers[singer]
        sta_dict[singer] = {
            "voiced_positions": summarize(pitch_scalers[spkid]),
            "total_positions": summarize(total_pitch_scalers[spkid]),
        }

    # save statistics
    with open(statistics_path, "w") as f:
        json.dump(sta_dict, f, indent=4, ensure_ascii=False)
def cal_pitch_statistics(dataset, output_path, cfg):
    """Compute per-singer pitch statistics and save them as statistics.json.

    Phone-level pitch is used when ``cfg.preprocess.use_phone_pitch`` is set,
    frame-level pitch otherwise. For every singer, mean/std/median/min/max are
    computed twice: under "voiced_positions" over the values returned by
    ``remove_outlier`` (or over all values when
    ``cfg.preprocess.pitch_remove_outlier`` is off), and under
    "total_positions" over all values. Nothing is done when ``has_existed``
    reports that the statistics file is already there.

    Args:
        dataset (str): dataset name (sub-directory of ``output_path``)
        output_path (str): root directory of the processed datasets
        cfg (dict): dictionary that stores configurations
    """

    def summarize(values):
        # Same keys and order as stored in statistics.json.
        return {
            "mean": np.mean(values),
            "std": np.std(values),
            "median": np.median(values),
            "min": np.min(values),
            "max": np.max(values),
        }

    # path of dataset
    dataset_dir = os.path.join(output_path, dataset)
    if cfg.preprocess.use_phone_pitch:
        pitch_dir = cfg.preprocess.phone_pitch_dir
    else:
        pitch_dir = cfg.preprocess.pitch_dir
    save_dir = os.path.join(dataset_dir, pitch_dir)

    os.makedirs(save_dir, exist_ok=True)
    statistics_path = os.path.join(save_dir, "statistics.json")
    if has_existed(statistics_path):
        return

    # load singers and ids
    with open(os.path.join(dataset_dir, "singers.json"), "r") as f:
        singers = json.load(f)

    # combine train and test metadata
    metadata = []
    for dataset_type in ["train", "test"] if "eval" not in dataset else ["test"]:
        dataset_file = os.path.join(dataset_dir, "{}.json".format(dataset_type))
        with open(dataset_file, "r") as f:
            metadata.extend(json.load(f))

    # one list of values per singer id
    pitch_scalers = [[] for _ in range(len(singers))]
    total_pitch_scalers = [[] for _ in range(len(singers))]

    for utt_info in metadata:
        singer = utt_info["Singer"]
        pitch_path = os.path.join(dataset_dir, pitch_dir, utt_info["Uid"] + ".npy")
        # utterances without an extracted pitch file are skipped
        if not os.path.exists(pitch_path):
            continue
        # total_pitch contains all pitch including unvoiced frames
        total_pitch = np.load(pitch_path)
        assert len(total_pitch) > 0
        # Without outlier removal the unfiltered values are used, so that
        # ``pitch`` is always bound to this utterance's data.
        if cfg.preprocess.pitch_remove_outlier:
            pitch = remove_outlier(total_pitch)
        else:
            pitch = total_pitch
        spkid = singers[f"{replace_augment_name(dataset)}_{singer}"]

        # update pitch scalers
        pitch_scalers[spkid].extend(pitch.tolist())
        # update total pitch scalers
        total_pitch_scalers[spkid].extend(total_pitch.tolist())

    # save pitch statistics for each singer in dict
    sta_dict = {}
    for singer in singers:
        spkid = singers[singer]
        sta_dict[singer] = {
            "voiced_positions": summarize(pitch_scalers[spkid]),
            "total_positions": summarize(total_pitch_scalers[spkid]),
        }

    # save statistics
    with open(statistics_path, "w") as f:
        json.dump(sta_dict, f, indent=4, ensure_ascii=False)
def cal_energy_statistics(dataset, output_path, cfg):
    """Compute per-singer energy statistics and save them as statistics.json.

    Phone-level energy is used when ``cfg.preprocess.use_phone_energy`` is set,
    frame-level energy otherwise. For every singer, mean/std/median/min/max
    are computed twice: under "voiced_positions" over the values returned by
    ``remove_outlier`` (or over all values when
    ``cfg.preprocess.energy_remove_outlier`` is off), and under
    "total_positions" over all values. Nothing is done when ``has_existed``
    reports that the statistics file is already there.

    Args:
        dataset (str): dataset name (sub-directory of ``output_path``)
        output_path (str): root directory of the processed datasets
        cfg (dict): dictionary that stores configurations
    """

    def summarize(values):
        # Same keys and order as stored in statistics.json.
        return {
            "mean": np.mean(values),
            "std": np.std(values),
            "median": np.median(values),
            "min": np.min(values),
            "max": np.max(values),
        }

    # path of dataset
    dataset_dir = os.path.join(output_path, dataset)
    if cfg.preprocess.use_phone_energy:
        energy_dir = cfg.preprocess.phone_energy_dir
    else:
        energy_dir = cfg.preprocess.energy_dir
    save_dir = os.path.join(dataset_dir, energy_dir)
    os.makedirs(save_dir, exist_ok=True)
    statistics_path = os.path.join(save_dir, "statistics.json")
    print(statistics_path)
    if has_existed(statistics_path):
        return

    # load singers and ids
    with open(os.path.join(dataset_dir, "singers.json"), "r") as f:
        singers = json.load(f)

    # combine train and test metadata
    metadata = []
    for dataset_type in ["train", "test"] if "eval" not in dataset else ["test"]:
        dataset_file = os.path.join(dataset_dir, "{}.json".format(dataset_type))
        with open(dataset_file, "r") as f:
            metadata.extend(json.load(f))

    # one list of values per singer id
    energy_scalers = [[] for _ in range(len(singers))]
    total_energy_scalers = [[] for _ in range(len(singers))]

    for utt_info in metadata:
        singer = utt_info["Singer"]
        energy_path = os.path.join(dataset_dir, energy_dir, utt_info["Uid"] + ".npy")
        # utterances without an extracted energy file are skipped
        if not os.path.exists(energy_path):
            continue
        # total_energy contains the energy of all frames
        total_energy = np.load(energy_path)
        assert len(total_energy) > 0
        # Without outlier removal the unfiltered values are used, so that
        # ``energy`` is always bound to this utterance's data.
        if cfg.preprocess.energy_remove_outlier:
            energy = remove_outlier(total_energy)
        else:
            energy = total_energy
        spkid = singers[f"{replace_augment_name(dataset)}_{singer}"]

        # update energy scalers
        energy_scalers[spkid].extend(energy.tolist())
        # update total energy scalers
        total_energy_scalers[spkid].extend(total_energy.tolist())

    # save energy statistics for each singer in dict
    sta_dict = {}
    for singer in singers:
        spkid = singers[singer]
        sta_dict[singer] = {
            "voiced_positions": summarize(energy_scalers[spkid]),
            "total_positions": summarize(total_energy_scalers[spkid]),
        }

    # save statistics
    with open(statistics_path, "w") as f:
        json.dump(sta_dict, f, indent=4, ensure_ascii=False)
def copy_acoustic_features(metadata, dataset_dir, src_dataset_dir, cfg):
    """Link acoustic features from src_dataset_dir into dataset_dir.

    For every feature whose ``cfg.preprocess.extract_*`` switch is on, and
    whose target directory ``has_existed`` does not report as present, the
    directory is created and one symbolic link per utterance is made to the
    corresponding file in ``src_dataset_dir``. Audio files are ``.wav`` for the
    "tts" task and ``.npy`` otherwise; all other features are ``.npy``.

    Args:
        metadata (iterable): utterance entries, as stored in the train.json and
            test.json files
        dataset_dir (str): directory that receives the links
        src_dataset_dir (str): directory holding the existing acoustic features
        cfg (dict): dictionary that stores configurations
    """

    def npy_name(uid):
        return uid + ".npy"

    def audio_name(uid):
        if cfg.task_type == "tts":
            return uid + ".wav"
        return uid + ".npy"

    # (enable switch, directory option, progress message, file-name builder)
    feature_specs = (
        ("extract_mel", "mel_dir", "Copying mel features from {} to {}...", npy_name),
        (
            "extract_energy",
            "energy_dir",
            "Copying energy features from {} to {}...",
            npy_name,
        ),
        (
            "extract_pitch",
            "pitch_dir",
            "Copying pitch features from {} to {}...",
            npy_name,
        ),
        ("extract_uv", "uv_dir", "Copying uv features from {} to {}...", npy_name),
        (
            "extract_audio",
            "audio_dir",
            "Copying audio features from {} to {}...",
            audio_name,
        ),
        (
            "extract_label",
            "label_dir",
            "Copying label features from {} to {}...",
            npy_name,
        ),
    )

    for switch, dir_option, message, file_name_for in feature_specs:
        if not getattr(cfg.preprocess, switch):
            continue

        feature_dir = getattr(cfg.preprocess, dir_option)
        target_dir = os.path.join(dataset_dir, feature_dir)
        if has_existed(target_dir):
            continue

        os.makedirs(target_dir, exist_ok=True)
        print(message.format(src_dataset_dir, dataset_dir))

        for utt_info in tqdm(metadata):
            file_name = file_name_for(utt_info["Uid"])
            src_path = os.path.join(src_dataset_dir, feature_dir, file_name)
            dst_path = os.path.join(dataset_dir, feature_dir, file_name)
            # create soft-links
            if not os.path.exists(dst_path):
                os.symlink(src_path, dst_path)
def align_duration_mel(dataset, output_path, cfg):
    """Make each utterance's phone durations sum to its mel length.

    For every utterance in the dataset's metadata (train.json and test.json,
    or only test.json if the dataset name contains "eval"), the saved duration
    array is compared with the number of mel frames. When they differ, the
    durations are adjusted and the duration file is overwritten.

    Args:
        dataset (str): dataset name (sub-directory of ``output_path``)
        output_path (str): root directory of the processed datasets
        cfg (dict): dictionary that stores configurations

    Returns:
        dict: maps "{Dataset}_{Uid}" to the adjusted duration array, for the
            utterances whose durations had to be changed.

    Raises:
        AssertionError: if durations and mel length differ by more than 5.
    """
    print("align the duration and mel")

    dataset_dir = os.path.join(output_path, dataset)
    metadata = []
    for dataset_type in ["train", "test"] if "eval" not in dataset else ["test"]:
        dataset_file = os.path.join(dataset_dir, "{}.json".format(dataset_type))
        with open(dataset_file, "r") as f:
            metadata.extend(json.load(f))

    utt2dur = {}
    for index in tqdm(range(len(metadata))):
        utt_info = metadata[index]
        # NOTE: rebinds the ``dataset`` parameter; ``dataset_dir`` was already
        # computed from the original value above.
        dataset = utt_info["Dataset"]
        uid = utt_info["Uid"]
        utt = "{}_{}".format(dataset, uid)

        mel_path = os.path.join(dataset_dir, cfg.preprocess.mel_dir, uid + ".npy")
        # After the transpose, axis 0 is compared with the summed durations
        # (assumes the stored mel is (n_mels, T) - TODO confirm).
        mel = np.load(mel_path).transpose(1, 0)
        duration_path = os.path.join(
            dataset_dir, cfg.preprocess.duration_dir, uid + ".npy"
        )
        duration = np.load(duration_path)
        if sum(duration) != mel.shape[0]:
            duration_sum = sum(duration)
            mel_len = mel.shape[0]
            mismatch = abs(duration_sum - mel_len)
            assert mismatch <= 5, "duration and mel length mismatch!"
            cloned = np.array(duration, copy=True)
            if duration_sum > mel_len:
                # Too many duration frames: take the surplus away from the
                # trailing phones, walking backwards from the last one. The
                # range never reaches the first two entries.
                for j in range(1, len(duration) - 1):
                    if mismatch == 0:
                        break
                    dur_val = cloned[-j]
                    if dur_val >= mismatch:
                        # This phone can absorb everything that is left.
                        cloned[-j] -= mismatch
                        mismatch -= dur_val
                        break
                    else:
                        # Phone is shorter than the surplus: zero it, carry on.
                        cloned[-j] = 0
                        mismatch -= dur_val

            elif duration_sum < mel_len:
                # Too few duration frames: extend the last phone.
                cloned[-1] += mismatch
            duration = cloned
            utt2dur[utt] = duration
            np.save(duration_path, duration)

    return utt2dur