|
from multiprocessing.pool import Pool |
|
from synthesizer import audio |
|
from functools import partial |
|
from itertools import chain |
|
from encoder import inference as encoder |
|
from pathlib import Path |
|
from utils import logmmse |
|
from tqdm import tqdm |
|
import numpy as np |
|
import librosa |
|
|
|
|
|
def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int, |
|
skip_existing: bool, hparams, no_alignments: bool, |
|
datasets_name: str, subfolders: str): |
|
|
|
dataset_root = datasets_root.joinpath(datasets_name) |
|
input_dirs = [dataset_root.joinpath(subfolder.strip()) for subfolder in subfolders.split(",")] |
|
print("\n ".join(map(str, ["Using data from:"] + input_dirs))) |
|
assert all(input_dir.exists() for input_dir in input_dirs) |
|
|
|
|
|
out_dir.joinpath("mels").mkdir(exist_ok=True) |
|
out_dir.joinpath("audio").mkdir(exist_ok=True) |
|
|
|
|
|
metadata_fpath = out_dir.joinpath("train.txt") |
|
metadata_file = metadata_fpath.open("a" if skip_existing else "w", encoding="cp949") |
|
|
|
|
|
speaker_dirs = list(chain.from_iterable(input_dir.glob("*") for input_dir in input_dirs)) |
|
func = partial(preprocess_speaker, out_dir=out_dir, skip_existing=skip_existing, |
|
hparams=hparams, no_alignments=no_alignments) |
|
job = Pool(n_processes).imap(func, speaker_dirs) |
|
for speaker_metadata in tqdm(job, datasets_name, len(speaker_dirs), unit="speakers"): |
|
for metadatum in speaker_metadata: |
|
metadata_file.write("|".join(str(x) for x in metadatum) + "\n") |
|
metadata_file.close() |
|
|
|
|
|
with metadata_fpath.open("r", encoding="cp949") as metadata_file: |
|
metadata = [line.split("|") for line in metadata_file] |
|
mel_frames = sum([int(m[4]) for m in metadata]) |
|
timesteps = sum([int(m[3]) for m in metadata]) |
|
sample_rate = hparams.sample_rate |
|
hours = (timesteps / sample_rate) / 3600 |
|
print("The dataset consists of %d utterances, %d mel frames, %d audio timesteps (%.2f hours)." % |
|
(len(metadata), mel_frames, timesteps, hours)) |
|
print("Max input length (text chars): %d" % max(len(m[5]) for m in metadata)) |
|
print("Max mel frames length: %d" % max(int(m[4]) for m in metadata)) |
|
print("Max audio timesteps length: %d" % max(int(m[3]) for m in metadata)) |
|
|
|
|
|
def preprocess_speaker(speaker_dir, out_dir: Path, skip_existing: bool, hparams, no_alignments: bool): |
|
metadata = [] |
|
for book_dir in speaker_dir.glob("*"): |
|
if no_alignments: |
|
|
|
|
|
extensions = ["*.wav", "*.flac", "*.mp3"] |
|
for extension in extensions: |
|
wav_fpaths = book_dir.glob(extension) |
|
|
|
for wav_fpath in wav_fpaths: |
|
|
|
wav, _ = librosa.load(str(wav_fpath), hparams.sample_rate) |
|
if hparams.rescale: |
|
wav = wav / np.abs(wav).max() * hparams.rescaling_max |
|
|
|
|
|
|
|
text_fpath = wav_fpath.with_suffix(".txt") |
|
if not text_fpath.exists(): |
|
|
|
text_fpath = wav_fpath.with_suffix(".normalized.txt") |
|
assert text_fpath.exists() |
|
with text_fpath.open("r") as text_file: |
|
text = "".join([line for line in text_file]) |
|
text = text.replace("\"", "") |
|
text = text.strip() |
|
|
|
|
|
metadata.append(process_utterance(wav, text, out_dir, str(wav_fpath.with_suffix("").name), |
|
skip_existing, hparams)) |
|
else: |
|
|
|
|
|
try: |
|
alignments_fpath = next(book_dir.glob("*.alignment.txt")) |
|
with alignments_fpath.open("r") as alignments_file: |
|
alignments = [line.rstrip().split(" ") for line in alignments_file] |
|
except StopIteration: |
|
|
|
continue |
|
|
|
|
|
for wav_fname, words, end_times in alignments: |
|
wav_fpath = book_dir.joinpath(wav_fname + ".flac") |
|
assert wav_fpath.exists() |
|
words = words.replace("\"", "").split(",") |
|
end_times = list(map(float, end_times.replace("\"", "").split(","))) |
|
|
|
|
|
wavs, texts = split_on_silences(wav_fpath, words, end_times, hparams) |
|
for i, (wav, text) in enumerate(zip(wavs, texts)): |
|
sub_basename = "%s_%02d" % (wav_fname, i) |
|
metadata.append(process_utterance(wav, text, out_dir, sub_basename, |
|
skip_existing, hparams)) |
|
|
|
return [m for m in metadata if m is not None] |
|
|
|
|
|
def split_on_silences(wav_fpath, words, end_times, hparams): |
|
|
|
wav, _ = librosa.load(str(wav_fpath), hparams.sample_rate) |
|
if hparams.rescale: |
|
wav = wav / np.abs(wav).max() * hparams.rescaling_max |
|
|
|
words = np.array(words) |
|
start_times = np.array([0.0] + end_times[:-1]) |
|
end_times = np.array(end_times) |
|
assert len(words) == len(end_times) == len(start_times) |
|
assert words[0] == "" and words[-1] == "" |
|
|
|
|
|
mask = (words == "") & (end_times - start_times >= hparams.silence_min_duration_split) |
|
mask[0] = mask[-1] = True |
|
breaks = np.where(mask)[0] |
|
|
|
|
|
silence_times = [[start_times[i], end_times[i]] for i in breaks] |
|
silence_times = (np.array(silence_times) * hparams.sample_rate).astype(np.int) |
|
noisy_wav = np.concatenate([wav[stime[0]:stime[1]] for stime in silence_times]) |
|
if len(noisy_wav) > hparams.sample_rate * 0.02: |
|
profile = logmmse.profile_noise(noisy_wav, hparams.sample_rate) |
|
wav = logmmse.denoise(wav, profile, eta=0) |
|
|
|
|
|
segments = list(zip(breaks[:-1], breaks[1:])) |
|
segment_durations = [start_times[end] - end_times[start] for start, end in segments] |
|
i = 0 |
|
while i < len(segments) and len(segments) > 1: |
|
if segment_durations[i] < hparams.utterance_min_duration: |
|
|
|
left_duration = float("inf") if i == 0 else segment_durations[i - 1] |
|
right_duration = float("inf") if i == len(segments) - 1 else segment_durations[i + 1] |
|
joined_duration = segment_durations[i] + min(left_duration, right_duration) |
|
|
|
|
|
if joined_duration > hparams.hop_size * hparams.max_mel_frames / hparams.sample_rate: |
|
i += 1 |
|
continue |
|
|
|
|
|
j = i - 1 if left_duration <= right_duration else i |
|
segments[j] = (segments[j][0], segments[j + 1][1]) |
|
segment_durations[j] = joined_duration |
|
del segments[j + 1], segment_durations[j + 1] |
|
else: |
|
i += 1 |
|
|
|
|
|
segment_times = [[end_times[start], start_times[end]] for start, end in segments] |
|
segment_times = (np.array(segment_times) * hparams.sample_rate).astype(np.int) |
|
wavs = [wav[segment_time[0]:segment_time[1]] for segment_time in segment_times] |
|
texts = [" ".join(words[start + 1:end]).replace(" ", " ") for start, end in segments] |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
return wavs, texts |
|
|
|
|
|
def process_utterance(wav: np.ndarray, text: str, out_dir: Path, basename: str, |
|
skip_existing: bool, hparams): |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
mel_fpath = out_dir.joinpath("mels", "mel-%s.npy" % basename) |
|
wav_fpath = out_dir.joinpath("audio", "audio-%s.npy" % basename) |
|
if skip_existing and mel_fpath.exists() and wav_fpath.exists(): |
|
return None |
|
|
|
|
|
if hparams.trim_silence: |
|
wav = encoder.preprocess_wav(wav, normalize=False, trim_silence=True) |
|
|
|
|
|
if len(wav) < hparams.utterance_min_duration * hparams.sample_rate: |
|
return None |
|
|
|
|
|
mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32) |
|
mel_frames = mel_spectrogram.shape[1] |
|
|
|
|
|
if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length: |
|
return None |
|
|
|
|
|
np.save(mel_fpath, mel_spectrogram.T, allow_pickle=False) |
|
np.save(wav_fpath, wav, allow_pickle=False) |
|
|
|
|
|
return wav_fpath.name, mel_fpath.name, "embed-%s.npy" % basename, len(wav), mel_frames, text |
|
|
|
|
|
def embed_utterance(fpaths, encoder_model_fpath): |
|
if not encoder.is_loaded(): |
|
encoder.load_model(encoder_model_fpath) |
|
|
|
|
|
wav_fpath, embed_fpath = fpaths |
|
wav = np.load(wav_fpath) |
|
wav = encoder.preprocess_wav(wav) |
|
embed = encoder.embed_utterance(wav) |
|
np.save(embed_fpath, embed, allow_pickle=False) |
|
|
|
|
|
def create_embeddings(synthesizer_root: Path, encoder_model_fpath: Path, n_processes: int): |
|
wav_dir = synthesizer_root.joinpath("audio") |
|
metadata_fpath = synthesizer_root.joinpath("train.txt") |
|
assert wav_dir.exists() and metadata_fpath.exists() |
|
embed_dir = synthesizer_root.joinpath("embeds") |
|
embed_dir.mkdir(exist_ok=True) |
|
|
|
|
|
with metadata_fpath.open("r",encoding='cp949') as metadata_file: |
|
metadata = [line.split("|") for line in metadata_file] |
|
fpaths = [(wav_dir.joinpath(m[0]), embed_dir.joinpath(m[2])) for m in metadata] |
|
|
|
|
|
|
|
func = partial(embed_utterance, encoder_model_fpath=encoder_model_fpath) |
|
job = Pool(n_processes).imap(func, fpaths) |
|
list(tqdm(job, "Embedding", len(fpaths), unit="utterances")) |
|
|
|
|