|
from multiprocessing.pool import Pool |
|
from synthesizer import audio |
|
from functools import partial |
|
from itertools import chain |
|
from encoder import inference as encoder |
|
from pathlib import Path |
|
from utils import logmmse |
|
from tqdm import tqdm |
|
import numpy as np |
|
import librosa |
|
import os |
|
|
|
|
|
|
|
def preprocess_KSponSpeech(datasets_root: Path, out_dir: Path, n_processes: int,
                           skip_existing: bool, hparams):
    """Preprocess the KSponSpeech dataset and write its training metadata.

    Every entry found directly under the five ``KsponSpeech_0N`` directories is
    handed to ``preprocess_speaker`` in a pool of worker processes. The metadata
    tuples returned by the workers are written, one ``|``-separated line per
    utterance, to ``<out_dir>/train.txt``; a short summary of that file is then
    printed.

    Args:
        datasets_root: Directory that contains the ``KSponSpeech`` directory.
        out_dir: Output directory; ``mels`` and ``audio`` sub-directories and
            ``train.txt`` are created inside it.
        n_processes: Number of worker processes.
        skip_existing: When true, ``train.txt`` is opened in append mode and the
            flag is forwarded to ``preprocess_speaker``.
        hparams: Hyper-parameter object forwarded to the workers;
            ``hparams.sample_rate`` is read here for the duration summary.

    Raises:
        AssertionError: If one of the expected input directories is missing.
    """
    # Gather the input directories (KsponSpeech_01 ... KsponSpeech_05).
    dataset_root = datasets_root.joinpath("KSponSpeech")
    input_dirs = [dataset_root.joinpath("KsponSpeech_%02d" % i) for i in range(1, 6)]
    print("\n ".join(map(str, ["Using data from:"] + input_dirs)))
    assert all(input_dir.exists() for input_dir in input_dirs)

    # Create the output directories for each output file type.
    out_dir.joinpath("mels").mkdir(parents=True, exist_ok=True)
    out_dir.joinpath("audio").mkdir(parents=True, exist_ok=True)

    metadata_fpath = out_dir.joinpath("train.txt")
    speaker_dirs = list(chain.from_iterable(input_dir.glob("*") for input_dir in input_dirs))
    func = partial(preprocess_speaker, out_dir=out_dir, skip_existing=skip_existing,
                   hparams=hparams)

    # Both the metadata file and the pool are context-managed so that they are
    # released even when a worker raises.
    with metadata_fpath.open("a" if skip_existing else "w", encoding="cp949") as metadata_file:
        with Pool(n_processes) as pool:
            job = pool.imap(func, speaker_dirs)
            for speaker_metadata in tqdm(job, "KSponSpeech", len(speaker_dirs),
                                         unit="speakers"):
                for metadatum in speaker_metadata:
                    metadata_file.write("|".join(str(x) for x in metadatum) + "\n")

    # Verify the contents of the metadata file.
    with metadata_fpath.open("r", encoding="cp949") as metadata_file:
        # Strip the newline so it is not counted as a text character, and cap the
        # split at six fields so a "|" inside the transcript stays in the text.
        metadata = [line.rstrip("\n").split("|", 5) for line in metadata_file
                    if line.strip()]
    mel_frames = sum(int(m[4]) for m in metadata)
    timesteps = sum(int(m[3]) for m in metadata)
    sample_rate = hparams.sample_rate
    hours = (timesteps / sample_rate) / 3600
    print("The dataset consists of %d utterances, %d mel frames, %d audio timesteps (%.2f hours)." %
          (len(metadata), mel_frames, timesteps, hours))
    # default=0 keeps the summary from crashing when nothing was written.
    print("Max input length (text chars): %d" % max((len(m[5]) for m in metadata), default=0))
    print("Max mel frames length: %d" % max((int(m[4]) for m in metadata), default=0))
    print("Max audio timesteps length: %d" % max((int(m[3]) for m in metadata), default=0))
|
|
|
|
|
def preprocess_speaker(speaker_dir, out_dir: Path, skip_existing: bool, hparams):
    """Preprocess the utterances listed in a speaker directory's alignment file(s).

    Each non-blank line of a ``*alignment.txt`` file (read as cp949) must hold
    exactly two space-separated fields: the utterance file name (without the
    ``.pcm`` extension) and its transcript.

    Args:
        speaker_dir: ``Path`` of the directory holding the alignment file(s) and
            the ``.pcm`` recordings.
        out_dir: Output directory forwarded to ``process_utterance``.
        skip_existing: Forwarded to ``process_utterance``.
        hparams: Hyper-parameter object forwarded to ``normalization`` and
            ``process_utterance``.

    Returns:
        The list of non-``None`` results of ``process_utterance``. The list is
        empty when the directory contains no alignment file.

    Raises:
        AssertionError: If a listed utterance has no ``.pcm`` file.
        ValueError: If an alignment line does not have exactly two fields.
    """
    # Transcripts containing any of these markers are excluded from the dataset.
    check_list = [",01", ",02", ",03", ",04", ",05", ",06", ",07", ",08", ",09"]

    # Collect the entries of every alignment file; a directory without one
    # simply yields no utterances.
    alignments = []
    for entry in sorted(os.listdir(speaker_dir)):
        if not entry.endswith("alignment.txt"):
            continue
        with open(os.path.join(speaker_dir, entry), "r", encoding='cp949') as alignments_file:
            alignments.extend(line.rstrip().split(" ") for line in alignments_file
                              if line.strip())

    metadata = []
    for wav_fname, words in alignments:
        wav_fpath = speaker_dir.joinpath(wav_fname + ".pcm")
        assert wav_fpath.exists(), wav_fpath

        # Skip filtered transcripts before touching the audio: an explicit test
        # replaces the former `words is not "pass"` identity comparison and
        # avoids loading recordings that would be discarded anyway.
        if any(check in words for check in check_list):
            print(words)
            continue

        wavs = normalization(wav_fpath, hparams)
        if wavs is None:
            continue
        metadata.append(process_utterance(wavs, words, out_dir, wav_fname,
                                          skip_existing, hparams))

    return [m for m in metadata if m is not None]
|
|
|
|
|
def normalization(wav_fpath, hparams):
    """Load a headerless 16-bit PCM file and optionally rescale its peak.

    Args:
        wav_fpath: Path of the ``.pcm`` file; it is memory-mapped as native
            16-bit signed integers.
        hparams: Object providing ``rescale`` (bool) and ``rescaling_max``.

    Returns:
        ``None`` (after printing the path) when the file is empty or cannot be
        mapped as 16-bit samples. Otherwise the int16 memory map when
        ``hparams.rescale`` is false, or a float64 array scaled so that its
        absolute peak equals ``hparams.rescaling_max``. All-zero audio is
        returned as zeros rather than NaNs.
    """
    try:
        wav = np.memmap(wav_fpath, dtype='h', mode='r')
    except (EOFError, ValueError):
        # np.memmap reports empty or odd-sized files with ValueError.
        print(wav_fpath)
        return None

    if wav.size == 0:
        print(wav_fpath)
        return None

    if not hparams.rescale:
        return wav

    # Convert before taking the absolute value: abs(-32768) overflows in int16.
    samples = wav.astype(np.float64)
    peak = np.abs(samples).max()
    if peak > 0:
        samples = samples / peak * hparams.rescaling_max
    return samples
|
|
|
|
|
def process_utterance(wav: np.ndarray, text: str, out_dir: Path, basename: str,
                      skip_existing: bool, hparams):
    """Compute and store the mel spectrogram and waveform of one utterance.

    Args:
        wav: Audio samples of the utterance.
        text: Transcript of the utterance; returned unchanged.
        out_dir: Directory containing the ``mels`` and ``audio`` sub-directories.
        basename: Identifier used to build the output file names.
        skip_existing: When true, nothing is done if both output files exist.
        hparams: Object providing ``utterance_min_duration``, ``sample_rate``,
            ``max_mel_frames`` and ``clip_mels_length`` (also passed on to
            ``audio.melspectrogram``).

    Returns:
        ``None`` when the utterance is skipped (already processed, too short, or
        too long while ``clip_mels_length`` is set). Otherwise the tuple
        ``(audio file name, mel file name, embed file name, number of samples,
        number of mel frames, text)``.
    """
    mel_fpath = out_dir.joinpath("mels", "mel-%s.npy" % basename)
    wav_fpath = out_dir.joinpath("audio", "audio-%s.npy" % basename)

    # Nothing to do when both outputs were produced by an earlier run.
    already_done = skip_existing and mel_fpath.exists() and wav_fpath.exists()
    if already_done:
        return None

    # Drop utterances shorter than the configured minimum duration.
    n_samples = len(wav)
    min_samples = hparams.utterance_min_duration * hparams.sample_rate
    if n_samples < min_samples:
        return None

    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]

    # Drop utterances whose spectrogram exceeds the frame limit, if clipping is on.
    too_long = mel_frames > hparams.max_mel_frames
    if too_long and hparams.clip_mels_length:
        return None

    # The spectrogram is stored transposed; the waveform is stored as given.
    np.save(mel_fpath, mel_spectrogram.T, allow_pickle=False)
    np.save(wav_fpath, wav, allow_pickle=False)

    record = (wav_fpath.name, mel_fpath.name, "embed-%s.npy" % basename,
              n_samples, mel_frames, text)
    return record
|
|
|
|
|
def embed_utterance(fpaths, encoder_model_fpath):
    """Compute the encoder embedding of one saved waveform and store it.

    Args:
        fpaths: Pair ``(wav_fpath, embed_fpath)``: the ``.npy`` waveform to read
            and the ``.npy`` file the embedding is written to.
        encoder_model_fpath: Path of the encoder model, loaded only if the
            encoder is not loaded yet in the current process.
    """
    # Load the model lazily, before anything else is done.
    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath)

    source_fpath, target_fpath = fpaths
    samples = encoder.preprocess_wav(np.load(source_fpath))
    embedding = encoder.embed_utterance(samples)
    np.save(target_fpath, embedding, allow_pickle=False)
|
|
|
|
|
def create_embeddings(synthesizer_root: Path, encoder_model_fpath: Path, n_processes: int):
    """Create a speaker embedding for every utterance listed in ``train.txt``.

    Args:
        synthesizer_root: Directory containing the ``audio`` directory and the
            ``train.txt`` metadata file; an ``embeds`` directory is created in it.
        encoder_model_fpath: Path of the encoder model, forwarded to
            ``embed_utterance``.
        n_processes: Number of worker processes.

    Raises:
        AssertionError: If the ``audio`` directory or ``train.txt`` is missing.
    """
    wav_dir = synthesizer_root.joinpath("audio")
    metadata_fpath = synthesizer_root.joinpath("train.txt")
    assert wav_dir.exists() and metadata_fpath.exists()
    embed_dir = synthesizer_root.joinpath("embeds")
    embed_dir.mkdir(exist_ok=True)

    # Gather the input wave file path and the target embedding path of each
    # utterance. The metadata is written as cp949 by preprocess_KSponSpeech, so
    # it is read back the same way instead of with the platform default. Only
    # the file-name fields are used, hence undecodable transcript bytes are
    # replaced rather than treated as fatal.
    with metadata_fpath.open("r", encoding="cp949", errors="replace") as metadata_file:
        metadata = [line.split("|") for line in metadata_file if line.strip()]
    fpaths = [(wav_dir.joinpath(m[0]), embed_dir.joinpath(m[2])) for m in metadata]

    # Embed the utterances in separate processes; the pool is released once all
    # results have been consumed.
    func = partial(embed_utterance, encoder_model_fpath=encoder_model_fpath)
    with Pool(n_processes) as pool:
        job = pool.imap(func, fpaths)
        list(tqdm(job, "Embedding", len(fpaths), unit="utterances"))
|
|