from speaker_encoder import inference as encoder
from multiprocessing.pool import Pool
from functools import partial
from pathlib import Path
from tqdm import tqdm
import numpy as np
def embed_utterance(fpaths, encoder_model_fpath):
    # Load the encoder model lazily so that each worker process loads its own copy
    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath)

    # Compute the speaker embedding of the utterance
    wav_fpath, embed_fpath = fpaths
    wav = np.load(wav_fpath)
    wav = encoder.preprocess_wav(wav)
    embed = encoder.embed_utterance(wav)
    np.save(embed_fpath, embed, allow_pickle=False)
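# A minimal single-utterance sketch (the paths below are assumed placeholders,
# not files shipped with this script): embed one preprocessed wav without the pool.
#
#   embed_utterance((Path("audio/audio-0001.npy"), Path("embeds/embed-0001.npy")),
#                   encoder_model_fpath=Path("encoder/saved_models/pretrained.pt"))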
def create_embeddings(synthesizer_root: Path, encoder_model_fpath: Path, n_processes: int):
    wav_dir = synthesizer_root.joinpath("audio")
    metadata_fpath = synthesizer_root.joinpath("train.txt")
    assert wav_dir.exists() and metadata_fpath.exists()
    embed_dir = synthesizer_root.joinpath("embeds")
    embed_dir.mkdir(exist_ok=True)

    # Gather the input wave filepath and the target output embed filepath
    with metadata_fpath.open("r") as metadata_file:
        metadata = [line.split("|") for line in metadata_file]
        fpaths = [(wav_dir.joinpath(m[0]), embed_dir.joinpath(m[2])) for m in metadata]

    # TODO: improve on the multiprocessing, it's terrible. Disk I/O is the bottleneck here.
    # Embed the utterances in separate processes
    func = partial(embed_utterance, encoder_model_fpath=encoder_model_fpath)
    job = Pool(n_processes).imap(func, fpaths)
    list(tqdm(job, "Embedding", len(fpaths), unit="utterances"))
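# Minimal command-line entry point: a sketch of how create_embeddings might be
# invoked, assuming the directory layout above (audio/ and train.txt under one
# synthesizer root; embeds/ is created there). The default checkpoint path is a
# placeholder assumption, not a file guaranteed to exist in this repository.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(
        description="Embed the synthesizer utterances with the speaker encoder")
    parser.add_argument("synthesizer_root", type=Path,
                        help="Directory containing audio/ and train.txt")
    parser.add_argument("-e", "--encoder_model_fpath", type=Path,
                        default=Path("encoder/saved_models/pretrained.pt"),
                        help="Trained speaker encoder checkpoint (placeholder default)")
    parser.add_argument("-n", "--n_processes", type=int, default=4,
                        help="Number of worker processes for the embedding pool")
    args = parser.parse_args()
    create_embeddings(args.synthesizer_root, args.encoder_model_fpath, args.n_processes)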