Voice-Clone2

Running

App Files Files Community

Voice-Clone2 / TTS /tts /models /glow_tts.py

Shadhil

voice-clone with single audio sample input

9b2107c 12 months ago

raw

history blame

24.3 kB

	import math
	from typing import Dict, List, Tuple, Union

	import torch
	from coqpit import Coqpit
	from torch import nn
	from torch.cuda.amp.autocast_mode import autocast
	from torch.nn import functional as F

	from TTS.tts.configs.glow_tts_config import GlowTTSConfig
	from TTS.tts.layers.glow_tts.decoder import Decoder
	from TTS.tts.layers.glow_tts.encoder import Encoder
	from TTS.tts.models.base_tts import BaseTTS
	from TTS.tts.utils.helpers import generate_path, maximum_path, sequence_mask
	from TTS.tts.utils.speakers import SpeakerManager
	from TTS.tts.utils.synthesis import synthesis
	from TTS.tts.utils.text.tokenizer import TTSTokenizer
	from TTS.tts.utils.visual import plot_alignment, plot_spectrogram
	from TTS.utils.io import load_fsspec


	class GlowTTS(BaseTTS):
	"""GlowTTS model.

	Paper::
	https://arxiv.org/abs/2005.11129

	Paper abstract::
	Recently, text-to-speech (TTS) models such as FastSpeech and ParaNet have been proposed to generate
	mel-spectrograms from text in parallel. Despite the advantage, the parallel TTS models cannot be trained
	without guidance from autoregressive TTS models as their external aligners. In this work, we propose Glow-TTS,
	a flow-based generative model for parallel TTS that does not require any external aligner. By combining the
	properties of flows and dynamic programming, the proposed model searches for the most probable monotonic
	alignment between text and the latent representation of speech on its own. We demonstrate that enforcing hard
	monotonic alignments enables robust TTS, which generalizes to long utterances, and employing generative flows
	enables fast, diverse, and controllable speech synthesis. Glow-TTS obtains an order-of-magnitude speed-up over
	the autoregressive model, Tacotron 2, at synthesis with comparable speech quality. We further show that our
	model can be easily extended to a multi-speaker setting.

	Check :class:`TTS.tts.configs.glow_tts_config.GlowTTSConfig` for class arguments.

	Examples:
	Init only model layers.

	>>> from TTS.tts.configs.glow_tts_config import GlowTTSConfig
	>>> from TTS.tts.models.glow_tts import GlowTTS
	>>> config = GlowTTSConfig(num_chars=2)
	>>> model = GlowTTS(config)

	Fully init a model ready for action. All the class attributes and class members
	(e.g Tokenizer, AudioProcessor, etc.). are initialized internally based on config values.

	>>> from TTS.tts.configs.glow_tts_config import GlowTTSConfig
	>>> from TTS.tts.models.glow_tts import GlowTTS
	>>> config = GlowTTSConfig()
	>>> model = GlowTTS.init_from_config(config, verbose=False)
	"""

	def __init__(
	self,
	config: GlowTTSConfig,
	ap: "AudioProcessor" = None,
	tokenizer: "TTSTokenizer" = None,
	speaker_manager: SpeakerManager = None,
	):
	super().__init__(config, ap, tokenizer, speaker_manager)

	# pass all config fields to `self`
	# for fewer code change
	self.config = config
	for key in config:
	setattr(self, key, config[key])

	self.decoder_output_dim = config.out_channels

	# init multi-speaker layers if necessary
	self.init_multispeaker(config)

	self.run_data_dep_init = config.data_dep_init_steps > 0
	self.encoder = Encoder(
	self.num_chars,
	out_channels=self.out_channels,
	hidden_channels=self.hidden_channels_enc,
	hidden_channels_dp=self.hidden_channels_dp,
	encoder_type=self.encoder_type,
	encoder_params=self.encoder_params,
	mean_only=self.mean_only,
	use_prenet=self.use_encoder_prenet,
	dropout_p_dp=self.dropout_p_dp,
	c_in_channels=self.c_in_channels,
	)

	self.decoder = Decoder(
	self.out_channels,
	self.hidden_channels_dec,
	self.kernel_size_dec,
	self.dilation_rate,
	self.num_flow_blocks_dec,
	self.num_block_layers,
	dropout_p=self.dropout_p_dec,
	num_splits=self.num_splits,
	num_squeeze=self.num_squeeze,
	sigmoid_scale=self.sigmoid_scale,
	c_in_channels=self.c_in_channels,
	)

	def init_multispeaker(self, config: Coqpit):
	"""Init speaker embedding layer if `use_speaker_embedding` is True and set the expected speaker embedding
	vector dimension to the encoder layer channel size. If model uses d-vectors, then it only sets
	speaker embedding vector dimension to the d-vector dimension from the config.

	Args:
	config (Coqpit): Model configuration.
	"""
	self.embedded_speaker_dim = 0
	# set number of speakers - if num_speakers is set in config, use it, otherwise use speaker_manager
	if self.speaker_manager is not None:
	self.num_speakers = self.speaker_manager.num_speakers
	# set ultimate speaker embedding size
	if config.use_d_vector_file:
	self.embedded_speaker_dim = (
	config.d_vector_dim if "d_vector_dim" in config and config.d_vector_dim is not None else 512
	)
	if self.speaker_manager is not None:
	assert (
	config.d_vector_dim == self.speaker_manager.embedding_dim
	), " [!] d-vector dimension mismatch b/w config and speaker manager."
	# init speaker embedding layer
	if config.use_speaker_embedding and not config.use_d_vector_file:
	print(" > Init speaker_embedding layer.")
	self.embedded_speaker_dim = self.hidden_channels_enc
	self.emb_g = nn.Embedding(self.num_speakers, self.hidden_channels_enc)
	nn.init.uniform_(self.emb_g.weight, -0.1, 0.1)
	# set conditioning dimensions
	self.c_in_channels = self.embedded_speaker_dim

	@staticmethod
	def compute_outputs(attn, o_mean, o_log_scale, x_mask):
	"""Compute and format the mode outputs with the given alignment map"""
	y_mean = torch.matmul(attn.squeeze(1).transpose(1, 2), o_mean.transpose(1, 2)).transpose(
	1, 2
	) # [b, t', t], [b, t, d] -> [b, d, t']
	y_log_scale = torch.matmul(attn.squeeze(1).transpose(1, 2), o_log_scale.transpose(1, 2)).transpose(
	1, 2
	) # [b, t', t], [b, t, d] -> [b, d, t']
	# compute total duration with adjustment
	o_attn_dur = torch.log(1 + torch.sum(attn, -1)) * x_mask
	return y_mean, y_log_scale, o_attn_dur

	def unlock_act_norm_layers(self):
	"""Unlock activation normalization layers for data depended initalization."""
	for f in self.decoder.flows:
	if getattr(f, "set_ddi", False):
	f.set_ddi(True)

	def lock_act_norm_layers(self):
	"""Lock activation normalization layers."""
	for f in self.decoder.flows:
	if getattr(f, "set_ddi", False):
	f.set_ddi(False)

	def _set_speaker_input(self, aux_input: Dict):
	if aux_input is None:
	d_vectors = None
	speaker_ids = None
	else:
	d_vectors = aux_input.get("d_vectors", None)
	speaker_ids = aux_input.get("speaker_ids", None)

	if d_vectors is not None and speaker_ids is not None:
	raise ValueError("[!] Cannot use d-vectors and speaker-ids together.")

	if speaker_ids is not None and not hasattr(self, "emb_g"):
	raise ValueError("[!] Cannot use speaker-ids without enabling speaker embedding.")

	g = speaker_ids if speaker_ids is not None else d_vectors
	return g

	def _speaker_embedding(self, aux_input: Dict) -> Union[torch.tensor, None]:
	g = self._set_speaker_input(aux_input)
	# speaker embedding
	if g is not None:
	if hasattr(self, "emb_g"):
	# use speaker embedding layer
	if not g.size(): # if is a scalar
	g = g.unsqueeze(0) # unsqueeze
	g = F.normalize(self.emb_g(g)).unsqueeze(-1) # [b, h, 1]
	else:
	# use d-vector
	g = F.normalize(g).unsqueeze(-1) # [b, h, 1]
	return g

	def forward(
	self, x, x_lengths, y, y_lengths=None, aux_input={"d_vectors": None, "speaker_ids": None}
	): # pylint: disable=dangerous-default-value
	"""
	Args:
	x (torch.Tensor):
	Input text sequence ids. :math:`[B, T_en]`

	x_lengths (torch.Tensor):
	Lengths of input text sequences. :math:`[B]`

	y (torch.Tensor):
	Target mel-spectrogram frames. :math:`[B, T_de, C_mel]`

	y_lengths (torch.Tensor):
	Lengths of target mel-spectrogram frames. :math:`[B]`

	aux_input (Dict):
	Auxiliary inputs. `d_vectors` is speaker embedding vectors for a multi-speaker model.
	:math:`[B, D_vec]`. `speaker_ids` is speaker ids for a multi-speaker model usind speaker-embedding
	layer. :math:`B`

	Returns:
	Dict:
	- z: :math: `[B, T_de, C]`
	- logdet: :math:`B`
	- y_mean: :math:`[B, T_de, C]`
	- y_log_scale: :math:`[B, T_de, C]`
	- alignments: :math:`[B, T_en, T_de]`
	- durations_log: :math:`[B, T_en, 1]`
	- total_durations_log: :math:`[B, T_en, 1]`
	"""
	# [B, T, C] -> [B, C, T]
	y = y.transpose(1, 2)
	y_max_length = y.size(2)
	# norm speaker embeddings
	g = self._speaker_embedding(aux_input)
	# embedding pass
	o_mean, o_log_scale, o_dur_log, x_mask = self.encoder(x, x_lengths, g=g)
	# drop redisual frames wrt num_squeeze and set y_lengths.
	y, y_lengths, y_max_length, attn = self.preprocess(y, y_lengths, y_max_length, None)
	# create masks
	y_mask = torch.unsqueeze(sequence_mask(y_lengths, y_max_length), 1).to(x_mask.dtype)
	# [B, 1, T_en, T_de]
	attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(y_mask, 2)
	# decoder pass
	z, logdet = self.decoder(y, y_mask, g=g, reverse=False)
	# find the alignment path
	with torch.no_grad():
	o_scale = torch.exp(-2 * o_log_scale)
	logp1 = torch.sum(-0.5 * math.log(2 * math.pi) - o_log_scale, [1]).unsqueeze(-1) # [b, t, 1]
	logp2 = torch.matmul(o_scale.transpose(1, 2), -0.5 * (z**2)) # [b, t, d] x [b, d, t'] = [b, t, t']
	logp3 = torch.matmul((o_mean * o_scale).transpose(1, 2), z) # [b, t, d] x [b, d, t'] = [b, t, t']
	logp4 = torch.sum(-0.5 * (o_mean*2) o_scale, [1]).unsqueeze(-1) # [b, t, 1]
	logp = logp1 + logp2 + logp3 + logp4 # [b, t, t']
	attn = maximum_path(logp, attn_mask.squeeze(1)).unsqueeze(1).detach()
	y_mean, y_log_scale, o_attn_dur = self.compute_outputs(attn, o_mean, o_log_scale, x_mask)
	attn = attn.squeeze(1).permute(0, 2, 1)
	outputs = {
	"z": z.transpose(1, 2),
	"logdet": logdet,
	"y_mean": y_mean.transpose(1, 2),
	"y_log_scale": y_log_scale.transpose(1, 2),
	"alignments": attn,
	"durations_log": o_dur_log.transpose(1, 2),
	"total_durations_log": o_attn_dur.transpose(1, 2),
	}
	return outputs

	@torch.no_grad()
	def inference_with_MAS(
	self, x, x_lengths, y=None, y_lengths=None, aux_input={"d_vectors": None, "speaker_ids": None}
	): # pylint: disable=dangerous-default-value
	"""
	It's similar to the teacher forcing in Tacotron.
	It was proposed in: https://arxiv.org/abs/2104.05557

	Shapes:
	- x: :math:`[B, T]`
	- x_lenghts: :math:`B`
	- y: :math:`[B, T, C]`
	- y_lengths: :math:`B`
	- g: :math:`[B, C] or B`
	"""
	y = y.transpose(1, 2)
	y_max_length = y.size(2)
	# norm speaker embeddings
	g = self._speaker_embedding(aux_input)
	# embedding pass
	o_mean, o_log_scale, o_dur_log, x_mask = self.encoder(x, x_lengths, g=g)
	# drop redisual frames wrt num_squeeze and set y_lengths.
	y, y_lengths, y_max_length, attn = self.preprocess(y, y_lengths, y_max_length, None)
	# create masks
	y_mask = torch.unsqueeze(sequence_mask(y_lengths, y_max_length), 1).to(x_mask.dtype)
	attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(y_mask, 2)
	# decoder pass
	z, logdet = self.decoder(y, y_mask, g=g, reverse=False)
	# find the alignment path between z and encoder output
	o_scale = torch.exp(-2 * o_log_scale)
	logp1 = torch.sum(-0.5 * math.log(2 * math.pi) - o_log_scale, [1]).unsqueeze(-1) # [b, t, 1]
	logp2 = torch.matmul(o_scale.transpose(1, 2), -0.5 * (z**2)) # [b, t, d] x [b, d, t'] = [b, t, t']
	logp3 = torch.matmul((o_mean * o_scale).transpose(1, 2), z) # [b, t, d] x [b, d, t'] = [b, t, t']
	logp4 = torch.sum(-0.5 * (o_mean*2) o_scale, [1]).unsqueeze(-1) # [b, t, 1]
	logp = logp1 + logp2 + logp3 + logp4 # [b, t, t']
	attn = maximum_path(logp, attn_mask.squeeze(1)).unsqueeze(1).detach()

	y_mean, y_log_scale, o_attn_dur = self.compute_outputs(attn, o_mean, o_log_scale, x_mask)
	attn = attn.squeeze(1).permute(0, 2, 1)

	# get predited aligned distribution
	z = y_mean * y_mask

	# reverse the decoder and predict using the aligned distribution
	y, logdet = self.decoder(z, y_mask, g=g, reverse=True)
	outputs = {
	"model_outputs": z.transpose(1, 2),
	"logdet": logdet,
	"y_mean": y_mean.transpose(1, 2),
	"y_log_scale": y_log_scale.transpose(1, 2),
	"alignments": attn,
	"durations_log": o_dur_log.transpose(1, 2),
	"total_durations_log": o_attn_dur.transpose(1, 2),
	}
	return outputs

	@torch.no_grad()
	def decoder_inference(
	self, y, y_lengths=None, aux_input={"d_vectors": None, "speaker_ids": None}
	): # pylint: disable=dangerous-default-value
	"""
	Shapes:
	- y: :math:`[B, T, C]`
	- y_lengths: :math:`B`
	- g: :math:`[B, C] or B`
	"""
	y = y.transpose(1, 2)
	y_max_length = y.size(2)
	g = self._speaker_embedding(aux_input)
	y_mask = torch.unsqueeze(sequence_mask(y_lengths, y_max_length), 1).to(y.dtype)
	# decoder pass
	z, logdet = self.decoder(y, y_mask, g=g, reverse=False)
	# reverse decoder and predict
	y, logdet = self.decoder(z, y_mask, g=g, reverse=True)
	outputs = {}
	outputs["model_outputs"] = y.transpose(1, 2)
	outputs["logdet"] = logdet
	return outputs

	@torch.no_grad()
	def inference(
	self, x, aux_input={"x_lengths": None, "d_vectors": None, "speaker_ids": None}
	): # pylint: disable=dangerous-default-value
	x_lengths = aux_input["x_lengths"]
	g = self._speaker_embedding(aux_input)
	# embedding pass
	o_mean, o_log_scale, o_dur_log, x_mask = self.encoder(x, x_lengths, g=g)
	# compute output durations
	w = (torch.exp(o_dur_log) - 1) * x_mask * self.length_scale
	w_ceil = torch.clamp_min(torch.ceil(w), 1)
	y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long()
	y_max_length = None
	# compute masks
	y_mask = torch.unsqueeze(sequence_mask(y_lengths, y_max_length), 1).to(x_mask.dtype)
	attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(y_mask, 2)
	# compute attention mask
	attn = generate_path(w_ceil.squeeze(1), attn_mask.squeeze(1)).unsqueeze(1)
	y_mean, y_log_scale, o_attn_dur = self.compute_outputs(attn, o_mean, o_log_scale, x_mask)

	z = (y_mean + torch.exp(y_log_scale) * torch.randn_like(y_mean) * self.inference_noise_scale) * y_mask
	# decoder pass
	y, logdet = self.decoder(z, y_mask, g=g, reverse=True)
	attn = attn.squeeze(1).permute(0, 2, 1)
	outputs = {
	"model_outputs": y.transpose(1, 2),
	"logdet": logdet,
	"y_mean": y_mean.transpose(1, 2),
	"y_log_scale": y_log_scale.transpose(1, 2),
	"alignments": attn,
	"durations_log": o_dur_log.transpose(1, 2),
	"total_durations_log": o_attn_dur.transpose(1, 2),
	}
	return outputs

	def train_step(self, batch: dict, criterion: nn.Module):
	"""A single training step. Forward pass and loss computation. Run data depended initialization for the
	first `config.data_dep_init_steps` steps.

	Args:
	batch (dict): [description]
	criterion (nn.Module): [description]
	"""
	text_input = batch["text_input"]
	text_lengths = batch["text_lengths"]
	mel_input = batch["mel_input"]
	mel_lengths = batch["mel_lengths"]
	d_vectors = batch["d_vectors"]
	speaker_ids = batch["speaker_ids"]

	if self.run_data_dep_init and self.training:
	# compute data-dependent initialization of activation norm layers
	self.unlock_act_norm_layers()
	with torch.no_grad():
	_ = self.forward(
	text_input,
	text_lengths,
	mel_input,
	mel_lengths,
	aux_input={"d_vectors": d_vectors, "speaker_ids": speaker_ids},
	)
	outputs = None
	loss_dict = None
	self.lock_act_norm_layers()
	else:
	# normal training step
	outputs = self.forward(
	text_input,
	text_lengths,
	mel_input,
	mel_lengths,
	aux_input={"d_vectors": d_vectors, "speaker_ids": speaker_ids},
	)

	with autocast(enabled=False): # avoid mixed_precision in criterion
	loss_dict = criterion(
	outputs["z"].float(),
	outputs["y_mean"].float(),
	outputs["y_log_scale"].float(),
	outputs["logdet"].float(),
	mel_lengths,
	outputs["durations_log"].float(),
	outputs["total_durations_log"].float(),
	text_lengths,
	)
	return outputs, loss_dict

	def _create_logs(self, batch, outputs, ap):
	alignments = outputs["alignments"]
	text_input = batch["text_input"][:1] if batch["text_input"] is not None else None
	text_lengths = batch["text_lengths"]
	mel_input = batch["mel_input"]
	d_vectors = batch["d_vectors"][:1] if batch["d_vectors"] is not None else None
	speaker_ids = batch["speaker_ids"][:1] if batch["speaker_ids"] is not None else None

	# model runs reverse flow to predict spectrograms
	pred_outputs = self.inference(
	text_input,
	aux_input={"x_lengths": text_lengths[:1], "d_vectors": d_vectors, "speaker_ids": speaker_ids},
	)
	model_outputs = pred_outputs["model_outputs"]

	pred_spec = model_outputs[0].data.cpu().numpy()
	gt_spec = mel_input[0].data.cpu().numpy()
	align_img = alignments[0].data.cpu().numpy()

	figures = {
	"prediction": plot_spectrogram(pred_spec, ap, output_fig=False),
	"ground_truth": plot_spectrogram(gt_spec, ap, output_fig=False),
	"alignment": plot_alignment(align_img, output_fig=False),
	}

	# Sample audio
	train_audio = ap.inv_melspectrogram(pred_spec.T)
	return figures, {"audio": train_audio}

	def train_log(
	self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int
	) -> None: # pylint: disable=no-self-use
	figures, audios = self._create_logs(batch, outputs, self.ap)
	logger.train_figures(steps, figures)
	logger.train_audios(steps, audios, self.ap.sample_rate)

	@torch.no_grad()
	def eval_step(self, batch: dict, criterion: nn.Module):
	return self.train_step(batch, criterion)

	def eval_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int) -> None:
	figures, audios = self._create_logs(batch, outputs, self.ap)
	logger.eval_figures(steps, figures)
	logger.eval_audios(steps, audios, self.ap.sample_rate)

	@torch.no_grad()
	def test_run(self, assets: Dict) -> Tuple[Dict, Dict]:
	"""Generic test run for `tts` models used by `Trainer`.

	You can override this for a different behaviour.

	Returns:
	Tuple[Dict, Dict]: Test figures and audios to be projected to Tensorboard.
	"""
	print(" \| > Synthesizing test sentences.")
	test_audios = {}
	test_figures = {}
	test_sentences = self.config.test_sentences
	aux_inputs = self._get_test_aux_input()
	if len(test_sentences) == 0:
	print(" \| [!] No test sentences provided.")
	else:
	for idx, sen in enumerate(test_sentences):
	outputs = synthesis(
	self,
	sen,
	self.config,
	"cuda" in str(next(self.parameters()).device),
	speaker_id=aux_inputs["speaker_id"],
	d_vector=aux_inputs["d_vector"],
	style_wav=aux_inputs["style_wav"],
	use_griffin_lim=True,
	do_trim_silence=False,
	)

	test_audios["{}-audio".format(idx)] = outputs["wav"]
	test_figures["{}-prediction".format(idx)] = plot_spectrogram(
	outputs["outputs"]["model_outputs"], self.ap, output_fig=False
	)
	test_figures["{}-alignment".format(idx)] = plot_alignment(outputs["alignments"], output_fig=False)
	return test_figures, test_audios

	def preprocess(self, y, y_lengths, y_max_length, attn=None):
	if y_max_length is not None:
	y_max_length = (y_max_length // self.num_squeeze) * self.num_squeeze
	y = y[:, :, :y_max_length]
	if attn is not None:
	attn = attn[:, :, :, :y_max_length]
	y_lengths = torch.div(y_lengths, self.num_squeeze, rounding_mode="floor") * self.num_squeeze
	return y, y_lengths, y_max_length, attn

	def store_inverse(self):
	self.decoder.store_inverse()

	def load_checkpoint(
	self, config, checkpoint_path, eval=False
	): # pylint: disable=unused-argument, redefined-builtin
	state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"))
	self.load_state_dict(state["model"])
	if eval:
	self.eval()
	self.store_inverse()
	assert not self.training

	@staticmethod
	def get_criterion():
	from TTS.tts.layers.losses import GlowTTSLoss # pylint: disable=import-outside-toplevel

	return GlowTTSLoss()

	def on_train_step_start(self, trainer):
	"""Decide on every training step wheter enable/disable data depended initialization."""
	self.run_data_dep_init = trainer.total_steps_done < self.data_dep_init_steps

	@staticmethod
	def init_from_config(config: "GlowTTSConfig", samples: Union[List[List], List[Dict]] = None, verbose=True):
	"""Initiate model from config

	Args:
	config (VitsConfig): Model config.
	samples (Union[List[List], List[Dict]]): Training samples to parse speaker ids for training.
	Defaults to None.
	verbose (bool): If True, print init messages. Defaults to True.
	"""
	from TTS.utils.audio import AudioProcessor

	ap = AudioProcessor.init_from_config(config, verbose)
	tokenizer, new_config = TTSTokenizer.init_from_config(config)
	speaker_manager = SpeakerManager.init_from_config(config, samples)
	return GlowTTS(new_config, ap, tokenizer, speaker_manager)