import math
from typing import Dict, List, Tuple, Union

import torch
from coqpit import Coqpit
from torch import nn
from torch.cuda.amp.autocast_mode import autocast
from torch.nn import functional as F

from TTS.tts.configs.glow_tts_config import GlowTTSConfig
from TTS.tts.layers.glow_tts.decoder import Decoder
from TTS.tts.layers.glow_tts.encoder import Encoder
from TTS.tts.models.base_tts import BaseTTS
from TTS.tts.utils.helpers import generate_path, maximum_path, sequence_mask
from TTS.tts.utils.speakers import SpeakerManager
from TTS.tts.utils.synthesis import synthesis
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.tts.utils.visual import plot_alignment, plot_spectrogram
from TTS.utils.io import load_fsspec


class GlowTTS(BaseTTS):
    """GlowTTS model.

    Paper::
        https://arxiv.org/abs/2005.11129

    Paper abstract::
        Recently, text-to-speech (TTS) models such as FastSpeech and ParaNet have been proposed to generate
        mel-spectrograms from text in parallel. Despite the advantage, the parallel TTS models cannot be trained
        without guidance from autoregressive TTS models as their external aligners. In this work, we propose Glow-TTS,
        a flow-based generative model for parallel TTS that does not require any external aligner. By combining the
        properties of flows and dynamic programming, the proposed model searches for the most probable monotonic
        alignment between text and the latent representation of speech on its own. We demonstrate that enforcing hard
        monotonic alignments enables robust TTS, which generalizes to long utterances, and employing generative flows
        enables fast, diverse, and controllable speech synthesis. Glow-TTS obtains an order-of-magnitude speed-up over
        the autoregressive model, Tacotron 2, at synthesis with comparable speech quality. We further show that our
        model can be easily extended to a multi-speaker setting.

    Check :class:`TTS.tts.configs.glow_tts_config.GlowTTSConfig` for class arguments.

    Examples:
        Init only model layers.

        >>> from TTS.tts.configs.glow_tts_config import GlowTTSConfig
        >>> from TTS.tts.models.glow_tts import GlowTTS
        >>> config = GlowTTSConfig(num_chars=2)
        >>> model = GlowTTS(config)

        Fully init a model ready for action. All the class attributes and class members
        (e.g. Tokenizer, AudioProcessor, etc.) are initialized internally based on config values.

        >>> from TTS.tts.configs.glow_tts_config import GlowTTSConfig
        >>> from TTS.tts.models.glow_tts import GlowTTS
        >>> config = GlowTTSConfig()
        >>> model = GlowTTS.init_from_config(config, verbose=False)
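
        Run a forward pass with dummy inputs (a minimal shape-checking sketch, not a training
        recipe; the tensor sizes below are arbitrary and the target spectrogram simply uses
        `config.out_channels` mel channels).

        >>> import torch
        >>> config = GlowTTSConfig(num_chars=2)
        >>> model = GlowTTS(config)
        >>> x = torch.randint(0, 2, (1, 10))
        >>> x_lengths = torch.tensor([10])
        >>> y = torch.randn(1, 12, config.out_channels)
        >>> y_lengths = torch.tensor([12])
        >>> outputs = model.forward(x, x_lengths, y, y_lengths)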
| """ | |

    def __init__(
        self,
        config: GlowTTSConfig,
        ap: "AudioProcessor" = None,
        tokenizer: "TTSTokenizer" = None,
        speaker_manager: SpeakerManager = None,
    ):
        super().__init__(config, ap, tokenizer, speaker_manager)

        # pass all config fields to `self`
        # for fewer code changes
        self.config = config
        for key in config:
            setattr(self, key, config[key])

        self.decoder_output_dim = config.out_channels

        # init multi-speaker layers if necessary
        self.init_multispeaker(config)

        self.run_data_dep_init = config.data_dep_init_steps > 0
        self.encoder = Encoder(
            self.num_chars,
            out_channels=self.out_channels,
            hidden_channels=self.hidden_channels_enc,
            hidden_channels_dp=self.hidden_channels_dp,
            encoder_type=self.encoder_type,
            encoder_params=self.encoder_params,
            mean_only=self.mean_only,
            use_prenet=self.use_encoder_prenet,
            dropout_p_dp=self.dropout_p_dp,
            c_in_channels=self.c_in_channels,
        )

        self.decoder = Decoder(
            self.out_channels,
            self.hidden_channels_dec,
            self.kernel_size_dec,
            self.dilation_rate,
            self.num_flow_blocks_dec,
            self.num_block_layers,
            dropout_p=self.dropout_p_dec,
            num_splits=self.num_splits,
            num_squeeze=self.num_squeeze,
            sigmoid_scale=self.sigmoid_scale,
            c_in_channels=self.c_in_channels,
        )

    def init_multispeaker(self, config: Coqpit):
        """Init the speaker embedding layer if `use_speaker_embedding` is True and set the expected speaker
        embedding vector dimension to the encoder layer channel size. If the model uses d-vectors, then it
        only sets the speaker embedding vector dimension to the d-vector dimension from the config.

        Args:
            config (Coqpit): Model configuration.
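
        Example (a minimal sketch; assumes `num_speakers` is given directly in the config
        instead of being derived from a `SpeakerManager`):

            >>> config = GlowTTSConfig(num_chars=2, use_speaker_embedding=True, num_speakers=4)
            >>> model = GlowTTS(config)  # `init_multispeaker` builds `emb_g` with 4 entries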
| """ | |
| self.embedded_speaker_dim = 0 | |
| # set number of speakers - if num_speakers is set in config, use it, otherwise use speaker_manager | |
| if self.speaker_manager is not None: | |
| self.num_speakers = self.speaker_manager.num_speakers | |
| # set ultimate speaker embedding size | |
| if config.use_d_vector_file: | |
| self.embedded_speaker_dim = ( | |
| config.d_vector_dim if "d_vector_dim" in config and config.d_vector_dim is not None else 512 | |
| ) | |
| if self.speaker_manager is not None: | |
| assert ( | |
| config.d_vector_dim == self.speaker_manager.embedding_dim | |
| ), " [!] d-vector dimension mismatch b/w config and speaker manager." | |
| # init speaker embedding layer | |
| if config.use_speaker_embedding and not config.use_d_vector_file: | |
| print(" > Init speaker_embedding layer.") | |
| self.embedded_speaker_dim = self.hidden_channels_enc | |
| self.emb_g = nn.Embedding(self.num_speakers, self.hidden_channels_enc) | |
| nn.init.uniform_(self.emb_g.weight, -0.1, 0.1) | |
| # set conditioning dimensions | |
| self.c_in_channels = self.embedded_speaker_dim | |

    @staticmethod
    def compute_outputs(attn, o_mean, o_log_scale, x_mask):
        """Compute and format the model outputs with the given alignment map."""
        y_mean = torch.matmul(attn.squeeze(1).transpose(1, 2), o_mean.transpose(1, 2)).transpose(
            1, 2
        )  # [b, t', t], [b, t, d] -> [b, d, t']
        y_log_scale = torch.matmul(attn.squeeze(1).transpose(1, 2), o_log_scale.transpose(1, 2)).transpose(
            1, 2
        )  # [b, t', t], [b, t, d] -> [b, d, t']
        # compute total duration with adjustment
        o_attn_dur = torch.log(1 + torch.sum(attn, -1)) * x_mask
        return y_mean, y_log_scale, o_attn_dur

    def unlock_act_norm_layers(self):
        """Unlock activation normalization layers for data-dependent initialization."""
        for f in self.decoder.flows:
            if getattr(f, "set_ddi", False):
                f.set_ddi(True)

    def lock_act_norm_layers(self):
        """Lock activation normalization layers."""
        for f in self.decoder.flows:
            if getattr(f, "set_ddi", False):
                f.set_ddi(False)

    def _set_speaker_input(self, aux_input: Dict):
        if aux_input is None:
            d_vectors = None
            speaker_ids = None
        else:
            d_vectors = aux_input.get("d_vectors", None)
            speaker_ids = aux_input.get("speaker_ids", None)

        if d_vectors is not None and speaker_ids is not None:
            raise ValueError("[!] Cannot use d-vectors and speaker-ids together.")

        if speaker_ids is not None and not hasattr(self, "emb_g"):
            raise ValueError("[!] Cannot use speaker-ids without enabling speaker embedding.")

        g = speaker_ids if speaker_ids is not None else d_vectors
        return g

    def _speaker_embedding(self, aux_input: Dict) -> Union[torch.Tensor, None]:
        g = self._set_speaker_input(aux_input)
        # speaker embedding
        if g is not None:
            if hasattr(self, "emb_g"):
                # use speaker embedding layer
                if not g.size():  # if is a scalar
                    g = g.unsqueeze(0)  # unsqueeze
                g = F.normalize(self.emb_g(g)).unsqueeze(-1)  # [b, h, 1]
            else:
                # use d-vector
                g = F.normalize(g).unsqueeze(-1)  # [b, h, 1]
        return g

    def forward(
        self, x, x_lengths, y, y_lengths=None, aux_input={"d_vectors": None, "speaker_ids": None}
    ):  # pylint: disable=dangerous-default-value
        """
        Args:
            x (torch.Tensor):
                Input text sequence ids. :math:`[B, T_en]`

            x_lengths (torch.Tensor):
                Lengths of input text sequences. :math:`[B]`

            y (torch.Tensor):
                Target mel-spectrogram frames. :math:`[B, T_de, C_mel]`

            y_lengths (torch.Tensor):
                Lengths of target mel-spectrogram frames. :math:`[B]`

            aux_input (Dict):
                Auxiliary inputs. `d_vectors` is speaker embedding vectors for a multi-speaker model.
                :math:`[B, D_vec]`. `speaker_ids` is speaker ids for a multi-speaker model using a
                speaker-embedding layer. :math:`B`

        Returns:
            Dict:
                - z: :math:`[B, T_de, C]`
                - logdet: :math:`B`
                - y_mean: :math:`[B, T_de, C]`
                - y_log_scale: :math:`[B, T_de, C]`
                - alignments: :math:`[B, T_de, T_en]`
                - durations_log: :math:`[B, T_en, 1]`
                - total_durations_log: :math:`[B, T_en, 1]`
        """
        # [B, T, C] -> [B, C, T]
        y = y.transpose(1, 2)
        y_max_length = y.size(2)
        # norm speaker embeddings
        g = self._speaker_embedding(aux_input)
        # embedding pass
        o_mean, o_log_scale, o_dur_log, x_mask = self.encoder(x, x_lengths, g=g)
        # drop residual frames wrt num_squeeze and set y_lengths.
        y, y_lengths, y_max_length, attn = self.preprocess(y, y_lengths, y_max_length, None)
        # create masks
        y_mask = torch.unsqueeze(sequence_mask(y_lengths, y_max_length), 1).to(x_mask.dtype)
        # [B, 1, T_en, T_de]
        attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(y_mask, 2)
        # decoder pass
        z, logdet = self.decoder(y, y_mask, g=g, reverse=False)
        # find the alignment path
        with torch.no_grad():
            o_scale = torch.exp(-2 * o_log_scale)
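            # Per (token, frame) Gaussian log-likelihood of the decoder latents under the
            # encoder distribution, log N(z | o_mean, exp(o_log_scale)), summed over channels
            # and split into four terms so the frame axis can be handled with batched matmuls:
            #   logp = -0.5 * log(2 * pi) - o_log_scale              (logp1)
            #          - 0.5 * z**2 * exp(-2 * o_log_scale)          (logp2)
            #          + z * o_mean * exp(-2 * o_log_scale)          (logp3)
            #          - 0.5 * o_mean**2 * exp(-2 * o_log_scale)     (logp4)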
            logp1 = torch.sum(-0.5 * math.log(2 * math.pi) - o_log_scale, [1]).unsqueeze(-1)  # [b, t, 1]
            logp2 = torch.matmul(o_scale.transpose(1, 2), -0.5 * (z**2))  # [b, t, d] x [b, d, t'] = [b, t, t']
            logp3 = torch.matmul((o_mean * o_scale).transpose(1, 2), z)  # [b, t, d] x [b, d, t'] = [b, t, t']
            logp4 = torch.sum(-0.5 * (o_mean**2) * o_scale, [1]).unsqueeze(-1)  # [b, t, 1]
            logp = logp1 + logp2 + logp3 + logp4  # [b, t, t']
            attn = maximum_path(logp, attn_mask.squeeze(1)).unsqueeze(1).detach()
        y_mean, y_log_scale, o_attn_dur = self.compute_outputs(attn, o_mean, o_log_scale, x_mask)
        attn = attn.squeeze(1).permute(0, 2, 1)
        outputs = {
            "z": z.transpose(1, 2),
            "logdet": logdet,
            "y_mean": y_mean.transpose(1, 2),
            "y_log_scale": y_log_scale.transpose(1, 2),
            "alignments": attn,
            "durations_log": o_dur_log.transpose(1, 2),
            "total_durations_log": o_attn_dur.transpose(1, 2),
        }
        return outputs

    def inference_with_MAS(
        self, x, x_lengths, y=None, y_lengths=None, aux_input={"d_vectors": None, "speaker_ids": None}
    ):  # pylint: disable=dangerous-default-value
        """Run the model in inference mode, but use durations computed by MAS over the given
        ground-truth spectrogram. It's similar to the teacher forcing in Tacotron.
        It was proposed in: https://arxiv.org/abs/2104.05557

        Shapes:
            - x: :math:`[B, T]`
            - x_lengths: :math:`B`
            - y: :math:`[B, T, C]`
            - y_lengths: :math:`B`
            - g: :math:`[B, C]` or :math:`B`
        """
        y = y.transpose(1, 2)
        y_max_length = y.size(2)
        # norm speaker embeddings
        g = self._speaker_embedding(aux_input)
        # embedding pass
        o_mean, o_log_scale, o_dur_log, x_mask = self.encoder(x, x_lengths, g=g)
        # drop residual frames wrt num_squeeze and set y_lengths.
        y, y_lengths, y_max_length, attn = self.preprocess(y, y_lengths, y_max_length, None)
        # create masks
        y_mask = torch.unsqueeze(sequence_mask(y_lengths, y_max_length), 1).to(x_mask.dtype)
        attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(y_mask, 2)
        # decoder pass
        z, logdet = self.decoder(y, y_mask, g=g, reverse=False)
        # find the alignment path between z and encoder output
        o_scale = torch.exp(-2 * o_log_scale)
        logp1 = torch.sum(-0.5 * math.log(2 * math.pi) - o_log_scale, [1]).unsqueeze(-1)  # [b, t, 1]
        logp2 = torch.matmul(o_scale.transpose(1, 2), -0.5 * (z**2))  # [b, t, d] x [b, d, t'] = [b, t, t']
        logp3 = torch.matmul((o_mean * o_scale).transpose(1, 2), z)  # [b, t, d] x [b, d, t'] = [b, t, t']
        logp4 = torch.sum(-0.5 * (o_mean**2) * o_scale, [1]).unsqueeze(-1)  # [b, t, 1]
        logp = logp1 + logp2 + logp3 + logp4  # [b, t, t']
        attn = maximum_path(logp, attn_mask.squeeze(1)).unsqueeze(1).detach()
        y_mean, y_log_scale, o_attn_dur = self.compute_outputs(attn, o_mean, o_log_scale, x_mask)
        attn = attn.squeeze(1).permute(0, 2, 1)

        # get the predicted aligned distribution
        z = y_mean * y_mask

        # reverse the decoder and predict using the aligned distribution
        y, logdet = self.decoder(z, y_mask, g=g, reverse=True)
        outputs = {
            "model_outputs": z.transpose(1, 2),
            "logdet": logdet,
            "y_mean": y_mean.transpose(1, 2),
            "y_log_scale": y_log_scale.transpose(1, 2),
            "alignments": attn,
            "durations_log": o_dur_log.transpose(1, 2),
            "total_durations_log": o_attn_dur.transpose(1, 2),
        }
        return outputs

    def decoder_inference(
        self, y, y_lengths=None, aux_input={"d_vectors": None, "speaker_ids": None}
    ):  # pylint: disable=dangerous-default-value
        """Run the given spectrogram through the decoder flow and then invert it back, which is
        useful for checking the decoder's invertibility.

        Shapes:
            - y: :math:`[B, T, C]`
            - y_lengths: :math:`B`
            - g: :math:`[B, C]` or :math:`B`
        """
        y = y.transpose(1, 2)
        y_max_length = y.size(2)
        g = self._speaker_embedding(aux_input)
        y_mask = torch.unsqueeze(sequence_mask(y_lengths, y_max_length), 1).to(y.dtype)
        # decoder pass
        z, logdet = self.decoder(y, y_mask, g=g, reverse=False)
        # reverse decoder and predict
        y, logdet = self.decoder(z, y_mask, g=g, reverse=True)
        outputs = {}
        outputs["model_outputs"] = y.transpose(1, 2)
        outputs["logdet"] = logdet
        return outputs

    def inference(
        self, x, aux_input={"x_lengths": None, "d_vectors": None, "speaker_ids": None}
    ):  # pylint: disable=dangerous-default-value
        x_lengths = aux_input["x_lengths"]
        g = self._speaker_embedding(aux_input)
        # embedding pass
        o_mean, o_log_scale, o_dur_log, x_mask = self.encoder(x, x_lengths, g=g)
        # compute output durations
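        # `o_dur_log` holds the predicted per-token durations in the log domain;
        # `length_scale` stretches them (> 1 slows the speech down, < 1 speeds it up).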
        w = (torch.exp(o_dur_log) - 1) * x_mask * self.length_scale
        w_ceil = torch.clamp_min(torch.ceil(w), 1)
        y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long()
        y_max_length = None
        # compute masks
        y_mask = torch.unsqueeze(sequence_mask(y_lengths, y_max_length), 1).to(x_mask.dtype)
        attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(y_mask, 2)
        # compute attention mask
        attn = generate_path(w_ceil.squeeze(1), attn_mask.squeeze(1)).unsqueeze(1)
        y_mean, y_log_scale, o_attn_dur = self.compute_outputs(attn, o_mean, o_log_scale, x_mask)
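        # sample latents from the aligned prior; `inference_noise_scale` scales the standard
        # deviation (0 always returns the mean, larger values add more variability)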
        z = (y_mean + torch.exp(y_log_scale) * torch.randn_like(y_mean) * self.inference_noise_scale) * y_mask
        # decoder pass
        y, logdet = self.decoder(z, y_mask, g=g, reverse=True)
        attn = attn.squeeze(1).permute(0, 2, 1)
        outputs = {
            "model_outputs": y.transpose(1, 2),
            "logdet": logdet,
            "y_mean": y_mean.transpose(1, 2),
            "y_log_scale": y_log_scale.transpose(1, 2),
            "alignments": attn,
            "durations_log": o_dur_log.transpose(1, 2),
            "total_durations_log": o_attn_dur.transpose(1, 2),
        }
        return outputs

    def train_step(self, batch: dict, criterion: nn.Module):
        """A single training step. Forward pass and loss computation. Run data-dependent initialization for the
        first `config.data_dep_init_steps` steps.

        Args:
            batch (dict): A minibatch of training inputs produced by the data loader.
            criterion (nn.Module): The loss module used to compute the Glow-TTS losses.
        """
        text_input = batch["text_input"]
        text_lengths = batch["text_lengths"]
        mel_input = batch["mel_input"]
        mel_lengths = batch["mel_lengths"]
        d_vectors = batch["d_vectors"]
        speaker_ids = batch["speaker_ids"]

        if self.run_data_dep_init and self.training:
            # compute data-dependent initialization of activation norm layers
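            # ActNorm layers estimate their scale and bias from the statistics of the first
            # batches, so the forward pass runs without gradients and no loss is returned.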
            self.unlock_act_norm_layers()
            with torch.no_grad():
                _ = self.forward(
                    text_input,
                    text_lengths,
                    mel_input,
                    mel_lengths,
                    aux_input={"d_vectors": d_vectors, "speaker_ids": speaker_ids},
                )
            outputs = None
            loss_dict = None
            self.lock_act_norm_layers()
        else:
            # normal training step
            outputs = self.forward(
                text_input,
                text_lengths,
                mel_input,
                mel_lengths,
                aux_input={"d_vectors": d_vectors, "speaker_ids": speaker_ids},
            )

            with autocast(enabled=False):  # avoid mixed_precision in criterion
                loss_dict = criterion(
                    outputs["z"].float(),
                    outputs["y_mean"].float(),
                    outputs["y_log_scale"].float(),
                    outputs["logdet"].float(),
                    mel_lengths,
                    outputs["durations_log"].float(),
                    outputs["total_durations_log"].float(),
                    text_lengths,
                )
        return outputs, loss_dict

    def _create_logs(self, batch, outputs, ap):
        alignments = outputs["alignments"]
        text_input = batch["text_input"][:1] if batch["text_input"] is not None else None
        text_lengths = batch["text_lengths"]
        mel_input = batch["mel_input"]
        d_vectors = batch["d_vectors"][:1] if batch["d_vectors"] is not None else None
        speaker_ids = batch["speaker_ids"][:1] if batch["speaker_ids"] is not None else None

        # model runs reverse flow to predict spectrograms
        pred_outputs = self.inference(
            text_input,
            aux_input={"x_lengths": text_lengths[:1], "d_vectors": d_vectors, "speaker_ids": speaker_ids},
        )
        model_outputs = pred_outputs["model_outputs"]

        pred_spec = model_outputs[0].data.cpu().numpy()
        gt_spec = mel_input[0].data.cpu().numpy()
        align_img = alignments[0].data.cpu().numpy()

        figures = {
            "prediction": plot_spectrogram(pred_spec, ap, output_fig=False),
            "ground_truth": plot_spectrogram(gt_spec, ap, output_fig=False),
            "alignment": plot_alignment(align_img, output_fig=False),
        }

        # Sample audio
        train_audio = ap.inv_melspectrogram(pred_spec.T)
        return figures, {"audio": train_audio}

    def train_log(
        self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int
    ) -> None:  # pylint: disable=no-self-use
        figures, audios = self._create_logs(batch, outputs, self.ap)
        logger.train_figures(steps, figures)
        logger.train_audios(steps, audios, self.ap.sample_rate)

    def eval_step(self, batch: dict, criterion: nn.Module):
        return self.train_step(batch, criterion)

    def eval_log(self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int) -> None:
        figures, audios = self._create_logs(batch, outputs, self.ap)
        logger.eval_figures(steps, figures)
        logger.eval_audios(steps, audios, self.ap.sample_rate)

    def test_run(self, assets: Dict) -> Tuple[Dict, Dict]:
        """Generic test run for `tts` models used by `Trainer`.

        You can override this for a different behaviour.

        Returns:
            Tuple[Dict, Dict]: Test figures and audios to be projected to Tensorboard.
        """
        print(" | > Synthesizing test sentences.")
        test_audios = {}
        test_figures = {}
        test_sentences = self.config.test_sentences
        aux_inputs = self._get_test_aux_input()
        if len(test_sentences) == 0:
            print(" | [!] No test sentences provided.")
        else:
            for idx, sen in enumerate(test_sentences):
                outputs = synthesis(
                    self,
                    sen,
                    self.config,
                    "cuda" in str(next(self.parameters()).device),
                    speaker_id=aux_inputs["speaker_id"],
                    d_vector=aux_inputs["d_vector"],
                    style_wav=aux_inputs["style_wav"],
                    use_griffin_lim=True,
                    do_trim_silence=False,
                )

                test_audios["{}-audio".format(idx)] = outputs["wav"]
                test_figures["{}-prediction".format(idx)] = plot_spectrogram(
                    outputs["outputs"]["model_outputs"], self.ap, output_fig=False
                )
                test_figures["{}-alignment".format(idx)] = plot_alignment(outputs["alignments"], output_fig=False)
        return test_figures, test_audios

    def preprocess(self, y, y_lengths, y_max_length, attn=None):
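        # The flow decoder squeezes `num_squeeze` consecutive frames into the channel dimension,
        # so spectrogram lengths are trimmed down to the nearest multiple of `num_squeeze`.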
        if y_max_length is not None:
            y_max_length = (y_max_length // self.num_squeeze) * self.num_squeeze
            y = y[:, :, :y_max_length]
            if attn is not None:
                attn = attn[:, :, :, :y_max_length]
        y_lengths = torch.div(y_lengths, self.num_squeeze, rounding_mode="floor") * self.num_squeeze
        return y, y_lengths, y_max_length, attn

    def store_inverse(self):
        self.decoder.store_inverse()

    def load_checkpoint(
        self, config, checkpoint_path, eval=False
    ):  # pylint: disable=unused-argument, redefined-builtin
        state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"))
        self.load_state_dict(state["model"])
        if eval:
            self.eval()
            self.store_inverse()
            assert not self.training

    @staticmethod
    def get_criterion():
        from TTS.tts.layers.losses import GlowTTSLoss  # pylint: disable=import-outside-toplevel

        return GlowTTSLoss()

    def on_train_step_start(self, trainer):
        """Decide on every training step whether to enable or disable data-dependent initialization."""
        self.run_data_dep_init = trainer.total_steps_done < self.data_dep_init_steps

    @staticmethod
    def init_from_config(config: "GlowTTSConfig", samples: Union[List[List], List[Dict]] = None, verbose=True):
        """Initiate model from config.

        Args:
            config (GlowTTSConfig): Model config.
            samples (Union[List[List], List[Dict]]): Training samples to parse speaker ids for training.
                Defaults to None.
            verbose (bool): If True, print init messages. Defaults to True.
        """
        from TTS.utils.audio import AudioProcessor

        ap = AudioProcessor.init_from_config(config, verbose)
        tokenizer, new_config = TTSTokenizer.init_from_config(config)
        speaker_manager = SpeakerManager.init_from_config(config, samples)
        return GlowTTS(new_config, ap, tokenizer, speaker_manager)