Spaces:

Flux9665
/

SpeechCloning

Runtime error

App Files Files

Florian Lux committed on Feb 21, 2022

Commit

2cb106d

1 Parent(s): f9463cb

implement the cloning demo

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitignore +16 -0
InferenceInterfaces/InferenceArchitectures/InferenceFastSpeech2.py +256 -0
InferenceInterfaces/InferenceArchitectures/InferenceHiFiGAN.py +91 -0
InferenceInterfaces/InferenceArchitectures/__init__.py +0 -0
InferenceInterfaces/Meta_FastSpeech2.py +75 -0
InferenceInterfaces/__init__.py +0 -0
Layers/Attention.py +324 -0
Layers/Conformer.py +144 -0
Layers/Convolution.py +55 -0
Layers/DurationPredictor.py +139 -0
Layers/EncoderLayer.py +144 -0
Layers/LayerNorm.py +36 -0
Layers/LengthRegulator.py +62 -0
Layers/MultiLayeredConv1d.py +87 -0
Layers/MultiSequential.py +33 -0
Layers/PositionalEncoding.py +166 -0
Layers/PositionwiseFeedForward.py +26 -0
Layers/PostNet.py +74 -0
Layers/ResidualBlock.py +98 -0
Layers/ResidualStack.py +51 -0
Layers/STFT.py +118 -0
Layers/Swish.py +18 -0
Layers/VariancePredictor.py +65 -0
Layers/__init__.py +0 -0
Models/Aligner/__init__.py +0 -0
Models/FastSpeech2_Meta/__init__.py +0 -0
Models/HiFiGAN_combined/__init__.py +0 -0
Preprocessing/ArticulatoryCombinedTextFrontend.py +323 -0
Preprocessing/AudioPreprocessor.py +166 -0
Preprocessing/ProsodicConditionExtractor.py +40 -0
Preprocessing/__init__.py +0 -0
Preprocessing/papercup_features.py +637 -0
README.md +3 -3
TrainingInterfaces/Text_to_Spectrogram/AutoAligner/Aligner.py +287 -0
TrainingInterfaces/Text_to_Spectrogram/AutoAligner/AlignerDataset.py +211 -0
TrainingInterfaces/Text_to_Spectrogram/AutoAligner/TinyTTS.py +36 -0
TrainingInterfaces/Text_to_Spectrogram/AutoAligner/__init__.py +0 -0
TrainingInterfaces/Text_to_Spectrogram/AutoAligner/autoaligner_train_loop.py +145 -0
TrainingInterfaces/Text_to_Spectrogram/FastSpeech2/DurationCalculator.py +31 -0
TrainingInterfaces/Text_to_Spectrogram/FastSpeech2/EnergyCalculator.py +86 -0
TrainingInterfaces/Text_to_Spectrogram/FastSpeech2/FastSpeech2.py +379 -0
TrainingInterfaces/Text_to_Spectrogram/FastSpeech2/FastSpeech2Loss.py +96 -0
TrainingInterfaces/Text_to_Spectrogram/FastSpeech2/FastSpeechDatasetLanguageID.py +217 -0
TrainingInterfaces/Text_to_Spectrogram/FastSpeech2/PitchCalculator.py +121 -0
TrainingInterfaces/Text_to_Spectrogram/FastSpeech2/__init__.py +0 -0
TrainingInterfaces/Text_to_Spectrogram/FastSpeech2/fastspeech2_train_loop.py +201 -0
TrainingInterfaces/Text_to_Spectrogram/FastSpeech2/fastspeech2_train_loop_ctc.py +191 -0
TrainingInterfaces/Text_to_Spectrogram/FastSpeech2/meta_train_loop.py +211 -0
TrainingInterfaces/Text_to_Spectrogram/__init__.py +0 -0
TrainingInterfaces/__init__.py +0 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,16 @@

+.idea
+*.pyc
+*.png
+*.pdf
+*.pt
+tensorboard_logs
+Corpora
+*_graph
+*.out
+*.wav
+*.flac
+audios/
+*playground*
+*.json
+.tmp/
+.vscode/

InferenceInterfaces/InferenceArchitectures/InferenceFastSpeech2.py ADDED Viewed

	@@ -0,0 +1,256 @@

+from abc import ABC
+import torch
+from Layers.Conformer import Conformer
+from Layers.DurationPredictor import DurationPredictor
+from Layers.LengthRegulator import LengthRegulator
+from Layers.PostNet import PostNet
+from Layers.VariancePredictor import VariancePredictor
+from Utility.utils import make_non_pad_mask
+from Utility.utils import make_pad_mask
+class FastSpeech2(torch.nn.Module, ABC):
+    def __init__(self,  # network structure related
+                 weights,
+                 idim=66,
+                 odim=80,
+                 adim=384,
+                 aheads=4,
+                 elayers=6,
+                 eunits=1536,
+                 dlayers=6,
+                 dunits=1536,
+                 postnet_layers=5,
+                 postnet_chans=256,
+                 postnet_filts=5,
+                 positionwise_conv_kernel_size=1,
+                 use_scaled_pos_enc=True,
+                 use_batch_norm=True,
+                 encoder_normalize_before=True,
+                 decoder_normalize_before=True,
+                 encoder_concat_after=False,
+                 decoder_concat_after=False,
+                 reduction_factor=1,
+                 # encoder / decoder
+                 use_macaron_style_in_conformer=True,
+                 use_cnn_in_conformer=True,
+                 conformer_enc_kernel_size=7,
+                 conformer_dec_kernel_size=31,
+                 # duration predictor
+                 duration_predictor_layers=2,
+                 duration_predictor_chans=256,
+                 duration_predictor_kernel_size=3,
+                 # energy predictor
+                 energy_predictor_layers=2,
+                 energy_predictor_chans=256,
+                 energy_predictor_kernel_size=3,
+                 energy_predictor_dropout=0.5,
+                 energy_embed_kernel_size=1,
+                 energy_embed_dropout=0.0,
+                 stop_gradient_from_energy_predictor=True,
+                 # pitch predictor
+                 pitch_predictor_layers=5,
+                 pitch_predictor_chans=256,
+                 pitch_predictor_kernel_size=5,
+                 pitch_predictor_dropout=0.5,
+                 pitch_embed_kernel_size=1,
+                 pitch_embed_dropout=0.0,
+                 stop_gradient_from_pitch_predictor=True,
+                 # training related
+                 transformer_enc_dropout_rate=0.2,
+                 transformer_enc_positional_dropout_rate=0.2,
+                 transformer_enc_attn_dropout_rate=0.2,
+                 transformer_dec_dropout_rate=0.2,
+                 transformer_dec_positional_dropout_rate=0.2,
+                 transformer_dec_attn_dropout_rate=0.2,
+                 duration_predictor_dropout_rate=0.2,
+                 postnet_dropout_rate=0.5,
+                 # additional features
+                 utt_embed_dim=704,
+                 connect_utt_emb_at_encoder_out=True,
+                 lang_embs=100):
+        """
+        Build the inference-time FastSpeech 2 network and load its weights.
+        The constructor creates, in this order, a Conformer encoder, the duration,
+        pitch and energy predictors (plus the pitch / energy embedding convolutions),
+        a length regulator, a Conformer decoder, the linear output projection and a
+        PostNet. As its last step it calls ``load_state_dict(weights)``, so the
+        attribute names assigned below have to match the keys of ``weights``.
+        Args:
+            weights: state dict handed to ``load_state_dict``.
+            idim: size of one input token vector (input width of the embedding MLP).
+            odim: size of one output frame; ``feat_out`` emits ``odim * reduction_factor`` values per decoder step.
+            adim: hidden size shared by encoder, predictors, embeddings and decoder.
+            aheads: number of attention heads in encoder and decoder.
+            elayers / eunits: number of blocks / linear units of the encoder.
+            dlayers / dunits: number of blocks / linear units of the decoder.
+            postnet_layers, postnet_chans, postnet_filts, use_batch_norm, postnet_dropout_rate: forwarded to ``PostNet``.
+            positionwise_conv_kernel_size, use_macaron_style_in_conformer, use_cnn_in_conformer: forwarded to both Conformers.
+            conformer_enc_kernel_size / conformer_dec_kernel_size: ``cnn_module_kernel`` of encoder / decoder.
+            encoder_* / decoder_* / transformer_*: forwarded to the respective Conformer.
+            duration_predictor_*: forwarded to ``DurationPredictor``.
+            pitch_predictor_* / energy_predictor_*: forwarded to the respective ``VariancePredictor``.
+            pitch_embed_* / energy_embed_*: kernel size and dropout of the 1-channel embedding convolutions.
+            stop_gradient_from_pitch_predictor / stop_gradient_from_energy_predictor: stored; ``_forward`` detaches the encoder output before the respective predictor when set.
+            use_scaled_pos_enc: only stored on the instance in this constructor.
+            reduction_factor: stored and used to size ``feat_out``.
+            utt_embed_dim, connect_utt_emb_at_encoder_out, lang_embs: forwarded to the encoder Conformer only.
+        """
+        super().__init__()
+        self.idim = idim
+        self.odim = odim
+        self.reduction_factor = reduction_factor
+        self.stop_gradient_from_pitch_predictor = stop_gradient_from_pitch_predictor
+        self.stop_gradient_from_energy_predictor = stop_gradient_from_energy_predictor
+        self.use_scaled_pos_enc = use_scaled_pos_enc
+        # small MLP that maps an input token vector of size idim to the attention dimension
+        embed = torch.nn.Sequential(torch.nn.Linear(idim, 100),
+                                    torch.nn.Tanh(),
+                                    torch.nn.Linear(100, adim))
+        # the encoder is the only component that receives the utterance embedding and language embedding settings
+        self.encoder = Conformer(idim=idim, attention_dim=adim, attention_heads=aheads, linear_units=eunits, num_blocks=elayers,
+                                 input_layer=embed, dropout_rate=transformer_enc_dropout_rate,
+                                 positional_dropout_rate=transformer_enc_positional_dropout_rate, attention_dropout_rate=transformer_enc_attn_dropout_rate,
+                                 normalize_before=encoder_normalize_before, concat_after=encoder_concat_after,
+                                 positionwise_conv_kernel_size=positionwise_conv_kernel_size, macaron_style=use_macaron_style_in_conformer,
+                                 use_cnn_module=use_cnn_in_conformer, cnn_module_kernel=conformer_enc_kernel_size, zero_triu=False,
+                                 utt_embed=utt_embed_dim, connect_utt_emb_at_encoder_out=connect_utt_emb_at_encoder_out, lang_embs=lang_embs)
+        self.duration_predictor = DurationPredictor(idim=adim, n_layers=duration_predictor_layers,
+                                                    n_chans=duration_predictor_chans,
+                                                    kernel_size=duration_predictor_kernel_size,
+                                                    dropout_rate=duration_predictor_dropout_rate, )
+        self.pitch_predictor = VariancePredictor(idim=adim, n_layers=pitch_predictor_layers,
+                                                 n_chans=pitch_predictor_chans,
+                                                 kernel_size=pitch_predictor_kernel_size,
+                                                 dropout_rate=pitch_predictor_dropout)
+        # lifts the 1-channel pitch curve to adim channels so it can be added to the encoder output
+        self.pitch_embed = torch.nn.Sequential(torch.nn.Conv1d(in_channels=1, out_channels=adim,
+                                                               kernel_size=pitch_embed_kernel_size,
+                                                               padding=(pitch_embed_kernel_size - 1) // 2),
+                                               torch.nn.Dropout(pitch_embed_dropout))
+        self.energy_predictor = VariancePredictor(idim=adim, n_layers=energy_predictor_layers,
+                                                  n_chans=energy_predictor_chans,
+                                                  kernel_size=energy_predictor_kernel_size,
+                                                  dropout_rate=energy_predictor_dropout)
+        # lifts the 1-channel energy curve to adim channels so it can be added to the encoder output
+        self.energy_embed = torch.nn.Sequential(torch.nn.Conv1d(in_channels=1, out_channels=adim,
+                                                                kernel_size=energy_embed_kernel_size,
+                                                                padding=(energy_embed_kernel_size - 1) // 2),
+                                                torch.nn.Dropout(energy_embed_dropout))
+        self.length_regulator = LengthRegulator()
+        # the decoder gets no input layer; it is fed the length-regulated encoder output directly
+        self.decoder = Conformer(idim=0,
+                                 attention_dim=adim,
+                                 attention_heads=aheads,
+                                 linear_units=dunits,
+                                 num_blocks=dlayers,
+                                 input_layer=None,
+                                 dropout_rate=transformer_dec_dropout_rate,
+                                 positional_dropout_rate=transformer_dec_positional_dropout_rate,
+                                 attention_dropout_rate=transformer_dec_attn_dropout_rate,
+                                 normalize_before=decoder_normalize_before,
+                                 concat_after=decoder_concat_after,
+                                 positionwise_conv_kernel_size=positionwise_conv_kernel_size,
+                                 macaron_style=use_macaron_style_in_conformer,
+                                 use_cnn_module=use_cnn_in_conformer,
+                                 cnn_module_kernel=conformer_dec_kernel_size)
+        self.feat_out = torch.nn.Linear(adim, odim * reduction_factor)
+        # NOTE(review): PostNet receives idim although it operates on odim-sized frames in _forward — presumably the argument is unused there; confirm in Layers/PostNet.py
+        self.postnet = PostNet(idim=idim,
+                               odim=odim,
+                               n_layers=postnet_layers,
+                               n_chans=postnet_chans,
+                               n_filts=postnet_filts,
+                               use_batch_norm=use_batch_norm,
+                               dropout_rate=postnet_dropout_rate)
+        # must stay last: every module above has to exist before the weights can be matched by name
+        self.load_state_dict(weights)
+    def _forward(self, text_tensors, text_lens, gold_speech=None, speech_lens=None,
+                 gold_durations=None, gold_pitch=None, gold_energy=None,
+                 is_inference=False, alpha=1.0, utterance_embedding=None, lang_ids=None):
+        # forward encoder
+        text_masks = self._source_mask(text_lens)
+        encoded_texts, _ = self.encoder(text_tensors, text_masks, utterance_embedding=utterance_embedding, lang_ids=lang_ids)  # (B, Tmax, adim)
+        # forward duration predictor and variance predictors
+        duration_masks = make_pad_mask(text_lens, device=text_lens.device)
+        if self.stop_gradient_from_pitch_predictor:
+            pitch_predictions = self.pitch_predictor(encoded_texts.detach(), duration_masks.unsqueeze(-1))
+        else:
+            pitch_predictions = self.pitch_predictor(encoded_texts, duration_masks.unsqueeze(-1))
+        if self.stop_gradient_from_energy_predictor:
+            energy_predictions = self.energy_predictor(encoded_texts.detach(), duration_masks.unsqueeze(-1))
+        else:
+            energy_predictions = self.energy_predictor(encoded_texts, duration_masks.unsqueeze(-1))
+        if is_inference:
+            if gold_durations is not None:
+                duration_predictions = gold_durations
+            else:
+                duration_predictions = self.duration_predictor.inference(encoded_texts, duration_masks)
+            if gold_pitch is not None:
+                pitch_predictions = gold_pitch
+            if gold_energy is not None:
+                energy_predictions = gold_energy
+            pitch_embeddings = self.pitch_embed(pitch_predictions.transpose(1, 2)).transpose(1, 2)
+            energy_embeddings = self.energy_embed(energy_predictions.transpose(1, 2)).transpose(1, 2)
+            encoded_texts = encoded_texts + energy_embeddings + pitch_embeddings
+            encoded_texts = self.length_regulator(encoded_texts, duration_predictions, alpha)
+        else:
+            duration_predictions = self.duration_predictor(encoded_texts, duration_masks)
+            # use groundtruth in training
+            pitch_embeddings = self.pitch_embed(gold_pitch.transpose(1, 2)).transpose(1, 2)
+            energy_embeddings = self.energy_embed(gold_energy.transpose(1, 2)).transpose(1, 2)
+            encoded_texts = encoded_texts + energy_embeddings + pitch_embeddings
+            encoded_texts = self.length_regulator(encoded_texts, gold_durations)  # (B, Lmax, adim)
+        # forward decoder
+        if speech_lens is not None and not is_inference:
+            if self.reduction_factor > 1:
+                olens_in = speech_lens.new([olen // self.reduction_factor for olen in speech_lens])
+            else:
+                olens_in = speech_lens
+            h_masks = self._source_mask(olens_in)
+        else:
+            h_masks = None
+        zs, _ = self.decoder(encoded_texts, h_masks)  # (B, Lmax, adim)
+        before_outs = self.feat_out(zs).view(zs.size(0), -1, self.odim)  # (B, Lmax, odim)
+        # postnet -> (B, Lmax//r * r, odim)
+        after_outs = before_outs + self.postnet(before_outs.transpose(1, 2)).transpose(1, 2)
+        return before_outs, after_outs, duration_predictions, pitch_predictions, energy_predictions
+    @torch.no_grad()
+    def forward(self,
+                text,
+                speech=None,
+                durations=None,
+                pitch=None,
+                energy=None,
+                utterance_embedding=None,
+                return_duration_pitch_energy=False,
+                lang_id=None):
+        """
+        Generate the sequence of features given the sequences of characters.
+        Args:
+            text: Input sequence of characters
+            speech: Feature sequence to extract style
+            durations: Groundtruth of duration
+            pitch: Groundtruth of token-averaged pitch
+            energy: Groundtruth of token-averaged energy
+            return_duration_pitch_energy: whether to return the list of predicted durations for nicer plotting
+            utterance_embedding: embedding of utterance wide parameters
+        Returns:
+            Mel Spectrogram
+        """
+        self.eval()
+        # setup batch axis
+        ilens = torch.tensor([text.shape[0]], dtype=torch.long, device=text.device)
+        if speech is not None:
+            gold_speech = speech.unsqueeze(0)
+        else:
+            gold_speech = None
+        if durations is not None:
+            durations = durations.unsqueeze(0)
+        if pitch is not None:
+            pitch = pitch.unsqueeze(0)
+        if energy is not None:
+            energy = energy.unsqueeze(0)
+        if lang_id is not None:
+            lang_id = lang_id.unsqueeze(0)
+        before_outs, after_outs, d_outs, pitch_predictions, energy_predictions = self._forward(text.unsqueeze(0),
+                                                                                               ilens,
+                                                                                               gold_speech=gold_speech,
+                                                                                               gold_durations=durations,
+                                                                                               is_inference=True,
+                                                                                               gold_pitch=pitch,
+                                                                                               gold_energy=energy,
+                                                                                               utterance_embedding=utterance_embedding.unsqueeze(0),
+                                                                                               lang_ids=lang_id)
+        self.train()
+        if return_duration_pitch_energy:
+            return after_outs[0], d_outs[0], pitch_predictions[0], energy_predictions[0]
+        return after_outs[0]
+    def _source_mask(self, ilens):
+        x_masks = make_non_pad_mask(ilens).to(next(self.parameters()).device)
+        return x_masks.unsqueeze(-2)

InferenceInterfaces/InferenceArchitectures/InferenceHiFiGAN.py ADDED Viewed

	@@ -0,0 +1,91 @@

+import torch
+from Layers.ResidualBlock import HiFiGANResidualBlock as ResidualBlock
+class HiFiGANGenerator(torch.nn.Module):
+    def __init__(self,
+                 path_to_weights,
+                 in_channels=80,
+                 out_channels=1,
+                 channels=512,
+                 kernel_size=7,
+                 upsample_scales=(8, 6, 4, 4),
+                 upsample_kernel_sizes=(16, 12, 8, 8),
+                 resblock_kernel_sizes=(3, 7, 11),
+                 resblock_dilations=[(1, 3, 5), (1, 3, 5), (1, 3, 5)],
+                 use_additional_convs=True,
+                 bias=True,
+                 nonlinear_activation="LeakyReLU",
+                 nonlinear_activation_params={"negative_slope": 0.1},
+                 use_weight_norm=True, ):
+        super().__init__()
+        assert kernel_size % 2 == 1, "Kernal size must be odd number."
+        assert len(upsample_scales) == len(upsample_kernel_sizes)
+        assert len(resblock_dilations) == len(resblock_kernel_sizes)
+        self.num_upsamples = len(upsample_kernel_sizes)
+        self.num_blocks = len(resblock_kernel_sizes)
+        self.input_conv = torch.nn.Conv1d(in_channels,
+                                          channels,
+                                          kernel_size,
+                                          1,
+                                          padding=(kernel_size - 1) // 2, )
+        self.upsamples = torch.nn.ModuleList()
+        self.blocks = torch.nn.ModuleList()
+        for i in range(len(upsample_kernel_sizes)):
+            self.upsamples += [
+                torch.nn.Sequential(getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params),
+                                    torch.nn.ConvTranspose1d(channels // (2 ** i),
+                                                             channels // (2 ** (i + 1)),
+                                                             upsample_kernel_sizes[i],
+                                                             upsample_scales[i],
+                                                             padding=(upsample_kernel_sizes[i] - upsample_scales[i]) // 2, ), )]
+            for j in range(len(resblock_kernel_sizes)):
+                self.blocks += [ResidualBlock(kernel_size=resblock_kernel_sizes[j],
+                                              channels=channels // (2 ** (i + 1)),
+                                              dilations=resblock_dilations[j],
+                                              bias=bias,
+                                              use_additional_convs=use_additional_convs,
+                                              nonlinear_activation=nonlinear_activation,
+                                              nonlinear_activation_params=nonlinear_activation_params, )]
+        self.output_conv = torch.nn.Sequential(
+            torch.nn.LeakyReLU(),
+            torch.nn.Conv1d(channels // (2 ** (i + 1)),
+                            out_channels,
+                            kernel_size,
+                            1,
+                            padding=(kernel_size - 1) // 2, ),
+            torch.nn.Tanh(), )
+        if use_weight_norm:
+            self.apply_weight_norm()
+        self.load_state_dict(torch.load(path_to_weights, map_location='cpu')["generator"])
+    def forward(self, c, normalize_before=False):
+        if normalize_before:
+            c = (c - self.mean) / self.scale
+        c = self.input_conv(c.unsqueeze(0))
+        for i in range(self.num_upsamples):
+            c = self.upsamples[i](c)
+            cs = 0.0  # initialize
+            for j in range(self.num_blocks):
+                cs = cs + self.blocks[i * self.num_blocks + j](c)
+            c = cs / self.num_blocks
+        c = self.output_conv(c)
+        return c.squeeze(0).squeeze(0)
+    def remove_weight_norm(self):
+        def _remove_weight_norm(m):
+            try:
+                torch.nn.utils.remove_weight_norm(m)
+            except ValueError:
+                return
+        self.apply(_remove_weight_norm)
+    def apply_weight_norm(self):
+        def _apply_weight_norm(m):
+            if isinstance(m, torch.nn.Conv1d) or isinstance(m, torch.nn.ConvTranspose1d):
+                torch.nn.utils.weight_norm(m)
+        self.apply(_apply_weight_norm)

InferenceInterfaces/InferenceArchitectures/__init__.py ADDED Viewed

File without changes

InferenceInterfaces/Meta_FastSpeech2.py ADDED Viewed

	@@ -0,0 +1,75 @@

+import os
+import librosa.display as lbd
+import matplotlib.pyplot as plt
+import soundfile
+import torch
+from InferenceInterfaces.InferenceArchitectures.InferenceFastSpeech2 import FastSpeech2
+from InferenceInterfaces.InferenceArchitectures.InferenceHiFiGAN import HiFiGANGenerator
+from Preprocessing.ArticulatoryCombinedTextFrontend import ArticulatoryCombinedTextFrontend
+from Preprocessing.ArticulatoryCombinedTextFrontend import get_language_id
+from Preprocessing.ProsodicConditionExtractor import ProsodicConditionExtractor
+class Meta_FastSpeech2(torch.nn.Module):
+    def __init__(self, device="cpu"):
+        super().__init__()
+        model_name = "Meta"
+        language = "en"
+        self.device = device
+        self.text2phone = ArticulatoryCombinedTextFrontend(language=language, add_silence_to_end=True)
+        checkpoint = torch.load(os.path.join("Models", f"FastSpeech2_{model_name}", "best.pt"), map_location='cpu')
+        self.phone2mel = FastSpeech2(weights=checkpoint["model"]).to(torch.device(device))
+        self.mel2wav = HiFiGANGenerator(path_to_weights=os.path.join("Models", "HiFiGAN_combined", "best.pt")).to(torch.device(device))
+        self.default_utterance_embedding = checkpoint["default_emb"].to(self.device)
+        self.phone2mel.eval()
+        self.mel2wav.eval()
+        self.lang_id = get_language_id(language)
+        self.to(torch.device(device))
+    def set_utterance_embedding(self, path_to_reference_audio):
+        wave, sr = soundfile.read(path_to_reference_audio)
+        self.default_utterance_embedding = ProsodicConditionExtractor(sr=sr).extract_condition_from_reference_wave(wave).to(self.device)
+    def set_language(self, lang_id):
+        """
+        The id parameter actually refers to the shorthand. This has become ambiguous with the introduction of the actual language IDs
+        """
+        self.text2phone = ArticulatoryCombinedTextFrontend(language=lang_id, add_silence_to_end=True, silent=True)
+        self.lang_id = get_language_id(lang_id).to(self.device)
+    def forward(self, text, view=False, durations=None, pitch=None, energy=None):
+        """
+        Synthesize a waveform for the given text and optionally plot the result.
+        Args:
+            text: the text to be read, handed to the text frontend.
+            view: whether to open a matplotlib window with the waveform and the spectrogram.
+            durations: optional durations that override the predicted ones.
+            pitch: optional pitch curve that overrides the predicted one.
+            energy: optional energy curve that overrides the predicted one.
+        Returns:
+            The waveform produced by the vocoder.
+        """
+        with torch.no_grad():
+            phones = self.text2phone.string_to_tensor(text).to(torch.device(self.device))
+            # NOTE(review): self.lang_id is maintained by __init__ and set_language but not passed on here,
+            # so the acoustic model runs with lang_id=None — confirm whether this is intended
+            mel, durations, pitch, energy = self.phone2mel(phones,
+                                                           return_duration_pitch_energy=True,
+                                                           utterance_embedding=self.default_utterance_embedding,
+                                                           durations=durations,
+                                                           pitch=pitch,
+                                                           energy=energy)
+            # the acoustic model returns the frame axis first, the vocoder expects it last
+            mel = mel.transpose(0, 1)
+            wave = self.mel2wav(mel)
+        if view:
+            # imported lazily, it is only needed for plotting
+            from Utility.utils import cumsum_durations
+            # upper panel: waveform, lower panel: spectrogram
+            fig, ax = plt.subplots(nrows=2, ncols=1)
+            ax[0].plot(wave.cpu().numpy())
+            lbd.specshow(mel.cpu().numpy(),
+                         ax=ax[1],
+                         sr=16000,
+                         cmap='GnBu',
+                         y_axis='mel',
+                         x_axis=None,
+                         hop_length=256)
+            ax[0].yaxis.set_visible(False)
+            ax[1].yaxis.set_visible(False)
+            # minor ticks draw the grid at the duration splits, major ticks carry the phone labels
+            duration_splits, label_positions = cumsum_durations(durations.cpu().numpy())
+            ax[1].set_xticks(duration_splits, minor=True)
+            ax[1].xaxis.grid(True, which='minor')
+            ax[1].set_xticks(label_positions, minor=False)
+            ax[1].set_xticklabels(self.text2phone.get_phone_string(text))
+            ax[0].set_title(text)
+            plt.subplots_adjust(left=0.05, bottom=0.1, right=0.95, top=.9, wspace=0.0, hspace=0.0)
+            plt.show()
+        return wave

InferenceInterfaces/__init__.py ADDED Viewed

File without changes

Layers/Attention.py ADDED Viewed

	@@ -0,0 +1,324 @@

+# Written by Shigeki Karita, 2019
+# Published under Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
+# Adapted by Florian Lux, 2021
+"""Multi-Head Attention layer definition."""
+import math
+import numpy
+import torch
+from torch import nn
+from Utility.utils import make_non_pad_mask
+class MultiHeadedAttention(nn.Module):
+    """
+    Multi-Head Attention layer.
+    Args:
+        n_head (int): The number of heads.
+        n_feat (int): The number of features.
+        dropout_rate (float): Dropout rate.
+    """
+    def __init__(self, n_head, n_feat, dropout_rate):
+        """
+        Construct an MultiHeadedAttention object.
+        """
+        super(MultiHeadedAttention, self).__init__()
+        assert n_feat % n_head == 0
+        # We assume d_v always equals d_k
+        self.d_k = n_feat // n_head
+        self.h = n_head
+        self.linear_q = nn.Linear(n_feat, n_feat)
+        self.linear_k = nn.Linear(n_feat, n_feat)
+        self.linear_v = nn.Linear(n_feat, n_feat)
+        self.linear_out = nn.Linear(n_feat, n_feat)
+        self.attn = None
+        self.dropout = nn.Dropout(p=dropout_rate)
+    def forward_qkv(self, query, key, value):
+        """
+        Transform query, key and value.
+        Args:
+            query (torch.Tensor): Query tensor (#batch, time1, size).
+            key (torch.Tensor): Key tensor (#batch, time2, size).
+            value (torch.Tensor): Value tensor (#batch, time2, size).
+        Returns:
+            torch.Tensor: Transformed query tensor (#batch, n_head, time1, d_k).
+            torch.Tensor: Transformed key tensor (#batch, n_head, time2, d_k).
+            torch.Tensor: Transformed value tensor (#batch, n_head, time2, d_k).
+        """
+        n_batch = query.size(0)
+        q = self.linear_q(query).view(n_batch, -1, self.h, self.d_k)
+        k = self.linear_k(key).view(n_batch, -1, self.h, self.d_k)
+        v = self.linear_v(value).view(n_batch, -1, self.h, self.d_k)
+        q = q.transpose(1, 2)  # (batch, head, time1, d_k)
+        k = k.transpose(1, 2)  # (batch, head, time2, d_k)
+        v = v.transpose(1, 2)  # (batch, head, time2, d_k)
+        return q, k, v
+    def forward_attention(self, value, scores, mask):
+        """
+        Compute attention context vector.
+        Args:
+            value (torch.Tensor): Transformed value (#batch, n_head, time2, d_k).
+            scores (torch.Tensor): Attention score (#batch, n_head, time1, time2).
+            mask (torch.Tensor): Mask (#batch, 1, time2) or (#batch, time1, time2).
+        Returns:
+            torch.Tensor: Transformed value (#batch, time1, d_model)
+                weighted by the attention score (#batch, time1, time2).
+        """
+        n_batch = value.size(0)
+        if mask is not None:
+            mask = mask.unsqueeze(1).eq(0)  # (batch, 1, *, time2)
+            min_value = float(numpy.finfo(torch.tensor(0, dtype=scores.dtype).numpy().dtype).min)
+            scores = scores.masked_fill(mask, min_value)
+            self.attn = torch.softmax(scores, dim=-1).masked_fill(mask, 0.0)  # (batch, head, time1, time2)
+        else:
+            self.attn = torch.softmax(scores, dim=-1)  # (batch, head, time1, time2)
+        p_attn = self.dropout(self.attn)
+        x = torch.matmul(p_attn, value)  # (batch, head, time1, d_k)
+        x = (x.transpose(1, 2).contiguous().view(n_batch, -1, self.h * self.d_k))  # (batch, time1, d_model)
+        return self.linear_out(x)  # (batch, time1, d_model)
+    def forward(self, query, key, value, mask):
+        """
+        Compute scaled dot product attention.
+        Args:
+            query (torch.Tensor): Query tensor (#batch, time1, size).
+            key (torch.Tensor): Key tensor (#batch, time2, size).
+            value (torch.Tensor): Value tensor (#batch, time2, size).
+            mask (torch.Tensor): Mask tensor (#batch, 1, time2) or
+                (#batch, time1, time2).
+        Returns:
+            torch.Tensor: Output tensor (#batch, time1, d_model).
+        """
+        q, k, v = self.forward_qkv(query, key, value)
+        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k)
+        return self.forward_attention(v, scores, mask)
+class RelPositionMultiHeadedAttention(MultiHeadedAttention):
+    """
+    Multi-Head Attention layer with relative position encoding.
+    Details can be found in https://github.com/espnet/espnet/pull/2816.
+    Paper: https://arxiv.org/abs/1901.02860
+    Args:
+        n_head (int): The number of heads.
+        n_feat (int): The number of features.
+        dropout_rate (float): Dropout rate.
+        zero_triu (bool): Whether to zero the upper triangular part of attention matrix.
+    """
+    def __init__(self, n_head, n_feat, dropout_rate, zero_triu=False):
+        """Construct an RelPositionMultiHeadedAttention object."""
+        super().__init__(n_head, n_feat, dropout_rate)
+        self.zero_triu = zero_triu
+        # linear transformation for positional encoding
+        self.linear_pos = nn.Linear(n_feat, n_feat, bias=False)
+        # these two learnable bias are used in matrix c and matrix d
+        # as described in https://arxiv.org/abs/1901.02860 Section 3.3
+        self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k))
+        self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k))
+        torch.nn.init.xavier_uniform_(self.pos_bias_u)
+        torch.nn.init.xavier_uniform_(self.pos_bias_v)
+    def rel_shift(self, x):
+        """
+        Shift the position scores so that they are indexed by key position instead of relative offset.
+        The shift is done without any gather operation: a zero column is prepended,
+        the last two axes are reinterpreted with swapped sizes, the first row of
+        that view is dropped and the remainder is viewed in the original shape again.
+        Args:
+            x (torch.Tensor): Input tensor (batch, head, time1, 2*time1-1).
+            time1 means the length of query vector.
+        Returns:
+            torch.Tensor: Output tensor, its last axis is cut to ``x.size(-1) // 2 + 1`` entries.
+        """
+        # one zero per row, shape (batch, head, time1, 1)
+        zero_pad = torch.zeros((*x.size()[:3], 1), device=x.device, dtype=x.dtype)
+        x_padded = torch.cat([zero_pad, x], dim=-1)
+        # reinterpret the padded tensor with the sizes of the last two axes swapped
+        x_padded = x_padded.view(*x.size()[:2], x.size(3) + 1, x.size(2))
+        x = x_padded[:, :, 1:].view_as(x)[:, :, :, : x.size(-1) // 2 + 1]  # only keep the positions from 0 to time2
+        if self.zero_triu:
+            # optionally zero the upper triangular part of the shifted matrix
+            ones = torch.ones((x.size(2), x.size(3)), device=x.device)
+            x = x * torch.tril(ones, x.size(3) - x.size(2))[None, None, :, :]
+        return x
+    def forward(self, query, key, value, pos_emb, mask):
+        """
+        Compute 'Scaled Dot Product Attention' with rel. positional encoding.
+        Args:
+            query (torch.Tensor): Query tensor (#batch, time1, size).
+            key (torch.Tensor): Key tensor (#batch, time2, size).
+            value (torch.Tensor): Value tensor (#batch, time2, size).
+            pos_emb (torch.Tensor): Positional embedding tensor
+                (#batch, 2*time1-1, size).
+            mask (torch.Tensor): Mask tensor (#batch, 1, time2) or
+                (#batch, time1, time2).
+        Returns:
+            torch.Tensor: Output tensor (#batch, time1, d_model).
+        """
+        q, k, v = self.forward_qkv(query, key, value)
+        q = q.transpose(1, 2)  # (batch, time1, head, d_k)
+        n_batch_pos = pos_emb.size(0)
+        p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k)
+        p = p.transpose(1, 2)  # (batch, head, 2*time1-1, d_k)
+        # (batch, head, time1, d_k)
+        q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2)
+        # (batch, head, time1, d_k)
+        q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2)
+        # compute attention score
+        # first compute matrix a and matrix c
+        # as described in https://arxiv.org/abs/1901.02860 Section 3.3
+        # (batch, head, time1, time2)
+        matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1))
+        # compute matrix b and matrix d
+        # (batch, head, time1, 2*time1-1)
+        matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1))
+        matrix_bd = self.rel_shift(matrix_bd)
+        scores = (matrix_ac + matrix_bd) / math.sqrt(self.d_k)  # (batch, head, time1, time2)
+        return self.forward_attention(v, scores, mask)
+class GuidedAttentionLoss(torch.nn.Module):
+    """
+    Guided attention loss function module.
+    This module calculates the guided attention loss described
+    in `Efficiently Trainable Text-to-Speech System Based
+    on Deep Convolutional Networks with Guided Attention`_,
+    which forces the attention to be diagonal.
+    .. _`Efficiently Trainable Text-to-Speech System
+        Based on Deep Convolutional Networks with Guided Attention`:
+        https://arxiv.org/abs/1710.08969
+    """
+    def __init__(self, sigma=0.4, alpha=1.0):
+        """
+        Initialize guided attention loss module.
+        Args:
+            sigma (float, optional): Standard deviation to control
+                how close attention to a diagonal.
+            alpha (float, optional): Scaling coefficient (lambda).
+            reset_always (bool, optional): Whether to always reset masks.
+        """
+        super(GuidedAttentionLoss, self).__init__()
+        self.sigma = sigma
+        self.alpha = alpha
+        self.guided_attn_masks = None
+        self.masks = None
+    def _reset_masks(self):
+        self.guided_attn_masks = None
+        self.masks = None
+    def forward(self, att_ws, ilens, olens):
+        """
+        Calculate forward propagation.
+        Args:
+            att_ws (Tensor): Batch of attention weights (B, T_max_out, T_max_in).
+            ilens (LongTensor): Batch of input lenghts (B,).
+            olens (LongTensor): Batch of output lenghts (B,).
+        Returns:
+            Tensor: Guided attention loss value.
+        """
+        self._reset_masks()
+        self.guided_attn_masks = self._make_guided_attention_masks(ilens, olens).to(att_ws.device)
+        self.masks = self._make_masks(ilens, olens).to(att_ws.device)
+        losses = self.guided_attn_masks * att_ws
+        loss = torch.mean(losses.masked_select(self.masks))
+        self._reset_masks()
+        return self.alpha * loss
+    def _make_guided_attention_masks(self, ilens, olens):
+        n_batches = len(ilens)
+        max_ilen = max(ilens)
+        max_olen = max(olens)
+        guided_attn_masks = torch.zeros((n_batches, max_olen, max_ilen), device=ilens.device)
+        for idx, (ilen, olen) in enumerate(zip(ilens, olens)):
+            guided_attn_masks[idx, :olen, :ilen] = self._make_guided_attention_mask(ilen, olen, self.sigma)
+        return guided_attn_masks
+    @staticmethod
+    def _make_guided_attention_mask(ilen, olen, sigma):
+        """
+        Make guided attention mask.
+        """
+        grid_x, grid_y = torch.meshgrid(torch.arange(olen, device=olen.device).float(), torch.arange(ilen, device=ilen.device).float())
+        return 1.0 - torch.exp(-((grid_y / ilen - grid_x / olen) ** 2) / (2 * (sigma ** 2)))
+    @staticmethod
+    def _make_masks(ilens, olens):
+        """
+        Make masks indicating non-padded part.
+        Args:
+            ilens (LongTensor or List): Batch of lengths (B,).
+            olens (LongTensor or List): Batch of lengths (B,).
+        Returns:
+            Tensor: Mask tensor indicating non-padded part.
+                    dtype=torch.uint8 in PyTorch 1.2-
+                    dtype=torch.bool in PyTorch 1.2+ (including 1.2)
+        """
+        in_masks = make_non_pad_mask(ilens, device=ilens.device)  # (B, T_in)
+        out_masks = make_non_pad_mask(olens, device=olens.device)  # (B, T_out)
+        return out_masks.unsqueeze(-1) & in_masks.unsqueeze(-2)  # (B, T_out, T_in)
+class GuidedMultiHeadAttentionLoss(GuidedAttentionLoss):
+    """
+    Guided attention loss function module for multi head attention.
+    Args:
+        sigma (float, optional): Standard deviation to control
+        how close attention to a diagonal.
+        alpha (float, optional): Scaling coefficient (lambda).
+        reset_always (bool, optional): Whether to always reset masks.
+    """
+    def forward(self, att_ws, ilens, olens):
+        """
+        Calculate forward propagation.
+        Args:
+            att_ws (Tensor):
+                Batch of multi head attention weights (B, H, T_max_out, T_max_in).
+            ilens (LongTensor): Batch of input lenghts (B,).
+            olens (LongTensor): Batch of output lenghts (B,).
+        Returns:
+            Tensor: Guided attention loss value.
+        """
+        if self.guided_attn_masks is None:
+            self.guided_attn_masks = (self._make_guided_attention_masks(ilens, olens).to(att_ws.device).unsqueeze(1))
+        if self.masks is None:
+            self.masks = self._make_masks(ilens, olens).to(att_ws.device).unsqueeze(1)
+        losses = self.guided_attn_masks * att_ws
+        loss = torch.mean(losses.masked_select(self.masks))
+        if self.reset_always:
+            self._reset_masks()
+        return self.alpha * loss

Layers/Conformer.py ADDED Viewed

	@@ -0,0 +1,144 @@

+"""
+Taken from ESPNet
+"""
+import torch
+import torch.nn.functional as F
+from Layers.Attention import RelPositionMultiHeadedAttention
+from Layers.Convolution import ConvolutionModule
+from Layers.EncoderLayer import EncoderLayer
+from Layers.LayerNorm import LayerNorm
+from Layers.MultiLayeredConv1d import MultiLayeredConv1d
+from Layers.MultiSequential import repeat
+from Layers.PositionalEncoding import RelPositionalEncoding
+from Layers.Swish import Swish
+class Conformer(torch.nn.Module):
+    """
+    Conformer encoder module.
+    Args:
+        idim (int): Input dimension.
+        attention_dim (int): Dimension of attention.
+        attention_heads (int): The number of heads of multi head attention.
+        linear_units (int): The number of units of position-wise feed forward.
+        num_blocks (int): The number of decoder blocks.
+        dropout_rate (float): Dropout rate.
+        positional_dropout_rate (float): Dropout rate after adding positional encoding.
+        attention_dropout_rate (float): Dropout rate in attention.
+        input_layer (Union[str, torch.nn.Module]): Input layer type.
+        normalize_before (bool): Whether to use layer_norm before the first block.
+        concat_after (bool): Whether to concat attention layer's input and output.
+            if True, additional linear will be applied.
+            i.e. x -> x + linear(concat(x, att(x)))
+            if False, no additional linear will be applied. i.e. x -> x + att(x)
+        positionwise_layer_type (str): "linear", "conv1d", or "conv1d-linear".
+        positionwise_conv_kernel_size (int): Kernel size of positionwise conv1d layer.
+        macaron_style (bool): Whether to use macaron style for positionwise layer.
+        pos_enc_layer_type (str): Conformer positional encoding layer type.
+        selfattention_layer_type (str): Conformer attention layer type.
+        activation_type (str): Conformer activation function type.
+        use_cnn_module (bool): Whether to use convolution module.
+        cnn_module_kernel (int): Kernerl size of convolution module.
+        padding_idx (int): Padding idx for input_layer=embed.
+    """
+    def __init__(self, idim, attention_dim=256, attention_heads=4, linear_units=2048, num_blocks=6, dropout_rate=0.1, positional_dropout_rate=0.1,
+                 attention_dropout_rate=0.0, input_layer="conv2d", normalize_before=True, concat_after=False, positionwise_conv_kernel_size=1,
+                 macaron_style=False, use_cnn_module=False, cnn_module_kernel=31, zero_triu=False, utt_embed=None, connect_utt_emb_at_encoder_out=True,
+                 spk_emb_bottleneck_size=128, lang_embs=None):
+        super(Conformer, self).__init__()
+        activation = Swish()
+        self.conv_subsampling_factor = 1
+        if isinstance(input_layer, torch.nn.Module):
+            self.embed = input_layer
+            self.pos_enc = RelPositionalEncoding(attention_dim, positional_dropout_rate)
+        elif input_layer is None:
+            self.embed = None
+            self.pos_enc = torch.nn.Sequential(RelPositionalEncoding(attention_dim, positional_dropout_rate))
+        else:
+            raise ValueError("unknown input_layer: " + input_layer)
+        self.normalize_before = normalize_before
+        self.connect_utt_emb_at_encoder_out = connect_utt_emb_at_encoder_out
+        if utt_embed is not None:
+            self.hs_emb_projection = torch.nn.Linear(attention_dim + spk_emb_bottleneck_size, attention_dim)
+            # embedding projection derived from https://arxiv.org/pdf/1705.08947.pdf
+            self.embedding_projection = torch.nn.Sequential(torch.nn.Linear(utt_embed, spk_emb_bottleneck_size),
+                                                            torch.nn.Softsign())
+        if lang_embs is not None:
+            self.language_embedding = torch.nn.Embedding(num_embeddings=lang_embs, embedding_dim=attention_dim)
+        # self-attention module definition
+        encoder_selfattn_layer = RelPositionMultiHeadedAttention
+        encoder_selfattn_layer_args = (attention_heads, attention_dim, attention_dropout_rate, zero_triu)
+        # feed-forward module definition
+        positionwise_layer = MultiLayeredConv1d
+        positionwise_layer_args = (attention_dim, linear_units, positionwise_conv_kernel_size, dropout_rate,)
+        # convolution module definition
+        convolution_layer = ConvolutionModule
+        convolution_layer_args = (attention_dim, cnn_module_kernel, activation)
+        self.encoders = repeat(num_blocks, lambda lnum: EncoderLayer(attention_dim, encoder_selfattn_layer(*encoder_selfattn_layer_args),
+                                                                     positionwise_layer(*positionwise_layer_args),
+                                                                     positionwise_layer(*positionwise_layer_args) if macaron_style else None,
+                                                                     convolution_layer(*convolution_layer_args) if use_cnn_module else None, dropout_rate,
+                                                                     normalize_before, concat_after))
+        if self.normalize_before:
+            self.after_norm = LayerNorm(attention_dim)
+    def forward(self, xs, masks, utterance_embedding=None, lang_ids=None):
+        """
+        Encode input sequence.
+        Args:
+            utterance_embedding: embedding containing lots of conditioning signals
+            step: indicator for when to start updating the embedding function
+            xs (torch.Tensor): Input tensor (#batch, time, idim).
+            masks (torch.Tensor): Mask tensor (#batch, time).
+        Returns:
+            torch.Tensor: Output tensor (#batch, time, attention_dim).
+            torch.Tensor: Mask tensor (#batch, time).
+        """
+        if self.embed is not None:
+            xs = self.embed(xs)
+        if lang_ids is not None:
+            lang_embs = self.language_embedding(lang_ids)
+            xs = xs + lang_embs  # offset the phoneme distribution of a language
+        if utterance_embedding is not None and not self.connect_utt_emb_at_encoder_out:
+            xs = self._integrate_with_utt_embed(xs, utterance_embedding)
+        xs = self.pos_enc(xs)
+        xs, masks = self.encoders(xs, masks)
+        if isinstance(xs, tuple):
+            xs = xs[0]
+        if self.normalize_before:
+            xs = self.after_norm(xs)
+        if utterance_embedding is not None and self.connect_utt_emb_at_encoder_out:
+            xs = self._integrate_with_utt_embed(xs, utterance_embedding)
+        return xs, masks
+    def _integrate_with_utt_embed(self, hs, utt_embeddings):
+        # project embedding into smaller space
+        speaker_embeddings_projected = self.embedding_projection(utt_embeddings)
+        # concat hidden states with spk embeds and then apply projection
+        speaker_embeddings_expanded = F.normalize(speaker_embeddings_projected).unsqueeze(1).expand(-1, hs.size(1), -1)
+        hs = self.hs_emb_projection(torch.cat([hs, speaker_embeddings_expanded], dim=-1))
+        return hs

Layers/Convolution.py ADDED Viewed

	@@ -0,0 +1,55 @@

+# Copyright 2020 Johns Hopkins University (Shinji Watanabe)
+#                Northwestern Polytechnical University (Pengcheng Guo)
+#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
+# Adapted by Florian Lux 2021
+from torch import nn
+class ConvolutionModule(nn.Module):
+    """
+    ConvolutionModule in Conformer model.
+    Args:
+        channels (int): The number of channels of conv layers.
+        kernel_size (int): Kernel size of conv layers.
+    """
+    def __init__(self, channels, kernel_size, activation=nn.ReLU(), bias=True):
+        super(ConvolutionModule, self).__init__()
+        # kernel_size should be an odd number for 'SAME' padding
+        assert (kernel_size - 1) % 2 == 0
+        self.pointwise_conv1 = nn.Conv1d(channels, 2 * channels, kernel_size=1, stride=1, padding=0, bias=bias, )
+        self.depthwise_conv = nn.Conv1d(channels, channels, kernel_size, stride=1, padding=(kernel_size - 1) // 2, groups=channels, bias=bias, )
+        self.norm = nn.GroupNorm(num_groups=32, num_channels=channels)
+        self.pointwise_conv2 = nn.Conv1d(channels, channels, kernel_size=1, stride=1, padding=0, bias=bias, )
+        self.activation = activation
+    def forward(self, x):
+        """
+        Compute convolution module.
+        Args:
+            x (torch.Tensor): Input tensor (#batch, time, channels).
+        Returns:
+            torch.Tensor: Output tensor (#batch, time, channels).
+        """
+        # exchange the temporal dimension and the feature dimension
+        x = x.transpose(1, 2)
+        # GLU mechanism
+        x = self.pointwise_conv1(x)  # (batch, 2*channel, dim)
+        x = nn.functional.glu(x, dim=1)  # (batch, channel, dim)
+        # 1D Depthwise Conv
+        x = self.depthwise_conv(x)
+        x = self.activation(self.norm(x))
+        x = self.pointwise_conv2(x)
+        return x.transpose(1, 2)

Layers/DurationPredictor.py ADDED Viewed

	@@ -0,0 +1,139 @@

+# Copyright 2019 Tomoki Hayashi
+# MIT License (https://opensource.org/licenses/MIT)
+# Adapted by Florian Lux 2021
+import torch
+from Layers.LayerNorm import LayerNorm
+class DurationPredictor(torch.nn.Module):
+    """
+    Duration predictor module.
+    This is a module of duration predictor described
+    in `FastSpeech: Fast, Robust and Controllable Text to Speech`_.
+    The duration predictor predicts a duration of each frame in log domain
+    from the hidden embeddings of encoder.
+    .. _`FastSpeech: Fast, Robust and Controllable Text to Speech`:
+        https://arxiv.org/pdf/1905.09263.pdf
+    Note:
+        The calculation domain of outputs is different
+        between in `forward` and in `inference`. In `forward`,
+        the outputs are calculated in log domain but in `inference`,
+        those are calculated in linear domain.
+    """
+    def __init__(self, idim, n_layers=2, n_chans=384, kernel_size=3, dropout_rate=0.1, offset=1.0):
+        """
+        Initialize duration predictor module.
+        Args:
+            idim (int): Input dimension.
+            n_layers (int, optional): Number of convolutional layers.
+            n_chans (int, optional): Number of channels of convolutional layers.
+            kernel_size (int, optional): Kernel size of convolutional layers.
+            dropout_rate (float, optional): Dropout rate.
+            offset (float, optional): Offset value to avoid nan in log domain.
+        """
+        super(DurationPredictor, self).__init__()
+        self.offset = offset
+        self.conv = torch.nn.ModuleList()
+        for idx in range(n_layers):
+            in_chans = idim if idx == 0 else n_chans
+            self.conv += [torch.nn.Sequential(torch.nn.Conv1d(in_chans, n_chans, kernel_size, stride=1, padding=(kernel_size - 1) // 2, ), torch.nn.ReLU(),
+                                              LayerNorm(n_chans, dim=1), torch.nn.Dropout(dropout_rate), )]
+        self.linear = torch.nn.Linear(n_chans, 1)
+    def _forward(self, xs, x_masks=None, is_inference=False):
+        xs = xs.transpose(1, -1)  # (B, idim, Tmax)
+        for f in self.conv:
+            xs = f(xs)  # (B, C, Tmax)
+        # NOTE: calculate in log domain
+        xs = self.linear(xs.transpose(1, -1)).squeeze(-1)  # (B, Tmax)
+        if is_inference:
+            # NOTE: calculate in linear domain
+            xs = torch.clamp(torch.round(xs.exp() - self.offset), min=0).long()  # avoid negative value
+        if x_masks is not None:
+            xs = xs.masked_fill(x_masks, 0.0)
+        return xs
+    def forward(self, xs, x_masks=None):
+        """
+        Calculate forward propagation.
+        Args:
+            xs (Tensor): Batch of input sequences (B, Tmax, idim).
+            x_masks (ByteTensor, optional):
+                Batch of masks indicating padded part (B, Tmax).
+        Returns:
+            Tensor: Batch of predicted durations in log domain (B, Tmax).
+        """
+        return self._forward(xs, x_masks, False)
+    def inference(self, xs, x_masks=None):
+        """
+        Inference duration.
+        Args:
+            xs (Tensor): Batch of input sequences (B, Tmax, idim).
+            x_masks (ByteTensor, optional):
+                Batch of masks indicating padded part (B, Tmax).
+        Returns:
+            LongTensor: Batch of predicted durations in linear domain (B, Tmax).
+        """
+        return self._forward(xs, x_masks, True)
+class DurationPredictorLoss(torch.nn.Module):
+    """
+    Loss function module for duration predictor.
+    The loss value is Calculated in log domain to make it Gaussian.
+    """
+    def __init__(self, offset=1.0, reduction="mean"):
+        """
+        Args:
+            offset (float, optional): Offset value to avoid nan in log domain.
+            reduction (str): Reduction type in loss calculation.
+        """
+        super(DurationPredictorLoss, self).__init__()
+        self.criterion = torch.nn.MSELoss(reduction=reduction)
+        self.offset = offset
+    def forward(self, outputs, targets):
+        """
+        Calculate forward propagation.
+        Args:
+            outputs (Tensor): Batch of prediction durations in log domain (B, T)
+            targets (LongTensor): Batch of groundtruth durations in linear domain (B, T)
+        Returns:
+            Tensor: Mean squared error loss value.
+        Note:
+            `outputs` is in log domain but `targets` is in linear domain.
+        """
+        # NOTE: outputs is in log domain while targets in linear
+        targets = torch.log(targets.float() + self.offset)
+        loss = self.criterion(outputs, targets)
+        return loss

Layers/EncoderLayer.py ADDED Viewed

	@@ -0,0 +1,144 @@

+# Copyright 2020 Johns Hopkins University (Shinji Watanabe)
+#                Northwestern Polytechnical University (Pengcheng Guo)
+#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
+# Adapted by Florian Lux 2021
+import torch
+from torch import nn
+from Layers.LayerNorm import LayerNorm
+class EncoderLayer(nn.Module):
+    """
+    Encoder layer module.
+    One block applies, in order: an optional macaron feed-forward module,
+    self-attention, an optional convolution module and a feed-forward
+    module, each wrapped in a residual connection.
+    Args:
+        size (int): Input dimension.
+        self_attn (torch.nn.Module): Self-attention module instance.
+            `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` instance
+            can be used as the argument.
+        feed_forward (torch.nn.Module): Feed-forward module instance.
+            `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance
+            can be used as the argument.
+        feed_forward_macaron (torch.nn.Module): Additional feed-forward module instance.
+            `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance
+            can be used as the argument.
+        conv_module (torch.nn.Module): Convolution module instance.
+            `ConvolutionModule` instance can be used as the argument.
+        dropout_rate (float): Dropout rate.
+        normalize_before (bool): Whether to use layer_norm before the first block.
+        concat_after (bool): Whether to concat attention layer's input and output.
+            if True, additional linear will be applied.
+            i.e. x -> x + linear(concat(x, att(x)))
+            if False, no additional linear will be applied. i.e. x -> x + att(x)
+    """
+    def __init__(self, size, self_attn, feed_forward, feed_forward_macaron, conv_module, dropout_rate, normalize_before=True, concat_after=False, ):
+        super(EncoderLayer, self).__init__()
+        self.self_attn = self_attn
+        self.feed_forward = feed_forward
+        self.feed_forward_macaron = feed_forward_macaron
+        self.conv_module = conv_module
+        self.norm_ff = LayerNorm(size)  # for the FFN module
+        self.norm_mha = LayerNorm(size)  # for the MHA module
+        if feed_forward_macaron is not None:
+            self.norm_ff_macaron = LayerNorm(size)
+            # with two feed-forward modules in the block, each one contributes with half weight
+            self.ff_scale = 0.5
+        else:
+            self.ff_scale = 1.0
+        if self.conv_module is not None:
+            self.norm_conv = LayerNorm(size)  # for the CNN module
+            self.norm_final = LayerNorm(size)  # for the final output of the block
+        self.dropout = nn.Dropout(dropout_rate)
+        self.size = size
+        self.normalize_before = normalize_before
+        self.concat_after = concat_after
+        if self.concat_after:
+            # maps the concatenation of attention input and output back to `size`
+            self.concat_linear = nn.Linear(size + size, size)
+    def forward(self, x_input, mask, cache=None):
+        """
+        Compute encoded features.
+        Args:
+            x_input (Union[Tuple, torch.Tensor]): Input tensor w/ or w/o pos emb.
+                - w/ pos emb: Tuple of tensors [(#batch, time, size), (1, time, size)].
+                - w/o pos emb: Tensor (#batch, time, size).
+            mask (torch.Tensor): Mask tensor for the input (#batch, time).
+            cache (torch.Tensor): Cache tensor of the input (#batch, time - 1, size).
+        Returns:
+            torch.Tensor: Output tensor (#batch, time, size); returned as the
+                tuple (output, pos_emb) when the input carried a positional embedding.
+            torch.Tensor: Mask tensor (#batch, time).
+        """
+        if isinstance(x_input, tuple):
+            x, pos_emb = x_input[0], x_input[1]
+        else:
+            x, pos_emb = x_input, None
+        # whether to use macaron style
+        if self.feed_forward_macaron is not None:
+            residual = x
+            if self.normalize_before:
+                x = self.norm_ff_macaron(x)
+            x = residual + self.ff_scale * self.dropout(self.feed_forward_macaron(x))
+            if not self.normalize_before:
+                x = self.norm_ff_macaron(x)
+        # multi-headed self-attention module
+        residual = x
+        if self.normalize_before:
+            x = self.norm_mha(x)
+        if cache is None:
+            x_q = x
+        else:
+            # with a cache only the newest time step is used as query; keys and values still span the full sequence
+            assert cache.shape == (x.shape[0], x.shape[1] - 1, self.size)
+            x_q = x[:, -1:, :]
+            residual = residual[:, -1:, :]
+            # NOTE(review): this indexing needs a 3-D mask, unlike the 2-D shape stated in the docstring; confirm with callers
+            mask = None if mask is None else mask[:, -1:, :]
+        if pos_emb is not None:
+            x_att = self.self_attn(x_q, x, x, pos_emb, mask)
+        else:
+            x_att = self.self_attn(x_q, x, x, mask)
+        if self.concat_after:
+            # no dropout is applied on this path
+            # NOTE(review): concatenates the full-length x, not x_q; verify the shapes when a cache is used
+            x_concat = torch.cat((x, x_att), dim=-1)
+            x = residual + self.concat_linear(x_concat)
+        else:
+            x = residual + self.dropout(x_att)
+        if not self.normalize_before:
+            x = self.norm_mha(x)
+        # convolution module
+        if self.conv_module is not None:
+            residual = x
+            if self.normalize_before:
+                x = self.norm_conv(x)
+            x = residual + self.dropout(self.conv_module(x))
+            if not self.normalize_before:
+                x = self.norm_conv(x)
+        # feed forward module
+        residual = x
+        if self.normalize_before:
+            x = self.norm_ff(x)
+        x = residual + self.ff_scale * self.dropout(self.feed_forward(x))
+        if not self.normalize_before:
+            x = self.norm_ff(x)
+        # the extra output normalization exists only in blocks that have a convolution module
+        if self.conv_module is not None:
+            x = self.norm_final(x)
+        if cache is not None:
+            # prepend the cached steps so the output covers the full sequence again
+            x = torch.cat([cache, x], dim=1)
+        if pos_emb is not None:
+            # hand the positional embedding on so that the next block receives it too
+            return (x, pos_emb), mask
+        return x, mask

Layers/LayerNorm.py ADDED Viewed

	@@ -0,0 +1,36 @@

+# Written by Shigeki Karita, 2019
+# Published under Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
+# Adapted by Florian Lux, 2021
+import torch
+class LayerNorm(torch.nn.LayerNorm):
+    """
+    Layer normalization module.
+    Args:
+        nout (int): Output dim size.
+        dim (int): Dimension to be normalized.
+    """
+    def __init__(self, nout, dim=-1):
+        """
+        Construct an LayerNorm object.
+        """
+        super(LayerNorm, self).__init__(nout, eps=1e-12)
+        self.dim = dim
+    def forward(self, x):
+        """
+        Apply layer normalization.
+        Args:
+            x (torch.Tensor): Input tensor.
+        Returns:
+            torch.Tensor: Normalized tensor.
+        """
+        if self.dim == -1:
+            return super(LayerNorm, self).forward(x)
+        return super(LayerNorm, self).forward(x.transpose(1, -1)).transpose(1, -1)

Layers/LengthRegulator.py ADDED Viewed

	@@ -0,0 +1,62 @@

+# Copyright 2019 Tomoki Hayashi
+# MIT License (https://opensource.org/licenses/MIT)
+# Adapted by Florian Lux 2021
+from abc import ABC
+import torch
+from Utility.utils import pad_list
+class LengthRegulator(torch.nn.Module, ABC):
+    """
+    Length regulator module for feed-forward Transformer.
+    This is a module of length regulator described in
+    `FastSpeech: Fast, Robust and Controllable Text to Speech`_.
+    The length regulator expands char or
+    phoneme-level embedding features to frame-level by repeating each
+    feature based on the corresponding predicted durations.
+    .. _`FastSpeech: Fast, Robust and Controllable Text to Speech`:
+        https://arxiv.org/pdf/1905.09263.pdf
+    """
+    def __init__(self, pad_value=0.0):
+        """
+        Initialize length regulator module.
+        Args:
+            pad_value (float, optional): Value used for padding.
+        """
+        super(LengthRegulator, self).__init__()
+        self.pad_value = pad_value
+    def forward(self, xs, ds, alpha=1.0):
+        """
+        Calculate forward propagation.
+        Args:
+            xs (Tensor): Batch of sequences of char or phoneme embeddings (B, Tmax, D).
+            ds (LongTensor): Batch of durations of each frame (B, T).
+            alpha (float, optional): Alpha value to control speed of speech.
+        Returns:
+            Tensor: replicated input tensor based on durations (B, T*, D).
+        """
+        if alpha != 1.0:
+            assert alpha > 0
+            ds = torch.round(ds.float() * alpha).long()
+        if ds.sum() == 0:
+            ds[ds.sum(dim=1).eq(0)] = 1
+        return pad_list([self._repeat_one_sequence(x, d) for x, d in zip(xs, ds)], self.pad_value)
+    def _repeat_one_sequence(self, x, d):
+        """
+        Repeat each frame according to duration
+        """
+        return torch.repeat_interleave(x, d, dim=0)

Layers/MultiLayeredConv1d.py ADDED Viewed

	@@ -0,0 +1,87 @@

+# Copyright 2019 Tomoki Hayashi
+# MIT License (https://opensource.org/licenses/MIT)
+# Adapted by Florian Lux 2021
+"""
+Layer modules for FFT block in FastSpeech (Feed-forward Transformer).
+"""
+import torch
+class MultiLayeredConv1d(torch.nn.Module):
+    """
+    Multi-layered conv1d for Transformer block.
+    This is a module of multi-layered conv1d designed
+    to replace positionwise feed-forward network
+    in Transformer block, which is introduced in
+    `FastSpeech: Fast, Robust and Controllable Text to Speech`_.
+    .. _`FastSpeech: Fast, Robust and Controllable Text to Speech`:
+        https://arxiv.org/pdf/1905.09263.pdf
+    """
+    def __init__(self, in_chans, hidden_chans, kernel_size, dropout_rate):
+        """
+        Initialize MultiLayeredConv1d module.
+        Args:
+            in_chans (int): Number of input channels.
+            hidden_chans (int): Number of hidden channels.
+            kernel_size (int): Kernel size of conv1d.
+            dropout_rate (float): Dropout rate.
+        """
+        super(MultiLayeredConv1d, self).__init__()
+        self.w_1 = torch.nn.Conv1d(in_chans, hidden_chans, kernel_size, stride=1, padding=(kernel_size - 1) // 2, )
+        self.w_2 = torch.nn.Conv1d(hidden_chans, in_chans, kernel_size, stride=1, padding=(kernel_size - 1) // 2, )
+        self.dropout = torch.nn.Dropout(dropout_rate)
+    def forward(self, x):
+        """
+        Calculate forward propagation.
+        Args:
+            x (torch.Tensor): Batch of input tensors (B, T, in_chans).
+        Returns:
+            torch.Tensor: Batch of output tensors (B, T, hidden_chans).
+        """
+        x = torch.relu(self.w_1(x.transpose(-1, 1))).transpose(-1, 1)
+        return self.w_2(self.dropout(x).transpose(-1, 1)).transpose(-1, 1)
+class Conv1dLinear(torch.nn.Module):
+    """
+    Conv1D + Linear for Transformer block.
+    A variant of MultiLayeredConv1d, which replaces second conv-layer to linear.
+    """
+    def __init__(self, in_chans, hidden_chans, kernel_size, dropout_rate):
+        """
+        Initialize Conv1dLinear module.
+        Args:
+            in_chans (int): Number of input channels.
+            hidden_chans (int): Number of hidden channels.
+            kernel_size (int): Kernel size of conv1d.
+            dropout_rate (float): Dropout rate.
+        """
+        super(Conv1dLinear, self).__init__()
+        self.w_1 = torch.nn.Conv1d(in_chans, hidden_chans, kernel_size, stride=1, padding=(kernel_size - 1) // 2, )
+        self.w_2 = torch.nn.Linear(hidden_chans, in_chans)
+        self.dropout = torch.nn.Dropout(dropout_rate)
+    def forward(self, x):
+        """
+        Calculate forward propagation.
+        Args:
+            x (torch.Tensor): Batch of input tensors (B, T, in_chans).
+        Returns:
+            torch.Tensor: Batch of output tensors (B, T, hidden_chans).
+        """
+        x = torch.relu(self.w_1(x.transpose(-1, 1))).transpose(-1, 1)
+        return self.w_2(self.dropout(x))

Layers/MultiSequential.py ADDED Viewed

	@@ -0,0 +1,33 @@

+# Written by Shigeki Karita, 2019
+# Published under Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
+# Adapted by Florian Lux, 2021
+import torch
+class MultiSequential(torch.nn.Sequential):
+    """
+    Multi-input multi-output torch.nn.Sequential.
+    """
+    def forward(self, *args):
+        """
+        Repeat.
+        """
+        for m in self:
+            args = m(*args)
+        return args
+def repeat(N, fn):
+    """
+    Repeat module N times.
+    Args:
+        N (int): Number of repeat time.
+        fn (Callable): Function to generate module.
+    Returns:
+        MultiSequential: Repeated model instance.
+    """
+    return MultiSequential(*[fn(n) for n in range(N)])

Layers/PositionalEncoding.py ADDED Viewed

	@@ -0,0 +1,166 @@

+"""
+Taken from ESPNet
+"""
+import math
+import torch
+class PositionalEncoding(torch.nn.Module):
+    """
+    Positional encoding.
+    Args:
+        d_model (int): Embedding dimension.
+        dropout_rate (float): Dropout rate.
+        max_len (int): Maximum input length.
+        reverse (bool): Whether to reverse the input position.
+    """
+    def __init__(self, d_model, dropout_rate, max_len=5000, reverse=False):
+        """
+        Construct an PositionalEncoding object.
+        """
+        super(PositionalEncoding, self).__init__()
+        self.d_model = d_model
+        self.reverse = reverse
+        self.xscale = math.sqrt(self.d_model)
+        self.dropout = torch.nn.Dropout(p=dropout_rate)
+        self.pe = None
+        self.extend_pe(torch.tensor(0.0, device=d_model.device).expand(1, max_len))
+    def extend_pe(self, x):
+        """
+        Reset the positional encodings.
+        """
+        if self.pe is not None:
+            if self.pe.size(1) >= x.size(1):
+                if self.pe.dtype != x.dtype or self.pe.device != x.device:
+                    self.pe = self.pe.to(dtype=x.dtype, device=x.device)
+                return
+        pe = torch.zeros(x.size(1), self.d_model)
+        if self.reverse:
+            position = torch.arange(x.size(1) - 1, -1, -1.0, dtype=torch.float32).unsqueeze(1)
+        else:
+            position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1)
+        div_term = torch.exp(torch.arange(0, self.d_model, 2, dtype=torch.float32) * -(math.log(10000.0) / self.d_model))
+        pe[:, 0::2] = torch.sin(position * div_term)
+        pe[:, 1::2] = torch.cos(position * div_term)
+        pe = pe.unsqueeze(0)
+        self.pe = pe.to(device=x.device, dtype=x.dtype)
+    def forward(self, x):
+        """
+        Add positional encoding.
+        Args:
+            x (torch.Tensor): Input tensor (batch, time, `*`).
+        Returns:
+            torch.Tensor: Encoded tensor (batch, time, `*`).
+        """
+        self.extend_pe(x)
+        x = x * self.xscale + self.pe[:, : x.size(1)]
+        return self.dropout(x)
+class RelPositionalEncoding(torch.nn.Module):
+    """
+    Relative positional encoding module (new implementation).
+    Details can be found in https://github.com/espnet/espnet/pull/2816.
+    See : Appendix B in https://arxiv.org/abs/1901.02860
+    Args:
+        d_model (int): Embedding dimension.
+        dropout_rate (float): Dropout rate.
+        max_len (int): Maximum input length.
+    """
+    def __init__(self, d_model, dropout_rate, max_len=5000):
+        """
+        Construct an PositionalEncoding object.
+        """
+        super(RelPositionalEncoding, self).__init__()
+        self.d_model = d_model
+        self.xscale = math.sqrt(self.d_model)
+        self.dropout = torch.nn.Dropout(p=dropout_rate)
+        self.pe = None
+        self.extend_pe(torch.tensor(0.0).expand(1, max_len))
+    def extend_pe(self, x):
+        """Reset the positional encodings."""
+        if self.pe is not None:
+            # self.pe contains both positive and negative parts
+            # the length of self.pe is 2 * input_len - 1
+            if self.pe.size(1) >= x.size(1) * 2 - 1:
+                if self.pe.dtype != x.dtype or self.pe.device != x.device:
+                    self.pe = self.pe.to(dtype=x.dtype, device=x.device)
+                return
+        # Suppose `i` means to the position of query vecotr and `j` means the
+        # position of key vector. We use position relative positions when keys
+        # are to the left (i>j) and negative relative positions otherwise (i<j).
+        pe_positive = torch.zeros(x.size(1), self.d_model, device=x.device)
+        pe_negative = torch.zeros(x.size(1), self.d_model, device=x.device)
+        position = torch.arange(0, x.size(1), dtype=torch.float32, device=x.device).unsqueeze(1)
+        div_term = torch.exp(torch.arange(0, self.d_model, 2, dtype=torch.float32, device=x.device) * -(math.log(10000.0) / self.d_model))
+        pe_positive[:, 0::2] = torch.sin(position * div_term)
+        pe_positive[:, 1::2] = torch.cos(position * div_term)
+        pe_negative[:, 0::2] = torch.sin(-1 * position * div_term)
+        pe_negative[:, 1::2] = torch.cos(-1 * position * div_term)
+        # Reserve the order of positive indices and concat both positive and
+        # negative indices. This is used to support the shifting trick
+        # as in https://arxiv.org/abs/1901.02860
+        pe_positive = torch.flip(pe_positive, [0]).unsqueeze(0)
+        pe_negative = pe_negative[1:].unsqueeze(0)
+        pe = torch.cat([pe_positive, pe_negative], dim=1)
+        self.pe = pe.to(dtype=x.dtype)
+    def forward(self, x):
+        """
+        Add positional encoding.
+        Args:
+            x (torch.Tensor): Input tensor (batch, time, `*`).
+        Returns:
+            torch.Tensor: Encoded tensor (batch, time, `*`).
+        """
+        self.extend_pe(x)
+        x = x * self.xscale
+        pos_emb = self.pe[:, self.pe.size(1) // 2 - x.size(1) + 1: self.pe.size(1) // 2 + x.size(1), ]
+        return self.dropout(x), self.dropout(pos_emb)
+class ScaledPositionalEncoding(PositionalEncoding):
+    """
+    Scaled positional encoding module.
+    See Sec. 3.2  https://arxiv.org/abs/1809.08895
+    Args:
+        d_model (int): Embedding dimension.
+        dropout_rate (float): Dropout rate.
+        max_len (int): Maximum input length.
+    """
+    def __init__(self, d_model, dropout_rate, max_len=5000):
+        super().__init__(d_model=d_model, dropout_rate=dropout_rate, max_len=max_len)
+        self.alpha = torch.nn.Parameter(torch.tensor(1.0))
+    def reset_parameters(self):
+        self.alpha.data = torch.tensor(1.0)
+    def forward(self, x):
+        """
+        Add positional encoding.
+        Args:
+            x (torch.Tensor): Input tensor (batch, time, `*`).
+        Returns:
+            torch.Tensor: Encoded tensor (batch, time, `*`).
+        """
+        self.extend_pe(x)
+        x = x + self.alpha * self.pe[:, : x.size(1)]
+        return self.dropout(x)

Layers/PositionwiseFeedForward.py ADDED Viewed

	@@ -0,0 +1,26 @@

+# Written by Shigeki Karita, 2019
+# Published under Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
+# Adapted by Florian Lux, 2021
+import torch
+class PositionwiseFeedForward(torch.nn.Module):
+    """
+    Args:
+        idim (int): Input dimenstion.
+        hidden_units (int): The number of hidden units.
+        dropout_rate (float): Dropout rate.
+    """
+    def __init__(self, idim, hidden_units, dropout_rate, activation=torch.nn.ReLU()):
+        super(PositionwiseFeedForward, self).__init__()
+        self.w_1 = torch.nn.Linear(idim, hidden_units)
+        self.w_2 = torch.nn.Linear(hidden_units, idim)
+        self.dropout = torch.nn.Dropout(dropout_rate)
+        self.activation = activation
+    def forward(self, x):
+        return self.w_2(self.dropout(self.activation(self.w_1(x))))

Layers/PostNet.py ADDED Viewed

	@@ -0,0 +1,74 @@

+"""
+Taken from ESPNet
+"""
+import torch
+class PostNet(torch.nn.Module):
+    """
+    From Tacotron2
+    Postnet module for Spectrogram prediction network.
+    This is a module of Postnet in Spectrogram prediction network,
+    which described in `Natural TTS Synthesis by
+    Conditioning WaveNet on Mel Spectrogram Predictions`_.
+    The Postnet refines the predicted
+    Mel-filterbank of the decoder,
+    which helps to compensate the detail sturcture of spectrogram.
+    .. _`Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions`:
+       https://arxiv.org/abs/1712.05884
+    """
+    def __init__(self, idim, odim, n_layers=5, n_chans=512, n_filts=5, dropout_rate=0.5, use_batch_norm=True):
+        """
+        Initialize postnet module.
+        Args:
+            idim (int): Dimension of the inputs.
+            odim (int): Dimension of the outputs.
+            n_layers (int, optional): The number of layers.
+            n_filts (int, optional): The number of filter size.
+            n_units (int, optional): The number of filter channels.
+            use_batch_norm (bool, optional): Whether to use batch normalization..
+            dropout_rate (float, optional): Dropout rate..
+        """
+        super(PostNet, self).__init__()
+        self.postnet = torch.nn.ModuleList()
+        for layer in range(n_layers - 1):
+            ichans = odim if layer == 0 else n_chans
+            ochans = odim if layer == n_layers - 1 else n_chans
+            if use_batch_norm:
+                self.postnet += [torch.nn.Sequential(torch.nn.Conv1d(ichans, ochans, n_filts, stride=1, padding=(n_filts - 1) // 2, bias=False, ),
+                                                     torch.nn.GroupNorm(num_groups=32, num_channels=ochans), torch.nn.Tanh(),
+                                                     torch.nn.Dropout(dropout_rate), )]
+            else:
+                self.postnet += [
+                    torch.nn.Sequential(torch.nn.Conv1d(ichans, ochans, n_filts, stride=1, padding=(n_filts - 1) // 2, bias=False, ), torch.nn.Tanh(),
+                                        torch.nn.Dropout(dropout_rate), )]
+        ichans = n_chans if n_layers != 1 else odim
+        if use_batch_norm:
+            self.postnet += [torch.nn.Sequential(torch.nn.Conv1d(ichans, odim, n_filts, stride=1, padding=(n_filts - 1) // 2, bias=False, ),
+                                                 torch.nn.GroupNorm(num_groups=20, num_channels=odim),
+                                                 torch.nn.Dropout(dropout_rate), )]
+        else:
+            self.postnet += [torch.nn.Sequential(torch.nn.Conv1d(ichans, odim, n_filts, stride=1, padding=(n_filts - 1) // 2, bias=False, ),
+                                                 torch.nn.Dropout(dropout_rate), )]
+    def forward(self, xs):
+        """
+        Calculate forward propagation.
+        Args:
+            xs (Tensor): Batch of the sequences of padded input tensors (B, idim, Tmax).
+        Returns:
+            Tensor: Batch of padded output tensor. (B, odim, Tmax).
+        """
+        for i in range(len(self.postnet)):
+            xs = self.postnet[i](xs)
+        return xs

Layers/ResidualBlock.py ADDED Viewed

	@@ -0,0 +1,98 @@

+# -*- coding: utf-8 -*-
+"""
+References:
+    - https://github.com/jik876/hifi-gan
+    - https://github.com/kan-bayashi/ParallelWaveGAN
+"""
+import torch
+class Conv1d(torch.nn.Conv1d):
+    """
+    Conv1d module with customized initialization.
+    """
+    def __init__(self, *args, **kwargs):
+        super(Conv1d, self).__init__(*args, **kwargs)
+    def reset_parameters(self):
+        torch.nn.init.kaiming_normal_(self.weight, nonlinearity="relu")
+        if self.bias is not None:
+            torch.nn.init.constant_(self.bias, 0.0)
+class Conv1d1x1(Conv1d):
+    """
+    1x1 Conv1d with customized initialization.
+    """
+    def __init__(self, in_channels, out_channels, bias):
+        super(Conv1d1x1, self).__init__(in_channels, out_channels, kernel_size=1, padding=0, dilation=1, bias=bias)
+class HiFiGANResidualBlock(torch.nn.Module):
+    """Residual block module in HiFiGAN."""
+    def __init__(self,
+                 kernel_size=3,
+                 channels=512,
+                 dilations=(1, 3, 5),
+                 bias=True,
+                 use_additional_convs=True,
+                 nonlinear_activation="LeakyReLU",
+                 nonlinear_activation_params={"negative_slope": 0.1}, ):
+        """
+        Initialize HiFiGANResidualBlock module.
+        Args:
+            kernel_size (int): Kernel size of dilation convolution layer.
+            channels (int): Number of channels for convolution layer.
+            dilations (List[int]): List of dilation factors.
+            use_additional_convs (bool): Whether to use additional convolution layers.
+            bias (bool): Whether to add bias parameter in convolution layers.
+            nonlinear_activation (str): Activation function module name.
+            nonlinear_activation_params (dict): Hyperparameters for activation function.
+        """
+        super().__init__()
+        self.use_additional_convs = use_additional_convs
+        self.convs1 = torch.nn.ModuleList()
+        if use_additional_convs:
+            self.convs2 = torch.nn.ModuleList()
+        assert kernel_size % 2 == 1, "Kernel size must be odd number."
+        for dilation in dilations:
+            self.convs1 += [torch.nn.Sequential(getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params),
+                                                torch.nn.Conv1d(channels,
+                                                                channels,
+                                                                kernel_size,
+                                                                1,
+                                                                dilation=dilation,
+                                                                bias=bias,
+                                                                padding=(kernel_size - 1) // 2 * dilation, ), )]
+            if use_additional_convs:
+                self.convs2 += [torch.nn.Sequential(getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params),
+                                                    torch.nn.Conv1d(channels,
+                                                                    channels,
+                                                                    kernel_size,
+                                                                    1,
+                                                                    dilation=1,
+                                                                    bias=bias,
+                                                                    padding=(kernel_size - 1) // 2, ), )]
+    def forward(self, x):
+        """
+        Calculate forward propagation.
+        Args:
+            x (Tensor): Input tensor (B, channels, T).
+        Returns:
+            Tensor: Output tensor (B, channels, T).
+        """
+        for idx in range(len(self.convs1)):
+            xt = self.convs1[idx](x)
+            if self.use_additional_convs:
+                xt = self.convs2[idx](xt)
+            x = xt + x
+        return x

Layers/ResidualStack.py ADDED Viewed

	@@ -0,0 +1,51 @@

+# Copyright 2019 Tomoki Hayashi
+# MIT License (https://opensource.org/licenses/MIT)
+# Adapted by Florian Lux 2021
+import torch
+class ResidualStack(torch.nn.Module):
+    def __init__(self, kernel_size=3, channels=32, dilation=1, bias=True, nonlinear_activation="LeakyReLU", nonlinear_activation_params={"negative_slope": 0.2},
+                 pad="ReflectionPad1d", pad_params={}, ):
+        """
+        Initialize ResidualStack module.
+        Args:
+            kernel_size (int): Kernel size of dilation convolution layer.
+            channels (int): Number of channels of convolution layers.
+            dilation (int): Dilation factor.
+            bias (bool): Whether to add bias parameter in convolution layers.
+            nonlinear_activation (str): Activation function module name.
+            nonlinear_activation_params (dict): Hyperparameters for activation function.
+            pad (str): Padding function module name before dilated convolution layer.
+            pad_params (dict): Hyperparameters for padding function.
+        """
+        super(ResidualStack, self).__init__()
+        # defile residual stack part
+        assert (kernel_size - 1) % 2 == 0, "Not support even number kernel size."
+        self.stack = torch.nn.Sequential(getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params),
+                                         getattr(torch.nn, pad)((kernel_size - 1) // 2 * dilation, **pad_params),
+                                         torch.nn.Conv1d(channels, channels, kernel_size, dilation=dilation, bias=bias),
+                                         getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params),
+                                         torch.nn.Conv1d(channels, channels, 1, bias=bias), )
+        # defile extra layer for skip connection
+        self.skip_layer = torch.nn.Conv1d(channels, channels, 1, bias=bias)
+    def forward(self, c):
+        """
+        Calculate forward propagation.
+        Args:
+            c (Tensor): Input tensor (B, channels, T).
+        Returns:
+            Tensor: Output tensor (B, chennels, T).
+        """
+        return self.stack(c) + self.skip_layer(c)

Layers/STFT.py ADDED Viewed

	@@ -0,0 +1,118 @@

+"""
+Taken from ESPNet
+"""
+import torch
+from torch.functional import stft as torch_stft
+from torch_complex.tensor import ComplexTensor
+from Utility.utils import make_pad_mask
+class STFT(torch.nn.Module):
+    def __init__(self, n_fft=512, win_length=None, hop_length=128, window="hann", center=True, normalized=False,
+                 onesided=True):
+        super().__init__()
+        self.n_fft = n_fft
+        if win_length is None:
+            self.win_length = n_fft
+        else:
+            self.win_length = win_length
+        self.hop_length = hop_length
+        self.center = center
+        self.normalized = normalized
+        self.onesided = onesided
+        self.window = window
+    def extra_repr(self):
+        return (f"n_fft={self.n_fft}, "
+                f"win_length={self.win_length}, "
+                f"hop_length={self.hop_length}, "
+                f"center={self.center}, "
+                f"normalized={self.normalized}, "
+                f"onesided={self.onesided}")
+    def forward(self, input_wave, ilens=None):
+        """
+        STFT forward function.
+        Args:
+            input_wave: (Batch, Nsamples) or (Batch, Nsample, Channels)
+            ilens: (Batch)
+        Returns:
+            output: (Batch, Frames, Freq, 2) or (Batch, Frames, Channels, Freq, 2)
+        """
+        bs = input_wave.size(0)
+        if input_wave.dim() == 3:
+            multi_channel = True
+            # input: (Batch, Nsample, Channels) -> (Batch * Channels, Nsample)
+            input_wave = input_wave.transpose(1, 2).reshape(-1, input_wave.size(1))
+        else:
+            multi_channel = False
+        # output: (Batch, Freq, Frames, 2=real_imag)
+        # or (Batch, Channel, Freq, Frames, 2=real_imag)
+        if self.window is not None:
+            window_func = getattr(torch, f"{self.window}_window")
+            window = window_func(self.win_length, dtype=input_wave.dtype, device=input_wave.device)
+        else:
+            window = None
+        complex_output = torch_stft(input=input_wave,
+                                    n_fft=self.n_fft,
+                                    win_length=self.win_length,
+                                    hop_length=self.hop_length,
+                                    center=self.center,
+                                    window=window,
+                                    normalized=self.normalized,
+                                    onesided=self.onesided,
+                                    return_complex=True)
+        output = torch.view_as_real(complex_output)
+        # output: (Batch, Freq, Frames, 2=real_imag)
+        # -> (Batch, Frames, Freq, 2=real_imag)
+        output = output.transpose(1, 2)
+        if multi_channel:
+            # output: (Batch * Channel, Frames, Freq, 2=real_imag)
+            # -> (Batch, Frame, Channel, Freq, 2=real_imag)
+            output = output.view(bs, -1, output.size(1), output.size(2), 2).transpose(1, 2)
+        if ilens is not None:
+            if self.center:
+                pad = self.win_length // 2
+                ilens = ilens + 2 * pad
+            olens = torch.div((ilens - self.win_length), self.hop_length, rounding_mode="trunc") + 1
+            output.masked_fill_(make_pad_mask(olens, output, 1), 0.0)
+        else:
+            olens = None
+        return output, olens
+    def inverse(self, input, ilens=None):
+        """
+        Inverse STFT.
+        Args:
+            input: Tensor(batch, T, F, 2) or ComplexTensor(batch, T, F)
+            ilens: (batch,)
+        Returns:
+            wavs: (batch, samples)
+            ilens: (batch,)
+        """
+        istft = torch.functional.istft
+        if self.window is not None:
+            window_func = getattr(torch, f"{self.window}_window")
+            window = window_func(self.win_length, dtype=input.dtype, device=input.device)
+        else:
+            window = None
+        if isinstance(input, ComplexTensor):
+            input = torch.stack([input.real, input.imag], dim=-1)
+        assert input.shape[-1] == 2
+        input = input.transpose(1, 2)
+        wavs = istft(input, n_fft=self.n_fft, hop_length=self.hop_length, win_length=self.win_length, window=window, center=self.center,
+                     normalized=self.normalized, onesided=self.onesided, length=ilens.max() if ilens is not None else ilens)
+        return wavs, ilens

Layers/Swish.py ADDED Viewed

	@@ -0,0 +1,18 @@

+# Copyright 2020 Johns Hopkins University (Shinji Watanabe)
+#                Northwestern Polytechnical University (Pengcheng Guo)
+#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
+# Adapted by Florian Lux 2021
+import torch
+class Swish(torch.nn.Module):
+    """
+    Construct an Swish activation function for Conformer.
+    """
+    def forward(self, x):
+        """
+        Return Swish activation function.
+        """
+        return x * torch.sigmoid(x)

Layers/VariancePredictor.py ADDED Viewed

	@@ -0,0 +1,65 @@

+# Copyright 2019 Tomoki Hayashi
+# MIT License (https://opensource.org/licenses/MIT)
+# Adapted by Florian Lux 2021
+from abc import ABC
+import torch
+from Layers.LayerNorm import LayerNorm
+class VariancePredictor(torch.nn.Module, ABC):
+    """
+    Variance predictor module.
+    This is a module of variance predictor described in `FastSpeech 2:
+    Fast and High-Quality End-to-End Text to Speech`_.
+    .. _`FastSpeech 2: Fast and High-Quality End-to-End Text to Speech`:
+        https://arxiv.org/abs/2006.04558
+    """
+    def __init__(self, idim, n_layers=2, n_chans=384, kernel_size=3, bias=True, dropout_rate=0.5, ):
+        """
+        Initilize duration predictor module.
+        Args:
+            idim (int): Input dimension.
+            n_layers (int, optional): Number of convolutional layers.
+            n_chans (int, optional): Number of channels of convolutional layers.
+            kernel_size (int, optional): Kernel size of convolutional layers.
+            dropout_rate (float, optional): Dropout rate.
+        """
+        super().__init__()
+        self.conv = torch.nn.ModuleList()
+        for idx in range(n_layers):
+            in_chans = idim if idx == 0 else n_chans
+            self.conv += [
+                torch.nn.Sequential(torch.nn.Conv1d(in_chans, n_chans, kernel_size, stride=1, padding=(kernel_size - 1) // 2, bias=bias, ), torch.nn.ReLU(),
+                                    LayerNorm(n_chans, dim=1), torch.nn.Dropout(dropout_rate), )]
+        self.linear = torch.nn.Linear(n_chans, 1)
+    def forward(self, xs, x_masks=None):
+        """
+        Calculate forward propagation.
+        Args:
+            xs (Tensor): Batch of input sequences (B, Tmax, idim).
+            x_masks (ByteTensor, optional):
+                Batch of masks indicating padded part (B, Tmax).
+        Returns:
+            Tensor: Batch of predicted sequences (B, Tmax, 1).
+        """
+        xs = xs.transpose(1, -1)  # (B, idim, Tmax)
+        for f in self.conv:
+            xs = f(xs)  # (B, C, Tmax)
+        xs = self.linear(xs.transpose(1, 2))  # (B, Tmax, 1)
+        if x_masks is not None:
+            xs = xs.masked_fill(x_masks, 0.0)
+        return xs

Layers/__init__.py ADDED Viewed

File without changes

Models/Aligner/__init__.py ADDED Viewed

File without changes

Models/FastSpeech2_Meta/__init__.py ADDED Viewed

File without changes

Models/HiFiGAN_combined/__init__.py ADDED Viewed

File without changes

Preprocessing/ArticulatoryCombinedTextFrontend.py ADDED Viewed

	@@ -0,0 +1,323 @@

+import re
+import sys
+import panphon
+import phonemizer
+import torch
+from Preprocessing.papercup_features import generate_feature_table
+class ArticulatoryCombinedTextFrontend:
+    def __init__(self,
+                 language,
+                 use_word_boundaries=False,  # goes together well with
+                 # parallel models and an aligner. Doesn't go together
+                 # well with autoregressive models.
+                 use_explicit_eos=True,
+                 use_prosody=False,  # unfortunately the non-segmental
+                 # nature of prosodic markers mixed with the sequential
+                 # phonemes hurts the performance of end-to-end models a
+                 # lot, even though one might think enriching the input
+                 # with such information would help.
+                 use_lexical_stress=False,
+                 silent=True,
+                 allow_unknown=False,
+                 add_silence_to_end=True,
+                 strip_silence=True):
+        """
+        Mostly preparing ID lookups
+        """
+        self.strip_silence = strip_silence
+        self.use_word_boundaries = use_word_boundaries
+        self.allow_unknown = allow_unknown
+        self.use_explicit_eos = use_explicit_eos
+        self.use_prosody = use_prosody
+        self.use_stress = use_lexical_stress
+        self.add_silence_to_end = add_silence_to_end
+        self.feature_table = panphon.FeatureTable()
+        if language == "en":
+            self.g2p_lang = "en-us"
+            self.expand_abbreviations = english_text_expansion
+            if not silent:
+                print("Created an English Text-Frontend")
+        elif language == "de":
+            self.g2p_lang = "de"
+            self.expand_abbreviations = lambda x: x
+            if not silent:
+                print("Created a German Text-Frontend")
+        elif language == "el":
+            self.g2p_lang = "el"
+            self.expand_abbreviations = lambda x: x
+            if not silent:
+                print("Created a Greek Text-Frontend")
+        elif language == "es":
+            self.g2p_lang = "es"
+            self.expand_abbreviations = lambda x: x
+            if not silent:
+                print("Created a Spanish Text-Frontend")
+        elif language == "fi":
+            self.g2p_lang = "fi"
+            self.expand_abbreviations = lambda x: x
+            if not silent:
+                print("Created a Finnish Text-Frontend")
+        elif language == "ru":
+            self.g2p_lang = "ru"
+            self.expand_abbreviations = lambda x: x
+            if not silent:
+                print("Created a Russian Text-Frontend")
+        elif language == "hu":
+            self.g2p_lang = "hu"
+            self.expand_abbreviations = lambda x: x
+            if not silent:
+                print("Created a Hungarian Text-Frontend")
+        elif language == "nl":
+            self.g2p_lang = "nl"
+            self.expand_abbreviations = lambda x: x
+            if not silent:
+                print("Created a Dutch Text-Frontend")
+        elif language == "fr":
+            self.g2p_lang = "fr-fr"
+            self.expand_abbreviations = lambda x: x
+            if not silent:
+                print("Created a French Text-Frontend")
+        elif language == "it":
+            self.g2p_lang = "it"
+            self.expand_abbreviations = lambda x: x
+            if not silent:
+                print("Created a Italian Text-Frontend")
+        elif language == "pt":
+            self.g2p_lang = "pt"
+            self.expand_abbreviations = lambda x: x
+            if not silent:
+                print("Created a Portuguese Text-Frontend")
+        elif language == "pl":
+            self.g2p_lang = "pl"
+            self.expand_abbreviations = lambda x: x
+            if not silent:
+                print("Created a Polish Text-Frontend")
+        # remember to also update get_language_id() when adding something here
+        else:
+            print("Language not supported yet")
+            sys.exit()
+        self.phone_to_vector_papercup = generate_feature_table()
+        self.phone_to_vector = dict()
+        for phone in self.phone_to_vector_papercup:
+            panphon_features = self.feature_table.word_to_vector_list(phone, numeric=True)
+            if panphon_features == []:
+                panphon_features = [[0] * 24]
+            papercup_features = self.phone_to_vector_papercup[phone]
+            self.phone_to_vector[phone] = papercup_features + panphon_features[0]
+        self.phone_to_id = {  # this lookup must be updated manually, because the only
+            # other way would be extracting them from a set, which can be non-deterministic
+            '~': 0,
+            '#': 1,
+            '?': 2,
+            '!': 3,
+            '.': 4,
+            'ɜ': 5,
+            'ɫ': 6,
+            'ə': 7,
+            'ɚ': 8,
+            'a': 9,
+            'ð': 10,
+            'ɛ': 11,
+            'ɪ': 12,
+            'ᵻ': 13,
+            'ŋ': 14,
+            'ɔ': 15,
+            'ɒ': 16,
+            'ɾ': 17,
+            'ʃ': 18,
+            'θ': 19,
+            'ʊ': 20,
+            'ʌ': 21,
+            'ʒ': 22,
+            'æ': 23,
+            'b': 24,
+            'ʔ': 25,
+            'd': 26,
+            'e': 27,
+            'f': 28,
+            'g': 29,
+            'h': 30,
+            'i': 31,
+            'j': 32,
+            'k': 33,
+            'l': 34,
+            'm': 35,
+            'n': 36,
+            'ɳ': 37,
+            'o': 38,
+            'p': 39,
+            'ɡ': 40,
+            'ɹ': 41,
+            'r': 42,
+            's': 43,
+            't': 44,
+            'u': 45,
+            'v': 46,
+            'w': 47,
+            'x': 48,
+            'z': 49,
+            'ʀ': 50,
+            'ø': 51,
+            'ç': 52,
+            'ɐ': 53,
+            'œ': 54,
+            'y': 55,
+            'ʏ': 56,
+            'ɑ': 57,
+            'c': 58,
+            'ɲ': 59,
+            'ɣ': 60,
+            'ʎ': 61,
+            'β': 62,
+            'ʝ': 63,
+            'ɟ': 64,
+            'q': 65,
+            'ɕ': 66,
+            'ʲ': 67,
+            'ɭ': 68,
+            'ɵ': 69,
+            'ʑ': 70,
+            'ʋ': 71,
+            'ʁ': 72,
+            'ɨ': 73,
+            'ʂ': 74,
+            'ɬ': 75,
+            }  # for the states of the ctc loss and dijkstra/mas in the aligner
+        self.id_to_phone = {v: k for k, v in self.phone_to_id.items()}
+    def string_to_tensor(self, text, view=False, device="cpu", handle_missing=True, input_phonemes=False):
+        """
+        Fixes unicode errors, expands some abbreviations,
+        turns graphemes into phonemes and then vectorizes
+        the sequence as articulatory features
+        """
+        if input_phonemes:
+            phones = text
+        else:
+            phones = self.get_phone_string(text=text, include_eos_symbol=True)
+        if view:
+            print("Phonemes: \n{}\n".format(phones))
+        phones_vector = list()
+        # turn into numeric vectors
+        for char in phones:
+            if handle_missing:
+                try:
+                    phones_vector.append(self.phone_to_vector[char])
+                except KeyError:
+                    print("unknown phoneme: {}".format(char))
+            else:
+                phones_vector.append(self.phone_to_vector[char])  # leave error handling to elsewhere
+        return torch.Tensor(phones_vector, device=device)
+    def get_phone_string(self, text, include_eos_symbol=True):
+        """
+        Converts a text into a phoneme string using the espeak backend
+        of the phonemizer and normalizes pauses, punctuation and word
+        boundaries according to the settings stored on this frontend.
+
+        text: the raw text to phonemize
+        include_eos_symbol: if True, the end-of-sentence symbol '#' is appended
+
+        returns the phoneme string, which always starts with the pause symbol '~'
+        """
+        # expand abbreviations
+        utt = self.expand_abbreviations(text)
+        # phonemize
+        # the replacements that follow the phonemizer call map ; : " - and ... to a comma, turn / and
+        # newlines and tabs into spaces, drop — ¡ ¿, then turn every comma into the pause symbol ~ and
+        # finally remove a few combining diacritics (these are hard to see in most editors)
+        phones = phonemizer.phonemize(utt,
+                                      language_switch='remove-flags',
+                                      backend="espeak",
+                                      language=self.g2p_lang,
+                                      preserve_punctuation=True,
+                                      strip=True,
+                                      punctuation_marks=';:,.!?¡¿—…"«»“”~/',
+                                      with_stress=self.use_stress).replace(";", ",").replace("/", " ").replace("—", "") \
+            .replace(":", ",").replace('"', ",").replace("-", ",").replace("...", ",").replace("-", ",").replace("\n", " ") \
+            .replace("\t", " ").replace("¡", "").replace("¿", "").replace(",", "~").replace(" ̃", "").replace('̩', "").replace("̃", "").replace("̪", "")
+        # less than 1 wide characters hidden here
+        # collapse runs of pause symbols into a single one
+        phones = re.sub("~+", "~", phones)
+        if not self.use_prosody:
+            # retain ~ as heuristic pause marker, even though all other symbols are removed with this option.
+            # also retain . ? and ! since they can be indicators for the stop token
+            phones = phones.replace("ˌ", "").replace("ː", "").replace("ˑ", "") \
+                .replace("˘", "").replace("|", "").replace("‖", "")
+        if not self.use_word_boundaries:
+            # word boundaries are discarded entirely
+            phones = phones.replace(" ", "")
+        else:
+            # word boundaries are kept, but they are represented by the pause symbol
+            phones = re.sub(r"\s+", " ", phones)
+            phones = re.sub(" ", "~", phones)
+        if self.strip_silence:
+            phones = phones.lstrip("~").rstrip("~")
+        if self.add_silence_to_end:
+            phones += "~"  # adding a silence in the end during add_silence_to_end produces more natural sounding prosody
+        if include_eos_symbol:
+            phones += "#"
+        # a leading pause is always added, even if strip_silence removed one before
+        phones = "~" + phones
+        # the steps above can create new runs of pause symbols, so collapse them once more
+        phones = re.sub("~+", "~", phones)
+        return phones
+def english_text_expansion(text):
+    """
+    Apply as small part of the tacotron style text cleaning pipeline, suitable for e.g. LJSpeech.
+    See https://github.com/keithito/tacotron/
+    Careful: Only apply to english datasets. Different languages need different cleaners.
+    """
+    _abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in
+                      [('Mrs.', 'misess'), ('Mr.', 'mister'), ('Dr.', 'doctor'), ('St.', 'saint'), ('Co.', 'company'), ('Jr.', 'junior'), ('Maj.', 'major'),
+                       ('Gen.', 'general'), ('Drs.', 'doctors'), ('Rev.', 'reverend'), ('Lt.', 'lieutenant'), ('Hon.', 'honorable'), ('Sgt.', 'sergeant'),
+                       ('Capt.', 'captain'), ('Esq.', 'esquire'), ('Ltd.', 'limited'), ('Col.', 'colonel'), ('Ft.', 'fort')]]
+    for regex, replacement in _abbreviations:
+        text = re.sub(regex, replacement, text)
+    return text
+def get_language_id(language):
+    if language == "en":
+        return torch.LongTensor([0])
+    elif language == "de":
+        return torch.LongTensor([1])
+    elif language == "el":
+        return torch.LongTensor([2])
+    elif language == "es":
+        return torch.LongTensor([3])
+    elif language == "fi":
+        return torch.LongTensor([4])
+    elif language == "ru":
+        return torch.LongTensor([5])
+    elif language == "hu":
+        return torch.LongTensor([6])
+    elif language == "nl":
+        return torch.LongTensor([7])
+    elif language == "fr":
+        return torch.LongTensor([8])
+    elif language == "pt":
+        return torch.LongTensor([9])
+    elif language == "pl":
+        return torch.LongTensor([10])
+    elif language == "it":
+        return torch.LongTensor([11])
+if __name__ == '__main__':
+    # test an English utterance
+    tfr_en = ArticulatoryCombinedTextFrontend(language="en")
+    print(tfr_en.string_to_tensor("This is a complex sentence, it even has a pause! But can it do this? Nice.", view=True))
+    tfr_en = ArticulatoryCombinedTextFrontend(language="de")
+    print(tfr_en.string_to_tensor("Alles klar, jetzt testen wir einen deutschen Satz. Ich hoffe es gibt nicht mehr viele unspezifizierte Phoneme.", view=True))

Preprocessing/AudioPreprocessor.py ADDED Viewed

	@@ -0,0 +1,166 @@

+import librosa
+import librosa.core as lb
+import librosa.display as lbd
+import matplotlib.pyplot as plt
+import numpy
+import numpy as np
+import pyloudnorm as pyln
+import torch
+from torchaudio.transforms import Resample
+class AudioPreprocessor:
+    def __init__(self, input_sr, output_sr=None, melspec_buckets=80, hop_length=256, n_fft=1024, cut_silence=False, device="cpu"):
+        """
+        The parameters are by default set up to do well
+        on a 16kHz signal. A different sampling rate may
+        require different hop_length and n_fft (e.g.
+        doubling frequency --> doubling hop_length and
+        doubling n_fft)
+        """
+        self.cut_silence = cut_silence
+        self.device = device
+        self.sr = input_sr
+        self.new_sr = output_sr
+        self.hop_length = hop_length
+        self.n_fft = n_fft
+        self.mel_buckets = melspec_buckets
+        self.meter = pyln.Meter(input_sr)
+        self.final_sr = input_sr
+        if cut_silence:
+            torch.hub._validate_not_a_forked_repo = lambda a, b, c: True  # torch 1.9 has a bug in the hub loading, this is a workaround
+            # careful: assumes 16kHz or 8kHz audio
+            self.silero_model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
+                                                      model='silero_vad',
+                                                      force_reload=False,
+                                                      onnx=False,
+                                                      verbose=False)
+            (self.get_speech_timestamps,
+             self.save_audio,
+             self.read_audio,
+             self.VADIterator,
+             self.collect_chunks) = utils
+            self.silero_model = self.silero_model.to(self.device)
+        if output_sr is not None and output_sr != input_sr:
+            self.resample = Resample(orig_freq=input_sr, new_freq=output_sr).to(self.device)
+            self.final_sr = output_sr
+        else:
+            self.resample = lambda x: x
+    def cut_silence_from_audio(self, audio):
+        """
+        https://github.com/snakers4/silero-vad
+        """
+        return self.collect_chunks(self.get_speech_timestamps(audio, self.silero_model, sampling_rate=self.final_sr), audio)
+    def to_mono(self, x):
+        """
+        make sure we deal with a 1D array
+        """
+        if len(x.shape) == 2:
+            return lb.to_mono(numpy.transpose(x))
+        else:
+            return x
+    def normalize_loudness(self, audio):
+        """
+        normalize the amplitudes according to
+        their decibels, so this should turn any
+        signal with different magnitudes into
+        the same magnitude by analysing loudness
+        """
+        loudness = self.meter.integrated_loudness(audio)
+        loud_normed = pyln.normalize.loudness(audio, loudness, -30.0)
+        peak = numpy.amax(numpy.abs(loud_normed))
+        peak_normed = numpy.divide(loud_normed, peak)
+        return peak_normed
+    def logmelfilterbank(self, audio, sampling_rate, fmin=40, fmax=8000, eps=1e-10):
+        """
+        Compute log-Mel filterbank
+        one day this could be replaced by torchaudio's internal log10(melspec(audio)), but
+        for some reason it gives slightly different results, so in order not to break backwards
+        compatibility, this is kept for now. If there is ever a reason to completely re-train
+        all models, this would be a good opportunity to make the switch.
+        """
+        if isinstance(audio, torch.Tensor):
+            audio = audio.numpy()
+        # get amplitude spectrogram
+        x_stft = librosa.stft(audio, n_fft=self.n_fft, hop_length=self.hop_length, win_length=None, window="hann", pad_mode="reflect")
+        spc = np.abs(x_stft).T
+        # get mel basis
+        fmin = 0 if fmin is None else fmin
+        fmax = sampling_rate / 2 if fmax is None else fmax
+        mel_basis = librosa.filters.mel(sampling_rate, self.n_fft, self.mel_buckets, fmin, fmax)
+        # apply log and return
+        return torch.Tensor(np.log10(np.maximum(eps, np.dot(spc, mel_basis.T)))).transpose(0, 1)
+    def normalize_audio(self, audio):
+        """
+        one function to apply them all in an
+        order that makes sense.
+        """
+        audio = self.to_mono(audio)
+        audio = self.normalize_loudness(audio)
+        audio = torch.Tensor(audio).to(self.device)
+        audio = self.resample(audio)
+        if self.cut_silence:
+            audio = self.cut_silence_from_audio(audio)
+        return audio.to("cpu")
+    def visualize_cleaning(self, unclean_audio):
+        """
+        displays Mel Spectrogram of unclean audio
+        and then displays Mel Spectrogram of the
+        cleaned version.
+        """
+        fig, ax = plt.subplots(nrows=2, ncols=1)
+        unclean_audio_mono = self.to_mono(unclean_audio)
+        unclean_spec = self.audio_to_mel_spec_tensor(unclean_audio_mono, normalize=False).numpy()
+        clean_spec = self.audio_to_mel_spec_tensor(unclean_audio_mono, normalize=True).numpy()
+        lbd.specshow(unclean_spec, sr=self.sr, cmap='GnBu', y_axis='mel', ax=ax[0], x_axis='time')
+        ax[0].set(title='Uncleaned Audio')
+        ax[0].label_outer()
+        if self.new_sr is not None:
+            lbd.specshow(clean_spec, sr=self.new_sr, cmap='GnBu', y_axis='mel', ax=ax[1], x_axis='time')
+        else:
+            lbd.specshow(clean_spec, sr=self.sr, cmap='GnBu', y_axis='mel', ax=ax[1], x_axis='time')
+        ax[1].set(title='Cleaned Audio')
+        ax[1].label_outer()
+        plt.show()
+    def audio_to_wave_tensor(self, audio, normalize=True):
+        if normalize:
+            return self.normalize_audio(audio)
+        else:
+            if isinstance(audio, torch.Tensor):
+                return audio
+            else:
+                return torch.Tensor(audio)
+    def audio_to_mel_spec_tensor(self, audio, normalize=True, explicit_sampling_rate=None):
+        """
+        explicit_sampling_rate is for when
+        normalization has already been applied
+        and that included resampling. No way
+        to detect the current sr of the incoming
+        audio
+        """
+        if explicit_sampling_rate is None:
+            if normalize:
+                audio = self.normalize_audio(audio)
+                return self.logmelfilterbank(audio=audio, sampling_rate=self.final_sr)
+            return self.logmelfilterbank(audio=audio, sampling_rate=self.sr)
+        if normalize:
+            audio = self.normalize_audio(audio)
+        return self.logmelfilterbank(audio=audio, sampling_rate=explicit_sampling_rate)
+if __name__ == '__main__':
+    import soundfile
+    wav, sr = soundfile.read("../audios/test.wav")
+    ap = AudioPreprocessor(input_sr=sr, output_sr=16000)
+    ap.visualize_cleaning(wav)

Preprocessing/ProsodicConditionExtractor.py ADDED Viewed

	@@ -0,0 +1,40 @@

+import soundfile as sf
+import torch
+import torch.multiprocessing
+import torch.multiprocessing
+from numpy import trim_zeros
+from speechbrain.pretrained import EncoderClassifier
+from Preprocessing.AudioPreprocessor import AudioPreprocessor
+class ProsodicConditionExtractor:
+    def __init__(self, sr, device=torch.device("cpu")):
+        self.ap = AudioPreprocessor(input_sr=sr, output_sr=16000, melspec_buckets=80, hop_length=256, n_fft=1024, cut_silence=False)
+        # https://huggingface.co/speechbrain/spkrec-ecapa-voxceleb
+        self.speaker_embedding_func_ecapa = EncoderClassifier.from_hparams(source="speechbrain/spkrec-ecapa-voxceleb",
+                                                                           run_opts={"device": str(device)},
+                                                                           savedir="Models/SpeakerEmbedding/speechbrain_speaker_embedding_ecapa")
+        # https://huggingface.co/speechbrain/spkrec-xvect-voxceleb
+        self.speaker_embedding_func_xvector = EncoderClassifier.from_hparams(source="speechbrain/spkrec-xvect-voxceleb",
+                                                                             run_opts={"device": str(device)},
+                                                                             savedir="Models/SpeakerEmbedding/speechbrain_speaker_embedding_xvector")
+    def extract_condition_from_reference_wave(self, wave, already_normalized=False):
+        if already_normalized:
+            norm_wave = wave
+        else:
+            norm_wave = self.ap.audio_to_wave_tensor(normalize=True, audio=wave)
+            norm_wave = torch.tensor(trim_zeros(norm_wave.numpy()))
+        spk_emb_ecapa = self.speaker_embedding_func_ecapa.encode_batch(wavs=norm_wave.unsqueeze(0)).squeeze()
+        spk_emb_xvector = self.speaker_embedding_func_xvector.encode_batch(wavs=norm_wave.unsqueeze(0)).squeeze()
+        combined_utt_condition = torch.cat([spk_emb_ecapa.cpu(),
+                                            spk_emb_xvector.cpu()], dim=0)
+        return combined_utt_condition
+if __name__ == '__main__':
+    wave, sr = sf.read("../audios/1.wav")
+    ext = ProsodicConditionExtractor(sr=sr)
+    print(ext.extract_condition_from_reference_wave(wave=wave).shape)

Preprocessing/__init__.py ADDED Viewed

File without changes

Preprocessing/papercup_features.py ADDED Viewed

	@@ -0,0 +1,637 @@

+# Derived from an open-source resource provided by Papercup Technologies Limited
+# Resource-Author: Marlene Staib
+# Modified by Florian Lux, 2021
+def generate_feature_lookup():
+    return {
+        '~': {'symbol_type': 'silence'},
+        '#': {'symbol_type': 'end of sentence'},
+        '?': {'symbol_type': 'questionmark'},
+        '!': {'symbol_type': 'exclamationmark'},
+        '.': {'symbol_type': 'fullstop'},
+        'ɜ': {
+            'symbol_type'      : 'phoneme',
+            'vowel_consonant'  : 'vowel',
+            'VUV'              : 'voiced',
+            'vowel_frontness'  : 'central',
+            'vowel_openness'   : 'open-mid',
+            'vowel_roundedness': 'unrounded',
+            },
+        'ɫ': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'voiced',
+            'consonant_place' : 'alveolar',
+            'consonant_manner': 'lateral-approximant',
+            },
+        'ə': {
+            'symbol_type'      : 'phoneme',
+            'vowel_consonant'  : 'vowel',
+            'VUV'              : 'voiced',
+            'vowel_frontness'  : 'central',
+            'vowel_openness'   : 'mid',
+            'vowel_roundedness': 'unrounded',
+            },
+        'ɚ': {
+            'symbol_type'      : 'phoneme',
+            'vowel_consonant'  : 'vowel',
+            'VUV'              : 'voiced',
+            'vowel_frontness'  : 'central',
+            'vowel_openness'   : 'mid',
+            'vowel_roundedness': 'unrounded',
+            },
+        'a': {
+            'symbol_type'      : 'phoneme',
+            'vowel_consonant'  : 'vowel',
+            'VUV'              : 'voiced',
+            'vowel_frontness'  : 'front',
+            'vowel_openness'   : 'open',
+            'vowel_roundedness': 'unrounded',
+            },
+        'ð': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'voiced',
+            'consonant_place' : 'dental',
+            'consonant_manner': 'fricative'
+            },
+        'ɛ': {
+            'symbol_type'      : 'phoneme',
+            'vowel_consonant'  : 'vowel',
+            'VUV'              : 'voiced',
+            'vowel_frontness'  : 'front',
+            'vowel_openness'   : 'open-mid',
+            'vowel_roundedness': 'unrounded',
+            },
+        'ɪ': {
+            'symbol_type'      : 'phoneme',
+            'vowel_consonant'  : 'vowel',
+            'VUV'              : 'voiced',
+            'vowel_frontness'  : 'front_central',
+            'vowel_openness'   : 'close_close-mid',
+            'vowel_roundedness': 'unrounded',
+            },
+        'ᵻ': {
+            'symbol_type'      : 'phoneme',
+            'vowel_consonant'  : 'vowel',
+            'VUV'              : 'voiced',
+            'vowel_frontness'  : 'central',
+            'vowel_openness'   : 'close',
+            'vowel_roundedness': 'unrounded',
+            },
+        'ŋ': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'voiced',
+            'consonant_place' : 'velar',
+            'consonant_manner': 'nasal'
+            },
+        'ɔ': {
+            'symbol_type'      : 'phoneme',
+            'vowel_consonant'  : 'vowel',
+            'VUV'              : 'voiced',
+            'vowel_frontness'  : 'back',
+            'vowel_openness'   : 'open-mid',
+            'vowel_roundedness': 'rounded',
+            },
+        'ɒ': {
+            'symbol_type'      : 'phoneme',
+            'vowel_consonant'  : 'vowel',
+            'VUV'              : 'voiced',
+            'vowel_frontness'  : 'back',
+            'vowel_openness'   : 'open',
+            'vowel_roundedness': 'rounded',
+            },
+        'ɾ': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'voiced',
+            'consonant_place' : 'alveolar',
+            'consonant_manner': 'tap'
+            },
+        'ʃ': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'unvoiced',
+            'consonant_place' : 'postalveolar',
+            'consonant_manner': 'fricative'
+            },
+        'θ': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'unvoiced',
+            'consonant_place' : 'dental',
+            'consonant_manner': 'fricative'
+            },
+        'ʊ': {
+            'symbol_type'      : 'phoneme',
+            'vowel_consonant'  : 'vowel',
+            'VUV'              : 'voiced',
+            'vowel_frontness'  : 'central_back',
+            'vowel_openness'   : 'close_close-mid',
+            'vowel_roundedness': 'unrounded'
+            },
+        'ʌ': {
+            'symbol_type'      : 'phoneme',
+            'vowel_consonant'  : 'vowel',
+            'VUV'              : 'voiced',
+            'vowel_frontness'  : 'back',
+            'vowel_openness'   : 'open-mid',
+            'vowel_roundedness': 'unrounded'
+            },
+        'ʒ': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'voiced',
+            'consonant_place' : 'postalveolar',
+            'consonant_manner': 'fricative'
+            },
+        'æ': {
+            'symbol_type'      : 'phoneme',
+            'vowel_consonant'  : 'vowel',
+            'VUV'              : 'voiced',
+            'vowel_frontness'  : 'front',
+            'vowel_openness'   : 'open-mid_open',
+            'vowel_roundedness': 'unrounded'
+            },
+        'b': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'voiced',
+            'consonant_place' : 'bilabial',
+            'consonant_manner': 'stop'
+            },
+        'ʔ': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'unvoiced',
+            'consonant_place' : 'glottal',
+            'consonant_manner': 'stop'
+            },
+        'd': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'voiced',
+            'consonant_place' : 'alveolar',
+            'consonant_manner': 'stop'
+            },
+        'e': {
+            'symbol_type'      : 'phoneme',
+            'vowel_consonant'  : 'vowel',
+            'VUV'              : 'voiced',
+            'vowel_frontness'  : 'front',
+            'vowel_openness'   : 'close-mid',
+            'vowel_roundedness': 'unrounded'
+            },
+        'f': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'unvoiced',
+            'consonant_place' : 'labiodental',
+            'consonant_manner': 'fricative'
+            },
+        'g': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'voiced',
+            'consonant_place' : 'velar',
+            'consonant_manner': 'stop'
+            },
+        'h': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'unvoiced',
+            'consonant_place' : 'glottal',
+            'consonant_manner': 'fricative'
+            },
+        'i': {
+            'symbol_type'      : 'phoneme',
+            'vowel_consonant'  : 'vowel',
+            'VUV'              : 'voiced',
+            'vowel_frontness'  : 'front',
+            'vowel_openness'   : 'close',
+            'vowel_roundedness': 'unrounded'
+            },
+        'j': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'voiced',
+            'consonant_place' : 'palatal',
+            'consonant_manner': 'approximant'
+            },
+        'k': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'unvoiced',
+            'consonant_place' : 'velar',
+            'consonant_manner': 'stop'
+            },
+        'l': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'voiced',
+            'consonant_place' : 'alveolar',
+            'consonant_manner': 'lateral-approximant'
+            },
+        'm': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'voiced',
+            'consonant_place' : 'bilabial',
+            'consonant_manner': 'nasal'
+            },
+        'n': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'voiced',
+            'consonant_place' : 'alveolar',
+            'consonant_manner': 'nasal'
+            },
+        'ɳ': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'voiced',
+            'consonant_place' : 'palatal',
+            'consonant_manner': 'nasal'
+            },
+        'o': {
+            'symbol_type'      : 'phoneme',
+            'vowel_consonant'  : 'vowel',
+            'VUV'              : 'voiced',
+            'vowel_frontness'  : 'back',
+            'vowel_openness'   : 'close-mid',
+            'vowel_roundedness': 'rounded'
+            },
+        'p': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'unvoiced',
+            'consonant_place' : 'bilabial',
+            'consonant_manner': 'stop'
+            },
+        'ɡ': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'voiced',
+            'consonant_place' : 'velar',
+            'consonant_manner': 'stop'
+            },
+        'ɹ': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'voiced',
+            'consonant_place' : 'alveolar',
+            'consonant_manner': 'approximant'
+            },
+        'r': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'voiced',
+            'consonant_place' : 'alveolar',
+            'consonant_manner': 'trill'
+            },
+        's': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'unvoiced',
+            'consonant_place' : 'alveolar',
+            'consonant_manner': 'fricative'
+            },
+        't': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'unvoiced',
+            'consonant_place' : 'alveolar',
+            'consonant_manner': 'stop'
+            },
+        'u': {
+            'symbol_type'      : 'phoneme',
+            'vowel_consonant'  : 'vowel',
+            'VUV'              : 'voiced',
+            'vowel_frontness'  : 'back',
+            'vowel_openness'   : 'close',
+            'vowel_roundedness': 'rounded',
+            },
+        'v': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'voiced',
+            'consonant_place' : 'labiodental',
+            'consonant_manner': 'fricative'
+            },
+        'w': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'voiced',
+            'consonant_place' : 'labial-velar',
+            'consonant_manner': 'approximant'
+            },
+        'x': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'unvoiced',
+            'consonant_place' : 'velar',
+            'consonant_manner': 'fricative'
+            },
+        'z': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'voiced',
+            'consonant_place' : 'alveolar',
+            'consonant_manner': 'fricative'
+            },
+        'ʀ': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'voiced',
+            'consonant_place' : 'uvular',
+            'consonant_manner': 'trill'
+            },
+        'ø': {
+            'symbol_type'      : 'phoneme',
+            'vowel_consonant'  : 'vowel',
+            'VUV'              : 'voiced',
+            'vowel_frontness'  : 'front',
+            'vowel_openness'   : 'close-mid',
+            'vowel_roundedness': 'rounded'
+            },
+        'ç': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'unvoiced',
+            'consonant_place' : 'palatal',
+            'consonant_manner': 'fricative'
+            },
+        'ɐ': {
+            'symbol_type'      : 'phoneme',
+            'vowel_consonant'  : 'vowel',
+            'VUV'              : 'voiced',
+            'vowel_frontness'  : 'central',
+            'vowel_openness'   : 'open',
+            'vowel_roundedness': 'unrounded'
+            },
+        'œ': {
+            'symbol_type'      : 'phoneme',
+            'vowel_consonant'  : 'vowel',
+            'VUV'              : 'voiced',
+            'vowel_frontness'  : 'front',
+            'vowel_openness'   : 'open-mid',
+            'vowel_roundedness': 'rounded'
+            },
+        'y': {
+            'symbol_type'      : 'phoneme',
+            'vowel_consonant'  : 'vowel',
+            'VUV'              : 'voiced',
+            'vowel_frontness'  : 'front',
+            'vowel_openness'   : 'close',
+            'vowel_roundedness': 'rounded'
+            },
+        'ʏ': {
+            'symbol_type'      : 'phoneme',
+            'vowel_consonant'  : 'vowel',
+            'VUV'              : 'voiced',
+            'vowel_frontness'  : 'front_central',
+            'vowel_openness'   : 'close_close-mid',
+            'vowel_roundedness': 'rounded'
+            },
+        'ɑ': {
+            'symbol_type'      : 'phoneme',
+            'vowel_consonant'  : 'vowel',
+            'VUV'              : 'voiced',
+            'vowel_frontness'  : 'back',
+            'vowel_openness'   : 'open',
+            'vowel_roundedness': 'unrounded'
+            },
+        'c': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'unvoiced',
+            'consonant_place' : 'palatal',
+            'consonant_manner': 'stop'
+            },
+        'ɲ': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'voiced',
+            'consonant_place' : 'palatal',
+            'consonant_manner': 'nasal'
+            },
+        'ɣ': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'voiced',
+            'consonant_place' : 'velar',
+            'consonant_manner': 'fricative'
+            },
+        'ʎ': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'voiced',
+            'consonant_place' : 'palatal',
+            'consonant_manner': 'lateral-approximant'
+            },
+        'β': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'voiced',
+            'consonant_place' : 'bilabial',
+            'consonant_manner': 'fricative'
+            },
+        'ʝ': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'voiced',
+            'consonant_place' : 'palatal',
+            'consonant_manner': 'fricative'
+            },
+        'ɟ': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'voiced',
+            'consonant_place' : 'palatal',
+            'consonant_manner': 'stop'
+            },
+        'q': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'unvoiced',
+            'consonant_place' : 'uvular',
+            'consonant_manner': 'stop'
+            },
+        'ɕ': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'unvoiced',
+            'consonant_place' : 'alveolopalatal',
+            'consonant_manner': 'fricative'
+            },
+        'ʲ': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'voiced',
+            'consonant_place' : 'palatal',
+            'consonant_manner': 'approximant'
+            },
+        'ɭ': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'voiced',
+            'consonant_place' : 'palatal',  # should be retroflex, but palatal should be close enough
+            'consonant_manner': 'lateral-approximant'
+            },
+        'ɵ': {
+            'symbol_type'      : 'phoneme',
+            'vowel_consonant'  : 'vowel',
+            'VUV'              : 'voiced',
+            'vowel_frontness'  : 'central',
+            'vowel_openness'   : 'open-mid',
+            'vowel_roundedness': 'rounded'
+            },
+        'ʑ': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'voiced',
+            'consonant_place' : 'alveolopalatal',
+            'consonant_manner': 'fricative'
+            },
+        'ʋ': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'voiced',
+            'consonant_place' : 'labiodental',
+            'consonant_manner': 'approximant'
+            },
+        'ʁ': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'voiced',
+            'consonant_place' : 'uvular',
+            'consonant_manner': 'fricative'
+            },
+        'ɨ': {
+            'symbol_type'      : 'phoneme',
+            'vowel_consonant'  : 'vowel',
+            'VUV'              : 'voiced',
+            'vowel_frontness'  : 'central',
+            'vowel_openness'   : 'close',
+            'vowel_roundedness': 'unrounded'
+            },
+        'ʂ': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'unvoiced',
+            'consonant_place' : 'palatal',  # should be retroflex, but palatal should be close enough
+            'consonant_manner': 'fricative'
+            },
+        'ɬ': {
+            'symbol_type'     : 'phoneme',
+            'vowel_consonant' : 'consonant',
+            'VUV'             : 'unvoiced',
+            'consonant_place' : 'alveolar',  # should be noted it's also lateral, but should be close enough
+            'consonant_manner': 'fricative'
+            },
+        }  # REMEMBER to also add the phonemes added here to the ID lookup table in the TextFrontend as the new highest ID
+def generate_feature_table():
+    ipa_to_phonemefeats = generate_feature_lookup()
+    feat_types = set()
+    for ipa in ipa_to_phonemefeats:
+        if len(ipa) == 1:
+            [feat_types.add(feat) for feat in ipa_to_phonemefeats[ipa].keys()]
+    feat_to_val_set = dict()
+    for feat in feat_types:
+        feat_to_val_set[feat] = set()
+    for ipa in ipa_to_phonemefeats:
+        if len(ipa) == 1:
+            for feat in ipa_to_phonemefeats[ipa]:
+                feat_to_val_set[feat].add(ipa_to_phonemefeats[ipa][feat])
+    # print(feat_to_val_set)
+    value_list = set()
+    for val_set in [feat_to_val_set[feat] for feat in feat_to_val_set]:
+        for value in val_set:
+            value_list.add(value)
+    # print("{")
+    # for index, value in enumerate(list(value_list)):
+    #     print('"{}":{},'.format(value,index))
+    # print("}")
+    value_to_index = {
+        "dental"             : 0,
+        "postalveolar"       : 1,
+        "mid"                : 2,
+        "close-mid"          : 3,
+        "vowel"              : 4,
+        "silence"            : 5,
+        "consonant"          : 6,
+        "close"              : 7,
+        "velar"              : 8,
+        "stop"               : 9,
+        "palatal"            : 10,
+        "nasal"              : 11,
+        "glottal"            : 12,
+        "central"            : 13,
+        "back"               : 14,
+        "approximant"        : 15,
+        "uvular"             : 16,
+        "open-mid"           : 17,
+        "front_central"      : 18,
+        "front"              : 19,
+        "end of sentence"    : 20,
+        "labiodental"        : 21,
+        "close_close-mid"    : 22,
+        "labial-velar"       : 23,
+        "unvoiced"           : 24,
+        "central_back"       : 25,
+        "trill"              : 26,
+        "rounded"            : 27,
+        "open-mid_open"      : 28,
+        "tap"                : 29,
+        "alveolar"           : 30,
+        "bilabial"           : 31,
+        "phoneme"            : 32,
+        "open"               : 33,
+        "fricative"          : 34,
+        "unrounded"          : 35,
+        "lateral-approximant": 36,
+        "voiced"             : 37,
+        "questionmark"       : 38,
+        "exclamationmark"    : 39,
+        "fullstop"           : 40,
+        "alveolopalatal"     : 41
+        }
+    phone_to_vector = dict()
+    for ipa in ipa_to_phonemefeats:
+        if len(ipa) == 1:
+            phone_to_vector[ipa] = [0] * sum([len(values) for values in [feat_to_val_set[feat] for feat in feat_to_val_set]])
+            for feat in ipa_to_phonemefeats[ipa]:
+                if ipa_to_phonemefeats[ipa][feat] in value_to_index:
+                    phone_to_vector[ipa][value_to_index[ipa_to_phonemefeats[ipa][feat]]] = 1
+    for feat in feat_to_val_set:
+        for value in feat_to_val_set[feat]:
+            if value not in value_to_index:
+                print(f"Unknown feature value in featureset! {value}")
+    # print(f"{sum([len(values) for values in [feat_to_val_set[feat] for feat in feat_to_val_set]])} should be 42")
+    return phone_to_vector
+def generate_phone_to_id_lookup():
+    ipa_to_phonemefeats = generate_feature_lookup()
+    count = 0
+    phone_to_id = dict()
+    for key in sorted(list(ipa_to_phonemefeats)):  # careful: non-deterministic
+        phone_to_id[key] = count
+        count += 1
+    return phone_to_id
+if __name__ == '__main__':
+    print(generate_phone_to_id_lookup())

README.md CHANGED Viewed

@@ -1,8 +1,8 @@
 ---
 title: SpeechCloning
-emoji: 📈
-colorFrom: gray
-colorTo: yellow
 sdk: gradio
 app_file: app.py
 pinned: false

 ---
 title: SpeechCloning
+emoji: 🦜
+colorFrom: purple
+colorTo: red
 sdk: gradio
 app_file: app.py
 pinned: false

TrainingInterfaces/Text_to_Spectrogram/AutoAligner/Aligner.py ADDED Viewed

	@@ -0,0 +1,287 @@

+"""
+taken and adapted from https://github.com/as-ideas/DeepForcedAligner
+"""
+import matplotlib.pyplot as plt
+import numpy as np
+import torch
+import torch.multiprocessing
+import torch.nn as nn
+from scipy.sparse import coo_matrix
+from scipy.sparse.csgraph import dijkstra
+from torch.nn import CTCLoss
+from torch.nn.utils.rnn import pack_padded_sequence
+from torch.nn.utils.rnn import pad_packed_sequence
+from Preprocessing.ArticulatoryCombinedTextFrontend import ArticulatoryCombinedTextFrontend
+class BatchNormConv(nn.Module):
+    def __init__(self, in_channels: int, out_channels: int, kernel_size: int):
+        super().__init__()
+        self.conv = nn.Conv1d(
+            in_channels, out_channels, kernel_size,
+            stride=1, padding=kernel_size // 2, bias=False)
+        self.bnorm = nn.BatchNorm1d(out_channels)
+        self.relu = nn.ReLU()
+    def forward(self, x):
+        x = x.transpose(1, 2)
+        x = self.conv(x)
+        x = self.relu(x)
+        x = self.bnorm(x)
+        x = x.transpose(1, 2)
+        return x
+class Aligner(torch.nn.Module):
+    def __init__(self,
+                 n_mels=80,
+                 num_symbols=145,
+                 lstm_dim=512,
+                 conv_dim=512):
+        super().__init__()
+        self.convs = nn.ModuleList([
+            BatchNormConv(n_mels, conv_dim, 3),
+            nn.Dropout(p=0.5),
+            BatchNormConv(conv_dim, conv_dim, 3),
+            nn.Dropout(p=0.5),
+            BatchNormConv(conv_dim, conv_dim, 3),
+            nn.Dropout(p=0.5),
+            BatchNormConv(conv_dim, conv_dim, 3),
+            nn.Dropout(p=0.5),
+            BatchNormConv(conv_dim, conv_dim, 3),
+            nn.Dropout(p=0.5),
+            ])
+        self.rnn = torch.nn.LSTM(conv_dim, lstm_dim, batch_first=True, bidirectional=True)
+        self.proj = torch.nn.Linear(2 * lstm_dim, num_symbols)
+        self.tf = ArticulatoryCombinedTextFrontend(language="en")
+        self.ctc_loss = CTCLoss(blank=144, zero_infinity=True)
+        self.vector_to_id = dict()
+        for phone in self.tf.phone_to_vector:
+            self.vector_to_id[tuple(self.tf.phone_to_vector[phone])] = self.tf.phone_to_id[phone]
+    def forward(self, x, lens=None):
+        for conv in self.convs:
+            x = conv(x)
+        if lens is not None:
+            x = pack_padded_sequence(x, lens.cpu(), batch_first=True, enforce_sorted=False)
+        x, _ = self.rnn(x)
+        if lens is not None:
+            x, _ = pad_packed_sequence(x, batch_first=True)
+        x = self.proj(x)
+        return x
+    @torch.no_grad()
+    def label_speech(self, speech):
+        # theoretically possible, but doesn't work well at all. Would probably require a beamsearch
+        probabilities_of_phones_over_frames = self(speech.unsqueeze(0)).squeeze()[:, :73]
+        smoothed_phone_probs_over_frames = list()
+        for index, _ in enumerate(probabilities_of_phones_over_frames):
+            access_safe_prev_index = max(0, index - 1)
+            access_safe_next_index = min(index + 1, len(probabilities_of_phones_over_frames) - 1)
+            smoothed_probs = (probabilities_of_phones_over_frames[access_safe_prev_index] +
+                              probabilities_of_phones_over_frames[access_safe_next_index] +
+                              probabilities_of_phones_over_frames[index]) / 3
+            smoothed_phone_probs_over_frames.append(smoothed_probs.unsqueeze(0))
+        print(torch.cat(smoothed_phone_probs_over_frames))
+        _, phone_ids_over_frames = torch.max(torch.cat(smoothed_phone_probs_over_frames), dim=1)
+        phone_ids = torch.unique_consecutive(phone_ids_over_frames)
+        phones = list()
+        for id_of_phone in phone_ids:
+            phones.append(self.tf.id_to_phone[int(id_of_phone)])
+        return "".join(phones)
+    @torch.inference_mode()
+    def inference(self, mel, tokens, save_img_for_debug=None, train=False, pathfinding="MAS", return_ctc=False):
+        if not train:
+            tokens_indexed = list()  # first we need to convert the articulatory vectors to IDs, so we can apply dijkstra or viterbi
+            for vector in tokens:
+                tokens_indexed.append(self.vector_to_id[tuple(vector.cpu().detach().numpy().tolist())])
+            tokens = np.asarray(tokens_indexed)
+        else:
+            tokens = tokens.cpu().detach().numpy()
+        pred = self(mel.unsqueeze(0))
+        if return_ctc:
+            ctc_loss = self.ctc_loss(pred.transpose(0, 1).log_softmax(2), torch.LongTensor(tokens), torch.LongTensor([len(pred[0])]),
+                                     torch.LongTensor([len(tokens)])).item()
+        pred = pred.squeeze().cpu().detach().numpy()
+        pred_max = pred[:, tokens]
+        path_probs = 1. - pred_max
+        adj_matrix = to_adj_matrix(path_probs)
+        if pathfinding == "MAS":
+            alignment_matrix = binarize_alignment(pred_max)
+            if save_img_for_debug is not None:
+                phones = list()
+                for index in tokens:
+                    for phone in self.tf.phone_to_id:
+                        if self.tf.phone_to_id[phone] == index:
+                            phones.append(phone)
+                fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 4))
+                ax.imshow(alignment_matrix, interpolation='nearest', aspect='auto', origin="lower", cmap='cividis')
+                ax.set_ylabel("Mel-Frames")
+                ax.set_xticks(range(len(pred_max[0])))
+                ax.set_xticklabels(labels=phones)
+                ax.set_title("MAS Path")
+                plt.tight_layout()
+                fig.savefig(save_img_for_debug)
+                fig.clf()
+                plt.close()
+            if return_ctc:
+                return alignment_matrix, ctc_loss
+            return alignment_matrix
+        elif pathfinding == "dijkstra":
+            dist_matrix, predecessors, *_ = dijkstra(csgraph=adj_matrix,
+                                                     directed=True,
+                                                     indices=0,
+                                                     return_predecessors=True)
+            path = []
+            pr_index = predecessors[-1]
+            while pr_index != 0:
+                path.append(pr_index)
+                pr_index = predecessors[pr_index]
+            path.reverse()
+            # append first and last node
+            path = [0] + path + [dist_matrix.size - 1]
+            cols = path_probs.shape[1]
+            mel_text = {}
+            # collect indices (mel, text) along the path
+            for node_index in path:
+                i, j = from_node_index(node_index, cols)
+                mel_text[i] = j
+            path_plot = np.zeros_like(pred_max)
+            for i in mel_text:
+                path_plot[i][mel_text[i]] = 1.0
+            if save_img_for_debug is not None:
+                phones = list()
+                for index in tokens:
+                    for phone in self.tf.phone_to_id:
+                        if self.tf.phone_to_id[phone] == index:
+                            phones.append(phone)
+                fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(10, 9))
+                ax[0].imshow(pred_max, interpolation='nearest', aspect='auto', origin="lower")
+                ax[1].imshow(path_plot, interpolation='nearest', aspect='auto', origin="lower", cmap='cividis')
+                ax[0].set_ylabel("Mel-Frames")
+                ax[1].set_ylabel("Mel-Frames")
+                ax[0].set_xticks(range(len(pred_max[0])))
+                ax[0].set_xticklabels(labels=phones)
+                ax[1].set_xticks(range(len(pred_max[0])))
+                ax[1].set_xticklabels(labels=phones)
+                ax[0].set_title("Path Probabilities")
+                ax[1].set_title("Dijkstra Path")
+                plt.tight_layout()
+                fig.savefig(save_img_for_debug)
+                fig.clf()
+                plt.close()
+            if return_ctc:
+                return path_plot, ctc_loss
+            return path_plot
+def binarize_alignment(alignment_prob):
+    """
+    # Implementation by:
+    # https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/SpeechSynthesis/FastPitch/fastpitch/alignment.py
+    # https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/SpeechSynthesis/FastPitch/fastpitch/attn_loss_function.py
+    Binarizes alignment with MAS.
+    """
+    # assumes mel x text
+    opt = np.zeros_like(alignment_prob)
+    alignment_prob = alignment_prob + (np.abs(alignment_prob).max() + 1.0)  # make all numbers positive and add an offset to avoid log of 0 later
+    alignment_prob * alignment_prob * (1.0 / alignment_prob.max())  # normalize to (0,  1]
+    attn_map = np.log(alignment_prob)
+    attn_map[0, 1:] = -np.inf
+    log_p = np.zeros_like(attn_map)
+    log_p[0, :] = attn_map[0, :]
+    prev_ind = np.zeros_like(attn_map, dtype=np.int64)
+    for i in range(1, attn_map.shape[0]):
+        for j in range(attn_map.shape[1]):  # for each text dim
+            prev_log = log_p[i - 1, j]
+            prev_j = j
+            if j - 1 >= 0 and log_p[i - 1, j - 1] >= log_p[i - 1, j]:
+                prev_log = log_p[i - 1, j - 1]
+                prev_j = j - 1
+            log_p[i, j] = attn_map[i, j] + prev_log
+            prev_ind[i, j] = prev_j
+    # now backtrack
+    curr_text_idx = attn_map.shape[1] - 1
+    for i in range(attn_map.shape[0] - 1, -1, -1):
+        opt[i, curr_text_idx] = 1
+        curr_text_idx = prev_ind[i, curr_text_idx]
+    opt[0, curr_text_idx] = 1
+    return opt
+def to_node_index(i, j, cols):
+    return cols * i + j
+def from_node_index(node_index, cols):
+    return node_index // cols, node_index % cols
+def to_adj_matrix(mat):
+    rows = mat.shape[0]
+    cols = mat.shape[1]
+    row_ind = []
+    col_ind = []
+    data = []
+    for i in range(rows):
+        for j in range(cols):
+            node = to_node_index(i, j, cols)
+            if j < cols - 1:
+                right_node = to_node_index(i, j + 1, cols)
+                weight_right = mat[i, j + 1]
+                row_ind.append(node)
+                col_ind.append(right_node)
+                data.append(weight_right)
+            if i < rows - 1 and j < cols:
+                bottom_node = to_node_index(i + 1, j, cols)
+                weight_bottom = mat[i + 1, j]
+                row_ind.append(node)
+                col_ind.append(bottom_node)
+                data.append(weight_bottom)
+            if i < rows - 1 and j < cols - 1:
+                bottom_right_node = to_node_index(i + 1, j + 1, cols)
+                weight_bottom_right = mat[i + 1, j + 1]
+                row_ind.append(node)
+                col_ind.append(bottom_right_node)
+                data.append(weight_bottom_right)
+    adj_mat = coo_matrix((data, (row_ind, col_ind)), shape=(rows * cols, rows * cols))
+    return adj_mat.tocsr()

TrainingInterfaces/Text_to_Spectrogram/AutoAligner/AlignerDataset.py ADDED Viewed

	@@ -0,0 +1,211 @@

+import os
+import random
+import warnings
+import soundfile as sf
+import torch
+from numpy import trim_zeros
+from speechbrain.pretrained import EncoderClassifier
+from torch.multiprocessing import Manager
+from torch.multiprocessing import Process
+from torch.multiprocessing import set_start_method
+from torch.utils.data import Dataset
+from tqdm import tqdm
+from Preprocessing.ArticulatoryCombinedTextFrontend import ArticulatoryCombinedTextFrontend
+from Preprocessing.AudioPreprocessor import AudioPreprocessor
+class AlignerDataset(Dataset):
+    def __init__(self,
+                 path_to_transcript_dict,
+                 cache_dir,
+                 lang,
+                 loading_processes=30,  # careful with the amount of processes if you use silence removal, only as many processes as you have cores
+                 min_len_in_seconds=1,
+                 max_len_in_seconds=20,
+                 cut_silences=False,
+                 rebuild_cache=False,
+                 verbose=False,
+                 device="cpu"):
+        os.makedirs(cache_dir, exist_ok=True)
+        if not os.path.exists(os.path.join(cache_dir, "aligner_train_cache.pt")) or rebuild_cache:
+            if (device == "cuda" or device == torch.device("cuda")) and cut_silences:
+                try:
+                    set_start_method('spawn')  # in order to be able to make use of cuda in multiprocessing
+                except RuntimeError:
+                    pass
+            elif cut_silences:
+                torch.set_num_threads(1)
+            if cut_silences:
+                torch.hub.load(repo_or_dir='snakers4/silero-vad',
+                               model='silero_vad',
+                               force_reload=False,
+                               onnx=False,
+                               verbose=False)  # download and cache for it to be loaded and used later
+                torch.set_grad_enabled(True)
+            resource_manager = Manager()
+            self.path_to_transcript_dict = resource_manager.dict(path_to_transcript_dict)
+            key_list = list(self.path_to_transcript_dict.keys())
+            with open(os.path.join(cache_dir, "files_used.txt"), encoding='utf8', mode="w") as files_used_note:
+                files_used_note.write(str(key_list))
+            random.shuffle(key_list)
+            # build cache
+            print("... building dataset cache ...")
+            self.datapoints = resource_manager.list()
+            # make processes
+            key_splits = list()
+            process_list = list()
+            for i in range(loading_processes):
+                key_splits.append(key_list[i * len(key_list) // loading_processes:(i + 1) * len(key_list) // loading_processes])
+            for key_split in key_splits:
+                process_list.append(
+                    Process(target=self.cache_builder_process,
+                            args=(key_split,
+                                  lang,
+                                  min_len_in_seconds,
+                                  max_len_in_seconds,
+                                  cut_silences,
+                                  verbose,
+                                  device),
+                            daemon=True))
+                process_list[-1].start()
+            for process in process_list:
+                process.join()
+            self.datapoints = list(self.datapoints)
+            tensored_datapoints = list()
+            # we had to turn all of the tensors to numpy arrays to avoid shared memory
+            # issues. Now that the multi-processing is over, we can convert them back
+            # to tensors to save on conversions in the future.
+            print("Converting into convenient format...")
+            norm_waves = list()
+            for datapoint in tqdm(self.datapoints):
+                tensored_datapoints.append([torch.Tensor(datapoint[0]),
+                                            torch.LongTensor(datapoint[1]),
+                                            torch.Tensor(datapoint[2]),
+                                            torch.LongTensor(datapoint[3])])
+                norm_waves.append(torch.Tensor(datapoint[-1]))
+            self.datapoints = tensored_datapoints
+            pop_indexes = list()
+            for index, el in enumerate(self.datapoints):
+                try:
+                    if len(el[0][0]) != 66:
+                        pop_indexes.append(index)
+                except TypeError:
+                    pop_indexes.append(index)
+            for pop_index in sorted(pop_indexes, reverse=True):
+                print(f"There seems to be a problem in the transcriptions. Deleting datapoint {pop_index}.")
+                self.datapoints.pop(pop_index)
+            # add speaker embeddings
+            self.speaker_embeddings = list()
+            speaker_embedding_func_ecapa = EncoderClassifier.from_hparams(source="speechbrain/spkrec-ecapa-voxceleb",
+                                                                          run_opts={"device": str(device)},
+                                                                          savedir="Models/SpeakerEmbedding/speechbrain_speaker_embedding_ecapa")
+            with torch.no_grad():
+                for wave in tqdm(norm_waves):
+                    self.speaker_embeddings.append(speaker_embedding_func_ecapa.encode_batch(wavs=wave.to(device).unsqueeze(0)).squeeze().cpu())
+            # save to cache
+            torch.save((self.datapoints, norm_waves, self.speaker_embeddings), os.path.join(cache_dir, "aligner_train_cache.pt"))
+        else:
+            # just load the datapoints from cache
+            self.datapoints = torch.load(os.path.join(cache_dir, "aligner_train_cache.pt"), map_location='cpu')
+            if len(self.datapoints) == 2:
+                # speaker embeddings are still missing, have to add them here
+                wave_datapoints = self.datapoints[1]
+                self.datapoints = self.datapoints[0]
+                self.speaker_embeddings = list()
+                speaker_embedding_func_ecapa = EncoderClassifier.from_hparams(source="speechbrain/spkrec-ecapa-voxceleb",
+                                                                              run_opts={"device": str(device)},
+                                                                              savedir="Models/SpeakerEmbedding/speechbrain_speaker_embedding_ecapa")
+                with torch.no_grad():
+                    for wave in tqdm(wave_datapoints):
+                        self.speaker_embeddings.append(speaker_embedding_func_ecapa.encode_batch(wavs=wave.to(device).unsqueeze(0)).squeeze().cpu())
+                torch.save((self.datapoints, wave_datapoints, self.speaker_embeddings), os.path.join(cache_dir, "aligner_train_cache.pt"))
+            else:
+                self.speaker_embeddings = self.datapoints[2]
+                self.datapoints = self.datapoints[0]
+        self.tf = ArticulatoryCombinedTextFrontend(language=lang, use_word_boundaries=True)
+        print(f"Prepared an Aligner dataset with {len(self.datapoints)} datapoints in {cache_dir}.")
+    def cache_builder_process(self,
+                              path_list,
+                              lang,
+                              min_len,
+                              max_len,
+                              cut_silences,
+                              verbose,
+                              device):
+        process_internal_dataset_chunk = list()
+        tf = ArticulatoryCombinedTextFrontend(language=lang, use_word_boundaries=False)
+        _, sr = sf.read(path_list[0])
+        ap = AudioPreprocessor(input_sr=sr, output_sr=16000, melspec_buckets=80, hop_length=256, n_fft=1024, cut_silence=cut_silences, device=device)
+        for path in tqdm(path_list):
+            if self.path_to_transcript_dict[path].strip() == "":
+                continue
+            wave, sr = sf.read(path)
+            dur_in_seconds = len(wave) / sr
+            if not (min_len <= dur_in_seconds <= max_len):
+                if verbose:
+                    print(f"Excluding {path} because of its duration of {round(dur_in_seconds, 2)} seconds.")
+                continue
+            try:
+                with warnings.catch_warnings():
+                    warnings.simplefilter("ignore")  # otherwise we get tons of warnings about an RNN not being in contiguous chunks
+                    norm_wave = ap.audio_to_wave_tensor(normalize=True, audio=wave)
+            except ValueError:
+                continue
+            dur_in_seconds = len(norm_wave) / 16000
+            if not (min_len <= dur_in_seconds <= max_len):
+                if verbose:
+                    print(f"Excluding {path} because of its duration of {round(dur_in_seconds, 2)} seconds.")
+                continue
+            norm_wave = torch.tensor(trim_zeros(norm_wave.numpy()))
+            # raw audio preprocessing is done
+            transcript = self.path_to_transcript_dict[path]
+            try:
+                cached_text = tf.string_to_tensor(transcript, handle_missing=False).squeeze(0).cpu().numpy()
+            except KeyError:
+                tf.string_to_tensor(transcript, handle_missing=True).squeeze(0).cpu().numpy()
+                continue  # we skip sentences with unknown symbols
+            try:
+                if len(cached_text[0]) != 66:
+                    print(f"There seems to be a problem with the following transcription: {transcript}")
+                    continue
+            except TypeError:
+                print(f"There seems to be a problem with the following transcription: {transcript}")
+                continue
+            cached_text_len = torch.LongTensor([len(cached_text)]).numpy()
+            cached_speech = ap.audio_to_mel_spec_tensor(audio=norm_wave, normalize=False, explicit_sampling_rate=16000).transpose(0, 1).cpu().numpy()
+            cached_speech_len = torch.LongTensor([len(cached_speech)]).numpy()
+            process_internal_dataset_chunk.append([cached_text,
+                                                   cached_text_len,
+                                                   cached_speech,
+                                                   cached_speech_len,
+                                                   norm_wave.cpu().detach().numpy()])
+        self.datapoints += process_internal_dataset_chunk
+    def __getitem__(self, index):
+        text_vector = self.datapoints[index][0]
+        tokens = list()
+        for vector in text_vector:
+            for phone in self.tf.phone_to_vector:
+                if vector.numpy().tolist() == self.tf.phone_to_vector[phone]:
+                    tokens.append(self.tf.phone_to_id[phone])
+                    # this is terribly inefficient, but it's good enough for testing for now.
+        tokens = torch.LongTensor(tokens)
+        return tokens, \
+               self.datapoints[index][1], \
+               self.datapoints[index][2], \
+               self.datapoints[index][3], \
+               self.speaker_embeddings[index]
+    def __len__(self):
+        return len(self.datapoints)

TrainingInterfaces/Text_to_Spectrogram/AutoAligner/TinyTTS.py ADDED Viewed

	@@ -0,0 +1,36 @@

+import torch
+import torch.multiprocessing
+from torch.nn.utils.rnn import pack_padded_sequence
+from torch.nn.utils.rnn import pad_packed_sequence
+from Utility.utils import make_non_pad_mask
+class TinyTTS(torch.nn.Module):
+    def __init__(self,
+                 n_mels=80,
+                 num_symbols=145,
+                 speaker_embedding_dim=192,
+                 lstm_dim=512):
+        super().__init__()
+        self.in_proj = torch.nn.Linear(num_symbols + speaker_embedding_dim, lstm_dim)
+        self.rnn1 = torch.nn.LSTM(lstm_dim, lstm_dim, batch_first=True, bidirectional=True)
+        self.rnn2 = torch.nn.LSTM(2 * lstm_dim, lstm_dim, batch_first=True, bidirectional=True)
+        self.out_proj = torch.nn.Linear(2 * lstm_dim, n_mels)
+        self.l1_criterion = torch.nn.L1Loss(reduction="none")
+        self.l2_criterion = torch.nn.MSELoss(reduction="none")
+    def forward(self, x, lens, ys):
+        x = self.in_proj(x)
+        x = pack_padded_sequence(x, lens.cpu(), batch_first=True, enforce_sorted=False)
+        x, _ = self.rnn1(x)
+        x, _ = self.rnn2(x)
+        x, _ = pad_packed_sequence(x, batch_first=True)
+        x = self.out_proj(x)
+        out_masks = make_non_pad_mask(lens).unsqueeze(-1).to(ys.device)
+        out_weights = out_masks.float() / out_masks.sum(dim=1, keepdim=True).float()
+        out_weights /= ys.size(0) * ys.size(2)
+        l1_loss = self.l1_criterion(x, ys).mul(out_weights).masked_select(out_masks).sum()
+        l2_loss = self.l2_criterion(x, ys).mul(out_weights).masked_select(out_masks).sum()
+        return l1_loss + l2_loss

TrainingInterfaces/Text_to_Spectrogram/AutoAligner/__init__.py ADDED Viewed

File without changes

TrainingInterfaces/Text_to_Spectrogram/AutoAligner/autoaligner_train_loop.py ADDED Viewed

	@@ -0,0 +1,145 @@

+import os
+import time
+import torch
+import torch.multiprocessing
+from torch.nn.utils.rnn import pad_sequence
+from torch.optim import RAdam
+from torch.utils.data.dataloader import DataLoader
+from tqdm import tqdm
+from TrainingInterfaces.Text_to_Spectrogram.AutoAligner.Aligner import Aligner
+from TrainingInterfaces.Text_to_Spectrogram.AutoAligner.TinyTTS import TinyTTS
+def collate_and_pad(batch):
+    # text, text_len, speech, speech_len
+    return (pad_sequence([datapoint[0] for datapoint in batch], batch_first=True),
+            torch.stack([datapoint[1] for datapoint in batch]).squeeze(1),
+            pad_sequence([datapoint[2] for datapoint in batch], batch_first=True),
+            torch.stack([datapoint[3] for datapoint in batch]).squeeze(1),
+            torch.stack([datapoint[4] for datapoint in batch]).squeeze())
+def train_loop(train_dataset,
+               device,
+               save_directory,
+               batch_size,
+               steps,
+               path_to_checkpoint=None,
+               fine_tune=False,
+               resume=False,
+               debug_img_path=None,
+               use_reconstruction=True):
+    """
+    Args:
+        resume: whether to resume from the most recent checkpoint
+        steps: How many steps to train
+        path_to_checkpoint: reloads a checkpoint to continue training from there
+        fine_tune: whether to load everything from a checkpoint, or only the model parameters
+        train_dataset: Pytorch Dataset Object for train data
+        device: Device to put the loaded tensors on
+        save_directory: Where to save the checkpoints
+        batch_size: How many elements should be loaded at once
+    """
+    os.makedirs(save_directory, exist_ok=True)
+    train_loader = DataLoader(batch_size=batch_size,
+                              dataset=train_dataset,
+                              drop_last=True,
+                              num_workers=8,
+                              pin_memory=False,
+                              shuffle=True,
+                              prefetch_factor=16,
+                              collate_fn=collate_and_pad,
+                              persistent_workers=True)
+    asr_model = Aligner().to(device)
+    optim_asr = RAdam(asr_model.parameters(), lr=0.0001)
+    tiny_tts = TinyTTS().to(device)
+    optim_tts = RAdam(tiny_tts.parameters(), lr=0.0001)
+    step_counter = 0
+    if resume:
+        previous_checkpoint = os.path.join(save_directory, "aligner.pt")
+        path_to_checkpoint = previous_checkpoint
+        fine_tune = False
+    if path_to_checkpoint is not None:
+        check_dict = torch.load(os.path.join(path_to_checkpoint), map_location=device)
+        asr_model.load_state_dict(check_dict["asr_model"])
+        tiny_tts.load_state_dict(check_dict["tts_model"])
+        if not fine_tune:
+            optim_asr.load_state_dict(check_dict["optimizer"])
+            optim_tts.load_state_dict(check_dict["tts_optimizer"])
+            step_counter = check_dict["step_counter"]
+            if step_counter > steps:
+                print("Desired steps already reached in loaded checkpoint.")
+                return
+    start_time = time.time()
+    while True:
+        loss_sum = list()
+        asr_model.train()
+        tiny_tts.train()
+        for batch in tqdm(train_loader):
+            tokens = batch[0].to(device)
+            tokens_len = batch[1].to(device)
+            mel = batch[2].to(device)
+            mel_len = batch[3].to(device)
+            speaker_embeddings = batch[4].to(device)
+            pred = asr_model(mel, mel_len)
+            ctc_loss = asr_model.ctc_loss(pred.transpose(0, 1).log_softmax(2),
+                                          tokens,
+                                          mel_len,
+                                          tokens_len)
+            if use_reconstruction:
+                speaker_embeddings_expanded = torch.nn.functional.normalize(speaker_embeddings).unsqueeze(1).expand(-1, pred.size(1), -1)
+                tts_lambda = min([5, step_counter / 2000])  # super simple schedule
+                reconstruction_loss = tiny_tts(x=torch.cat([pred, speaker_embeddings_expanded], dim=-1),
+                                               # combine ASR prediction with speaker embeddings to allow for reconstruction loss on multiple speakers
+                                               lens=mel_len,
+                                               ys=mel) * tts_lambda  # reconstruction loss to make the states more distinct
+                loss = ctc_loss + reconstruction_loss
+            else:
+                loss = ctc_loss
+            optim_asr.zero_grad()
+            if use_reconstruction:
+                optim_tts.zero_grad()
+            loss.backward()
+            torch.nn.utils.clip_grad_norm_(asr_model.parameters(), 1.0)
+            if use_reconstruction:
+                torch.nn.utils.clip_grad_norm_(tiny_tts.parameters(), 1.0)
+            optim_asr.step()
+            if use_reconstruction:
+                optim_tts.step()
+            step_counter += 1
+            loss_sum.append(loss.item())
+        asr_model.eval()
+        loss_this_epoch = sum(loss_sum) / len(loss_sum)
+        torch.save({
+            "asr_model"    : asr_model.state_dict(),
+            "optimizer"    : optim_asr.state_dict(),
+            "tts_model"    : tiny_tts.state_dict(),
+            "tts_optimizer": optim_tts.state_dict(),
+            "step_counter" : step_counter,
+            },
+            os.path.join(save_directory, "aligner.pt"))
+        print("Total Loss:   {}".format(round(loss_this_epoch, 3)))
+        print("Time elapsed: {} Minutes".format(round((time.time() - start_time) / 60)))
+        print("Steps:        {}".format(step_counter))
+        if debug_img_path is not None:
+            asr_model.inference(mel=mel[0][:mel_len[0]],
+                                tokens=tokens[0][:tokens_len[0]],
+                                save_img_for_debug=debug_img_path + f"/{step_counter}.png",
+                                train=True)  # for testing
+        if step_counter > steps:
+            return

TrainingInterfaces/Text_to_Spectrogram/FastSpeech2/DurationCalculator.py ADDED Viewed

	@@ -0,0 +1,31 @@

+# Copyright 2020 Nagoya University (Tomoki Hayashi)
+#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
+# Adapted by Florian Lux 2021
+import matplotlib.pyplot as plt
+import torch
+class DurationCalculator(torch.nn.Module):
+    def __init__(self, reduction_factor):
+        self.reduction_factor = reduction_factor
+        super().__init__()
+    @torch.no_grad()
+    def forward(self, att_ws, vis=None):
+        """
+        Convert alignment matrix to durations.
+        """
+        if vis is not None:
+            plt.figure(figsize=(8, 4))
+            plt.imshow(att_ws.cpu().numpy(), interpolation='nearest', aspect='auto', origin="lower")
+            plt.xlabel("Inputs")
+            plt.ylabel("Outputs")
+            plt.tight_layout()
+            plt.savefig(vis)
+            plt.close()
+        # calculate duration from 2d alignment matrix
+        durations = torch.stack([att_ws.argmax(-1).eq(i).sum() for i in range(att_ws.shape[1])])
+        return durations.view(-1) * self.reduction_factor

TrainingInterfaces/Text_to_Spectrogram/FastSpeech2/EnergyCalculator.py ADDED Viewed

	@@ -0,0 +1,86 @@

+# Copyright 2020 Nagoya University (Tomoki Hayashi)
+#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
+# Adapted by Florian Lux 2021
+import torch
+import torch.nn.functional as F
+from Layers.STFT import STFT
+from Utility.utils import pad_list
+class EnergyCalculator(torch.nn.Module):
+    def __init__(self, fs=16000, n_fft=1024, win_length=None, hop_length=256, window="hann", center=True,
+                 normalized=False, onesided=True, use_token_averaged_energy=True, reduction_factor=1):
+        super().__init__()
+        self.fs = fs
+        self.n_fft = n_fft
+        self.hop_length = hop_length
+        self.win_length = win_length
+        self.window = window
+        self.use_token_averaged_energy = use_token_averaged_energy
+        if use_token_averaged_energy:
+            assert reduction_factor >= 1
+        self.reduction_factor = reduction_factor
+        self.stft = STFT(n_fft=n_fft, win_length=win_length, hop_length=hop_length, window=window, center=center, normalized=normalized, onesided=onesided)
+    def output_size(self):
+        return 1
+    def get_parameters(self):
+        return dict(fs=self.fs, n_fft=self.n_fft, hop_length=self.hop_length, window=self.window, win_length=self.win_length, center=self.stft.center,
+                    normalized=self.stft.normalized, use_token_averaged_energy=self.use_token_averaged_energy, reduction_factor=self.reduction_factor)
+    def forward(self, input_waves, input_waves_lengths=None, feats_lengths=None, durations=None,
+                durations_lengths=None, norm_by_average=True):
+        # If not provided, we assume that the inputs have the same length
+        if input_waves_lengths is None:
+            input_waves_lengths = (input_waves.new_ones(input_waves.shape[0], dtype=torch.long) * input_waves.shape[1])
+        # Domain-conversion: e.g. Stft: time -> time-freq
+        input_stft, energy_lengths = self.stft(input_waves, input_waves_lengths)
+        assert input_stft.dim() >= 4, input_stft.shape
+        assert input_stft.shape[-1] == 2, input_stft.shape
+        # input_stft: (..., F, 2) -> (..., F)
+        input_power = input_stft[..., 0] ** 2 + input_stft[..., 1] ** 2
+        # sum over frequency (B, N, F) -> (B, N)
+        energy = torch.sqrt(torch.clamp(input_power.sum(dim=2), min=1.0e-10))
+        # (Optional): Adjust length to match with the mel-spectrogram
+        if feats_lengths is not None:
+            energy = [self._adjust_num_frames(e[:el].view(-1), fl) for e, el, fl in zip(energy, energy_lengths, feats_lengths)]
+            energy_lengths = feats_lengths
+        # (Optional): Average by duration to calculate token-wise energy
+        if self.use_token_averaged_energy:
+            energy = [self._average_by_duration(e[:el].view(-1), d) for e, el, d in zip(energy, energy_lengths, durations)]
+            energy_lengths = durations_lengths
+        # Padding
+        if isinstance(energy, list):
+            energy = pad_list(energy, 0.0)
+        # Return with the shape (B, T, 1)
+        if norm_by_average:
+            average = energy[0][energy[0] != 0.0].mean()
+            energy = energy / average
+        return energy.unsqueeze(-1), energy_lengths
+    def _average_by_duration(self, x, d):
+        assert 0 <= len(x) - d.sum() < self.reduction_factor
+        d_cumsum = F.pad(d.cumsum(dim=0), (1, 0))
+        x_avg = [x[start:end].mean() if len(x[start:end]) != 0 else x.new_tensor(0.0) for start, end in zip(d_cumsum[:-1], d_cumsum[1:])]
+        return torch.stack(x_avg)
+    @staticmethod
+    def _adjust_num_frames(x, num_frames):
+        if num_frames > len(x):
+            x = F.pad(x, (0, num_frames - len(x)))
+        elif num_frames < len(x):
+            x = x[:num_frames]
+        return x

TrainingInterfaces/Text_to_Spectrogram/FastSpeech2/FastSpeech2.py ADDED Viewed

	@@ -0,0 +1,379 @@

+"""
+Taken from ESPNet
+"""
+from abc import ABC
+import torch
+from Layers.Conformer import Conformer
+from Layers.DurationPredictor import DurationPredictor
+from Layers.LengthRegulator import LengthRegulator
+from Layers.PostNet import PostNet
+from Layers.VariancePredictor import VariancePredictor
+from TrainingInterfaces.Text_to_Spectrogram.FastSpeech2.FastSpeech2Loss import FastSpeech2Loss
+from Utility.SoftDTW.sdtw_cuda_loss import SoftDTW
+from Utility.utils import initialize
+from Utility.utils import make_non_pad_mask
+from Utility.utils import make_pad_mask
+class FastSpeech2(torch.nn.Module, ABC):
+    """
+    FastSpeech 2 module.
+    This is a module of FastSpeech 2 described in FastSpeech 2: Fast and
+    High-Quality End-to-End Text to Speech. Instead of quantized pitch and
+    energy, we use token-averaged value introduced in FastPitch: Parallel
+    Text-to-speech with Pitch Prediction. The encoder and decoder are Conformers
+    instead of regular Transformers.
+        https://arxiv.org/abs/2006.04558
+        https://arxiv.org/abs/2006.06873
+        https://arxiv.org/pdf/2005.08100
+    """
+    def __init__(self,
+                 # network structure related
+                 idim=66,
+                 odim=80,
+                 adim=384,
+                 aheads=4,
+                 elayers=6,
+                 eunits=1536,
+                 dlayers=6,
+                 dunits=1536,
+                 postnet_layers=5,
+                 postnet_chans=256,
+                 postnet_filts=5,
+                 positionwise_layer_type="conv1d",
+                 positionwise_conv_kernel_size=1,
+                 use_scaled_pos_enc=True,
+                 use_batch_norm=True,
+                 encoder_normalize_before=True,
+                 decoder_normalize_before=True,
+                 encoder_concat_after=False,
+                 decoder_concat_after=False,
+                 reduction_factor=1,
+                 # encoder / decoder
+                 use_macaron_style_in_conformer=True,
+                 use_cnn_in_conformer=True,
+                 conformer_enc_kernel_size=7,
+                 conformer_dec_kernel_size=31,
+                 # duration predictor
+                 duration_predictor_layers=2,
+                 duration_predictor_chans=256,
+                 duration_predictor_kernel_size=3,
+                 # energy predictor
+                 energy_predictor_layers=2,
+                 energy_predictor_chans=256,
+                 energy_predictor_kernel_size=3,
+                 energy_predictor_dropout=0.5,
+                 energy_embed_kernel_size=1,
+                 energy_embed_dropout=0.0,
+                 stop_gradient_from_energy_predictor=False,
+                 # pitch predictor
+                 pitch_predictor_layers=5,
+                 pitch_predictor_chans=256,
+                 pitch_predictor_kernel_size=5,
+                 pitch_predictor_dropout=0.5,
+                 pitch_embed_kernel_size=1,
+                 pitch_embed_dropout=0.0,
+                 stop_gradient_from_pitch_predictor=True,
+                 # training related
+                 transformer_enc_dropout_rate=0.2,
+                 transformer_enc_positional_dropout_rate=0.2,
+                 transformer_enc_attn_dropout_rate=0.2,
+                 transformer_dec_dropout_rate=0.2,
+                 transformer_dec_positional_dropout_rate=0.2,
+                 transformer_dec_attn_dropout_rate=0.2,
+                 duration_predictor_dropout_rate=0.2,
+                 postnet_dropout_rate=0.5,
+                 init_type="xavier_uniform",
+                 init_enc_alpha=1.0,
+                 init_dec_alpha=1.0,
+                 use_masking=False,
+                 use_weighted_masking=True,
+                 # additional features
+                 use_dtw_loss=False,
+                 utt_embed_dim=704,
+                 connect_utt_emb_at_encoder_out=True,
+                 lang_embs=100):
+        """
+        Build the encoder, the duration / pitch / energy predictors with their embeddings,
+        the length regulator, the decoder, the output projection, the postnet and the
+        loss criteria, then initialize the parameters.
+        Args (grouped as in the signature):
+            idim: in-features of the first Linear layer of the text embedding; also handed
+                  to the encoder and to the postnet
+            odim: size of one output frame; the output projection has odim * reduction_factor outputs
+            adim, aheads: attention dimension and number of attention heads of encoder and decoder
+            elayers, eunits / dlayers, dunits: number of blocks and linear units of encoder / decoder
+            postnet_layers, postnet_chans, postnet_filts, use_batch_norm, postnet_dropout_rate:
+                  configuration handed to the postnet
+            reduction_factor: stored; scales the size of the output projection
+            use_macaron_style_in_conformer, use_cnn_in_conformer, conformer_*_kernel_size,
+            *_normalize_before, *_concat_after, positionwise_conv_kernel_size:
+                  handed to the encoder and decoder Conformers
+            duration_predictor_*, pitch_predictor_*, energy_predictor_*:
+                  layers, channels, kernel size and dropout of the respective predictor
+            pitch_embed_*, energy_embed_*: kernel size and dropout of the Conv1d that embeds
+                  the single-channel pitch / energy values into adim channels
+            stop_gradient_from_pitch_predictor, stop_gradient_from_energy_predictor:
+                  stored; _forward detaches the encoder output for the predictor if set
+            transformer_*_rate: dropout rates of encoder and decoder
+            init_type, init_enc_alpha, init_dec_alpha: handed to _reset_parameters
+            use_masking, use_weighted_masking: handed to FastSpeech2Loss
+            use_dtw_loss: stored; forward adds the SoftDTW term only if set
+            utt_embed_dim: handed to the encoder as utt_embed; None marks the model as not multispeaker
+            connect_utt_emb_at_encoder_out: handed to the encoder
+            lang_embs: handed to the encoder; None marks the model as not multilingual
+        NOTE(review): positionwise_layer_type is accepted but never referenced in this constructor.
+        NOTE(review): the SoftDTW criterion is created with use_cuda=True even if use_dtw_loss
+                      is False — confirm this is fine on machines without CUDA.
+        NOTE(review): the postnet receives idim=idim although it is applied to the odim-sized
+                      output frames — confirm against the PostNet signature.
+        """
+        super().__init__()
+        # store hyperparameters
+        self.idim = idim
+        self.odim = odim
+        self.use_dtw_loss = use_dtw_loss
+        self.eos = 1
+        self.reduction_factor = reduction_factor
+        self.stop_gradient_from_pitch_predictor = stop_gradient_from_pitch_predictor
+        self.stop_gradient_from_energy_predictor = stop_gradient_from_energy_predictor
+        self.use_scaled_pos_enc = use_scaled_pos_enc
+        # the embedding arguments double as feature switches: None turns the feature off in _forward
+        self.multilingual_model = lang_embs is not None
+        self.multispeaker_model = utt_embed_dim is not None
+        # define encoder
+        # the input layer of the encoder is a small MLP: idim -> 100 -> adim
+        embed = torch.nn.Sequential(torch.nn.Linear(idim, 100),
+                                    torch.nn.Tanh(),
+                                    torch.nn.Linear(100, adim))
+        self.encoder = Conformer(idim=idim, attention_dim=adim, attention_heads=aheads, linear_units=eunits, num_blocks=elayers,
+                                 input_layer=embed, dropout_rate=transformer_enc_dropout_rate,
+                                 positional_dropout_rate=transformer_enc_positional_dropout_rate, attention_dropout_rate=transformer_enc_attn_dropout_rate,
+                                 normalize_before=encoder_normalize_before, concat_after=encoder_concat_after,
+                                 positionwise_conv_kernel_size=positionwise_conv_kernel_size, macaron_style=use_macaron_style_in_conformer,
+                                 use_cnn_module=use_cnn_in_conformer, cnn_module_kernel=conformer_enc_kernel_size, zero_triu=False,
+                                 utt_embed=utt_embed_dim, connect_utt_emb_at_encoder_out=connect_utt_emb_at_encoder_out, lang_embs=lang_embs)
+        # define duration predictor
+        self.duration_predictor = DurationPredictor(idim=adim, n_layers=duration_predictor_layers, n_chans=duration_predictor_chans,
+                                                    kernel_size=duration_predictor_kernel_size, dropout_rate=duration_predictor_dropout_rate, )
+        # define pitch predictor
+        self.pitch_predictor = VariancePredictor(idim=adim, n_layers=pitch_predictor_layers, n_chans=pitch_predictor_chans,
+                                                 kernel_size=pitch_predictor_kernel_size, dropout_rate=pitch_predictor_dropout)
+        # continuous pitch + FastPitch style avg
+        self.pitch_embed = torch.nn.Sequential(
+            torch.nn.Conv1d(in_channels=1, out_channels=adim, kernel_size=pitch_embed_kernel_size, padding=(pitch_embed_kernel_size - 1) // 2),
+            torch.nn.Dropout(pitch_embed_dropout))
+        # define energy predictor
+        self.energy_predictor = VariancePredictor(idim=adim, n_layers=energy_predictor_layers, n_chans=energy_predictor_chans,
+                                                  kernel_size=energy_predictor_kernel_size, dropout_rate=energy_predictor_dropout)
+        # continuous energy + FastPitch style avg
+        self.energy_embed = torch.nn.Sequential(
+            torch.nn.Conv1d(in_channels=1, out_channels=adim, kernel_size=energy_embed_kernel_size, padding=(energy_embed_kernel_size - 1) // 2),
+            torch.nn.Dropout(energy_embed_dropout))
+        # define length regulator
+        self.length_regulator = LengthRegulator()
+        # define decoder; it gets no input layer and no utterance / language embedding arguments
+        self.decoder = Conformer(idim=0, attention_dim=adim, attention_heads=aheads, linear_units=dunits, num_blocks=dlayers, input_layer=None,
+                                 dropout_rate=transformer_dec_dropout_rate, positional_dropout_rate=transformer_dec_positional_dropout_rate,
+                                 attention_dropout_rate=transformer_dec_attn_dropout_rate, normalize_before=decoder_normalize_before,
+                                 concat_after=decoder_concat_after, positionwise_conv_kernel_size=positionwise_conv_kernel_size,
+                                 macaron_style=use_macaron_style_in_conformer, use_cnn_module=use_cnn_in_conformer, cnn_module_kernel=conformer_dec_kernel_size)
+        # define final projection
+        self.feat_out = torch.nn.Linear(adim, odim * reduction_factor)
+        # define postnet
+        self.postnet = PostNet(idim=idim, odim=odim, n_layers=postnet_layers, n_chans=postnet_chans, n_filts=postnet_filts, use_batch_norm=use_batch_norm,
+                               dropout_rate=postnet_dropout_rate)
+        # initialize parameters
+        self._reset_parameters(init_type=init_type, init_enc_alpha=init_enc_alpha, init_dec_alpha=init_dec_alpha)
+        # define criterions
+        self.criterion = FastSpeech2Loss(use_masking=use_masking, use_weighted_masking=use_weighted_masking)
+        self.dtw_criterion = SoftDTW(use_cuda=True, gamma=0.1)
+    def forward(self,
+                text_tensors,
+                text_lengths,
+                gold_speech,
+                speech_lengths,
+                gold_durations,
+                gold_pitch,
+                gold_energy,
+                utterance_embedding,
+                return_mels=False,
+                lang_ids=None):
+        """
+        Calculate forward propagation.
+        Args:
+            return_mels: whether to return the predicted spectrogram
+            text_tensors (LongTensor): Batch of padded text vectors (B, Tmax).
+            text_lengths (LongTensor): Batch of lengths of each input (B,).
+            gold_speech (Tensor): Batch of padded target features (B, Lmax, odim).
+            speech_lengths (LongTensor): Batch of the lengths of each target (B,).
+            gold_durations (LongTensor): Batch of padded durations (B, Tmax + 1).
+            gold_pitch (Tensor): Batch of padded token-averaged pitch (B, Tmax + 1, 1).
+            gold_energy (Tensor): Batch of padded token-averaged energy (B, Tmax + 1, 1).
+        Returns:
+            Tensor: Loss scalar value.
+            Dict: Statistics to be monitored.
+            Tensor: Weight value.
+        """
+        # Texts include EOS token from the teacher model already in this version
+        # forward propagation
+        before_outs, after_outs, d_outs, p_outs, e_outs = self._forward(text_tensors, text_lengths, gold_speech, speech_lengths,
+                                                                        gold_durations, gold_pitch, gold_energy, utterance_embedding=utterance_embedding,
+                                                                        is_inference=False, lang_ids=lang_ids)
+        # modify mod part of groundtruth (speaking pace)
+        if self.reduction_factor > 1:
+            speech_lengths = speech_lengths.new([olen - olen % self.reduction_factor for olen in speech_lengths])
+        # calculate loss
+        l1_loss, duration_loss, pitch_loss, energy_loss = self.criterion(after_outs=after_outs, before_outs=before_outs, d_outs=d_outs, p_outs=p_outs,
+                                                                         e_outs=e_outs, ys=gold_speech, ds=gold_durations, ps=gold_pitch, es=gold_energy,
+                                                                         ilens=text_lengths, olens=speech_lengths)
+        loss = l1_loss + duration_loss + pitch_loss + energy_loss
+        if self.use_dtw_loss:
+            # print("Regular Loss: {}".format(loss))
+            dtw_loss = self.dtw_criterion(after_outs, gold_speech).mean() / 2000.0  # division to balance orders of magnitude
+            # print("DTW Loss: {}".format(dtw_loss))
+            loss = loss + dtw_loss
+        if return_mels:
+            return loss, after_outs
+        return loss
+    def _forward(self, text_tensors, text_lens, gold_speech=None, speech_lens=None,
+                 gold_durations=None, gold_pitch=None, gold_energy=None,
+                 is_inference=False, alpha=1.0, utterance_embedding=None, lang_ids=None):
+        if not self.multilingual_model:
+            lang_ids = None
+        if not self.multispeaker_model:
+            utterance_embedding = None
+        # forward encoder
+        text_masks = self._source_mask(text_lens)
+        encoded_texts, _ = self.encoder(text_tensors, text_masks, utterance_embedding=utterance_embedding, lang_ids=lang_ids)  # (B, Tmax, adim)
+        # forward duration predictor and variance predictors
+        d_masks = make_pad_mask(text_lens, device=text_lens.device)
+        if self.stop_gradient_from_pitch_predictor:
+            pitch_predictions = self.pitch_predictor(encoded_texts.detach(), d_masks.unsqueeze(-1))
+        else:
+            pitch_predictions = self.pitch_predictor(encoded_texts, d_masks.unsqueeze(-1))
+        if self.stop_gradient_from_energy_predictor:
+            energy_predictions = self.energy_predictor(encoded_texts.detach(), d_masks.unsqueeze(-1))
+        else:
+            energy_predictions = self.energy_predictor(encoded_texts, d_masks.unsqueeze(-1))
+        if is_inference:
+            d_outs = self.duration_predictor.inference(encoded_texts, d_masks)  # (B, Tmax)
+            # use prediction in inference
+            p_embs = self.pitch_embed(pitch_predictions.transpose(1, 2)).transpose(1, 2)
+            e_embs = self.energy_embed(energy_predictions.transpose(1, 2)).transpose(1, 2)
+            encoded_texts = encoded_texts + e_embs + p_embs
+            encoded_texts = self.length_regulator(encoded_texts, d_outs, alpha)  # (B, Lmax, adim)
+        else:
+            d_outs = self.duration_predictor(encoded_texts, d_masks)
+            # use groundtruth in training
+            p_embs = self.pitch_embed(gold_pitch.transpose(1, 2)).transpose(1, 2)
+            e_embs = self.energy_embed(gold_energy.transpose(1, 2)).transpose(1, 2)
+            encoded_texts = encoded_texts + e_embs + p_embs
+            encoded_texts = self.length_regulator(encoded_texts, gold_durations)  # (B, Lmax, adim)
+        # forward decoder
+        if speech_lens is not None and not is_inference:
+            if self.reduction_factor > 1:
+                olens_in = speech_lens.new([olen // self.reduction_factor for olen in speech_lens])
+            else:
+                olens_in = speech_lens
+            h_masks = self._source_mask(olens_in)
+        else:
+            h_masks = None
+        zs, _ = self.decoder(encoded_texts, h_masks)  # (B, Lmax, adim)
+        before_outs = self.feat_out(zs).view(zs.size(0), -1, self.odim)  # (B, Lmax, odim)
+        # postnet -> (B, Lmax//r * r, odim)
+        after_outs = before_outs + self.postnet(before_outs.transpose(1, 2)).transpose(1, 2)
+        return before_outs, after_outs, d_outs, pitch_predictions, energy_predictions
+    def batch_inference(self, texts, text_lens, utt_emb):
+        _, after_outs, d_outs, _, _ = self._forward(texts,
+                                                    text_lens,
+                                                    None,
+                                                    is_inference=True,
+                                                    alpha=1.0)
+        return after_outs, d_outs
+    def inference(self,
+                  text,
+                  speech=None,
+                  durations=None,
+                  pitch=None,
+                  energy=None,
+                  alpha=1.0,
+                  use_teacher_forcing=False,
+                  utterance_embedding=None,
+                  return_duration_pitch_energy=False,
+                  lang_id=None):
+        """
+        Generate the sequence of features given the sequences of characters.
+        Args:
+            text (LongTensor): Input sequence of characters (T,).
+            speech (Tensor, optional): Feature sequence to extract style (N, idim).
+            durations (LongTensor, optional): Groundtruth of duration (T + 1,).
+            pitch (Tensor, optional): Groundtruth of token-averaged pitch (T + 1, 1).
+            energy (Tensor, optional): Groundtruth of token-averaged energy (T + 1, 1).
+            alpha (float, optional): Alpha to control the speed.
+            use_teacher_forcing (bool, optional): Whether to use teacher forcing.
+                If true, groundtruth of duration, pitch and energy will be used.
+            return_duration_pitch_energy: whether to return the list of predicted durations for nicer plotting
+        Returns:
+            Tensor: Output sequence of features (L, odim).
+        """
+        self.eval()
+        x, y = text, speech
+        d, p, e = durations, pitch, energy
+        # setup batch axis
+        ilens = torch.tensor([x.shape[0]], dtype=torch.long, device=x.device)
+        xs, ys = x.unsqueeze(0), None
+        if y is not None:
+            ys = y.unsqueeze(0)
+        if lang_id is not None:
+            lang_id = lang_id.unsqueeze(0)
+        if use_teacher_forcing:
+            # use groundtruth of duration, pitch, and energy
+            ds, ps, es = d.unsqueeze(0), p.unsqueeze(0), e.unsqueeze(0)
+            before_outs, after_outs, d_outs, pitch_predictions, energy_predictions = self._forward(xs,
+                                                                                                   ilens,
+                                                                                                   ys,
+                                                                                                   gold_durations=ds,
+                                                                                                   gold_pitch=ps,
+                                                                                                   gold_energy=es,
+                                                                                                   utterance_embedding=utterance_embedding.unsqueeze(0),
+                                                                                                   lang_ids=lang_id)  # (1, L, odim)
+        else:
+            before_outs, after_outs, d_outs, pitch_predictions, energy_predictions = self._forward(xs,
+                                                                                                   ilens,
+                                                                                                   ys,
+                                                                                                   is_inference=True,
+                                                                                                   alpha=alpha,
+                                                                                                   utterance_embedding=utterance_embedding.unsqueeze(0),
+                                                                                                   lang_ids=lang_id)  # (1, L, odim)
+        self.train()
+        if return_duration_pitch_energy:
+            return after_outs[0], d_outs[0], pitch_predictions[0], energy_predictions[0]
+        return after_outs[0]
+    def _source_mask(self, ilens):
+        """
+        Make masks for self-attention.
+        Args:
+            ilens (LongTensor): Batch of lengths (B,).
+        Returns:
+            Tensor: Mask tensor for self-attention.
+        """
+        x_masks = make_non_pad_mask(ilens, device=ilens.device)
+        return x_masks.unsqueeze(-2)
+    def _reset_parameters(self, init_type, init_enc_alpha, init_dec_alpha):
+        # initialize parameters
+        if init_type != "pytorch":
+            initialize(self, init_type)

TrainingInterfaces/Text_to_Spectrogram/FastSpeech2/FastSpeech2Loss.py ADDED Viewed

	@@ -0,0 +1,96 @@

+"""
+Taken from ESPNet
+"""
+import torch
+from Layers.DurationPredictor import DurationPredictorLoss
+from Utility.utils import make_non_pad_mask
+class FastSpeech2Loss(torch.nn.Module):
+    def __init__(self, use_masking=True, use_weighted_masking=False):
+        """
+            use_masking (bool):
+                Whether to apply masking for padded part in loss calculation.
+            use_weighted_masking (bool):
+                Whether to weighted masking in loss calculation.
+        """
+        super().__init__()
+        assert (use_masking != use_weighted_masking) or not use_masking
+        self.use_masking = use_masking
+        self.use_weighted_masking = use_weighted_masking
+        # define criterions
+        reduction = "none" if self.use_weighted_masking else "mean"
+        self.l1_criterion = torch.nn.L1Loss(reduction=reduction)
+        self.mse_criterion = torch.nn.MSELoss(reduction=reduction)
+        self.duration_criterion = DurationPredictorLoss(reduction=reduction)
+    def forward(self, after_outs, before_outs, d_outs, p_outs, e_outs, ys,
+                ds, ps, es, ilens, olens, ):
+        """
+        Args:
+            after_outs (Tensor): Batch of outputs after postnets (B, Lmax, odim).
+            before_outs (Tensor): Batch of outputs before postnets (B, Lmax, odim).
+            d_outs (LongTensor): Batch of outputs of duration predictor (B, Tmax).
+            p_outs (Tensor): Batch of outputs of pitch predictor (B, Tmax, 1).
+            e_outs (Tensor): Batch of outputs of energy predictor (B, Tmax, 1).
+            ys (Tensor): Batch of target features (B, Lmax, odim).
+            ds (LongTensor): Batch of durations (B, Tmax).
+            ps (Tensor): Batch of target token-averaged pitch (B, Tmax, 1).
+            es (Tensor): Batch of target token-averaged energy (B, Tmax, 1).
+            ilens (LongTensor): Batch of the lengths of each input (B,).
+            olens (LongTensor): Batch of the lengths of each target (B,).
+        Returns:
+            Tensor: L1 loss value.
+            Tensor: Duration predictor loss value.
+            Tensor: Pitch predictor loss value.
+            Tensor: Energy predictor loss value.
+        """
+        # apply mask to remove padded part
+        if self.use_masking:
+            out_masks = make_non_pad_mask(olens).unsqueeze(-1).to(ys.device)
+            before_outs = before_outs.masked_select(out_masks)
+            if after_outs is not None:
+                after_outs = after_outs.masked_select(out_masks)
+            ys = ys.masked_select(out_masks)
+            duration_masks = make_non_pad_mask(ilens).to(ys.device)
+            d_outs = d_outs.masked_select(duration_masks)
+            ds = ds.masked_select(duration_masks)
+            pitch_masks = make_non_pad_mask(ilens).unsqueeze(-1).to(ys.device)
+            p_outs = p_outs.masked_select(pitch_masks)
+            e_outs = e_outs.masked_select(pitch_masks)
+            ps = ps.masked_select(pitch_masks)
+            es = es.masked_select(pitch_masks)
+        # calculate loss
+        l1_loss = self.l1_criterion(before_outs, ys)
+        if after_outs is not None:
+            l1_loss += self.l1_criterion(after_outs, ys)
+        duration_loss = self.duration_criterion(d_outs, ds)
+        pitch_loss = self.mse_criterion(p_outs, ps)
+        energy_loss = self.mse_criterion(e_outs, es)
+        # make weighted mask and apply it
+        if self.use_weighted_masking:
+            out_masks = make_non_pad_mask(olens).unsqueeze(-1).to(ys.device)
+            out_weights = out_masks.float() / out_masks.sum(dim=1, keepdim=True).float()
+            out_weights /= ys.size(0) * ys.size(2)
+            duration_masks = make_non_pad_mask(ilens).to(ys.device)
+            duration_weights = (duration_masks.float() / duration_masks.sum(dim=1, keepdim=True).float())
+            duration_weights /= ds.size(0)
+            # apply weight
+            l1_loss = l1_loss.mul(out_weights).masked_select(out_masks).sum()
+            duration_loss = (duration_loss.mul(duration_weights).masked_select(duration_masks).sum())
+            pitch_masks = duration_masks.unsqueeze(-1)
+            pitch_weights = duration_weights.unsqueeze(-1)
+            pitch_loss = pitch_loss.mul(pitch_weights).masked_select(pitch_masks).sum()
+            energy_loss = (energy_loss.mul(pitch_weights).masked_select(pitch_masks).sum())
+        return l1_loss, duration_loss, pitch_loss, energy_loss

TrainingInterfaces/Text_to_Spectrogram/FastSpeech2/FastSpeechDatasetLanguageID.py ADDED Viewed

	@@ -0,0 +1,217 @@

+import os
+import statistics
+import torch
+from torch.utils.data import Dataset
+from tqdm import tqdm
+from Preprocessing.ArticulatoryCombinedTextFrontend import get_language_id
+from Preprocessing.ProsodicConditionExtractor import ProsodicConditionExtractor
+from TrainingInterfaces.Text_to_Spectrogram.AutoAligner.Aligner import Aligner
+from TrainingInterfaces.Text_to_Spectrogram.AutoAligner.AlignerDataset import AlignerDataset
+from TrainingInterfaces.Text_to_Spectrogram.FastSpeech2.DurationCalculator import DurationCalculator
+from TrainingInterfaces.Text_to_Spectrogram.FastSpeech2.EnergyCalculator import EnergyCalculator
+from TrainingInterfaces.Text_to_Spectrogram.FastSpeech2.PitchCalculator import Dio
+class FastSpeechDataset(Dataset):
+    def __init__(self,
+                 path_to_transcript_dict,
+                 acoustic_checkpoint_path,
+                 cache_dir,
+                 lang,
+                 loading_processes=40,
+                 min_len_in_seconds=1,
+                 max_len_in_seconds=20,
+                 cut_silence=False,
+                 reduction_factor=1,
+                 device=torch.device("cpu"),
+                 rebuild_cache=False,
+                 ctc_selection=True,
+                 save_imgs=False):
+        self.cache_dir = cache_dir
+        os.makedirs(cache_dir, exist_ok=True)
+        if not os.path.exists(os.path.join(cache_dir, "fast_train_cache.pt")) or rebuild_cache:
+            if not os.path.exists(os.path.join(cache_dir, "aligner_train_cache.pt")) or rebuild_cache:
+                AlignerDataset(path_to_transcript_dict=path_to_transcript_dict,
+                               cache_dir=cache_dir,
+                               lang=lang,
+                               loading_processes=loading_processes,
+                               min_len_in_seconds=min_len_in_seconds,
+                               max_len_in_seconds=max_len_in_seconds,
+                               cut_silences=cut_silence,
+                               rebuild_cache=rebuild_cache,
+                               device=device)
+            datapoints = torch.load(os.path.join(cache_dir, "aligner_train_cache.pt"), map_location='cpu')
+            # we use the aligner dataset as basis and augment it to contain the additional information we need for fastspeech.
+            if not isinstance(datapoints, tuple):  # check for backwards compatibility
+                print(f"It seems like the Aligner dataset in {cache_dir} is not a tuple. Regenerating it, since we need the preprocessed waves.")
+                AlignerDataset(path_to_transcript_dict=path_to_transcript_dict,
+                               cache_dir=cache_dir,
+                               lang=lang,
+                               loading_processes=loading_processes,
+                               min_len_in_seconds=min_len_in_seconds,
+                               max_len_in_seconds=max_len_in_seconds,
+                               cut_silences=cut_silence,
+                               rebuild_cache=True)
+                datapoints = torch.load(os.path.join(cache_dir, "aligner_train_cache.pt"), map_location='cpu')
+            dataset = datapoints[0]
+            norm_waves = datapoints[1]
+            # build cache
+            print("... building dataset cache ...")
+            self.datapoints = list()
+            self.ctc_losses = list()
+            acoustic_model = Aligner()
+            acoustic_model.load_state_dict(torch.load(acoustic_checkpoint_path, map_location='cpu')["asr_model"])
+            # ==========================================
+            # actual creation of datapoints starts here
+            # ==========================================
+            acoustic_model = acoustic_model.to(device)
+            dio = Dio(reduction_factor=reduction_factor, fs=16000)
+            energy_calc = EnergyCalculator(reduction_factor=reduction_factor, fs=16000)
+            dc = DurationCalculator(reduction_factor=reduction_factor)
+            vis_dir = os.path.join(cache_dir, "duration_vis")
+            os.makedirs(vis_dir, exist_ok=True)
+            pros_cond_ext = ProsodicConditionExtractor(sr=16000, device=device)
+            for index in tqdm(range(len(dataset))):
+                norm_wave = norm_waves[index]
+                norm_wave_length = torch.LongTensor([len(norm_wave)])
+                if len(norm_wave) / 16000 < min_len_in_seconds and ctc_selection:
+                    continue
+                text = dataset[index][0]
+                melspec = dataset[index][2]
+                melspec_length = dataset[index][3]
+                alignment_path, ctc_loss = acoustic_model.inference(mel=melspec.to(device),
+                                                                    tokens=text.to(device),
+                                                                    save_img_for_debug=os.path.join(vis_dir, f"{index}.png") if save_imgs else None,
+                                                                    return_ctc=True)
+                cached_duration = dc(torch.LongTensor(alignment_path), vis=None).cpu()
+                last_vec = None
+                for phoneme_index, vec in enumerate(text):
+                    if last_vec is not None:
+                        if last_vec.numpy().tolist() == vec.numpy().tolist():
+                            # we found a case of repeating phonemes!
+                            # now we must repair their durations by giving the first one 3/5 of their sum and the second one 2/5 (i.e. the rest)
+                            dur_1 = cached_duration[phoneme_index - 1]
+                            dur_2 = cached_duration[phoneme_index]
+                            total_dur = dur_1 + dur_2
+                            new_dur_1 = int((total_dur / 5) * 3)
+                            new_dur_2 = total_dur - new_dur_1
+                            cached_duration[phoneme_index - 1] = new_dur_1
+                            cached_duration[phoneme_index] = new_dur_2
+                    last_vec = vec
+                cached_energy = energy_calc(input_waves=norm_wave.unsqueeze(0),
+                                            input_waves_lengths=norm_wave_length,
+                                            feats_lengths=melspec_length,
+                                            durations=cached_duration.unsqueeze(0),
+                                            durations_lengths=torch.LongTensor([len(cached_duration)]))[0].squeeze(0).cpu()
+                cached_pitch = dio(input_waves=norm_wave.unsqueeze(0),
+                                   input_waves_lengths=norm_wave_length,
+                                   feats_lengths=melspec_length,
+                                   durations=cached_duration.unsqueeze(0),
+                                   durations_lengths=torch.LongTensor([len(cached_duration)]))[0].squeeze(0).cpu()
+                try:
+                    prosodic_condition = pros_cond_ext.extract_condition_from_reference_wave(norm_wave, already_normalized=True).cpu()
+                except RuntimeError:
+                    # if there is an audio without any voiced segments whatsoever we have to skip it.
+                    continue
+                self.datapoints.append([dataset[index][0],
+                                        dataset[index][1],
+                                        dataset[index][2],
+                                        dataset[index][3],
+                                        cached_duration.cpu(),
+                                        cached_energy,
+                                        cached_pitch,
+                                        prosodic_condition])
+                self.ctc_losses.append(ctc_loss)
+            # =============================
+            # done with datapoint creation
+            # =============================
+            if ctc_selection:
+                # now we can filter out some bad datapoints based on the CTC scores we collected
+                mean_ctc = sum(self.ctc_losses) / len(self.ctc_losses)
+                std_dev = statistics.stdev(self.ctc_losses)
+                threshold = mean_ctc + std_dev
+                for index in range(len(self.ctc_losses), 0, -1):
+                    if self.ctc_losses[index - 1] > threshold:
+                        self.datapoints.pop(index - 1)
+                        print(
+                            f"Removing datapoint {index - 1}, because the CTC loss is one standard deviation higher than the mean. \n ctc: {round(self.ctc_losses[index - 1], 4)} vs. mean: {round(mean_ctc, 4)}")
+            # save to cache
+            if len(self.datapoints) > 0:
+                torch.save(self.datapoints, os.path.join(cache_dir, "fast_train_cache.pt"))
+            else:
+                import sys
+                print("No datapoints were prepared! Exiting...")
+                sys.exit()
+        else:
+            # just load the datapoints from cache
+            self.datapoints = torch.load(os.path.join(cache_dir, "fast_train_cache.pt"), map_location='cpu')
+        self.cache_dir = cache_dir
+        self.language_id = get_language_id(lang)
+        print(f"Prepared a FastSpeech dataset with {len(self.datapoints)} datapoints in {cache_dir}.")
+    def __getitem__(self, index):
+        return self.datapoints[index][0], \
+               self.datapoints[index][1], \
+               self.datapoints[index][2], \
+               self.datapoints[index][3], \
+               self.datapoints[index][4], \
+               self.datapoints[index][5], \
+               self.datapoints[index][6], \
+               self.datapoints[index][7], \
+               self.language_id
+    def __len__(self):
+        return len(self.datapoints)
+    def remove_samples(self, list_of_samples_to_remove):
+        for remove_id in sorted(list_of_samples_to_remove, reverse=True):
+            self.datapoints.pop(remove_id)
+        torch.save(self.datapoints, os.path.join(self.cache_dir, "fast_train_cache.pt"))
+        print("Dataset updated!")
+    def fix_repeating_phones(self):
+        """
+        The viterbi decoding of the durations cannot
+        handle repetitions. This is now solved heuristically,
+        but if you have a cache from before March 2022,
+        use this method to postprocess those cases.
+        """
+        for datapoint_index in tqdm(list(range(len(self.datapoints)))):
+            last_vec = None
+            for phoneme_index, vec in enumerate(self.datapoints[datapoint_index][0]):
+                if last_vec is not None:
+                    if last_vec.numpy().tolist() == vec.numpy().tolist():
+                        # we found a case of repeating phonemes!
+                        # now we must repair their durations by giving the first one 3/5 of their sum and the second one 2/5 (i.e. the rest)
+                        dur_1 = self.datapoints[datapoint_index][4][phoneme_index - 1]
+                        dur_2 = self.datapoints[datapoint_index][4][phoneme_index]
+                        total_dur = dur_1 + dur_2
+                        new_dur_1 = int((total_dur / 5) * 3)
+                        new_dur_2 = total_dur - new_dur_1
+                        self.datapoints[datapoint_index][4][phoneme_index - 1] = new_dur_1
+                        self.datapoints[datapoint_index][4][phoneme_index] = new_dur_2
+                        print("fix applied")
+                last_vec = vec
+        torch.save(self.datapoints, os.path.join(self.cache_dir, "fast_train_cache.pt"))
+        print("Dataset updated!")

TrainingInterfaces/Text_to_Spectrogram/FastSpeech2/PitchCalculator.py ADDED Viewed

	@@ -0,0 +1,121 @@

+# Copyright 2020 Nagoya University (Tomoki Hayashi)
+#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
+# Adapted by Florian Lux 2021
+import numpy as np
+import pyworld
+import torch
+import torch.nn.functional as F
+from scipy.interpolate import interp1d
+from Utility.utils import pad_list
+class Dio(torch.nn.Module):
+    """
+    F0 estimation with dio + stonemask algortihm.
+    This is f0 extractor based on dio + stonemask algorithm
+    introduced in https://doi.org/10.1587/transinf.2015EDP7457
+    """
+    def __init__(self, fs=16000, n_fft=1024, hop_length=256, f0min=40, f0max=400, use_token_averaged_f0=True,
+                 use_continuous_f0=True, use_log_f0=True, reduction_factor=1):
+        super().__init__()
+        self.fs = fs
+        self.n_fft = n_fft
+        self.hop_length = hop_length
+        self.frame_period = 1000 * hop_length / fs
+        self.f0min = f0min
+        self.f0max = f0max
+        self.use_token_averaged_f0 = use_token_averaged_f0
+        self.use_continuous_f0 = use_continuous_f0
+        self.use_log_f0 = use_log_f0
+        if use_token_averaged_f0:
+            assert reduction_factor >= 1
+        self.reduction_factor = reduction_factor
+    def output_size(self):
+        return 1
+    def get_parameters(self):
+        return dict(fs=self.fs, n_fft=self.n_fft, hop_length=self.hop_length, f0min=self.f0min, f0max=self.f0max,
+                    use_token_averaged_f0=self.use_token_averaged_f0, use_continuous_f0=self.use_continuous_f0, use_log_f0=self.use_log_f0,
+                    reduction_factor=self.reduction_factor)
+    def forward(self, input_waves, input_waves_lengths=None, feats_lengths=None, durations=None,
+                durations_lengths=None, norm_by_average=True):
+        # If not provided, we assume that the inputs have the same length
+        if input_waves_lengths is None:
+            input_waves_lengths = (input_waves.new_ones(input_waves.shape[0], dtype=torch.long) * input_waves.shape[1])
+        # F0 extraction
+        pitch = [self._calculate_f0(x[:xl]) for x, xl in zip(input_waves, input_waves_lengths)]
+        # (Optional): Adjust length to match with the mel-spectrogram
+        if feats_lengths is not None:
+            pitch = [self._adjust_num_frames(p, fl).view(-1) for p, fl in zip(pitch, feats_lengths)]
+        # (Optional): Average by duration to calculate token-wise f0
+        if self.use_token_averaged_f0:
+            pitch = [self._average_by_duration(p, d).view(-1) for p, d in zip(pitch, durations)]
+            pitch_lengths = durations_lengths
+        else:
+            pitch_lengths = input_waves.new_tensor([len(p) for p in pitch], dtype=torch.long)
+        # Padding
+        pitch = pad_list(pitch, 0.0)
+        # Return with the shape (B, T, 1)
+        if norm_by_average:
+            average = pitch[0][pitch[0] != 0.0].mean()
+            pitch = pitch / average
+        return pitch.unsqueeze(-1), pitch_lengths
+    def _calculate_f0(self, input):
+        x = input.cpu().numpy().astype(np.double)
+        f0, timeaxis = pyworld.dio(x, self.fs, f0_floor=self.f0min, f0_ceil=self.f0max, frame_period=self.frame_period)
+        f0 = pyworld.stonemask(x, f0, timeaxis, self.fs)
+        if self.use_continuous_f0:
+            f0 = self._convert_to_continuous_f0(f0)
+        if self.use_log_f0:
+            nonzero_idxs = np.where(f0 != 0)[0]
+            f0[nonzero_idxs] = np.log(f0[nonzero_idxs])
+        return input.new_tensor(f0.reshape(-1), dtype=torch.float)
+    @staticmethod
+    def _adjust_num_frames(x, num_frames):
+        if num_frames > len(x):
+            x = F.pad(x, (0, num_frames - len(x)))
+        elif num_frames < len(x):
+            x = x[:num_frames]
+        return x
+    @staticmethod
+    def _convert_to_continuous_f0(f0: np.array):
+        if (f0 == 0).all():
+            return f0
+        # padding start and end of f0 sequence
+        start_f0 = f0[f0 != 0][0]
+        end_f0 = f0[f0 != 0][-1]
+        start_idx = np.where(f0 == start_f0)[0][0]
+        end_idx = np.where(f0 == end_f0)[0][-1]
+        f0[:start_idx] = start_f0
+        f0[end_idx:] = end_f0
+        # get non-zero frame index
+        nonzero_idxs = np.where(f0 != 0)[0]
+        # perform linear interpolation
+        interp_fn = interp1d(nonzero_idxs, f0[nonzero_idxs])
+        f0 = interp_fn(np.arange(0, f0.shape[0]))
+        return f0
+    def _average_by_duration(self, x, d):
+        assert 0 <= len(x) - d.sum() < self.reduction_factor
+        d_cumsum = F.pad(d.cumsum(dim=0), (1, 0))
+        x_avg = [
+            x[start:end].masked_select(x[start:end].gt(0.0)).mean(dim=0) if len(x[start:end].masked_select(x[start:end].gt(0.0))) != 0 else x.new_tensor(0.0)
+            for start, end in zip(d_cumsum[:-1], d_cumsum[1:])]
+        return torch.stack(x_avg)

TrainingInterfaces/Text_to_Spectrogram/FastSpeech2/__init__.py ADDED Viewed

File without changes

TrainingInterfaces/Text_to_Spectrogram/FastSpeech2/fastspeech2_train_loop.py ADDED Viewed

	@@ -0,0 +1,201 @@

+import os
+import time
+import librosa.display as lbd
+import matplotlib.pyplot as plt
+import torch
+import torch.multiprocessing
+import torch.multiprocessing
+from torch.cuda.amp import GradScaler
+from torch.cuda.amp import autocast
+from torch.nn.utils.rnn import pad_sequence
+from torch.utils.data.dataloader import DataLoader
+from tqdm import tqdm
+from Preprocessing.ArticulatoryCombinedTextFrontend import ArticulatoryCombinedTextFrontend
+from Preprocessing.ArticulatoryCombinedTextFrontend import get_language_id
+from Utility.WarmupScheduler import WarmupScheduler
+from Utility.utils import cumsum_durations
+from Utility.utils import delete_old_checkpoints
+from Utility.utils import get_most_recent_checkpoint
+@torch.no_grad()
+def plot_progress_spec(net, device, save_dir, step, lang, default_emb):
+    tf = ArticulatoryCombinedTextFrontend(language=lang)
+    sentence = ""
+    if lang == "en":
+        sentence = "This is a complex sentence, it even has a pause!"
+    elif lang == "de":
+        sentence = "Dies ist ein komplexer Satz, er hat sogar eine Pause!"
+    elif lang == "el":
+        sentence = "Αυτή είναι μια σύνθετη πρόταση, έχει ακόμη και παύση!"
+    elif lang == "es":
+        sentence = "Esta es una oración compleja, ¡incluso tiene una pausa!"
+    elif lang == "fi":
+        sentence = "Tämä on monimutkainen lause, sillä on jopa tauko!"
+    elif lang == "ru":
+        sentence = "Это сложное предложение, в нем даже есть пауза!"
+    elif lang == "hu":
+        sentence = "Ez egy összetett mondat, még szünet is van benne!"
+    elif lang == "nl":
+        sentence = "Dit is een complexe zin, er zit zelfs een pauze in!"
+    elif lang == "fr":
+        sentence = "C'est une phrase complexe, elle a même une pause !"
+    phoneme_vector = tf.string_to_tensor(sentence).squeeze(0).to(device)
+    spec, durations, *_ = net.inference(text=phoneme_vector,
+                                        return_duration_pitch_energy=True,
+                                        utterance_embedding=default_emb,
+                                        lang_id=get_language_id(lang).to(device))
+    spec = spec.transpose(0, 1).to("cpu").numpy()
+    duration_splits, label_positions = cumsum_durations(durations.cpu().numpy())
+    if not os.path.exists(os.path.join(save_dir, "spec")):
+        os.makedirs(os.path.join(save_dir, "spec"))
+    fig, ax = plt.subplots(nrows=1, ncols=1)
+    lbd.specshow(spec,
+                 ax=ax,
+                 sr=16000,
+                 cmap='GnBu',
+                 y_axis='mel',
+                 x_axis=None,
+                 hop_length=256)
+    ax.yaxis.set_visible(False)
+    ax.set_xticks(duration_splits, minor=True)
+    ax.xaxis.grid(True, which='minor')
+    ax.set_xticks(label_positions, minor=False)
+    ax.set_xticklabels(tf.get_phone_string(sentence))
+    ax.set_title(sentence)
+    plt.savefig(os.path.join(os.path.join(save_dir, "spec"), str(step) + ".png"))
+    plt.clf()
+    plt.close()
+def collate_and_pad(batch):
+    # text, text_len, speech, speech_len, durations, energy, pitch, utterance condition, language_id
+    return (pad_sequence([datapoint[0] for datapoint in batch], batch_first=True),
+            torch.stack([datapoint[1] for datapoint in batch]).squeeze(1),
+            pad_sequence([datapoint[2] for datapoint in batch], batch_first=True),
+            torch.stack([datapoint[3] for datapoint in batch]).squeeze(1),
+            pad_sequence([datapoint[4] for datapoint in batch], batch_first=True),
+            pad_sequence([datapoint[5] for datapoint in batch], batch_first=True),
+            pad_sequence([datapoint[6] for datapoint in batch], batch_first=True),
+            torch.stack([datapoint[7] for datapoint in batch]).squeeze(),
+            torch.stack([datapoint[8] for datapoint in batch]))
+def train_loop(net,
+               train_dataset,
+               device,
+               save_directory,
+               batch_size=32,
+               steps=300000,
+               epochs_per_save=1,
+               lang="en",
+               lr=0.0001,
+               warmup_steps=4000,
+               path_to_checkpoint=None,
+               fine_tune=False,
+               resume=False):
+    """
+    Args:
+        resume: whether to resume from the most recent checkpoint
+        warmup_steps: how long the learning rate should increase before it reaches the specified value
+        steps: How many steps to train
+        lr: The initial learning rate for the optimiser
+        path_to_checkpoint: reloads a checkpoint to continue training from there
+        fine_tune: whether to load everything from a checkpoint, or only the model parameters
+        lang: language of the synthesis
+        net: Model to train
+        train_dataset: Pytorch Dataset Object for train data
+        device: Device to put the loaded tensors on
+        save_directory: Where to save the checkpoints
+        batch_size: How many elements should be loaded at once
+        epochs_per_save: how many epochs to train in between checkpoints
+    """
+    net = net.to(device)
+    torch.multiprocessing.set_sharing_strategy('file_system')
+    train_loader = DataLoader(batch_size=batch_size,
+                              dataset=train_dataset,
+                              drop_last=True,
+                              num_workers=8,
+                              pin_memory=True,
+                              shuffle=True,
+                              prefetch_factor=8,
+                              collate_fn=collate_and_pad,
+                              persistent_workers=True)
+    default_embedding = None
+    for index in range(20):  # slicing is not implemented for datasets, so this detour is needed.
+        if default_embedding is None:
+            default_embedding = train_dataset[index][7].squeeze()
+        else:
+            default_embedding = default_embedding + train_dataset[index][7].squeeze()
+    default_embedding = (default_embedding / len(train_dataset)).to(device)
+    # default speaker embedding for inference is the average of the first 20 speaker embeddings. So if you use multiple datasets combined,
+    # put a single speaker one with the nicest voice first into the concat dataset.
+    step_counter = 0
+    optimizer = torch.optim.Adam(net.parameters(), lr=lr)
+    scheduler = WarmupScheduler(optimizer, warmup_steps=warmup_steps)
+    scaler = GradScaler()
+    epoch = 0
+    if resume:
+        path_to_checkpoint = get_most_recent_checkpoint(checkpoint_dir=save_directory)
+    if path_to_checkpoint is not None:
+        check_dict = torch.load(path_to_checkpoint, map_location=device)
+        net.load_state_dict(check_dict["model"])
+        if not fine_tune:
+            optimizer.load_state_dict(check_dict["optimizer"])
+            scheduler.load_state_dict(check_dict["scheduler"])
+            step_counter = check_dict["step_counter"]
+            scaler.load_state_dict(check_dict["scaler"])
+    start_time = time.time()
+    while True:
+        net.train()
+        epoch += 1
+        optimizer.zero_grad()
+        train_losses_this_epoch = list()
+        for batch in tqdm(train_loader):
+            with autocast():
+                train_loss = net(text_tensors=batch[0].to(device),
+                                 text_lengths=batch[1].to(device),
+                                 gold_speech=batch[2].to(device),
+                                 speech_lengths=batch[3].to(device),
+                                 gold_durations=batch[4].to(device),
+                                 gold_pitch=batch[6].to(device),  # mind the switched order
+                                 gold_energy=batch[5].to(device),  # mind the switched order
+                                 utterance_embedding=batch[7].to(device),
+                                 lang_ids=batch[8].to(device),
+                                 return_mels=False)
+                train_losses_this_epoch.append(train_loss.item())
+            optimizer.zero_grad()
+            scaler.scale(train_loss).backward()
+            del train_loss
+            step_counter += 1
+            scaler.unscale_(optimizer)
+            torch.nn.utils.clip_grad_norm_(net.parameters(), 1.0, error_if_nonfinite=False)
+            scaler.step(optimizer)
+            scaler.update()
+            scheduler.step()
+        net.eval()
+        if epoch % epochs_per_save == 0:
+            torch.save({
+                "model"       : net.state_dict(),
+                "optimizer"   : optimizer.state_dict(),
+                "step_counter": step_counter,
+                "scaler"      : scaler.state_dict(),
+                "scheduler"   : scheduler.state_dict(),
+                "default_emb" : default_embedding,
+                }, os.path.join(save_directory, "checkpoint_{}.pt".format(step_counter)))
+            delete_old_checkpoints(save_directory, keep=5)
+            plot_progress_spec(net, device, save_dir=save_directory, step=step_counter, lang=lang, default_emb=default_embedding)
+            if step_counter > steps:
+                # DONE
+                return
+        print("Epoch:        {}".format(epoch))
+        print("Train Loss:   {}".format(sum(train_losses_this_epoch) / len(train_losses_this_epoch)))
+        print("Time elapsed: {} Minutes".format(round((time.time() - start_time) / 60)))
+        print("Steps:        {}".format(step_counter))
+        net.train()

TrainingInterfaces/Text_to_Spectrogram/FastSpeech2/fastspeech2_train_loop_ctc.py ADDED Viewed

	@@ -0,0 +1,191 @@

+import os
+import random
+import time
+import librosa.display as lbd
+import matplotlib.pyplot as plt
+import torch
+import torch.multiprocessing
+import torch.multiprocessing
+from torch.cuda.amp import GradScaler
+from torch.nn.utils.rnn import pad_sequence
+from tqdm import tqdm
+from Preprocessing.ArticulatoryCombinedTextFrontend import ArticulatoryCombinedTextFrontend
+from TrainingInterfaces.Text_to_Spectrogram.AutoAligner.Aligner import Aligner
+from Utility.WarmupScheduler import WarmupScheduler
+from Utility.utils import cumsum_durations
+from Utility.utils import delete_old_checkpoints
+from Utility.utils import get_most_recent_checkpoint
+def plot_progress_spec(net, device, save_dir, step, lang):
+    tf = ArticulatoryCombinedTextFrontend(language=lang)
+    sentence = ""
+    if lang == "en":
+        sentence = "This is a complex sentence, it even has a pause!"
+    elif lang == "de":
+        sentence = "Dies ist ein komplexer Satz, er hat sogar eine Pause!"
+    elif lang == "el":
+        sentence = "Αυτή είναι μια σύνθετη πρόταση, έχει ακόμη και παύση!"
+    elif lang == "es":
+        sentence = "Esta es una oración compleja, ¡incluso tiene una pausa!"
+    elif lang == "fi":
+        sentence = "Tämä on monimutkainen lause, sillä on jopa tauko!"
+    elif lang == "ru":
+        sentence = "Это сложное предложение, в нем даже есть пауза!"
+    elif lang == "hu":
+        sentence = "Ez egy összetett mondat, még szünet is van benne!"
+    elif lang == "nl":
+        sentence = "Dit is een complexe zin, er zit zelfs een pauze in!"
+    elif lang == "fr":
+        sentence = "C'est une phrase complexe, elle a même une pause !"
+    phoneme_vector = tf.string_to_tensor(sentence).squeeze(0).to(device)
+    spec, durations, *_ = net.inference(text=phoneme_vector, return_duration_pitch_energy=True)
+    spec = spec.transpose(0, 1).to("cpu").numpy()
+    duration_splits, label_positions = cumsum_durations(durations.cpu().numpy())
+    if not os.path.exists(os.path.join(save_dir, "spec")):
+        os.makedirs(os.path.join(save_dir, "spec"))
+    fig, ax = plt.subplots(nrows=1, ncols=1)
+    lbd.specshow(spec,
+                 ax=ax,
+                 sr=16000,
+                 cmap='GnBu',
+                 y_axis='mel',
+                 x_axis=None,
+                 hop_length=256)
+    ax.yaxis.set_visible(False)
+    ax.set_xticks(duration_splits, minor=True)
+    ax.xaxis.grid(True, which='minor')
+    ax.set_xticks(label_positions, minor=False)
+    ax.set_xticklabels(tf.get_phone_string(sentence))
+    ax.set_title(sentence)
+    plt.savefig(os.path.join(os.path.join(save_dir, "spec"), str(step) + ".png"))
+    plt.clf()
+    plt.close()
+def train_loop(net,
+               train_sentences,
+               device,
+               save_directory,
+               aligner_checkpoint,
+               batch_size=32,
+               steps=300000,
+               epochs_per_save=5,
+               lang="en",
+               lr=0.0001,
+               warmup_steps=4000,
+               path_to_checkpoint=None,
+               fine_tune=False,
+               resume=False):
+    """
+    Args:
+        resume: whether to resume from the most recent checkpoint
+        warmup_steps: how long the learning rate should increase before it reaches the specified value
+        steps: How many steps to train
+        lr: The initial learning rate for the optimiser
+        path_to_checkpoint: reloads a checkpoint to continue training from there
+        fine_tune: whether to load everything from a checkpoint, or only the model parameters
+        lang: language of the synthesis and of the train sentences
+        net: Model to train
+        train_sentences: list of (string) sentences the CTC objective should be learned on
+        device: Device to put the loaded tensors on
+        save_directory: Where to save the checkpoints
+        batch_size: How many elements should be loaded at once
+        epochs_per_save: how many epochs to train in between checkpoints
+    """
+    net = net.to(device)
+    torch.multiprocessing.set_sharing_strategy('file_system')
+    text_to_art_vec = ArticulatoryCombinedTextFrontend(language=lang)
+    asr_aligner = Aligner().to(device)
+    check_dict = torch.load(os.path.join(aligner_checkpoint), map_location=device)
+    asr_aligner.load_state_dict(check_dict["asr_model"])
+    net.stop_gradient_from_energy_predictor = False
+    net.stop_gradient_from_pitch_predictor = False
+    step_counter = 0
+    optimizer = torch.optim.Adam(net.parameters(), lr=lr)
+    scheduler = WarmupScheduler(optimizer, warmup_steps=warmup_steps)
+    scaler = GradScaler()
+    epoch = 0
+    if resume:
+        path_to_checkpoint = get_most_recent_checkpoint(checkpoint_dir=save_directory)
+    if path_to_checkpoint is not None:
+        check_dict = torch.load(path_to_checkpoint, map_location=device)
+        net.load_state_dict(check_dict["model"])
+        if not fine_tune:
+            optimizer.load_state_dict(check_dict["optimizer"])
+            scheduler.load_state_dict(check_dict["scheduler"])
+            step_counter = check_dict["step_counter"]
+            scaler.load_state_dict(check_dict["scaler"])
+    start_time = time.time()
+    while True:
+        net.train()
+        epoch += 1
+        optimizer.zero_grad()
+        train_losses_this_epoch = list()
+        random.shuffle(train_sentences)
+        batch_of_text_vecs = list()
+        batch_of_tokens = list()
+        for sentence in tqdm(train_sentences):
+            if sentence.strip() == "":
+                continue
+            phonemes = text_to_art_vec.get_phone_string(sentence)
+            # collect batch of texts
+            batch_of_text_vecs.append(text_to_art_vec.string_to_tensor(phonemes, input_phonemes=True).squeeze(0).to(device))
+            # collect batch of tokens
+            tokens = list()
+            for phone in phonemes:
+                tokens.append(text_to_art_vec.phone_to_id[phone])
+            tokens = torch.LongTensor(tokens).to(device)
+            batch_of_tokens.append(tokens)
+            if len(batch_of_tokens) == batch_size:
+                token_batch = pad_sequence(batch_of_tokens, batch_first=True)
+                token_lens = torch.LongTensor([len(x) for x in batch_of_tokens]).to(device)
+                text_batch = pad_sequence(batch_of_text_vecs, batch_first=True)
+                spec_batch, d_outs = net.batch_inference(texts=text_batch, text_lens=token_lens)
+                spec_lens = torch.LongTensor([sum(x) for x in d_outs]).to(device)
+                asr_pred = asr_aligner(spec_batch, spec_lens)
+                train_loss = asr_aligner.ctc_loss(asr_pred.transpose(0, 1).log_softmax(2), token_batch, spec_lens, token_lens)
+                train_losses_this_epoch.append(train_loss.item())
+                optimizer.zero_grad()
+                asr_aligner.zero_grad()
+                scaler.scale(train_loss).backward()
+                del train_loss
+                step_counter += 1
+                scaler.unscale_(optimizer)
+                torch.nn.utils.clip_grad_norm_(net.parameters(), 1.0, error_if_nonfinite=False)
+                scaler.step(optimizer)
+                scaler.update()
+                scheduler.step()
+                batch_of_tokens = list()
+                batch_of_text_vecs = list()
+        net.eval()
+        if epoch % epochs_per_save == 0:
+            torch.save({
+                "model"       : net.state_dict(),
+                "optimizer"   : optimizer.state_dict(),
+                "step_counter": step_counter,
+                "scaler"      : scaler.state_dict(),
+                "scheduler"   : scheduler.state_dict(),
+                }, os.path.join(save_directory, "checkpoint_{}.pt".format(step_counter)))
+            delete_old_checkpoints(save_directory, keep=5)
+            with torch.no_grad():
+                plot_progress_spec(net, device, save_dir=save_directory, step=step_counter, lang=lang)
+            if step_counter > steps:
+                # DONE
+                return
+        print("Epoch:        {}".format(epoch))
+        print("Train Loss:   {}".format(sum(train_losses_this_epoch) / len(train_losses_this_epoch)))
+        print("Time elapsed: {} Minutes".format(round((time.time() - start_time) / 60)))
+        print("Steps:        {}".format(step_counter))
+        net.train()

TrainingInterfaces/Text_to_Spectrogram/FastSpeech2/meta_train_loop.py ADDED Viewed

	@@ -0,0 +1,211 @@

+import librosa.display as lbd
+import matplotlib.pyplot as plt
+import torch
+import torch.multiprocessing
+from torch.cuda.amp import GradScaler
+from torch.cuda.amp import autocast
+from torch.nn.utils.rnn import pad_sequence
+from torch.utils.data.dataloader import DataLoader
+from tqdm import tqdm
+from Preprocessing.ArticulatoryCombinedTextFrontend import ArticulatoryCombinedTextFrontend
+from Preprocessing.ArticulatoryCombinedTextFrontend import get_language_id
+from Utility.WarmupScheduler import WarmupScheduler
+from Utility.path_to_transcript_dicts import *
+from Utility.utils import cumsum_durations
+from Utility.utils import delete_old_checkpoints
+from Utility.utils import get_most_recent_checkpoint
+def train_loop(net,
+               datasets,
+               device,
+               save_directory,
+               batch_size,
+               steps,
+               steps_per_checkpoint,
+               lr,
+               path_to_checkpoint,
+               resume=False,
+               warmup_steps=4000):
+    # ============
+    # Preparations
+    # ============
+    net = net.to(device)
+    torch.multiprocessing.set_sharing_strategy('file_system')
+    train_loaders = list()
+    train_iters = list()
+    for dataset in datasets:
+        train_loaders.append(DataLoader(batch_size=batch_size,
+                                        dataset=dataset,
+                                        drop_last=True,
+                                        num_workers=2,
+                                        pin_memory=True,
+                                        shuffle=True,
+                                        prefetch_factor=5,
+                                        collate_fn=collate_and_pad,
+                                        persistent_workers=True))
+        train_iters.append(iter(train_loaders[-1]))
+    default_embeddings = {"en": None, "de": None, "el": None, "es": None, "fi": None, "ru": None, "hu": None, "nl": None, "fr": None}
+    for index, lang in enumerate(["en", "de", "el", "es", "fi", "ru", "hu", "nl", "fr"]):
+        default_embedding = None
+        for datapoint in datasets[index]:
+            if default_embedding is None:
+                default_embedding = datapoint[7].squeeze()
+            else:
+                default_embedding = default_embedding + datapoint[7].squeeze()
+        default_embeddings[lang] = (default_embedding / len(datasets[index])).to(device)
+    optimizer = torch.optim.RAdam(net.parameters(), lr=lr, eps=1.0e-06, weight_decay=0.0)
+    grad_scaler = GradScaler()
+    scheduler = WarmupScheduler(optimizer, warmup_steps=warmup_steps)
+    if resume:
+        previous_checkpoint = get_most_recent_checkpoint(checkpoint_dir=save_directory)
+        if previous_checkpoint is not None:
+            path_to_checkpoint = previous_checkpoint
+        else:
+            raise RuntimeError(f"No checkpoint found that can be resumed from in {save_directory}")
+    step_counter = 0
+    train_losses_total = list()
+    if path_to_checkpoint is not None:
+        check_dict = torch.load(os.path.join(path_to_checkpoint), map_location=device)
+        net.load_state_dict(check_dict["model"])
+        if resume:
+            optimizer.load_state_dict(check_dict["optimizer"])
+            step_counter = check_dict["step_counter"]
+            grad_scaler.load_state_dict(check_dict["scaler"])
+            scheduler.load_state_dict(check_dict["scheduler"])
+            if step_counter > steps:
+                print("Desired steps already reached in loaded checkpoint.")
+                return
+    net.train()
+    # =============================
+    # Actual train loop starts here
+    # =============================
+    for step in tqdm(range(step_counter, steps)):
+        batches = []
+        for index in range(len(datasets)):
+            # we get one batch for each task (i.e. language in this case)
+            try:
+                batch = next(train_iters[index])
+                batches.append(batch)
+            except StopIteration:
+                train_iters[index] = iter(train_loaders[index])
+                batch = next(train_iters[index])
+                batches.append(batch)
+        train_loss = 0.0
+        for batch in batches:
+            with autocast():
+                # we sum the loss for each task, as we would do for the
+                # second order regular MAML, but we do it only over one
+                # step (i.e. iterations of inner loop = 1)
+                train_loss = train_loss + net(text_tensors=batch[0].to(device),
+                                              text_lengths=batch[1].to(device),
+                                              gold_speech=batch[2].to(device),
+                                              speech_lengths=batch[3].to(device),
+                                              gold_durations=batch[4].to(device),
+                                              gold_pitch=batch[6].to(device),  # mind the switched order
+                                              gold_energy=batch[5].to(device),  # mind the switched order
+                                              utterance_embedding=batch[7].to(device),
+                                              lang_ids=batch[8].to(device),
+                                              return_mels=False)
+        # then we directly update our meta-parameters without
+        # the need for any task specific parameters
+        train_losses_total.append(train_loss.item())
+        optimizer.zero_grad()
+        grad_scaler.scale(train_loss).backward()
+        grad_scaler.unscale_(optimizer)
+        torch.nn.utils.clip_grad_norm_(net.parameters(), 1.0, error_if_nonfinite=False)
+        grad_scaler.step(optimizer)
+        grad_scaler.update()
+        scheduler.step()
+        if step % steps_per_checkpoint == 0:
+            # ==============================
+            # Enough steps for some insights
+            # ==============================
+            net.eval()
+            print(f"Total Loss: {round(sum(train_losses_total) / len(train_losses_total), 3)}")
+            train_losses_total = list()
+            torch.save({
+                "model"       : net.state_dict(),
+                "optimizer"   : optimizer.state_dict(),
+                "scaler"      : grad_scaler.state_dict(),
+                "scheduler"   : scheduler.state_dict(),
+                "step_counter": step,
+                "default_emb" : default_embeddings["en"]
+                },
+                os.path.join(save_directory, "checkpoint_{}.pt".format(step)))
+            delete_old_checkpoints(save_directory, keep=5)
+            for lang in ["en", "de", "el", "es", "fi", "ru", "hu", "nl", "fr"]:
+                plot_progress_spec(net=net,
+                                   device=device,
+                                   lang=lang,
+                                   save_dir=save_directory,
+                                   step=step,
+                                   utt_embeds=default_embeddings)
+            net.train()
+@torch.inference_mode()
+def plot_progress_spec(net, device, save_dir, step, lang, utt_embeds):
+    tf = ArticulatoryCombinedTextFrontend(language=lang)
+    sentence = ""
+    default_embed = utt_embeds[lang]
+    if lang == "en":
+        sentence = "This is a complex sentence, it even has a pause!"
+    elif lang == "de":
+        sentence = "Dies ist ein komplexer Satz, er hat sogar eine Pause!"
+    elif lang == "el":
+        sentence = "Αυτή είναι μια σύνθετη πρόταση, έχει ακόμη και παύση!"
+    elif lang == "es":
+        sentence = "Esta es una oración compleja, ¡incluso tiene una pausa!"
+    elif lang == "fi":
+        sentence = "Tämä on monimutkainen lause, sillä on jopa tauko!"
+    elif lang == "ru":
+        sentence = "Это сложное предложение, в нем даже есть пауза!"
+    elif lang == "hu":
+        sentence = "Ez egy összetett mondat, még szünet is van benne!"
+    elif lang == "nl":
+        sentence = "Dit is een complexe zin, er zit zelfs een pauze in!"
+    elif lang == "fr":
+        sentence = "C'est une phrase complexe, elle a même une pause !"
+    phoneme_vector = tf.string_to_tensor(sentence).squeeze(0).to(device)
+    spec, durations, *_ = net.inference(text=phoneme_vector,
+                                        return_duration_pitch_energy=True,
+                                        utterance_embedding=default_embed,
+                                        lang_id=get_language_id(lang).to(device))
+    spec = spec.transpose(0, 1).to("cpu").numpy()
+    duration_splits, label_positions = cumsum_durations(durations.cpu().numpy())
+    if not os.path.exists(os.path.join(save_dir, "spec")):
+        os.makedirs(os.path.join(save_dir, "spec"))
+    fig, ax = plt.subplots(nrows=1, ncols=1)
+    lbd.specshow(spec,
+                 ax=ax,
+                 sr=16000,
+                 cmap='GnBu',
+                 y_axis='mel',
+                 x_axis=None,
+                 hop_length=256)
+    ax.yaxis.set_visible(False)
+    ax.set_xticks(duration_splits, minor=True)
+    ax.xaxis.grid(True, which='minor')
+    ax.set_xticks(label_positions, minor=False)
+    ax.set_xticklabels(tf.get_phone_string(sentence))
+    ax.set_title(sentence)
+    plt.savefig(os.path.join(os.path.join(save_dir, "spec"), f"{step}_{lang}.png"))
+    plt.clf()
+    plt.close()
+def collate_and_pad(batch):
+    # text, text_len, speech, speech_len, durations, energy, pitch, utterance condition, language_id
+    return (pad_sequence([datapoint[0] for datapoint in batch], batch_first=True),
+            torch.stack([datapoint[1] for datapoint in batch]).squeeze(1),
+            pad_sequence([datapoint[2] for datapoint in batch], batch_first=True),
+            torch.stack([datapoint[3] for datapoint in batch]).squeeze(1),
+            pad_sequence([datapoint[4] for datapoint in batch], batch_first=True),
+            pad_sequence([datapoint[5] for datapoint in batch], batch_first=True),
+            pad_sequence([datapoint[6] for datapoint in batch], batch_first=True),
+            torch.stack([datapoint[7] for datapoint in batch]).squeeze(),
+            torch.stack([datapoint[8] for datapoint in batch]))

TrainingInterfaces/Text_to_Spectrogram/__init__.py ADDED Viewed

File without changes

TrainingInterfaces/__init__.py ADDED Viewed

File without changes