diff --git a/InferenceInterfaces/InferenceArchitectures/InferenceFastSpeech2.py b/InferenceInterfaces/InferenceArchitectures/InferenceFastSpeech2.py new file mode 100644 index 0000000000000000000000000000000000000000..683f0873a6bfa800929586724c0bb21dc126f0dd --- /dev/null +++ b/InferenceInterfaces/InferenceArchitectures/InferenceFastSpeech2.py @@ -0,0 +1,256 @@ +from abc import ABC + +import torch + +from Layers.Conformer import Conformer +from Layers.DurationPredictor import DurationPredictor +from Layers.LengthRegulator import LengthRegulator +from Layers.PostNet import PostNet +from Layers.VariancePredictor import VariancePredictor +from Utility.utils import make_non_pad_mask +from Utility.utils import make_pad_mask + + +class FastSpeech2(torch.nn.Module, ABC): + + def __init__(self, # network structure related + weights, + idim=66, + odim=80, + adim=384, + aheads=4, + elayers=6, + eunits=1536, + dlayers=6, + dunits=1536, + postnet_layers=5, + postnet_chans=256, + postnet_filts=5, + positionwise_conv_kernel_size=1, + use_scaled_pos_enc=True, + use_batch_norm=True, + encoder_normalize_before=True, + decoder_normalize_before=True, + encoder_concat_after=False, + decoder_concat_after=False, + reduction_factor=1, + # encoder / decoder + use_macaron_style_in_conformer=True, + use_cnn_in_conformer=True, + conformer_enc_kernel_size=7, + conformer_dec_kernel_size=31, + # duration predictor + duration_predictor_layers=2, + duration_predictor_chans=256, + duration_predictor_kernel_size=3, + # energy predictor + energy_predictor_layers=2, + energy_predictor_chans=256, + energy_predictor_kernel_size=3, + energy_predictor_dropout=0.5, + energy_embed_kernel_size=1, + energy_embed_dropout=0.0, + stop_gradient_from_energy_predictor=True, + # pitch predictor + pitch_predictor_layers=5, + pitch_predictor_chans=256, + pitch_predictor_kernel_size=5, + pitch_predictor_dropout=0.5, + pitch_embed_kernel_size=1, + pitch_embed_dropout=0.0, + stop_gradient_from_pitch_predictor=True, + # training related + transformer_enc_dropout_rate=0.2, + transformer_enc_positional_dropout_rate=0.2, + transformer_enc_attn_dropout_rate=0.2, + transformer_dec_dropout_rate=0.2, + transformer_dec_positional_dropout_rate=0.2, + transformer_dec_attn_dropout_rate=0.2, + duration_predictor_dropout_rate=0.2, + postnet_dropout_rate=0.5, + # additional features + utt_embed_dim=704, + connect_utt_emb_at_encoder_out=True, + lang_embs=100): + super().__init__() + self.idim = idim + self.odim = odim + self.reduction_factor = reduction_factor + self.stop_gradient_from_pitch_predictor = stop_gradient_from_pitch_predictor + self.stop_gradient_from_energy_predictor = stop_gradient_from_energy_predictor + self.use_scaled_pos_enc = use_scaled_pos_enc + embed = torch.nn.Sequential(torch.nn.Linear(idim, 100), + torch.nn.Tanh(), + torch.nn.Linear(100, adim)) + self.encoder = Conformer(idim=idim, attention_dim=adim, attention_heads=aheads, linear_units=eunits, num_blocks=elayers, + input_layer=embed, dropout_rate=transformer_enc_dropout_rate, + positional_dropout_rate=transformer_enc_positional_dropout_rate, attention_dropout_rate=transformer_enc_attn_dropout_rate, + normalize_before=encoder_normalize_before, concat_after=encoder_concat_after, + positionwise_conv_kernel_size=positionwise_conv_kernel_size, macaron_style=use_macaron_style_in_conformer, + use_cnn_module=use_cnn_in_conformer, cnn_module_kernel=conformer_enc_kernel_size, zero_triu=False, + utt_embed=utt_embed_dim, connect_utt_emb_at_encoder_out=connect_utt_emb_at_encoder_out, 
lang_embs=lang_embs) + self.duration_predictor = DurationPredictor(idim=adim, n_layers=duration_predictor_layers, + n_chans=duration_predictor_chans, + kernel_size=duration_predictor_kernel_size, + dropout_rate=duration_predictor_dropout_rate, ) + self.pitch_predictor = VariancePredictor(idim=adim, n_layers=pitch_predictor_layers, + n_chans=pitch_predictor_chans, + kernel_size=pitch_predictor_kernel_size, + dropout_rate=pitch_predictor_dropout) + self.pitch_embed = torch.nn.Sequential(torch.nn.Conv1d(in_channels=1, out_channels=adim, + kernel_size=pitch_embed_kernel_size, + padding=(pitch_embed_kernel_size - 1) // 2), + torch.nn.Dropout(pitch_embed_dropout)) + self.energy_predictor = VariancePredictor(idim=adim, n_layers=energy_predictor_layers, + n_chans=energy_predictor_chans, + kernel_size=energy_predictor_kernel_size, + dropout_rate=energy_predictor_dropout) + self.energy_embed = torch.nn.Sequential(torch.nn.Conv1d(in_channels=1, out_channels=adim, + kernel_size=energy_embed_kernel_size, + padding=(energy_embed_kernel_size - 1) // 2), + torch.nn.Dropout(energy_embed_dropout)) + self.length_regulator = LengthRegulator() + self.decoder = Conformer(idim=0, + attention_dim=adim, + attention_heads=aheads, + linear_units=dunits, + num_blocks=dlayers, + input_layer=None, + dropout_rate=transformer_dec_dropout_rate, + positional_dropout_rate=transformer_dec_positional_dropout_rate, + attention_dropout_rate=transformer_dec_attn_dropout_rate, + normalize_before=decoder_normalize_before, + concat_after=decoder_concat_after, + positionwise_conv_kernel_size=positionwise_conv_kernel_size, + macaron_style=use_macaron_style_in_conformer, + use_cnn_module=use_cnn_in_conformer, + cnn_module_kernel=conformer_dec_kernel_size) + self.feat_out = torch.nn.Linear(adim, odim * reduction_factor) + self.postnet = PostNet(idim=idim, + odim=odim, + n_layers=postnet_layers, + n_chans=postnet_chans, + n_filts=postnet_filts, + use_batch_norm=use_batch_norm, + dropout_rate=postnet_dropout_rate) + self.load_state_dict(weights) + + def _forward(self, text_tensors, text_lens, gold_speech=None, speech_lens=None, + gold_durations=None, gold_pitch=None, gold_energy=None, + is_inference=False, alpha=1.0, utterance_embedding=None, lang_ids=None): + # forward encoder + text_masks = self._source_mask(text_lens) + + encoded_texts, _ = self.encoder(text_tensors, text_masks, utterance_embedding=utterance_embedding, lang_ids=lang_ids) # (B, Tmax, adim) + + # forward duration predictor and variance predictors + duration_masks = make_pad_mask(text_lens, device=text_lens.device) + + if self.stop_gradient_from_pitch_predictor: + pitch_predictions = self.pitch_predictor(encoded_texts.detach(), duration_masks.unsqueeze(-1)) + else: + pitch_predictions = self.pitch_predictor(encoded_texts, duration_masks.unsqueeze(-1)) + + if self.stop_gradient_from_energy_predictor: + energy_predictions = self.energy_predictor(encoded_texts.detach(), duration_masks.unsqueeze(-1)) + else: + energy_predictions = self.energy_predictor(encoded_texts, duration_masks.unsqueeze(-1)) + + if is_inference: + if gold_durations is not None: + duration_predictions = gold_durations + else: + duration_predictions = self.duration_predictor.inference(encoded_texts, duration_masks) + if gold_pitch is not None: + pitch_predictions = gold_pitch + if gold_energy is not None: + energy_predictions = gold_energy + pitch_embeddings = self.pitch_embed(pitch_predictions.transpose(1, 2)).transpose(1, 2) + energy_embeddings = self.energy_embed(energy_predictions.transpose(1, 
2)).transpose(1, 2) + encoded_texts = encoded_texts + energy_embeddings + pitch_embeddings + encoded_texts = self.length_regulator(encoded_texts, duration_predictions, alpha) + else: + duration_predictions = self.duration_predictor(encoded_texts, duration_masks) + + # use groundtruth in training + pitch_embeddings = self.pitch_embed(gold_pitch.transpose(1, 2)).transpose(1, 2) + energy_embeddings = self.energy_embed(gold_energy.transpose(1, 2)).transpose(1, 2) + encoded_texts = encoded_texts + energy_embeddings + pitch_embeddings + encoded_texts = self.length_regulator(encoded_texts, gold_durations) # (B, Lmax, adim) + + # forward decoder + if speech_lens is not None and not is_inference: + if self.reduction_factor > 1: + olens_in = speech_lens.new([olen // self.reduction_factor for olen in speech_lens]) + else: + olens_in = speech_lens + h_masks = self._source_mask(olens_in) + else: + h_masks = None + zs, _ = self.decoder(encoded_texts, h_masks) # (B, Lmax, adim) + before_outs = self.feat_out(zs).view(zs.size(0), -1, self.odim) # (B, Lmax, odim) + + # postnet -> (B, Lmax//r * r, odim) + after_outs = before_outs + self.postnet(before_outs.transpose(1, 2)).transpose(1, 2) + + return before_outs, after_outs, duration_predictions, pitch_predictions, energy_predictions + + @torch.no_grad() + def forward(self, + text, + speech=None, + durations=None, + pitch=None, + energy=None, + utterance_embedding=None, + return_duration_pitch_energy=False, + lang_id=None): + """ + Generate the sequence of features given the sequences of characters. + + Args: + text: Input sequence of characters + speech: Feature sequence to extract style + durations: Groundtruth of duration + pitch: Groundtruth of token-averaged pitch + energy: Groundtruth of token-averaged energy + return_duration_pitch_energy: whether to return the list of predicted durations for nicer plotting + utterance_embedding: embedding of utterance wide parameters + + Returns: + Mel Spectrogram + + """ + self.eval() + # setup batch axis + ilens = torch.tensor([text.shape[0]], dtype=torch.long, device=text.device) + if speech is not None: + gold_speech = speech.unsqueeze(0) + else: + gold_speech = None + if durations is not None: + durations = durations.unsqueeze(0) + if pitch is not None: + pitch = pitch.unsqueeze(0) + if energy is not None: + energy = energy.unsqueeze(0) + if lang_id is not None: + lang_id = lang_id.unsqueeze(0) + + before_outs, after_outs, d_outs, pitch_predictions, energy_predictions = self._forward(text.unsqueeze(0), + ilens, + gold_speech=gold_speech, + gold_durations=durations, + is_inference=True, + gold_pitch=pitch, + gold_energy=energy, + utterance_embedding=utterance_embedding.unsqueeze(0), + lang_ids=lang_id) + self.train() + if return_duration_pitch_energy: + return after_outs[0], d_outs[0], pitch_predictions[0], energy_predictions[0] + return after_outs[0] + + def _source_mask(self, ilens): + x_masks = make_non_pad_mask(ilens).to(next(self.parameters()).device) + return x_masks.unsqueeze(-2) diff --git a/InferenceInterfaces/InferenceArchitectures/InferenceHiFiGAN.py b/InferenceInterfaces/InferenceArchitectures/InferenceHiFiGAN.py new file mode 100644 index 0000000000000000000000000000000000000000..056b970b12d3c536a604e95aa9736d74cdf3e4fd --- /dev/null +++ b/InferenceInterfaces/InferenceArchitectures/InferenceHiFiGAN.py @@ -0,0 +1,91 @@ +import torch + +from Layers.ResidualBlock import HiFiGANResidualBlock as ResidualBlock + + +class HiFiGANGenerator(torch.nn.Module): + + def __init__(self, + path_to_weights, + 
in_channels=80, + out_channels=1, + channels=512, + kernel_size=7, + upsample_scales=(8, 6, 4, 4), + upsample_kernel_sizes=(16, 12, 8, 8), + resblock_kernel_sizes=(3, 7, 11), + resblock_dilations=[(1, 3, 5), (1, 3, 5), (1, 3, 5)], + use_additional_convs=True, + bias=True, + nonlinear_activation="LeakyReLU", + nonlinear_activation_params={"negative_slope": 0.1}, + use_weight_norm=True, ): + super().__init__() + assert kernel_size % 2 == 1, "Kernal size must be odd number." + assert len(upsample_scales) == len(upsample_kernel_sizes) + assert len(resblock_dilations) == len(resblock_kernel_sizes) + self.num_upsamples = len(upsample_kernel_sizes) + self.num_blocks = len(resblock_kernel_sizes) + self.input_conv = torch.nn.Conv1d(in_channels, + channels, + kernel_size, + 1, + padding=(kernel_size - 1) // 2, ) + self.upsamples = torch.nn.ModuleList() + self.blocks = torch.nn.ModuleList() + for i in range(len(upsample_kernel_sizes)): + self.upsamples += [ + torch.nn.Sequential(getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params), + torch.nn.ConvTranspose1d(channels // (2 ** i), + channels // (2 ** (i + 1)), + upsample_kernel_sizes[i], + upsample_scales[i], + padding=(upsample_kernel_sizes[i] - upsample_scales[i]) // 2, ), )] + for j in range(len(resblock_kernel_sizes)): + self.blocks += [ResidualBlock(kernel_size=resblock_kernel_sizes[j], + channels=channels // (2 ** (i + 1)), + dilations=resblock_dilations[j], + bias=bias, + use_additional_convs=use_additional_convs, + nonlinear_activation=nonlinear_activation, + nonlinear_activation_params=nonlinear_activation_params, )] + self.output_conv = torch.nn.Sequential( + torch.nn.LeakyReLU(), + torch.nn.Conv1d(channels // (2 ** (i + 1)), + out_channels, + kernel_size, + 1, + padding=(kernel_size - 1) // 2, ), + torch.nn.Tanh(), ) + if use_weight_norm: + self.apply_weight_norm() + self.load_state_dict(torch.load(path_to_weights, map_location='cpu')["generator"]) + + def forward(self, c, normalize_before=False): + if normalize_before: + c = (c - self.mean) / self.scale + c = self.input_conv(c.unsqueeze(0)) + for i in range(self.num_upsamples): + c = self.upsamples[i](c) + cs = 0.0 # initialize + for j in range(self.num_blocks): + cs = cs + self.blocks[i * self.num_blocks + j](c) + c = cs / self.num_blocks + c = self.output_conv(c) + return c.squeeze(0).squeeze(0) + + def remove_weight_norm(self): + def _remove_weight_norm(m): + try: + torch.nn.utils.remove_weight_norm(m) + except ValueError: + return + + self.apply(_remove_weight_norm) + + def apply_weight_norm(self): + def _apply_weight_norm(m): + if isinstance(m, torch.nn.Conv1d) or isinstance(m, torch.nn.ConvTranspose1d): + torch.nn.utils.weight_norm(m) + + self.apply(_apply_weight_norm) diff --git a/InferenceInterfaces/InferenceArchitectures/__init__.py b/InferenceInterfaces/InferenceArchitectures/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/InferenceInterfaces/Meta_FastSpeech2.py b/InferenceInterfaces/Meta_FastSpeech2.py new file mode 100644 index 0000000000000000000000000000000000000000..295e8aaf4253f2df0b724e207c3f12719e842a82 --- /dev/null +++ b/InferenceInterfaces/Meta_FastSpeech2.py @@ -0,0 +1,76 @@ +import os + +import librosa.display as lbd +import matplotlib.pyplot as plt +import soundfile +import torch + +from InferenceInterfaces.InferenceArchitectures.InferenceFastSpeech2 import FastSpeech2 +from InferenceInterfaces.InferenceArchitectures.InferenceHiFiGAN import HiFiGANGenerator +from 
Preprocessing.ArticulatoryCombinedTextFrontend import ArticulatoryCombinedTextFrontend +from Preprocessing.ArticulatoryCombinedTextFrontend import get_language_id +from Preprocessing.ProsodicConditionExtractor import ProsodicConditionExtractor + + +class Meta_FastSpeech2(torch.nn.Module): + + def __init__(self, device="cpu"): + super().__init__() + model_name = "Meta" + language = "en" + self.device = device + self.text2phone = ArticulatoryCombinedTextFrontend(language=language, add_silence_to_end=True) + checkpoint = torch.load(os.path.join("Models", f"FastSpeech2_{model_name}", "best.pt"), map_location='cpu') + self.phone2mel = FastSpeech2(weights=checkpoint["model"]).to(torch.device(device)) + self.mel2wav = HiFiGANGenerator(path_to_weights=os.path.join("Models", "HiFiGAN_combined", "best.pt")).to(torch.device(device)) + self.default_utterance_embedding = checkpoint["default_emb"].to(self.device) + self.phone2mel.eval() + self.mel2wav.eval() + self.lang_id = get_language_id(language) + self.to(torch.device(device)) + + def set_utterance_embedding(self, path_to_reference_audio): + wave, sr = soundfile.read(path_to_reference_audio) + self.default_utterance_embedding = ProsodicConditionExtractor(sr=sr).extract_condition_from_reference_wave(wave).to(self.device) + + def set_language(self, lang_id): + """ + The id parameter actually refers to the shorthand. This has become ambiguous with the introduction of the actual language IDs + """ + self.text2phone = ArticulatoryCombinedTextFrontend(language=lang_id, add_silence_to_end=True, silent=True) + self.lang_id = get_language_id(lang_id).to(self.device) + + def forward(self, text, view=False, durations=None, pitch=None, energy=None): + with torch.no_grad(): + phones = self.text2phone.string_to_tensor(text).to(torch.device(self.device)) + mel, durations, pitch, energy = self.phone2mel(phones, + return_duration_pitch_energy=True, + utterance_embedding=self.default_utterance_embedding, + durations=durations, + pitch=pitch, + energy=energy, + lang_id=self.lang_id) + mel = mel.transpose(0, 1) + wave = self.mel2wav(mel) + if view: + from Utility.utils import cumsum_durations + fig, ax = plt.subplots(nrows=2, ncols=1) + ax[0].plot(wave.cpu().numpy()) + lbd.specshow(mel.cpu().numpy(), + ax=ax[1], + sr=16000, + cmap='GnBu', + y_axis='mel', + x_axis=None, + hop_length=256) + ax[0].yaxis.set_visible(False) + ax[1].yaxis.set_visible(False) + duration_splits, label_positions = cumsum_durations(durations.cpu().numpy()) + ax[1].set_xticks(duration_splits, minor=True) + ax[1].xaxis.grid(True, which='minor') + ax[1].set_xticks(label_positions, minor=False) + ax[1].set_xticklabels(self.text2phone.get_phone_string(text)) + ax[0].set_title(text) + plt.subplots_adjust(left=0.05, bottom=0.1, right=0.95, top=.9, wspace=0.0, hspace=0.0) + plt.show() + return wave diff --git a/InferenceInterfaces/__init__.py b/InferenceInterfaces/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Layers/Attention.py b/Layers/Attention.py new file mode 100644 index 0000000000000000000000000000000000000000..eb241e315de718099901a075feae2ed0e31c7347 --- /dev/null +++ b/Layers/Attention.py @@ -0,0 +1,324 @@ +# Written by Shigeki Karita, 2019 +# Published under Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +# Adapted by Florian Lux, 2021 + +"""Multi-Head Attention layer definition.""" + +import math + +import numpy +import torch +from torch import nn + +from Utility.utils import make_non_pad_mask + 
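+# Rough shape reference for the attention modules defined below
+# (illustrative comment only, not executed as part of this module):
+#
+#     mha = MultiHeadedAttention(n_head=4, n_feat=256, dropout_rate=0.1)
+#     q = k = v = torch.randn(2, 50, 256)            # (batch, time, n_feat)
+#     mask = torch.ones(2, 1, 50, dtype=torch.bool)  # non-padded positions
+#     out = mha(q, k, v, mask)                       # (batch, time, n_feat)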
+ +class MultiHeadedAttention(nn.Module): + """ + Multi-Head Attention layer. + + Args: + n_head (int): The number of heads. + n_feat (int): The number of features. + dropout_rate (float): Dropout rate. + """ + + def __init__(self, n_head, n_feat, dropout_rate): + """ + Construct an MultiHeadedAttention object. + """ + super(MultiHeadedAttention, self).__init__() + assert n_feat % n_head == 0 + # We assume d_v always equals d_k + self.d_k = n_feat // n_head + self.h = n_head + self.linear_q = nn.Linear(n_feat, n_feat) + self.linear_k = nn.Linear(n_feat, n_feat) + self.linear_v = nn.Linear(n_feat, n_feat) + self.linear_out = nn.Linear(n_feat, n_feat) + self.attn = None + self.dropout = nn.Dropout(p=dropout_rate) + + def forward_qkv(self, query, key, value): + """ + Transform query, key and value. + + Args: + query (torch.Tensor): Query tensor (#batch, time1, size). + key (torch.Tensor): Key tensor (#batch, time2, size). + value (torch.Tensor): Value tensor (#batch, time2, size). + + Returns: + torch.Tensor: Transformed query tensor (#batch, n_head, time1, d_k). + torch.Tensor: Transformed key tensor (#batch, n_head, time2, d_k). + torch.Tensor: Transformed value tensor (#batch, n_head, time2, d_k). + """ + n_batch = query.size(0) + q = self.linear_q(query).view(n_batch, -1, self.h, self.d_k) + k = self.linear_k(key).view(n_batch, -1, self.h, self.d_k) + v = self.linear_v(value).view(n_batch, -1, self.h, self.d_k) + q = q.transpose(1, 2) # (batch, head, time1, d_k) + k = k.transpose(1, 2) # (batch, head, time2, d_k) + v = v.transpose(1, 2) # (batch, head, time2, d_k) + + return q, k, v + + def forward_attention(self, value, scores, mask): + """ + Compute attention context vector. + + Args: + value (torch.Tensor): Transformed value (#batch, n_head, time2, d_k). + scores (torch.Tensor): Attention score (#batch, n_head, time1, time2). + mask (torch.Tensor): Mask (#batch, 1, time2) or (#batch, time1, time2). + + Returns: + torch.Tensor: Transformed value (#batch, time1, d_model) + weighted by the attention score (#batch, time1, time2). + """ + n_batch = value.size(0) + if mask is not None: + mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2) + min_value = float(numpy.finfo(torch.tensor(0, dtype=scores.dtype).numpy().dtype).min) + scores = scores.masked_fill(mask, min_value) + self.attn = torch.softmax(scores, dim=-1).masked_fill(mask, 0.0) # (batch, head, time1, time2) + else: + self.attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) + + p_attn = self.dropout(self.attn) + x = torch.matmul(p_attn, value) # (batch, head, time1, d_k) + x = (x.transpose(1, 2).contiguous().view(n_batch, -1, self.h * self.d_k)) # (batch, time1, d_model) + + return self.linear_out(x) # (batch, time1, d_model) + + def forward(self, query, key, value, mask): + """ + Compute scaled dot product attention. + + Args: + query (torch.Tensor): Query tensor (#batch, time1, size). + key (torch.Tensor): Key tensor (#batch, time2, size). + value (torch.Tensor): Value tensor (#batch, time2, size). + mask (torch.Tensor): Mask tensor (#batch, 1, time2) or + (#batch, time1, time2). + + Returns: + torch.Tensor: Output tensor (#batch, time1, d_model). + """ + q, k, v = self.forward_qkv(query, key, value) + scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k) + return self.forward_attention(v, scores, mask) + + +class RelPositionMultiHeadedAttention(MultiHeadedAttention): + """ + Multi-Head Attention layer with relative position encoding. 
+ Details can be found in https://github.com/espnet/espnet/pull/2816. + Paper: https://arxiv.org/abs/1901.02860 + Args: + n_head (int): The number of heads. + n_feat (int): The number of features. + dropout_rate (float): Dropout rate. + zero_triu (bool): Whether to zero the upper triangular part of attention matrix. + """ + + def __init__(self, n_head, n_feat, dropout_rate, zero_triu=False): + """Construct an RelPositionMultiHeadedAttention object.""" + super().__init__(n_head, n_feat, dropout_rate) + self.zero_triu = zero_triu + # linear transformation for positional encoding + self.linear_pos = nn.Linear(n_feat, n_feat, bias=False) + # these two learnable bias are used in matrix c and matrix d + # as described in https://arxiv.org/abs/1901.02860 Section 3.3 + self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k)) + self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k)) + torch.nn.init.xavier_uniform_(self.pos_bias_u) + torch.nn.init.xavier_uniform_(self.pos_bias_v) + + def rel_shift(self, x): + """ + Compute relative positional encoding. + Args: + x (torch.Tensor): Input tensor (batch, head, time1, 2*time1-1). + time1 means the length of query vector. + Returns: + torch.Tensor: Output tensor. + """ + zero_pad = torch.zeros((*x.size()[:3], 1), device=x.device, dtype=x.dtype) + x_padded = torch.cat([zero_pad, x], dim=-1) + + x_padded = x_padded.view(*x.size()[:2], x.size(3) + 1, x.size(2)) + x = x_padded[:, :, 1:].view_as(x)[:, :, :, : x.size(-1) // 2 + 1] # only keep the positions from 0 to time2 + + if self.zero_triu: + ones = torch.ones((x.size(2), x.size(3)), device=x.device) + x = x * torch.tril(ones, x.size(3) - x.size(2))[None, None, :, :] + + return x + + def forward(self, query, key, value, pos_emb, mask): + """ + Compute 'Scaled Dot Product Attention' with rel. positional encoding. + Args: + query (torch.Tensor): Query tensor (#batch, time1, size). + key (torch.Tensor): Key tensor (#batch, time2, size). + value (torch.Tensor): Value tensor (#batch, time2, size). + pos_emb (torch.Tensor): Positional embedding tensor + (#batch, 2*time1-1, size). + mask (torch.Tensor): Mask tensor (#batch, 1, time2) or + (#batch, time1, time2). + Returns: + torch.Tensor: Output tensor (#batch, time1, d_model). + """ + q, k, v = self.forward_qkv(query, key, value) + q = q.transpose(1, 2) # (batch, time1, head, d_k) + + n_batch_pos = pos_emb.size(0) + p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k) + p = p.transpose(1, 2) # (batch, head, 2*time1-1, d_k) + + # (batch, head, time1, d_k) + q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2) + # (batch, head, time1, d_k) + q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2) + + # compute attention score + # first compute matrix a and matrix c + # as described in https://arxiv.org/abs/1901.02860 Section 3.3 + # (batch, head, time1, time2) + matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1)) + + # compute matrix b and matrix d + # (batch, head, time1, 2*time1-1) + matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1)) + matrix_bd = self.rel_shift(matrix_bd) + + scores = (matrix_ac + matrix_bd) / math.sqrt(self.d_k) # (batch, head, time1, time2) + + return self.forward_attention(v, scores, mask) + + +class GuidedAttentionLoss(torch.nn.Module): + """ + Guided attention loss function module. 
+ + This module calculates the guided attention loss described + in `Efficiently Trainable Text-to-Speech System Based + on Deep Convolutional Networks with Guided Attention`_, + which forces the attention to be diagonal. + + .. _`Efficiently Trainable Text-to-Speech System + Based on Deep Convolutional Networks with Guided Attention`: + https://arxiv.org/abs/1710.08969 + """ + + def __init__(self, sigma=0.4, alpha=1.0): + """ + Initialize guided attention loss module. + + Args: + sigma (float, optional): Standard deviation to control + how close attention to a diagonal. + alpha (float, optional): Scaling coefficient (lambda). + reset_always (bool, optional): Whether to always reset masks. + """ + super(GuidedAttentionLoss, self).__init__() + self.sigma = sigma + self.alpha = alpha + self.guided_attn_masks = None + self.masks = None + + def _reset_masks(self): + self.guided_attn_masks = None + self.masks = None + + def forward(self, att_ws, ilens, olens): + """ + Calculate forward propagation. + + Args: + att_ws (Tensor): Batch of attention weights (B, T_max_out, T_max_in). + ilens (LongTensor): Batch of input lenghts (B,). + olens (LongTensor): Batch of output lenghts (B,). + + Returns: + Tensor: Guided attention loss value. + """ + self._reset_masks() + self.guided_attn_masks = self._make_guided_attention_masks(ilens, olens).to(att_ws.device) + self.masks = self._make_masks(ilens, olens).to(att_ws.device) + losses = self.guided_attn_masks * att_ws + loss = torch.mean(losses.masked_select(self.masks)) + self._reset_masks() + return self.alpha * loss + + def _make_guided_attention_masks(self, ilens, olens): + n_batches = len(ilens) + max_ilen = max(ilens) + max_olen = max(olens) + guided_attn_masks = torch.zeros((n_batches, max_olen, max_ilen), device=ilens.device) + for idx, (ilen, olen) in enumerate(zip(ilens, olens)): + guided_attn_masks[idx, :olen, :ilen] = self._make_guided_attention_mask(ilen, olen, self.sigma) + return guided_attn_masks + + @staticmethod + def _make_guided_attention_mask(ilen, olen, sigma): + """ + Make guided attention mask. + """ + grid_x, grid_y = torch.meshgrid(torch.arange(olen, device=olen.device).float(), torch.arange(ilen, device=ilen.device).float()) + return 1.0 - torch.exp(-((grid_y / ilen - grid_x / olen) ** 2) / (2 * (sigma ** 2))) + + @staticmethod + def _make_masks(ilens, olens): + """ + Make masks indicating non-padded part. + + Args: + ilens (LongTensor or List): Batch of lengths (B,). + olens (LongTensor or List): Batch of lengths (B,). + + Returns: + Tensor: Mask tensor indicating non-padded part. + dtype=torch.uint8 in PyTorch 1.2- + dtype=torch.bool in PyTorch 1.2+ (including 1.2) + """ + in_masks = make_non_pad_mask(ilens, device=ilens.device) # (B, T_in) + out_masks = make_non_pad_mask(olens, device=olens.device) # (B, T_out) + return out_masks.unsqueeze(-1) & in_masks.unsqueeze(-2) # (B, T_out, T_in) + + +class GuidedMultiHeadAttentionLoss(GuidedAttentionLoss): + """ + Guided attention loss function module for multi head attention. + + Args: + sigma (float, optional): Standard deviation to control + how close attention to a diagonal. + alpha (float, optional): Scaling coefficient (lambda). + reset_always (bool, optional): Whether to always reset masks. + """ + + def forward(self, att_ws, ilens, olens): + """ + Calculate forward propagation. + + Args: + att_ws (Tensor): + Batch of multi head attention weights (B, H, T_max_out, T_max_in). + ilens (LongTensor): Batch of input lenghts (B,). + olens (LongTensor): Batch of output lenghts (B,). 
+ + Returns: + Tensor: Guided attention loss value. + """ + if self.guided_attn_masks is None: + self.guided_attn_masks = (self._make_guided_attention_masks(ilens, olens).to(att_ws.device).unsqueeze(1)) + if self.masks is None: + self.masks = self._make_masks(ilens, olens).to(att_ws.device).unsqueeze(1) + losses = self.guided_attn_masks * att_ws + loss = torch.mean(losses.masked_select(self.masks)) + if self.reset_always: + self._reset_masks() + + return self.alpha * loss diff --git a/Layers/Conformer.py b/Layers/Conformer.py new file mode 100644 index 0000000000000000000000000000000000000000..5ca87bfbf18bcfb84830501dc3d00e3a38916966 --- /dev/null +++ b/Layers/Conformer.py @@ -0,0 +1,144 @@ +""" +Taken from ESPNet +""" + +import torch +import torch.nn.functional as F + +from Layers.Attention import RelPositionMultiHeadedAttention +from Layers.Convolution import ConvolutionModule +from Layers.EncoderLayer import EncoderLayer +from Layers.LayerNorm import LayerNorm +from Layers.MultiLayeredConv1d import MultiLayeredConv1d +from Layers.MultiSequential import repeat +from Layers.PositionalEncoding import RelPositionalEncoding +from Layers.Swish import Swish + + +class Conformer(torch.nn.Module): + """ + Conformer encoder module. + + Args: + idim (int): Input dimension. + attention_dim (int): Dimension of attention. + attention_heads (int): The number of heads of multi head attention. + linear_units (int): The number of units of position-wise feed forward. + num_blocks (int): The number of decoder blocks. + dropout_rate (float): Dropout rate. + positional_dropout_rate (float): Dropout rate after adding positional encoding. + attention_dropout_rate (float): Dropout rate in attention. + input_layer (Union[str, torch.nn.Module]): Input layer type. + normalize_before (bool): Whether to use layer_norm before the first block. + concat_after (bool): Whether to concat attention layer's input and output. + if True, additional linear will be applied. + i.e. x -> x + linear(concat(x, att(x))) + if False, no additional linear will be applied. i.e. x -> x + att(x) + positionwise_layer_type (str): "linear", "conv1d", or "conv1d-linear". + positionwise_conv_kernel_size (int): Kernel size of positionwise conv1d layer. + macaron_style (bool): Whether to use macaron style for positionwise layer. + pos_enc_layer_type (str): Conformer positional encoding layer type. + selfattention_layer_type (str): Conformer attention layer type. + activation_type (str): Conformer activation function type. + use_cnn_module (bool): Whether to use convolution module. + cnn_module_kernel (int): Kernerl size of convolution module. + padding_idx (int): Padding idx for input_layer=embed. 
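+
+    Example (rough usage sketch, not executed; note that ``input_layer``
+    must be passed as a module or ``None``, since string values raise a
+    ValueError in the constructor below):
+
+        embed = torch.nn.Linear(66, 384)
+        encoder = Conformer(idim=66, attention_dim=384, attention_heads=4,
+                            input_layer=embed, macaron_style=True,
+                            use_cnn_module=True, cnn_module_kernel=7)
+        xs = torch.randn(2, 50, 66)                     # (batch, time, idim)
+        masks = torch.ones(2, 1, 50, dtype=torch.bool)  # non-padded positions
+        encoded, out_masks = encoder(xs, masks)         # encoded: (2, 50, 384)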
+ + """ + + def __init__(self, idim, attention_dim=256, attention_heads=4, linear_units=2048, num_blocks=6, dropout_rate=0.1, positional_dropout_rate=0.1, + attention_dropout_rate=0.0, input_layer="conv2d", normalize_before=True, concat_after=False, positionwise_conv_kernel_size=1, + macaron_style=False, use_cnn_module=False, cnn_module_kernel=31, zero_triu=False, utt_embed=None, connect_utt_emb_at_encoder_out=True, + spk_emb_bottleneck_size=128, lang_embs=None): + super(Conformer, self).__init__() + + activation = Swish() + self.conv_subsampling_factor = 1 + + if isinstance(input_layer, torch.nn.Module): + self.embed = input_layer + self.pos_enc = RelPositionalEncoding(attention_dim, positional_dropout_rate) + elif input_layer is None: + self.embed = None + self.pos_enc = torch.nn.Sequential(RelPositionalEncoding(attention_dim, positional_dropout_rate)) + else: + raise ValueError("unknown input_layer: " + input_layer) + + self.normalize_before = normalize_before + + self.connect_utt_emb_at_encoder_out = connect_utt_emb_at_encoder_out + if utt_embed is not None: + self.hs_emb_projection = torch.nn.Linear(attention_dim + spk_emb_bottleneck_size, attention_dim) + # embedding projection derived from https://arxiv.org/pdf/1705.08947.pdf + self.embedding_projection = torch.nn.Sequential(torch.nn.Linear(utt_embed, spk_emb_bottleneck_size), + torch.nn.Softsign()) + if lang_embs is not None: + self.language_embedding = torch.nn.Embedding(num_embeddings=lang_embs, embedding_dim=attention_dim) + + # self-attention module definition + encoder_selfattn_layer = RelPositionMultiHeadedAttention + encoder_selfattn_layer_args = (attention_heads, attention_dim, attention_dropout_rate, zero_triu) + + # feed-forward module definition + positionwise_layer = MultiLayeredConv1d + positionwise_layer_args = (attention_dim, linear_units, positionwise_conv_kernel_size, dropout_rate,) + + # convolution module definition + convolution_layer = ConvolutionModule + convolution_layer_args = (attention_dim, cnn_module_kernel, activation) + + self.encoders = repeat(num_blocks, lambda lnum: EncoderLayer(attention_dim, encoder_selfattn_layer(*encoder_selfattn_layer_args), + positionwise_layer(*positionwise_layer_args), + positionwise_layer(*positionwise_layer_args) if macaron_style else None, + convolution_layer(*convolution_layer_args) if use_cnn_module else None, dropout_rate, + normalize_before, concat_after)) + if self.normalize_before: + self.after_norm = LayerNorm(attention_dim) + + def forward(self, xs, masks, utterance_embedding=None, lang_ids=None): + """ + Encode input sequence. + + Args: + utterance_embedding: embedding containing lots of conditioning signals + step: indicator for when to start updating the embedding function + xs (torch.Tensor): Input tensor (#batch, time, idim). + masks (torch.Tensor): Mask tensor (#batch, time). + + Returns: + torch.Tensor: Output tensor (#batch, time, attention_dim). + torch.Tensor: Mask tensor (#batch, time). 
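+
+        Example of building the ``masks`` argument from sequence lengths for an
+        already constructed ``encoder`` (sketch only; relies on the
+        ``make_non_pad_mask`` utility from ``Utility.utils``, mirroring how
+        ``FastSpeech2._source_mask`` constructs it):
+
+            text_lens = torch.tensor([50, 42])
+            masks = make_non_pad_mask(text_lens).unsqueeze(-2)  # (batch, 1, time)
+            encoded_texts, _ = encoder(xs, masks)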
+ + """ + + if self.embed is not None: + xs = self.embed(xs) + + if lang_ids is not None: + lang_embs = self.language_embedding(lang_ids) + xs = xs + lang_embs # offset the phoneme distribution of a language + + if utterance_embedding is not None and not self.connect_utt_emb_at_encoder_out: + xs = self._integrate_with_utt_embed(xs, utterance_embedding) + + xs = self.pos_enc(xs) + + xs, masks = self.encoders(xs, masks) + if isinstance(xs, tuple): + xs = xs[0] + + if self.normalize_before: + xs = self.after_norm(xs) + + if utterance_embedding is not None and self.connect_utt_emb_at_encoder_out: + xs = self._integrate_with_utt_embed(xs, utterance_embedding) + + return xs, masks + + def _integrate_with_utt_embed(self, hs, utt_embeddings): + # project embedding into smaller space + speaker_embeddings_projected = self.embedding_projection(utt_embeddings) + # concat hidden states with spk embeds and then apply projection + speaker_embeddings_expanded = F.normalize(speaker_embeddings_projected).unsqueeze(1).expand(-1, hs.size(1), -1) + hs = self.hs_emb_projection(torch.cat([hs, speaker_embeddings_expanded], dim=-1)) + return hs diff --git a/Layers/Convolution.py b/Layers/Convolution.py new file mode 100644 index 0000000000000000000000000000000000000000..e6e56e85d5908b0db5fceaea1e701d197a824d4b --- /dev/null +++ b/Layers/Convolution.py @@ -0,0 +1,55 @@ +# Copyright 2020 Johns Hopkins University (Shinji Watanabe) +# Northwestern Polytechnical University (Pengcheng Guo) +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +# Adapted by Florian Lux 2021 + + +from torch import nn + + +class ConvolutionModule(nn.Module): + """ + ConvolutionModule in Conformer model. + + Args: + channels (int): The number of channels of conv layers. + kernel_size (int): Kernel size of conv layers. + + """ + + def __init__(self, channels, kernel_size, activation=nn.ReLU(), bias=True): + super(ConvolutionModule, self).__init__() + # kernel_size should be an odd number for 'SAME' padding + assert (kernel_size - 1) % 2 == 0 + + self.pointwise_conv1 = nn.Conv1d(channels, 2 * channels, kernel_size=1, stride=1, padding=0, bias=bias, ) + self.depthwise_conv = nn.Conv1d(channels, channels, kernel_size, stride=1, padding=(kernel_size - 1) // 2, groups=channels, bias=bias, ) + self.norm = nn.GroupNorm(num_groups=32, num_channels=channels) + self.pointwise_conv2 = nn.Conv1d(channels, channels, kernel_size=1, stride=1, padding=0, bias=bias, ) + self.activation = activation + + def forward(self, x): + """ + Compute convolution module. + + Args: + x (torch.Tensor): Input tensor (#batch, time, channels). + + Returns: + torch.Tensor: Output tensor (#batch, time, channels). 
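+
+        Example (illustrative sketch, not executed; ``channels`` must be
+        divisible by the 32 groups of the GroupNorm defined above):
+
+            conv = ConvolutionModule(channels=256, kernel_size=31)
+            x = torch.randn(2, 50, 256)  # (batch, time, channels)
+            y = conv(x)                  # (batch, time, channels)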
+ + """ + # exchange the temporal dimension and the feature dimension + x = x.transpose(1, 2) + + # GLU mechanism + x = self.pointwise_conv1(x) # (batch, 2*channel, dim) + x = nn.functional.glu(x, dim=1) # (batch, channel, dim) + + # 1D Depthwise Conv + x = self.depthwise_conv(x) + x = self.activation(self.norm(x)) + + x = self.pointwise_conv2(x) + + return x.transpose(1, 2) diff --git a/Layers/DurationPredictor.py b/Layers/DurationPredictor.py new file mode 100644 index 0000000000000000000000000000000000000000..4ccfe1c4584a1de8f9f7b65fc7997885539976b1 --- /dev/null +++ b/Layers/DurationPredictor.py @@ -0,0 +1,139 @@ +# Copyright 2019 Tomoki Hayashi +# MIT License (https://opensource.org/licenses/MIT) +# Adapted by Florian Lux 2021 + + +import torch + +from Layers.LayerNorm import LayerNorm + + +class DurationPredictor(torch.nn.Module): + """ + Duration predictor module. + + This is a module of duration predictor described + in `FastSpeech: Fast, Robust and Controllable Text to Speech`_. + The duration predictor predicts a duration of each frame in log domain + from the hidden embeddings of encoder. + + .. _`FastSpeech: Fast, Robust and Controllable Text to Speech`: + https://arxiv.org/pdf/1905.09263.pdf + + Note: + The calculation domain of outputs is different + between in `forward` and in `inference`. In `forward`, + the outputs are calculated in log domain but in `inference`, + those are calculated in linear domain. + + """ + + def __init__(self, idim, n_layers=2, n_chans=384, kernel_size=3, dropout_rate=0.1, offset=1.0): + """ + Initialize duration predictor module. + + Args: + idim (int): Input dimension. + n_layers (int, optional): Number of convolutional layers. + n_chans (int, optional): Number of channels of convolutional layers. + kernel_size (int, optional): Kernel size of convolutional layers. + dropout_rate (float, optional): Dropout rate. + offset (float, optional): Offset value to avoid nan in log domain. + + """ + super(DurationPredictor, self).__init__() + self.offset = offset + self.conv = torch.nn.ModuleList() + for idx in range(n_layers): + in_chans = idim if idx == 0 else n_chans + self.conv += [torch.nn.Sequential(torch.nn.Conv1d(in_chans, n_chans, kernel_size, stride=1, padding=(kernel_size - 1) // 2, ), torch.nn.ReLU(), + LayerNorm(n_chans, dim=1), torch.nn.Dropout(dropout_rate), )] + self.linear = torch.nn.Linear(n_chans, 1) + + def _forward(self, xs, x_masks=None, is_inference=False): + xs = xs.transpose(1, -1) # (B, idim, Tmax) + for f in self.conv: + xs = f(xs) # (B, C, Tmax) + + # NOTE: calculate in log domain + xs = self.linear(xs.transpose(1, -1)).squeeze(-1) # (B, Tmax) + + if is_inference: + # NOTE: calculate in linear domain + xs = torch.clamp(torch.round(xs.exp() - self.offset), min=0).long() # avoid negative value + + if x_masks is not None: + xs = xs.masked_fill(x_masks, 0.0) + + return xs + + def forward(self, xs, x_masks=None): + """ + Calculate forward propagation. + + Args: + xs (Tensor): Batch of input sequences (B, Tmax, idim). + x_masks (ByteTensor, optional): + Batch of masks indicating padded part (B, Tmax). + + Returns: + Tensor: Batch of predicted durations in log domain (B, Tmax). + + """ + return self._forward(xs, x_masks, False) + + def inference(self, xs, x_masks=None): + """ + Inference duration. + + Args: + xs (Tensor): Batch of input sequences (B, Tmax, idim). + x_masks (ByteTensor, optional): + Batch of masks indicating padded part (B, Tmax). + + Returns: + LongTensor: Batch of predicted durations in linear domain (B, Tmax). 
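+
+        Example (illustrative sketch, not executed; the predicted values
+        depend entirely on the trained weights):
+
+            predictor = DurationPredictor(idim=384)
+            xs = torch.randn(2, 50, 384)         # encoder states (B, Tmax, idim)
+            durations = predictor.inference(xs)  # LongTensor (B, Tmax), values >= 0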
+ + """ + return self._forward(xs, x_masks, True) + + +class DurationPredictorLoss(torch.nn.Module): + """ + Loss function module for duration predictor. + + The loss value is Calculated in log domain to make it Gaussian. + + """ + + def __init__(self, offset=1.0, reduction="mean"): + """ + Args: + offset (float, optional): Offset value to avoid nan in log domain. + reduction (str): Reduction type in loss calculation. + + """ + super(DurationPredictorLoss, self).__init__() + self.criterion = torch.nn.MSELoss(reduction=reduction) + self.offset = offset + + def forward(self, outputs, targets): + """ + Calculate forward propagation. + + Args: + outputs (Tensor): Batch of prediction durations in log domain (B, T) + targets (LongTensor): Batch of groundtruth durations in linear domain (B, T) + + Returns: + Tensor: Mean squared error loss value. + + Note: + `outputs` is in log domain but `targets` is in linear domain. + + """ + # NOTE: outputs is in log domain while targets in linear + targets = torch.log(targets.float() + self.offset) + loss = self.criterion(outputs, targets) + + return loss diff --git a/Layers/EncoderLayer.py b/Layers/EncoderLayer.py new file mode 100644 index 0000000000000000000000000000000000000000..6ae91c25d7a88dfff0603c263b63b8bb0f05c80a --- /dev/null +++ b/Layers/EncoderLayer.py @@ -0,0 +1,144 @@ +# Copyright 2020 Johns Hopkins University (Shinji Watanabe) +# Northwestern Polytechnical University (Pengcheng Guo) +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +# Adapted by Florian Lux 2021 + + +import torch +from torch import nn + +from Layers.LayerNorm import LayerNorm + + +class EncoderLayer(nn.Module): + """ + Encoder layer module. + + Args: + size (int): Input dimension. + self_attn (torch.nn.Module): Self-attention module instance. + `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` instance + can be used as the argument. + feed_forward (torch.nn.Module): Feed-forward module instance. + `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance + can be used as the argument. + feed_forward_macaron (torch.nn.Module): Additional feed-forward module instance. + `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance + can be used as the argument. + conv_module (torch.nn.Module): Convolution module instance. + `ConvlutionModule` instance can be used as the argument. + dropout_rate (float): Dropout rate. + normalize_before (bool): Whether to use layer_norm before the first block. + concat_after (bool): Whether to concat attention layer's input and output. + if True, additional linear will be applied. + i.e. x -> x + linear(concat(x, att(x))) + if False, no additional linear will be applied. i.e. 
x -> x + att(x) + + """ + + def __init__(self, size, self_attn, feed_forward, feed_forward_macaron, conv_module, dropout_rate, normalize_before=True, concat_after=False, ): + super(EncoderLayer, self).__init__() + self.self_attn = self_attn + self.feed_forward = feed_forward + self.feed_forward_macaron = feed_forward_macaron + self.conv_module = conv_module + self.norm_ff = LayerNorm(size) # for the FNN module + self.norm_mha = LayerNorm(size) # for the MHA module + if feed_forward_macaron is not None: + self.norm_ff_macaron = LayerNorm(size) + self.ff_scale = 0.5 + else: + self.ff_scale = 1.0 + if self.conv_module is not None: + self.norm_conv = LayerNorm(size) # for the CNN module + self.norm_final = LayerNorm(size) # for the final output of the block + self.dropout = nn.Dropout(dropout_rate) + self.size = size + self.normalize_before = normalize_before + self.concat_after = concat_after + if self.concat_after: + self.concat_linear = nn.Linear(size + size, size) + + def forward(self, x_input, mask, cache=None): + """ + Compute encoded features. + + Args: + x_input (Union[Tuple, torch.Tensor]): Input tensor w/ or w/o pos emb. + - w/ pos emb: Tuple of tensors [(#batch, time, size), (1, time, size)]. + - w/o pos emb: Tensor (#batch, time, size). + mask (torch.Tensor): Mask tensor for the input (#batch, time). + cache (torch.Tensor): Cache tensor of the input (#batch, time - 1, size). + + Returns: + torch.Tensor: Output tensor (#batch, time, size). + torch.Tensor: Mask tensor (#batch, time). + + """ + if isinstance(x_input, tuple): + x, pos_emb = x_input[0], x_input[1] + else: + x, pos_emb = x_input, None + + # whether to use macaron style + if self.feed_forward_macaron is not None: + residual = x + if self.normalize_before: + x = self.norm_ff_macaron(x) + x = residual + self.ff_scale * self.dropout(self.feed_forward_macaron(x)) + if not self.normalize_before: + x = self.norm_ff_macaron(x) + + # multi-headed self-attention module + residual = x + if self.normalize_before: + x = self.norm_mha(x) + + if cache is None: + x_q = x + else: + assert cache.shape == (x.shape[0], x.shape[1] - 1, self.size) + x_q = x[:, -1:, :] + residual = residual[:, -1:, :] + mask = None if mask is None else mask[:, -1:, :] + + if pos_emb is not None: + x_att = self.self_attn(x_q, x, x, pos_emb, mask) + else: + x_att = self.self_attn(x_q, x, x, mask) + + if self.concat_after: + x_concat = torch.cat((x, x_att), dim=-1) + x = residual + self.concat_linear(x_concat) + else: + x = residual + self.dropout(x_att) + if not self.normalize_before: + x = self.norm_mha(x) + + # convolution module + if self.conv_module is not None: + residual = x + if self.normalize_before: + x = self.norm_conv(x) + x = residual + self.dropout(self.conv_module(x)) + if not self.normalize_before: + x = self.norm_conv(x) + + # feed forward module + residual = x + if self.normalize_before: + x = self.norm_ff(x) + x = residual + self.ff_scale * self.dropout(self.feed_forward(x)) + if not self.normalize_before: + x = self.norm_ff(x) + + if self.conv_module is not None: + x = self.norm_final(x) + + if cache is not None: + x = torch.cat([cache, x], dim=1) + + if pos_emb is not None: + return (x, pos_emb), mask + + return x, mask diff --git a/Layers/LayerNorm.py b/Layers/LayerNorm.py new file mode 100644 index 0000000000000000000000000000000000000000..c4cb4c15df0ccc0195bc18e124f4b50fb6bcee80 --- /dev/null +++ b/Layers/LayerNorm.py @@ -0,0 +1,36 @@ +# Written by Shigeki Karita, 2019 +# Published under Apache 2.0 
(http://www.apache.org/licenses/LICENSE-2.0) +# Adapted by Florian Lux, 2021 + +import torch + + +class LayerNorm(torch.nn.LayerNorm): + """ + Layer normalization module. + + Args: + nout (int): Output dim size. + dim (int): Dimension to be normalized. + """ + + def __init__(self, nout, dim=-1): + """ + Construct an LayerNorm object. + """ + super(LayerNorm, self).__init__(nout, eps=1e-12) + self.dim = dim + + def forward(self, x): + """ + Apply layer normalization. + + Args: + x (torch.Tensor): Input tensor. + + Returns: + torch.Tensor: Normalized tensor. + """ + if self.dim == -1: + return super(LayerNorm, self).forward(x) + return super(LayerNorm, self).forward(x.transpose(1, -1)).transpose(1, -1) diff --git a/Layers/LengthRegulator.py b/Layers/LengthRegulator.py new file mode 100644 index 0000000000000000000000000000000000000000..e375cf18524e4695da5d0909b65a56a178696d40 --- /dev/null +++ b/Layers/LengthRegulator.py @@ -0,0 +1,62 @@ +# Copyright 2019 Tomoki Hayashi +# MIT License (https://opensource.org/licenses/MIT) +# Adapted by Florian Lux 2021 + +from abc import ABC + +import torch + +from Utility.utils import pad_list + + +class LengthRegulator(torch.nn.Module, ABC): + """ + Length regulator module for feed-forward Transformer. + + This is a module of length regulator described in + `FastSpeech: Fast, Robust and Controllable Text to Speech`_. + The length regulator expands char or + phoneme-level embedding features to frame-level by repeating each + feature based on the corresponding predicted durations. + + .. _`FastSpeech: Fast, Robust and Controllable Text to Speech`: + https://arxiv.org/pdf/1905.09263.pdf + + """ + + def __init__(self, pad_value=0.0): + """ + Initialize length regulator module. + + Args: + pad_value (float, optional): Value used for padding. + """ + super(LengthRegulator, self).__init__() + self.pad_value = pad_value + + def forward(self, xs, ds, alpha=1.0): + """ + Calculate forward propagation. + + Args: + xs (Tensor): Batch of sequences of char or phoneme embeddings (B, Tmax, D). + ds (LongTensor): Batch of durations of each frame (B, T). + alpha (float, optional): Alpha value to control speed of speech. + + Returns: + Tensor: replicated input tensor based on durations (B, T*, D). + """ + if alpha != 1.0: + assert alpha > 0 + ds = torch.round(ds.float() * alpha).long() + + if ds.sum() == 0: + ds[ds.sum(dim=1).eq(0)] = 1 + + return pad_list([self._repeat_one_sequence(x, d) for x, d in zip(xs, ds)], self.pad_value) + + def _repeat_one_sequence(self, x, d): + """ + Repeat each frame according to duration + """ + return torch.repeat_interleave(x, d, dim=0) diff --git a/Layers/MultiLayeredConv1d.py b/Layers/MultiLayeredConv1d.py new file mode 100644 index 0000000000000000000000000000000000000000..f2de4a06a06d891fbaca726959b0f0d34d93d7cc --- /dev/null +++ b/Layers/MultiLayeredConv1d.py @@ -0,0 +1,87 @@ +# Copyright 2019 Tomoki Hayashi +# MIT License (https://opensource.org/licenses/MIT) +# Adapted by Florian Lux 2021 + +""" +Layer modules for FFT block in FastSpeech (Feed-forward Transformer). +""" + +import torch + + +class MultiLayeredConv1d(torch.nn.Module): + """ + Multi-layered conv1d for Transformer block. + + This is a module of multi-layered conv1d designed + to replace positionwise feed-forward network + in Transformer block, which is introduced in + `FastSpeech: Fast, Robust and Controllable Text to Speech`_. + + .. 
_`FastSpeech: Fast, Robust and Controllable Text to Speech`: + https://arxiv.org/pdf/1905.09263.pdf + """ + + def __init__(self, in_chans, hidden_chans, kernel_size, dropout_rate): + """ + Initialize MultiLayeredConv1d module. + + Args: + in_chans (int): Number of input channels. + hidden_chans (int): Number of hidden channels. + kernel_size (int): Kernel size of conv1d. + dropout_rate (float): Dropout rate. + """ + super(MultiLayeredConv1d, self).__init__() + self.w_1 = torch.nn.Conv1d(in_chans, hidden_chans, kernel_size, stride=1, padding=(kernel_size - 1) // 2, ) + self.w_2 = torch.nn.Conv1d(hidden_chans, in_chans, kernel_size, stride=1, padding=(kernel_size - 1) // 2, ) + self.dropout = torch.nn.Dropout(dropout_rate) + + def forward(self, x): + """ + Calculate forward propagation. + + Args: + x (torch.Tensor): Batch of input tensors (B, T, in_chans). + + Returns: + torch.Tensor: Batch of output tensors (B, T, hidden_chans). + """ + x = torch.relu(self.w_1(x.transpose(-1, 1))).transpose(-1, 1) + return self.w_2(self.dropout(x).transpose(-1, 1)).transpose(-1, 1) + + +class Conv1dLinear(torch.nn.Module): + """ + Conv1D + Linear for Transformer block. + + A variant of MultiLayeredConv1d, which replaces second conv-layer to linear. + """ + + def __init__(self, in_chans, hidden_chans, kernel_size, dropout_rate): + """ + Initialize Conv1dLinear module. + + Args: + in_chans (int): Number of input channels. + hidden_chans (int): Number of hidden channels. + kernel_size (int): Kernel size of conv1d. + dropout_rate (float): Dropout rate. + """ + super(Conv1dLinear, self).__init__() + self.w_1 = torch.nn.Conv1d(in_chans, hidden_chans, kernel_size, stride=1, padding=(kernel_size - 1) // 2, ) + self.w_2 = torch.nn.Linear(hidden_chans, in_chans) + self.dropout = torch.nn.Dropout(dropout_rate) + + def forward(self, x): + """ + Calculate forward propagation. + + Args: + x (torch.Tensor): Batch of input tensors (B, T, in_chans). + + Returns: + torch.Tensor: Batch of output tensors (B, T, hidden_chans). + """ + x = torch.relu(self.w_1(x.transpose(-1, 1))).transpose(-1, 1) + return self.w_2(self.dropout(x)) diff --git a/Layers/MultiSequential.py b/Layers/MultiSequential.py new file mode 100644 index 0000000000000000000000000000000000000000..bccf8cd18bf94a42fcc1ef94f3fb23e86a114394 --- /dev/null +++ b/Layers/MultiSequential.py @@ -0,0 +1,33 @@ +# Written by Shigeki Karita, 2019 +# Published under Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +# Adapted by Florian Lux, 2021 + +import torch + + +class MultiSequential(torch.nn.Sequential): + """ + Multi-input multi-output torch.nn.Sequential. + """ + + def forward(self, *args): + """ + Repeat. + """ + for m in self: + args = m(*args) + return args + + +def repeat(N, fn): + """ + Repeat module N times. + + Args: + N (int): Number of repeat time. + fn (Callable): Function to generate module. + + Returns: + MultiSequential: Repeated model instance. + """ + return MultiSequential(*[fn(n) for n in range(N)]) diff --git a/Layers/PositionalEncoding.py b/Layers/PositionalEncoding.py new file mode 100644 index 0000000000000000000000000000000000000000..8929a7fa6298f00e97fba1630524da014b738ace --- /dev/null +++ b/Layers/PositionalEncoding.py @@ -0,0 +1,166 @@ +""" +Taken from ESPNet +""" + +import math + +import torch + + +class PositionalEncoding(torch.nn.Module): + """ + Positional encoding. + + Args: + d_model (int): Embedding dimension. + dropout_rate (float): Dropout rate. + max_len (int): Maximum input length. 
+ reverse (bool): Whether to reverse the input position. + """ + + def __init__(self, d_model, dropout_rate, max_len=5000, reverse=False): + """ + Construct an PositionalEncoding object. + """ + super(PositionalEncoding, self).__init__() + self.d_model = d_model + self.reverse = reverse + self.xscale = math.sqrt(self.d_model) + self.dropout = torch.nn.Dropout(p=dropout_rate) + self.pe = None + self.extend_pe(torch.tensor(0.0, device=d_model.device).expand(1, max_len)) + + def extend_pe(self, x): + """ + Reset the positional encodings. + """ + if self.pe is not None: + if self.pe.size(1) >= x.size(1): + if self.pe.dtype != x.dtype or self.pe.device != x.device: + self.pe = self.pe.to(dtype=x.dtype, device=x.device) + return + pe = torch.zeros(x.size(1), self.d_model) + if self.reverse: + position = torch.arange(x.size(1) - 1, -1, -1.0, dtype=torch.float32).unsqueeze(1) + else: + position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1) + div_term = torch.exp(torch.arange(0, self.d_model, 2, dtype=torch.float32) * -(math.log(10000.0) / self.d_model)) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + pe = pe.unsqueeze(0) + self.pe = pe.to(device=x.device, dtype=x.dtype) + + def forward(self, x): + """ + Add positional encoding. + + Args: + x (torch.Tensor): Input tensor (batch, time, `*`). + + Returns: + torch.Tensor: Encoded tensor (batch, time, `*`). + """ + self.extend_pe(x) + x = x * self.xscale + self.pe[:, : x.size(1)] + return self.dropout(x) + + +class RelPositionalEncoding(torch.nn.Module): + """ + Relative positional encoding module (new implementation). + Details can be found in https://github.com/espnet/espnet/pull/2816. + See : Appendix B in https://arxiv.org/abs/1901.02860 + Args: + d_model (int): Embedding dimension. + dropout_rate (float): Dropout rate. + max_len (int): Maximum input length. + """ + + def __init__(self, d_model, dropout_rate, max_len=5000): + """ + Construct an PositionalEncoding object. + """ + super(RelPositionalEncoding, self).__init__() + self.d_model = d_model + self.xscale = math.sqrt(self.d_model) + self.dropout = torch.nn.Dropout(p=dropout_rate) + self.pe = None + self.extend_pe(torch.tensor(0.0).expand(1, max_len)) + + def extend_pe(self, x): + """Reset the positional encodings.""" + if self.pe is not None: + # self.pe contains both positive and negative parts + # the length of self.pe is 2 * input_len - 1 + if self.pe.size(1) >= x.size(1) * 2 - 1: + if self.pe.dtype != x.dtype or self.pe.device != x.device: + self.pe = self.pe.to(dtype=x.dtype, device=x.device) + return + # Suppose `i` means to the position of query vecotr and `j` means the + # position of key vector. We use position relative positions when keys + # are to the left (i>j) and negative relative positions otherwise (i