""" Taken from ESPNet """ import torch import torch.nn.functional as F from Layers.Attention import RelPositionMultiHeadedAttention from Layers.Convolution import ConvolutionModule from Layers.EncoderLayer import EncoderLayer from Layers.LayerNorm import LayerNorm from Layers.MultiLayeredConv1d import MultiLayeredConv1d from Layers.MultiSequential import repeat from Layers.PositionalEncoding import RelPositionalEncoding from Layers.Swish import Swish class Conformer(torch.nn.Module): """ Conformer encoder module. Args: idim (int): Input dimension. attention_dim (int): Dimension of attention. attention_heads (int): The number of heads of multi head attention. linear_units (int): The number of units of position-wise feed forward. num_blocks (int): The number of decoder blocks. dropout_rate (float): Dropout rate. positional_dropout_rate (float): Dropout rate after adding positional encoding. attention_dropout_rate (float): Dropout rate in attention. input_layer (Union[str, torch.nn.Module]): Input layer type. normalize_before (bool): Whether to use layer_norm before the first block. concat_after (bool): Whether to concat attention layer's input and output. if True, additional linear will be applied. i.e. x -> x + linear(concat(x, att(x))) if False, no additional linear will be applied. i.e. x -> x + att(x) positionwise_layer_type (str): "linear", "conv1d", or "conv1d-linear". positionwise_conv_kernel_size (int): Kernel size of positionwise conv1d layer. macaron_style (bool): Whether to use macaron style for positionwise layer. pos_enc_layer_type (str): Conformer positional encoding layer type. selfattention_layer_type (str): Conformer attention layer type. activation_type (str): Conformer activation function type. use_cnn_module (bool): Whether to use convolution module. cnn_module_kernel (int): Kernerl size of convolution module. padding_idx (int): Padding idx for input_layer=embed. """ def __init__(self, idim, attention_dim=256, attention_heads=4, linear_units=2048, num_blocks=6, dropout_rate=0.1, positional_dropout_rate=0.1, attention_dropout_rate=0.0, input_layer="conv2d", normalize_before=True, concat_after=False, positionwise_conv_kernel_size=1, macaron_style=False, use_cnn_module=False, cnn_module_kernel=31, zero_triu=False, utt_embed=None, connect_utt_emb_at_encoder_out=True, spk_emb_bottleneck_size=128, lang_embs=None): super(Conformer, self).__init__() activation = Swish() self.conv_subsampling_factor = 1 if isinstance(input_layer, torch.nn.Module): self.embed = input_layer self.pos_enc = RelPositionalEncoding(attention_dim, positional_dropout_rate) elif input_layer is None: self.embed = None self.pos_enc = torch.nn.Sequential(RelPositionalEncoding(attention_dim, positional_dropout_rate)) else: raise ValueError("unknown input_layer: " + input_layer) self.normalize_before = normalize_before self.connect_utt_emb_at_encoder_out = connect_utt_emb_at_encoder_out if utt_embed is not None: self.hs_emb_projection = torch.nn.Linear(attention_dim + spk_emb_bottleneck_size, attention_dim) # embedding projection derived from https://arxiv.org/pdf/1705.08947.pdf self.embedding_projection = torch.nn.Sequential(torch.nn.Linear(utt_embed, spk_emb_bottleneck_size), torch.nn.Softsign()) if lang_embs is not None: self.language_embedding = torch.nn.Embedding(num_embeddings=lang_embs, embedding_dim=attention_dim) # self-attention module definition encoder_selfattn_layer = RelPositionMultiHeadedAttention encoder_selfattn_layer_args = (attention_heads, attention_dim, attention_dropout_rate, zero_triu) # feed-forward module definition positionwise_layer = MultiLayeredConv1d positionwise_layer_args = (attention_dim, linear_units, positionwise_conv_kernel_size, dropout_rate,) # convolution module definition convolution_layer = ConvolutionModule convolution_layer_args = (attention_dim, cnn_module_kernel, activation) self.encoders = repeat(num_blocks, lambda lnum: EncoderLayer(attention_dim, encoder_selfattn_layer(*encoder_selfattn_layer_args), positionwise_layer(*positionwise_layer_args), positionwise_layer(*positionwise_layer_args) if macaron_style else None, convolution_layer(*convolution_layer_args) if use_cnn_module else None, dropout_rate, normalize_before, concat_after)) if self.normalize_before: self.after_norm = LayerNorm(attention_dim) def forward(self, xs, masks, utterance_embedding=None, lang_ids=None): """ Encode input sequence. Args: utterance_embedding: embedding containing lots of conditioning signals step: indicator for when to start updating the embedding function xs (torch.Tensor): Input tensor (#batch, time, idim). masks (torch.Tensor): Mask tensor (#batch, time). Returns: torch.Tensor: Output tensor (#batch, time, attention_dim). torch.Tensor: Mask tensor (#batch, time). """ if self.embed is not None: xs = self.embed(xs) if lang_ids is not None: lang_embs = self.language_embedding(lang_ids) xs = xs + lang_embs # offset the phoneme distribution of a language if utterance_embedding is not None and not self.connect_utt_emb_at_encoder_out: xs = self._integrate_with_utt_embed(xs, utterance_embedding) xs = self.pos_enc(xs) xs, masks = self.encoders(xs, masks) if isinstance(xs, tuple): xs = xs[0] if self.normalize_before: xs = self.after_norm(xs) if utterance_embedding is not None and self.connect_utt_emb_at_encoder_out: xs = self._integrate_with_utt_embed(xs, utterance_embedding) return xs, masks def _integrate_with_utt_embed(self, hs, utt_embeddings): # project embedding into smaller space speaker_embeddings_projected = self.embedding_projection(utt_embeddings) # concat hidden states with spk embeds and then apply projection speaker_embeddings_expanded = F.normalize(speaker_embeddings_projected).unsqueeze(1).expand(-1, hs.size(1), -1) hs = self.hs_emb_projection(torch.cat([hs, speaker_embeddings_expanded], dim=-1)) return hs