"""Custom modules used by the TTS model: Prenet, Highway, and CBHG."""

from typing import List

import torch
from torch import nn

from poetry_diacritizer.modules.layers import BatchNormConv1d

class Prenet(nn.Module):
    """A prenet is a stack of linear layers, each followed by
    a ReLU activation and dropout (p=0.5 by default).

    Args:
        in_dim (int): the input dimension
        prenet_depth (List[int]): the output size of each linear layer
        dropout (float): the dropout probability applied after each layer
    """

    def __init__(
        self, in_dim: int, prenet_depth: List[int] = [256, 128], dropout: float = 0.5
    ):
        """Initialize the prenet module"""
        super().__init__()
        in_sizes = [in_dim] + prenet_depth[:-1]
        self.layers = nn.ModuleList(
            [
                nn.Linear(in_size, out_size)
                for (in_size, out_size) in zip(in_sizes, prenet_depth)
            ]
        )
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)

    def forward(self, inputs: torch.Tensor):
        """Calculate forward propagation.

        Args:
            inputs (Tensor): the inputs to the prenet; the input shape can differ
                because the prenet is used in both the encoder and the decoder,
                but the last dimension must equal in_dim.

        Returns:
            Tensor: the transformed inputs; the last dimension equals prenet_depth[-1]
        """
        for linear in self.layers:
            inputs = self.dropout(self.relu(linear(inputs)))
        return inputs
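
    # Illustrative usage (a minimal sketch; the dimensions below are assumptions,
    # not values taken from the model's configuration):
    #
    #     prenet = Prenet(in_dim=512, prenet_depth=[256, 128], dropout=0.5)
    #     x = torch.zeros(8, 100, 512)   # (batch_size, seq_len, in_dim)
    #     y = prenet(x)                  # -> (8, 100, 128)
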
class Highway(nn.Module):
    """Highway networks were developed by Srivastava et al. (2015)
    to overcome the difficulty of training deep neural networks
    (https://arxiv.org/abs/1507.06228).

    Args:
        in_size (int): the input size
        out_size (int): the output size
    """

    def __init__(self, in_size, out_size):
        """Initialize the highway layer"""
        super().__init__()
        self.H = nn.Linear(in_size, out_size)
        self.H.bias.data.zero_()
        self.T = nn.Linear(in_size, out_size)
        # A negative bias keeps the transform gate small early in training,
        # so the layer initially passes its input through almost unchanged
        self.T.bias.data.fill_(-1)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, inputs: torch.Tensor):
        """Calculate forward propagation.

        Args:
            inputs (Tensor): the input tensor; its last dimension must equal in_size

        Returns:
            Tensor: H(x) * T(x) + x * (1 - T(x)), where T is the transform gate
        """
        H = self.relu(self.H(inputs))
        T = self.sigmoid(self.T(inputs))
        return H * T + inputs * (1.0 - T)
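
    # Illustrative usage (a minimal sketch; the sizes below are assumptions):
    #
    #     highway = Highway(in_size=256, out_size=256)
    #     y = highway(torch.zeros(8, 100, 256))   # same shape as the input,
    #                                             # y = H(x) * T(x) + x * (1 - T(x))
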
class CBHG(nn.Module):
    """The CBHG module (1-D Convolution Bank + Highway network + Bidirectional GRU)
    was proposed by Lee et al. (2017, https://www.aclweb.org/anthology/Q17-1026)
    for a character-level NMT model.
    It was adapted by Wang et al. (2017) for building Tacotron.
    It is used in both the encoder and the decoder with different parameters.
    """

    def __init__(
        self,
        in_dim: int,
        out_dim: int,
        K: int,
        projections: List[int],
        highway_layers: int = 4,
    ):
        """Initialize the CBHG module.

        Args:
            in_dim (int): the input size
            out_dim (int): the output size of each GRU direction
            K (int): the number of 1-D convolution banks (kernel sizes 1 to K)
            projections (List[int]): the output channels of the projection convolutions
            highway_layers (int): the number of highway layers
        """
        super().__init__()

        self.in_dim = in_dim
        self.out_dim = out_dim
        self.relu = nn.ReLU()
        # A bank of K 1-D convolutions with kernel sizes 1, 2, ..., K
        self.conv1d_banks = nn.ModuleList(
            [
                BatchNormConv1d(
                    in_dim,
                    in_dim,
                    kernel_size=k,
                    stride=1,
                    padding=k // 2,
                    activation=self.relu,
                )
                for k in range(1, K + 1)
            ]
        )
        self.max_pool1d = nn.MaxPool1d(kernel_size=2, stride=1, padding=1)

        # Projection convolutions; the last one has no activation
        in_sizes = [K * in_dim] + projections[:-1]
        activations = [self.relu] * (len(projections) - 1) + [None]
        self.conv1d_projections = nn.ModuleList(
            [
                BatchNormConv1d(
                    in_size, out_size, kernel_size=3, stride=1, padding=1, activation=ac
                )
                for (in_size, out_size, ac) in zip(in_sizes, projections, activations)
            ]
        )

        self.pre_highway = nn.Linear(projections[-1], in_dim, bias=False)
        self.highways = nn.ModuleList(
            [Highway(in_dim, in_dim) for _ in range(highway_layers)]
        )

        self.gru = nn.GRU(in_dim, out_dim, 1, batch_first=True, bidirectional=True)

    def forward(self, inputs, input_lengths=None):
        """Calculate forward propagation.

        Args:
            inputs (Tensor): a tensor of shape (batch_size, seq_len, in_dim)
            input_lengths (Tensor, optional): the valid length of each sequence
                in the batch; when given, the batch is packed before the GRU.

        Returns:
            Tensor: a tensor of shape (batch_size, seq_len, 2 * out_dim)
        """
        x = inputs
        # (B, T_in, in_dim) -> (B, in_dim, T_in): conv1d operates over the time axis
        x = x.transpose(1, 2)
        T = x.size(-1)

        # Convolution bank: run all K convolutions and concatenate their outputs
        # along the channel axis -> (B, in_dim * K, T_in)
        x = torch.cat([conv1d(x)[:, :, :T] for conv1d in self.conv1d_banks], dim=1)
        assert x.size(1) == self.in_dim * len(self.conv1d_banks)
        # Max pooling along time; stride 1 and slicing keep the sequence length
        x = self.max_pool1d(x)[:, :, :T]

        # Projection convolutions -> (B, projections[-1], T_in)
        for conv1d in self.conv1d_projections:
            x = conv1d(x)

        # Back to (B, T_in, projections[-1]) for the highway layers and the GRU
        x = x.transpose(1, 2)

        # Project to in_dim if needed so the residual connection matches
        if x.size(-1) != self.in_dim:
            x = self.pre_highway(x)

        # Residual connection followed by the highway layers
        x += inputs
        for highway in self.highways:
            x = highway(x)

        if input_lengths is not None:
            x = nn.utils.rnn.pack_padded_sequence(x, input_lengths, batch_first=True)

        # Bidirectional GRU -> (B, T_in, 2 * out_dim)
        self.gru.flatten_parameters()
        outputs, _ = self.gru(x)

        if input_lengths is not None:
            outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs, batch_first=True)

        return outputs
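
    # Illustrative usage (a minimal sketch; the hyperparameters below are
    # assumptions, not the model's actual configuration):
    #
    #     cbhg = CBHG(in_dim=256, out_dim=128, K=16, projections=[128, 256])
    #     x = torch.zeros(8, 100, 256)   # (batch_size, seq_len, in_dim)
    #     y = cbhg(x)                    # -> (8, 100, 256), i.e. 2 * out_dim features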