zyingt's picture
Upload 685 files
0d80816
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import math
from torch import nn
from torch.nn import functional as F
from .modules import Conv1d1x1, ResidualConv1dGLU
from .upsample import ConvInUpsampleNetwork
def receptive_field_size(
total_layers, num_cycles, kernel_size, dilation=lambda x: 2**x
):
"""Compute receptive field size
Args:
total_layers (int): total layers
num_cycles (int): cycles
kernel_size (int): kernel size
dilation (lambda): lambda to compute dilation factor. ``lambda x : 1``
to disable dilated convolution.
Returns:
int: receptive field size in sample
"""
assert total_layers % num_cycles == 0
layers_per_cycle = total_layers // num_cycles
dilations = [dilation(i % layers_per_cycle) for i in range(total_layers)]
return (kernel_size - 1) * sum(dilations) + 1
class WaveNet(nn.Module):
"""The WaveNet model that supports local and global conditioning.
Args:
out_channels (int): Output channels. If input_type is mu-law quantized
one-hot vecror. this must equal to the quantize channels. Other wise
num_mixtures x 3 (pi, mu, log_scale).
layers (int): Number of total layers
stacks (int): Number of dilation cycles
residual_channels (int): Residual input / output channels
gate_channels (int): Gated activation channels.
skip_out_channels (int): Skip connection channels.
kernel_size (int): Kernel size of convolution layers.
dropout (float): Dropout probability.
input_dim (int): Number of mel-spec dimension.
upsample_scales (list): List of upsample scale.
``np.prod(upsample_scales)`` must equal to hop size. Used only if
upsample_conditional_features is enabled.
freq_axis_kernel_size (int): Freq-axis kernel_size for transposed
convolution layers for upsampling. If you only care about time-axis
upsampling, set this to 1.
scalar_input (Bool): If True, scalar input ([-1, 1]) is expected, otherwise
quantized one-hot vector is expected..
"""
def __init__(self, cfg):
super(WaveNet, self).__init__()
self.cfg = cfg
self.scalar_input = self.cfg.VOCODER.SCALAR_INPUT
self.out_channels = self.cfg.VOCODER.OUT_CHANNELS
self.cin_channels = self.cfg.VOCODER.INPUT_DIM
self.residual_channels = self.cfg.VOCODER.RESIDUAL_CHANNELS
self.layers = self.cfg.VOCODER.LAYERS
self.stacks = self.cfg.VOCODER.STACKS
self.gate_channels = self.cfg.VOCODER.GATE_CHANNELS
self.kernel_size = self.cfg.VOCODER.KERNEL_SIZE
self.skip_out_channels = self.cfg.VOCODER.SKIP_OUT_CHANNELS
self.dropout = self.cfg.VOCODER.DROPOUT
self.upsample_scales = self.cfg.VOCODER.UPSAMPLE_SCALES
self.mel_frame_pad = self.cfg.VOCODER.MEL_FRAME_PAD
assert self.layers % self.stacks == 0
layers_per_stack = self.layers // self.stacks
if self.scalar_input:
self.first_conv = Conv1d1x1(1, self.residual_channels)
else:
self.first_conv = Conv1d1x1(self.out_channels, self.residual_channels)
self.conv_layers = nn.ModuleList()
for layer in range(self.layers):
dilation = 2 ** (layer % layers_per_stack)
conv = ResidualConv1dGLU(
self.residual_channels,
self.gate_channels,
kernel_size=self.kernel_size,
skip_out_channels=self.skip_out_channels,
bias=True,
dilation=dilation,
dropout=self.dropout,
cin_channels=self.cin_channels,
)
self.conv_layers.append(conv)
self.last_conv_layers = nn.ModuleList(
[
nn.ReLU(inplace=True),
Conv1d1x1(self.skip_out_channels, self.skip_out_channels),
nn.ReLU(inplace=True),
Conv1d1x1(self.skip_out_channels, self.out_channels),
]
)
self.upsample_net = ConvInUpsampleNetwork(
upsample_scales=self.upsample_scales,
cin_pad=self.mel_frame_pad,
cin_channels=self.cin_channels,
)
self.receptive_field = receptive_field_size(
self.layers, self.stacks, self.kernel_size
)
def forward(self, x, mel, softmax=False):
"""Forward step
Args:
x (Tensor): One-hot encoded audio signal, shape (B x C x T)
mel (Tensor): Local conditioning features,
shape (B x cin_channels x T)
softmax (bool): Whether applies softmax or not.
Returns:
Tensor: output, shape B x out_channels x T
"""
B, _, T = x.size()
mel = self.upsample_net(mel)
assert mel.shape[-1] == x.shape[-1]
x = self.first_conv(x)
skips = 0
for f in self.conv_layers:
x, h = f(x, mel)
skips += h
skips *= math.sqrt(1.0 / len(self.conv_layers))
x = skips
for f in self.last_conv_layers:
x = f(x)
x = F.softmax(x, dim=1) if softmax else x
return x
def clear_buffer(self):
self.first_conv.clear_buffer()
for f in self.conv_layers:
f.clear_buffer()
for f in self.last_conv_layers:
try:
f.clear_buffer()
except AttributeError:
pass
def make_generation_fast_(self):
def remove_weight_norm(m):
try:
nn.utils.remove_weight_norm(m)
except ValueError: # this module didn't have weight norm
return
self.apply(remove_weight_norm)