Spaces:

devesg
/

singing_voice_conversion

Running

File size: 5,918 Bytes

df2accb

# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import math

from torch import nn
from torch.nn import functional as F

from .modules import Conv1d1x1, ResidualConv1dGLU
from .upsample import ConvInUpsampleNetwork


def receptive_field_size(
    total_layers, num_cycles, kernel_size, dilation=lambda x: 2**x
):
    """Compute receptive field size

    Args:
        total_layers (int): total layers
        num_cycles (int): cycles
        kernel_size (int): kernel size
        dilation (lambda): lambda to compute dilation factor. ``lambda x : 1``
          to disable dilated convolution.

    Returns:
        int: receptive field size in sample

    """
    assert total_layers % num_cycles == 0

    layers_per_cycle = total_layers // num_cycles
    dilations = [dilation(i % layers_per_cycle) for i in range(total_layers)]
    return (kernel_size - 1) * sum(dilations) + 1


class WaveNet(nn.Module):
    """The WaveNet model that supports local and global conditioning.

    Args:
        out_channels (int): Output channels. If input_type is mu-law quantized
          one-hot vecror. this must equal to the quantize channels. Other wise
          num_mixtures x 3 (pi, mu, log_scale).
        layers (int): Number of total layers
        stacks (int): Number of dilation cycles
        residual_channels (int): Residual input / output channels
        gate_channels (int): Gated activation channels.
        skip_out_channels (int): Skip connection channels.
        kernel_size (int): Kernel size of convolution layers.
        dropout (float): Dropout probability.
        input_dim (int): Number of mel-spec dimension.
        upsample_scales (list): List of upsample scale.
          ``np.prod(upsample_scales)`` must equal to hop size. Used only if
          upsample_conditional_features is enabled.
        freq_axis_kernel_size (int): Freq-axis kernel_size for transposed
          convolution layers for upsampling. If you only care about time-axis
          upsampling, set this to 1.
        scalar_input (Bool): If True, scalar input ([-1, 1]) is expected, otherwise
          quantized one-hot vector is expected..
    """

    def __init__(self, cfg):
        super(WaveNet, self).__init__()
        self.cfg = cfg
        self.scalar_input = self.cfg.VOCODER.SCALAR_INPUT
        self.out_channels = self.cfg.VOCODER.OUT_CHANNELS
        self.cin_channels = self.cfg.VOCODER.INPUT_DIM
        self.residual_channels = self.cfg.VOCODER.RESIDUAL_CHANNELS
        self.layers = self.cfg.VOCODER.LAYERS
        self.stacks = self.cfg.VOCODER.STACKS
        self.gate_channels = self.cfg.VOCODER.GATE_CHANNELS
        self.kernel_size = self.cfg.VOCODER.KERNEL_SIZE
        self.skip_out_channels = self.cfg.VOCODER.SKIP_OUT_CHANNELS
        self.dropout = self.cfg.VOCODER.DROPOUT
        self.upsample_scales = self.cfg.VOCODER.UPSAMPLE_SCALES
        self.mel_frame_pad = self.cfg.VOCODER.MEL_FRAME_PAD

        assert self.layers % self.stacks == 0

        layers_per_stack = self.layers // self.stacks
        if self.scalar_input:
            self.first_conv = Conv1d1x1(1, self.residual_channels)
        else:
            self.first_conv = Conv1d1x1(self.out_channels, self.residual_channels)

        self.conv_layers = nn.ModuleList()
        for layer in range(self.layers):
            dilation = 2 ** (layer % layers_per_stack)
            conv = ResidualConv1dGLU(
                self.residual_channels,
                self.gate_channels,
                kernel_size=self.kernel_size,
                skip_out_channels=self.skip_out_channels,
                bias=True,
                dilation=dilation,
                dropout=self.dropout,
                cin_channels=self.cin_channels,
            )
            self.conv_layers.append(conv)

        self.last_conv_layers = nn.ModuleList(
            [
                nn.ReLU(inplace=True),
                Conv1d1x1(self.skip_out_channels, self.skip_out_channels),
                nn.ReLU(inplace=True),
                Conv1d1x1(self.skip_out_channels, self.out_channels),
            ]
        )

        self.upsample_net = ConvInUpsampleNetwork(
            upsample_scales=self.upsample_scales,
            cin_pad=self.mel_frame_pad,
            cin_channels=self.cin_channels,
        )

        self.receptive_field = receptive_field_size(
            self.layers, self.stacks, self.kernel_size
        )

    def forward(self, x, mel, softmax=False):
        """Forward step

        Args:
            x (Tensor): One-hot encoded audio signal, shape (B x C x T)
            mel (Tensor): Local conditioning features,
              shape (B x cin_channels x T)
            softmax (bool): Whether applies softmax or not.

        Returns:
            Tensor: output, shape B x out_channels x T
        """
        B, _, T = x.size()

        mel = self.upsample_net(mel)
        assert mel.shape[-1] == x.shape[-1]

        x = self.first_conv(x)
        skips = 0
        for f in self.conv_layers:
            x, h = f(x, mel)
            skips += h
        skips *= math.sqrt(1.0 / len(self.conv_layers))

        x = skips
        for f in self.last_conv_layers:
            x = f(x)

        x = F.softmax(x, dim=1) if softmax else x

        return x

    def clear_buffer(self):
        self.first_conv.clear_buffer()
        for f in self.conv_layers:
            f.clear_buffer()
        for f in self.last_conv_layers:
            try:
                f.clear_buffer()
            except AttributeError:
                pass

    def make_generation_fast_(self):
        def remove_weight_norm(m):
            try:
                nn.utils.remove_weight_norm(m)
            except ValueError:  # this module didn't have weight norm
                return

        self.apply(remove_weight_norm)