from dac.nn.quantize import ResidualVectorQuantize
from torch import nn
from modules.wavenet import WN
import torch
import torchaudio
import torchaudio.functional as audio_F
import numpy as np
from .bigvgan import *
from torch.nn.utils import weight_norm
from torch import nn, sin, pow
from einops.layers.torch import Rearrange
from dac.model.encodec import SConv1d


def init_weights(m):
    """Initialise a ``nn.Conv1d`` module in place; ignore any other module.

    Weights are drawn from a truncated normal with ``std=0.02`` and the bias
    (when the layer has one) is set to zero.  Intended for use with
    ``module.apply(init_weights)``.
    """
    if isinstance(m, nn.Conv1d):
        nn.init.trunc_normal_(m.weight, std=0.02)
        # Conv1d(..., bias=False) stores ``bias`` as None; skip it instead of
        # crashing inside ``constant_``.
        if m.bias is not None:
            nn.init.constant_(m.bias, 0)


def WNConv1d(*args, **kwargs):
    """Build a ``nn.Conv1d`` from the given arguments and wrap it in weight norm."""
    return weight_norm(nn.Conv1d(*args, **kwargs))


def WNConvTranspose1d(*args, **kwargs):
    """Build a ``nn.ConvTranspose1d`` from the given arguments and wrap it in weight norm."""
    return weight_norm(nn.ConvTranspose1d(*args, **kwargs))


class SnakeBeta(nn.Module):
    """
    A modified Snake function which uses separate parameters for the magnitude
    of the periodic components.

    Shape:
        - Input: (B, C, T)
        - Output: (B, C, T), same shape as the input
    Parameters:
        - alpha - trainable parameter that controls frequency
        - beta - trainable parameter that controls magnitude
    References:
        - This activation function is a modified version based on this paper by
          Liu Ziyin, Tilman Hartwig, Masahito Ueda:
          https://arxiv.org/abs/2006.08195
    Examples:
        >>> a1 = SnakeBeta(256)
        >>> x = torch.randn(4, 256, 100)
        >>> x = a1(x)
    """

    def __init__(
        self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False
    ):
        """
        Initialization.

        INPUT:
            - in_features: number of channels C; one alpha/beta pair is
              learned per channel
            - alpha: initial value used for both alpha and beta in the
              linear-scale case
            - alpha_trainable: whether alpha and beta receive gradients
            - alpha_logscale: store alpha and beta in log space (they are
              exponentiated in ``forward``)

        alpha is initialized to 1 by default, higher values = higher-frequency.
        beta is initialized to 1 by default, higher values = higher-magnitude.
        alpha will be trained along with the rest of your model.
        """
        super().__init__()
        self.in_features = in_features

        # initialize alpha
        self.alpha_logscale = alpha_logscale
        if self.alpha_logscale:
            # Log-scale parameters start at zero (exp(0) == 1).  Multiplying
            # zeros by ``alpha`` keeps them zero for any finite ``alpha``.
            self.alpha = nn.Parameter(torch.zeros(in_features) * alpha)
            self.beta = nn.Parameter(torch.zeros(in_features) * alpha)
        else:
            # Linear-scale parameters start at ``alpha``.
            self.alpha = nn.Parameter(torch.ones(in_features) * alpha)
            self.beta = nn.Parameter(torch.ones(in_features) * alpha)

        self.alpha.requires_grad = alpha_trainable
        self.beta.requires_grad = alpha_trainable

        # Added to beta before taking its reciprocal in ``forward``.
        self.no_div_by_zero = 0.000000001

    def forward(self, x):
        """
        Forward pass of the function.
        Applies the function to the input elementwise.
        SnakeBeta := x + 1/b * sin^2 (xa)
        """
        # Reshape (C,) -> (1, C, 1) so the parameters broadcast over [B, C, T].
        alpha = self.alpha.unsqueeze(0).unsqueeze(-1)
        beta = self.beta.unsqueeze(0).unsqueeze(-1)
        if self.alpha_logscale:
            alpha = torch.exp(alpha)
            beta = torch.exp(beta)
        periodic = torch.pow(torch.sin(x * alpha), 2)
        return x + (1.0 / (beta + self.no_div_by_zero)) * periodic


class ResidualUnit(nn.Module):
    """Residual block: SnakeBeta -> dilated k=7 conv -> SnakeBeta -> k=1 conv, plus skip.

    ``Activation1d`` is not defined in this file; presumably it is provided by
    the ``.bigvgan`` star import.
    """

    def __init__(self, dim: int = 16, dilation: int = 1):
        super().__init__()
        # Padding chosen so the kernel-size-7 dilated convolution keeps the
        # time dimension unchanged.
        pad = ((7 - 1) * dilation) // 2
        self.block = nn.Sequential(
            Activation1d(activation=SnakeBeta(dim, alpha_logscale=True)),
            WNConv1d(dim, dim, kernel_size=7, dilation=dilation, padding=pad),
            Activation1d(activation=SnakeBeta(dim, alpha_logscale=True)),
            WNConv1d(dim, dim, kernel_size=1),
        )

    def forward(self, x):
        """Return ``x + block(x)``."""
        return x + self.block(x)


class CNNLSTM(nn.Module):
    """Stack of three dilated residual units followed by ``head`` linear heads.

    Despite the class name, the implementation contains no recurrent layer.

    Args:
        indim: channel count of the input and of the residual stack.
        outdim: output size of each linear head.
        head: number of independent linear heads.
        global_pred: when true, features are averaged over the time axis
            before the heads are applied.
    """

    def __init__(self, indim, outdim, head, global_pred=False):
        super().__init__()
        self.global_pred = global_pred
        self.model = nn.Sequential(
            ResidualUnit(indim, dilation=1),
            ResidualUnit(indim, dilation=2),
            ResidualUnit(indim, dilation=3),
            Activation1d(activation=SnakeBeta(indim, alpha_logscale=True)),
            Rearrange("b c t -> b t c"),
        )
        self.heads = nn.ModuleList(
            [nn.Linear(indim, outdim) for _ in range(head)]
        )

    def forward(self, x):
        """Run the stack on ``x`` of shape [B, C, T] and return one tensor per head."""
        # x: [B, C, T] -> [B, T, C] after the final Rearrange.
        x = self.model(x)
        if self.global_pred:
            # Average over the time axis: [B, T, C] -> [B, C].
            x = torch.mean(x, dim=1, keepdim=False)
        outs = [head(x) for head in self.heads]
        return outs


def sequence_mask(length, max_length=None):
    """Return a boolean mask whose entry ``[i, j]`` is true when ``j < length[i]``.

    Args:
        length: tensor of sequence lengths (assumed 1-D, one entry per
            sequence -- confirm against callers).
        max_length: width of the mask; defaults to ``length.max()``.
    """
    if max_length is None:
        max_length = length.max()
    x = torch.arange(max_length, dtype=length.dtype, device=length.device)
    return x.unsqueeze(0) < length.unsqueeze(1)


class FAquantizer(nn.Module):
    """Quantizer with separate prosody, content and residual RVQ branches.

    The prosody branch is driven by the lowest mel bins of the waveform, passed
    through a small WN encoder; the content branch quantizes the incoming
    feature directly; the residual branch quantizes what is left after
    subtracting the (detached) prosody and content reconstructions.

    Args:
        in_dim: ``input_dim`` of all three residual vector quantizers.
        n_p_codebooks: number of codebooks in the prosody quantizer.
        n_c_codebooks: number of codebooks in the content quantizer.
        n_t_codebooks: accepted but not used in this class.
        n_r_codebooks: number of codebooks in the residual quantizer.
        codebook_size: entries per codebook.
        codebook_dim: dimensionality of each codebook entry.
        quantizer_dropout: forwarded to every ``ResidualVectorQuantize``.
        causal: forwarded to the ``SConv1d`` layers and the ``WN`` encoder.
        separate_prosody_encoder: accepted but not used in this class.
        timbre_norm: accepted but not used in this class.
    """

    def __init__(self,
                 in_dim=1024,
                 n_p_codebooks=1,
                 n_c_codebooks=2,
                 n_t_codebooks=2,
                 n_r_codebooks=3,
                 codebook_size=1024,
                 codebook_dim=8,
                 quantizer_dropout=0.5,
                 causal=False,
                 separate_prosody_encoder=False,
                 timbre_norm=False,):
        super().__init__()
        conv1d_type = SConv1d  # if causal else nn.Conv1d

        self.prosody_quantizer = ResidualVectorQuantize(
            input_dim=in_dim,
            n_codebooks=n_p_codebooks,
            codebook_size=codebook_size,
            codebook_dim=codebook_dim,
            quantizer_dropout=quantizer_dropout,
        )

        self.content_quantizer = ResidualVectorQuantize(
            input_dim=in_dim,
            n_codebooks=n_c_codebooks,
            codebook_size=codebook_size,
            codebook_dim=codebook_dim,
            quantizer_dropout=quantizer_dropout,
        )

        self.residual_quantizer = ResidualVectorQuantize(
            input_dim=in_dim,
            n_codebooks=n_r_codebooks,
            codebook_size=codebook_size,
            codebook_dim=codebook_dim,
            quantizer_dropout=quantizer_dropout,
        )

        # Prosody encoder: 20 mel bins -> 256 -> WN -> 1024 channels.
        # NOTE(review): the 1024 output width is hard-coded while the
        # quantizers use ``in_dim``; the two presumably need to match.
        self.melspec_linear = conv1d_type(
            in_channels=20, out_channels=256, kernel_size=1, causal=causal
        )
        self.melspec_encoder = WN(
            hidden_channels=256,
            kernel_size=5,
            dilation_rate=1,
            n_layers=8,
            gin_channels=0,
            p_dropout=0.2,
            causal=causal,
        )
        self.melspec_linear2 = conv1d_type(
            in_channels=256, out_channels=1024, kernel_size=1, causal=causal
        )

        self.prob_random_mask_residual = 0.75

        SPECT_PARAMS = {
            "n_fft": 2048,
            "win_length": 1200,
            "hop_length": 300,
        }
        MEL_PARAMS = {
            "n_mels": 80,
        }

        self.to_mel = torchaudio.transforms.MelSpectrogram(
            n_mels=MEL_PARAMS["n_mels"], sample_rate=24000, **SPECT_PARAMS
        )
        # Constants used to normalise the log-mel spectrogram in ``preprocess``.
        self.mel_mean, self.mel_std = -4, 4
        self.frame_rate = 24000 / 300
        self.hop_length = 300

    def preprocess(self, wave_tensor, n_bins=20):
        """Return the normalised log-mel spectrogram cropped to ``n_bins`` bins.

        Args:
            wave_tensor: waveform batch; dimension 1 is squeezed before the
                mel transform (presumably shape (B, 1, samples) -- confirm
                against callers).
            n_bins: number of lowest mel bins to keep.

        Returns:
            Tensor of shape (B, n_bins, frames), with ``frames`` capped at
            ``samples / hop_length`` rounded down.
        """
        mel_tensor = self.to_mel(wave_tensor.squeeze(1))
        mel_tensor = (torch.log(1e-5 + mel_tensor) - self.mel_mean) / self.mel_std
        n_frames = int(wave_tensor.size(-1) / self.hop_length)
        return mel_tensor[:, :n_bins, :n_frames]

    def forward(self, x, wave_segments):
        """Quantize ``x`` into prosody, content and residual components.

        Args:
            x: content feature, indexed as (B, C, T) in this method.
            wave_segments: waveform batch passed to ``preprocess``.

        Returns:
            ``(quantized, codes)`` where ``quantized`` is ``[z_p, z_c, z_r]``
            and ``codes`` is ``[codes_p, codes_c, codes_r]``.  The latents and
            the commitment/codebook losses returned by the quantizers are
            discarded here.
        """
        prosody_feature = self.preprocess(wave_segments)

        f0_input = prosody_feature  # (B, 20, T)
        f0_input = self.melspec_linear(f0_input)
        # All-ones mask: every frame is treated as valid by the WN encoder.
        mask = torch.ones(
            f0_input.shape[0],
            1,
            f0_input.shape[2],
            dtype=torch.bool,
            device=f0_input.device,
        )
        f0_input = self.melspec_encoder(f0_input, mask)
        f0_input = self.melspec_linear2(f0_input)

        # Crop both streams to the shorter time length so they can be
        # combined frame by frame.
        common_min_size = min(f0_input.size(2), x.size(2))
        f0_input = f0_input[:, :, :common_min_size]
        x = x[:, :, :common_min_size]

        z_p, codes_p, _latents_p, _commitment_loss_p, _codebook_loss_p = (
            self.prosody_quantizer(f0_input, 1)
        )

        z_c, codes_c, _latents_c, _commitment_loss_c, _codebook_loss_c = (
            self.content_quantizer(x, 2)
        )

        # Detach so the residual branch does not back-propagate into the
        # prosody and content quantizers.
        residual_feature = x - z_p.detach() - z_c.detach()

        z_r, codes_r, _latents_r, _commitment_loss_r, _codebook_loss_r = (
            self.residual_quantizer(residual_feature, 3)
        )

        quantized = [z_p, z_c, z_r]
        codes = [codes_p, codes_c, codes_r]

        return quantized, codes