"""https://github.com/fishaudio/fish-speech/blob/main/fish_speech/models/vqgan/modules/wavenet.py""" import math from typing import Optional import torch import torch.nn.functional as F from torch import nn class Mish(nn.Module): def forward(self, x): return x * torch.tanh(F.softplus(x)) class DiffusionEmbedding(nn.Module): """Diffusion Step Embedding""" def __init__(self, d_denoiser): super(DiffusionEmbedding, self).__init__() self.dim = d_denoiser def forward(self, x): device = x.device half_dim = self.dim // 2 emb = math.log(10000) / (half_dim - 1) emb = torch.exp(torch.arange(half_dim, device=device) * -emb) emb = x[:, None] * emb[None, :] emb = torch.cat((emb.sin(), emb.cos()), dim=-1) return emb class LinearNorm(nn.Module): """LinearNorm Projection""" def __init__(self, in_features, out_features, bias=False): super(LinearNorm, self).__init__() self.linear = nn.Linear(in_features, out_features, bias) nn.init.xavier_uniform_(self.linear.weight) if bias: nn.init.constant_(self.linear.bias, 0.0) def forward(self, x): x = self.linear(x) return x class ConvNorm(nn.Module): """1D Convolution""" def __init__( self, in_channels, out_channels, kernel_size=1, stride=1, padding=None, dilation=1, bias=True, w_init_gain="linear", ): super(ConvNorm, self).__init__() if padding is None: assert kernel_size % 2 == 1 padding = int(dilation * (kernel_size - 1) / 2) self.conv = nn.Conv1d( in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding, dilation=dilation, bias=bias, ) nn.init.kaiming_normal_(self.conv.weight) def forward(self, signal): conv_signal = self.conv(signal) return conv_signal class ResidualBlock(nn.Module): """Residual Block""" def __init__( self, residual_channels, use_linear_bias=False, dilation=1, condition_channels=None, ): super(ResidualBlock, self).__init__() self.conv_layer = ConvNorm( residual_channels, 2 * residual_channels, kernel_size=3, stride=1, padding=dilation, dilation=dilation, ) if condition_channels is not None: self.diffusion_projection = LinearNorm( residual_channels, residual_channels, use_linear_bias ) self.condition_projection = ConvNorm( condition_channels, 2 * residual_channels, kernel_size=1 ) self.output_projection = ConvNorm( residual_channels, 2 * residual_channels, kernel_size=1 ) def forward(self, x, condition=None, diffusion_step=None): y = x if diffusion_step is not None: diffusion_step = self.diffusion_projection(diffusion_step).unsqueeze(-1) y = y + diffusion_step y = self.conv_layer(y) if condition is not None: condition = self.condition_projection(condition) y = y + condition gate, filter = torch.chunk(y, 2, dim=1) y = torch.sigmoid(gate) * torch.tanh(filter) y = self.output_projection(y) residual, skip = torch.chunk(y, 2, dim=1) return (x + residual) / math.sqrt(2.0), skip class WaveNet(nn.Module): def __init__( self, input_channels: Optional[int] = None, output_channels: Optional[int] = None, residual_channels: int = 512, residual_layers: int = 20, dilation_cycle: Optional[int] = 4, is_diffusion: bool = False, condition_channels: Optional[int] = None, ): super().__init__() # Input projection self.input_projection = None if input_channels is not None and input_channels != residual_channels: self.input_projection = ConvNorm( input_channels, residual_channels, kernel_size=1 ) if input_channels is None: input_channels = residual_channels self.input_channels = input_channels # Residual layers self.residual_layers = nn.ModuleList( [ ResidualBlock( residual_channels=residual_channels, use_linear_bias=False, dilation=2 ** (i % 
dilation_cycle) if dilation_cycle else 1, condition_channels=condition_channels, ) for i in range(residual_layers) ] ) # Skip projection self.skip_projection = ConvNorm( residual_channels, residual_channels, kernel_size=1 ) # Output projection self.output_projection = None if output_channels is not None and output_channels != residual_channels: self.output_projection = ConvNorm( residual_channels, output_channels, kernel_size=1 ) if is_diffusion: self.diffusion_embedding = DiffusionEmbedding(residual_channels) self.mlp = nn.Sequential( LinearNorm(residual_channels, residual_channels * 4, False), Mish(), LinearNorm(residual_channels * 4, residual_channels, False), ) self.apply(self._init_weights) def _init_weights(self, m): if isinstance(m, (nn.Conv1d, nn.Linear)): nn.init.trunc_normal_(m.weight, std=0.02) if getattr(m, "bias", None) is not None: nn.init.constant_(m.bias, 0) def forward(self, x, t=None, condition=None): if self.input_projection is not None: x = self.input_projection(x) x = F.silu(x) if t is not None: t = self.diffusion_embedding(t) t = self.mlp(t) skip = [] for layer in self.residual_layers: x, skip_connection = layer(x, condition, t) skip.append(skip_connection) x = torch.sum(torch.stack(skip), dim=0) / math.sqrt(len(self.residual_layers)) x = self.skip_projection(x) if self.output_projection is not None: x = F.silu(x) x = self.output_projection(x) return x
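

# ---------------------------------------------------------------------------
# Usage sketch (not part of the upstream module): a minimal smoke test showing
# one way to drive the WaveNet above as a diffusion denoiser. All shapes and
# hyperparameters below are illustrative assumptions, not values taken from
# fish-speech itself.
if __name__ == "__main__":
    batch, frames = 2, 128
    mel_channels = 80          # assumed feature dimension
    residual_channels = 256    # smaller than the default 512, for a quick test
    cond_channels = 64         # assumed conditioning dimension

    model = WaveNet(
        input_channels=mel_channels,
        output_channels=mel_channels,
        residual_channels=residual_channels,
        residual_layers=8,
        dilation_cycle=4,
        is_diffusion=True,
        condition_channels=cond_channels,
    )

    x = torch.randn(batch, mel_channels, frames)            # noisy features (B, C, T)
    t = torch.randint(0, 1000, (batch,))                     # integer diffusion steps (B,)
    condition = torch.randn(batch, cond_channels, frames)    # conditioning signal (B, C_cond, T)

    with torch.no_grad():
        y = model(x, t=t, condition=condition)

    print(y.shape)  # expected: torch.Size([2, 80, 128])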