"""https://github.com/fishaudio/fish-speech/blob/main/fish_speech/models/vqgan/modules/wavenet.py"""
import math
from typing import Optional
import torch
import torch.nn.functional as F
from torch import nn
class Mish(nn.Module):
def forward(self, x):
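        # Mish activation: x * tanh(softplus(x)), a smooth, non-monotonic
        # alternative to ReLU.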
return x * torch.tanh(F.softplus(x))
class DiffusionEmbedding(nn.Module):
"""Diffusion Step Embedding"""
def __init__(self, d_denoiser):
        super().__init__()
self.dim = d_denoiser
def forward(self, x):
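        # Sinusoidal embedding of the diffusion step, analogous to transformer
        # positional encodings: sin/cos features at geometrically spaced frequencies.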
device = x.device
half_dim = self.dim // 2
emb = math.log(10000) / (half_dim - 1)
emb = torch.exp(torch.arange(half_dim, device=device) * -emb)
emb = x[:, None] * emb[None, :]
emb = torch.cat((emb.sin(), emb.cos()), dim=-1)
return emb
class LinearNorm(nn.Module):
"""LinearNorm Projection"""
def __init__(self, in_features, out_features, bias=False):
        super().__init__()
self.linear = nn.Linear(in_features, out_features, bias)
nn.init.xavier_uniform_(self.linear.weight)
if bias:
nn.init.constant_(self.linear.bias, 0.0)
def forward(self, x):
x = self.linear(x)
return x
class ConvNorm(nn.Module):
"""1D Convolution"""
def __init__(
self,
in_channels,
out_channels,
kernel_size=1,
stride=1,
padding=None,
dilation=1,
bias=True,
w_init_gain="linear",
):
        super().__init__()
if padding is None:
assert kernel_size % 2 == 1
padding = int(dilation * (kernel_size - 1) / 2)
self.conv = nn.Conv1d(
in_channels,
out_channels,
kernel_size=kernel_size,
stride=stride,
padding=padding,
dilation=dilation,
bias=bias,
)
nn.init.kaiming_normal_(self.conv.weight)
def forward(self, signal):
conv_signal = self.conv(signal)
return conv_signal
class ResidualBlock(nn.Module):
"""Residual Block"""
def __init__(
self,
residual_channels,
use_linear_bias=False,
dilation=1,
condition_channels=None,
):
        super().__init__()
self.conv_layer = ConvNorm(
residual_channels,
2 * residual_channels,
kernel_size=3,
stride=1,
padding=dilation,
dilation=dilation,
)
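        # The diffusion-step and condition projections below are only created
        # when a conditioning signal is configured.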
if condition_channels is not None:
self.diffusion_projection = LinearNorm(
residual_channels, residual_channels, use_linear_bias
)
self.condition_projection = ConvNorm(
condition_channels, 2 * residual_channels, kernel_size=1
)
self.output_projection = ConvNorm(
residual_channels, 2 * residual_channels, kernel_size=1
)
def forward(self, x, condition=None, diffusion_step=None):
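        # Inject the diffusion-step embedding, apply the dilated convolution with a
        # gated activation, then split the output into residual and skip branches.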
y = x
if diffusion_step is not None:
diffusion_step = self.diffusion_projection(diffusion_step).unsqueeze(-1)
y = y + diffusion_step
y = self.conv_layer(y)
if condition is not None:
condition = self.condition_projection(condition)
y = y + condition
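        # Gated activation unit as in the original WaveNet: sigmoid gate * tanh filter.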
        gate, filter_ = torch.chunk(y, 2, dim=1)
        y = torch.sigmoid(gate) * torch.tanh(filter_)
y = self.output_projection(y)
residual, skip = torch.chunk(y, 2, dim=1)
return (x + residual) / math.sqrt(2.0), skip
class WaveNet(nn.Module):
def __init__(
self,
input_channels: Optional[int] = None,
output_channels: Optional[int] = None,
residual_channels: int = 512,
residual_layers: int = 20,
dilation_cycle: Optional[int] = 4,
is_diffusion: bool = False,
condition_channels: Optional[int] = None,
):
super().__init__()
# Input projection
self.input_projection = None
if input_channels is not None and input_channels != residual_channels:
self.input_projection = ConvNorm(
input_channels, residual_channels, kernel_size=1
)
if input_channels is None:
input_channels = residual_channels
self.input_channels = input_channels
# Residual layers
self.residual_layers = nn.ModuleList(
[
ResidualBlock(
residual_channels=residual_channels,
use_linear_bias=False,
dilation=2 ** (i % dilation_cycle) if dilation_cycle else 1,
condition_channels=condition_channels,
)
for i in range(residual_layers)
]
)
# Skip projection
self.skip_projection = ConvNorm(
residual_channels, residual_channels, kernel_size=1
)
# Output projection
self.output_projection = None
if output_channels is not None and output_channels != residual_channels:
self.output_projection = ConvNorm(
residual_channels, output_channels, kernel_size=1
)
if is_diffusion:
self.diffusion_embedding = DiffusionEmbedding(residual_channels)
self.mlp = nn.Sequential(
LinearNorm(residual_channels, residual_channels * 4, False),
Mish(),
LinearNorm(residual_channels * 4, residual_channels, False),
)
self.apply(self._init_weights)
def _init_weights(self, m):
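        # Truncated-normal re-initialization applied to every Conv1d/Linear in the
        # network (this runs after, and overrides, the per-module init above).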
if isinstance(m, (nn.Conv1d, nn.Linear)):
nn.init.trunc_normal_(m.weight, std=0.02)
if getattr(m, "bias", None) is not None:
nn.init.constant_(m.bias, 0)
def forward(self, x, t=None, condition=None):
if self.input_projection is not None:
x = self.input_projection(x)
x = F.silu(x)
if t is not None:
t = self.diffusion_embedding(t)
t = self.mlp(t)
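        # Run every residual block, accumulating its skip output.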
skip = []
for layer in self.residual_layers:
x, skip_connection = layer(x, condition, t)
skip.append(skip_connection)
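        # Sum the skip connections, scaled by 1/sqrt(N) to keep the variance roughly constant.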
x = torch.sum(torch.stack(skip), dim=0) / math.sqrt(len(self.residual_layers))
x = self.skip_projection(x)
if self.output_projection is not None:
x = F.silu(x)
x = self.output_projection(x)
return x
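

# --- Minimal usage sketch (not part of the upstream file) -------------------
# Shows one way to drive the diffusion-conditioned WaveNet above; the channel
# sizes, layer count, and timestep range are illustrative assumptions, not the
# values used by fish-speech.
if __name__ == "__main__":
    net = WaveNet(
        input_channels=128,
        output_channels=128,
        residual_channels=256,
        residual_layers=4,
        dilation_cycle=4,
        is_diffusion=True,
        condition_channels=256,
    )
    x = torch.randn(2, 128, 100)  # (batch, input_channels, frames)
    t = torch.randint(0, 1000, (2,))  # integer diffusion timesteps
    cond = torch.randn(2, 256, 100)  # (batch, condition_channels, frames)
    y = net(x, t, cond)
    print(y.shape)  # torch.Size([2, 128, 100])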