|
import numpy as np |
|
import torch |
|
import logging |
|
|
|
import torch.nn.functional as F |
|
from torch import nn |
|
|
|
from modules.univ_ddsp.block import LVCBlock |
|
|
|
LRELU_SLOPE = 0.1 |
|
|
|
from modules.ddsp.vocoder import CombSub, Sins |
|
|
|
|
|
class SineGen(torch.nn.Module):
    """Sine-wave excitation generator driven by a frame-level F0 track.

    SineGen(samp_rate, harmonic_num=0, sine_amp=0.1, noise_std=0.003,
            voiced_threshold=0)

    samp_rate: sampling rate in Hz
    harmonic_num: number of harmonic overtones (default 0)
    sine_amp: amplitude of sine-waveform (default 0.1)
    noise_std: std of Gaussian noise added on voiced steps (default 0.003)
    voiced_threshold: F0 threshold for U/V classification (default 0)
    """

    def __init__(self, samp_rate, harmonic_num=0,
                 sine_amp=0.1, noise_std=0.003,
                 voiced_threshold=0):
        super(SineGen, self).__init__()
        self.sine_amp = sine_amp
        self.noise_std = noise_std
        self.harmonic_num = harmonic_num
        # One channel for the fundamental plus one per overtone.
        self.dim = self.harmonic_num + 1
        self.sampling_rate = samp_rate
        self.voiced_threshold = voiced_threshold

    def _f02uv(self, f0):
        """Return a mask shaped like ``f0``: 1 where f0 > voiced_threshold, else 0."""
        uv = torch.ones_like(f0)
        uv = uv * (f0 > self.voiced_threshold)
        return uv

    def _f02sine(self, f0_values, upp):
        """Turn per-frame frequencies into sample-level sine waves.

        f0_values: (batchsize, length, dim)
                   where dim indicates fundamental tone and overtones
        upp: integer upsampling factor (samples per frame)

        Returns a tensor of shape (batchsize, length * upp, dim) in the
        dtype of ``f0_values``.
        """
        # Per-frame phase increment, in cycles, wrapped to one cycle.
        rad_values = (f0_values / self.sampling_rate).fmod(1.)

        # Random initial phase for the overtones only; channel 0 (the
        # fundamental) always starts at phase 0.
        rand_ini = torch.rand(1, self.dim, device=f0_values.device)
        rand_ini[:, 0] = 0
        rad_values[:, 0, :] += rand_ini

        # The accumulations below run in float64; results are cast back to
        # the caller's dtype instead of assuming "not float32 means half".
        work_dtype = rad_values.dtype

        tmp_over_one = torch.cumsum(rad_values.double(), 1).to(work_dtype)
        tmp_over_one *= upp
        tmp_over_one = F.interpolate(
            tmp_over_one.transpose(2, 1), scale_factor=upp,
            mode='linear', align_corners=True
        ).transpose(2, 1)
        rad_values = F.interpolate(
            rad_values.transpose(2, 1), scale_factor=upp, mode='nearest'
        ).transpose(2, 1)
        tmp_over_one = tmp_over_one.fmod(1.)

        # First-order difference along the time axis, written as a conv2d.
        # The kernel is created in the input's dtype so conv2d does not fail
        # on a dtype mismatch for non-float32 inputs.
        diff_kernel = torch.tensor(
            [[[[-1.], [1.]]]],
            dtype=tmp_over_one.dtype, device=tmp_over_one.device
        )
        diff = F.conv2d(
            tmp_over_one.unsqueeze(1), diff_kernel,
            stride=(1, 1), padding=0, dilation=(1, 1)
        ).squeeze(1)

        # 1.0 on every sample where the wrapped phase decreased; the first
        # sample has no predecessor, so it is padded with 0.
        cumsum_shift = (diff < 0).double()
        cumsum_shift = torch.cat((
            torch.zeros((f0_values.size()[0], 1, self.dim), dtype=torch.double).to(f0_values.device),
            cumsum_shift
        ), dim=1)

        sines = torch.sin(torch.cumsum(rad_values.double() + cumsum_shift, dim=1) * 2 * np.pi)
        return sines.to(work_dtype)

    @torch.no_grad()
    def forward(self, f0, upp):
        """sine_tensor = forward(f0, upp)

        input F0: tensor(batchsize, length); a trailing axis is added here.
                  f0 for unvoiced steps should be 0
        upp: integer upsampling factor (samples per frame)
        output sine_tensor: tensor(batchsize, length * upp, dim)

        Voiced steps carry the sine waves plus Gaussian noise with std
        ``noise_std``; unvoiced steps carry only Gaussian noise with std
        ``sine_amp / 3``.
        """
        f0 = f0.unsqueeze(-1)
        # Frequencies of the fundamental and its overtones: f0 * (1..dim).
        fn = torch.multiply(f0, torch.arange(1, self.dim + 1, device=f0.device).reshape((1, 1, -1)))
        sine_waves = self._f02sine(fn, upp) * self.sine_amp

        # Voiced/unvoiced mask, repeated to sample resolution.
        uv = (f0 > self.voiced_threshold).float()
        uv = F.interpolate(uv.transpose(2, 1), scale_factor=upp, mode='nearest').transpose(2, 1)

        noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
        noise = noise_amp * torch.randn_like(sine_waves)
        sine_waves = sine_waves * uv + noise
        return sine_waves
|
|
|
|
|
class SourceModuleHnNSF(torch.nn.Module):
    """Harmonic source module for hn-nsf.

    SourceModuleHnNSF(sampling_rate, harmonic_num=0, sine_amp=0.1,
                      add_noise_std=0.003, voiced_threshold=0)

    sampling_rate: sampling rate in Hz
    harmonic_num: number of harmonics above F0 (default: 0)
    sine_amp: amplitude of the sine source signal (default: 0.1)
    add_noise_std: std of additive Gaussian noise (default: 0.003)
    voiced_threshold: threshold to set U/V given F0 (default: 0)

    All of these are forwarded to the internal SineGen. Calling the module
    returns a single tensor: the SineGen output passed through a linear
    layer that merges the ``harmonic_num + 1`` channels into one, followed
    by tanh.
    """

    def __init__(self, sampling_rate, harmonic_num=0, sine_amp=0.1,
                 add_noise_std=0.003, voiced_threshold=0):
        super().__init__()

        self.sine_amp = sine_amp
        self.noise_std = add_noise_std

        # Generator for the fundamental and its overtones.
        self.l_sin_gen = SineGen(
            samp_rate=sampling_rate,
            harmonic_num=harmonic_num,
            sine_amp=sine_amp,
            noise_std=add_noise_std,
            voiced_threshold=voiced_threshold,
        )

        # Collapse all harmonic channels into one excitation channel.
        n_channels = harmonic_num + 1
        self.l_linear = nn.Linear(n_channels, 1)
        self.l_tanh = nn.Tanh()

    def forward(self, x, upp):
        """Return the merged source for F0 track ``x`` upsampled by ``upp``."""
        harmonics = self.l_sin_gen(x, upp)
        merged = self.l_linear(harmonics)
        return self.l_tanh(merged)
|
|
|
class DDSP(nn.Module):
    """Wrapper that builds a CombSub or Sins DDSP vocoder from a config dict.

    Args:
        config (dict): must provide ``config['model_args']['type']`` equal to
            ``'CombSub'`` or ``'Sins'``, plus the keys read below
            (``audio_sample_rate``, ``hop_size``, ``win_size``,
            ``audio_num_mel_bins`` and the matching ``model_args`` entries).

    Raises:
        ValueError: if the configured type is neither ``'CombSub'`` nor
            ``'Sins'``. Previously this left ``self.ddsp`` undefined and only
            failed later, inside ``forward``.
    """

    def __init__(self, config):
        super().__init__()
        model_args = config['model_args']
        model_type = model_args['type']

        # Reject unknown types before touching any other config key.
        if model_type not in ('CombSub', 'Sins'):
            raise ValueError(
                f"Unsupported DDSP model type: {model_type!r}; "
                "expected 'CombSub' or 'Sins'."
            )

        # Keyword arguments common to both vocoder variants.
        shared_kwargs = dict(
            sampling_rate=config['audio_sample_rate'],
            block_size=config['hop_size'],
            win_length=config['win_size'],
            n_mag_noise=model_args['n_mag_noise'],
            n_mels=config['audio_num_mel_bins'],
        )

        if model_type == 'CombSub':
            self.ddsp = CombSub(
                n_mag_harmonic=model_args['n_mag_harmonic'],
                **shared_kwargs
            )
        else:
            self.ddsp = Sins(
                n_harmonics=model_args['n_harmonics'],
                **shared_kwargs
            )

    def forward(self, mel, f0, infer=False):
        """Run the wrapped vocoder.

        ``mel`` has its axes 1 and 2 swapped and ``f0`` gets a trailing axis
        before being handed to the vocoder.

        Returns:
            tuple: ``(signal.unsqueeze(1), s_h, s_n)`` where ``signal`` is the
            first vocoder output and ``(s_h, s_n)`` is its third output.
        """
        signal, _, (s_h, s_n) = self.ddsp(
            mel.transpose(1, 2), torch.unsqueeze(f0, dim=-1), infer=infer
        )
        return signal.unsqueeze(1), s_h, s_n
|
|
|
class downblock(nn.Module):
    """Strided Conv1d downsampler with GLU gating and a refinement conv.

    Args:
        down (int): stride of the first convolution; its kernel size is
            ``2 * down`` and its padding ``down // 2``.
        indim (int): input channels.
        outdim (int): output channels.
    """

    def __init__(self, down, indim, outdim):
        super().__init__()
        # The strided conv emits 2 * outdim channels; the GLU on the channel
        # axis halves that back to outdim.
        self.c = nn.Conv1d(
            in_channels=indim,
            out_channels=2 * outdim,
            kernel_size=2 * down,
            stride=down,
            padding=down // 2,
        )
        self.act = GLU(dim=1)
        self.out = nn.Conv1d(
            in_channels=outdim, out_channels=outdim, kernel_size=3, padding=1
        )
        self.act1 = nn.GELU()

    def forward(self, x):
        """Apply conv -> GLU -> conv -> GELU to ``x``."""
        gated = self.act(self.c(x))
        refined = self.out(gated)
        return self.act1(refined)
|
|
|
class ddsp_down(nn.Module):
    """Multi-resolution feature pyramid built from a single-channel signal.

    Args:
        dims (int): channel count of every returned feature map; the internal
            width grows by ``dims`` per downsampling stage.
        downs (list): downsampling factors. They are consumed in reverse
            order and the last factor of the reversed sequence is not used.
            The argument itself is never modified.
    """

    def __init__(self, dims, downs: list, ):
        super().__init__()

        # Work on a reversed copy: reversing in place would silently alter
        # the caller's list (and fail outright for tuples).
        factors = list(downs)[::-1]

        # Projection of the raw input (1 channel) to `dims` channels.
        self.fistpp = nn.Conv1d(1, dims, kernel_size=1)

        stages = []
        projections = []
        for idx, factor in enumerate(factors[:-1]):
            # The first stage reads the 1-channel input; later stages read
            # the previous stage's output, which has dims * idx channels.
            in_ch = 1 if idx == 0 else dims * idx
            out_ch = dims * (idx + 1)
            stages.append(downblock(factor, in_ch, out_ch))
            projections.append(nn.Conv1d(out_ch, dims, kernel_size=1))

        self.downs = nn.ModuleList(stages)
        self.ppls = nn.ModuleList(projections)

    def forward(self, x):
        """Return the projected features of every stage.

        The list starts with the output of the last (most downsampled) stage
        and ends with the projection of the undownsampled input.
        """
        spec = [self.fistpp(x)]
        for stage, projection in zip(self.downs, self.ppls):
            x = stage(x)
            spec.append(projection(x))
        spec.reverse()
        return spec
|
|
|
|
|
|
|
class GLU(torch.nn.Module):
    """Gated linear unit: split the input in two along ``dim`` and gate.

    The first half is multiplied by the sigmoid of the second half, so the
    size of ``dim`` is halved in the output.
    """

    def __init__(self, dim):
        super().__init__()
        self.dim = dim

    def forward(self, x):
        """Return ``first_half * sigmoid(second_half)`` along ``self.dim``."""
        value, gate = torch.chunk(x, 2, dim=self.dim)
        return value * torch.sigmoid(gate)
|
|
|
class Upspamper(torch.nn.Module):
    """2-D convolutional upsampler that doubles the last axis of its input.

    The input is treated as a one-channel image. The transposed convolution
    (kernel (3, 32), stride (1, 2), padding (1, 15)) keeps the second-to-last
    axis and doubles the last one. Every convolution is followed by a GLU
    over the channel axis, which halves the channel count.
    """

    def __init__(self):
        super().__init__()
        self.c1 = torch.nn.Conv2d(in_channels=1, out_channels=8, kernel_size=1)
        self.UP = torch.nn.ConvTranspose2d(
            4, 8, kernel_size=(3, 32), stride=(1, 2), padding=(1, 15)
        )
        self.Glu = GLU(dim=1)
        self.c2 = torch.nn.Conv2d(in_channels=4, out_channels=8, kernel_size=3, padding=1)
        self.c3 = torch.nn.Conv2d(in_channels=4, out_channels=2, kernel_size=1)

    def forward(self, x):
        """Upsample ``x`` and return it with the channel axis removed again."""
        # Add the singleton image-channel axis expected by Conv2d.
        hidden = x.unsqueeze(1)
        hidden = self.Glu(self.c1(hidden))

        hidden = self.Glu(self.UP(hidden))

        # Residual refinement at the upsampled resolution.
        shortcut = hidden
        hidden = self.Glu(self.c2(hidden)) + shortcut

        # 2 channels -> GLU -> 1 channel, which is then squeezed away.
        hidden = self.Glu(self.c3(hidden))
        return hidden.squeeze(1)
|
|
|
class nsfUnivNet(torch.nn.Module):
    """Generator that stacks LVC blocks and conditions them on a harmonic source.

    A SourceModuleHnNSF turns F0 into a harmonic excitation, ddsp_down turns
    that excitation into one feature map per LVC block, and each LVCBlock
    receives the running signal, the (optionally upsampled) conditioning
    features and its own excitation feature map.
    """

    def __init__(self, h, use_weight_norm=True):
        """Build the generator from configuration dict ``h``.

        Args:
            h (dict): configuration. Keys read here: ``audio_sample_rate``,
                ``hop_size``, ``audio_num_mel_bins`` and ``model_args`` with
                ``cond_in_channels``, ``out_channels``, ``cg_channels``,
                ``upsample_rates``, ``num_lvc_blocks``, ``lvc_kernels``,
                ``lvc_hidden_channels``, ``lvc_conv_size``, ``dropout`` and
                the optional ``upmel``.
            use_weight_norm (bool): apply weight normalization to every
                Conv1d / Conv2d submodule after construction.
        """
        super().__init__()

        self.m_source = SourceModuleHnNSF(
            sampling_rate=h['audio_sample_rate'],
            harmonic_num=8
        )
        # Number of samples generated per F0 frame.
        self.upp = int(np.prod(h['hop_size']))

        model_args = h['model_args']
        in_channels = model_args['cond_in_channels']
        out_channels = model_args['out_channels']
        inner_channels = model_args['cg_channels']
        cond_channels = h['audio_num_mel_bins']
        upsample_ratios = model_args['upsample_rates']
        lvc_layers_each_block = model_args['num_lvc_blocks']
        lvc_kernel_size = model_args['lvc_kernels']
        kpnet_hidden_channels = model_args['lvc_hidden_channels']
        kpnet_conv_size = model_args['lvc_conv_size']
        dropout = model_args['dropout']

        self.ddspd = ddsp_down(dims=inner_channels, downs=upsample_ratios.copy(), )

        # Optional upsampling of the conditioning features: `upmel // 2`
        # Upspamper stages when `upmel` is configured, otherwise a no-op.
        # (upmel == 1 yields an empty Sequential, which also passes its
        # input through unchanged.)
        upmel = model_args.get('upmel')
        if upmel is not None:
            self.upblocke = torch.nn.Sequential(*[Upspamper() for _ in range(upmel // 2)])
        else:
            self.upblocke = torch.nn.Identity()

        self.in_channels = in_channels
        self.out_channels = out_channels
        self.cond_channels = cond_channels
        self.lvc_block_nums = len(upsample_ratios)

        self.first_conv = torch.nn.Conv1d(in_channels, inner_channels,
                                          kernel_size=7, padding=(7 - 1) // 2,
                                          dilation=1, bias=True)

        self.lvc_blocks = torch.nn.ModuleList()
        # Running product of the upsampling ratios up to the current block.
        cond_hop_length = 1
        for ratio in upsample_ratios:
            cond_hop_length = cond_hop_length * ratio
            self.lvc_blocks.append(LVCBlock(
                in_channels=inner_channels,
                cond_channels=cond_channels,
                upsample_ratio=ratio,
                conv_layers=lvc_layers_each_block,
                conv_kernel_size=lvc_kernel_size,
                cond_hop_length=cond_hop_length,
                kpnet_hidden_channels=kpnet_hidden_channels,
                kpnet_conv_size=kpnet_conv_size,
                kpnet_dropout=dropout,
            ))

        self.last_conv_layers = torch.nn.ModuleList([
            torch.nn.Conv1d(inner_channels, out_channels, kernel_size=7, padding=(7 - 1) // 2,
                            dilation=1, bias=True),
        ])

        if use_weight_norm:
            self.apply_weight_norm()

    def forward(self, x, c, f0, infer=False):
        """Calculate forward propagation.

        Args:
            x (Tensor): Input signal fed to ``first_conv``; its channel axis
                must have ``cond_in_channels`` entries.
            c (Tensor): Local conditioning auxiliary features; passed through
                ``upblocke`` and then to every LVC block.
            f0 (Tensor): F0 track handed to the harmonic source module
                together with ``self.upp``.
            infer (bool): accepted but not used in this method.
        Returns:
            tuple: ``(output, har_source)`` where ``output`` is the
            tanh-bounded result of the last convolution and ``har_source`` is
            the harmonic excitation with axes 1 and 2 swapped.
        """
        har_source = self.m_source(f0, self.upp).transpose(1, 2)
        specl = self.ddspd(har_source)

        x = self.first_conv(x)
        c = self.upblocke(c)

        # Block n consumes the n-th excitation feature map.
        for n in range(self.lvc_block_nums):
            x = self.lvc_blocks[n](x, c, specl[n])

        for conv in self.last_conv_layers:
            x = F.leaky_relu(x, LRELU_SLOPE)
            x = conv(x)
        x = torch.tanh(x)
        return x, har_source

    def remove_weight_norm(self):
        """Remove weight normalization module from all of the layers."""
        def _remove_weight_norm(m):
            try:
                torch.nn.utils.remove_weight_norm(m)
            except ValueError:
                # This module does not have weight norm: nothing to remove,
                # and nothing to report.
                return
            logging.debug("Weight norm is removed from %s.", m)

        self.apply(_remove_weight_norm)

    def apply_weight_norm(self):
        """Apply weight normalization to every Conv1d and Conv2d layer."""
        def _apply_weight_norm(m):
            if isinstance(m, (torch.nn.Conv1d, torch.nn.Conv2d)):
                torch.nn.utils.weight_norm(m)
                logging.debug("Weight norm is applied to %s.", m)

        self.apply(_apply_weight_norm)

    @staticmethod
    def _get_receptive_field_size(layers, stacks, kernel_size,
                                  dilation=lambda x: 2 ** x):
        """Return ``(kernel_size - 1) * sum(dilations) + 1``.

        ``layers`` must be divisible by ``stacks``; the dilation of layer
        ``i`` is ``dilation(i % (layers // stacks))``.
        """
        assert layers % stacks == 0
        layers_per_cycle = layers // stacks
        dilations = [dilation(i % layers_per_cycle) for i in range(layers)]
        return (kernel_size - 1) * sum(dilations) + 1

    @property
    def receptive_field_size(self):
        """Return receptive field size."""
        # NOTE(review): `layers`, `stacks` and `kernel_size` are not assigned
        # anywhere in the visible __init__, so this property looks unusable
        # as written - confirm whether it should be removed or wired up.
        return self._get_receptive_field_size(self.layers, self.stacks, self.kernel_size)

    def inference(self, c=None, x=None):
        """Perform inference.

        Args:
            c (Union[Tensor, ndarray]): Local conditioning auxiliary features (T' ,C).
            x (Union[Tensor, ndarray]): Input noise signal (T, 1).
        Returns:
            Tensor: Output tensor (T, out_channels)
        """
        # NOTE(review): this method reads `self.upsample_factor` and
        # `self.aux_context_window`, neither of which is assigned in the
        # visible __init__, and it calls forward() without the required
        # `f0` argument (forward also returns a tuple, not a tensor). It
        # cannot run as written; left unchanged pending a decision on the
        # intended inference contract.
        if x is not None:
            if not isinstance(x, torch.Tensor):
                x = torch.tensor(x, dtype=torch.float).to(next(self.parameters()).device)
            x = x.transpose(1, 0).unsqueeze(0)
        else:
            assert c is not None
            x = torch.randn(1, 1, len(c) * self.upsample_factor).to(next(self.parameters()).device)
        if c is not None:
            if not isinstance(c, torch.Tensor):
                c = torch.tensor(c, dtype=torch.float).to(next(self.parameters()).device)
            c = c.transpose(1, 0).unsqueeze(0)
            c = torch.nn.ReplicationPad1d(self.aux_context_window)(c)
        return self.forward(x, c).squeeze(0).transpose(1, 0)
|
|