SingingVocoders / models /nsf_univnet /nsfunivnet.py

Upload 112 files

2a94974 verified over 1 year ago

14.9 kB

	import numpy as np
	import torch
	import logging
	# from modules import LVCBlock
	import torch.nn.functional as F
	from torch import nn

	from modules.univ_ddsp.block import LVCBlock

	LRELU_SLOPE = 0.1

	from modules.ddsp.vocoder import CombSub, Sins


	class SineGen(torch.nn.Module):
	""" Definition of sine generator
	SineGen(samp_rate, harmonic_num = 0,
	sine_amp = 0.1, noise_std = 0.003,
	voiced_threshold = 0,
	flag_for_pulse=False)
	samp_rate: sampling rate in Hz
	harmonic_num: number of harmonic overtones (default 0)
	sine_amp: amplitude of sine-waveform (default 0.1)
	noise_std: std of Gaussian noise (default 0.003)
	voiced_threshold: F0 threshold for U/V classification (default 0)
	flag_for_pulse: this SinGen is used inside PulseGen (default False)
	Note: when flag_for_pulse is True, the first time step of a voiced
	segment is always sin(np.pi) or cos(0)
	"""

	def __init__(self, samp_rate, harmonic_num=0,
	sine_amp=0.1, noise_std=0.003,
	voiced_threshold=0):
	super(SineGen, self).__init__()
	self.sine_amp = sine_amp
	self.noise_std = noise_std
	self.harmonic_num = harmonic_num
	self.dim = self.harmonic_num + 1
	self.sampling_rate = samp_rate
	self.voiced_threshold = voiced_threshold

	def _f02uv(self, f0):
	# generate uv signal
	uv = torch.ones_like(f0)
	uv = uv * (f0 > self.voiced_threshold)
	return uv

	def _f02sine(self, f0_values, upp):
	""" f0_values: (batchsize, length, dim)
	where dim indicates fundamental tone and overtones
	"""
	rad_values = (f0_values / self.sampling_rate).fmod(1.) # %1意味着n_har的乘积无法后处理优化
	rand_ini = torch.rand(1, self.dim, device=f0_values.device)
	rand_ini[:, 0] = 0
	rad_values[:, 0, :] += rand_ini
	is_half = rad_values.dtype is not torch.float32
	tmp_over_one = torch.cumsum(rad_values.double(), 1) # % 1 #####%1意味着后面的cumsum无法再优化
	if is_half:
	tmp_over_one = tmp_over_one.half()
	else:
	tmp_over_one = tmp_over_one.float()
	tmp_over_one *= upp
	tmp_over_one = F.interpolate(
	tmp_over_one.transpose(2, 1), scale_factor=upp,
	mode='linear', align_corners=True
	).transpose(2, 1)
	rad_values = F.interpolate(rad_values.transpose(2, 1), scale_factor=upp, mode='nearest').transpose(2, 1)
	tmp_over_one = tmp_over_one.fmod(1.)
	diff = F.conv2d(
	tmp_over_one.unsqueeze(1), torch.FloatTensor([[[[-1.], [1.]]]]).to(tmp_over_one.device),
	stride=(1, 1), padding=0, dilation=(1, 1)
	).squeeze(1) # Equivalent to torch.diff, but able to export ONNX
	cumsum_shift = (diff < 0).double()
	cumsum_shift = torch.cat((
	torch.zeros((f0_values.size()[0], 1, self.dim), dtype=torch.double).to(f0_values.device),
	cumsum_shift
	), dim=1)
	sines = torch.sin(torch.cumsum(rad_values.double() + cumsum_shift, dim=1) * 2 * np.pi)
	if is_half:
	sines = sines.half()
	else:
	sines = sines.float()
	return sines

	@torch.no_grad()
	def forward(self, f0, upp):
	""" sine_tensor, uv = forward(f0)
	input F0: tensor(batchsize=1, length, dim=1)
	f0 for unvoiced steps should be 0
	output sine_tensor: tensor(batchsize=1, length, dim)
	output uv: tensor(batchsize=1, length, 1)
	"""
	f0 = f0.unsqueeze(-1)
	fn = torch.multiply(f0, torch.arange(1, self.dim + 1, device=f0.device).reshape((1, 1, -1)))
	sine_waves = self._f02sine(fn, upp) * self.sine_amp
	uv = (f0 > self.voiced_threshold).float()
	uv = F.interpolate(uv.transpose(2, 1), scale_factor=upp, mode='nearest').transpose(2, 1)
	noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
	noise = noise_amp * torch.randn_like(sine_waves)
	sine_waves = sine_waves * uv + noise
	return sine_waves


	class SourceModuleHnNSF(torch.nn.Module):
	""" SourceModule for hn-nsf
	SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
	add_noise_std=0.003, voiced_threshod=0)
	sampling_rate: sampling_rate in Hz
	harmonic_num: number of harmonic above F0 (default: 0)
	sine_amp: amplitude of sine source signal (default: 0.1)
	add_noise_std: std of additive Gaussian noise (default: 0.003)
	note that amplitude of noise in unvoiced is decided
	by sine_amp
	voiced_threshold: threhold to set U/V given F0 (default: 0)
	Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
	F0_sampled (batchsize, length, 1)
	Sine_source (batchsize, length, 1)
	noise_source (batchsize, length 1)
	uv (batchsize, length, 1)
	"""

	def __init__(self, sampling_rate, harmonic_num=0, sine_amp=0.1,
	add_noise_std=0.003, voiced_threshold=0):
	super(SourceModuleHnNSF, self).__init__()

	self.sine_amp = sine_amp
	self.noise_std = add_noise_std

	# to produce sine waveforms
	self.l_sin_gen = SineGen(sampling_rate, harmonic_num,
	sine_amp, add_noise_std, voiced_threshold)

	# to merge source harmonics into a single excitation
	self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
	self.l_tanh = torch.nn.Tanh()

	def forward(self, x, upp):
	sine_wavs = self.l_sin_gen(x, upp)
	sine_merge = self.l_tanh(self.l_linear(sine_wavs))
	return sine_merge

	class DDSP(nn.Module):
	def __init__(self,config):
	super().__init__()
	if config['model_args']['type']=='CombSub':
	self.ddsp = CombSub(
	sampling_rate=config['audio_sample_rate'],
	block_size=config['hop_size'],
	win_length=config['win_size'],
	n_mag_harmonic=config['model_args']['n_mag_harmonic'],
	n_mag_noise=config['model_args']['n_mag_noise'],
	n_mels=config['audio_num_mel_bins'])
	elif config['model_args']['type']=='Sins':
	self.ddsp = Sins(
	sampling_rate=config['audio_sample_rate'],
	block_size=config['hop_size'],
	win_length=config['win_size'],
	n_harmonics=config['model_args']['n_harmonics'],
	n_mag_noise=config['model_args']['n_mag_noise'],
	n_mels=config['audio_num_mel_bins'])

	def forward(self,mel,f0,infer=False):
	signal, _, (s_h, s_n) = self.ddsp(mel.transpose(1,2), torch.unsqueeze(f0,dim=-1), infer=infer)
	return signal.unsqueeze(1),s_h,s_n

	class downblock(nn.Module):
	def __init__(self, down, indim, outdim):
	super().__init__()
	self.c = nn.Conv1d(indim, outdim * 2, kernel_size=down * 2, stride=down, padding=down // 2)
	self.act = GLU(1)
	self.out = nn.Conv1d(outdim, outdim, kernel_size=3, padding=1)
	self.act1 = nn.GELU()

	def forward(self, x):
	return self.act1(self.out(self.act(self.c(x))))

	class ddsp_down(nn.Module):
	def __init__(self,dims,downs:list,):
	super().__init__()

	dl=[]
	ppl=[]
	downs.reverse()
	self.fistpp=nn.Conv1d(1,dims,kernel_size=1)
	for idx,i in enumerate(downs[:-1]):
	if idx==0:
	dl.append(downblock(i,1,dims))
	ppl.append(nn.Conv1d(dims,dims,kernel_size=1))
	else:
	dl.append(downblock(i,dimsidx,dims(idx + 1)))
	ppl.append(nn.Conv1d(dims*(idx + 1), dims, kernel_size=1))
	self.downs = nn.ModuleList(dl)
	self.ppls = nn.ModuleList(ppl)
	def forward(self,x):
	spec=[]
	spec.append(self.fistpp(x))

	for dl,ppl in zip(self.downs,self.ppls ):
	x=dl(x)
	spec.append(ppl(x))
	spec.reverse()
	return spec



	class GLU(torch.nn.Module):
	def __init__(self, dim):
	super().__init__()
	self.dim = dim

	def forward(self, x):
	out, gate = x.chunk(2, dim=self.dim)
	return out * gate.sigmoid()

	class Upspamper(torch.nn.Module):

	def __init__(self):
	super().__init__()
	self.c1 = torch.nn.Conv2d(1, 8, kernel_size=1)
	self.UP = torch.nn.ConvTranspose2d(4, 8, [3, 32], stride=[1, 2], padding=[1, 15])
	self.Glu = GLU(1)
	self.c2 = torch.nn.Conv2d(4, 8, kernel_size=3,padding=1)
	self.c3 = torch.nn.Conv2d(4, 2, kernel_size=1)



	def forward(self, x):
	x = torch.unsqueeze(x, 1)
	x=self.Glu (self.c1(x))
	# x=self.net(x)

	x = self.Glu (self.UP(x))
	x=self.Glu(self.c2(x))+x
	x =self.Glu(self.c3(x))

	spectrogram = torch.squeeze(x, 1)
	return spectrogram

	class nsfUnivNet(torch.nn.Module):
	"""Parallel WaveGAN Generator module."""

	def __init__(self, h, use_weight_norm=True):

	super().__init__()

	# self.ddsp=DDSP(h)
	self.m_source = SourceModuleHnNSF(
	sampling_rate=h['audio_sample_rate'],
	harmonic_num=8
	)
	self.upp = int(np.prod(h['hop_size']))



	in_channels = h['model_args']['cond_in_channels']
	out_channels = h['model_args']['out_channels']
	inner_channels = h['model_args']['cg_channels']
	cond_channels = h['audio_num_mel_bins']
	upsample_ratios = h['model_args']['upsample_rates']
	lvc_layers_each_block = h['model_args']['num_lvc_blocks']
	lvc_kernel_size = h['model_args']['lvc_kernels']
	kpnet_hidden_channels = h['model_args']['lvc_hidden_channels']
	kpnet_conv_size = h['model_args']['lvc_conv_size']
	dropout = h['model_args']['dropout']
	# upsample_ratios:list
	self.ddspd = ddsp_down(dims=inner_channels,downs=upsample_ratios.copy(),)

	upmel=h['model_args'].get('upmel')
	self.upblocke=torch.nn.Sequential(*[Upspamper() for i in range(upmel//2)]) if upmel is not None or upmel==1 else torch.nn.Identity()

	self.in_channels = in_channels
	self.out_channels = out_channels
	self.cond_channels = cond_channels
	self.lvc_block_nums = len(upsample_ratios)

	# define first convolution
	self.first_conv = torch.nn.Conv1d(in_channels, inner_channels,
	kernel_size=7, padding=(7 - 1) // 2,
	dilation=1, bias=True)

	# define residual blocks
	self.lvc_blocks = torch.nn.ModuleList()
	cond_hop_length = 1
	for n in range(self.lvc_block_nums):
	cond_hop_length = cond_hop_length * upsample_ratios[n]
	lvcb = LVCBlock(
	in_channels=inner_channels,
	cond_channels=cond_channels,
	upsample_ratio=upsample_ratios[n],
	conv_layers=lvc_layers_each_block,
	conv_kernel_size=lvc_kernel_size,
	cond_hop_length=cond_hop_length,
	kpnet_hidden_channels=kpnet_hidden_channels,
	kpnet_conv_size=kpnet_conv_size,
	kpnet_dropout=dropout,
	)
	self.lvc_blocks += [lvcb]

	# define output layers
	self.last_conv_layers = torch.nn.ModuleList([
	torch.nn.Conv1d(inner_channels, out_channels, kernel_size=7, padding=(7 - 1) // 2,
	dilation=1, bias=True),

	])

	# apply weight norm
	if use_weight_norm:
	self.apply_weight_norm()

	def forward(self, x, c,f0,infer=False):
	"""Calculate forward propagation.
	Args:
	x (Tensor): Input noise signal (B, 1, T).
	c (Tensor): Local conditioning auxiliary features (B, C ,T').
	Returns:
	Tensor: Output tensor (B, out_channels, T)
	"""
	pass
	# ddspwav,s_h,s_n=self.ddsp(mel=c,f0=f0,infer=infer)
	har_source = self.m_source(f0, self.upp).transpose(1, 2)
	specl=self.ddspd(har_source)

	x = self.first_conv(x)
	c=self.upblocke(c)

	for n in range(self.lvc_block_nums):
	x = self.lvc_blocks[n](x, c,specl[n])

	# apply final layers
	for f in self.last_conv_layers:
	x = F.leaky_relu(x, LRELU_SLOPE)
	x = f(x)
	x = torch.tanh(x)
	return x,har_source

	def remove_weight_norm(self):
	"""Remove weight normalization module from all of the layers."""
	def _remove_weight_norm(m):
	try:
	logging.debug(f"Weight norm is removed from {m}.")
	torch.nn.utils.remove_weight_norm(m)
	except ValueError: # this module didn't have weight norm
	return

	self.apply(_remove_weight_norm)

	def apply_weight_norm(self):
	"""Apply weight normalization module from all of the layers."""
	def _apply_weight_norm(m):
	if isinstance(m, torch.nn.Conv1d) or isinstance(m, torch.nn.Conv2d):
	torch.nn.utils.weight_norm(m)
	logging.debug(f"Weight norm is applied to {m}.")

	self.apply(_apply_weight_norm)

	@staticmethod
	def _get_receptive_field_size(layers, stacks, kernel_size,
	dilation=lambda x: 2 ** x):
	assert layers % stacks == 0
	layers_per_cycle = layers // stacks
	dilations = [dilation(i % layers_per_cycle) for i in range(layers)]
	return (kernel_size - 1) * sum(dilations) + 1

	@property
	def receptive_field_size(self):
	"""Return receptive field size."""
	return self._get_receptive_field_size(self.layers, self.stacks, self.kernel_size)

	def inference(self, c=None, x=None):
	"""Perform inference.
	Args:
	c (Union[Tensor, ndarray]): Local conditioning auxiliary features (T' ,C).
	x (Union[Tensor, ndarray]): Input noise signal (T, 1).
	Returns:
	Tensor: Output tensor (T, out_channels)
	"""
	if x is not None:
	if not isinstance(x, torch.Tensor):
	x = torch.tensor(x, dtype=torch.float).to(next(self.parameters()).device)
	x = x.transpose(1, 0).unsqueeze(0)
	else:
	assert c is not None
	x = torch.randn(1, 1, len(c) * self.upsample_factor).to(next(self.parameters()).device)
	if c is not None:
	if not isinstance(c, torch.Tensor):
	c = torch.tensor(c, dtype=torch.float).to(next(self.parameters()).device)
	c = c.transpose(1, 0).unsqueeze(0)
	c = torch.nn.ReplicationPad1d(self.aux_context_window)(c)
	return self.forward(x, c).squeeze(0).transpose(1, 0)