Voice-Clone2 / TTS /vocoder /models /univnet_discriminator.py
Shadhil's picture
voice-clone with single audio sample input
9b2107c
raw
history blame
3.23 kB
import torch
import torch.nn.functional as F
from torch import nn
from torch.nn.utils import spectral_norm
from torch.nn.utils.parametrizations import weight_norm
from TTS.utils.audio.torch_transforms import TorchSTFT
from TTS.vocoder.models.hifigan_discriminator import MultiPeriodDiscriminator
LRELU_SLOPE = 0.1
class SpecDiscriminator(nn.Module):
"""docstring for Discriminator."""
def __init__(self, fft_size=1024, hop_length=120, win_length=600, use_spectral_norm=False):
super().__init__()
norm_f = weight_norm if use_spectral_norm is False else spectral_norm
self.fft_size = fft_size
self.hop_length = hop_length
self.win_length = win_length
self.stft = TorchSTFT(fft_size, hop_length, win_length)
self.discriminators = nn.ModuleList(
[
norm_f(nn.Conv2d(1, 32, kernel_size=(3, 9), padding=(1, 4))),
norm_f(nn.Conv2d(32, 32, kernel_size=(3, 9), stride=(1, 2), padding=(1, 4))),
norm_f(nn.Conv2d(32, 32, kernel_size=(3, 9), stride=(1, 2), padding=(1, 4))),
norm_f(nn.Conv2d(32, 32, kernel_size=(3, 9), stride=(1, 2), padding=(1, 4))),
norm_f(nn.Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))),
]
)
self.out = norm_f(nn.Conv2d(32, 1, 3, 1, 1))
def forward(self, y):
fmap = []
with torch.no_grad():
y = y.squeeze(1)
y = self.stft(y)
y = y.unsqueeze(1)
for _, d in enumerate(self.discriminators):
y = d(y)
y = F.leaky_relu(y, LRELU_SLOPE)
fmap.append(y)
y = self.out(y)
fmap.append(y)
return torch.flatten(y, 1, -1), fmap
class MultiResSpecDiscriminator(torch.nn.Module):
def __init__( # pylint: disable=dangerous-default-value
self, fft_sizes=[1024, 2048, 512], hop_sizes=[120, 240, 50], win_lengths=[600, 1200, 240], window="hann_window"
):
super().__init__()
self.discriminators = nn.ModuleList(
[
SpecDiscriminator(fft_sizes[0], hop_sizes[0], win_lengths[0], window),
SpecDiscriminator(fft_sizes[1], hop_sizes[1], win_lengths[1], window),
SpecDiscriminator(fft_sizes[2], hop_sizes[2], win_lengths[2], window),
]
)
def forward(self, x):
scores = []
feats = []
for d in self.discriminators:
score, feat = d(x)
scores.append(score)
feats.append(feat)
return scores, feats
class UnivnetDiscriminator(nn.Module):
"""Univnet discriminator wrapping MPD and MSD."""
def __init__(self):
super().__init__()
self.mpd = MultiPeriodDiscriminator()
self.msd = MultiResSpecDiscriminator()
def forward(self, x):
"""
Args:
x (Tensor): input waveform.
Returns:
List[Tensor]: discriminator scores.
List[List[Tensor]]: list of list of features from each layers of each discriminator.
"""
scores, feats = self.mpd(x)
scores_, feats_ = self.msd(x)
return scores + scores_, feats + feats_