+
HierSpeech++ is a zero-shot speech synthesis model.
+
+Our model is trained on the LibriTTS dataset, so it currently supports only English. We will release a multi-lingual HierSpeech++ soon.
+
[Demo Page] [Source Code]
+
''',
+ examples=[["HierSpeech is a zero-shot speech synthesis model, which can generate high-quality audio", "./example/3_rick_gt.wav", 0.333,0.333, 1.0, 1.0, 0, 1111],
+ ["HierSpeech is a zero-shot speech synthesis model, which can generate high-quality audio", "./example/ex01_whisper_00359.wav", 0.333,0.333, 1.0, 1.0, 0, 1111],
+ ["Hi there, I'm your new voice clone. Try your best to upload quality audio", "./example/female.wav", 0.333,0.333, 1.0, 1.0, 0, 1111],
+ ["Hello I'm HierSpeech++", "./example/reference_1.wav", 0.333,0.333, 1.0, 1.0, 0, 1111],
+ ]
+ )
+ demo_play.launch(share=True, server_port=8888)
+
+if __name__ == '__main__':
+ main()
\ No newline at end of file
diff --git a/attentions.py b/attentions.py
new file mode 100644
index 0000000000000000000000000000000000000000..383c5da5c34103003a973aab97cc39180db35fda
--- /dev/null
+++ b/attentions.py
@@ -0,0 +1,313 @@
+import copy
+import math
+import numpy as np
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+import commons
+import modules
+from modules import LayerNorm
+
+
+class Encoder(nn.Module):
+ def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., window_size=4,
+ **kwargs):
+ super().__init__()
+ self.hidden_channels = hidden_channels
+ self.filter_channels = filter_channels
+ self.n_heads = n_heads
+ self.n_layers = n_layers
+ self.kernel_size = kernel_size
+ self.p_dropout = p_dropout
+ self.window_size = window_size
+
+ self.drop = nn.Dropout(p_dropout)
+ self.attn_layers = nn.ModuleList()
+ self.norm_layers_1 = nn.ModuleList()
+ self.ffn_layers = nn.ModuleList()
+ self.norm_layers_2 = nn.ModuleList()
+ for i in range(self.n_layers):
+ self.attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout,
+ window_size=window_size))
+ self.norm_layers_1.append(LayerNorm(hidden_channels))
+ self.ffn_layers.append(
+ FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout))
+ self.norm_layers_2.append(LayerNorm(hidden_channels))
+
+ def forward(self, x, x_mask):
+ attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
+ x = x * x_mask
+ for i in range(self.n_layers):
+ y = self.attn_layers[i](x, x, attn_mask)
+ y = self.drop(y)
+ x = self.norm_layers_1[i](x + y)
+
+ y = self.ffn_layers[i](x, x_mask)
+ y = self.drop(y)
+ x = self.norm_layers_2[i](x + y)
+ x = x * x_mask
+ return x
+
+
+class Decoder(nn.Module):
+ def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0.,
+ proximal_bias=False, proximal_init=True, **kwargs):
+ super().__init__()
+ self.hidden_channels = hidden_channels
+ self.filter_channels = filter_channels
+ self.n_heads = n_heads
+ self.n_layers = n_layers
+ self.kernel_size = kernel_size
+ self.p_dropout = p_dropout
+ self.proximal_bias = proximal_bias
+ self.proximal_init = proximal_init
+
+ self.drop = nn.Dropout(p_dropout)
+ self.self_attn_layers = nn.ModuleList()
+ self.norm_layers_0 = nn.ModuleList()
+ self.encdec_attn_layers = nn.ModuleList()
+ self.norm_layers_1 = nn.ModuleList()
+ self.ffn_layers = nn.ModuleList()
+ self.norm_layers_2 = nn.ModuleList()
+ for i in range(self.n_layers):
+ self.self_attn_layers.append(
+ MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout,
+ proximal_bias=proximal_bias, proximal_init=proximal_init))
+ self.norm_layers_0.append(LayerNorm(hidden_channels))
+ self.encdec_attn_layers.append(
+ MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout))
+ self.norm_layers_1.append(LayerNorm(hidden_channels))
+ self.ffn_layers.append(
+ FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout, causal=True))
+ self.norm_layers_2.append(LayerNorm(hidden_channels))
+
+ def forward(self, x, x_mask, h, h_mask):
+ """
+ x: decoder input
+ h: encoder output
+ """
+ self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(device=x.device, dtype=x.dtype)
+ encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
+ x = x * x_mask
+ for i in range(self.n_layers):
+ y = self.self_attn_layers[i](x, x, self_attn_mask)
+ y = self.drop(y)
+ x = self.norm_layers_0[i](x + y)
+
+ y = self.encdec_attn_layers[i](x, h, encdec_attn_mask)
+ y = self.drop(y)
+ x = self.norm_layers_1[i](x + y)
+
+ y = self.ffn_layers[i](x, x_mask)
+ y = self.drop(y)
+ x = self.norm_layers_2[i](x + y)
+ x = x * x_mask
+ return x
+
+
+class MultiHeadAttention(nn.Module):
+ def __init__(self, channels, out_channels, n_heads, p_dropout=0., window_size=None, heads_share=True,
+ block_length=None, proximal_bias=False, proximal_init=False):
+ super().__init__()
+ assert channels % n_heads == 0
+
+ self.channels = channels
+ self.out_channels = out_channels
+ self.n_heads = n_heads
+ self.p_dropout = p_dropout
+ self.window_size = window_size
+ self.heads_share = heads_share
+ self.block_length = block_length
+ self.proximal_bias = proximal_bias
+ self.proximal_init = proximal_init
+ self.attn = None
+
+ self.k_channels = channels // n_heads
+ self.conv_q = nn.Conv1d(channels, channels, 1)
+ self.conv_k = nn.Conv1d(channels, channels, 1)
+ self.conv_v = nn.Conv1d(channels, channels, 1)
+ self.conv_o = nn.Conv1d(channels, out_channels, 1)
+ self.drop = nn.Dropout(p_dropout)
+
+ if window_size is not None:
+ n_heads_rel = 1 if heads_share else n_heads
+ rel_stddev = self.k_channels ** -0.5
+ self.emb_rel_k = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev)
+ self.emb_rel_v = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev)
+
+ nn.init.xavier_uniform_(self.conv_q.weight)
+ nn.init.xavier_uniform_(self.conv_k.weight)
+ nn.init.xavier_uniform_(self.conv_v.weight)
+ if proximal_init:
+ with torch.no_grad():
+ self.conv_k.weight.copy_(self.conv_q.weight)
+ self.conv_k.bias.copy_(self.conv_q.bias)
+
+ def forward(self, x, c, attn_mask=None):
+ q = self.conv_q(x)
+ k = self.conv_k(c)
+ v = self.conv_v(c)
+
+ x, self.attn = self.attention(q, k, v, mask=attn_mask)
+
+ x = self.conv_o(x)
+ return x
+
+ def attention(self, query, key, value, mask=None):
+ # reshape [b, d, t] -> [b, n_h, t, d_k]
+ b, d, t_s, t_t = (*key.size(), query.size(2))
+ query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
+ key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
+ value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
+
+ scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1))
+ if self.window_size is not None:
+ assert t_s == t_t, "Relative attention is only available for self-attention."
+ key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
+ rel_logits = self._matmul_with_relative_keys(query / math.sqrt(self.k_channels), key_relative_embeddings)
+ scores_local = self._relative_position_to_absolute_position(rel_logits)
+ scores = scores + scores_local
+ if self.proximal_bias:
+ assert t_s == t_t, "Proximal bias is only available for self-attention."
+ scores = scores + self._attention_bias_proximal(t_s).to(device=scores.device, dtype=scores.dtype)
+ if mask is not None:
+ scores = scores.masked_fill(mask == 0, -1e4)
+ if self.block_length is not None:
+ assert t_s == t_t, "Local attention is only available for self-attention."
+ block_mask = torch.ones_like(scores).triu(-self.block_length).tril(self.block_length)
+ scores = scores.masked_fill(block_mask == 0, -1e4)
+ p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s]
+ p_attn = self.drop(p_attn)
+ output = torch.matmul(p_attn, value)
+ if self.window_size is not None:
+ relative_weights = self._absolute_position_to_relative_position(p_attn)
+ value_relative_embeddings = self._get_relative_embeddings(self.emb_rel_v, t_s)
+ output = output + self._matmul_with_relative_values(relative_weights, value_relative_embeddings)
+ output = output.transpose(2, 3).contiguous().view(b, d, t_t) # [b, n_h, t_t, d_k] -> [b, d, t_t]
+ return output, p_attn
+
+ def _matmul_with_relative_values(self, x, y):
+ """
+ x: [b, h, l, m]
+ y: [h or 1, m, d]
+ ret: [b, h, l, d]
+ """
+ ret = torch.matmul(x, y.unsqueeze(0))
+ return ret
+
+ def _matmul_with_relative_keys(self, x, y):
+ """
+ x: [b, h, l, d]
+ y: [h or 1, m, d]
+ ret: [b, h, l, m]
+ """
+ ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
+ return ret
+
+ def _get_relative_embeddings(self, relative_embeddings, length):
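+        # Pad (or slice) the relative-position embedding table so that exactly 2*length-1 relative offsets are available for the current sequence length.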
+ max_relative_position = 2 * self.window_size + 1
+ # Pad first before slice to avoid using cond ops.
+ pad_length = max(length - (self.window_size + 1), 0)
+ slice_start_position = max((self.window_size + 1) - length, 0)
+ slice_end_position = slice_start_position + 2 * length - 1
+ if pad_length > 0:
+ padded_relative_embeddings = F.pad(
+ relative_embeddings,
+ commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]))
+ else:
+ padded_relative_embeddings = relative_embeddings
+ used_relative_embeddings = padded_relative_embeddings[:, slice_start_position:slice_end_position]
+ return used_relative_embeddings
+
+ def _relative_position_to_absolute_position(self, x):
+ """
+ x: [b, h, l, 2*l-1]
+ ret: [b, h, l, l]
+ """
+ batch, heads, length, _ = x.size()
+ # Concat columns of pad to shift from relative to absolute indexing.
+ x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]]))
+
+        # Concat extra elements so that the result adds up to shape (len+1, 2*len-1).
+ x_flat = x.view([batch, heads, length * 2 * length])
+ x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]]))
+
+ # Reshape and slice out the padded elements.
+ x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[:, :, :length, length - 1:]
+ return x_final
+
+ def _absolute_position_to_relative_position(self, x):
+ """
+ x: [b, h, l, l]
+ ret: [b, h, l, 2*l-1]
+ """
+ batch, heads, length, _ = x.size()
+        # pad along the column (last) dimension
+ x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]]))
+ x_flat = x.view([batch, heads, length ** 2 + length * (length - 1)])
+ # add 0's in the beginning that will skew the elements after reshape
+ x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
+ x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:]
+ return x_final
+
+ def _attention_bias_proximal(self, length):
+ """Bias for self-attention to encourage attention to close positions.
+ Args:
+ length: an integer scalar.
+ Returns:
+ a Tensor with shape [1, 1, length, length]
+ """
+ r = torch.arange(length, dtype=torch.float32)
+ diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
+ return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
+
+
+class FFN(nn.Module):
+ def __init__(self, in_channels, out_channels, filter_channels, kernel_size, p_dropout=0., activation=None,
+ causal=False):
+ super().__init__()
+ self.in_channels = in_channels
+ self.out_channels = out_channels
+ self.filter_channels = filter_channels
+ self.kernel_size = kernel_size
+ self.p_dropout = p_dropout
+ self.activation = activation
+ self.causal = causal
+
+ if causal:
+ self.padding = self._causal_padding
+ else:
+ self.padding = self._same_padding
+
+ self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
+ self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
+ self.drop = nn.Dropout(p_dropout)
+
+ def forward(self, x, x_mask):
+ x = self.conv_1(self.padding(x * x_mask))
+ if self.activation == "gelu":
+ x = x * torch.sigmoid(1.702 * x)
+ else:
+ x = torch.relu(x)
+ x = self.drop(x)
+ x = self.conv_2(self.padding(x * x_mask))
+ return x * x_mask
+
+ def _causal_padding(self, x):
+ if self.kernel_size == 1:
+ return x
+ pad_l = self.kernel_size - 1
+ pad_r = 0
+ padding = [[0, 0], [0, 0], [pad_l, pad_r]]
+ x = F.pad(x, commons.convert_pad_shape(padding))
+ return x
+
+ def _same_padding(self, x):
+ if self.kernel_size == 1:
+ return x
+ pad_l = (self.kernel_size - 1) // 2
+ pad_r = self.kernel_size // 2
+ padding = [[0, 0], [0, 0], [pad_l, pad_r]]
+ x = F.pad(x, commons.convert_pad_shape(padding))
+ return x
diff --git a/commons.py b/commons.py
new file mode 100644
index 0000000000000000000000000000000000000000..d07dc14ccc73e050dd4bc2c765cfdbd82e271a3f
--- /dev/null
+++ b/commons.py
@@ -0,0 +1,168 @@
+import math
+import numpy as np
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+
+def init_weights(m, mean=0.0, std=0.01):
+ classname = m.__class__.__name__
+ if classname.find("Conv") != -1:
+ m.weight.data.normal_(mean, std)
+
+
+def get_padding(kernel_size, dilation=1):
+ return int((kernel_size*dilation - dilation)/2)
+
+
+def convert_pad_shape(pad_shape):
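+    # Converts a per-dimension [[left, right], ...] pad spec (outermost dim first) into the flat,
+    # last-dim-first list expected by F.pad, e.g. [[0, 0], [0, 0], [1, 0]] -> [1, 0, 0, 0, 0, 0].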
+ l = pad_shape[::-1]
+ pad_shape = [item for sublist in l for item in sublist]
+ return pad_shape
+
+
+def intersperse(lst, item):
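+    # Places the given item between and around the elements of lst,
+    # e.g. intersperse([1, 2, 3], 0) -> [0, 1, 0, 2, 0, 3, 0].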
+ result = [item] * (len(lst) * 2 + 1)
+ result[1::2] = lst
+ return result
+
+
+def kl_divergence(m_p, logs_p, m_q, logs_q):
+ """KL(P||Q)"""
+ kl = (logs_q - logs_p) - 0.5
+ kl += 0.5 * (torch.exp(2. * logs_p) + ((m_p - m_q)**2)) * torch.exp(-2. * logs_q)
+ return kl
+
+
+def rand_gumbel(shape):
+ """Sample from the Gumbel distribution, protect from overflows."""
+ uniform_samples = torch.rand(shape) * 0.99998 + 0.00001
+ return -torch.log(-torch.log(uniform_samples))
+
+
+def rand_gumbel_like(x):
+ g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device)
+ return g
+
+
+def slice_segments(x, ids_str, segment_size=4):
+ ret = torch.zeros_like(x[:, :, :segment_size])
+ for i in range(x.size(0)):
+ idx_str = ids_str[i]
+ idx_end = idx_str + segment_size
+ ret[i] = x[i, :, idx_str:idx_end]
+ return ret
+
+def slice_segments_audio(x, ids_str, segment_size=4):
+ ret = torch.zeros_like(x[:, :segment_size])
+ for i in range(x.size(0)):
+ idx_str = ids_str[i]
+ idx_end = idx_str + segment_size
+ ret[i] = x[i, idx_str:idx_end]
+ return ret
+
+def rand_slice_segments(x, x_lengths=None, segment_size=4):
+ b, d, t = x.size()
+ if x_lengths is None:
+ x_lengths = t
+ ids_str_max = x_lengths - segment_size + 1
+ ids_str = ((torch.rand([b]).to(device=x.device) * ids_str_max).clip(0)).to(dtype=torch.long)
+ ret = slice_segments(x, ids_str, segment_size)
+ return ret, ids_str
+
+
+def get_timing_signal_1d(
+ length, channels, min_timescale=1.0, max_timescale=1.0e4):
+ position = torch.arange(length, dtype=torch.float)
+ num_timescales = channels // 2
+ log_timescale_increment = (
+ math.log(float(max_timescale) / float(min_timescale)) /
+ (num_timescales - 1))
+ inv_timescales = min_timescale * torch.exp(
+ torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment)
+ scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)
+ signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)
+ signal = F.pad(signal, [0, 0, 0, channels % 2])
+ signal = signal.view(1, channels, length)
+ return signal
+
+
+def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
+ b, channels, length = x.size()
+ signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
+ return x + signal.to(dtype=x.dtype, device=x.device)
+
+
+def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1):
+ b, channels, length = x.size()
+ signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
+ return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis)
+
+
+def subsequent_mask(length):
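+    # Lower-triangular causal mask of shape [1, 1, length, length]: position t may attend only to positions <= t.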
+ mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0)
+ return mask
+
+
+@torch.jit.script
+def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
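+    # WaveNet-style gated activation: tanh over the first n_channels channels, multiplied by a sigmoid gate over the rest.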
+ n_channels_int = n_channels[0]
+ in_act = input_a + input_b
+ t_act = torch.tanh(in_act[:, :n_channels_int, :])
+ s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
+ acts = t_act * s_act
+ return acts
+
+
+def convert_pad_shape(pad_shape):
+ l = pad_shape[::-1]
+ pad_shape = [item for sublist in l for item in sublist]
+ return pad_shape
+
+
+def shift_1d(x):
+ x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1]
+ return x
+
+
+def sequence_mask(length, max_length=None):
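+    # Boolean mask of shape [len(length), max_length]: True where the time index is below the corresponding sequence length.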
+ if max_length is None:
+ max_length = length.max()
+ x = torch.arange(max_length, dtype=length.dtype, device=length.device)
+ return x.unsqueeze(0) < length.unsqueeze(1)
+
+
+def generate_path(duration, mask):
+ """
+ duration: [b, 1, t_x]
+ mask: [b, 1, t_y, t_x]
+ """
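+    # Builds a hard monotonic alignment path: frame j of item b is assigned to the text position whose cumulative duration first exceeds j.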
+ device = duration.device
+
+ b, _, t_y, t_x = mask.shape
+ cum_duration = torch.cumsum(duration, -1)
+
+ cum_duration_flat = cum_duration.view(b * t_x)
+ path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype)
+ path = path.view(b, t_x, t_y)
+ path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
+ path = path.unsqueeze(1).transpose(2,3) * mask
+ return path
+
+
+def clip_grad_value_(parameters, clip_value, norm_type=2):
+ if isinstance(parameters, torch.Tensor):
+ parameters = [parameters]
+ parameters = list(filter(lambda p: p.grad is not None, parameters))
+ norm_type = float(norm_type)
+ if clip_value is not None:
+ clip_value = float(clip_value)
+
+ total_norm = 0
+ for p in parameters:
+ param_norm = p.grad.data.norm(norm_type)
+ total_norm += param_norm.item() ** norm_type
+ if clip_value is not None:
+ p.grad.data.clamp_(min=-clip_value, max=clip_value)
+ total_norm = total_norm ** (1. / norm_type)
+ return total_norm
diff --git a/denoiser/config.json b/denoiser/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..20dc683a2f9f0d24ff19b4d65fdb97075d6d1d34
--- /dev/null
+++ b/denoiser/config.json
@@ -0,0 +1,28 @@
+{
+ "num_gpus": 0,
+ "batch_size": 4,
+ "learning_rate": 0.0005,
+ "adam_b1": 0.8,
+ "adam_b2": 0.99,
+ "lr_decay": 0.99,
+ "seed": 1234,
+
+ "dense_channel": 64,
+ "compress_factor": 0.3,
+ "num_tsconformers": 4,
+ "beta": 2.0,
+
+ "sampling_rate": 16000,
+ "segment_size": 32000,
+ "n_fft": 400,
+ "hop_size": 100,
+ "win_size": 400,
+
+ "num_workers": 4,
+
+ "dist_config": {
+ "dist_backend": "nccl",
+ "dist_url": "tcp://localhost:54321",
+ "world_size": 1
+ }
+}
diff --git a/denoiser/conformer.py b/denoiser/conformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..f10aef58202ef6038d3c29a54138834dbd64358e
--- /dev/null
+++ b/denoiser/conformer.py
@@ -0,0 +1,86 @@
+import torch
+import torch.nn as nn
+from einops.layers.torch import Rearrange
+
+def get_padding(kernel_size, dilation=1):
+ return int((kernel_size*dilation - dilation)/2)
+
+class FeedForwardModule(nn.Module):
+ def __init__(self, dim, mult=4, dropout=0):
+ super(FeedForwardModule, self).__init__()
+ self.ffm = nn.Sequential(
+ nn.LayerNorm(dim),
+ nn.Linear(dim, dim * mult),
+ nn.SiLU(),
+ nn.Dropout(dropout),
+ nn.Linear(dim * mult, dim),
+ nn.Dropout(dropout)
+ )
+
+ def forward(self, x):
+ return self.ffm(x)
+
+
+class ConformerConvModule(nn.Module):
+ def __init__(self, dim, expansion_factor=2, kernel_size=31, dropout=0.):
+ super(ConformerConvModule, self).__init__()
+ inner_dim = dim * expansion_factor
+ self.ccm = nn.Sequential(
+ nn.LayerNorm(dim),
+ Rearrange('b n c -> b c n'),
+ nn.Conv1d(dim, inner_dim*2, 1),
+ nn.GLU(dim=1),
+ nn.Conv1d(inner_dim, inner_dim, kernel_size=kernel_size,
+ padding=get_padding(kernel_size), groups=inner_dim), # DepthWiseConv1d
+ nn.BatchNorm1d(inner_dim),
+ nn.SiLU(),
+ nn.Conv1d(inner_dim, dim, 1),
+ Rearrange('b c n -> b n c'),
+ nn.Dropout(dropout)
+ )
+
+ def forward(self, x):
+ return self.ccm(x)
+
+
+class AttentionModule(nn.Module):
+ def __init__(self, dim, n_head=8, dropout=0.):
+ super(AttentionModule, self).__init__()
+ self.attn = nn.MultiheadAttention(dim, n_head, dropout=dropout)
+ self.layernorm = nn.LayerNorm(dim)
+
+ def forward(self, x, attn_mask=None, key_padding_mask=None):
+ x = self.layernorm(x)
+ x, _ = self.attn(x, x, x,
+ attn_mask=attn_mask,
+ key_padding_mask=key_padding_mask)
+ return x
+
+
+class ConformerBlock(nn.Module):
+ def __init__(self, dim, n_head=8, ffm_mult=4, ccm_expansion_factor=2, ccm_kernel_size=31,
+ ffm_dropout=0., attn_dropout=0., ccm_dropout=0.):
+ super(ConformerBlock, self).__init__()
+ self.ffm1 = FeedForwardModule(dim, ffm_mult, dropout=ffm_dropout)
+ self.attn = AttentionModule(dim, n_head, dropout=attn_dropout)
+ self.ccm = ConformerConvModule(dim, ccm_expansion_factor, ccm_kernel_size, dropout=ccm_dropout)
+ self.ffm2 = FeedForwardModule(dim, ffm_mult, dropout=ffm_dropout)
+ self.post_norm = nn.LayerNorm(dim)
+
+ def forward(self, x):
+ x = x + 0.5 * self.ffm1(x)
+ x = x + self.attn(x)
+ x = x + self.ccm(x)
+ x = x + 0.5 * self.ffm2(x)
+ x = self.post_norm(x)
+ return x
+
+
+def main():
+ x = torch.ones(10, 100, 64)
+ conformer = ConformerBlock(dim=64)
+ print(conformer(x))
+
+
+if __name__ == '__main__':
+ main()
\ No newline at end of file
diff --git a/denoiser/g_best b/denoiser/g_best
new file mode 100644
index 0000000000000000000000000000000000000000..892dc35e94d3334e10f2822a26bacb104023d7d1
--- /dev/null
+++ b/denoiser/g_best
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0088cd06068a97b97cc13fe10fe155ea5c24beea79564b2162fab22a79dc9dc5
+size 8350488
diff --git a/denoiser/generator.py b/denoiser/generator.py
new file mode 100644
index 0000000000000000000000000000000000000000..75003798aaf0de138ca991fe2827bd8efe1b15fe
--- /dev/null
+++ b/denoiser/generator.py
@@ -0,0 +1,193 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import numpy as np
+from denoiser.conformer import ConformerBlock
+from denoiser.utils import get_padding_2d, LearnableSigmoid_2d
+from pesq import pesq
+from joblib import Parallel, delayed
+
+class DenseBlock(nn.Module):
+ def __init__(self, h, kernel_size=(3, 3), depth=4):
+ super(DenseBlock, self).__init__()
+ self.h = h
+ self.depth = depth
+ self.dense_block = nn.ModuleList([])
+ for i in range(depth):
+ dil = 2 ** i
+ dense_conv = nn.Sequential(
+ nn.Conv2d(h.dense_channel*(i+1), h.dense_channel, kernel_size, dilation=(dil, 1),
+ padding=get_padding_2d(kernel_size, (dil, 1))),
+ nn.InstanceNorm2d(h.dense_channel, affine=True),
+ nn.PReLU(h.dense_channel)
+ )
+ self.dense_block.append(dense_conv)
+
+ def forward(self, x):
+ skip = x
+ for i in range(self.depth):
+ x = self.dense_block[i](skip)
+ skip = torch.cat([x, skip], dim=1)
+ return x
+
+
+class DenseEncoder(nn.Module):
+ def __init__(self, h, in_channel):
+ super(DenseEncoder, self).__init__()
+ self.h = h
+ self.dense_conv_1 = nn.Sequential(
+ nn.Conv2d(in_channel, h.dense_channel, (1, 1)),
+ nn.InstanceNorm2d(h.dense_channel, affine=True),
+ nn.PReLU(h.dense_channel))
+
+ self.dense_block = DenseBlock(h, depth=4) # [b, h.dense_channel, ndim_time, h.n_fft//2+1]
+
+ self.dense_conv_2 = nn.Sequential(
+ nn.Conv2d(h.dense_channel, h.dense_channel, (1, 3), (1, 2)),
+ nn.InstanceNorm2d(h.dense_channel, affine=True),
+ nn.PReLU(h.dense_channel))
+
+ def forward(self, x):
+ x = self.dense_conv_1(x) # [b, 64, T, F]
+ x = self.dense_block(x) # [b, 64, T, F]
+ x = self.dense_conv_2(x) # [b, 64, T, F//2]
+ return x
+
+
+class MaskDecoder(nn.Module):
+ def __init__(self, h, out_channel=1):
+ super(MaskDecoder, self).__init__()
+ self.dense_block = DenseBlock(h, depth=4)
+ self.mask_conv = nn.Sequential(
+ nn.ConvTranspose2d(h.dense_channel, h.dense_channel, (1, 3), (1, 2)),
+ nn.Conv2d(h.dense_channel, out_channel, (1, 1)),
+ nn.InstanceNorm2d(out_channel, affine=True),
+ nn.PReLU(out_channel),
+ nn.Conv2d(out_channel, out_channel, (1, 1))
+ )
+ self.lsigmoid = LearnableSigmoid_2d(h.n_fft//2+1, beta=h.beta)
+
+ def forward(self, x):
+ x = self.dense_block(x)
+ x = self.mask_conv(x)
+ x = x.permute(0, 3, 2, 1).squeeze(-1)
+ x = self.lsigmoid(x).permute(0, 2, 1).unsqueeze(1)
+ return x
+
+
+class PhaseDecoder(nn.Module):
+ def __init__(self, h, out_channel=1):
+ super(PhaseDecoder, self).__init__()
+ self.dense_block = DenseBlock(h, depth=4)
+ self.phase_conv = nn.Sequential(
+ nn.ConvTranspose2d(h.dense_channel, h.dense_channel, (1, 3), (1, 2)),
+ nn.InstanceNorm2d(h.dense_channel, affine=True),
+ nn.PReLU(h.dense_channel)
+ )
+ self.phase_conv_r = nn.Conv2d(h.dense_channel, out_channel, (1, 1))
+ self.phase_conv_i = nn.Conv2d(h.dense_channel, out_channel, (1, 1))
+
+ def forward(self, x):
+ x = self.dense_block(x)
+ x = self.phase_conv(x)
+ x_r = self.phase_conv_r(x)
+ x_i = self.phase_conv_i(x)
+ x = torch.atan2(x_i, x_r)
+ return x
+
+
+class TSConformerBlock(nn.Module):
+ def __init__(self, h):
+ super(TSConformerBlock, self).__init__()
+ self.h = h
+ self.time_conformer = ConformerBlock(dim=h.dense_channel, n_head=4, ccm_kernel_size=31,
+ ffm_dropout=0.2, attn_dropout=0.2)
+ self.freq_conformer = ConformerBlock(dim=h.dense_channel, n_head=4, ccm_kernel_size=31,
+ ffm_dropout=0.2, attn_dropout=0.2)
+
+ def forward(self, x):
+ b, c, t, f = x.size()
+ x = x.permute(0, 3, 2, 1).contiguous().view(b*f, t, c)
+ x = self.time_conformer(x) + x
+ x = x.view(b, f, t, c).permute(0, 2, 1, 3).contiguous().view(b*t, f, c)
+ x = self.freq_conformer(x) + x
+ x = x.view(b, t, f, c).permute(0, 3, 1, 2)
+ return x
+
+
+class MPNet(nn.Module):
+ def __init__(self, h, num_tscblocks=4):
+ super(MPNet, self).__init__()
+ self.h = h
+ self.num_tscblocks = num_tscblocks
+ self.dense_encoder = DenseEncoder(h, in_channel=2)
+
+ self.TSConformer = nn.ModuleList([])
+ for i in range(num_tscblocks):
+ self.TSConformer.append(TSConformerBlock(h))
+
+ self.mask_decoder = MaskDecoder(h, out_channel=1)
+ self.phase_decoder = PhaseDecoder(h, out_channel=1)
+
+ def forward(self, noisy_mag, noisy_pha): # [B, F, T]
+ noisy_mag = noisy_mag.unsqueeze(-1).permute(0, 3, 2, 1) # [B, 1, T, F]
+ noisy_pha = noisy_pha.unsqueeze(-1).permute(0, 3, 2, 1) # [B, 1, T, F]
+ x = torch.cat((noisy_mag, noisy_pha), dim=1) # [B, 2, T, F]
+ x = self.dense_encoder(x)
+
+ for i in range(self.num_tscblocks):
+ x = self.TSConformer[i](x)
+
+ denoised_mag = (noisy_mag * self.mask_decoder(x)).permute(0, 3, 2, 1).squeeze(-1)
+ denoised_pha = self.phase_decoder(x).permute(0, 3, 2, 1).squeeze(-1)
+ denoised_com = torch.stack((denoised_mag*torch.cos(denoised_pha),
+ denoised_mag*torch.sin(denoised_pha)), dim=-1)
+
+ return denoised_mag, denoised_pha, denoised_com
+
+
+def phase_losses(phase_r, phase_g, h):
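+    # Anti-wrapped phase losses: instantaneous phase (ip), group delay (gd, finite differences along
+    # frequency) and instantaneous angular frequency (iaf, finite differences along time).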
+
+ dim_freq = h.n_fft // 2 + 1
+ dim_time = phase_r.size(-1)
+
+ gd_matrix = (torch.triu(torch.ones(dim_freq, dim_freq), diagonal=1) - torch.triu(torch.ones(dim_freq, dim_freq), diagonal=2) - torch.eye(dim_freq)).to(phase_g.device)
+ gd_r = torch.matmul(phase_r.permute(0, 2, 1), gd_matrix)
+ gd_g = torch.matmul(phase_g.permute(0, 2, 1), gd_matrix)
+
+ iaf_matrix = (torch.triu(torch.ones(dim_time, dim_time), diagonal=1) - torch.triu(torch.ones(dim_time, dim_time), diagonal=2) - torch.eye(dim_time)).to(phase_g.device)
+ iaf_r = torch.matmul(phase_r, iaf_matrix)
+ iaf_g = torch.matmul(phase_g, iaf_matrix)
+
+ ip_loss = torch.mean(anti_wrapping_function(phase_r-phase_g))
+ gd_loss = torch.mean(anti_wrapping_function(gd_r-gd_g))
+ iaf_loss = torch.mean(anti_wrapping_function(iaf_r-iaf_g))
+
+ return ip_loss, gd_loss, iaf_loss
+
+
+def anti_wrapping_function(x):
+
+ return torch.abs(x - torch.round(x / (2 * np.pi)) * 2 * np.pi)
+
+
+def pesq_score(utts_r, utts_g, h):
+
+ pesq_score = Parallel(n_jobs=30)(delayed(eval_pesq)(
+ utts_r[i].squeeze().cpu().numpy(),
+ utts_g[i].squeeze().cpu().numpy(),
+ h.sampling_rate)
+ for i in range(len(utts_r)))
+ pesq_score = np.mean(pesq_score)
+
+ return pesq_score
+
+
+def eval_pesq(clean_utt, esti_utt, sr):
+ try:
+ pesq_score = pesq(sr, clean_utt, esti_utt)
+ except:
+        # PESQ can fail (e.g. on silent segments), so fall back to -1
+ pesq_score = -1
+
+ return pesq_score
diff --git a/denoiser/infer.py b/denoiser/infer.py
new file mode 100644
index 0000000000000000000000000000000000000000..f5b17a13ef580bea2701cf98ff2c09e900c0e043
--- /dev/null
+++ b/denoiser/infer.py
@@ -0,0 +1,33 @@
+
+import torch
+
+def denoise(noisy_wav, model, hps):
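+    # RMS-normalize the input, compute the compressed magnitude/phase STFT, run MPNet, then invert the STFT and undo the normalization.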
+ norm_factor = torch.sqrt(len(noisy_wav) / torch.sum(noisy_wav ** 2.0)).to(noisy_wav.device)
+ noisy_wav = (noisy_wav * norm_factor).unsqueeze(0)
+ noisy_amp, noisy_pha, noisy_com = mag_pha_stft(noisy_wav, hps.n_fft, hps.hop_size, hps.win_size, hps.compress_factor)
+ amp_g, pha_g, com_g = model(noisy_amp, noisy_pha)
+ audio_g = mag_pha_istft(amp_g, pha_g, hps.n_fft, hps.hop_size, hps.win_size, hps.compress_factor)
+ audio_g = audio_g / norm_factor
+ return audio_g
+
+def mag_pha_stft(y, n_fft, hop_size, win_size, compress_factor=1.0, center=True):
+
+ hann_window = torch.hann_window(win_size).to(y.device)
+ stft_spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window,
+ center=center, pad_mode='reflect', normalized=False, return_complex=True)
+ mag = torch.abs(stft_spec)
+ pha = torch.angle(stft_spec)
+ # Magnitude Compression
+ mag = torch.pow(mag, compress_factor)
+ com = torch.stack((mag*torch.cos(pha), mag*torch.sin(pha)), dim=-1)
+
+ return mag, pha, com
+
+def mag_pha_istft(mag, pha, n_fft, hop_size, win_size, compress_factor=1.0, center=True):
+ # Magnitude Decompression
+ mag = torch.pow(mag, (1.0/compress_factor))
+ com = torch.complex(mag*torch.cos(pha), mag*torch.sin(pha))
+ hann_window = torch.hann_window(win_size).to(com.device)
+ wav = torch.istft(com, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window, center=center)
+
+ return wav
\ No newline at end of file
diff --git a/denoiser/utils.py b/denoiser/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..4cbe60eb78ae2bd53a98ea373053afc0e50856d5
--- /dev/null
+++ b/denoiser/utils.py
@@ -0,0 +1,55 @@
+import glob
+import os
+import torch
+import torch.nn as nn
+
+def get_padding(kernel_size, dilation=1):
+ return int((kernel_size*dilation - dilation)/2)
+
+
+def get_padding_2d(kernel_size, dilation=(1, 1)):
+ return (int((kernel_size[0]*dilation[0] - dilation[0])/2), int((kernel_size[1]*dilation[1] - dilation[1])/2))
+
+
+def load_checkpoint(filepath, device):
+ assert os.path.isfile(filepath)
+ print("Loading '{}'".format(filepath))
+ checkpoint_dict = torch.load(filepath, map_location=device)
+ print("Complete.")
+ return checkpoint_dict
+
+
+def save_checkpoint(filepath, obj):
+ print("Saving checkpoint to {}".format(filepath))
+ torch.save(obj, filepath)
+ print("Complete.")
+
+
+def scan_checkpoint(cp_dir, prefix):
+ pattern = os.path.join(cp_dir, prefix + '????????')
+ cp_list = glob.glob(pattern)
+ if len(cp_list) == 0:
+ return None
+ return sorted(cp_list)[-1]
+
+
+class LearnableSigmoid_1d(nn.Module):
+ def __init__(self, in_features, beta=1):
+ super().__init__()
+ self.beta = beta
+ self.slope = nn.Parameter(torch.ones(in_features))
+ self.slope.requiresGrad = True
+
+ def forward(self, x):
+ return self.beta * torch.sigmoid(self.slope * x)
+
+
+class LearnableSigmoid_2d(nn.Module):
+ def __init__(self, in_features, beta=1):
+ super().__init__()
+ self.beta = beta
+ self.slope = nn.Parameter(torch.ones(in_features, 1))
+ self.slope.requiresGrad = True
+
+ def forward(self, x):
+ return self.beta * torch.sigmoid(self.slope * x)
diff --git a/example/reference_1.txt b/example/reference_1.txt
new file mode 100644
index 0000000000000000000000000000000000000000..a24bb2c505d60941604bbf738241b11e5db89e83
--- /dev/null
+++ b/example/reference_1.txt
@@ -0,0 +1 @@
+And lay me down in my cold bed and leave my shining lot.
\ No newline at end of file
diff --git a/example/reference_1.wav b/example/reference_1.wav
new file mode 100644
index 0000000000000000000000000000000000000000..37fea52cdb9f4a2493ad078e0309962bff1f4197
Binary files /dev/null and b/example/reference_1.wav differ
diff --git a/example/reference_2.txt b/example/reference_2.txt
new file mode 100644
index 0000000000000000000000000000000000000000..d64cb9a3cd500bc0b96b2c5ad436a374823bd9f2
--- /dev/null
+++ b/example/reference_2.txt
@@ -0,0 +1 @@
+Yea, his honourable worship is within, but he hath a godly minister or two with him, and likewise a leech.
\ No newline at end of file
diff --git a/example/reference_2.wav b/example/reference_2.wav
new file mode 100644
index 0000000000000000000000000000000000000000..72cd0ba80d53065540405880c0af1a8b6e9afdaf
Binary files /dev/null and b/example/reference_2.wav differ
diff --git a/example/reference_3.txt b/example/reference_3.txt
new file mode 100644
index 0000000000000000000000000000000000000000..f024e6dd32e3fe5e3bdeca7a48a52e876d8199e2
--- /dev/null
+++ b/example/reference_3.txt
@@ -0,0 +1 @@
+The army found the people in poverty and left them in comparative wealth.
\ No newline at end of file
diff --git a/example/reference_3.wav b/example/reference_3.wav
new file mode 100644
index 0000000000000000000000000000000000000000..401fc24553f6d5ac97196e2427ce0b12daa83615
Binary files /dev/null and b/example/reference_3.wav differ
diff --git a/example/reference_4.txt b/example/reference_4.txt
new file mode 100644
index 0000000000000000000000000000000000000000..4038db731d3c9e96a6eb8354be0f44bfd19e82e4
--- /dev/null
+++ b/example/reference_4.txt
@@ -0,0 +1 @@
+Thus did this humane and right minded father comfort his unhappy daughter, and her mother embracing her again, did all she could to soothe her feelings.
\ No newline at end of file
diff --git a/example/reference_4.wav b/example/reference_4.wav
new file mode 100644
index 0000000000000000000000000000000000000000..c28fdf94818de6dd6cdaa9aaa5d238053ea205dd
Binary files /dev/null and b/example/reference_4.wav differ
diff --git a/hierspeechpp_speechsynthesizer.py b/hierspeechpp_speechsynthesizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..0b38ea9c9d6811701f5cd73a7c6e5a74c0e3b8dd
--- /dev/null
+++ b/hierspeechpp_speechsynthesizer.py
@@ -0,0 +1,716 @@
+import torch
+from torch import nn
+from torch.nn import functional as F
+import modules
+import attentions
+
+from torch.nn import Conv1d, ConvTranspose1d, Conv2d
+from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
+from commons import init_weights, get_padding
+
+import torchaudio
+from einops import rearrange
+import transformers
+import math
+from styleencoder import StyleEncoder
+import commons
+
+from alias_free_torch import *
+import activations
+
+class Wav2vec2(torch.nn.Module):
+ def __init__(self, layer=7, w2v='mms'):
+
+ """we use the intermediate features of mms-300m.
+        More specifically, we use the output of the 7th layer of the 24-layer Transformer encoder.
+ """
+ super().__init__()
+
+ if w2v == 'mms':
+ self.wav2vec2 = transformers.Wav2Vec2ForPreTraining.from_pretrained("facebook/mms-300m")
+ else:
+ self.wav2vec2 = transformers.Wav2Vec2ForPreTraining.from_pretrained("facebook/wav2vec2-xls-r-300m")
+
+ for param in self.wav2vec2.parameters():
+ param.requires_grad = False
+ param.grad = None
+ self.wav2vec2.eval()
+ self.feature_layer = layer
+
+ @torch.no_grad()
+ def forward(self, x):
+ """
+ Args:
+ x: torch.Tensor of shape (B x t)
+ Returns:
+ y: torch.Tensor of shape(B x C x t)
+ """
+ outputs = self.wav2vec2(x.squeeze(1), output_hidden_states=True)
+ y = outputs.hidden_states[self.feature_layer] # B x t x C(1024)
+ y = y.permute((0, 2, 1)) # B x t x C -> B x C x t
+ return y
+
+class ResidualCouplingBlock_Transformer(nn.Module):
+ def __init__(self,
+ channels,
+ hidden_channels,
+ kernel_size,
+ dilation_rate,
+ n_layers=3,
+ n_flows=4,
+ gin_channels=0):
+ super().__init__()
+ self.channels = channels
+ self.hidden_channels = hidden_channels
+ self.kernel_size = kernel_size
+ self.dilation_rate = dilation_rate
+ self.n_layers = n_layers
+ self.n_flows = n_flows
+ self.gin_channels = gin_channels
+ self.cond_block = torch.nn.Sequential(torch.nn.Linear(gin_channels, 4 * hidden_channels),
+ nn.SiLU(), torch.nn.Linear(4 * hidden_channels, hidden_channels))
+
+ self.flows = nn.ModuleList()
+ for i in range(n_flows):
+ self.flows.append(modules.ResidualCouplingLayer_Transformer_simple(channels, hidden_channels, kernel_size, dilation_rate, n_layers, mean_only=True))
+ self.flows.append(modules.Flip())
+
+ def forward(self, x, x_mask, g=None, reverse=False):
+
+ g = self.cond_block(g.squeeze(2))
+
+ if not reverse:
+ for flow in self.flows:
+ x, _ = flow(x, x_mask, g=g, reverse=reverse)
+ else:
+ for flow in reversed(self.flows):
+ x = flow(x, x_mask, g=g, reverse=reverse)
+ return x
+
+class PosteriorAudioEncoder(nn.Module):
+ def __init__(self,
+ in_channels,
+ out_channels,
+ hidden_channels,
+ kernel_size,
+ dilation_rate,
+ n_layers,
+ gin_channels=0):
+ super().__init__()
+ self.in_channels = in_channels
+ self.out_channels = out_channels
+ self.hidden_channels = hidden_channels
+ self.kernel_size = kernel_size
+ self.dilation_rate = dilation_rate
+ self.n_layers = n_layers
+ self.gin_channels = gin_channels
+
+ self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
+ self.down_pre = nn.Conv1d(1, 16, 7, 1, padding=3)
+ self.resblocks = nn.ModuleList()
+ downsample_rates = [8,5,4,2]
+ downsample_kernel_sizes = [17, 10, 8, 4]
+ ch = [16, 32, 64, 128, 192]
+
+ resblock = AMPBlock1
+ resblock_kernel_sizes = [3,7,11]
+ resblock_dilation_sizes = [[1,3,5], [1,3,5], [1,3,5]]
+ self.num_kernels = 3
+ self.downs = nn.ModuleList()
+ for i, (u, k) in enumerate(zip(downsample_rates, downsample_kernel_sizes)):
+ self.downs.append(weight_norm(
+ Conv1d(ch[i], ch[i+1], k, u, padding=(k-1)//2)))
+ for i in range(4):
+ for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
+ self.resblocks.append(resblock(ch[i+1], k, d, activation="snakebeta"))
+
+ activation_post = activations.SnakeBeta(ch[i+1], alpha_logscale=True)
+ self.activation_post = Activation1d(activation=activation_post)
+
+ self.conv_post = Conv1d(ch[i+1], hidden_channels, 7, 1, padding=3)
+
+
+ self.enc = modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels)
+ self.proj = nn.Conv1d(hidden_channels*2, out_channels * 2, 1)
+
+ def forward(self, x, x_audio, x_mask, g=None):
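+        # Two branches: the raw waveform is downsampled through AMP residual blocks, while the linear
+        # spectrogram goes through the WaveNet encoder; both are concatenated before projecting to the
+        # posterior mean and log-variance.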
+
+ x_audio = self.down_pre(x_audio)
+
+ for i in range(4):
+
+ x_audio = self.downs[i](x_audio)
+
+ xs = None
+ for j in range(self.num_kernels):
+ if xs is None:
+ xs = self.resblocks[i*self.num_kernels+j](x_audio)
+ else:
+ xs += self.resblocks[i*self.num_kernels+j](x_audio)
+ x_audio = xs / self.num_kernels
+
+ x_audio = self.activation_post(x_audio)
+ x_audio = self.conv_post(x_audio)
+
+ x = self.pre(x) * x_mask
+ x = self.enc(x, x_mask, g=g)
+
+ x_audio = x_audio * x_mask
+
+ x = torch.cat([x, x_audio], dim=1)
+
+ stats = self.proj(x) * x_mask
+
+ m, logs = torch.split(stats, self.out_channels, dim=1)
+ z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
+ return z, m, logs
+
+class PosteriorSFEncoder(nn.Module):
+ def __init__(self,
+ src_channels,
+ out_channels,
+ hidden_channels,
+ kernel_size,
+ dilation_rate,
+ n_layers,
+ gin_channels=0):
+ super().__init__()
+
+ self.out_channels = out_channels
+ self.hidden_channels = hidden_channels
+ self.kernel_size = kernel_size
+ self.dilation_rate = dilation_rate
+ self.n_layers = n_layers
+ self.gin_channels = gin_channels
+
+ self.pre_source = nn.Conv1d(src_channels, hidden_channels, 1)
+ self.pre_filter = nn.Conv1d(1, hidden_channels, kernel_size=9, stride=4, padding=4)
+ self.source_enc = modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers//2, gin_channels=gin_channels)
+ self.filter_enc = modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers//2, gin_channels=gin_channels)
+ self.enc = modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers//2, gin_channels=gin_channels)
+ self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
+
+ def forward(self, x_src, x_ftr, x_mask, g=None):
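+        # Encode the wav2vec-feature branch (x_src) and the F0 branch (x_ftr) separately, sum them,
+        # and encode the sum once more before projecting to the posterior mean and log-variance.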
+
+ x_src = self.pre_source(x_src) * x_mask
+ x_ftr = self.pre_filter(x_ftr) * x_mask
+ x_src = self.source_enc(x_src, x_mask, g=g)
+ x_ftr = self.filter_enc(x_ftr, x_mask, g=g)
+ x = self.enc(x_src+x_ftr, x_mask, g=g)
+ stats = self.proj(x) * x_mask
+ m, logs = torch.split(stats, self.out_channels, dim=1)
+ z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
+ return z, m, logs
+
+
+class MelDecoder(nn.Module):
+ def __init__(self,
+ hidden_channels,
+ filter_channels,
+ n_heads,
+ n_layers,
+ kernel_size,
+ p_dropout,
+ mel_size=20,
+ gin_channels=0):
+ super().__init__()
+
+ self.hidden_channels = hidden_channels
+ self.filter_channels = filter_channels
+ self.n_heads = n_heads
+ self.n_layers = n_layers
+ self.kernel_size = kernel_size
+ self.p_dropout = p_dropout
+
+ self.conv_pre = Conv1d(hidden_channels, hidden_channels, 3, 1, padding=1)
+
+ self.encoder = attentions.Encoder(
+ hidden_channels,
+ filter_channels,
+ n_heads,
+ n_layers,
+ kernel_size,
+ p_dropout)
+
+ self.proj= nn.Conv1d(hidden_channels, mel_size, 1, bias=False)
+
+ if gin_channels != 0:
+ self.cond = nn.Conv1d(gin_channels, hidden_channels, 1)
+
+ def forward(self, x, x_mask, g=None):
+
+ x = self.conv_pre(x*x_mask)
+ if g is not None:
+ x = x + self.cond(g)
+
+ x = self.encoder(x * x_mask, x_mask)
+ x = self.proj(x) * x_mask
+
+ return x
+
+class SourceNetwork(nn.Module):
+ def __init__(self, upsample_initial_channel=256):
+ super().__init__()
+
+ resblock_kernel_sizes = [3,5,7]
+ upsample_rates = [2,2]
+ initial_channel = 192
+ upsample_initial_channel = upsample_initial_channel
+ upsample_kernel_sizes = [4,4]
+ resblock_dilation_sizes = [[1,3,5], [1,3,5], [1,3,5]]
+
+ self.num_kernels = len(resblock_kernel_sizes)
+ self.num_upsamples = len(upsample_rates)
+
+ self.conv_pre = weight_norm(Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3))
+ resblock = AMPBlock1
+
+ self.ups = nn.ModuleList()
+ for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
+ self.ups.append(weight_norm(
+ ConvTranspose1d(upsample_initial_channel//(2**i), upsample_initial_channel//(2**(i+1)),
+ k, u, padding=(k-u)//2)))
+
+ self.resblocks = nn.ModuleList()
+ for i in range(len(self.ups)):
+ ch = upsample_initial_channel//(2**(i+1))
+ for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
+ self.resblocks.append(resblock(ch, k, d, activation="snakebeta"))
+
+ activation_post = activations.SnakeBeta(ch, alpha_logscale=True)
+ self.activation_post = Activation1d(activation=activation_post)
+
+ self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
+
+ self.cond = Conv1d(256, upsample_initial_channel, 1)
+
+ self.ups.apply(init_weights)
+
+
+ def forward(self, x, g):
+
+ x = self.conv_pre(x) + self.cond(g)
+
+ for i in range(self.num_upsamples):
+
+ x = self.ups[i](x)
+ xs = None
+ for j in range(self.num_kernels):
+ if xs is None:
+ xs = self.resblocks[i*self.num_kernels+j](x)
+ else:
+ xs += self.resblocks[i*self.num_kernels+j](x)
+ x = xs / self.num_kernels
+
+ x = self.activation_post(x)
+ ## Predictor
+ x_ = self.conv_post(x)
+ return x, x_
+
+    def remove_weight_norm(self):
+        print('Removing weight norm...')
+        for l in self.ups:
+            remove_weight_norm(l)
+        for l in self.resblocks:
+            l.remove_weight_norm()
+
+class DBlock(nn.Module):
+ def __init__(self, input_size, hidden_size, factor):
+ super().__init__()
+ self.factor = factor
+ self.residual_dense = weight_norm(Conv1d(input_size, hidden_size, 1))
+ self.conv = nn.ModuleList([
+ weight_norm(Conv1d(input_size, hidden_size, 3, dilation=1, padding=1)),
+ weight_norm(Conv1d(hidden_size, hidden_size, 3, dilation=2, padding=2)),
+ weight_norm(Conv1d(hidden_size, hidden_size, 3, dilation=4, padding=4)),
+ ])
+ self.conv.apply(init_weights)
+ def forward(self, x):
+ size = x.shape[-1] // self.factor
+
+ residual = self.residual_dense(x)
+ residual = F.interpolate(residual, size=size)
+
+ x = F.interpolate(x, size=size)
+ for layer in self.conv:
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
+ x = layer(x)
+
+ return x + residual
+ def remove_weight_norm(self):
+ for l in self.conv:
+ remove_weight_norm(l)
+
+class AMPBlock1(torch.nn.Module):
+ def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5), activation=None):
+ super(AMPBlock1, self).__init__()
+
+ self.convs1 = nn.ModuleList([
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
+ padding=get_padding(kernel_size, dilation[0]))),
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
+ padding=get_padding(kernel_size, dilation[1]))),
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2],
+ padding=get_padding(kernel_size, dilation[2])))
+ ])
+ self.convs1.apply(init_weights)
+
+ self.convs2 = nn.ModuleList([
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
+ padding=get_padding(kernel_size, 1))),
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
+ padding=get_padding(kernel_size, 1))),
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
+ padding=get_padding(kernel_size, 1)))
+ ])
+ self.convs2.apply(init_weights)
+
+ self.num_layers = len(self.convs1) + len(self.convs2) # total number of conv layers
+
+
+ self.activations = nn.ModuleList([
+ Activation1d(
+ activation=activations.SnakeBeta(channels, alpha_logscale=True))
+ for _ in range(self.num_layers)
+ ])
+
+ def forward(self, x):
+ acts1, acts2 = self.activations[::2], self.activations[1::2]
+ for c1, c2, a1, a2 in zip(self.convs1, self.convs2, acts1, acts2):
+ xt = a1(x)
+ xt = c1(xt)
+ xt = a2(xt)
+ xt = c2(xt)
+ x = xt + x
+
+ return x
+
+ def remove_weight_norm(self):
+ for l in self.convs1:
+ remove_weight_norm(l)
+ for l in self.convs2:
+ remove_weight_norm(l)
+
+class Generator(torch.nn.Module):
+ def __init__(self, initial_channel, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=256):
+ super(Generator, self).__init__()
+ self.num_kernels = len(resblock_kernel_sizes)
+ self.num_upsamples = len(upsample_rates)
+
+
+ self.conv_pre = weight_norm(Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3))
+ resblock = AMPBlock1
+
+ self.ups = nn.ModuleList()
+ for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
+ self.ups.append(weight_norm(
+ ConvTranspose1d(upsample_initial_channel//(2**i), upsample_initial_channel//(2**(i+1)),
+ k, u, padding=(k-u)//2)))
+
+ self.resblocks = nn.ModuleList()
+ for i in range(len(self.ups)):
+ ch = upsample_initial_channel//(2**(i+1))
+ for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
+ self.resblocks.append(resblock(ch, k, d, activation="snakebeta"))
+
+ activation_post = activations.SnakeBeta(ch, alpha_logscale=True)
+ self.activation_post = Activation1d(activation=activation_post)
+
+ self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
+ self.ups.apply(init_weights)
+
+ if gin_channels != 0:
+ self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
+
+ self.downs = DBlock(upsample_initial_channel//8, upsample_initial_channel, 4)
+ self.proj = Conv1d(upsample_initial_channel//8, upsample_initial_channel//2, 7, 1, padding=3)
+
+ def forward(self, x, pitch, g=None):
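+        # x: latent representation, pitch: excitation from the SourceNetwork, g: style embedding;
+        # the excitation is injected again after the first upsampling stage.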
+
+ x = self.conv_pre(x) + self.downs(pitch) + self.cond(g)
+
+ for i in range(self.num_upsamples):
+
+ x = self.ups[i](x)
+
+ if i == 0:
+ pitch = self.proj(pitch)
+ x = x + pitch
+
+ xs = None
+ for j in range(self.num_kernels):
+ if xs is None:
+ xs = self.resblocks[i*self.num_kernels+j](x)
+ else:
+ xs += self.resblocks[i*self.num_kernels+j](x)
+ x = xs / self.num_kernels
+
+ x = self.activation_post(x)
+ x = self.conv_post(x)
+ x = torch.tanh(x)
+ return x
+
+ def remove_weight_norm(self):
+ print('Removing weight norm...')
+ for l in self.ups:
+ remove_weight_norm(l)
+ for l in self.resblocks:
+ l.remove_weight_norm()
+        self.downs.remove_weight_norm()  # self.downs is a single DBlock, not a ModuleList
+ remove_weight_norm(self.conv_pre)
+
+class DiscriminatorP(torch.nn.Module):
+ def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
+ super(DiscriminatorP, self).__init__()
+ self.period = period
+ self.use_spectral_norm = use_spectral_norm
+ norm_f = weight_norm if use_spectral_norm == False else spectral_norm
+ self.convs = nn.ModuleList([
+ norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
+ norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
+ norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
+ norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
+ norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(get_padding(kernel_size, 1), 0))),
+ ])
+ self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
+
+ def forward(self, x):
+ fmap = []
+
+ # 1d to 2d
+ b, c, t = x.shape
+ if t % self.period != 0: # pad first
+ n_pad = self.period - (t % self.period)
+ x = F.pad(x, (0, n_pad), "reflect")
+ t = t + n_pad
+ x = x.view(b, c, t // self.period, self.period)
+
+ for l in self.convs:
+ x = l(x)
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
+ fmap.append(x)
+ x = self.conv_post(x)
+ fmap.append(x)
+ x = torch.flatten(x, 1, -1)
+
+ return x, fmap
+
+class DiscriminatorR(torch.nn.Module):
+ def __init__(self, resolution, use_spectral_norm=False):
+ super(DiscriminatorR, self).__init__()
+ norm_f = weight_norm if use_spectral_norm == False else spectral_norm
+
+ n_fft, hop_length, win_length = resolution
+ self.spec_transform = torchaudio.transforms.Spectrogram(
+ n_fft=n_fft, hop_length=hop_length, win_length=win_length, window_fn=torch.hann_window,
+ normalized=True, center=False, pad_mode=None, power=None)
+
+ self.convs = nn.ModuleList([
+ norm_f(nn.Conv2d(2, 32, (3, 9), padding=(1, 4))),
+ norm_f(nn.Conv2d(32, 32, (3, 9), stride=(1, 2), padding=(1, 4))),
+ norm_f(nn.Conv2d(32, 32, (3, 9), stride=(1, 2), dilation=(2,1), padding=(2, 4))),
+ norm_f(nn.Conv2d(32, 32, (3, 9), stride=(1, 2), dilation=(4,1), padding=(4, 4))),
+ norm_f(nn.Conv2d(32, 32, (3, 3), padding=(1, 1))),
+ ])
+ self.conv_post = norm_f(nn.Conv2d(32, 1, (3, 3), padding=(1, 1)))
+
+ def forward(self, y):
+ fmap = []
+
+        x = self.spec_transform(y)  # complex spectrogram [B, 1, Freq, Frames]
+ x = torch.cat([x.real, x.imag], dim=1)
+ x = rearrange(x, 'b c w t -> b c t w')
+
+ for l in self.convs:
+ x = l(x)
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
+ fmap.append(x)
+ x = self.conv_post(x)
+ fmap.append(x)
+ x = torch.flatten(x, 1, -1)
+
+ return x, fmap
+
+
+class MultiPeriodDiscriminator(torch.nn.Module):
+ def __init__(self, use_spectral_norm=False):
+ super(MultiPeriodDiscriminator, self).__init__()
+ periods = [2,3,5,7,11]
+ resolutions = [[2048, 512, 2048], [1024, 256, 1024], [512, 128, 512], [256, 64, 256], [128, 32, 128]]
+
+ discs = [DiscriminatorR(resolutions[i], use_spectral_norm=use_spectral_norm) for i in range(len(resolutions))]
+ discs = discs + [DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods]
+
+ self.discriminators = nn.ModuleList(discs)
+
+ def forward(self, y, y_hat):
+ y_d_rs = []
+ y_d_gs = []
+ fmap_rs = []
+ fmap_gs = []
+ for i, d in enumerate(self.discriminators):
+ y_d_r, fmap_r = d(y)
+ y_d_g, fmap_g = d(y_hat)
+ y_d_rs.append(y_d_r)
+ y_d_gs.append(y_d_g)
+ fmap_rs.append(fmap_r)
+ fmap_gs.append(fmap_g)
+
+ return y_d_rs, y_d_gs, fmap_rs, fmap_gs
+
+class SynthesizerTrn(nn.Module):
+ """
+ Synthesizer for Training
+ """
+
+ def __init__(self,
+
+ spec_channels,
+ segment_size,
+ inter_channels,
+ hidden_channels,
+ filter_channels,
+ n_heads,
+ n_layers,
+ kernel_size,
+ p_dropout,
+ resblock,
+ resblock_kernel_sizes,
+ resblock_dilation_sizes,
+ upsample_rates,
+ upsample_initial_channel,
+ upsample_kernel_sizes,
+ gin_channels=256,
+ prosody_size=20,
+ uncond_ratio=0.,
+ cfg=False,
+ **kwargs):
+
+ super().__init__()
+ self.spec_channels = spec_channels
+ self.inter_channels = inter_channels
+ self.hidden_channels = hidden_channels
+ self.filter_channels = filter_channels
+ self.n_heads = n_heads
+ self.n_layers = n_layers
+ self.kernel_size = kernel_size
+ self.p_dropout = p_dropout
+ self.resblock = resblock
+ self.resblock_kernel_sizes = resblock_kernel_sizes
+ self.resblock_dilation_sizes = resblock_dilation_sizes
+ self.upsample_rates = upsample_rates
+ self.upsample_initial_channel = upsample_initial_channel
+ self.upsample_kernel_sizes = upsample_kernel_sizes
+ self.segment_size = segment_size
+ self.mel_size = prosody_size
+
+ self.enc_p_l = PosteriorSFEncoder(1024, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels)
+ self.flow_l = ResidualCouplingBlock_Transformer(inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels)
+
+ self.enc_p = PosteriorSFEncoder(1024, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels)
+ self.enc_q = PosteriorAudioEncoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels)
+ self.flow = ResidualCouplingBlock_Transformer(inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels)
+
+ self.mel_decoder = MelDecoder(inter_channels,
+ filter_channels,
+ n_heads=2,
+ n_layers=2,
+ kernel_size=5,
+ p_dropout=0.1,
+ mel_size=self.mel_size,
+ gin_channels=gin_channels)
+
+ self.dec = Generator(inter_channels, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=gin_channels)
+ self.sn = SourceNetwork(upsample_initial_channel//2)
+ self.emb_g = StyleEncoder(in_dim=80, hidden_dim=256, out_dim=gin_channels)
+
+ if cfg:
+
+ self.emb = torch.nn.Embedding(1, 256)
+ torch.nn.init.normal_(self.emb.weight, 0.0, 256 ** -0.5)
+ self.null = torch.LongTensor([0]).cuda()
+ self.uncond_ratio = uncond_ratio
+ self.cfg = cfg
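+    # infer(): wav2vec features and F0 are encoded to a latent by enc_p_l, passed in reverse through
+    # both flows (flow_l, flow), and decoded to a waveform using the SourceNetwork excitation and the
+    # style embedding extracted from the mel prompt.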
+ @torch.no_grad()
+ def infer(self, x_mel, w2v, length, f0):
+
+ x_mask = torch.unsqueeze(commons.sequence_mask(length, x_mel.size(2)), 1).to(x_mel.dtype)
+
+ # Speaker embedding from mel (Style Encoder)
+ g = self.emb_g(x_mel, x_mask).unsqueeze(-1)
+
+ z, _, _ = self.enc_p_l(w2v, f0, x_mask, g=g)
+
+ z = self.flow_l(z, x_mask, g=g, reverse=True)
+ z = self.flow(z, x_mask, g=g, reverse=True)
+
+ e, e_ = self.sn(z, g)
+ o = self.dec(z, e, g=g)
+
+ return o, e_
+ @torch.no_grad()
+ def voice_conversion(self, src, src_length, trg_mel, trg_length, f0, noise_scale = 0.333, uncond=False):
+
+ trg_mask = torch.unsqueeze(commons.sequence_mask(trg_length, trg_mel.size(2)), 1).to(trg_mel.dtype)
+ g = self.emb_g(trg_mel, trg_mask).unsqueeze(-1)
+
+ y_mask = torch.unsqueeze(commons.sequence_mask(src_length, src.size(2)), 1).to(trg_mel.dtype)
+ z, m_p, logs_p = self.enc_p_l(src, f0, y_mask, g=g)
+
+ z = (m_p + torch.randn_like(m_p) * torch.exp(logs_p)*noise_scale) * y_mask
+
+ z = self.flow_l(z, y_mask, g=g, reverse=True)
+ z = self.flow(z, y_mask, g=g, reverse=True)
+
+ if uncond:
+ null_emb = self.emb(self.null) * math.sqrt(256)
+ g = null_emb.unsqueeze(-1)
+
+ e, _ = self.sn(z, g)
+ o = self.dec(z, e, g=g)
+
+ return o
+ @torch.no_grad()
+ def voice_conversion_noise_control(self, src, src_length, trg_mel, trg_length, f0, noise_scale = 0.333, uncond=False, denoise_ratio = 0):
+
+ trg_mask = torch.unsqueeze(commons.sequence_mask(trg_length, trg_mel.size(2)), 1).to(trg_mel.dtype)
+ g = self.emb_g(trg_mel, trg_mask).unsqueeze(-1)
+
+ g_org, g_denoise = g[:1, :, :], g[1:, :, :]
+
+ g_interpolation = (1-denoise_ratio)*g_org + denoise_ratio*g_denoise
+
+ y_mask = torch.unsqueeze(commons.sequence_mask(src_length, src.size(2)), 1).to(trg_mel.dtype)
+ z, m_p, logs_p = self.enc_p_l(src, f0, y_mask, g=g_interpolation)
+
+ z = (m_p + torch.randn_like(m_p) * torch.exp(logs_p)*noise_scale) * y_mask
+
+ z = self.flow_l(z, y_mask, g=g_interpolation, reverse=True)
+ z = self.flow(z, y_mask, g=g_interpolation, reverse=True)
+
+ if uncond:
+ null_emb = self.emb(self.null) * math.sqrt(256)
+ g = null_emb.unsqueeze(-1)
+
+ e, _ = self.sn(z, g_interpolation)
+ o = self.dec(z, e, g=g_interpolation)
+
+ return o
+ @torch.no_grad()
+ def f0_extraction(self, x_linear, x_mel, length, x_audio, noise_scale = 0.333):
+
+ x_mask = torch.unsqueeze(commons.sequence_mask(length, x_mel.size(2)), 1).to(x_mel.dtype)
+
+ # Speaker embedding from mel (Style Encoder)
+ g = self.emb_g(x_mel, x_mask).unsqueeze(-1)
+
+ # posterior encoder from linear spec.
+ _, m_q, logs_q= self.enc_q(x_linear, x_audio, x_mask, g=g)
+ z = (m_q + torch.randn_like(m_q) * torch.exp(logs_q)*noise_scale)
+
+ # Source Networks
+ _, e_ = self.sn(z, g)
+
+ return e_
+
diff --git a/inference.py b/inference.py
new file mode 100644
index 0000000000000000000000000000000000000000..48e5adb091998b08d66aa3b242331bd6bb5c5f24
--- /dev/null
+++ b/inference.py
@@ -0,0 +1,220 @@
+import os
+import torch
+import argparse
+import numpy as np
+from scipy.io.wavfile import write
+import torchaudio
+import utils
+from Mels_preprocess import MelSpectrogramFixed
+
+from hierspeechpp_speechsynthesizer import (
+ SynthesizerTrn
+)
+from ttv_v1.text import text_to_sequence
+from ttv_v1.t2w2v_transformer import SynthesizerTrn as Text2W2V
+from speechsr24k.speechsr import SynthesizerTrn as AudioSR
+from speechsr48k.speechsr import SynthesizerTrn as AudioSR48
+from denoiser.generator import MPNet
+from denoiser.infer import denoise
+
+seed = 1111
+torch.manual_seed(seed)
+torch.cuda.manual_seed(seed)
+np.random.seed(seed)
+
+def load_text(fp):
+ with open(fp, 'r') as f:
+ filelist = [line.strip() for line in f.readlines()]
+ return filelist
+def load_checkpoint(filepath, device):
+ print(filepath)
+ assert os.path.isfile(filepath)
+ print("Loading '{}'".format(filepath))
+ checkpoint_dict = torch.load(filepath, map_location=device)
+ print("Complete.")
+ return checkpoint_dict
+def get_param_num(model):
+ num_param = sum(param.numel() for param in model.parameters())
+ return num_param
+def intersperse(lst, item):
+ result = [item] * (len(lst) * 2 + 1)
+ result[1::2] = lst
+ return result
+
+def add_blank_token(text):
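+    # Inserts a blank token (id 0) between and around the input token ids.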
+
+ text_norm = intersperse(text, 0)
+ text_norm = torch.LongTensor(text_norm)
+ return text_norm
+
+def tts(text, a, hierspeech):
+
+ net_g, text2w2v, audiosr, denoiser, mel_fn = hierspeech
+
+ os.makedirs(a.output_dir, exist_ok=True)
+ text = text_to_sequence(str(text), ["english_cleaners2"])
+ token = add_blank_token(text).unsqueeze(0).cuda()
+ token_length = torch.LongTensor([token.size(-1)]).cuda()
+
+ # Prompt load
+ audio, sample_rate = torchaudio.load(a.input_prompt)
+
+ # support only single channel
+    # use only the first channel
+ # Resampling
+ if sample_rate != 16000:
+ audio = torchaudio.functional.resample(audio, sample_rate, 16000, resampling_method="kaiser_window")
+ if a.scale_norm == 'prompt':
+ prompt_audio_max = torch.max(audio.abs())
+
+    # The synthesizer uses a hop size of 320 and the denoiser a hop size of 400, so pad the prompt to a multiple of 1600 (their least common multiple)
+ ori_prompt_len = audio.shape[-1]
+ p = (ori_prompt_len // 1600 + 1) * 1600 - ori_prompt_len
+ audio = torch.nn.functional.pad(audio, (0, p), mode='constant').data
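+    # e.g. a 36000-sample prompt is padded by p = 23 * 1600 - 36000 = 800 samples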
+
+ file_name = os.path.splitext(os.path.basename(a.input_prompt))[0]
+
+    # If you run out of memory while denoising the prompt, try denoising it on the CPU before running TTS.
+    # We plan to replace this with a memory-efficient denoiser.
+ if a.denoise_ratio == 0:
+ audio = torch.cat([audio.cuda(), audio.cuda()], dim=0)
+ else:
+ with torch.no_grad():
+ denoised_audio = denoise(audio.squeeze(0).cuda(), denoiser, hps_denoiser)
+ audio = torch.cat([audio.cuda(), denoised_audio[:,:audio.shape[-1]]], dim=0)
+
+
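+    # The batch now holds two prompt copies (the original and, when denoise_ratio > 0, a denoised one);
+    # their style embeddings are interpolated downstream using denoise_ratio.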
+    audio = audio[:,:ori_prompt_len]  # 20231108: we found that a large amount of padding degrades performance, so we remove the padding after denoising.
+
+ src_mel = mel_fn(audio.cuda())
+
+ src_length = torch.LongTensor([src_mel.size(2)]).to(device)
+ src_length2 = torch.cat([src_length,src_length], dim=0)
+
+ ## TTV (Text --> W2V, F0)
+ with torch.no_grad():
+ w2v_x, pitch = text2w2v.infer_noise_control(token, token_length, src_mel, src_length2, noise_scale=a.noise_scale_ttv, denoise_ratio=a.denoise_ratio)
+
+ src_length = torch.LongTensor([w2v_x.size(2)]).cuda()
+
+ ## Pitch Clipping
+ pitch[pitch