candlend committed
Commit 3817de1
1 Parent(s): 1f55a13
Files changed (44)
  1. app.py +7 -3
  2. out_temp.wav +0 -0
  3. pth/hubert-soft-0d54a1f4.pt +3 -0
  4. requirements.txt +2 -1
  5. sovits/G_420000.pth +3 -0
  6. sovits/__init__.py +5 -0
  7. sovits/attentions.py +311 -0
  8. sovits/commons.py +180 -0
  9. sovits/configs/hoshimi_base.json +99 -0
  10. sovits/hubert_model.py +224 -0
  11. sovits/infer_tool.py +247 -0
  12. sovits/mel_processing.py +112 -0
  13. sovits/models.py +418 -0
  14. sovits/models/G_0.pth +3 -0
  15. sovits/models/G_16000.pth +3 -0
  16. sovits/modules.py +353 -0
  17. sovits/preprocess_wave.py +67 -0
  18. sovits/slicer.py +166 -0
  19. sovits/sovits_inferencer.py +51 -0
  20. sovits/transforms.py +185 -0
  21. sovits/utils.py +95 -0
  22. sovits/vdecoder/__init__.py +0 -0
  23. sovits/vdecoder/hifigan/hifigan.py +366 -0
  24. sovits/vdecoder/hifigan/mel_utils.py +80 -0
  25. sovits/vdecoder/parallel_wavegan/__init__.py +0 -0
  26. sovits/vdecoder/parallel_wavegan/layers/__init__.py +5 -0
  27. sovits/vdecoder/parallel_wavegan/layers/causal_conv.py +56 -0
  28. sovits/vdecoder/parallel_wavegan/layers/pqmf.py +129 -0
  29. sovits/vdecoder/parallel_wavegan/layers/residual_block.py +129 -0
  30. sovits/vdecoder/parallel_wavegan/layers/residual_stack.py +75 -0
  31. sovits/vdecoder/parallel_wavegan/layers/tf_layers.py +129 -0
  32. sovits/vdecoder/parallel_wavegan/layers/upsample.py +183 -0
  33. sovits/vdecoder/parallel_wavegan/losses/__init__.py +1 -0
  34. sovits/vdecoder/parallel_wavegan/losses/stft_loss.py +153 -0
  35. sovits/vdecoder/parallel_wavegan/models/__init__.py +2 -0
  36. sovits/vdecoder/parallel_wavegan/models/melgan.py +427 -0
  37. sovits/vdecoder/parallel_wavegan/models/parallel_wavegan.py +434 -0
  38. sovits/vdecoder/parallel_wavegan/models/source.py +538 -0
  39. sovits/vdecoder/parallel_wavegan/optimizers/__init__.py +2 -0
  40. sovits/vdecoder/parallel_wavegan/optimizers/radam.py +91 -0
  41. sovits/vdecoder/parallel_wavegan/stft_loss.py +100 -0
  42. sovits/vdecoder/parallel_wavegan/utils/__init__.py +1 -0
  43. sovits/vdecoder/parallel_wavegan/utils/utils.py +169 -0
  44. vits/{tts_inferencer.py → vits_inferencer.py} +1 -1
app.py CHANGED
@@ -1,5 +1,6 @@
 import gradio as gr
-from vits.tts_inferencer import TTSInferencer
+from vits.vits_inferencer import VitsInferencer
+from sovits.sovits_inferencer import SovitsInferencer
 
 app = gr.Blocks()
 with app:
@@ -7,6 +8,9 @@ with app:
         gr.HTML(f.read())
     with gr.Tabs():
         with gr.TabItem("语音合成"):
-            vits_inferencer = TTSInferencer("vits/configs/hoshimi_base.json")
-            tts_inferencer.render()
+            vits_inferencer = VitsInferencer("vits/configs/hoshimi_base.json")
+            vits_inferencer.render()
+        with gr.TabItem("声线转换"):
+            sovits_inferencer = SovitsInferencer("sovits/configs/hoshimi_base.json")
+            sovits_inferencer.render()
 app.launch()
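Note: sovits/sovits_inferencer.py (51 lines, listed above but not excerpted in this section) is consumed here exactly like VitsInferencer: constructed with a config path and asked to render() its widgets inside a tab. A hypothetical minimal sketch of that contract, purely for orientation; the class interface is implied by app.py, but everything in the body below is an assumption, not code from this commit:

import gradio as gr

class SovitsInferencerSketch:
    """Hypothetical stand-in illustrating the interface app.py relies on."""

    def __init__(self, config_path: str):
        # e.g. "sovits/configs/hoshimi_base.json"
        self.config_path = config_path

    def render(self):
        # Build the voice-conversion tab's widgets; model wiring omitted.
        source = gr.Audio(label="source audio")
        output = gr.Audio(label="converted audio")
        gr.Button("convert")
        return source, output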
out_temp.wav ADDED
Binary file (236 kB).
pth/hubert-soft-0d54a1f4.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e82e7d079df05fe3aa535f6f7d42d309bdae1d2a53324e2b2386c56721f4f649
3
+ size 378435957
requirements.txt CHANGED
@@ -18,4 +18,5 @@ ko-pron==1.3
 inflect==6.0.0
 eng-to-ipa==0.0.2
 num-thai==0.0.5
-opencc==1.1.4
+opencc==1.1.4
+scikit-maad
sovits/G_420000.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d2ba3b18b43b35c464fcfb85bd2f277737ec85e781c1327a68944f697b5a572e
3
+ size 633838909
sovits/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ import os
2
+ import sys
3
+
4
+ ROOT_PATH = os.path.dirname(os.path.abspath(__file__))
5
+ sys.path.append(ROOT_PATH)
sovits/attentions.py ADDED
@@ -0,0 +1,311 @@
1
+ import math
2
+
3
+ import torch
4
+ from torch import nn
5
+ from torch.nn import functional as t_func
6
+
7
+ from sovits import commons
8
+ from sovits.modules import LayerNorm
9
+
10
+
11
+ class Encoder(nn.Module):
12
+ def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., window_size=4,
13
+ **kwargs):
14
+ super().__init__()
15
+ self.hidden_channels = hidden_channels
16
+ self.filter_channels = filter_channels
17
+ self.n_heads = n_heads
18
+ self.n_layers = n_layers
19
+ self.kernel_size = kernel_size
20
+ self.p_dropout = p_dropout
21
+ self.window_size = window_size
22
+
23
+ self.drop = nn.Dropout(p_dropout)
24
+ self.attn_layers = nn.ModuleList()
25
+ self.norm_layers_1 = nn.ModuleList()
26
+ self.ffn_layers = nn.ModuleList()
27
+ self.norm_layers_2 = nn.ModuleList()
28
+ for i in range(self.n_layers):
29
+ self.attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout,
30
+ window_size=window_size))
31
+ self.norm_layers_1.append(LayerNorm(hidden_channels))
32
+ self.ffn_layers.append(
33
+ FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout))
34
+ self.norm_layers_2.append(LayerNorm(hidden_channels))
35
+
36
+ def forward(self, x, x_mask):
37
+ attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
38
+ x = x * x_mask
39
+ for i in range(self.n_layers):
40
+ y = self.attn_layers[i](x, x, attn_mask)
41
+ y = self.drop(y)
42
+ x = self.norm_layers_1[i](x + y)
43
+
44
+ y = self.ffn_layers[i](x, x_mask)
45
+ y = self.drop(y)
46
+ x = self.norm_layers_2[i](x + y)
47
+ x = x * x_mask
48
+ return x
49
+
50
+
51
+ class Decoder(nn.Module):
52
+ def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0.,
53
+ proximal_bias=False, proximal_init=True, **kwargs):
54
+ super().__init__()
55
+ self.hidden_channels = hidden_channels
56
+ self.filter_channels = filter_channels
57
+ self.n_heads = n_heads
58
+ self.n_layers = n_layers
59
+ self.kernel_size = kernel_size
60
+ self.p_dropout = p_dropout
61
+ self.proximal_bias = proximal_bias
62
+ self.proximal_init = proximal_init
63
+
64
+ self.drop = nn.Dropout(p_dropout)
65
+ self.self_attn_layers = nn.ModuleList()
66
+ self.norm_layers_0 = nn.ModuleList()
67
+ self.encdec_attn_layers = nn.ModuleList()
68
+ self.norm_layers_1 = nn.ModuleList()
69
+ self.ffn_layers = nn.ModuleList()
70
+ self.norm_layers_2 = nn.ModuleList()
71
+ for i in range(self.n_layers):
72
+ self.self_attn_layers.append(
73
+ MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout,
74
+ proximal_bias=proximal_bias, proximal_init=proximal_init))
75
+ self.norm_layers_0.append(LayerNorm(hidden_channels))
76
+ self.encdec_attn_layers.append(
77
+ MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout))
78
+ self.norm_layers_1.append(LayerNorm(hidden_channels))
79
+ self.ffn_layers.append(
80
+ FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout, causal=True))
81
+ self.norm_layers_2.append(LayerNorm(hidden_channels))
82
+
83
+ def forward(self, x, x_mask, h, h_mask):
84
+ """
85
+ x: decoder input
86
+ h: encoder output
87
+ """
88
+ self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(device=x.device, dtype=x.dtype)
89
+ encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
90
+ x = x * x_mask
91
+ for i in range(self.n_layers):
92
+ y = self.self_attn_layers[i](x, x, self_attn_mask)
93
+ y = self.drop(y)
94
+ x = self.norm_layers_0[i](x + y)
95
+
96
+ y = self.encdec_attn_layers[i](x, h, encdec_attn_mask)
97
+ y = self.drop(y)
98
+ x = self.norm_layers_1[i](x + y)
99
+
100
+ y = self.ffn_layers[i](x, x_mask)
101
+ y = self.drop(y)
102
+ x = self.norm_layers_2[i](x + y)
103
+ x = x * x_mask
104
+ return x
105
+
106
+
107
+ class MultiHeadAttention(nn.Module):
108
+ def __init__(self, channels, out_channels, n_heads, p_dropout=0., window_size=None, heads_share=True,
109
+ block_length=None, proximal_bias=False, proximal_init=False):
110
+ super().__init__()
111
+ assert channels % n_heads == 0
112
+
113
+ self.channels = channels
114
+ self.out_channels = out_channels
115
+ self.n_heads = n_heads
116
+ self.p_dropout = p_dropout
117
+ self.window_size = window_size
118
+ self.heads_share = heads_share
119
+ self.block_length = block_length
120
+ self.proximal_bias = proximal_bias
121
+ self.proximal_init = proximal_init
122
+ self.attn = None
123
+
124
+ self.k_channels = channels // n_heads
125
+ self.conv_q = nn.Conv1d(channels, channels, 1)
126
+ self.conv_k = nn.Conv1d(channels, channels, 1)
127
+ self.conv_v = nn.Conv1d(channels, channels, 1)
128
+ self.conv_o = nn.Conv1d(channels, out_channels, 1)
129
+ self.drop = nn.Dropout(p_dropout)
130
+
131
+ if window_size is not None:
132
+ n_heads_rel = 1 if heads_share else n_heads
133
+ rel_stddev = self.k_channels ** -0.5
134
+ self.emb_rel_k = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev)
135
+ self.emb_rel_v = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev)
136
+
137
+ nn.init.xavier_uniform_(self.conv_q.weight)
138
+ nn.init.xavier_uniform_(self.conv_k.weight)
139
+ nn.init.xavier_uniform_(self.conv_v.weight)
140
+ if proximal_init:
141
+ with torch.no_grad():
142
+ self.conv_k.weight.copy_(self.conv_q.weight)
143
+ self.conv_k.bias.copy_(self.conv_q.bias)
144
+
145
+ def forward(self, x, c, attn_mask=None):
146
+ q = self.conv_q(x)
147
+ k = self.conv_k(c)
148
+ v = self.conv_v(c)
149
+
150
+ x, self.attn = self.attention(q, k, v, mask=attn_mask)
151
+
152
+ x = self.conv_o(x)
153
+ return x
154
+
155
+ def attention(self, query, key, value, mask=None):
156
+ # reshape [b, d, t] -> [b, n_h, t, d_k]
157
+ b, d, t_s, t_t = (*key.size(), query.size(2))
158
+ query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
159
+ key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
160
+ value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
161
+
162
+ scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1))
163
+ if self.window_size is not None:
164
+ assert t_s == t_t, "Relative attention is only available for self-attention."
165
+ key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
166
+ rel_logits = self._matmul_with_relative_keys(query / math.sqrt(self.k_channels), key_relative_embeddings)
167
+ scores_local = self._relative_position_to_absolute_position(rel_logits)
168
+ scores = scores + scores_local
169
+ if self.proximal_bias:
170
+ assert t_s == t_t, "Proximal bias is only available for self-attention."
171
+ scores = scores + self._attention_bias_proximal(t_s).to(device=scores.device, dtype=scores.dtype)
172
+ if mask is not None:
173
+ scores = scores.masked_fill(mask == 0, -1e4)
174
+ if self.block_length is not None:
175
+ assert t_s == t_t, "Local attention is only available for self-attention."
176
+ block_mask = torch.ones_like(scores).triu(-self.block_length).tril(self.block_length)
177
+ scores = scores.masked_fill(block_mask == 0, -1e4)
178
+ p_attn = t_func.softmax(scores, dim=-1) # [b, n_h, t_t, t_s]
179
+ p_attn = self.drop(p_attn)
180
+ output = torch.matmul(p_attn, value)
181
+ if self.window_size is not None:
182
+ relative_weights = self._absolute_position_to_relative_position(p_attn)
183
+ value_relative_embeddings = self._get_relative_embeddings(self.emb_rel_v, t_s)
184
+ output = output + self._matmul_with_relative_values(relative_weights, value_relative_embeddings)
185
+ output = output.transpose(2, 3).contiguous().view(b, d, t_t) # [b, n_h, t_t, d_k] -> [b, d, t_t]
186
+ return output, p_attn
187
+
188
+ def _matmul_with_relative_values(self, x, y):
189
+ """
190
+ x: [b, h, l, m]
191
+ y: [h or 1, m, d]
192
+ ret: [b, h, l, d]
193
+ """
194
+ ret = torch.matmul(x, y.unsqueeze(0))
195
+ return ret
196
+
197
+ def _matmul_with_relative_keys(self, x, y):
198
+ """
199
+ x: [b, h, l, d]
200
+ y: [h or 1, m, d]
201
+ ret: [b, h, l, m]
202
+ """
203
+ ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
204
+ return ret
205
+
206
+ def _get_relative_embeddings(self, relative_embeddings, length):
207
+ max_relative_position = 2 * self.window_size + 1
208
+ # Pad first before slice to avoid using cond ops.
209
+ pad_length = max(length - (self.window_size + 1), 0)
210
+ slice_start_position = max((self.window_size + 1) - length, 0)
211
+ slice_end_position = slice_start_position + 2 * length - 1
212
+ if pad_length > 0:
213
+ padded_relative_embeddings = t_func.pad(
214
+ relative_embeddings,
215
+ commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]))
216
+ else:
217
+ padded_relative_embeddings = relative_embeddings
218
+ used_relative_embeddings = padded_relative_embeddings[:, slice_start_position:slice_end_position]
219
+ return used_relative_embeddings
220
+
221
+ def _relative_position_to_absolute_position(self, x):
222
+ """
223
+ x: [b, h, l, 2*l-1]
224
+ ret: [b, h, l, l]
225
+ """
226
+ batch, heads, length, _ = x.size()
227
+ # Concat columns of pad to shift from relative to absolute indexing.
228
+ x = t_func.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]]))
229
+
230
+ # Concat extra elements so to add up to shape (len+1, 2*len-1).
231
+ x_flat = x.view([batch, heads, length * 2 * length])
232
+ x_flat = t_func.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]]))
233
+
234
+ # Reshape and slice out the padded elements.
235
+ x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[:, :, :length, length - 1:]
236
+ return x_final
237
+
238
+ def _absolute_position_to_relative_position(self, x):
239
+ """
240
+ x: [b, h, l, l]
241
+ ret: [b, h, l, 2*l-1]
242
+ """
243
+ batch, heads, length, _ = x.size()
244
+ # padd along column
245
+ x = t_func.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]]))
246
+ x_flat = x.view([batch, heads, length ** 2 + length * (length - 1)])
247
+ # add 0's in the beginning that will skew the elements after reshape
248
+ x_flat = t_func.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
249
+ x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:]
250
+ return x_final
251
+
252
+ def _attention_bias_proximal(self, length):
253
+ """Bias for self-attention to encourage attention to close positions.
254
+ Args:
255
+ length: an integer scalar.
256
+ Returns:
257
+ a Tensor with shape [1, 1, length, length]
258
+ """
259
+ r = torch.arange(length, dtype=torch.float32)
260
+ diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
261
+ return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
262
+
263
+
264
+ class FFN(nn.Module):
265
+ def __init__(self, in_channels, out_channels, filter_channels, kernel_size, p_dropout=0., activation=None,
266
+ causal=False):
267
+ super().__init__()
268
+ self.in_channels = in_channels
269
+ self.out_channels = out_channels
270
+ self.filter_channels = filter_channels
271
+ self.kernel_size = kernel_size
272
+ self.p_dropout = p_dropout
273
+ self.activation = activation
274
+ self.causal = causal
275
+
276
+ if causal:
277
+ self.padding = self._causal_padding
278
+ else:
279
+ self.padding = self._same_padding
280
+
281
+ self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
282
+ self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
283
+ self.drop = nn.Dropout(p_dropout)
284
+
285
+ def forward(self, x, x_mask):
286
+ x = self.conv_1(self.padding(x * x_mask))
287
+ if self.activation == "gelu":
288
+ x = x * torch.sigmoid(1.702 * x)
289
+ else:
290
+ x = torch.relu(x)
291
+ x = self.drop(x)
292
+ x = self.conv_2(self.padding(x * x_mask))
293
+ return x * x_mask
294
+
295
+ def _causal_padding(self, x):
296
+ if self.kernel_size == 1:
297
+ return x
298
+ pad_l = self.kernel_size - 1
299
+ pad_r = 0
300
+ padding = [[0, 0], [0, 0], [pad_l, pad_r]]
301
+ x = t_func.pad(x, commons.convert_pad_shape(padding))
302
+ return x
303
+
304
+ def _same_padding(self, x):
305
+ if self.kernel_size == 1:
306
+ return x
307
+ pad_l = (self.kernel_size - 1) // 2
308
+ pad_r = self.kernel_size // 2
309
+ padding = [[0, 0], [0, 0], [pad_l, pad_r]]
310
+ x = t_func.pad(x, commons.convert_pad_shape(padding))
311
+ return x
sovits/commons.py ADDED
@@ -0,0 +1,180 @@
1
+ import math
2
+
3
+ import torch
4
+ from torch.nn import functional as F
5
+
6
+
7
+ def init_weights(m, mean=0.0, std=0.01):
8
+ classname = m.__class__.__name__
9
+ if classname.find("Conv") != -1:
10
+ m.weight.data.normal_(mean, std)
11
+
12
+
13
+ def get_padding(kernel_size, dilation=1):
14
+ return int((kernel_size * dilation - dilation) / 2)
15
+
16
+
17
+ def convert_pad_shape(pad_shape):
18
+ l = pad_shape[::-1]
19
+ pad_shape = [item for sublist in l for item in sublist]
20
+ return pad_shape
21
+
22
+
23
+ def intersperse(lst, item):
24
+ result = [item] * (len(lst) * 2 + 1)
25
+ result[1::2] = lst
26
+ return result
27
+
28
+
29
+ def kl_divergence(m_p, logs_p, m_q, logs_q):
30
+ """KL(P||Q)"""
31
+ kl = (logs_q - logs_p) - 0.5
32
+ kl += 0.5 * (torch.exp(2. * logs_p) + ((m_p - m_q) ** 2)) * torch.exp(-2. * logs_q)
33
+ return kl
34
+
35
+
36
+ def rand_gumbel(shape):
37
+ """Sample from the Gumbel distribution, protect from overflows."""
38
+ uniform_samples = torch.rand(shape) * 0.99998 + 0.00001
39
+ return -torch.log(-torch.log(uniform_samples))
40
+
41
+
42
+ def rand_gumbel_like(x):
43
+ g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device)
44
+ return g
45
+
46
+
47
+ def slice_segments(x, ids_str, segment_size=4):
48
+ ret = torch.zeros_like(x[:, :, :segment_size])
49
+ for i in range(x.size(0)):
50
+ idx_str = ids_str[i]
51
+ idx_end = idx_str + segment_size
52
+ ret[i] = x[i, :, idx_str:idx_end]
53
+ return ret
54
+
55
+
56
+ def slice_pitch_segments(x, ids_str, segment_size=4):
57
+ ret = torch.zeros_like(x[:, :segment_size])
58
+ for i in range(x.size(0)):
59
+ idx_str = ids_str[i]
60
+ idx_end = idx_str + segment_size
61
+ ret[i] = x[i, idx_str:idx_end]
62
+ return ret
63
+
64
+
65
+ def rand_slice_segments(x, x_lengths=None, segment_size=4):
66
+ b, d, t = x.size()
67
+ if x_lengths is None:
68
+ x_lengths = t
69
+ ids_str_max = x_lengths - segment_size + 1
70
+ ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
71
+ ret = slice_segments(x, ids_str, segment_size)
72
+ return ret, ids_str
73
+
74
+
75
+ def rand_slice_segments_with_pitch(x, pitch, x_lengths=None, segment_size=4):
76
+ b, d, t = x.size()
77
+ if x_lengths is None:
78
+ x_lengths = t
79
+ ids_str_max = x_lengths - segment_size + 1
80
+ ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
81
+ ret = slice_segments(x, ids_str, segment_size)
82
+ ret_pitch = slice_pitch_segments(pitch, ids_str, segment_size)
83
+ return ret, ret_pitch, ids_str
84
+
85
+
86
+ def get_timing_signal_1d(
87
+ length, channels, min_timescale=1.0, max_timescale=1.0e4):
88
+ position = torch.arange(length, dtype=torch.float)
89
+ num_timescales = channels // 2
90
+ log_timescale_increment = (
91
+ math.log(float(max_timescale) / float(min_timescale)) /
92
+ (num_timescales - 1))
93
+ inv_timescales = min_timescale * torch.exp(
94
+ torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment)
95
+ scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)
96
+ signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)
97
+ signal = F.pad(signal, [0, 0, 0, channels % 2])
98
+ signal = signal.view(1, channels, length)
99
+ return signal
100
+
101
+
102
+ def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
103
+ b, channels, length = x.size()
104
+ signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
105
+ return x + signal.to(dtype=x.dtype, device=x.device)
106
+
107
+
108
+ def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1):
109
+ b, channels, length = x.size()
110
+ signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
111
+ return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis)
112
+
113
+
114
+ def subsequent_mask(length):
115
+ mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0)
116
+ return mask
117
+
118
+
119
+ @torch.jit.script
120
+ def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
121
+ n_channels_int = n_channels[0]
122
+ in_act = input_a + input_b
123
+ t_act = torch.tanh(in_act[:, :n_channels_int, :])
124
+ s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
125
+ acts = t_act * s_act
126
+ return acts
127
+
128
+
129
+ def convert_pad_shape(pad_shape):
130
+ l = pad_shape[::-1]
131
+ pad_shape = [item for sublist in l for item in sublist]
132
+ return pad_shape
133
+
134
+
135
+ def shift_1d(x):
136
+ x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1]
137
+ return x
138
+
139
+
140
+ def sequence_mask(length, max_length=None):
141
+ if max_length is None:
142
+ max_length = length.max()
143
+ x = torch.arange(max_length, dtype=length.dtype, device=length.device)
144
+ return x.unsqueeze(0) < length.unsqueeze(1)
145
+
146
+
147
+ def generate_path(duration, mask):
148
+ """
149
+ duration: [b, 1, t_x]
150
+ mask: [b, 1, t_y, t_x]
151
+ """
152
+ device = duration.device
153
+
154
+ b, _, t_y, t_x = mask.shape
155
+ cum_duration = torch.cumsum(duration, -1)
156
+
157
+ cum_duration_flat = cum_duration.view(b * t_x)
158
+ path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype)
159
+ path = path.view(b, t_x, t_y)
160
+ path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
161
+ path = path.unsqueeze(1).transpose(2, 3) * mask
162
+ return path
163
+
164
+
165
+ def clip_grad_value_(parameters, clip_value, norm_type=2):
166
+ if isinstance(parameters, torch.Tensor):
167
+ parameters = [parameters]
168
+ parameters = list(filter(lambda p: p.grad is not None, parameters))
169
+ norm_type = float(norm_type)
170
+ if clip_value is not None:
171
+ clip_value = float(clip_value)
172
+
173
+ total_norm = 0
174
+ for p in parameters:
175
+ param_norm = p.grad.data.norm(norm_type)
176
+ total_norm += param_norm.item() ** norm_type
177
+ if clip_value is not None:
178
+ p.grad.data.clamp_(min=-clip_value, max=clip_value)
179
+ total_norm = total_norm ** (1. / norm_type)
180
+ return total_norm
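These helpers are shared by the SynthesizerTrn below; as a small illustration of the masking/slicing utilities (shapes are arbitrary, 24 frames roughly matching segment_size // hop_length from the training config):

import torch
from sovits import commons

x = torch.randn(2, 192, 100)                  # [batch, channels, frames]
lengths = torch.tensor([100, 80])
mask = commons.sequence_mask(lengths, 100)    # [2, 100] bool, True where frame < length
seg, ids = commons.rand_slice_segments(x, lengths, segment_size=24)
print(mask.shape, seg.shape, ids)             # [2, 100], [2, 192, 24], random start frames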
sovits/configs/hoshimi_base.json ADDED
@@ -0,0 +1,99 @@
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 2000,
5
+ "seed": 1234,
6
+ "epochs": 10000,
7
+ "learning_rate": 2e-4,
8
+ "betas": [
9
+ 0.8,
10
+ 0.99
11
+ ],
12
+ "eps": 1e-9,
13
+ "batch_size": 16,
14
+ "fp16_run": true,
15
+ "lr_decay": 0.999875,
16
+ "segment_size": 7680,
17
+ "init_lr_ratio": 1,
18
+ "warmup_epochs": 0,
19
+ "c_mel": 45,
20
+ "c_kl": 1.0
21
+ },
22
+ "data": {
23
+ "training_files": "filelists/hoshimi_train_filelist.txt",
24
+ "validation_files": "filelists/hoshimi_val_filelist.txt",
25
+ "text_cleaners": [
26
+ "english_cleaners2"
27
+ ],
28
+ "max_wav_value": 32768.0,
29
+ "sampling_rate": 32000,
30
+ "filter_length": 1024,
31
+ "hop_length": 320,
32
+ "win_length": 1024,
33
+ "n_mel_channels": 80,
34
+ "mel_fmin": 0.0,
35
+ "mel_fmax": null,
36
+ "add_blank": true,
37
+ "n_speakers": 8,
38
+ "cleaned_text": true
39
+ },
40
+ "model": {
41
+ "sampling_rate": 32000,
42
+ "inter_channels": 192,
43
+ "hidden_channels": 256,
44
+ "filter_channels": 768,
45
+ "n_heads": 2,
46
+ "n_layers": 6,
47
+ "kernel_size": 3,
48
+ "p_dropout": 0.1,
49
+ "resblock": "1",
50
+ "resblock_kernel_sizes": [
51
+ 3,
52
+ 7,
53
+ 11
54
+ ],
55
+ "resblock_dilation_sizes": [
56
+ [
57
+ 1,
58
+ 3,
59
+ 5
60
+ ],
61
+ [
62
+ 1,
63
+ 3,
64
+ 5
65
+ ],
66
+ [
67
+ 1,
68
+ 3,
69
+ 5
70
+ ]
71
+ ],
72
+ "upsample_rates": [
73
+ 10,
74
+ 8,
75
+ 2,
76
+ 2
77
+ ],
78
+ "upsample_initial_channel": 512,
79
+ "upsample_kernel_sizes": [
80
+ 16,
81
+ 16,
82
+ 4,
83
+ 4
84
+ ],
85
+ "n_layers_q": 3,
86
+ "use_spectral_norm": false,
87
+ "gin_channels": 256
88
+ },
89
+ "speakers": [
90
+ "hoshimi",
91
+ "yilanqiu",
92
+ "yunhao",
93
+ "jishuang",
94
+ "xing",
95
+ "opencpop",
96
+ "atri",
97
+ "tianyi"
98
+ ]
99
+ }
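Svc in infer_tool.py reads this file through sovits.utils.get_hparams_from_file (sovits/utils.py is part of this commit but not excerpted here). A plain-json equivalent, just to show which fields the inference code below consumes:

import json

with open("sovits/configs/hoshimi_base.json") as f:
    cfg = json.load(f)

print(cfg["data"]["sampling_rate"])   # 32000, output sample rate
print(cfg["data"]["filter_length"])   # 1024, n_fft for spectrogram_torch
print(cfg["data"]["hop_length"])      # 320, also drives the f0 frame rate
print(cfg["speakers"])                # 8 names; their index is the speaker_id passed to infer()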
sovits/hubert_model.py ADDED
@@ -0,0 +1,224 @@
1
+ import copy
2
+ import random
3
+ from typing import Optional, Tuple
4
+
5
+ import torch
6
+ import torch.nn as nn
7
+ import torch.nn.functional as t_func
8
+ from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present
9
+
10
+
11
+ class Hubert(nn.Module):
12
+ def __init__(self, num_label_embeddings: int = 100, mask: bool = True):
13
+ super().__init__()
14
+ self._mask = mask
15
+ self.feature_extractor = FeatureExtractor()
16
+ self.feature_projection = FeatureProjection()
17
+ self.positional_embedding = PositionalConvEmbedding()
18
+ self.norm = nn.LayerNorm(768)
19
+ self.dropout = nn.Dropout(0.1)
20
+ self.encoder = TransformerEncoder(
21
+ nn.TransformerEncoderLayer(
22
+ 768, 12, 3072, activation="gelu", batch_first=True
23
+ ),
24
+ 12,
25
+ )
26
+ self.proj = nn.Linear(768, 256)
27
+
28
+ self.masked_spec_embed = nn.Parameter(torch.FloatTensor(768).uniform_())
29
+ self.label_embedding = nn.Embedding(num_label_embeddings, 256)
30
+
31
+ def mask(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
32
+ mask = None
33
+ if self.training and self._mask:
34
+ mask = _compute_mask((x.size(0), x.size(1)), 0.8, 10, x.device, 2)
35
+ x[mask] = self.masked_spec_embed.to(x.dtype)
36
+ return x, mask
37
+
38
+ def encode(
39
+ self, x: torch.Tensor, layer: Optional[int] = None
40
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
41
+ x = self.feature_extractor(x)
42
+ x = self.feature_projection(x.transpose(1, 2))
43
+ x, mask = self.mask(x)
44
+ x = x + self.positional_embedding(x)
45
+ x = self.dropout(self.norm(x))
46
+ x = self.encoder(x, output_layer=layer)
47
+ return x, mask
48
+
49
+ def logits(self, x: torch.Tensor) -> torch.Tensor:
50
+ logits = torch.cosine_similarity(
51
+ x.unsqueeze(2),
52
+ self.label_embedding.weight.unsqueeze(0).unsqueeze(0),
53
+ dim=-1,
54
+ )
55
+ return logits / 0.1
56
+
57
+ def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
58
+ x, mask = self.encode(x)
59
+ x = self.proj(x)
60
+ logits = self.logits(x)
61
+ return logits, mask
62
+
63
+
64
+ class HubertSoft(Hubert):
65
+ def __init__(self):
66
+ super().__init__()
67
+
68
+ @torch.inference_mode()
69
+ def units(self, wav: torch.Tensor) -> torch.Tensor:
70
+ wav = t_func.pad(wav, ((400 - 320) // 2, (400 - 320) // 2))
71
+ x, _ = self.encode(wav)
72
+ return self.proj(x)
73
+
74
+
75
+ class FeatureExtractor(nn.Module):
76
+ def __init__(self):
77
+ super().__init__()
78
+ self.conv0 = nn.Conv1d(1, 512, 10, 5, bias=False)
79
+ self.norm0 = nn.GroupNorm(512, 512)
80
+ self.conv1 = nn.Conv1d(512, 512, 3, 2, bias=False)
81
+ self.conv2 = nn.Conv1d(512, 512, 3, 2, bias=False)
82
+ self.conv3 = nn.Conv1d(512, 512, 3, 2, bias=False)
83
+ self.conv4 = nn.Conv1d(512, 512, 3, 2, bias=False)
84
+ self.conv5 = nn.Conv1d(512, 512, 2, 2, bias=False)
85
+ self.conv6 = nn.Conv1d(512, 512, 2, 2, bias=False)
86
+
87
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
88
+ x = t_func.gelu(self.norm0(self.conv0(x)))
89
+ x = t_func.gelu(self.conv1(x))
90
+ x = t_func.gelu(self.conv2(x))
91
+ x = t_func.gelu(self.conv3(x))
92
+ x = t_func.gelu(self.conv4(x))
93
+ x = t_func.gelu(self.conv5(x))
94
+ x = t_func.gelu(self.conv6(x))
95
+ return x
96
+
97
+
98
+ class FeatureProjection(nn.Module):
99
+ def __init__(self):
100
+ super().__init__()
101
+ self.norm = nn.LayerNorm(512)
102
+ self.projection = nn.Linear(512, 768)
103
+ self.dropout = nn.Dropout(0.1)
104
+
105
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
106
+ x = self.norm(x)
107
+ x = self.projection(x)
108
+ x = self.dropout(x)
109
+ return x
110
+
111
+
112
+ class PositionalConvEmbedding(nn.Module):
113
+ def __init__(self):
114
+ super().__init__()
115
+ self.conv = nn.Conv1d(
116
+ 768,
117
+ 768,
118
+ kernel_size=128,
119
+ padding=128 // 2,
120
+ groups=16,
121
+ )
122
+ self.conv = nn.utils.weight_norm(self.conv, name="weight", dim=2)
123
+
124
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
125
+ x = self.conv(x.transpose(1, 2))
126
+ x = t_func.gelu(x[:, :, :-1])
127
+ return x.transpose(1, 2)
128
+
129
+
130
+ class TransformerEncoder(nn.Module):
131
+ def __init__(
132
+ self, encoder_layer: nn.TransformerEncoderLayer, num_layers: int
133
+ ) -> None:
134
+ super(TransformerEncoder, self).__init__()
135
+ self.layers = nn.ModuleList(
136
+ [copy.deepcopy(encoder_layer) for _ in range(num_layers)]
137
+ )
138
+ self.num_layers = num_layers
139
+
140
+ def forward(
141
+ self,
142
+ src: torch.Tensor,
143
+ mask: torch.Tensor = None,
144
+ src_key_padding_mask: torch.Tensor = None,
145
+ output_layer: Optional[int] = None,
146
+ ) -> torch.Tensor:
147
+ output = src
148
+ for layer in self.layers[:output_layer]:
149
+ output = layer(
150
+ output, src_mask=mask, src_key_padding_mask=src_key_padding_mask
151
+ )
152
+ return output
153
+
154
+
155
+ def _compute_mask(
156
+ shape: Tuple[int, int],
157
+ mask_prob: float,
158
+ mask_length: int,
159
+ device: torch.device,
160
+ min_masks: int = 0,
161
+ ) -> torch.Tensor:
162
+ batch_size, sequence_length = shape
163
+
164
+ if mask_length < 1:
165
+ raise ValueError("`mask_length` has to be bigger than 0.")
166
+
167
+ if mask_length > sequence_length:
168
+ raise ValueError(
169
+ f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length} and `sequence_length`: {sequence_length}`"
170
+ )
171
+
172
+ # compute number of masked spans in batch
173
+ num_masked_spans = int(mask_prob * sequence_length / mask_length + random.random())
174
+ num_masked_spans = max(num_masked_spans, min_masks)
175
+
176
+ # make sure num masked indices <= sequence_length
177
+ if num_masked_spans * mask_length > sequence_length:
178
+ num_masked_spans = sequence_length // mask_length
179
+
180
+ # SpecAugment mask to fill
181
+ mask = torch.zeros((batch_size, sequence_length), device=device, dtype=torch.bool)
182
+
183
+ # uniform distribution to sample from, make sure that offset samples are < sequence_length
184
+ uniform_dist = torch.ones(
185
+ (batch_size, sequence_length - (mask_length - 1)), device=device
186
+ )
187
+
188
+ # get random indices to mask
189
+ mask_indices = torch.multinomial(uniform_dist, num_masked_spans)
190
+
191
+ # expand masked indices to masked spans
192
+ mask_indices = (
193
+ mask_indices.unsqueeze(dim=-1)
194
+ .expand((batch_size, num_masked_spans, mask_length))
195
+ .reshape(batch_size, num_masked_spans * mask_length)
196
+ )
197
+ offsets = (
198
+ torch.arange(mask_length, device=device)[None, None, :]
199
+ .expand((batch_size, num_masked_spans, mask_length))
200
+ .reshape(batch_size, num_masked_spans * mask_length)
201
+ )
202
+ mask_idxs = mask_indices + offsets
203
+
204
+ # scatter indices to mask
205
+ mask = mask.scatter(1, mask_idxs, True)
206
+
207
+ return mask
208
+
209
+
210
+ def hubert_soft(
211
+ path: str
212
+ ) -> HubertSoft:
213
+ r"""HuBERT-Soft from `"A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion"`.
214
+ Args:
215
+ path (str): path of a pretrained model
216
+ """
217
+ # dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
218
+ dev = torch.device("cpu")
219
+ hubert = HubertSoft()
220
+ checkpoint = torch.load(path)
221
+ consume_prefix_in_state_dict_if_present(checkpoint, "module.")
222
+ hubert.load_state_dict(checkpoint)
223
+ hubert.eval().to(dev)
224
+ return hubert
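A usage sketch for the soft-HuBERT content encoder above, mirroring Svc.get_units in infer_tool.py: 16 kHz mono input of shape [1, 1, samples], 256-dimensional units out (the wav path is illustrative):

import torch
import torchaudio
from sovits.hubert_model import hubert_soft

hubert = hubert_soft("pth/hubert-soft-0d54a1f4.pt")     # CPU, eval mode

wav, sr = torchaudio.load("out_temp.wav")
wav = torch.mean(wav, dim=0, keepdim=True)              # force mono: [1, samples]
wav = torchaudio.functional.resample(wav, sr, 16000)
with torch.inference_mode():
    units = hubert.units(wav.unsqueeze(0))              # [1, frames, 256]
print(units.shape)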
sovits/infer_tool.py ADDED
@@ -0,0 +1,247 @@
1
+ import io  # needed by RealTimeVC.process for the in-memory wav buffer
+ import logging
2
+ import os
3
+ import shutil
4
+ import subprocess
5
+ import time
6
+
7
+ import librosa
8
+ import maad
9
+ import numpy as np
+ import soundfile  # needed by RealTimeVC.process to write the crossfade buffer
10
+ import torch
11
+ import torchaudio
12
+
13
+ from sovits import hubert_model
14
+ from sovits import utils
15
+ from sovits.mel_processing import spectrogram_torch
16
+ from sovits.models import SynthesizerTrn
17
+ from sovits.preprocess_wave import FeatureInput
18
+
19
+ logging.getLogger('matplotlib').setLevel(logging.WARNING)
20
+
21
+
22
+ def timeit(func):
23
+ def run(*args, **kwargs):
24
+ t = time.time()
25
+ res = func(*args, **kwargs)
26
+ print('executing \'%s\' cost %.3fs' % (func.__name__, time.time() - t))
27
+ return res
28
+
29
+ return run
30
+
31
+
32
+ def cut_wav(raw_audio_path, out_audio_name, input_wav_path, cut_time):
33
+ raw_audio, raw_sr = torchaudio.load(raw_audio_path)
34
+ if raw_audio.shape[-1] / raw_sr > cut_time:
35
+ subprocess.Popen(
36
+ f"python ./sovits/slicer.py {raw_audio_path} --out_name {out_audio_name} --out {input_wav_path} --db_thresh -30",
37
+ shell=True).wait()
38
+ else:
39
+ shutil.copy(raw_audio_path, f"{input_wav_path}/{out_audio_name}-00.wav")
40
+
41
+
42
+ def get_end_file(dir_path, end):
43
+ file_lists = []
44
+ for root, dirs, files in os.walk(dir_path):
45
+ files = [f for f in files if f[0] != '.']
46
+ dirs[:] = [d for d in dirs if d[0] != '.']
47
+ for f_file in files:
48
+ if f_file.endswith(end):
49
+ file_lists.append(os.path.join(root, f_file).replace("\\", "/"))
50
+ return file_lists
51
+
52
+
53
+ def resize2d_f0(x, target_len):
54
+ source = np.array(x)
55
+ source[source < 0.001] = np.nan
56
+ target = np.interp(np.arange(0, len(source) * target_len, len(source)) / target_len, np.arange(0, len(source)),
57
+ source)
58
+ res = np.nan_to_num(target)
59
+ return res
60
+
61
+
62
+ def clean_pitch(input_pitch):
63
+ num_nan = np.sum(input_pitch == 1)
64
+ if num_nan / len(input_pitch) > 0.9:
65
+ input_pitch[input_pitch != 1] = 1
66
+ return input_pitch
67
+
68
+
69
+ def plt_pitch(input_pitch):
70
+ input_pitch = input_pitch.astype(float)
71
+ input_pitch[input_pitch == 1] = np.nan
72
+ return input_pitch
73
+
74
+
75
+ def f0_to_pitch(ff):
76
+ f0_pitch = 69 + 12 * np.log2(ff / 440)
77
+ return f0_pitch
78
+
79
+
80
+ def del_temp_wav(path_data):
81
+ for i in get_end_file(path_data, "wav"):  # os.listdir(path_data) would return the relative paths of everything directly under the directory
82
+ os.remove(i)
83
+
84
+
85
+ def fill_a_to_b(a, b):
86
+ if len(a) < len(b):
87
+ for _ in range(0, len(b) - len(a)):
88
+ a.append(a[0])
89
+
90
+
91
+ def mkdir(paths: list):
92
+ for path in paths:
93
+ if not os.path.exists(path):
94
+ os.mkdir(path)
95
+
96
+
97
+ class Svc(object):
98
+ def __init__(self, model_path, config_path, device="cpu"):
99
+ self.model_path = model_path
100
+ self.dev = torch.device(device)
101
+ self.net_g_ms = None
102
+ self.hps_ms = utils.get_hparams_from_file(config_path)
103
+ self.target_sample = self.hps_ms.data.sampling_rate
104
+ self.speakers = self.hps_ms.speakers
105
+ # load the hubert content encoder
106
+ self.hubert_soft = hubert_model.hubert_soft(get_end_file("./pth", "pt")[0])
107
+ self.feature_input = FeatureInput(self.hps_ms.data.sampling_rate, self.hps_ms.data.hop_length)
108
+
109
+ self.load_model()
110
+
111
+ def load_model(self):
112
+ # get the model configuration
113
+ self.net_g_ms = SynthesizerTrn(
114
+ 178,
115
+ self.hps_ms.data.filter_length // 2 + 1,
116
+ self.hps_ms.train.segment_size // self.hps_ms.data.hop_length,
117
+ n_speakers=self.hps_ms.data.n_speakers,
118
+ **self.hps_ms.model)
119
+ _ = utils.load_checkpoint(self.model_path, self.net_g_ms, None)
120
+ if "half" in self.model_path and torch.cuda.is_available():
121
+ _ = self.net_g_ms.half().eval().to(self.dev)
122
+ else:
123
+ _ = self.net_g_ms.eval().to(self.dev)
124
+
125
+ def calc_error(self, in_path, out_path, tran):
126
+ a, s = torchaudio.load(in_path)
127
+ input_pitch = self.feature_input.compute_f0(a.cpu().numpy()[0], s)
128
+ a, s = torchaudio.load(out_path)
129
+ output_pitch = self.feature_input.compute_f0(a.cpu().numpy()[0], s)
130
+ sum_y = []
131
+ if np.sum(input_pitch == 0) / len(input_pitch) > 0.9:
132
+ mistake, var_take = 0, 0
133
+ else:
134
+ for i in range(min(len(input_pitch), len(output_pitch))):
135
+ if input_pitch[i] > 0 and output_pitch[i] > 0:
136
+ sum_y.append(abs(f0_to_pitch(output_pitch[i]) - (f0_to_pitch(input_pitch[i]) + tran)))
137
+ num_y = 0
138
+ for x in sum_y:
139
+ num_y += x
140
+ len_y = len(sum_y) if len(sum_y) else 1
141
+ mistake = round(float(num_y / len_y), 2)
142
+ var_take = round(float(np.std(sum_y, ddof=1)), 2)
143
+ return mistake, var_take
144
+
145
+ def get_units(self, source, sr):
146
+ source = torchaudio.functional.resample(source, sr, 16000)
147
+ if len(source.shape) == 2 and source.shape[1] >= 2:
148
+ source = torch.mean(source, dim=0).unsqueeze(0)
149
+ source = source.unsqueeze(0).to(self.dev)
150
+ with torch.inference_mode():
151
+ units = self.hubert_soft.units(source)
152
+ return units
153
+
154
+ def transcribe(self, source, sr, length, transform):
155
+ feature_pit = self.feature_input.compute_f0(source, sr)
156
+ feature_pit = feature_pit * 2 ** (transform / 12)
157
+ feature_pit = resize2d_f0(feature_pit, length)
158
+ coarse_pit = self.feature_input.coarse_f0(feature_pit)
159
+ return coarse_pit
160
+
161
+ def get_unit_pitch(self, in_path, tran):
162
+ source, sr = torchaudio.load(in_path)
163
+ soft = self.get_units(source, sr).squeeze(0).cpu().numpy()
164
+ input_pitch = self.transcribe(source.cpu().numpy()[0], sr, soft.shape[0], tran)
165
+ return soft, input_pitch
166
+
167
+ def infer(self, speaker_id, tran, raw_path):
168
+ sid = torch.LongTensor([int(speaker_id)]).to(self.dev)
169
+ soft, pitch = self.get_unit_pitch(raw_path, tran)
170
+ pitch = torch.LongTensor(clean_pitch(pitch)).unsqueeze(0).to(self.dev)
171
+ if "half" in self.model_path and torch.cuda.is_available():
172
+ stn_tst = torch.HalfTensor(soft)
173
+ else:
174
+ stn_tst = torch.FloatTensor(soft)
175
+ with torch.no_grad():
176
+ x_tst = stn_tst.unsqueeze(0).to(self.dev)
177
+ x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(self.dev)
178
+ audio = self.net_g_ms.infer(x_tst, x_tst_lengths, pitch, sid=sid, noise_scale=0.3, noise_scale_w=0.5,
179
+ length_scale=1)[0][0, 0].data.float()
180
+ return audio, audio.shape[-1]
181
+
182
+ def load_audio_to_torch(self, full_path):
183
+ audio, sampling_rate = librosa.load(full_path, sr=self.target_sample, mono=True)
184
+ return torch.FloatTensor(audio.astype(np.float32))
185
+
186
+ def vc(self, origin_id, target_id, raw_path):
187
+ audio = self.load_audio_to_torch(raw_path)
188
+ y = audio.unsqueeze(0).to(self.dev)
189
+
190
+ spec = spectrogram_torch(y, self.hps_ms.data.filter_length,
191
+ self.hps_ms.data.sampling_rate, self.hps_ms.data.hop_length,
192
+ self.hps_ms.data.win_length, center=False)
193
+ spec_lengths = torch.LongTensor([spec.size(-1)]).to(self.dev)
194
+ sid_src = torch.LongTensor([origin_id]).to(self.dev)
195
+
196
+ with torch.no_grad():
197
+ sid_tgt = torch.LongTensor([target_id]).to(self.dev)
198
+ audio = self.net_g_ms.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt)[0][
199
+ 0, 0].data.float()
200
+ return audio, audio.shape[-1]
201
+
202
+ def format_wav(self, audio_path):
203
+ raw_audio, raw_sample_rate = torchaudio.load(audio_path)
204
+ if len(raw_audio.shape) == 2 and raw_audio.shape[1] >= 2:
205
+ raw_audio = torch.mean(raw_audio, dim=0).unsqueeze(0)
206
+ tar_audio = torchaudio.functional.resample(raw_audio, raw_sample_rate, self.target_sample)
207
+ torchaudio.save(audio_path[:-4] + ".wav", tar_audio, self.target_sample)
208
+ return tar_audio, self.target_sample
209
+
210
+ def flask_format_wav(self, input_wav_path, daw_sample):
211
+ raw_audio, raw_sample_rate = torchaudio.load(input_wav_path)
212
+ tar_audio = torchaudio.functional.resample(raw_audio, daw_sample, self.target_sample)
213
+ if len(tar_audio.shape) == 2 and tar_audio.shape[1] >= 2:
214
+ tar_audio = torch.mean(tar_audio, dim=0).unsqueeze(0)
215
+ return tar_audio.cpu().numpy(), self.target_sample
216
+
217
+
218
+ class RealTimeVC:
219
+ def __init__(self):
220
+ self.last_chunk = None
221
+ self.last_o = None
222
+ self.chunk_len = 16000 # chunk length
223
+ self.pre_len = 3840 # crossfade length, a multiple of 640
224
+
225
+ """输入输出都是1维numpy 音频波形数组"""
226
+
227
+ def process(self, svc_model, speaker_id, f_pitch_change, input_wav_path):
228
+ audio, sr = torchaudio.load(input_wav_path)
229
+ audio = audio.cpu().numpy()[0]
230
+ temp_wav = io.BytesIO()
231
+ if self.last_chunk is None:
232
+ input_wav_path.seek(0)
233
+ audio, sr = svc_model.infer(speaker_id, f_pitch_change, input_wav_path)
234
+ audio = audio.cpu().numpy()
235
+ self.last_chunk = audio[-self.pre_len:]
236
+ self.last_o = audio
237
+ return audio[-self.chunk_len:]
238
+ else:
239
+ audio = np.concatenate([self.last_chunk, audio])
240
+ soundfile.write(temp_wav, audio, sr, format="wav")
241
+ temp_wav.seek(0)
242
+ audio, sr = svc_model.infer(speaker_id, f_pitch_change, temp_wav)
243
+ audio = audio.cpu().numpy()
244
+ ret = maad.util.crossfade(self.last_o, audio, self.pre_len)
245
+ self.last_chunk = audio[-self.pre_len:]
246
+ self.last_o = audio
247
+ return ret[self.chunk_len:2 * self.chunk_len]
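Putting the pieces together, a hedged sketch of how sovits_inferencer.py presumably drives Svc; the checkpoint and config paths are the ones added in this commit, speaker_id indexes the speakers list in hoshimi_base.json, and tran is the pitch transposition in semitones:

import soundfile
from sovits.infer_tool import Svc

svc = Svc("sovits/G_420000.pth", "sovits/configs/hoshimi_base.json", device="cpu")

# speaker 0 = "hoshimi", no transposition
audio, length = svc.infer(speaker_id=0, tran=0, raw_path="out_temp.wav")
soundfile.write("converted.wav", audio.cpu().numpy(), svc.target_sample)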
sovits/mel_processing.py ADDED
@@ -0,0 +1,112 @@
1
+ import math
2
+ import os
3
+ import random
4
+ import torch
5
+ from torch import nn
6
+ import torch.nn.functional as F
7
+ import torch.utils.data
8
+ import numpy as np
9
+ import librosa
10
+ import librosa.util as librosa_util
11
+ from librosa.util import normalize, pad_center, tiny
12
+ from scipy.signal import get_window
13
+ from scipy.io.wavfile import read
14
+ from librosa.filters import mel as librosa_mel_fn
15
+
16
+ MAX_WAV_VALUE = 32768.0
17
+
18
+
19
+ def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
20
+ """
21
+ PARAMS
22
+ ------
23
+ C: compression factor
24
+ """
25
+ return torch.log(torch.clamp(x, min=clip_val) * C)
26
+
27
+
28
+ def dynamic_range_decompression_torch(x, C=1):
29
+ """
30
+ PARAMS
31
+ ------
32
+ C: compression factor used to compress
33
+ """
34
+ return torch.exp(x) / C
35
+
36
+
37
+ def spectral_normalize_torch(magnitudes):
38
+ output = dynamic_range_compression_torch(magnitudes)
39
+ return output
40
+
41
+
42
+ def spectral_de_normalize_torch(magnitudes):
43
+ output = dynamic_range_decompression_torch(magnitudes)
44
+ return output
45
+
46
+
47
+ mel_basis = {}
48
+ hann_window = {}
49
+
50
+
51
+ def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False):
52
+ if torch.min(y) < -1.:
53
+ print('min value is ', torch.min(y))
54
+ if torch.max(y) > 1.:
55
+ print('max value is ', torch.max(y))
56
+
57
+ global hann_window
58
+ dtype_device = str(y.dtype) + '_' + str(y.device)
59
+ wnsize_dtype_device = str(win_size) + '_' + dtype_device
60
+ if wnsize_dtype_device not in hann_window:
61
+ hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device)
62
+
63
+ y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect')
64
+ y = y.squeeze(1)
65
+
66
+ spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device],
67
+ center=center, pad_mode='reflect', normalized=False, onesided=True)
68
+
69
+ spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
70
+ return spec
71
+
72
+
73
+ def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax):
74
+ global mel_basis
75
+ dtype_device = str(spec.dtype) + '_' + str(spec.device)
76
+ fmax_dtype_device = str(fmax) + '_' + dtype_device
77
+ if fmax_dtype_device not in mel_basis:
78
+ mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax)
79
+ mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=spec.dtype, device=spec.device)
80
+ spec = torch.matmul(mel_basis[fmax_dtype_device], spec)
81
+ spec = spectral_normalize_torch(spec)
82
+ return spec
83
+
84
+
85
+ def mel_spectrogram_torch(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False):
86
+ if torch.min(y) < -1.:
87
+ print('min value is ', torch.min(y))
88
+ if torch.max(y) > 1.:
89
+ print('max value is ', torch.max(y))
90
+
91
+ global mel_basis, hann_window
92
+ dtype_device = str(y.dtype) + '_' + str(y.device)
93
+ fmax_dtype_device = str(fmax) + '_' + dtype_device
94
+ wnsize_dtype_device = str(win_size) + '_' + dtype_device
95
+ if fmax_dtype_device not in mel_basis:
96
+ mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax)
97
+ mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=y.dtype, device=y.device)
98
+ if wnsize_dtype_device not in hann_window:
99
+ hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device)
100
+
101
+ y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect')
102
+ y = y.squeeze(1)
103
+
104
+ spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device],
105
+ center=center, pad_mode='reflect', normalized=False, onesided=True)
106
+
107
+ spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
108
+
109
+ spec = torch.matmul(mel_basis[fmax_dtype_device], spec)
110
+ spec = spectral_normalize_torch(spec)
111
+
112
+ return spec
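As used in Svc.vc above, spectrogram_torch turns a [1, samples] waveform into a linear spectrogram using the filter_length / hop_length / win_length values from hoshimi_base.json; a small shape check (random audio kept inside [-1, 1] to avoid the clipping warnings):

import torch
from sovits.mel_processing import spectrogram_torch

y = torch.rand(1, 32000) * 2 - 1      # one second of 32 kHz audio
spec = spectrogram_torch(y, n_fft=1024, sampling_rate=32000,
                         hop_size=320, win_size=1024, center=False)
print(spec.shape)                     # [1, 513, ~100]; 513 = filter_length // 2 + 1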
sovits/models.py ADDED
@@ -0,0 +1,418 @@
1
+ import torch
2
+ from torch import nn
3
+ from torch.nn import Conv1d, ConvTranspose1d, Conv2d
4
+ from torch.nn import functional as F
5
+ from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
6
+
7
+ from sovits import attentions
8
+ from sovits import commons
9
+ from sovits import modules
10
+ from sovits.commons import init_weights, get_padding
11
+ from sovits.vdecoder.hifigan.hifigan import HifiGanGenerator
12
+
13
+
14
+ # import monotonic_align
15
+
16
+
17
+ class TextEncoder(nn.Module):
18
+ def __init__(self,
19
+ n_vocab,
20
+ out_channels,
21
+ hidden_channels,
22
+ filter_channels,
23
+ n_heads,
24
+ n_layers,
25
+ kernel_size,
26
+ p_dropout):
27
+ super().__init__()
28
+ self.n_vocab = n_vocab
29
+ self.out_channels = out_channels
30
+ self.hidden_channels = hidden_channels
31
+ self.filter_channels = filter_channels
32
+ self.n_heads = n_heads
33
+ self.n_layers = n_layers
34
+ self.kernel_size = kernel_size
35
+ self.p_dropout = p_dropout
36
+
37
+ # self.emb = nn.Embedding(n_vocab, hidden_channels)
38
+ # nn.init.normal_(self.emb.weight, 0.0, hidden_channels**-0.5)
39
+ self.emb_pitch = nn.Embedding(256, hidden_channels)
40
+ nn.init.normal_(self.emb_pitch.weight, 0.0, hidden_channels ** -0.5)
41
+
42
+ self.encoder = attentions.Encoder(
43
+ hidden_channels,
44
+ filter_channels,
45
+ n_heads,
46
+ n_layers,
47
+ kernel_size,
48
+ p_dropout)
49
+ self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
50
+
51
+ def forward(self, x, x_lengths, pitch):
52
+ # x = x.transpose(1,2)
53
+ # x = self.emb(x) * math.sqrt(self.hidden_channels) # [b, t, h]
54
+ # print(x.shape)
55
+ x = x + self.emb_pitch(pitch)
56
+ x = torch.transpose(x, 1, -1) # [b, h, t]
57
+ x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
58
+
59
+ x = self.encoder(x * x_mask, x_mask)
60
+ stats = self.proj(x) * x_mask
61
+
62
+ m, logs = torch.split(stats, self.out_channels, dim=1)
63
+ return x, m, logs, x_mask
64
+
65
+
66
+ class ResidualCouplingBlock(nn.Module):
67
+ def __init__(self,
68
+ channels,
69
+ hidden_channels,
70
+ kernel_size,
71
+ dilation_rate,
72
+ n_layers,
73
+ n_flows=4,
74
+ gin_channels=0):
75
+ super().__init__()
76
+ self.channels = channels
77
+ self.hidden_channels = hidden_channels
78
+ self.kernel_size = kernel_size
79
+ self.dilation_rate = dilation_rate
80
+ self.n_layers = n_layers
81
+ self.n_flows = n_flows
82
+ self.gin_channels = gin_channels
83
+
84
+ self.flows = nn.ModuleList()
85
+ for i in range(n_flows):
86
+ self.flows.append(
87
+ modules.ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers,
88
+ gin_channels=gin_channels, mean_only=True))
89
+ self.flows.append(modules.Flip())
90
+
91
+ def forward(self, x, x_mask, g=None, reverse=False):
92
+ if not reverse:
93
+ for flow in self.flows:
94
+ x, _ = flow(x, x_mask, g=g, reverse=reverse)
95
+ else:
96
+ for flow in reversed(self.flows):
97
+ x = flow(x, x_mask, g=g, reverse=reverse)
98
+ return x
99
+
100
+
101
+ class PosteriorEncoder(nn.Module):
102
+ def __init__(self,
103
+ in_channels,
104
+ out_channels,
105
+ hidden_channels,
106
+ kernel_size,
107
+ dilation_rate,
108
+ n_layers,
109
+ gin_channels=0):
110
+ super().__init__()
111
+ self.in_channels = in_channels
112
+ self.out_channels = out_channels
113
+ self.hidden_channels = hidden_channels
114
+ self.kernel_size = kernel_size
115
+ self.dilation_rate = dilation_rate
116
+ self.n_layers = n_layers
117
+ self.gin_channels = gin_channels
118
+
119
+ self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
120
+ self.enc = modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels)
121
+ self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
122
+
123
+ def forward(self, x, x_lengths, g=None):
124
+ x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
125
+ x = self.pre(x) * x_mask
126
+ x = self.enc(x, x_mask, g=g)
127
+ stats = self.proj(x) * x_mask
128
+ m, logs = torch.split(stats, self.out_channels, dim=1)
129
+ z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
130
+ return z, m, logs, x_mask
131
+
132
+
133
+ class Generator(torch.nn.Module):
134
+ def __init__(self, initial_channel, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates,
135
+ upsample_initial_channel, upsample_kernel_sizes, gin_channels=0):
136
+ super(Generator, self).__init__()
137
+ self.num_kernels = len(resblock_kernel_sizes)
138
+ self.num_upsamples = len(upsample_rates)
139
+ self.conv_pre = Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3)
140
+ resblock = modules.ResBlock1 if resblock == '1' else modules.ResBlock2
141
+
142
+ self.ups = nn.ModuleList()
143
+ for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
144
+ self.ups.append(weight_norm(
145
+ ConvTranspose1d(upsample_initial_channel // (2 ** i), upsample_initial_channel // (2 ** (i + 1)),
146
+ k, u, padding=(k - u) // 2)))
147
+
148
+ self.resblocks = nn.ModuleList()
149
+ for i in range(len(self.ups)):
150
+ ch = upsample_initial_channel // (2 ** (i + 1))
151
+ for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
152
+ self.resblocks.append(resblock(ch, k, d))
153
+
154
+ self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
155
+ self.ups.apply(init_weights)
156
+
157
+ if gin_channels != 0:
158
+ self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
159
+
160
+ def forward(self, x, g=None):
161
+ x = self.conv_pre(x)
162
+ if g is not None:
163
+ x = x + self.cond(g)
164
+
165
+ for i in range(self.num_upsamples):
166
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
167
+ x = self.ups[i](x)
168
+ xs = None
169
+ for j in range(self.num_kernels):
170
+ if xs is None:
171
+ xs = self.resblocks[i * self.num_kernels + j](x)
172
+ else:
173
+ xs += self.resblocks[i * self.num_kernels + j](x)
174
+ x = xs / self.num_kernels
175
+ x = F.leaky_relu(x)
176
+ x = self.conv_post(x)
177
+ x = torch.tanh(x)
178
+
179
+ return x
180
+
181
+ def remove_weight_norm(self):
182
+ print('Removing weight norm...')
183
+ for l in self.ups:
184
+ remove_weight_norm(l)
185
+ for l in self.resblocks:
186
+ l.remove_weight_norm()
187
+
188
+
189
+ class DiscriminatorP(torch.nn.Module):
190
+ def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
191
+ super(DiscriminatorP, self).__init__()
192
+ self.period = period
193
+ self.use_spectral_norm = use_spectral_norm
194
+ norm_f = weight_norm if use_spectral_norm == False else spectral_norm
195
+ self.convs = nn.ModuleList([
196
+ norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
197
+ norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
198
+ norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
199
+ norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
200
+ norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(get_padding(kernel_size, 1), 0))),
201
+ ])
202
+ self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
203
+
204
+ def forward(self, x):
205
+ fmap = []
206
+
207
+ # 1d to 2d
208
+ b, c, t = x.shape
209
+ if t % self.period != 0: # pad first
210
+ n_pad = self.period - (t % self.period)
211
+ x = F.pad(x, (0, n_pad), "reflect")
212
+ t = t + n_pad
213
+ x = x.view(b, c, t // self.period, self.period)
214
+
215
+ for l in self.convs:
216
+ x = l(x)
217
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
218
+ fmap.append(x)
219
+ x = self.conv_post(x)
220
+ fmap.append(x)
221
+ x = torch.flatten(x, 1, -1)
222
+
223
+ return x, fmap
224
+
225
+
226
+ class DiscriminatorS(torch.nn.Module):
227
+ def __init__(self, use_spectral_norm=False):
228
+ super(DiscriminatorS, self).__init__()
229
+ norm_f = weight_norm if use_spectral_norm == False else spectral_norm
230
+ self.convs = nn.ModuleList([
231
+ norm_f(Conv1d(1, 16, 15, 1, padding=7)),
232
+ norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
233
+ norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
234
+ norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
235
+ norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
236
+ norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
237
+ ])
238
+ self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
239
+
240
+ def forward(self, x):
241
+ fmap = []
242
+
243
+ for l in self.convs:
244
+ x = l(x)
245
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
246
+ fmap.append(x)
247
+ x = self.conv_post(x)
248
+ fmap.append(x)
249
+ x = torch.flatten(x, 1, -1)
250
+
251
+ return x, fmap
252
+
253
+
254
+ class MultiPeriodDiscriminator(torch.nn.Module):
255
+ def __init__(self, use_spectral_norm=False):
256
+ super(MultiPeriodDiscriminator, self).__init__()
257
+ periods = [2, 3, 5, 7, 11]
258
+
259
+ discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
260
+ discs = discs + [DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods]
261
+ self.discriminators = nn.ModuleList(discs)
262
+
263
+ def forward(self, y, y_hat):
264
+ y_d_rs = []
265
+ y_d_gs = []
266
+ fmap_rs = []
267
+ fmap_gs = []
268
+ for i, d in enumerate(self.discriminators):
269
+ y_d_r, fmap_r = d(y)
270
+ y_d_g, fmap_g = d(y_hat)
271
+ y_d_rs.append(y_d_r)
272
+ y_d_gs.append(y_d_g)
273
+ fmap_rs.append(fmap_r)
274
+ fmap_gs.append(fmap_g)
275
+
276
+ return y_d_rs, y_d_gs, fmap_rs, fmap_gs
277
+
278
+
279
+ class SynthesizerTrn(nn.Module):
280
+ """
281
+ Synthesizer for Training
282
+ """
283
+
284
+ def __init__(self,
285
+ n_vocab,
286
+ spec_channels,
287
+ segment_size,
288
+ inter_channels,
289
+ hidden_channels,
290
+ filter_channels,
291
+ n_heads,
292
+ n_layers,
293
+ kernel_size,
294
+ p_dropout,
295
+ resblock,
296
+ resblock_kernel_sizes,
297
+ resblock_dilation_sizes,
298
+ upsample_rates,
299
+ upsample_initial_channel,
300
+ upsample_kernel_sizes,
301
+ n_speakers=0,
302
+ gin_channels=0,
303
+ use_sdp=True,
304
+ **kwargs):
305
+
306
+ super().__init__()
307
+ self.n_vocab = n_vocab
308
+ self.spec_channels = spec_channels
309
+ self.inter_channels = inter_channels
310
+ self.hidden_channels = hidden_channels
311
+ self.filter_channels = filter_channels
312
+ self.n_heads = n_heads
313
+ self.n_layers = n_layers
314
+ self.kernel_size = kernel_size
315
+ self.p_dropout = p_dropout
316
+ self.resblock = resblock
317
+ self.resblock_kernel_sizes = resblock_kernel_sizes
318
+ self.resblock_dilation_sizes = resblock_dilation_sizes
319
+ self.upsample_rates = upsample_rates
320
+ self.upsample_initial_channel = upsample_initial_channel
321
+ self.upsample_kernel_sizes = upsample_kernel_sizes
322
+ self.segment_size = segment_size
323
+ self.n_speakers = n_speakers
324
+ self.gin_channels = gin_channels
325
+
326
+ self.use_sdp = use_sdp
327
+
328
+ self.enc_p = TextEncoder(n_vocab,
329
+ inter_channels,
330
+ hidden_channels,
331
+ filter_channels,
332
+ n_heads,
333
+ n_layers,
334
+ kernel_size,
335
+ p_dropout)
336
+ # self.dec = Generator(inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates,
337
+ # upsample_initial_channel, upsample_kernel_sizes, gin_channels=gin_channels)
338
+ #
339
+ # from DiffSinger: modules.hifigan.hifigan.HifiGanGenerator
340
+ # hps = {
341
+ # "resblock_kernel_sizes": [3, 7, 11],
342
+ # "upsample_rates": [8, 8, 2, 2],
343
+ # "upsample_initial_channel": 128,
344
+ # "use_pitch_embed": True,
345
+ # "audio_sample_rate": kwargs["sampling_rate"],
346
+ # "resblock": "1",
347
+ # "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
348
+ # }
349
+ # from sovits json config hps
350
+ hps = {
351
+ "resblock_kernel_sizes": resblock_kernel_sizes,
352
+ "inter_channels": inter_channels,
353
+ "upsample_rates": upsample_rates,
354
+ "upsample_kernel_sizes": upsample_kernel_sizes,
355
+ "upsample_initial_channel": upsample_initial_channel,
356
+ "use_pitch_embed": True,
357
+ "audio_sample_rate": kwargs["sampling_rate"],
358
+ "resblock": "1",
359
+ "resblock_dilation_sizes": resblock_dilation_sizes
360
+ }
361
+ self.dec = HifiGanGenerator(h=hps)
362
+ self.enc_q = PosteriorEncoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16,
363
+ gin_channels=gin_channels)
364
+ self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels)
365
+
366
+ if n_speakers > 1:
367
+ self.emb_g = nn.Embedding(n_speakers, gin_channels)
368
+
369
+ def forward(self, x, x_lengths, y, y_lengths, pitch, sid=None):
370
+ assert 0 <= y.shape[2] - x.shape[1] * 2 <= 1, (y.shape[2], x.shape[1] * 2, sid)
371
+ if y.shape[2] != x.shape[1] * 2:
372
+ y_lengths[y_lengths == y.shape[2]] -= 1
373
+ y = y[:, :, :x.shape[1] * 2]
374
+
375
+ x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths, pitch)
376
+ if self.n_speakers > 0:
377
+ g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1]
378
+ else:
379
+ g = None
380
+ z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
381
+ z_p = self.flow(z, y_mask, g=g)
382
+
383
+ m_p = torch.repeat_interleave(m_p, repeats=2, dim=2)
384
+ logs_p = torch.repeat_interleave(logs_p, repeats=2, dim=2)
385
+ # print(x.shape, y.shape, z.shape, pitch.shape)
386
+ z_slice, pitch_slice, ids_slice = commons.rand_slice_segments_with_pitch(z, torch.repeat_interleave(pitch,
387
+ repeats=2,
388
+ dim=1),
389
+ y_lengths, self.segment_size)
390
+ o = self.dec(z_slice, f0=pitch_slice)
391
+ return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
392
+
393
+ def infer(self, x, x_lengths, pitch, sid=None, noise_scale=1, length_scale=1, noise_scale_w=1., max_len=None):
394
+ x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths, pitch)
395
+ if self.n_speakers > 0:
396
+ g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1]
397
+ else:
398
+ g = None
399
+ m_p = torch.repeat_interleave(m_p, repeats=2, dim=2)
400
+ logs_p = torch.repeat_interleave(logs_p, repeats=2, dim=2)
401
+ x_mask = torch.repeat_interleave(x_mask, repeats=2, dim=2)
402
+ z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale
403
+ z = self.flow(z_p, x_mask, g=g, reverse=True)
404
+ # o = self.dec((z * x_mask)[:, :, :max_len], g=g)
405
+ # print(x.shape, pitch.shape, sid)
406
+ # print()
407
+ o = self.dec((z * x_mask)[:, :, :max_len], f0=torch.repeat_interleave(pitch, repeats=2, dim=1))
408
+ return o, x_mask, (z, z_p, m_p, logs_p)
409
+
410
+ def voice_conversion(self, y, y_lengths, sid_src, sid_tgt):
411
+ assert self.n_speakers > 0, "n_speakers has to be larger than 0."
412
+ g_src = self.emb_g(sid_src).unsqueeze(-1)
413
+ g_tgt = self.emb_g(sid_tgt).unsqueeze(-1)
414
+ z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g_src)
415
+ z_p = self.flow(z, y_mask, g=g_src)
416
+ z_hat = self.flow(z_p, y_mask, g=g_tgt, reverse=True)
417
+ o_hat = self.dec(z_hat * y_mask, g=g_tgt)
418
+ return o_hat, y_mask, (z, z_p, z_hat)
sovits/models/G_0.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2cdefa60177b3963d335f36954f40e1bf77dfe4c5d0726325bbf206f49297ffa
3
+ size 633845309
sovits/models/G_16000.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e85d9491d0d362e7e2fb4f86ff6e94e86f78e7b0e63d3b19064a8231c777d364
3
+ size 633845309
sovits/modules.py ADDED
@@ -0,0 +1,353 @@
1
+ import math
2
+
3
+ import torch
4
+ from torch import nn
5
+ from torch.nn import Conv1d
6
+ from torch.nn import functional as t_func
7
+ from torch.nn.utils import weight_norm, remove_weight_norm
8
+
9
+ from sovits import commons
10
+ from sovits.commons import init_weights, get_padding
11
+ from sovits.transforms import piecewise_rational_quadratic_transform
12
+
13
+ LRELU_SLOPE = 0.1
14
+
15
+
16
+ class LayerNorm(nn.Module):
17
+ def __init__(self, channels, eps=1e-5):
18
+ super().__init__()
19
+ self.channels = channels
20
+ self.eps = eps
21
+
22
+ self.gamma = nn.Parameter(torch.ones(channels))
23
+ self.beta = nn.Parameter(torch.zeros(channels))
24
+
25
+ def forward(self, x):
26
+ x = x.transpose(1, -1)
27
+ x = t_func.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
28
+ return x.transpose(1, -1)
29
+
30
+
31
+ class DDSConv(nn.Module):
32
+ """
33
+ Dilated and Depth-Separable Convolution
34
+ """
35
+
36
+ def __init__(self, channels, kernel_size, n_layers, p_dropout=0.):
37
+ super().__init__()
38
+ self.channels = channels
39
+ self.kernel_size = kernel_size
40
+ self.n_layers = n_layers
41
+ self.p_dropout = p_dropout
42
+
43
+ self.drop = nn.Dropout(p_dropout)
44
+ self.convs_sep = nn.ModuleList()
45
+ self.convs_1x1 = nn.ModuleList()
46
+ self.norms_1 = nn.ModuleList()
47
+ self.norms_2 = nn.ModuleList()
48
+ for i in range(n_layers):
49
+ dilation = kernel_size ** i
50
+ padding = (kernel_size * dilation - dilation) // 2
51
+ self.convs_sep.append(nn.Conv1d(channels, channels, kernel_size,
52
+ groups=channels, dilation=dilation, padding=padding
53
+ ))
54
+ self.convs_1x1.append(nn.Conv1d(channels, channels, 1))
55
+ self.norms_1.append(LayerNorm(channels))
56
+ self.norms_2.append(LayerNorm(channels))
57
+
58
+ def forward(self, x, x_mask, g=None):
59
+ if g is not None:
60
+ x = x + g
61
+ for i in range(self.n_layers):
62
+ y = self.convs_sep[i](x * x_mask)
63
+ y = self.norms_1[i](y)
64
+ y = t_func.gelu(y)
65
+ y = self.convs_1x1[i](y)
66
+ y = self.norms_2[i](y)
67
+ y = t_func.gelu(y)
68
+ y = self.drop(y)
69
+ x = x + y
70
+ return x * x_mask
71
+
72
+
73
+ class WN(torch.nn.Module):
74
+ def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0):
75
+ super(WN, self).__init__()
76
+ assert (kernel_size % 2 == 1)
77
+ self.hidden_channels = hidden_channels
78
+ self.kernel_size = kernel_size
79
+ self.dilation_rate = dilation_rate
80
+ self.n_layers = n_layers
81
+ self.gin_channels = gin_channels
82
+ self.p_dropout = p_dropout
83
+
84
+ self.in_layers = torch.nn.ModuleList()
85
+ self.res_skip_layers = torch.nn.ModuleList()
86
+ self.drop = nn.Dropout(p_dropout)
87
+
88
+ if gin_channels != 0:
89
+ cond_layer = torch.nn.Conv1d(gin_channels, 2 * hidden_channels * n_layers, 1)
90
+ self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight')
91
+
92
+ for i in range(n_layers):
93
+ dilation = dilation_rate ** i
94
+ padding = int((kernel_size * dilation - dilation) / 2)
95
+ in_layer = torch.nn.Conv1d(hidden_channels, 2 * hidden_channels, kernel_size,
96
+ dilation=dilation, padding=padding)
97
+ in_layer = torch.nn.utils.weight_norm(in_layer, name='weight')
98
+ self.in_layers.append(in_layer)
99
+
100
+ # last one is not necessary
101
+ if i < n_layers - 1:
102
+ res_skip_channels = 2 * hidden_channels
103
+ else:
104
+ res_skip_channels = hidden_channels
105
+
106
+ res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
107
+ res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight')
108
+ self.res_skip_layers.append(res_skip_layer)
109
+
110
+ def forward(self, x, x_mask, g=None, **kwargs):
111
+ output = torch.zeros_like(x)
112
+ n_channels_tensor = torch.IntTensor([self.hidden_channels])
113
+
114
+ if g is not None:
115
+ g = self.cond_layer(g)
116
+
117
+ for i in range(self.n_layers):
118
+ x_in = self.in_layers[i](x)
119
+ if g is not None:
120
+ cond_offset = i * 2 * self.hidden_channels
121
+ g_l = g[:, cond_offset:cond_offset + 2 * self.hidden_channels, :]
122
+ else:
123
+ g_l = torch.zeros_like(x_in)
124
+
125
+ acts = commons.fused_add_tanh_sigmoid_multiply(
126
+ x_in,
127
+ g_l,
128
+ n_channels_tensor)
129
+ acts = self.drop(acts)
130
+
131
+ res_skip_acts = self.res_skip_layers[i](acts)
132
+ if i < self.n_layers - 1:
133
+ res_acts = res_skip_acts[:, :self.hidden_channels, :]
134
+ x = (x + res_acts) * x_mask
135
+ output = output + res_skip_acts[:, self.hidden_channels:, :]
136
+ else:
137
+ output = output + res_skip_acts
138
+ return output * x_mask
139
+
140
+ def remove_weight_norm(self):
141
+ if self.gin_channels != 0:
142
+ torch.nn.utils.remove_weight_norm(self.cond_layer)
143
+ for l in self.in_layers:
144
+ torch.nn.utils.remove_weight_norm(l)
145
+ for l in self.res_skip_layers:
146
+ torch.nn.utils.remove_weight_norm(l)
147
+
148
+
149
+ class ResBlock1(torch.nn.Module):
150
+ def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
151
+ super(ResBlock1, self).__init__()
152
+ self.convs1 = nn.ModuleList([
153
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
154
+ padding=get_padding(kernel_size, dilation[0]))),
155
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
156
+ padding=get_padding(kernel_size, dilation[1]))),
157
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2],
158
+ padding=get_padding(kernel_size, dilation[2])))
159
+ ])
160
+ self.convs1.apply(init_weights)
161
+
162
+ self.convs2 = nn.ModuleList([
163
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
164
+ padding=get_padding(kernel_size, 1))),
165
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
166
+ padding=get_padding(kernel_size, 1))),
167
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
168
+ padding=get_padding(kernel_size, 1)))
169
+ ])
170
+ self.convs2.apply(init_weights)
171
+
172
+ def forward(self, x, x_mask=None):
173
+ for c1, c2 in zip(self.convs1, self.convs2):
174
+ xt = t_func.leaky_relu(x, LRELU_SLOPE)
175
+ if x_mask is not None:
176
+ xt = xt * x_mask
177
+ xt = c1(xt)
178
+ xt = t_func.leaky_relu(xt, LRELU_SLOPE)
179
+ if x_mask is not None:
180
+ xt = xt * x_mask
181
+ xt = c2(xt)
182
+ x = xt + x
183
+ if x_mask is not None:
184
+ x = x * x_mask
185
+ return x
186
+
187
+ def remove_weight_norm(self):
188
+ for l in self.convs1:
189
+ remove_weight_norm(l)
190
+ for l in self.convs2:
191
+ remove_weight_norm(l)
192
+
193
+
194
+ class ResBlock2(torch.nn.Module):
195
+ def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
196
+ super(ResBlock2, self).__init__()
197
+ self.convs = nn.ModuleList([
198
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
199
+ padding=get_padding(kernel_size, dilation[0]))),
200
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
201
+ padding=get_padding(kernel_size, dilation[1])))
202
+ ])
203
+ self.convs.apply(init_weights)
204
+
205
+ def forward(self, x, x_mask=None):
206
+ for c in self.convs:
207
+ xt = t_func.leaky_relu(x, LRELU_SLOPE)
208
+ if x_mask is not None:
209
+ xt = xt * x_mask
210
+ xt = c(xt)
211
+ x = xt + x
212
+ if x_mask is not None:
213
+ x = x * x_mask
214
+ return x
215
+
216
+ def remove_weight_norm(self):
217
+ for l in self.convs:
218
+ remove_weight_norm(l)
219
+
220
+
221
+ class Log(nn.Module):
222
+ def forward(self, x, x_mask, reverse=False, **kwargs):
223
+ if not reverse:
224
+ y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask
225
+ logdet = torch.sum(-y, [1, 2])
226
+ return y, logdet
227
+ else:
228
+ x = torch.exp(x) * x_mask
229
+ return x
230
+
231
+
232
+ class Flip(nn.Module):
233
+ def forward(self, x, *args, reverse=False, **kwargs):
234
+ x = torch.flip(x, [1])
235
+ if not reverse:
236
+ logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device)
237
+ return x, logdet
238
+ else:
239
+ return x
240
+
241
+
242
+ class ElementwiseAffine(nn.Module):
243
+ def __init__(self, channels):
244
+ super().__init__()
245
+ self.channels = channels
246
+ self.m = nn.Parameter(torch.zeros(channels, 1))
247
+ self.logs = nn.Parameter(torch.zeros(channels, 1))
248
+
249
+ def forward(self, x, x_mask, reverse=False, **kwargs):
250
+ if not reverse:
251
+ y = self.m + torch.exp(self.logs) * x
252
+ y = y * x_mask
253
+ logdet = torch.sum(self.logs * x_mask, [1, 2])
254
+ return y, logdet
255
+ else:
256
+ x = (x - self.m) * torch.exp(-self.logs) * x_mask
257
+ return x
258
+
259
+
260
+ class ResidualCouplingLayer(nn.Module):
261
+ def __init__(self,
262
+ channels,
263
+ hidden_channels,
264
+ kernel_size,
265
+ dilation_rate,
266
+ n_layers,
267
+ p_dropout=0,
268
+ gin_channels=0,
269
+ mean_only=False):
270
+ assert channels % 2 == 0, "channels should be divisible by 2"
271
+ super().__init__()
272
+ self.channels = channels
273
+ self.hidden_channels = hidden_channels
274
+ self.kernel_size = kernel_size
275
+ self.dilation_rate = dilation_rate
276
+ self.n_layers = n_layers
277
+ self.half_channels = channels // 2
278
+ self.mean_only = mean_only
279
+
280
+ self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
281
+ self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout,
282
+ gin_channels=gin_channels)
283
+ self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
284
+ self.post.weight.data.zero_()
285
+ self.post.bias.data.zero_()
286
+
287
+ def forward(self, x, x_mask, g=None, reverse=False):
288
+ x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
289
+ h = self.pre(x0) * x_mask
290
+ h = self.enc(h, x_mask, g=g)
291
+ stats = self.post(h) * x_mask
292
+ if not self.mean_only:
293
+ m, logs = torch.split(stats, [self.half_channels] * 2, 1)
294
+ else:
295
+ m = stats
296
+ logs = torch.zeros_like(m)
297
+
298
+ if not reverse:
299
+ x1 = m + x1 * torch.exp(logs) * x_mask
300
+ x = torch.cat([x0, x1], 1)
301
+ logdet = torch.sum(logs, [1, 2])
302
+ return x, logdet
303
+ else:
304
+ x1 = (x1 - m) * torch.exp(-logs) * x_mask
305
+ x = torch.cat([x0, x1], 1)
306
+ return x
307
+
308
+
309
+ class ConvFlow(nn.Module):
310
+ def __init__(self, in_channels, filter_channels, kernel_size, n_layers, num_bins=10, tail_bound=5.0):
311
+ super().__init__()
312
+ self.in_channels = in_channels
313
+ self.filter_channels = filter_channels
314
+ self.kernel_size = kernel_size
315
+ self.n_layers = n_layers
316
+ self.num_bins = num_bins
317
+ self.tail_bound = tail_bound
318
+ self.half_channels = in_channels // 2
319
+
320
+ self.pre = nn.Conv1d(self.half_channels, filter_channels, 1)
321
+ self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.)
322
+ self.proj = nn.Conv1d(filter_channels, self.half_channels * (num_bins * 3 - 1), 1)
323
+ self.proj.weight.data.zero_()
324
+ self.proj.bias.data.zero_()
325
+
326
+ def forward(self, x, x_mask, g=None, reverse=False):
327
+ x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
328
+ h = self.pre(x0)
329
+ h = self.convs(h, x_mask, g=g)
330
+ h = self.proj(h) * x_mask
331
+
332
+ b, c, t = x0.shape
333
+ h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2) # [b, cx?, t] -> [b, c, t, ?]
334
+
335
+ unnormalized_widths = h[..., :self.num_bins] / math.sqrt(self.filter_channels)
336
+ unnormalized_heights = h[..., self.num_bins:2 * self.num_bins] / math.sqrt(self.filter_channels)
337
+ unnormalized_derivatives = h[..., 2 * self.num_bins:]
338
+
339
+ x1, logabsdet = piecewise_rational_quadratic_transform(x1,
340
+ unnormalized_widths,
341
+ unnormalized_heights,
342
+ unnormalized_derivatives,
343
+ inverse=reverse,
344
+ tails='linear',
345
+ tail_bound=self.tail_bound
346
+ )
347
+
348
+ x = torch.cat([x0, x1], 1) * x_mask
349
+ logdet = torch.sum(logabsdet * x_mask, [1, 2])
350
+ if not reverse:
351
+ return x, logdet
352
+ else:
353
+ return x
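A quick sanity-check sketch (not part of the commit; shapes and hyper-parameters are arbitrary) showing that the ResidualCouplingLayer above is an exact forward/reverse pair, which is what lets the flow in sovits/models.py run forward during training and reversed at inference:

import torch
from sovits.modules import ResidualCouplingLayer

layer = ResidualCouplingLayer(channels=192, hidden_channels=192, kernel_size=5,
                              dilation_rate=1, n_layers=4, mean_only=True)
x = torch.randn(1, 192, 40)          # (batch, channels, frames)
x_mask = torch.ones(1, 1, 40)
y, logdet = layer(x, x_mask)         # forward direction returns (output, log-determinant)
x_rec = layer(y, x_mask, reverse=True)
print(torch.allclose(x, x_rec, atol=1e-5))   # True: the coupling is invertible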
sovits/preprocess_wave.py ADDED
@@ -0,0 +1,67 @@
1
+ import numpy as np
2
+ import pyworld
3
+ from scipy.io import wavfile
4
+
5
+
6
+ class FeatureInput(object):
7
+ def __init__(self, samplerate=16000, hop_size=160):
8
+ self.fs = samplerate
9
+ self.hop = hop_size
10
+
11
+ self.f0_bin = 256
12
+ self.f0_max = 1100.0
13
+ self.f0_min = 50.0
14
+ self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700)
15
+ self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700)
16
+
17
+ def compute_f0(self, audio, sr):
18
+ x, sr = audio, self.fs
19
+ assert sr == self.fs
20
+ f0, t = pyworld.dio(
21
+ x.astype(np.double),
22
+ fs=sr,
23
+ f0_ceil=800,
24
+ frame_period=1000 * self.hop / sr,
25
+ )
26
+ f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.fs)
27
+ for index, pitch in enumerate(f0):
28
+ f0[index] = round(pitch, 1)
29
+ return f0
30
+
31
+ # for numpy # code from diffsinger
32
+ def coarse_f0(self, f0):
33
+ f0_mel = 1127 * np.log(1 + f0 / 700)
34
+ f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * (
35
+ self.f0_bin - 2
36
+ ) / (self.f0_mel_max - self.f0_mel_min) + 1
37
+
38
+ # use 0 or 1
39
+ f0_mel[f0_mel <= 1] = 1
40
+ f0_mel[f0_mel > self.f0_bin - 1] = self.f0_bin - 1
41
+ f0_coarse = np.rint(f0_mel).astype(np.int)
42
+ assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, (
43
+ f0_coarse.max(),
44
+ f0_coarse.min(),
45
+ )
46
+ return f0_coarse
47
+
48
+ # for tensor # code from diffsinger
49
+ def coarse_f0_ts(self, f0):
50
+ f0_mel = 1127 * (1 + f0 / 700).log()
51
+ f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * (
52
+ self.f0_bin - 2
53
+ ) / (self.f0_mel_max - self.f0_mel_min) + 1
54
+
55
+ # use 0 or 1
56
+ f0_mel[f0_mel <= 1] = 1
57
+ f0_mel[f0_mel > self.f0_bin - 1] = self.f0_bin - 1
58
+ f0_coarse = (f0_mel + 0.5).long()
59
+ assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, (
60
+ f0_coarse.max(),
61
+ f0_coarse.min(),
62
+ )
63
+ return f0_coarse
64
+
65
+ def save_wav(self, wav, path):
66
+ wav *= 32767 / max(0.01, np.max(np.abs(wav))) * 0.6
67
+ wavfile.write(path, self.fs, wav.astype(np.int16))
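A hedged usage sketch for FeatureInput (not from this commit; the file name and the soundfile-based loading are assumptions). It computes frame-level F0 with pyworld and quantizes it into the coarse 1..255 bins the model consumes:

import numpy as np
import soundfile as sf
from sovits.preprocess_wave import FeatureInput

wav, sr = sf.read("input_16k.wav")            # assumed: mono audio already at 16 kHz
feature_input = FeatureInput(samplerate=sr, hop_size=160)
f0 = feature_input.compute_f0(wav.astype(np.double), sr)   # pyworld dio + stonemask, one value per hop
coarse = feature_input.coarse_f0(f0)                        # mel-scaled quantization into bins 1..255
print(f0.shape, int(coarse.min()), int(coarse.max()))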
sovits/slicer.py ADDED
@@ -0,0 +1,166 @@
1
+ import os.path
2
+ import time
3
+ from argparse import ArgumentParser
4
+
5
+ import numpy as np
6
+ import soundfile
7
+ import torch
8
+ import torchaudio
9
+ from scipy.ndimage import maximum_filter1d, uniform_filter1d
10
+
11
+
12
+ def timeit(func):
13
+ def run(*args, **kwargs):
14
+ t = time.time()
15
+ res = func(*args, **kwargs)
16
+ print('executing \'%s\' costed %.3fs' % (func.__name__, time.time() - t))
17
+ return res
18
+
19
+ return run
20
+
21
+
22
+ # @timeit
23
+ def _window_maximum(arr, win_sz):
24
+ return maximum_filter1d(arr, size=win_sz)[win_sz // 2: win_sz // 2 + arr.shape[0] - win_sz + 1]
25
+
26
+
27
+ # @timeit
28
+ def _window_rms(arr, win_sz):
29
+ filtered = np.sqrt(uniform_filter1d(np.power(arr, 2), win_sz) - np.power(uniform_filter1d(arr, win_sz), 2))
30
+ return filtered[win_sz // 2: win_sz // 2 + arr.shape[0] - win_sz + 1]
31
+
32
+
33
+ def level2db(levels, eps=1e-12):
34
+ return 20 * np.log10(np.clip(levels, a_min=eps, a_max=1))
35
+
36
+
37
+ def _apply_slice(audio, begin, end):
38
+ if len(audio.shape) > 1:
39
+ return audio[:, begin: end]
40
+ else:
41
+ return audio[begin: end]
42
+
43
+
44
+ class Slicer:
45
+ def __init__(self,
46
+ sr: int,
47
+ db_threshold: float = -40,
48
+ min_length: int = 5000,
49
+ win_l: int = 300,
50
+ win_s: int = 20,
51
+ max_silence_kept: int = 500):
52
+ self.db_threshold = db_threshold
53
+ self.min_samples = round(sr * min_length / 1000)
54
+ self.win_ln = round(sr * win_l / 1000)
55
+ self.win_sn = round(sr * win_s / 1000)
56
+ self.max_silence = round(sr * max_silence_kept / 1000)
57
+ if not self.min_samples >= self.win_ln >= self.win_sn:
58
+ raise ValueError('The following condition must be satisfied: min_length >= win_l >= win_s')
59
+ if not self.max_silence >= self.win_sn:
60
+ raise ValueError('The following condition must be satisfied: max_silence_kept >= win_s')
61
+
62
+ @timeit
63
+ def slice(self, audio):
64
+ samples = audio
65
+ if samples.shape[0] <= self.min_samples:
66
+ return [audio]
67
+ # get absolute amplitudes
68
+ abs_amp = np.abs(samples - np.mean(samples))
69
+ # calculate local maximum with large window
70
+ win_max_db = level2db(_window_maximum(abs_amp, win_sz=self.win_ln))
71
+ sil_tags = []
72
+ left = right = 0
73
+ while right < win_max_db.shape[0]:
74
+ if win_max_db[right] < self.db_threshold:
75
+ right += 1
76
+ elif left == right:
77
+ left += 1
78
+ right += 1
79
+ else:
80
+ if left == 0:
81
+ split_loc_l = left
82
+ else:
83
+ sil_left_n = min(self.max_silence, (right + self.win_ln - left) // 2)
84
+ rms_db_left = level2db(_window_rms(samples[left: left + sil_left_n], win_sz=self.win_sn))
85
+ split_win_l = left + np.argmin(rms_db_left)
86
+ split_loc_l = split_win_l + np.argmin(abs_amp[split_win_l: split_win_l + self.win_sn])
87
+ if len(sil_tags) != 0 and split_loc_l - sil_tags[-1][1] < self.min_samples and right < win_max_db.shape[
88
+ 0] - 1:
89
+ right += 1
90
+ left = right
91
+ continue
92
+ if right == win_max_db.shape[0] - 1:
93
+ split_loc_r = right + self.win_ln
94
+ else:
95
+ sil_right_n = min(self.max_silence, (right + self.win_ln - left) // 2)
96
+ rms_db_right = level2db(_window_rms(samples[right + self.win_ln - sil_right_n: right + self.win_ln],
97
+ win_sz=self.win_sn))
98
+ split_win_r = right + self.win_ln - sil_right_n + np.argmin(rms_db_right)
99
+ split_loc_r = split_win_r + np.argmin(abs_amp[split_win_r: split_win_r + self.win_sn])
100
+ sil_tags.append((split_loc_l, split_loc_r))
101
+ right += 1
102
+ left = right
103
+ if left != right:
104
+ sil_left_n = min(self.max_silence, (right + self.win_ln - left) // 2)
105
+ rms_db_left = level2db(_window_rms(samples[left: left + sil_left_n], win_sz=self.win_sn))
106
+ split_win_l = left + np.argmin(rms_db_left)
107
+ split_loc_l = split_win_l + np.argmin(abs_amp[split_win_l: split_win_l + self.win_sn])
108
+ sil_tags.append((split_loc_l, samples.shape[0]))
109
+ if len(sil_tags) == 0:
110
+ return [len(audio)]
111
+ else:
112
+ chunks = []
113
+ for i in range(0, len(sil_tags)):
114
+ chunks.append(int((sil_tags[i][0] + sil_tags[i][1]) / 2))
115
+ return chunks
116
+
117
+
118
+ def main():
119
+ parser = ArgumentParser()
120
+ parser.add_argument('audio', type=str, help='The audio to be sliced')
121
+ parser.add_argument('--out_name', type=str, help='Output directory of the sliced audio clips')
122
+ parser.add_argument('--out', type=str, help='Output directory of the sliced audio clips')
123
+ parser.add_argument('--db_thresh', type=float, required=False, default=-40,
124
+ help='The dB threshold for silence detection')
125
+ parser.add_argument('--min_len', type=int, required=False, default=5000,
126
+ help='The minimum milliseconds required for each sliced audio clip')
127
+ parser.add_argument('--win_l', type=int, required=False, default=300,
128
+ help='Size of the large sliding window, presented in milliseconds')
129
+ parser.add_argument('--win_s', type=int, required=False, default=20,
130
+ help='Size of the small sliding window, presented in milliseconds')
131
+ parser.add_argument('--max_sil_kept', type=int, required=False, default=500,
132
+ help='The maximum silence length kept around the sliced audio, presented in milliseconds')
133
+ args = parser.parse_args()
134
+ out = args.out
135
+ if out is None:
136
+ out = os.path.dirname(os.path.abspath(args.audio))
137
+ audio, sr = torchaudio.load(args.audio)
138
+ if len(audio.shape) == 2 and audio.shape[1] >= 2:
139
+ audio = torch.mean(audio, dim=0).unsqueeze(0)
140
+ audio = audio.cpu().numpy()[0]
141
+
142
+ slicer = Slicer(
143
+ sr=sr,
144
+ db_threshold=args.db_thresh,
145
+ min_length=args.min_len,
146
+ win_l=args.win_l,
147
+ win_s=args.win_s,
148
+ max_silence_kept=args.max_sil_kept
149
+ )
150
+ chunks = slicer.slice(audio)
151
+ if not os.path.exists(args.out):
152
+ os.makedirs(args.out)
153
+ start = 0
154
+ end_id = 0
155
+ for i, chunk in enumerate(chunks):
156
+ end = chunk
157
+ soundfile.write(os.path.join(out, f'%s-%s.wav' % (args.out_name, str(i).zfill(2))), audio[start:end], sr)
158
+ start = end
159
+ end_id = i + 1
160
+ if start != len(audio):
161
+ soundfile.write(os.path.join(out, f'%s-%s.wav' % (args.out_name, str(end_id).zfill(2))),
162
+ audio[start:len(audio)], sr)
163
+
164
+
165
+ if __name__ == '__main__':
166
+ main()
sovits/sovits_inferencer.py ADDED
@@ -0,0 +1,51 @@
1
+ import os
2
+
3
+ import gradio as gr
4
+ import soundfile
5
+ import torch
6
+ import utils
7
+ import infer_tool
8
+ from sovits import ROOT_PATH
9
+
10
+ class SovitsInferencer:
11
+ def __init__(self, hps_path, device="cpu"):
12
+ print("init")
13
+ self.device = torch.device(device)
14
+ self.hps = utils.get_hparams_from_file(hps_path)
15
+ self.model_path = self.get_latest_model_path()
16
+ self.svc = infer_tool.Svc(self.model_path, hps_path)
17
+
18
+ def get_latest_model_path(self):
19
+ model_dir_path = os.path.join(ROOT_PATH, "models")
20
+ return utils.latest_checkpoint_path(model_dir_path, "G_*.pth")
21
+
22
+ def infer(self, audio_record, audio_upload, tran):
23
+ if audio_upload is not None:
24
+ audio_path = audio_upload
25
+ elif audio_record is not None:
26
+ audio_path = audio_record
27
+ else:
28
+ return "你需要上传wav文件或使用网页内置的录音!", None
29
+
30
+ audio, sampling_rate = self.svc.format_wav(audio_path)
31
+ duration = audio.shape[1] / sampling_rate
32
+ if duration > 60:
33
+ return "请上传小于60s的音频,需要转换长音频请使用colab", None
34
+
35
+ o_audio, out_sr = self.svc.infer(0, tran, audio_path)
36
+ out_path = f"./out_temp.wav"
37
+ soundfile.write(out_path, o_audio, self.svc.target_sample)
38
+ mistake, var = self.svc.calc_error(audio_path, out_path, tran)
39
+ return f"分段误差参考:0.3优秀,0.5左右合理,少量0.8-1可以接受\n若偏差过大,请调整升降半音数;多次调整均过大、说明超出歌手音域\n半音偏差:{mistake}\n半音方差:{var}", (self.hps.data.sampling_rate, o_audio.numpy())
40
+
41
+ def render(self):
42
+ record_input = gr.Audio(source="microphone", label="录制你的声音", type="filepath", elem_id="audio_inputs")
43
+ upload_input = gr.Audio(source="upload", label="上传音频(长度小于45秒)", type="filepath",
44
+ elem_id="audio_inputs")
45
+ # vc_speaker = gr.Number(label="Speaker", value=0)
46
+ vc_transform = gr.Number(label="升降半音(整数,可以正负,半音数量,升高八度就是12)", value=0)
47
+ vc_submit = gr.Button("转换", variant="primary")
48
+ out_message = gr.Textbox(label="Output Message")
49
+ out_audio = gr.Audio(label="Output Audio")
50
+ # vc_submit.click(self.infer, [vc_speaker, record_input, upload_input, vc_transform], [out_message, out_audio])
51
+ vc_submit.click(self.infer, [record_input, upload_input, vc_transform], [out_message, out_audio])
sovits/transforms.py ADDED
@@ -0,0 +1,185 @@
1
+ import numpy as np
2
+ import torch
3
+ from torch.nn import functional as t_func
4
+
5
+ DEFAULT_MIN_BIN_WIDTH = 1e-3
6
+ DEFAULT_MIN_BIN_HEIGHT = 1e-3
7
+ DEFAULT_MIN_DERIVATIVE = 1e-3
8
+
9
+
10
+ def piecewise_rational_quadratic_transform(inputs,
11
+ unnormalized_widths,
12
+ unnormalized_heights,
13
+ unnormalized_derivatives,
14
+ inverse=False,
15
+ tails=None,
16
+ tail_bound=1.,
17
+ min_bin_width=DEFAULT_MIN_BIN_WIDTH,
18
+ min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
19
+ min_derivative=DEFAULT_MIN_DERIVATIVE):
20
+ if tails is None:
21
+ spline_fn = rational_quadratic_spline
22
+ spline_kwargs = {}
23
+ else:
24
+ spline_fn = unconstrained_rational_quadratic_spline
25
+ spline_kwargs = {
26
+ 'tails': tails,
27
+ 'tail_bound': tail_bound
28
+ }
29
+
30
+ outputs, logabsdet = spline_fn(
31
+ inputs=inputs,
32
+ unnormalized_widths=unnormalized_widths,
33
+ unnormalized_heights=unnormalized_heights,
34
+ unnormalized_derivatives=unnormalized_derivatives,
35
+ inverse=inverse,
36
+ min_bin_width=min_bin_width,
37
+ min_bin_height=min_bin_height,
38
+ min_derivative=min_derivative,
39
+ **spline_kwargs
40
+ )
41
+ return outputs, logabsdet
42
+
43
+
44
+ def searchsorted(bin_locations, inputs, eps=1e-6):
45
+ bin_locations[..., -1] += eps
46
+ return torch.sum(inputs[..., None] >= bin_locations, dim=-1) - 1
47
+
48
+
49
+ def unconstrained_rational_quadratic_spline(inputs,
50
+ unnormalized_widths,
51
+ unnormalized_heights,
52
+ unnormalized_derivatives,
53
+ inverse=False,
54
+ tails='linear',
55
+ tail_bound=1.,
56
+ min_bin_width=DEFAULT_MIN_BIN_WIDTH,
57
+ min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
58
+ min_derivative=DEFAULT_MIN_DERIVATIVE):
59
+ inside_interval_mask = (inputs >= -tail_bound) & (inputs <= tail_bound)
60
+ outside_interval_mask = ~inside_interval_mask
61
+
62
+ outputs = torch.zeros_like(inputs)
63
+ logabsdet = torch.zeros_like(inputs)
64
+
65
+ if tails == 'linear':
66
+ unnormalized_derivatives = t_func.pad(unnormalized_derivatives, pad=(1, 1))
67
+ constant = np.log(np.exp(1 - min_derivative) - 1)
68
+ unnormalized_derivatives[..., 0] = constant
69
+ unnormalized_derivatives[..., -1] = constant
70
+
71
+ outputs[outside_interval_mask] = inputs[outside_interval_mask]
72
+ logabsdet[outside_interval_mask] = 0
73
+ else:
74
+ raise RuntimeError('{} tails are not implemented.'.format(tails))
75
+
76
+ outputs[inside_interval_mask], logabsdet[inside_interval_mask] = rational_quadratic_spline(
77
+ inputs=inputs[inside_interval_mask],
78
+ unnormalized_widths=unnormalized_widths[inside_interval_mask, :],
79
+ unnormalized_heights=unnormalized_heights[inside_interval_mask, :],
80
+ unnormalized_derivatives=unnormalized_derivatives[inside_interval_mask, :],
81
+ inverse=inverse,
82
+ left=-tail_bound, right=tail_bound, bottom=-tail_bound, top=tail_bound,
83
+ min_bin_width=min_bin_width,
84
+ min_bin_height=min_bin_height,
85
+ min_derivative=min_derivative
86
+ )
87
+
88
+ return outputs, logabsdet
89
+
90
+
91
+ def rational_quadratic_spline(inputs,
92
+ unnormalized_widths,
93
+ unnormalized_heights,
94
+ unnormalized_derivatives,
95
+ inverse=False,
96
+ left=0., right=1., bottom=0., top=1.,
97
+ min_bin_width=DEFAULT_MIN_BIN_WIDTH,
98
+ min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
99
+ min_derivative=DEFAULT_MIN_DERIVATIVE):
100
+ if torch.min(inputs) < left or torch.max(inputs) > right:
101
+ raise ValueError('Input to a transform is not within its domain')
102
+
103
+ num_bins = unnormalized_widths.shape[-1]
104
+
105
+ if min_bin_width * num_bins > 1.0:
106
+ raise ValueError('Minimal bin width too large for the number of bins')
107
+ if min_bin_height * num_bins > 1.0:
108
+ raise ValueError('Minimal bin height too large for the number of bins')
109
+
110
+ widths = t_func.softmax(unnormalized_widths, dim=-1)
111
+ widths = min_bin_width + (1 - min_bin_width * num_bins) * widths
112
+ cumwidths = torch.cumsum(widths, dim=-1)
113
+ cumwidths = t_func.pad(cumwidths, pad=(1, 0), mode='constant', value=0.0)
114
+ cumwidths = (right - left) * cumwidths + left
115
+ cumwidths[..., 0] = left
116
+ cumwidths[..., -1] = right
117
+ widths = cumwidths[..., 1:] - cumwidths[..., :-1]
118
+
119
+ derivatives = min_derivative + t_func.softplus(unnormalized_derivatives)
120
+
121
+ heights = t_func.softmax(unnormalized_heights, dim=-1)
122
+ heights = min_bin_height + (1 - min_bin_height * num_bins) * heights
123
+ cumheights = torch.cumsum(heights, dim=-1)
124
+ cumheights = t_func.pad(cumheights, pad=(1, 0), mode='constant', value=0.0)
125
+ cumheights = (top - bottom) * cumheights + bottom
126
+ cumheights[..., 0] = bottom
127
+ cumheights[..., -1] = top
128
+ heights = cumheights[..., 1:] - cumheights[..., :-1]
129
+
130
+ if inverse:
131
+ bin_idx = searchsorted(cumheights, inputs)[..., None]
132
+ else:
133
+ bin_idx = searchsorted(cumwidths, inputs)[..., None]
134
+
135
+ input_cumwidths = cumwidths.gather(-1, bin_idx)[..., 0]
136
+ input_bin_widths = widths.gather(-1, bin_idx)[..., 0]
137
+
138
+ input_cumheights = cumheights.gather(-1, bin_idx)[..., 0]
139
+ delta = heights / widths
140
+ input_delta = delta.gather(-1, bin_idx)[..., 0]
141
+
142
+ input_derivatives = derivatives.gather(-1, bin_idx)[..., 0]
143
+ input_derivatives_plus_one = derivatives[..., 1:].gather(-1, bin_idx)[..., 0]
144
+
145
+ input_heights = heights.gather(-1, bin_idx)[..., 0]
146
+
147
+ if inverse:
148
+ a = (inputs - input_cumheights) * (
149
+ input_derivatives + input_derivatives_plus_one - 2 * input_delta) + input_heights * (
150
+ input_delta - input_derivatives)
151
+ b = (input_heights * input_derivatives - (inputs - input_cumheights) * (
152
+ input_derivatives + input_derivatives_plus_one - 2 * input_delta))
153
+ c = - input_delta * (inputs - input_cumheights)
154
+
155
+ discriminant = b.pow(2) - 4 * a * c
156
+ assert (discriminant >= 0).all()
157
+
158
+ root = (2 * c) / (-b - torch.sqrt(discriminant))
159
+ outputs = root * input_bin_widths + input_cumwidths
160
+
161
+ theta_one_minus_theta = root * (1 - root)
162
+ denominator = input_delta + ((input_derivatives + input_derivatives_plus_one - 2 * input_delta)
163
+ * theta_one_minus_theta)
164
+ derivative_numerator = input_delta.pow(2) * (input_derivatives_plus_one * root.pow(2)
165
+ + 2 * input_delta * theta_one_minus_theta
166
+ + input_derivatives * (1 - root).pow(2))
167
+ logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)
168
+
169
+ return outputs, -logabsdet
170
+ else:
171
+ theta = (inputs - input_cumwidths) / input_bin_widths
172
+ theta_one_minus_theta = theta * (1 - theta)
173
+
174
+ numerator = input_heights * (input_delta * theta.pow(2)
175
+ + input_derivatives * theta_one_minus_theta)
176
+ denominator = input_delta + ((input_derivatives + input_derivatives_plus_one - 2 * input_delta)
177
+ * theta_one_minus_theta)
178
+ outputs = input_cumheights + numerator / denominator
179
+
180
+ derivative_numerator = input_delta.pow(2) * (input_derivatives_plus_one * theta.pow(2)
181
+ + 2 * input_delta * theta_one_minus_theta
182
+ + input_derivatives * (1 - theta).pow(2))
183
+ logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)
184
+
185
+ return outputs, logabsdet
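A small sketch (assumed shapes, not part of the commit) exercising piecewise_rational_quadratic_transform the way ConvFlow feeds it: num_bins widths, num_bins heights and num_bins - 1 interior derivatives per element, with linear tails outside +/- tail_bound. Running it forward and then with inverse=True should reconstruct the input up to floating-point error:

import torch
from sovits.transforms import piecewise_rational_quadratic_transform

num_bins, shape = 10, (2, 96, 40)                 # arbitrary (batch, channels, frames)
x = torch.rand(shape) * 8 - 4                     # samples inside the +/-5 tail bound
w = torch.randn(*shape, num_bins)
h = torch.randn(*shape, num_bins)
d = torch.randn(*shape, num_bins - 1)

y, logdet = piecewise_rational_quadratic_transform(x, w, h, d, tails='linear', tail_bound=5.0)
x_rec, _ = piecewise_rational_quadratic_transform(y, w, h, d, inverse=True, tails='linear', tail_bound=5.0)
print(float((x - x_rec).abs().max()))             # close to zero: the spline is invertible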
sovits/utils.py ADDED
@@ -0,0 +1,95 @@
1
+ import json
2
+ import logging
3
+ import os
4
+ import sys
5
+
6
+ import torch
7
+
8
+ MATPLOTLIB_FLAG = False
9
+
10
+ logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
11
+ logger = logging
12
+
13
+
14
+ def load_checkpoint(checkpoint_path, model, optimizer=None):
15
+ assert os.path.isfile(checkpoint_path)
16
+ checkpoint_dict = torch.load(checkpoint_path, map_location='cpu')
17
+ iteration = checkpoint_dict['iteration']
18
+ learning_rate = checkpoint_dict['learning_rate']
19
+ if optimizer is not None:
20
+ optimizer.load_state_dict(checkpoint_dict['optimizer'])
21
+ saved_state_dict = checkpoint_dict['model']
22
+
23
+ if hasattr(model, 'module'):
24
+ state_dict = model.module.state_dict()
25
+ else:
26
+ state_dict = model.state_dict()
27
+ new_state_dict = {}
28
+ for k, v in state_dict.items():
29
+ try:
30
+ new_state_dict[k] = saved_state_dict[k]
31
+ except Exception as e:
32
+ logger.info(e)
33
+ logger.info("%s is not in the checkpoint" % k)
34
+ new_state_dict[k] = v
35
+ if hasattr(model, 'module'):
36
+ model.module.load_state_dict(new_state_dict)
37
+ else:
38
+ model.load_state_dict(new_state_dict)
39
+ logger.info("Loaded checkpoint '{}' (iteration {})".format(
40
+ checkpoint_path, iteration))
41
+ return model, optimizer, learning_rate, iteration
42
+
43
+
44
+ def save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path):
45
+ logger.info("Saving model and optimizer state at iteration {} to {}".format(
46
+ iteration, checkpoint_path))
47
+ if hasattr(model, 'module'):
48
+ state_dict = model.module.state_dict()
49
+ else:
50
+ state_dict = model.state_dict()
51
+ torch.save({'model': state_dict,
52
+ 'iteration': iteration,
53
+ 'optimizer': optimizer.state_dict(),
54
+ 'learning_rate': learning_rate}, checkpoint_path)
55
+
56
+
57
+ def get_hparams_from_file(config_path):
58
+ with open(config_path, "r", encoding="utf-8") as f:
59
+ data = f.read()
60
+ config = json.loads(data)
61
+
62
+ hparams = HParams(**config)
63
+ return hparams
64
+
65
+
66
+ class HParams:
67
+ def __init__(self, **kwargs):
68
+ for k, v in kwargs.items():
69
+ if type(v) == dict:
70
+ v = HParams(**v)
71
+ self[k] = v
72
+
73
+ def keys(self):
74
+ return self.__dict__.keys()
75
+
76
+ def items(self):
77
+ return self.__dict__.items()
78
+
79
+ def values(self):
80
+ return self.__dict__.values()
81
+
82
+ def __len__(self):
83
+ return len(self.__dict__)
84
+
85
+ def __getitem__(self, key):
86
+ return getattr(self, key)
87
+
88
+ def __setitem__(self, key, value):
89
+ return setattr(self, key, value)
90
+
91
+ def __contains__(self, key):
92
+ return key in self.__dict__
93
+
94
+ def __repr__(self):
95
+ return self.__dict__.__repr__()
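For reference, a short sketch (not part of the commit) of how the helpers above are meant to be used; it assumes the bundled sovits/configs/hoshimi_base.json defines a data.sampling_rate entry, as sovits_inferencer.py expects:

from sovits import utils

hps = utils.get_hparams_from_file("sovits/configs/hoshimi_base.json")
print(hps.data.sampling_rate)          # nested dicts are wrapped into nested HParams objects
print("data" in hps, list(hps.keys()))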
sovits/vdecoder/__init__.py ADDED
File without changes
sovits/vdecoder/hifigan/hifigan.py ADDED
@@ -0,0 +1,366 @@
1
+ import numpy as np
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+ from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
6
+ from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
7
+
8
+ from sovits.vdecoder.parallel_wavegan.models.source import SourceModuleHnNSF
9
+
10
+ LRELU_SLOPE = 0.1
11
+
12
+
13
+ def init_weights(m, mean=0.0, std=0.01):
14
+ classname = m.__class__.__name__
15
+ if classname.find("Conv") != -1:
16
+ m.weight.data.normal_(mean, std)
17
+
18
+
19
+ def apply_weight_norm(m):
20
+ classname = m.__class__.__name__
21
+ if classname.find("Conv") != -1:
22
+ weight_norm(m)
23
+
24
+
25
+ def get_padding(kernel_size, dilation=1):
26
+ return int((kernel_size * dilation - dilation) / 2)
27
+
28
+
29
+ class ResBlock1(torch.nn.Module):
30
+ def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)):
31
+ super(ResBlock1, self).__init__()
32
+ self.h = h
33
+ self.convs1 = nn.ModuleList([
34
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
35
+ padding=get_padding(kernel_size, dilation[0]))),
36
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
37
+ padding=get_padding(kernel_size, dilation[1]))),
38
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2],
39
+ padding=get_padding(kernel_size, dilation[2])))
40
+ ])
41
+ self.convs1.apply(init_weights)
42
+
43
+ self.convs2 = nn.ModuleList([
44
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
45
+ padding=get_padding(kernel_size, 1))),
46
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
47
+ padding=get_padding(kernel_size, 1))),
48
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
49
+ padding=get_padding(kernel_size, 1)))
50
+ ])
51
+ self.convs2.apply(init_weights)
52
+
53
+ def forward(self, x):
54
+ for c1, c2 in zip(self.convs1, self.convs2):
55
+ xt = F.leaky_relu(x, LRELU_SLOPE)
56
+ xt = c1(xt)
57
+ xt = F.leaky_relu(xt, LRELU_SLOPE)
58
+ xt = c2(xt)
59
+ x = xt + x
60
+ return x
61
+
62
+ def remove_weight_norm(self):
63
+ for l in self.convs1:
64
+ remove_weight_norm(l)
65
+ for l in self.convs2:
66
+ remove_weight_norm(l)
67
+
68
+
69
+ class ResBlock2(torch.nn.Module):
70
+ def __init__(self, h, channels, kernel_size=3, dilation=(1, 3)):
71
+ super(ResBlock2, self).__init__()
72
+ self.h = h
73
+ self.convs = nn.ModuleList([
74
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
75
+ padding=get_padding(kernel_size, dilation[0]))),
76
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
77
+ padding=get_padding(kernel_size, dilation[1])))
78
+ ])
79
+ self.convs.apply(init_weights)
80
+
81
+ def forward(self, x):
82
+ for c in self.convs:
83
+ xt = F.leaky_relu(x, LRELU_SLOPE)
84
+ xt = c(xt)
85
+ x = xt + x
86
+ return x
87
+
88
+ def remove_weight_norm(self):
89
+ for l in self.convs:
90
+ remove_weight_norm(l)
91
+
92
+
93
+ class Conv1d1x1(Conv1d):
94
+ """1x1 Conv1d with customized initialization."""
95
+
96
+ def __init__(self, in_channels, out_channels, bias):
97
+ """Initialize 1x1 Conv1d module."""
98
+ super(Conv1d1x1, self).__init__(in_channels, out_channels,
99
+ kernel_size=1, padding=0,
100
+ dilation=1, bias=bias)
101
+
102
+
103
+ class HifiGanGenerator(torch.nn.Module):
104
+ def __init__(self, h, c_out=1):
105
+ super(HifiGanGenerator, self).__init__()
106
+ self.h = h
107
+ self.num_kernels = len(h['resblock_kernel_sizes'])
108
+ self.num_upsamples = len(h['upsample_rates'])
109
+
110
+ if h['use_pitch_embed']:
111
+ self.harmonic_num = 8
112
+ self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(h['upsample_rates']))
113
+ self.m_source = SourceModuleHnNSF(
114
+ sampling_rate=h['audio_sample_rate'],
115
+ harmonic_num=self.harmonic_num)
116
+ self.noise_convs = nn.ModuleList()
117
+ # self.conv_pre = weight_norm(Conv1d(80, h['upsample_initial_channel'], 7, 1, padding=3))
118
+ self.conv_pre = weight_norm(Conv1d(h["inter_channels"], h['upsample_initial_channel'], 7, 1, padding=3))
119
+ resblock = ResBlock1 if h['resblock'] == '1' else ResBlock2
120
+
121
+ self.ups = nn.ModuleList()
122
+ for i, (u, k) in enumerate(zip(h['upsample_rates'], h['upsample_kernel_sizes'])):
123
+ c_cur = h['upsample_initial_channel'] // (2 ** (i + 1))
124
+ self.ups.append(weight_norm(
125
+ ConvTranspose1d(c_cur * 2, c_cur, k, u, padding=(k - u) // 2)))
126
+ if h['use_pitch_embed']:
127
+ if i + 1 < len(h['upsample_rates']):
128
+ stride_f0 = np.prod(h['upsample_rates'][i + 1:])
129
+ self.noise_convs.append(Conv1d(
130
+ 1, c_cur, kernel_size=stride_f0 * 2, stride=stride_f0, padding=stride_f0 // 2))
131
+ else:
132
+ self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1))
133
+
134
+ self.resblocks = nn.ModuleList()
135
+ for i in range(len(self.ups)):
136
+ ch = h['upsample_initial_channel'] // (2 ** (i + 1))
137
+ for j, (k, d) in enumerate(zip(h['resblock_kernel_sizes'], h['resblock_dilation_sizes'])):
138
+ self.resblocks.append(resblock(h, ch, k, d))
139
+
140
+ self.conv_post = weight_norm(Conv1d(ch, c_out, 7, 1, padding=3))
141
+ self.ups.apply(init_weights)
142
+ self.conv_post.apply(init_weights)
143
+
144
+ def forward(self, x, f0=None):
145
+ if f0 is not None:
146
+ f0 = f0.float()
147
+ # harmonic-source signal, noise-source signal, uv flag
148
+ f0 = self.f0_upsamp(f0[:, None]).transpose(1, 2)
149
+ har_source, noi_source, uv = self.m_source(f0)
150
+ har_source = har_source.transpose(1, 2)
151
+
152
+ x = self.conv_pre(x)
153
+ for i in range(self.num_upsamples):
154
+ x = F.leaky_relu(x, LRELU_SLOPE)
155
+ x = self.ups[i](x)
156
+ if f0 is not None:
157
+ x_source = self.noise_convs[i](har_source)
158
+ x = x + x_source
159
+ xs = None
160
+ for j in range(self.num_kernels):
161
+ if xs is None:
162
+ xs = self.resblocks[i * self.num_kernels + j](x)
163
+ else:
164
+ xs += self.resblocks[i * self.num_kernels + j](x)
165
+ x = xs / self.num_kernels
166
+ x = F.leaky_relu(x)
167
+ x = self.conv_post(x)
168
+ x = torch.tanh(x)
169
+
170
+ return x
171
+
172
+ def remove_weight_norm(self):
173
+ print('Removing weight norm...')
174
+ for l in self.ups:
175
+ remove_weight_norm(l)
176
+ for l in self.resblocks:
177
+ l.remove_weight_norm()
178
+ remove_weight_norm(self.conv_pre)
179
+ remove_weight_norm(self.conv_post)
180
+
181
+
182
+ class DiscriminatorP(torch.nn.Module):
183
+ def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False, use_cond=False, c_in=1):
184
+ super(DiscriminatorP, self).__init__()
185
+ self.use_cond = use_cond
186
+ if use_cond:
187
+ from utils.hparams import hparams
188
+ t = hparams['hop_size']
189
+ self.cond_net = torch.nn.ConvTranspose1d(80, 1, t * 2, stride=t, padding=t // 2)
190
+ c_in = 2
191
+
192
+ self.period = period
193
+ norm_f = weight_norm if use_spectral_norm == False else spectral_norm
194
+ self.convs = nn.ModuleList([
195
+ norm_f(Conv2d(c_in, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
196
+ norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
197
+ norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
198
+ norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
199
+ norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(2, 0))),
200
+ ])
201
+ self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
202
+
203
+ def forward(self, x, mel):
204
+ fmap = []
205
+ if self.use_cond:
206
+ x_mel = self.cond_net(mel)
207
+ x = torch.cat([x_mel, x], 1)
208
+ # 1d to 2d
209
+ b, c, t = x.shape
210
+ if t % self.period != 0: # pad first
211
+ n_pad = self.period - (t % self.period)
212
+ x = F.pad(x, (0, n_pad), "reflect")
213
+ t = t + n_pad
214
+ x = x.view(b, c, t // self.period, self.period)
215
+
216
+ for l in self.convs:
217
+ x = l(x)
218
+ x = F.leaky_relu(x, LRELU_SLOPE)
219
+ fmap.append(x)
220
+ x = self.conv_post(x)
221
+ fmap.append(x)
222
+ x = torch.flatten(x, 1, -1)
223
+
224
+ return x, fmap
225
+
226
+
227
+ class MultiPeriodDiscriminator(torch.nn.Module):
228
+ def __init__(self, use_cond=False, c_in=1):
229
+ super(MultiPeriodDiscriminator, self).__init__()
230
+ self.discriminators = nn.ModuleList([
231
+ DiscriminatorP(2, use_cond=use_cond, c_in=c_in),
232
+ DiscriminatorP(3, use_cond=use_cond, c_in=c_in),
233
+ DiscriminatorP(5, use_cond=use_cond, c_in=c_in),
234
+ DiscriminatorP(7, use_cond=use_cond, c_in=c_in),
235
+ DiscriminatorP(11, use_cond=use_cond, c_in=c_in),
236
+ ])
237
+
238
+ def forward(self, y, y_hat, mel=None):
239
+ y_d_rs = []
240
+ y_d_gs = []
241
+ fmap_rs = []
242
+ fmap_gs = []
243
+ for i, d in enumerate(self.discriminators):
244
+ y_d_r, fmap_r = d(y, mel)
245
+ y_d_g, fmap_g = d(y_hat, mel)
246
+ y_d_rs.append(y_d_r)
247
+ fmap_rs.append(fmap_r)
248
+ y_d_gs.append(y_d_g)
249
+ fmap_gs.append(fmap_g)
250
+
251
+ return y_d_rs, y_d_gs, fmap_rs, fmap_gs
252
+
253
+
254
+ class DiscriminatorS(torch.nn.Module):
255
+ def __init__(self, use_spectral_norm=False, use_cond=False, upsample_rates=None, c_in=1):
256
+ super(DiscriminatorS, self).__init__()
257
+ self.use_cond = use_cond
258
+ if use_cond:
259
+ t = np.prod(upsample_rates)
260
+ self.cond_net = torch.nn.ConvTranspose1d(80, 1, t * 2, stride=t, padding=t // 2)
261
+ c_in = 2
262
+ norm_f = weight_norm if use_spectral_norm == False else spectral_norm
263
+ self.convs = nn.ModuleList([
264
+ norm_f(Conv1d(c_in, 128, 15, 1, padding=7)),
265
+ norm_f(Conv1d(128, 128, 41, 2, groups=4, padding=20)),
266
+ norm_f(Conv1d(128, 256, 41, 2, groups=16, padding=20)),
267
+ norm_f(Conv1d(256, 512, 41, 4, groups=16, padding=20)),
268
+ norm_f(Conv1d(512, 1024, 41, 4, groups=16, padding=20)),
269
+ norm_f(Conv1d(1024, 1024, 41, 1, groups=16, padding=20)),
270
+ norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
271
+ ])
272
+ self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
273
+
274
+ def forward(self, x, mel):
275
+ if self.use_cond:
276
+ x_mel = self.cond_net(mel)
277
+ x = torch.cat([x_mel, x], 1)
278
+ fmap = []
279
+ for l in self.convs:
280
+ x = l(x)
281
+ x = F.leaky_relu(x, LRELU_SLOPE)
282
+ fmap.append(x)
283
+ x = self.conv_post(x)
284
+ fmap.append(x)
285
+ x = torch.flatten(x, 1, -1)
286
+
287
+ return x, fmap
288
+
289
+
290
+ class MultiScaleDiscriminator(torch.nn.Module):
291
+ def __init__(self, use_cond=False, c_in=1):
292
+ super(MultiScaleDiscriminator, self).__init__()
293
+ from utils.hparams import hparams
294
+ self.discriminators = nn.ModuleList([
295
+ DiscriminatorS(use_spectral_norm=True, use_cond=use_cond,
296
+ upsample_rates=[4, 4, hparams['hop_size'] // 16],
297
+ c_in=c_in),
298
+ DiscriminatorS(use_cond=use_cond,
299
+ upsample_rates=[4, 4, hparams['hop_size'] // 32],
300
+ c_in=c_in),
301
+ DiscriminatorS(use_cond=use_cond,
302
+ upsample_rates=[4, 4, hparams['hop_size'] // 64],
303
+ c_in=c_in),
304
+ ])
305
+ self.meanpools = nn.ModuleList([
306
+ AvgPool1d(4, 2, padding=1),
307
+ AvgPool1d(4, 2, padding=1)
308
+ ])
309
+
310
+ def forward(self, y, y_hat, mel=None):
311
+ y_d_rs = []
312
+ y_d_gs = []
313
+ fmap_rs = []
314
+ fmap_gs = []
315
+ for i, d in enumerate(self.discriminators):
316
+ if i != 0:
317
+ y = self.meanpools[i - 1](y)
318
+ y_hat = self.meanpools[i - 1](y_hat)
319
+ y_d_r, fmap_r = d(y, mel)
320
+ y_d_g, fmap_g = d(y_hat, mel)
321
+ y_d_rs.append(y_d_r)
322
+ fmap_rs.append(fmap_r)
323
+ y_d_gs.append(y_d_g)
324
+ fmap_gs.append(fmap_g)
325
+
326
+ return y_d_rs, y_d_gs, fmap_rs, fmap_gs
327
+
328
+
329
+ def feature_loss(fmap_r, fmap_g):
330
+ loss = 0
331
+ for dr, dg in zip(fmap_r, fmap_g):
332
+ for rl, gl in zip(dr, dg):
333
+ loss += torch.mean(torch.abs(rl - gl))
334
+
335
+ return loss * 2
336
+
337
+
338
+ def discriminator_loss(disc_real_outputs, disc_generated_outputs):
339
+ r_losses = 0
340
+ g_losses = 0
341
+ for dr, dg in zip(disc_real_outputs, disc_generated_outputs):
342
+ r_loss = torch.mean((1 - dr) ** 2)
343
+ g_loss = torch.mean(dg ** 2)
344
+ r_losses += r_loss
345
+ g_losses += g_loss
346
+ r_losses = r_losses / len(disc_real_outputs)
347
+ g_losses = g_losses / len(disc_real_outputs)
348
+ return r_losses, g_losses
349
+
350
+
351
+ def cond_discriminator_loss(outputs):
352
+ loss = 0
353
+ for dg in outputs:
354
+ g_loss = torch.mean(dg ** 2)
355
+ loss += g_loss
356
+ loss = loss / len(outputs)
357
+ return loss
358
+
359
+
360
+ def generator_loss(disc_outputs):
361
+ loss = 0
362
+ for dg in disc_outputs:
363
+ l = torch.mean((1 - dg) ** 2)
364
+ loss += l
365
+ loss = loss / len(disc_outputs)
366
+ return loss
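A standalone sketch (not part of the commit) of driving HifiGanGenerator directly. The dict layout mirrors the hps that SynthesizerTrn builds in sovits/models.py; the concrete numbers below are placeholders, not values taken from the project configs:

import torch
from sovits.vdecoder.hifigan.hifigan import HifiGanGenerator

hps = {
    "resblock": "1",
    "resblock_kernel_sizes": [3, 7, 11],
    "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
    "upsample_rates": [8, 8, 2, 2],               # product = 256 samples per frame
    "upsample_kernel_sizes": [16, 16, 4, 4],
    "upsample_initial_channel": 512,
    "inter_channels": 192,
    "use_pitch_embed": True,
    "audio_sample_rate": 32000,
}
dec = HifiGanGenerator(h=hps)
z = torch.randn(1, 192, 50)                       # (batch, inter_channels, frames)
f0 = torch.full((1, 50), 220.0)                   # frame-level F0 in Hz, same length as z
wav = dec(z, f0=f0)                               # -> (1, 1, 50 * 256) waveform
print(wav.shape)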
sovits/vdecoder/hifigan/mel_utils.py ADDED
@@ -0,0 +1,80 @@
1
+ import numpy as np
2
+ import torch
3
+ import torch.utils.data
4
+ from librosa.filters import mel as librosa_mel_fn
5
+ from scipy.io.wavfile import read
6
+
7
+ MAX_WAV_VALUE = 32768.0
8
+
9
+
10
+ def load_wav(full_path):
11
+ sampling_rate, data = read(full_path)
12
+ return data, sampling_rate
13
+
14
+
15
+ def dynamic_range_compression(x, C=1, clip_val=1e-5):
16
+ return np.log(np.clip(x, a_min=clip_val, a_max=None) * C)
17
+
18
+
19
+ def dynamic_range_decompression(x, C=1):
20
+ return np.exp(x) / C
21
+
22
+
23
+ def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
24
+ return torch.log(torch.clamp(x, min=clip_val) * C)
25
+
26
+
27
+ def dynamic_range_decompression_torch(x, C=1):
28
+ return torch.exp(x) / C
29
+
30
+
31
+ def spectral_normalize_torch(magnitudes):
32
+ output = dynamic_range_compression_torch(magnitudes)
33
+ return output
34
+
35
+
36
+ def spectral_de_normalize_torch(magnitudes):
37
+ output = dynamic_range_decompression_torch(magnitudes)
38
+ return output
39
+
40
+
41
+ mel_basis = {}
42
+ hann_window = {}
43
+
44
+
45
+ def mel_spectrogram(y, hparams, center=False, complex=False):
46
+ # hop_size: 512 # For 22050Hz, 275 ~= 12.5 ms (0.0125 * sample_rate)
47
+ # win_size: 2048 # For 22050Hz, 1100 ~= 50 ms (If None, win_size: fft_size) (0.05 * sample_rate)
48
+ # fmin: 55 # Set this to 55 if your speaker is male! if female, 95 should help taking off noise. (To test depending on dataset. Pitch info: male~[65, 260], female~[100, 525])
49
+ # fmax: 10000 # To be increased/reduced depending on data.
50
+ # fft_size: 2048 # Extra window size is filled with 0 paddings to match this parameter
51
+ # n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax,
52
+ n_fft = hparams['fft_size']
53
+ num_mels = hparams['audio_num_mel_bins']
54
+ sampling_rate = hparams['audio_sample_rate']
55
+ hop_size = hparams['hop_size']
56
+ win_size = hparams['win_size']
57
+ fmin = hparams['fmin']
58
+ fmax = hparams['fmax']
59
+ y = y.clamp(min=-1., max=1.)
60
+ global mel_basis, hann_window
61
+ if fmax not in mel_basis:
62
+ mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax)
63
+ mel_basis[str(fmax) + '_' + str(y.device)] = torch.from_numpy(mel).float().to(y.device)
64
+ hann_window[str(y.device)] = torch.hann_window(win_size).to(y.device)
65
+
66
+ y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)),
67
+ mode='reflect')
68
+ y = y.squeeze(1)
69
+
70
+ spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[str(y.device)],
71
+ center=center, pad_mode='reflect', normalized=False, onesided=True)
72
+
73
+ if not complex:
74
+ spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9))
75
+ spec = torch.matmul(mel_basis[str(fmax) + '_' + str(y.device)], spec)
76
+ spec = spectral_normalize_torch(spec)
77
+ else:
78
+ B, C, T, _ = spec.shape
79
+ spec = spec.transpose(1, 2) # [B, T, n_fft, 2]
80
+ return spec
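A minimal usage sketch for the mel_spectrogram helper added above; the hparams values below are illustrative placeholders, not taken from this repository's configs:

import torch

hparams = {
    'fft_size': 2048, 'audio_num_mel_bins': 80, 'audio_sample_rate': 22050,
    'hop_size': 512, 'win_size': 2048, 'fmin': 55, 'fmax': 10000,
}
y = torch.randn(1, 22050).clamp(-1., 1.)   # (B, T) waveform in [-1, 1]
mel = mel_spectrogram(y, hparams)          # (B, audio_num_mel_bins, n_frames), log-compressed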
sovits/vdecoder/parallel_wavegan/__init__.py ADDED
File without changes
sovits/vdecoder/parallel_wavegan/layers/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ from .causal_conv import * # NOQA
2
+ from .pqmf import * # NOQA
3
+ from .residual_block import * # NOQA
4
+ from sovits.vdecoder.parallel_wavegan.layers.residual_stack import * # NOQA
5
+ from .upsample import * # NOQA
sovits/vdecoder/parallel_wavegan/layers/causal_conv.py ADDED
@@ -0,0 +1,56 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ # Copyright 2020 Tomoki Hayashi
4
+ # MIT License (https://opensource.org/licenses/MIT)
5
+
6
+ """Causal convolusion layer modules."""
7
+
8
+
9
+ import torch
10
+
11
+
12
+ class CausalConv1d(torch.nn.Module):
13
+ """CausalConv1d module with customized initialization."""
14
+
15
+ def __init__(self, in_channels, out_channels, kernel_size,
16
+ dilation=1, bias=True, pad="ConstantPad1d", pad_params={"value": 0.0}):
17
+ """Initialize CausalConv1d module."""
18
+ super(CausalConv1d, self).__init__()
19
+ self.pad = getattr(torch.nn, pad)((kernel_size - 1) * dilation, **pad_params)
20
+ self.conv = torch.nn.Conv1d(in_channels, out_channels, kernel_size,
21
+ dilation=dilation, bias=bias)
22
+
23
+ def forward(self, x):
24
+ """Calculate forward propagation.
25
+
26
+ Args:
27
+ x (Tensor): Input tensor (B, in_channels, T).
28
+
29
+ Returns:
30
+ Tensor: Output tensor (B, out_channels, T).
31
+
32
+ """
33
+ return self.conv(self.pad(x))[:, :, :x.size(2)]
34
+
35
+
36
+ class CausalConvTranspose1d(torch.nn.Module):
37
+ """CausalConvTranspose1d module with customized initialization."""
38
+
39
+ def __init__(self, in_channels, out_channels, kernel_size, stride, bias=True):
40
+ """Initialize CausalConvTranspose1d module."""
41
+ super(CausalConvTranspose1d, self).__init__()
42
+ self.deconv = torch.nn.ConvTranspose1d(
43
+ in_channels, out_channels, kernel_size, stride, bias=bias)
44
+ self.stride = stride
45
+
46
+ def forward(self, x):
47
+ """Calculate forward propagation.
48
+
49
+ Args:
50
+ x (Tensor): Input tensor (B, in_channels, T_in).
51
+
52
+ Returns:
53
+ Tensor: Output tensor (B, out_channels, T_out).
54
+
55
+ """
56
+ return self.deconv(x)[:, :, :-self.stride]
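A small sketch showing that CausalConv1d preserves the time dimension by padding before the convolution and trimming afterwards, so the output at step t only depends on inputs up to t:

import torch

conv = CausalConv1d(in_channels=64, out_channels=64, kernel_size=3, dilation=2)
x = torch.randn(4, 64, 100)   # (B, in_channels, T)
y = conv(x)                   # (B, out_channels, T), same length as the input
assert y.shape == (4, 64, 100)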
sovits/vdecoder/parallel_wavegan/layers/pqmf.py ADDED
@@ -0,0 +1,129 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ # Copyright 2020 Tomoki Hayashi
4
+ # MIT License (https://opensource.org/licenses/MIT)
5
+
6
+ """Pseudo QMF modules."""
7
+
8
+ import numpy as np
9
+ import torch
10
+ import torch.nn.functional as F
11
+
12
+ from scipy.signal import kaiser
13
+
14
+
15
+ def design_prototype_filter(taps=62, cutoff_ratio=0.15, beta=9.0):
16
+ """Design prototype filter for PQMF.
17
+
18
+ This method is based on `A Kaiser window approach for the design of prototype
19
+ filters of cosine modulated filterbanks`_.
20
+
21
+ Args:
22
+ taps (int): The number of filter taps.
23
+ cutoff_ratio (float): Cut-off frequency ratio.
24
+ beta (float): Beta coefficient for kaiser window.
25
+
26
+ Returns:
27
+ ndarray: Impulse response of prototype filter (taps + 1,).
28
+
29
+ .. _`A Kaiser window approach for the design of prototype filters of cosine modulated filterbanks`:
30
+ https://ieeexplore.ieee.org/abstract/document/681427
31
+
32
+ """
33
+ # check the arguments are valid
34
+ assert taps % 2 == 0, "The number of taps must be an even number."
35
+ assert 0.0 < cutoff_ratio < 1.0, "Cutoff ratio must be > 0.0 and < 1.0."
36
+
37
+ # make initial filter
38
+ omega_c = np.pi * cutoff_ratio
39
+ with np.errstate(invalid='ignore'):
40
+ h_i = np.sin(omega_c * (np.arange(taps + 1) - 0.5 * taps)) \
41
+ / (np.pi * (np.arange(taps + 1) - 0.5 * taps))
42
+ h_i[taps // 2] = np.cos(0) * cutoff_ratio # fix nan due to indeterminate form
43
+
44
+ # apply kaiser window
45
+ w = kaiser(taps + 1, beta)
46
+ h = h_i * w
47
+
48
+ return h
49
+
50
+
51
+ class PQMF(torch.nn.Module):
52
+ """PQMF module.
53
+
54
+ This module is based on `Near-perfect-reconstruction pseudo-QMF banks`_.
55
+
56
+ .. _`Near-perfect-reconstruction pseudo-QMF banks`:
57
+ https://ieeexplore.ieee.org/document/258122
58
+
59
+ """
60
+
61
+ def __init__(self, subbands=4, taps=62, cutoff_ratio=0.15, beta=9.0):
62
+ """Initilize PQMF module.
63
+
64
+ Args:
65
+ subbands (int): The number of subbands.
66
+ taps (int): The number of filter taps.
67
+ cutoff_ratio (float): Cut-off frequency ratio.
68
+ beta (float): Beta coefficient for kaiser window.
69
+
70
+ """
71
+ super(PQMF, self).__init__()
72
+
73
+ # define filter coefficient
74
+ h_proto = design_prototype_filter(taps, cutoff_ratio, beta)
75
+ h_analysis = np.zeros((subbands, len(h_proto)))
76
+ h_synthesis = np.zeros((subbands, len(h_proto)))
77
+ for k in range(subbands):
78
+ h_analysis[k] = 2 * h_proto * np.cos(
79
+ (2 * k + 1) * (np.pi / (2 * subbands)) *
80
+ (np.arange(taps + 1) - ((taps - 1) / 2)) +
81
+ (-1) ** k * np.pi / 4)
82
+ h_synthesis[k] = 2 * h_proto * np.cos(
83
+ (2 * k + 1) * (np.pi / (2 * subbands)) *
84
+ (np.arange(taps + 1) - ((taps - 1) / 2)) -
85
+ (-1) ** k * np.pi / 4)
86
+
87
+ # convert to tensor
88
+ analysis_filter = torch.from_numpy(h_analysis).float().unsqueeze(1)
89
+ synthesis_filter = torch.from_numpy(h_synthesis).float().unsqueeze(0)
90
+
91
+ # register coefficients as buffer
92
+ self.register_buffer("analysis_filter", analysis_filter)
93
+ self.register_buffer("synthesis_filter", synthesis_filter)
94
+
95
+ # filter for downsampling & upsampling
96
+ updown_filter = torch.zeros((subbands, subbands, subbands)).float()
97
+ for k in range(subbands):
98
+ updown_filter[k, k, 0] = 1.0
99
+ self.register_buffer("updown_filter", updown_filter)
100
+ self.subbands = subbands
101
+
102
+ # keep padding info
103
+ self.pad_fn = torch.nn.ConstantPad1d(taps // 2, 0.0)
104
+
105
+ def analysis(self, x):
106
+ """Analysis with PQMF.
107
+
108
+ Args:
109
+ x (Tensor): Input tensor (B, 1, T).
110
+
111
+ Returns:
112
+ Tensor: Output tensor (B, subbands, T // subbands).
113
+
114
+ """
115
+ x = F.conv1d(self.pad_fn(x), self.analysis_filter)
116
+ return F.conv1d(x, self.updown_filter, stride=self.subbands)
117
+
118
+ def synthesis(self, x):
119
+ """Synthesis with PQMF.
120
+
121
+ Args:
122
+ x (Tensor): Input tensor (B, subbands, T // subbands).
123
+
124
+ Returns:
125
+ Tensor: Output tensor (B, 1, T).
126
+
127
+ """
128
+ x = F.conv_transpose1d(x, self.updown_filter * self.subbands, stride=self.subbands)
129
+ return F.conv1d(self.pad_fn(x), self.synthesis_filter)
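A minimal round-trip sketch for the PQMF analysis/synthesis pair above; the signal length is illustrative and should be divisible by the number of subbands:

import torch

pqmf = PQMF(subbands=4)
x = torch.randn(1, 1, 8192)      # (B, 1, T)
bands = pqmf.analysis(x)         # (B, 4, T // 4) subband signals
x_hat = pqmf.synthesis(bands)    # (B, 1, T), near-perfect reconstruction of x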
sovits/vdecoder/parallel_wavegan/layers/residual_block.py ADDED
@@ -0,0 +1,129 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ """Residual block module in WaveNet.
4
+
5
+ This code is modified from https://github.com/r9y9/wavenet_vocoder.
6
+
7
+ """
8
+
9
+ import math
10
+
11
+ import torch
12
+ import torch.nn.functional as F
13
+
14
+
15
+ class Conv1d(torch.nn.Conv1d):
16
+ """Conv1d module with customized initialization."""
17
+
18
+ def __init__(self, *args, **kwargs):
19
+ """Initialize Conv1d module."""
20
+ super(Conv1d, self).__init__(*args, **kwargs)
21
+
22
+ def reset_parameters(self):
23
+ """Reset parameters."""
24
+ torch.nn.init.kaiming_normal_(self.weight, nonlinearity="relu")
25
+ if self.bias is not None:
26
+ torch.nn.init.constant_(self.bias, 0.0)
27
+
28
+
29
+ class Conv1d1x1(Conv1d):
30
+ """1x1 Conv1d with customized initialization."""
31
+
32
+ def __init__(self, in_channels, out_channels, bias):
33
+ """Initialize 1x1 Conv1d module."""
34
+ super(Conv1d1x1, self).__init__(in_channels, out_channels,
35
+ kernel_size=1, padding=0,
36
+ dilation=1, bias=bias)
37
+
38
+
39
+ class ResidualBlock(torch.nn.Module):
40
+ """Residual block module in WaveNet."""
41
+
42
+ def __init__(self,
43
+ kernel_size=3,
44
+ residual_channels=64,
45
+ gate_channels=128,
46
+ skip_channels=64,
47
+ aux_channels=80,
48
+ dropout=0.0,
49
+ dilation=1,
50
+ bias=True,
51
+ use_causal_conv=False
52
+ ):
53
+ """Initialize ResidualBlock module.
54
+
55
+ Args:
56
+ kernel_size (int): Kernel size of dilation convolution layer.
57
+ residual_channels (int): Number of channels for residual connection.
58
+ skip_channels (int): Number of channels for skip connection.
59
+ aux_channels (int): Local conditioning channels i.e. auxiliary input dimension.
60
+ dropout (float): Dropout probability.
61
+ dilation (int): Dilation factor.
62
+ bias (bool): Whether to add bias parameter in convolution layers.
63
+ use_causal_conv (bool): Whether to use use_causal_conv or non-use_causal_conv convolution.
64
+
65
+ """
66
+ super(ResidualBlock, self).__init__()
67
+ self.dropout = dropout
68
+ # no future time stamps available
69
+ if use_causal_conv:
70
+ padding = (kernel_size - 1) * dilation
71
+ else:
72
+ assert (kernel_size - 1) % 2 == 0, "Not support even number kernel size."
73
+ padding = (kernel_size - 1) // 2 * dilation
74
+ self.use_causal_conv = use_causal_conv
75
+
76
+ # dilation conv
77
+ self.conv = Conv1d(residual_channels, gate_channels, kernel_size,
78
+ padding=padding, dilation=dilation, bias=bias)
79
+
80
+ # local conditioning
81
+ if aux_channels > 0:
82
+ self.conv1x1_aux = Conv1d1x1(aux_channels, gate_channels, bias=False)
83
+ else:
84
+ self.conv1x1_aux = None
85
+
86
+ # conv output is split into two groups
87
+ gate_out_channels = gate_channels // 2
88
+ self.conv1x1_out = Conv1d1x1(gate_out_channels, residual_channels, bias=bias)
89
+ self.conv1x1_skip = Conv1d1x1(gate_out_channels, skip_channels, bias=bias)
90
+
91
+ def forward(self, x, c):
92
+ """Calculate forward propagation.
93
+
94
+ Args:
95
+ x (Tensor): Input tensor (B, residual_channels, T).
96
+ c (Tensor): Local conditioning auxiliary tensor (B, aux_channels, T).
97
+
98
+ Returns:
99
+ Tensor: Output tensor for residual connection (B, residual_channels, T).
100
+ Tensor: Output tensor for skip connection (B, skip_channels, T).
101
+
102
+ """
103
+ residual = x
104
+ x = F.dropout(x, p=self.dropout, training=self.training)
105
+ x = self.conv(x)
106
+
107
+ # remove future time steps if use_causal_conv conv
108
+ x = x[:, :, :residual.size(-1)] if self.use_causal_conv else x
109
+
110
+ # split into two part for gated activation
111
+ splitdim = 1
112
+ xa, xb = x.split(x.size(splitdim) // 2, dim=splitdim)
113
+
114
+ # local conditioning
115
+ if c is not None:
116
+ assert self.conv1x1_aux is not None
117
+ c = self.conv1x1_aux(c)
118
+ ca, cb = c.split(c.size(splitdim) // 2, dim=splitdim)
119
+ xa, xb = xa + ca, xb + cb
120
+
121
+ x = torch.tanh(xa) * torch.sigmoid(xb)
122
+
123
+ # for skip connection
124
+ s = self.conv1x1_skip(x)
125
+
126
+ # for residual connection
127
+ x = (self.conv1x1_out(x) + residual) * math.sqrt(0.5)
128
+
129
+ return x, s
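A minimal sketch of a single ResidualBlock call with local conditioning, using the default channel sizes; the conditioning tensor here stands in for upsampled mel features:

import torch

block = ResidualBlock()               # defaults: residual 64, gate 128, skip 64, aux 80 channels
x = torch.randn(2, 64, 200)           # (B, residual_channels, T)
c = torch.randn(2, 80, 200)           # (B, aux_channels, T) local conditioning
residual_out, skip_out = block(x, c)  # both (B, 64, T)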
sovits/vdecoder/parallel_wavegan/layers/residual_stack.py ADDED
@@ -0,0 +1,75 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ # Copyright 2020 Tomoki Hayashi
4
+ # MIT License (https://opensource.org/licenses/MIT)
5
+
6
+ """Residual stack module in MelGAN."""
7
+
8
+ import torch
9
+
10
+ from . import CausalConv1d
11
+
12
+
13
+ class ResidualStack(torch.nn.Module):
14
+ """Residual stack module introduced in MelGAN."""
15
+
16
+ def __init__(self,
17
+ kernel_size=3,
18
+ channels=32,
19
+ dilation=1,
20
+ bias=True,
21
+ nonlinear_activation="LeakyReLU",
22
+ nonlinear_activation_params={"negative_slope": 0.2},
23
+ pad="ReflectionPad1d",
24
+ pad_params={},
25
+ use_causal_conv=False,
26
+ ):
27
+ """Initialize ResidualStack module.
28
+
29
+ Args:
30
+ kernel_size (int): Kernel size of dilation convolution layer.
31
+ channels (int): Number of channels of convolution layers.
32
+ dilation (int): Dilation factor.
33
+ bias (bool): Whether to add bias parameter in convolution layers.
34
+ nonlinear_activation (str): Activation function module name.
35
+ nonlinear_activation_params (dict): Hyperparameters for activation function.
36
+ pad (str): Padding function module name before dilated convolution layer.
37
+ pad_params (dict): Hyperparameters for padding function.
38
+ use_causal_conv (bool): Whether to use causal convolution.
39
+
40
+ """
41
+ super(ResidualStack, self).__init__()
42
+
43
+ # define residual stack part
44
+ if not use_causal_conv:
45
+ assert (kernel_size - 1) % 2 == 0, "Not support even number kernel size."
46
+ self.stack = torch.nn.Sequential(
47
+ getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params),
48
+ getattr(torch.nn, pad)((kernel_size - 1) // 2 * dilation, **pad_params),
49
+ torch.nn.Conv1d(channels, channels, kernel_size, dilation=dilation, bias=bias),
50
+ getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params),
51
+ torch.nn.Conv1d(channels, channels, 1, bias=bias),
52
+ )
53
+ else:
54
+ self.stack = torch.nn.Sequential(
55
+ getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params),
56
+ CausalConv1d(channels, channels, kernel_size, dilation=dilation,
57
+ bias=bias, pad=pad, pad_params=pad_params),
58
+ getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params),
59
+ torch.nn.Conv1d(channels, channels, 1, bias=bias),
60
+ )
61
+
62
+ # define extra layer for skip connection
63
+ self.skip_layer = torch.nn.Conv1d(channels, channels, 1, bias=bias)
64
+
65
+ def forward(self, c):
66
+ """Calculate forward propagation.
67
+
68
+ Args:
69
+ c (Tensor): Input tensor (B, channels, T).
70
+
71
+ Returns:
72
+ Tensor: Output tensor (B, channels, T).
73
+
74
+ """
75
+ return self.stack(c) + self.skip_layer(c)
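A minimal sketch of a ResidualStack call; both the dilated convolution path and the 1x1 skip path preserve the (B, channels, T) shape:

import torch

stack = ResidualStack(kernel_size=3, channels=32, dilation=2)
c = torch.randn(2, 32, 100)   # (B, channels, T)
out = stack(c)                # (B, channels, T)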
sovits/vdecoder/parallel_wavegan/layers/tf_layers.py ADDED
@@ -0,0 +1,129 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ # Copyright 2020 MINH ANH (@dathudeptrai)
4
+ # MIT License (https://opensource.org/licenses/MIT)
5
+
6
+ """Tensorflow Layer modules complatible with pytorch."""
7
+
8
+ import tensorflow as tf
9
+
10
+
11
+ class TFReflectionPad1d(tf.keras.layers.Layer):
12
+ """Tensorflow ReflectionPad1d module."""
13
+
14
+ def __init__(self, padding_size):
15
+ """Initialize TFReflectionPad1d module.
16
+
17
+ Args:
18
+ padding_size (int): Padding size.
19
+
20
+ """
21
+ super(TFReflectionPad1d, self).__init__()
22
+ self.padding_size = padding_size
23
+
24
+ @tf.function
25
+ def call(self, x):
26
+ """Calculate forward propagation.
27
+
28
+ Args:
29
+ x (Tensor): Input tensor (B, T, 1, C).
30
+
31
+ Returns:
32
+ Tensor: Padded tensor (B, T + 2 * padding_size, 1, C).
33
+
34
+ """
35
+ return tf.pad(x, [[0, 0], [self.padding_size, self.padding_size], [0, 0], [0, 0]], "REFLECT")
36
+
37
+
38
+ class TFConvTranspose1d(tf.keras.layers.Layer):
39
+ """Tensorflow ConvTranspose1d module."""
40
+
41
+ def __init__(self, channels, kernel_size, stride, padding):
42
+ """Initialize TFConvTranspose1d( module.
43
+
44
+ Args:
45
+ channels (int): Number of channels.
46
+ kernel_size (int): kernel size.
47
+ stride (int): Stride width.
48
+ padding (str): Padding type ("same" or "valid").
49
+
50
+ """
51
+ super(TFConvTranspose1d, self).__init__()
52
+ self.conv1d_transpose = tf.keras.layers.Conv2DTranspose(
53
+ filters=channels,
54
+ kernel_size=(kernel_size, 1),
55
+ strides=(stride, 1),
56
+ padding=padding,
57
+ )
58
+
59
+ @tf.function
60
+ def call(self, x):
61
+ """Calculate forward propagation.
62
+
63
+ Args:
64
+ x (Tensor): Input tensor (B, T, 1, C).
65
+
66
+ Returns:
67
+ Tensors: Output tensor (B, T', 1, C').
68
+
69
+ """
70
+ x = self.conv1d_transpose(x)
71
+ return x
72
+
73
+
74
+ class TFResidualStack(tf.keras.layers.Layer):
75
+ """Tensorflow ResidualStack module."""
76
+
77
+ def __init__(self,
78
+ kernel_size,
79
+ channels,
80
+ dilation,
81
+ bias,
82
+ nonlinear_activation,
83
+ nonlinear_activation_params,
84
+ padding,
85
+ ):
86
+ """Initialize TFResidualStack module.
87
+
88
+ Args:
89
+ kernel_size (int): Kernel size.
90
+ channels (int): Number of channels.
91
+ dilation (int): Dilation factor.
92
+ bias (bool): Whether to add bias parameter in convolution layers.
93
+ nonlinear_activation (str): Activation function module name.
94
+ nonlinear_activation_params (dict): Hyperparameters for activation function.
95
+ padding (str): Padding type ("same" or "valid").
96
+
97
+ """
98
+ super(TFResidualStack, self).__init__()
99
+ self.block = [
100
+ getattr(tf.keras.layers, nonlinear_activation)(**nonlinear_activation_params),
101
+ TFReflectionPad1d(dilation),
102
+ tf.keras.layers.Conv2D(
103
+ filters=channels,
104
+ kernel_size=(kernel_size, 1),
105
+ dilation_rate=(dilation, 1),
106
+ use_bias=bias,
107
+ padding="valid",
108
+ ),
109
+ getattr(tf.keras.layers, nonlinear_activation)(**nonlinear_activation_params),
110
+ tf.keras.layers.Conv2D(filters=channels, kernel_size=1, use_bias=bias)
111
+ ]
112
+ self.shortcut = tf.keras.layers.Conv2D(filters=channels, kernel_size=1, use_bias=bias)
113
+
114
+ @tf.function
115
+ def call(self, x):
116
+ """Calculate forward propagation.
117
+
118
+ Args:
119
+ x (Tensor): Input tensor (B, T, 1, C).
120
+
121
+ Returns:
122
+ Tensor: Output tensor (B, T, 1, C).
123
+
124
+ """
125
+ _x = tf.identity(x)
126
+ for i, layer in enumerate(self.block):
127
+ _x = layer(_x)
128
+ shortcut = self.shortcut(x)
129
+ return shortcut + _x
sovits/vdecoder/parallel_wavegan/layers/upsample.py ADDED
@@ -0,0 +1,183 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ """Upsampling module.
4
+
5
+ This code is modified from https://github.com/r9y9/wavenet_vocoder.
6
+
7
+ """
8
+
9
+ import numpy as np
10
+ import torch
11
+ import torch.nn.functional as F
12
+
13
+ from . import Conv1d
14
+
15
+
16
+ class Stretch2d(torch.nn.Module):
17
+ """Stretch2d module."""
18
+
19
+ def __init__(self, x_scale, y_scale, mode="nearest"):
20
+ """Initialize Stretch2d module.
21
+
22
+ Args:
23
+ x_scale (int): X scaling factor (Time axis in spectrogram).
24
+ y_scale (int): Y scaling factor (Frequency axis in spectrogram).
25
+ mode (str): Interpolation mode.
26
+
27
+ """
28
+ super(Stretch2d, self).__init__()
29
+ self.x_scale = x_scale
30
+ self.y_scale = y_scale
31
+ self.mode = mode
32
+
33
+ def forward(self, x):
34
+ """Calculate forward propagation.
35
+
36
+ Args:
37
+ x (Tensor): Input tensor (B, C, F, T).
38
+
39
+ Returns:
40
+ Tensor: Interpolated tensor (B, C, F * y_scale, T * x_scale),
41
+
42
+ """
43
+ return F.interpolate(
44
+ x, scale_factor=(self.y_scale, self.x_scale), mode=self.mode)
45
+
46
+
47
+ class Conv2d(torch.nn.Conv2d):
48
+ """Conv2d module with customized initialization."""
49
+
50
+ def __init__(self, *args, **kwargs):
51
+ """Initialize Conv2d module."""
52
+ super(Conv2d, self).__init__(*args, **kwargs)
53
+
54
+ def reset_parameters(self):
55
+ """Reset parameters."""
56
+ self.weight.data.fill_(1. / np.prod(self.kernel_size))
57
+ if self.bias is not None:
58
+ torch.nn.init.constant_(self.bias, 0.0)
59
+
60
+
61
+ class UpsampleNetwork(torch.nn.Module):
62
+ """Upsampling network module."""
63
+
64
+ def __init__(self,
65
+ upsample_scales,
66
+ nonlinear_activation=None,
67
+ nonlinear_activation_params={},
68
+ interpolate_mode="nearest",
69
+ freq_axis_kernel_size=1,
70
+ use_causal_conv=False,
71
+ ):
72
+ """Initialize upsampling network module.
73
+
74
+ Args:
75
+ upsample_scales (list): List of upsampling scales.
76
+ nonlinear_activation (str): Activation function name.
77
+ nonlinear_activation_params (dict): Arguments for specified activation function.
78
+ interpolate_mode (str): Interpolation mode.
79
+ freq_axis_kernel_size (int): Kernel size in the direction of frequency axis.
80
+
81
+ """
82
+ super(UpsampleNetwork, self).__init__()
83
+ self.use_causal_conv = use_causal_conv
84
+ self.up_layers = torch.nn.ModuleList()
85
+ for scale in upsample_scales:
86
+ # interpolation layer
87
+ stretch = Stretch2d(scale, 1, interpolate_mode)
88
+ self.up_layers += [stretch]
89
+
90
+ # conv layer
91
+ assert (freq_axis_kernel_size - 1) % 2 == 0, "Not support even number freq axis kernel size."
92
+ freq_axis_padding = (freq_axis_kernel_size - 1) // 2
93
+ kernel_size = (freq_axis_kernel_size, scale * 2 + 1)
94
+ if use_causal_conv:
95
+ padding = (freq_axis_padding, scale * 2)
96
+ else:
97
+ padding = (freq_axis_padding, scale)
98
+ conv = Conv2d(1, 1, kernel_size=kernel_size, padding=padding, bias=False)
99
+ self.up_layers += [conv]
100
+
101
+ # nonlinear
102
+ if nonlinear_activation is not None:
103
+ nonlinear = getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params)
104
+ self.up_layers += [nonlinear]
105
+
106
+ def forward(self, c):
107
+ """Calculate forward propagation.
108
+
109
+ Args:
110
+ c : Input tensor (B, C, T).
111
+
112
+ Returns:
113
+ Tensor: Upsampled tensor (B, C, T'), where T' = T * prod(upsample_scales).
114
+
115
+ """
116
+ c = c.unsqueeze(1) # (B, 1, C, T)
117
+ for f in self.up_layers:
118
+ if self.use_causal_conv and isinstance(f, Conv2d):
119
+ c = f(c)[..., :c.size(-1)]
120
+ else:
121
+ c = f(c)
122
+ return c.squeeze(1) # (B, C, T')
123
+
124
+
125
+ class ConvInUpsampleNetwork(torch.nn.Module):
126
+ """Convolution + upsampling network module."""
127
+
128
+ def __init__(self,
129
+ upsample_scales,
130
+ nonlinear_activation=None,
131
+ nonlinear_activation_params={},
132
+ interpolate_mode="nearest",
133
+ freq_axis_kernel_size=1,
134
+ aux_channels=80,
135
+ aux_context_window=0,
136
+ use_causal_conv=False
137
+ ):
138
+ """Initialize convolution + upsampling network module.
139
+
140
+ Args:
141
+ upsample_scales (list): List of upsampling scales.
142
+ nonlinear_activation (str): Activation function name.
143
+ nonlinear_activation_params (dict): Arguments for specified activation function.
144
+ interpolate_mode (str): Interpolation mode.
145
+ freq_axis_kernel_size (int): Kernel size in the direction of frequency axis.
146
+ aux_channels (int): Number of channels of pre-convolutional layer.
147
+ aux_context_window (int): Context window size of the pre-convolutional layer.
148
+ use_causal_conv (bool): Whether to use causal structure.
149
+
150
+ """
151
+ super(ConvInUpsampleNetwork, self).__init__()
152
+ self.aux_context_window = aux_context_window
153
+ self.use_causal_conv = use_causal_conv and aux_context_window > 0
154
+ # To capture wide-context information in conditional features
155
+ kernel_size = aux_context_window + 1 if use_causal_conv else 2 * aux_context_window + 1
156
+ # NOTE(kan-bayashi): Here do not use padding because the input is already padded
157
+ self.conv_in = Conv1d(aux_channels, aux_channels, kernel_size=kernel_size, bias=False)
158
+ self.upsample = UpsampleNetwork(
159
+ upsample_scales=upsample_scales,
160
+ nonlinear_activation=nonlinear_activation,
161
+ nonlinear_activation_params=nonlinear_activation_params,
162
+ interpolate_mode=interpolate_mode,
163
+ freq_axis_kernel_size=freq_axis_kernel_size,
164
+ use_causal_conv=use_causal_conv,
165
+ )
166
+
167
+ def forward(self, c):
168
+ """Calculate forward propagation.
169
+
170
+ Args:
171
+ c : Input tensor (B, C, T').
172
+
173
+ Returns:
174
+ Tensor: Upsampled tensor (B, C, T),
175
+ where T = (T' - aux_context_window * 2) * prod(upsample_scales).
176
+
177
+ Note:
178
+ The length of inputs considers the context window size.
179
+
180
+ """
181
+ c_ = self.conv_in(c)
182
+ c = c_[:, :, :-self.aux_context_window] if self.use_causal_conv else c_
183
+ return self.upsample(c)
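A minimal sketch of ConvInUpsampleNetwork, illustrating the length relation T = (T' - 2 * aux_context_window) * prod(upsample_scales) stated in the docstring:

import torch

net = ConvInUpsampleNetwork(upsample_scales=[4, 4, 4, 4],
                            aux_channels=80, aux_context_window=2)
c = torch.randn(1, 80, 54)    # (B, aux_channels, T') including 2 * 2 context frames
c_up = net(c)                 # (B, 80, (54 - 4) * 256) == (1, 80, 12800)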
sovits/vdecoder/parallel_wavegan/losses/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from .stft_loss import * # NOQA
sovits/vdecoder/parallel_wavegan/losses/stft_loss.py ADDED
@@ -0,0 +1,153 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ # Copyright 2019 Tomoki Hayashi
4
+ # MIT License (https://opensource.org/licenses/MIT)
5
+
6
+ """STFT-based Loss modules."""
7
+
8
+ import torch
9
+ import torch.nn.functional as F
10
+
11
+
12
+ def stft(x, fft_size, hop_size, win_length, window):
13
+ """Perform STFT and convert to magnitude spectrogram.
14
+
15
+ Args:
16
+ x (Tensor): Input signal tensor (B, T).
17
+ fft_size (int): FFT size.
18
+ hop_size (int): Hop size.
19
+ win_length (int): Window length.
20
+ window (str): Window function type.
21
+
22
+ Returns:
23
+ Tensor: Magnitude spectrogram (B, #frames, fft_size // 2 + 1).
24
+
25
+ """
26
+ x_stft = torch.stft(x, fft_size, hop_size, win_length, window)
27
+ real = x_stft[..., 0]
28
+ imag = x_stft[..., 1]
29
+
30
+ # NOTE(kan-bayashi): clamp is needed to avoid nan or inf
31
+ return torch.sqrt(torch.clamp(real ** 2 + imag ** 2, min=1e-7)).transpose(2, 1)
32
+
33
+
34
+ class SpectralConvergengeLoss(torch.nn.Module):
35
+ """Spectral convergence loss module."""
36
+
37
+ def __init__(self):
38
+ """Initilize spectral convergence loss module."""
39
+ super(SpectralConvergengeLoss, self).__init__()
40
+
41
+ def forward(self, x_mag, y_mag):
42
+ """Calculate forward propagation.
43
+
44
+ Args:
45
+ x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins).
46
+ y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins).
47
+
48
+ Returns:
49
+ Tensor: Spectral convergence loss value.
50
+
51
+ """
52
+ return torch.norm(y_mag - x_mag, p="fro") / torch.norm(y_mag, p="fro")
53
+
54
+
55
+ class LogSTFTMagnitudeLoss(torch.nn.Module):
56
+ """Log STFT magnitude loss module."""
57
+
58
+ def __init__(self):
59
+ """Initilize los STFT magnitude loss module."""
60
+ super(LogSTFTMagnitudeLoss, self).__init__()
61
+
62
+ def forward(self, x_mag, y_mag):
63
+ """Calculate forward propagation.
64
+
65
+ Args:
66
+ x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins).
67
+ y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins).
68
+
69
+ Returns:
70
+ Tensor: Log STFT magnitude loss value.
71
+
72
+ """
73
+ return F.l1_loss(torch.log(y_mag), torch.log(x_mag))
74
+
75
+
76
+ class STFTLoss(torch.nn.Module):
77
+ """STFT loss module."""
78
+
79
+ def __init__(self, fft_size=1024, shift_size=120, win_length=600, window="hann_window"):
80
+ """Initialize STFT loss module."""
81
+ super(STFTLoss, self).__init__()
82
+ self.fft_size = fft_size
83
+ self.shift_size = shift_size
84
+ self.win_length = win_length
85
+ self.window = getattr(torch, window)(win_length)
86
+ self.spectral_convergenge_loss = SpectralConvergengeLoss()
87
+ self.log_stft_magnitude_loss = LogSTFTMagnitudeLoss()
88
+
89
+ def forward(self, x, y):
90
+ """Calculate forward propagation.
91
+
92
+ Args:
93
+ x (Tensor): Predicted signal (B, T).
94
+ y (Tensor): Groundtruth signal (B, T).
95
+
96
+ Returns:
97
+ Tensor: Spectral convergence loss value.
98
+ Tensor: Log STFT magnitude loss value.
99
+
100
+ """
101
+ x_mag = stft(x, self.fft_size, self.shift_size, self.win_length, self.window)
102
+ y_mag = stft(y, self.fft_size, self.shift_size, self.win_length, self.window)
103
+ sc_loss = self.spectral_convergenge_loss(x_mag, y_mag)
104
+ mag_loss = self.log_stft_magnitude_loss(x_mag, y_mag)
105
+
106
+ return sc_loss, mag_loss
107
+
108
+
109
+ class MultiResolutionSTFTLoss(torch.nn.Module):
110
+ """Multi resolution STFT loss module."""
111
+
112
+ def __init__(self,
113
+ fft_sizes=[1024, 2048, 512],
114
+ hop_sizes=[120, 240, 50],
115
+ win_lengths=[600, 1200, 240],
116
+ window="hann_window"):
117
+ """Initialize Multi resolution STFT loss module.
118
+
119
+ Args:
120
+ fft_sizes (list): List of FFT sizes.
121
+ hop_sizes (list): List of hop sizes.
122
+ win_lengths (list): List of window lengths.
123
+ window (str): Window function type.
124
+
125
+ """
126
+ super(MultiResolutionSTFTLoss, self).__init__()
127
+ assert len(fft_sizes) == len(hop_sizes) == len(win_lengths)
128
+ self.stft_losses = torch.nn.ModuleList()
129
+ for fs, ss, wl in zip(fft_sizes, hop_sizes, win_lengths):
130
+ self.stft_losses += [STFTLoss(fs, ss, wl, window)]
131
+
132
+ def forward(self, x, y):
133
+ """Calculate forward propagation.
134
+
135
+ Args:
136
+ x (Tensor): Predicted signal (B, T).
137
+ y (Tensor): Groundtruth signal (B, T).
138
+
139
+ Returns:
140
+ Tensor: Multi resolution spectral convergence loss value.
141
+ Tensor: Multi resolution log STFT magnitude loss value.
142
+
143
+ """
144
+ sc_loss = 0.0
145
+ mag_loss = 0.0
146
+ for f in self.stft_losses:
147
+ sc_l, mag_l = f(x, y)
148
+ sc_loss += sc_l
149
+ mag_loss += mag_l
150
+ sc_loss /= len(self.stft_losses)
151
+ mag_loss /= len(self.stft_losses)
152
+
153
+ return sc_loss, mag_loss
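A minimal CPU sketch of the multi-resolution STFT loss above, assuming a torch version (as pinned by this repo) where torch.stft still returns a real-valued tensor by default:

import torch

criterion = MultiResolutionSTFTLoss()     # default resolutions: 1024/2048/512-point FFTs
y_hat = torch.randn(4, 16000)              # predicted waveform (B, T)
y = torch.randn(4, 16000)                  # ground-truth waveform (B, T)
sc_loss, mag_loss = criterion(y_hat, y)    # each averaged over the three resolutions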
sovits/vdecoder/parallel_wavegan/models/__init__.py ADDED
@@ -0,0 +1,2 @@
1
+ from .melgan import * # NOQA
2
+ from .parallel_wavegan import * # NOQA
sovits/vdecoder/parallel_wavegan/models/melgan.py ADDED
@@ -0,0 +1,427 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ # Copyright 2020 Tomoki Hayashi
4
+ # MIT License (https://opensource.org/licenses/MIT)
5
+
6
+ """MelGAN Modules."""
7
+
8
+ import logging
9
+
10
+ import numpy as np
11
+ import torch
12
+
13
+ from sovits.vdecoder.parallel_wavegan.layers import CausalConv1d
14
+ from sovits.vdecoder.parallel_wavegan.layers import CausalConvTranspose1d
15
+ from sovits.vdecoder.parallel_wavegan.layers import ResidualStack
16
+
17
+
18
+ class MelGANGenerator(torch.nn.Module):
19
+ """MelGAN generator module."""
20
+
21
+ def __init__(self,
22
+ in_channels=80,
23
+ out_channels=1,
24
+ kernel_size=7,
25
+ channels=512,
26
+ bias=True,
27
+ upsample_scales=[8, 8, 2, 2],
28
+ stack_kernel_size=3,
29
+ stacks=3,
30
+ nonlinear_activation="LeakyReLU",
31
+ nonlinear_activation_params={"negative_slope": 0.2},
32
+ pad="ReflectionPad1d",
33
+ pad_params={},
34
+ use_final_nonlinear_activation=True,
35
+ use_weight_norm=True,
36
+ use_causal_conv=False,
37
+ ):
38
+ """Initialize MelGANGenerator module.
39
+
40
+ Args:
41
+ in_channels (int): Number of input channels.
42
+ out_channels (int): Number of output channels.
43
+ kernel_size (int): Kernel size of initial and final conv layer.
44
+ channels (int): Initial number of channels for conv layer.
45
+ bias (bool): Whether to add bias parameter in convolution layers.
46
+ upsample_scales (list): List of upsampling scales.
47
+ stack_kernel_size (int): Kernel size of dilated conv layers in residual stack.
48
+ stacks (int): Number of stacks in a single residual stack.
49
+ nonlinear_activation (str): Activation function module name.
50
+ nonlinear_activation_params (dict): Hyperparameters for activation function.
51
+ pad (str): Padding function module name before dilated convolution layer.
52
+ pad_params (dict): Hyperparameters for padding function.
53
+ use_final_nonlinear_activation (torch.nn.Module): Activation function for the final layer.
54
+ use_weight_norm (bool): Whether to use weight norm.
55
+ If set to true, it will be applied to all of the conv layers.
56
+ use_causal_conv (bool): Whether to use causal convolution.
57
+
58
+ """
59
+ super(MelGANGenerator, self).__init__()
60
+
61
+ # check hyper parameters is valid
62
+ assert channels >= np.prod(upsample_scales)
63
+ assert channels % (2 ** len(upsample_scales)) == 0
64
+ if not use_causal_conv:
65
+ assert (kernel_size - 1) % 2 == 0, "Not support even number kernel size."
66
+
67
+ # add initial layer
68
+ layers = []
69
+ if not use_causal_conv:
70
+ layers += [
71
+ getattr(torch.nn, pad)((kernel_size - 1) // 2, **pad_params),
72
+ torch.nn.Conv1d(in_channels, channels, kernel_size, bias=bias),
73
+ ]
74
+ else:
75
+ layers += [
76
+ CausalConv1d(in_channels, channels, kernel_size,
77
+ bias=bias, pad=pad, pad_params=pad_params),
78
+ ]
79
+
80
+ for i, upsample_scale in enumerate(upsample_scales):
81
+ # add upsampling layer
82
+ layers += [getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params)]
83
+ if not use_causal_conv:
84
+ layers += [
85
+ torch.nn.ConvTranspose1d(
86
+ channels // (2 ** i),
87
+ channels // (2 ** (i + 1)),
88
+ upsample_scale * 2,
89
+ stride=upsample_scale,
90
+ padding=upsample_scale // 2 + upsample_scale % 2,
91
+ output_padding=upsample_scale % 2,
92
+ bias=bias,
93
+ )
94
+ ]
95
+ else:
96
+ layers += [
97
+ CausalConvTranspose1d(
98
+ channels // (2 ** i),
99
+ channels // (2 ** (i + 1)),
100
+ upsample_scale * 2,
101
+ stride=upsample_scale,
102
+ bias=bias,
103
+ )
104
+ ]
105
+
106
+ # add residual stack
107
+ for j in range(stacks):
108
+ layers += [
109
+ ResidualStack(
110
+ kernel_size=stack_kernel_size,
111
+ channels=channels // (2 ** (i + 1)),
112
+ dilation=stack_kernel_size ** j,
113
+ bias=bias,
114
+ nonlinear_activation=nonlinear_activation,
115
+ nonlinear_activation_params=nonlinear_activation_params,
116
+ pad=pad,
117
+ pad_params=pad_params,
118
+ use_causal_conv=use_causal_conv,
119
+ )
120
+ ]
121
+
122
+ # add final layer
123
+ layers += [getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params)]
124
+ if not use_causal_conv:
125
+ layers += [
126
+ getattr(torch.nn, pad)((kernel_size - 1) // 2, **pad_params),
127
+ torch.nn.Conv1d(channels // (2 ** (i + 1)), out_channels, kernel_size, bias=bias),
128
+ ]
129
+ else:
130
+ layers += [
131
+ CausalConv1d(channels // (2 ** (i + 1)), out_channels, kernel_size,
132
+ bias=bias, pad=pad, pad_params=pad_params),
133
+ ]
134
+ if use_final_nonlinear_activation:
135
+ layers += [torch.nn.Tanh()]
136
+
137
+ # define the model as a single function
138
+ self.melgan = torch.nn.Sequential(*layers)
139
+
140
+ # apply weight norm
141
+ if use_weight_norm:
142
+ self.apply_weight_norm()
143
+
144
+ # reset parameters
145
+ self.reset_parameters()
146
+
147
+ def forward(self, c):
148
+ """Calculate forward propagation.
149
+
150
+ Args:
151
+ c (Tensor): Input tensor (B, channels, T).
152
+
153
+ Returns:
154
+ Tensor: Output tensor (B, 1, T ** prod(upsample_scales)).
155
+
156
+ """
157
+ return self.melgan(c)
158
+
159
+ def remove_weight_norm(self):
160
+ """Remove weight normalization module from all of the layers."""
161
+ def _remove_weight_norm(m):
162
+ try:
163
+ logging.debug(f"Weight norm is removed from {m}.")
164
+ torch.nn.utils.remove_weight_norm(m)
165
+ except ValueError: # this module didn't have weight norm
166
+ return
167
+
168
+ self.apply(_remove_weight_norm)
169
+
170
+ def apply_weight_norm(self):
171
+ """Apply weight normalization module from all of the layers."""
172
+ def _apply_weight_norm(m):
173
+ if isinstance(m, torch.nn.Conv1d) or isinstance(m, torch.nn.ConvTranspose1d):
174
+ torch.nn.utils.weight_norm(m)
175
+ logging.debug(f"Weight norm is applied to {m}.")
176
+
177
+ self.apply(_apply_weight_norm)
178
+
179
+ def reset_parameters(self):
180
+ """Reset parameters.
181
+
182
+ This initialization follows official implementation manner.
183
+ https://github.com/descriptinc/melgan-neurips/blob/master/spec2wav/modules.py
184
+
185
+ """
186
+ def _reset_parameters(m):
187
+ if isinstance(m, torch.nn.Conv1d) or isinstance(m, torch.nn.ConvTranspose1d):
188
+ m.weight.data.normal_(0.0, 0.02)
189
+ logging.debug(f"Reset parameters in {m}.")
190
+
191
+ self.apply(_reset_parameters)
192
+
193
+
194
+ class MelGANDiscriminator(torch.nn.Module):
195
+ """MelGAN discriminator module."""
196
+
197
+ def __init__(self,
198
+ in_channels=1,
199
+ out_channels=1,
200
+ kernel_sizes=[5, 3],
201
+ channels=16,
202
+ max_downsample_channels=1024,
203
+ bias=True,
204
+ downsample_scales=[4, 4, 4, 4],
205
+ nonlinear_activation="LeakyReLU",
206
+ nonlinear_activation_params={"negative_slope": 0.2},
207
+ pad="ReflectionPad1d",
208
+ pad_params={},
209
+ ):
210
+ """Initilize MelGAN discriminator module.
211
+
212
+ Args:
213
+ in_channels (int): Number of input channels.
214
+ out_channels (int): Number of output channels.
215
+ kernel_sizes (list): List of two kernel sizes. The prod will be used for the first conv layer,
216
+ and the first and the second kernel sizes will be used for the last two layers.
217
+ For example if kernel_sizes = [5, 3], the first layer kernel size will be 5 * 3 = 15,
218
+ the last two layers' kernel size will be 5 and 3, respectively.
219
+ channels (int): Initial number of channels for conv layer.
220
+ max_downsample_channels (int): Maximum number of channels for downsampling layers.
221
+ bias (bool): Whether to add bias parameter in convolution layers.
222
+ downsample_scales (list): List of downsampling scales.
223
+ nonlinear_activation (str): Activation function module name.
224
+ nonlinear_activation_params (dict): Hyperparameters for activation function.
225
+ pad (str): Padding function module name before dilated convolution layer.
226
+ pad_params (dict): Hyperparameters for padding function.
227
+
228
+ """
229
+ super(MelGANDiscriminator, self).__init__()
230
+ self.layers = torch.nn.ModuleList()
231
+
232
+ # check kernel size is valid
233
+ assert len(kernel_sizes) == 2
234
+ assert kernel_sizes[0] % 2 == 1
235
+ assert kernel_sizes[1] % 2 == 1
236
+
237
+ # add first layer
238
+ self.layers += [
239
+ torch.nn.Sequential(
240
+ getattr(torch.nn, pad)((np.prod(kernel_sizes) - 1) // 2, **pad_params),
241
+ torch.nn.Conv1d(in_channels, channels, np.prod(kernel_sizes), bias=bias),
242
+ getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params),
243
+ )
244
+ ]
245
+
246
+ # add downsample layers
247
+ in_chs = channels
248
+ for downsample_scale in downsample_scales:
249
+ out_chs = min(in_chs * downsample_scale, max_downsample_channels)
250
+ self.layers += [
251
+ torch.nn.Sequential(
252
+ torch.nn.Conv1d(
253
+ in_chs, out_chs,
254
+ kernel_size=downsample_scale * 10 + 1,
255
+ stride=downsample_scale,
256
+ padding=downsample_scale * 5,
257
+ groups=in_chs // 4,
258
+ bias=bias,
259
+ ),
260
+ getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params),
261
+ )
262
+ ]
263
+ in_chs = out_chs
264
+
265
+ # add final layers
266
+ out_chs = min(in_chs * 2, max_downsample_channels)
267
+ self.layers += [
268
+ torch.nn.Sequential(
269
+ torch.nn.Conv1d(
270
+ in_chs, out_chs, kernel_sizes[0],
271
+ padding=(kernel_sizes[0] - 1) // 2,
272
+ bias=bias,
273
+ ),
274
+ getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params),
275
+ )
276
+ ]
277
+ self.layers += [
278
+ torch.nn.Conv1d(
279
+ out_chs, out_channels, kernel_sizes[1],
280
+ padding=(kernel_sizes[1] - 1) // 2,
281
+ bias=bias,
282
+ ),
283
+ ]
284
+
285
+ def forward(self, x):
286
+ """Calculate forward propagation.
287
+
288
+ Args:
289
+ x (Tensor): Input noise signal (B, 1, T).
290
+
291
+ Returns:
292
+ List: List of output tensors of each layer.
293
+
294
+ """
295
+ outs = []
296
+ for f in self.layers:
297
+ x = f(x)
298
+ outs += [x]
299
+
300
+ return outs
301
+
302
+
303
+ class MelGANMultiScaleDiscriminator(torch.nn.Module):
304
+ """MelGAN multi-scale discriminator module."""
305
+
306
+ def __init__(self,
307
+ in_channels=1,
308
+ out_channels=1,
309
+ scales=3,
310
+ downsample_pooling="AvgPool1d",
311
+ # follow the official implementation setting
312
+ downsample_pooling_params={
313
+ "kernel_size": 4,
314
+ "stride": 2,
315
+ "padding": 1,
316
+ "count_include_pad": False,
317
+ },
318
+ kernel_sizes=[5, 3],
319
+ channels=16,
320
+ max_downsample_channels=1024,
321
+ bias=True,
322
+ downsample_scales=[4, 4, 4, 4],
323
+ nonlinear_activation="LeakyReLU",
324
+ nonlinear_activation_params={"negative_slope": 0.2},
325
+ pad="ReflectionPad1d",
326
+ pad_params={},
327
+ use_weight_norm=True,
328
+ ):
329
+ """Initilize MelGAN multi-scale discriminator module.
330
+
331
+ Args:
332
+ in_channels (int): Number of input channels.
333
+ out_channels (int): Number of output channels.
334
+ downsample_pooling (str): Pooling module name for downsampling of the inputs.
335
+ downsample_pooling_params (dict): Parameters for the above pooling module.
336
+ kernel_sizes (list): List of two kernel sizes. The sum will be used for the first conv layer,
337
+ and the first and the second kernel sizes will be used for the last two layers.
338
+ channels (int): Initial number of channels for conv layer.
339
+ max_downsample_channels (int): Maximum number of channels for downsampling layers.
340
+ bias (bool): Whether to add bias parameter in convolution layers.
341
+ downsample_scales (list): List of downsampling scales.
342
+ nonlinear_activation (str): Activation function module name.
343
+ nonlinear_activation_params (dict): Hyperparameters for activation function.
344
+ pad (str): Padding function module name before dilated convolution layer.
345
+ pad_params (dict): Hyperparameters for padding function.
346
+ use_causal_conv (bool): Whether to use causal convolution.
347
+
348
+ """
349
+ super(MelGANMultiScaleDiscriminator, self).__init__()
350
+ self.discriminators = torch.nn.ModuleList()
351
+
352
+ # add discriminators
353
+ for _ in range(scales):
354
+ self.discriminators += [
355
+ MelGANDiscriminator(
356
+ in_channels=in_channels,
357
+ out_channels=out_channels,
358
+ kernel_sizes=kernel_sizes,
359
+ channels=channels,
360
+ max_downsample_channels=max_downsample_channels,
361
+ bias=bias,
362
+ downsample_scales=downsample_scales,
363
+ nonlinear_activation=nonlinear_activation,
364
+ nonlinear_activation_params=nonlinear_activation_params,
365
+ pad=pad,
366
+ pad_params=pad_params,
367
+ )
368
+ ]
369
+ self.pooling = getattr(torch.nn, downsample_pooling)(**downsample_pooling_params)
370
+
371
+ # apply weight norm
372
+ if use_weight_norm:
373
+ self.apply_weight_norm()
374
+
375
+ # reset parameters
376
+ self.reset_parameters()
377
+
378
+ def forward(self, x):
379
+ """Calculate forward propagation.
380
+
381
+ Args:
382
+ x (Tensor): Input noise signal (B, 1, T).
383
+
384
+ Returns:
385
+ List: List of list of each discriminator outputs, which consists of each layer output tensors.
386
+
387
+ """
388
+ outs = []
389
+ for f in self.discriminators:
390
+ outs += [f(x)]
391
+ x = self.pooling(x)
392
+
393
+ return outs
394
+
395
+ def remove_weight_norm(self):
396
+ """Remove weight normalization module from all of the layers."""
397
+ def _remove_weight_norm(m):
398
+ try:
399
+ logging.debug(f"Weight norm is removed from {m}.")
400
+ torch.nn.utils.remove_weight_norm(m)
401
+ except ValueError: # this module didn't have weight norm
402
+ return
403
+
404
+ self.apply(_remove_weight_norm)
405
+
406
+ def apply_weight_norm(self):
407
+ """Apply weight normalization module from all of the layers."""
408
+ def _apply_weight_norm(m):
409
+ if isinstance(m, torch.nn.Conv1d) or isinstance(m, torch.nn.ConvTranspose1d):
410
+ torch.nn.utils.weight_norm(m)
411
+ logging.debug(f"Weight norm is applied to {m}.")
412
+
413
+ self.apply(_apply_weight_norm)
414
+
415
+ def reset_parameters(self):
416
+ """Reset parameters.
417
+
418
+ This initialization follows official implementation manner.
419
+ https://github.com/descriptinc/melgan-neurips/blob/master/spec2wav/modules.py
420
+
421
+ """
422
+ def _reset_parameters(m):
423
+ if isinstance(m, torch.nn.Conv1d) or isinstance(m, torch.nn.ConvTranspose1d):
424
+ m.weight.data.normal_(0.0, 0.02)
425
+ logging.debug(f"Reset parameters in {m}.")
426
+
427
+ self.apply(_reset_parameters)
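A minimal sketch pairing the MelGAN generator and multi-scale discriminator defined above; shapes follow the defaults, where each mel frame maps to prod(upsample_scales) = 256 output samples:

import torch

generator = MelGANGenerator(in_channels=80, upsample_scales=[8, 8, 2, 2])
discriminator = MelGANMultiScaleDiscriminator()

c = torch.randn(1, 80, 50)     # (B, in_channels, T) mel frames
wav = generator(c)             # (B, 1, 50 * 256) waveform
outs = discriminator(wav)      # list of 3 scales, each a list of per-layer feature maps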
sovits/vdecoder/parallel_wavegan/models/parallel_wavegan.py ADDED
@@ -0,0 +1,434 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ # Copyright 2019 Tomoki Hayashi
4
+ # MIT License (https://opensource.org/licenses/MIT)
5
+
6
+ """Parallel WaveGAN Modules."""
7
+
8
+ import logging
9
+ import math
10
+
11
+ import torch
12
+ from torch import nn
13
+
14
+ from sovits.vdecoder.parallel_wavegan.layers import Conv1d
15
+ from sovits.vdecoder.parallel_wavegan.layers import Conv1d1x1
16
+ from sovits.vdecoder.parallel_wavegan.layers import ResidualBlock
17
+ from sovits.vdecoder.parallel_wavegan.layers import upsample
18
+ from sovits.vdecoder.parallel_wavegan import models
19
+
20
+
21
+ class ParallelWaveGANGenerator(torch.nn.Module):
22
+ """Parallel WaveGAN Generator module."""
23
+
24
+ def __init__(self,
25
+ in_channels=1,
26
+ out_channels=1,
27
+ kernel_size=3,
28
+ layers=30,
29
+ stacks=3,
30
+ residual_channels=64,
31
+ gate_channels=128,
32
+ skip_channels=64,
33
+ aux_channels=80,
34
+ aux_context_window=2,
35
+ dropout=0.0,
36
+ bias=True,
37
+ use_weight_norm=True,
38
+ use_causal_conv=False,
39
+ upsample_conditional_features=True,
40
+ upsample_net="ConvInUpsampleNetwork",
41
+ upsample_params={"upsample_scales": [4, 4, 4, 4]},
42
+ use_pitch_embed=False,
43
+ ):
44
+ """Initialize Parallel WaveGAN Generator module.
45
+
46
+ Args:
47
+ in_channels (int): Number of input channels.
48
+ out_channels (int): Number of output channels.
49
+ kernel_size (int): Kernel size of dilated convolution.
50
+ layers (int): Number of residual block layers.
51
+ stacks (int): Number of stacks i.e., dilation cycles.
52
+ residual_channels (int): Number of channels in residual conv.
53
+ gate_channels (int): Number of channels in gated conv.
54
+ skip_channels (int): Number of channels in skip conv.
55
+ aux_channels (int): Number of channels for auxiliary feature conv.
56
+ aux_context_window (int): Context window size for auxiliary feature.
57
+ dropout (float): Dropout rate. 0.0 means no dropout applied.
58
+ bias (bool): Whether to use bias parameter in conv layer.
59
+ use_weight_norm (bool): Whether to use weight norm.
60
+ If set to true, it will be applied to all of the conv layers.
61
+ use_causal_conv (bool): Whether to use causal structure.
62
+ upsample_conditional_features (bool): Whether to use upsampling network.
63
+ upsample_net (str): Upsampling network architecture.
64
+ upsample_params (dict): Upsampling network parameters.
65
+
66
+ """
67
+ super(ParallelWaveGANGenerator, self).__init__()
68
+ self.in_channels = in_channels
69
+ self.out_channels = out_channels
70
+ self.aux_channels = aux_channels
71
+ self.layers = layers
72
+ self.stacks = stacks
73
+ self.kernel_size = kernel_size
74
+
75
+ # check the number of layers and stacks
76
+ assert layers % stacks == 0
77
+ layers_per_stack = layers // stacks
78
+
79
+ # define first convolution
80
+ self.first_conv = Conv1d1x1(in_channels, residual_channels, bias=True)
81
+
82
+ # define conv + upsampling network
83
+ if upsample_conditional_features:
84
+ upsample_params.update({
85
+ "use_causal_conv": use_causal_conv,
86
+ })
87
+ if upsample_net == "MelGANGenerator":
88
+ assert aux_context_window == 0
89
+ upsample_params.update({
90
+ "use_weight_norm": False, # not to apply twice
91
+ "use_final_nonlinear_activation": False,
92
+ })
93
+ self.upsample_net = getattr(models, upsample_net)(**upsample_params)
94
+ else:
95
+ if upsample_net == "ConvInUpsampleNetwork":
96
+ upsample_params.update({
97
+ "aux_channels": aux_channels,
98
+ "aux_context_window": aux_context_window,
99
+ })
100
+ self.upsample_net = getattr(upsample, upsample_net)(**upsample_params)
101
+ else:
102
+ self.upsample_net = None
103
+
104
+ # define residual blocks
105
+ self.conv_layers = torch.nn.ModuleList()
106
+ for layer in range(layers):
107
+ dilation = 2 ** (layer % layers_per_stack)
108
+ conv = ResidualBlock(
109
+ kernel_size=kernel_size,
110
+ residual_channels=residual_channels,
111
+ gate_channels=gate_channels,
112
+ skip_channels=skip_channels,
113
+ aux_channels=aux_channels,
114
+ dilation=dilation,
115
+ dropout=dropout,
116
+ bias=bias,
117
+ use_causal_conv=use_causal_conv,
118
+ )
119
+ self.conv_layers += [conv]
120
+
121
+ # define output layers
122
+ self.last_conv_layers = torch.nn.ModuleList([
123
+ torch.nn.ReLU(inplace=True),
124
+ Conv1d1x1(skip_channels, skip_channels, bias=True),
125
+ torch.nn.ReLU(inplace=True),
126
+ Conv1d1x1(skip_channels, out_channels, bias=True),
127
+ ])
128
+
129
+ self.use_pitch_embed = use_pitch_embed
130
+ if use_pitch_embed:
131
+ self.pitch_embed = nn.Embedding(300, aux_channels, 0)
132
+ self.c_proj = nn.Linear(2 * aux_channels, aux_channels)
133
+
134
+ # apply weight norm
135
+ if use_weight_norm:
136
+ self.apply_weight_norm()
137
+
138
+ def forward(self, x, c=None, pitch=None, **kwargs):
139
+ """Calculate forward propagation.
140
+
141
+ Args:
142
+ x (Tensor): Input noise signal (B, C_in, T).
143
+ c (Tensor): Local conditioning auxiliary features (B, C ,T').
144
+ pitch (Tensor): Local conditioning pitch (B, T').
145
+
146
+ Returns:
147
+ Tensor: Output tensor (B, C_out, T)
148
+
149
+ """
150
+ # perform upsampling
151
+ if c is not None and self.upsample_net is not None:
152
+ if self.use_pitch_embed:
153
+ p = self.pitch_embed(pitch)
154
+ c = self.c_proj(torch.cat([c.transpose(1, 2), p], -1)).transpose(1, 2)
155
+ c = self.upsample_net(c)
156
+ assert c.size(-1) == x.size(-1), (c.size(-1), x.size(-1))
157
+
158
+ # encode to hidden representation
159
+ x = self.first_conv(x)
160
+ skips = 0
161
+ for f in self.conv_layers:
162
+ x, h = f(x, c)
163
+ skips += h
164
+ skips *= math.sqrt(1.0 / len(self.conv_layers))
165
+
166
+ # apply final layers
167
+ x = skips
168
+ for f in self.last_conv_layers:
169
+ x = f(x)
170
+
171
+ return x
172
+
173
+ def remove_weight_norm(self):
174
+ """Remove weight normalization module from all of the layers."""
175
+ def _remove_weight_norm(m):
176
+ try:
177
+ logging.debug(f"Weight norm is removed from {m}.")
178
+ torch.nn.utils.remove_weight_norm(m)
179
+ except ValueError: # this module didn't have weight norm
180
+ return
181
+
182
+ self.apply(_remove_weight_norm)
183
+
184
+ def apply_weight_norm(self):
185
+ """Apply weight normalization module from all of the layers."""
186
+ def _apply_weight_norm(m):
187
+ if isinstance(m, torch.nn.Conv1d) or isinstance(m, torch.nn.Conv2d):
188
+ torch.nn.utils.weight_norm(m)
189
+ logging.debug(f"Weight norm is applied to {m}.")
190
+
191
+ self.apply(_apply_weight_norm)
192
+
193
+ @staticmethod
194
+ def _get_receptive_field_size(layers, stacks, kernel_size,
195
+ dilation=lambda x: 2 ** x):
196
+ assert layers % stacks == 0
197
+ layers_per_cycle = layers // stacks
198
+ dilations = [dilation(i % layers_per_cycle) for i in range(layers)]
199
+ return (kernel_size - 1) * sum(dilations) + 1
200
+
201
+ @property
202
+ def receptive_field_size(self):
203
+ """Return receptive field size."""
204
+ return self._get_receptive_field_size(self.layers, self.stacks, self.kernel_size)
205
+
206
+
207
+ class ParallelWaveGANDiscriminator(torch.nn.Module):
208
+ """Parallel WaveGAN Discriminator module."""
209
+
210
+ def __init__(self,
211
+ in_channels=1,
212
+ out_channels=1,
213
+ kernel_size=3,
214
+ layers=10,
215
+ conv_channels=64,
216
+ dilation_factor=1,
217
+ nonlinear_activation="LeakyReLU",
218
+ nonlinear_activation_params={"negative_slope": 0.2},
219
+ bias=True,
220
+ use_weight_norm=True,
221
+ ):
222
+ """Initialize Parallel WaveGAN Discriminator module.
223
+
224
+ Args:
225
+ in_channels (int): Number of input channels.
226
+ out_channels (int): Number of output channels.
227
+ kernel_size (int): Kernel size of conv layers.
228
+ layers (int): Number of conv layers.
229
+ conv_channels (int): Number of channels in conv layers.
230
+ dilation_factor (int): Dilation factor. For example, if dilation_factor = 2,
231
+ the dilation will be 2, 4, 8, ..., and so on.
232
+ nonlinear_activation (str): Nonlinear function after each conv.
233
+ nonlinear_activation_params (dict): Nonlinear function parameters
234
+ bias (bool): Whether to use bias parameter in conv.
235
+ use_weight_norm (bool): Whether to use weight norm.
236
+ If set to true, it will be applied to all of the conv layers.
237
+
238
+ """
239
+ super(ParallelWaveGANDiscriminator, self).__init__()
240
+ assert (kernel_size - 1) % 2 == 0, "Not support even number kernel size."
241
+ assert dilation_factor > 0, "Dilation factor must be > 0."
242
+ self.conv_layers = torch.nn.ModuleList()
243
+ conv_in_channels = in_channels
244
+ for i in range(layers - 1):
245
+ if i == 0:
246
+ dilation = 1
247
+ else:
248
+ dilation = i if dilation_factor == 1 else dilation_factor ** i
249
+ conv_in_channels = conv_channels
250
+ padding = (kernel_size - 1) // 2 * dilation
251
+ conv_layer = [
252
+ Conv1d(conv_in_channels, conv_channels,
253
+ kernel_size=kernel_size, padding=padding,
254
+ dilation=dilation, bias=bias),
255
+ getattr(torch.nn, nonlinear_activation)(inplace=True, **nonlinear_activation_params)
256
+ ]
257
+ self.conv_layers += conv_layer
258
+ padding = (kernel_size - 1) // 2
259
+ last_conv_layer = Conv1d(
260
+ conv_in_channels, out_channels,
261
+ kernel_size=kernel_size, padding=padding, bias=bias)
262
+ self.conv_layers += [last_conv_layer]
263
+
264
+ # apply weight norm
265
+ if use_weight_norm:
266
+ self.apply_weight_norm()
267
+
268
+ def forward(self, x):
269
+ """Calculate forward propagation.
270
+
271
+ Args:
272
+ x (Tensor): Input noise signal (B, 1, T).
273
+
274
+ Returns:
275
+ Tensor: Output tensor (B, 1, T)
276
+
277
+ """
278
+ for f in self.conv_layers:
279
+ x = f(x)
280
+ return x
281
+
282
+ def apply_weight_norm(self):
283
+ """Apply weight normalization module from all of the layers."""
284
+ def _apply_weight_norm(m):
285
+ if isinstance(m, torch.nn.Conv1d) or isinstance(m, torch.nn.Conv2d):
286
+ torch.nn.utils.weight_norm(m)
287
+ logging.debug(f"Weight norm is applied to {m}.")
288
+
289
+ self.apply(_apply_weight_norm)
290
+
291
+ def remove_weight_norm(self):
292
+ """Remove weight normalization module from all of the layers."""
293
+ def _remove_weight_norm(m):
294
+ try:
295
+ logging.debug(f"Weight norm is removed from {m}.")
296
+ torch.nn.utils.remove_weight_norm(m)
297
+ except ValueError: # this module didn't have weight norm
298
+ return
299
+
300
+ self.apply(_remove_weight_norm)
301
+
302
+
303
+ class ResidualParallelWaveGANDiscriminator(torch.nn.Module):
304
+ """Parallel WaveGAN Discriminator module."""
305
+
306
+ def __init__(self,
307
+ in_channels=1,
308
+ out_channels=1,
309
+ kernel_size=3,
310
+ layers=30,
311
+ stacks=3,
312
+ residual_channels=64,
313
+ gate_channels=128,
314
+ skip_channels=64,
315
+ dropout=0.0,
316
+ bias=True,
317
+ use_weight_norm=True,
318
+ use_causal_conv=False,
319
+ nonlinear_activation="LeakyReLU",
320
+ nonlinear_activation_params={"negative_slope": 0.2},
321
+ ):
322
+ """Initialize Parallel WaveGAN Discriminator module.
323
+
324
+ Args:
325
+ in_channels (int): Number of input channels.
326
+ out_channels (int): Number of output channels.
327
+ kernel_size (int): Kernel size of dilated convolution.
328
+ layers (int): Number of residual block layers.
329
+ stacks (int): Number of stacks i.e., dilation cycles.
330
+ residual_channels (int): Number of channels in residual conv.
331
+ gate_channels (int): Number of channels in gated conv.
332
+ skip_channels (int): Number of channels in skip conv.
333
+ dropout (float): Dropout rate. 0.0 means no dropout applied.
334
+ bias (bool): Whether to use bias parameter in conv.
335
+ use_weight_norm (bool): Whether to use weight norm.
336
+ If set to true, it will be applied to all of the conv layers.
337
+ use_causal_conv (bool): Whether to use causal structure.
338
+ nonlinear_activation_params (dict): Nonlinear function parameters
339
+
340
+ """
341
+ super(ResidualParallelWaveGANDiscriminator, self).__init__()
342
+ assert (kernel_size - 1) % 2 == 0, "Not support even number kernel size."
343
+
344
+ self.in_channels = in_channels
345
+ self.out_channels = out_channels
346
+ self.layers = layers
347
+ self.stacks = stacks
348
+ self.kernel_size = kernel_size
349
+
350
+ # check the number of layers and stacks
351
+ assert layers % stacks == 0
352
+ layers_per_stack = layers // stacks
353
+
354
+ # define first convolution
355
+ self.first_conv = torch.nn.Sequential(
356
+ Conv1d1x1(in_channels, residual_channels, bias=True),
357
+ getattr(torch.nn, nonlinear_activation)(
358
+ inplace=True, **nonlinear_activation_params),
359
+ )
360
+
361
+ # define residual blocks
362
+ self.conv_layers = torch.nn.ModuleList()
363
+ for layer in range(layers):
364
+ dilation = 2 ** (layer % layers_per_stack)
365
+ conv = ResidualBlock(
366
+ kernel_size=kernel_size,
367
+ residual_channels=residual_channels,
368
+ gate_channels=gate_channels,
369
+ skip_channels=skip_channels,
370
+ aux_channels=-1,
371
+ dilation=dilation,
372
+ dropout=dropout,
373
+ bias=bias,
374
+ use_causal_conv=use_causal_conv,
375
+ )
376
+ self.conv_layers += [conv]
377
+
378
+ # define output layers
379
+ self.last_conv_layers = torch.nn.ModuleList([
380
+ getattr(torch.nn, nonlinear_activation)(
381
+ inplace=True, **nonlinear_activation_params),
382
+ Conv1d1x1(skip_channels, skip_channels, bias=True),
383
+ getattr(torch.nn, nonlinear_activation)(
384
+ inplace=True, **nonlinear_activation_params),
385
+ Conv1d1x1(skip_channels, out_channels, bias=True),
386
+ ])
387
+
388
+ # apply weight norm
389
+ if use_weight_norm:
390
+ self.apply_weight_norm()
391
+
392
+ def forward(self, x):
393
+ """Calculate forward propagation.
394
+
395
+ Args:
396
+ x (Tensor): Input noise signal (B, 1, T).
397
+
398
+ Returns:
399
+ Tensor: Output tensor (B, 1, T)
400
+
401
+ """
402
+ x = self.first_conv(x)
403
+
404
+ skips = 0
405
+ for f in self.conv_layers:
406
+ x, h = f(x, None)
407
+ skips += h
408
+ skips *= math.sqrt(1.0 / len(self.conv_layers))
409
+
410
+ # apply final layers
411
+ x = skips
412
+ for f in self.last_conv_layers:
413
+ x = f(x)
414
+ return x
415
+
416
+ def apply_weight_norm(self):
417
+ """Apply weight normalization module from all of the layers."""
418
+ def _apply_weight_norm(m):
419
+ if isinstance(m, torch.nn.Conv1d) or isinstance(m, torch.nn.Conv2d):
420
+ torch.nn.utils.weight_norm(m)
421
+ logging.debug(f"Weight norm is applied to {m}.")
422
+
423
+ self.apply(_apply_weight_norm)
424
+
425
+ def remove_weight_norm(self):
426
+ """Remove weight normalization module from all of the layers."""
427
+ def _remove_weight_norm(m):
428
+ try:
429
+ logging.debug(f"Weight norm is removed from {m}.")
430
+ torch.nn.utils.remove_weight_norm(m)
431
+ except ValueError: # this module didn't have weight norm
432
+ return
433
+
434
+ self.apply(_remove_weight_norm)
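
Both discriminators added above map a waveform batch (B, 1, T) to a score map of the same length. A minimal usage sketch, with illustrative shapes and an import path assumed from this repo's layout (not exercised by app.py):

import torch
from sovits.vdecoder.parallel_wavegan.models.parallel_wavegan import (
    ParallelWaveGANDiscriminator,
    ResidualParallelWaveGANDiscriminator,
)

wav = torch.randn(2, 1, 8000)                      # illustrative (batch, 1, samples) input

disc = ParallelWaveGANDiscriminator()              # 10 stacked dilated Conv1d layers + LeakyReLU
score = disc(wav)                                  # (2, 1, 8000): one real/fake score per sample

res_disc = ResidualParallelWaveGANDiscriminator()  # WaveNet-like residual-block variant
res_score = res_disc(wav)                          # (2, 1, 8000)

disc.remove_weight_norm()                          # typically called once before inference/export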
sovits/vdecoder/parallel_wavegan/models/source.py ADDED
@@ -0,0 +1,538 @@
1
+ import torch
2
+ import numpy as np
3
+ import sys
4
+ import torch.nn.functional as torch_nn_func
5
+
6
+
7
+ class SineGen(torch.nn.Module):
8
+ """ Definition of sine generator
9
+ SineGen(samp_rate, harmonic_num = 0,
10
+ sine_amp = 0.1, noise_std = 0.003,
11
+ voiced_threshold = 0,
12
+ flag_for_pulse=False)
13
+
14
+ samp_rate: sampling rate in Hz
15
+ harmonic_num: number of harmonic overtones (default 0)
16
+ sine_amp: amplitude of sine waveform (default 0.1)
17
+ noise_std: std of Gaussian noise (default 0.003)
18
+ voiced_threshold: F0 threshold for U/V classification (default 0)
19
+ flag_for_pulse: this SineGen is used inside PulseGen (default False)
20
+
21
+ Note: when flag_for_pulse is True, the first time step of a voiced
22
+ segment is always sin(np.pi) or cos(0)
23
+ """
24
+
25
+ def __init__(self, samp_rate, harmonic_num=0,
26
+ sine_amp=0.1, noise_std=0.003,
27
+ voiced_threshold=0,
28
+ flag_for_pulse=False):
29
+ super(SineGen, self).__init__()
30
+ self.sine_amp = sine_amp
31
+ self.noise_std = noise_std
32
+ self.harmonic_num = harmonic_num
33
+ self.dim = self.harmonic_num + 1
34
+ self.sampling_rate = samp_rate
35
+ self.voiced_threshold = voiced_threshold
36
+ self.flag_for_pulse = flag_for_pulse
37
+
38
+ def _f02uv(self, f0):
39
+ # generate uv signal
40
+ uv = torch.ones_like(f0)
41
+ uv = uv * (f0 > self.voiced_threshold)
42
+ return uv
43
+
44
+ def _f02sine(self, f0_values):
45
+ """ f0_values: (batchsize, length, dim)
46
+ where dim indicates fundamental tone and overtones
47
+ """
48
+ # convert to F0 in rad. The integer part n can be ignored
49
+ # because 2 * np.pi * n doesn't affect phase
50
+ rad_values = (f0_values / self.sampling_rate) % 1
51
+
52
+ # initial phase noise (no noise for fundamental component)
53
+ rand_ini = torch.rand(f0_values.shape[0], f0_values.shape[2], \
54
+ device=f0_values.device)
55
+ rand_ini[:, 0] = 0
56
+ rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
57
+
58
+ # instantaneous phase sine[t] = sin(2*pi \sum_i=1 ^{t} rad)
59
+ if not self.flag_for_pulse:
60
+ # for normal case
61
+
62
+ # To prevent torch.cumsum numerical overflow,
63
+ # it is necessary to add -1 whenever \sum_k=1^n rad_value_k > 1.
64
+ # Buffer tmp_over_one_idx indicates the time step to add -1.
65
+ # This will not change F0 of sine because (x-1) * 2*pi = x * 2*pi
66
+ tmp_over_one = torch.cumsum(rad_values, 1) % 1
67
+ tmp_over_one_idx = (tmp_over_one[:, 1:, :] -
68
+ tmp_over_one[:, :-1, :]) < 0
69
+ cumsum_shift = torch.zeros_like(rad_values)
70
+ cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
71
+
72
+ sines = torch.sin(torch.cumsum(rad_values + cumsum_shift, dim=1)
73
+ * 2 * np.pi)
74
+ else:
75
+ # If necessary, make sure that the first time step of every
76
+ # voiced segments is sin(pi) or cos(0)
77
+ # This is used for pulse-train generation
78
+
79
+ # identify the last time step in unvoiced segments
80
+ uv = self._f02uv(f0_values)
81
+ uv_1 = torch.roll(uv, shifts=-1, dims=1)
82
+ uv_1[:, -1, :] = 1
83
+ u_loc = (uv < 1) * (uv_1 > 0)
84
+
85
+ # get the instantaneous phase
86
+ tmp_cumsum = torch.cumsum(rad_values, dim=1)
87
+ # different batch needs to be processed differently
88
+ for idx in range(f0_values.shape[0]):
89
+ temp_sum = tmp_cumsum[idx, u_loc[idx, :, 0], :]
90
+ temp_sum[1:, :] = temp_sum[1:, :] - temp_sum[0:-1, :]
91
+ # stores the accumulation of i.phase within
92
+ # each voiced segment
93
+ tmp_cumsum[idx, :, :] = 0
94
+ tmp_cumsum[idx, u_loc[idx, :, 0], :] = temp_sum
95
+
96
+ # rad_values - tmp_cumsum: remove the accumulation of i.phase
97
+ # within the previous voiced segment.
98
+ i_phase = torch.cumsum(rad_values - tmp_cumsum, dim=1)
99
+
100
+ # get the sines
101
+ sines = torch.cos(i_phase * 2 * np.pi)
102
+ return sines
103
+
104
+ def forward(self, f0):
105
+ """ sine_tensor, uv = forward(f0)
106
+ input F0: tensor(batchsize=1, length, dim=1)
107
+ f0 for unvoiced steps should be 0
108
+ output sine_tensor: tensor(batchsize=1, length, dim)
109
+ output uv: tensor(batchsize=1, length, 1)
110
+ """
111
+ with torch.no_grad():
112
+ f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim,
113
+ device=f0.device)
114
+ # fundamental component
115
+ f0_buf[:, :, 0] = f0[:, :, 0]
116
+ for idx in np.arange(self.harmonic_num):
117
+ # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic
118
+ f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (idx + 2)
119
+
120
+ # generate sine waveforms
121
+ sine_waves = self._f02sine(f0_buf) * self.sine_amp
122
+
123
+ # generate uv signal
124
+ # uv = torch.ones(f0.shape)
125
+ # uv = uv * (f0 > self.voiced_threshold)
126
+ uv = self._f02uv(f0)
127
+
128
+ # noise: for unvoiced should be similar to sine_amp
129
+ # std = self.sine_amp/3 -> max value ~ self.sine_amp
130
+ # . for voiced regions is self.noise_std
131
+ noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
132
+ noise = noise_amp * torch.randn_like(sine_waves)
133
+
134
+ # first: set the unvoiced part to 0 by uv
135
+ # then: additive noise
136
+ sine_waves = sine_waves * uv + noise
137
+ return sine_waves, uv, noise
138
+
139
+
140
+ class PulseGen(torch.nn.Module):
141
+ """ Definition of Pulse train generator
142
+
143
+ There are many ways to implement a pulse generator.
144
+ Here, PulseGen is based on SineGen.
145
+ """
146
+ def __init__(self, samp_rate, pulse_amp = 0.1,
147
+ noise_std = 0.003, voiced_threshold = 0):
148
+ super(PulseGen, self).__init__()
149
+ self.pulse_amp = pulse_amp
150
+ self.sampling_rate = samp_rate
151
+ self.voiced_threshold = voiced_threshold
152
+ self.noise_std = noise_std
153
+ self.l_sinegen = SineGen(self.sampling_rate, harmonic_num=0, \
154
+ sine_amp=self.pulse_amp, noise_std=0, \
155
+ voiced_threshold=self.voiced_threshold, \
156
+ flag_for_pulse=True)
157
+
158
+ def forward(self, f0):
159
+ """ Pulse train generator
160
+ pulse_train, uv = forward(f0)
161
+ input F0: tensor(batchsize=1, length, dim=1)
162
+ f0 for unvoiced steps should be 0
163
+ output pulse_train: tensor(batchsize=1, length, dim)
164
+ output uv: tensor(batchsize=1, length, 1)
165
+
166
+ Note: self.l_sinegen doesn't make sure that the initial phase of
167
+ a voiced segment is np.pi, the first pulse in a voiced segment
168
+ may not be at the first time step within a voiced segment
169
+ """
170
+ with torch.no_grad():
171
+ sine_wav, uv, noise = self.l_sinegen(f0)
172
+
173
+ # sine without additive noise
174
+ pure_sine = sine_wav - noise
175
+
176
+ # step t corresponds to a pulse if
177
+ # sine[t] > sine[t+1] & sine[t] > sine[t-1]
178
+ # & sine[t-1], sine[t+1], and sine[t] are voiced
179
+ # or
180
+ # sine[t] is voiced, sine[t-1] is unvoiced
181
+ # we use torch.roll to simulate sine[t+1] and sine[t-1]
182
+ sine_1 = torch.roll(pure_sine, shifts=1, dims=1)
183
+ uv_1 = torch.roll(uv, shifts=1, dims=1)
184
+ uv_1[:, 0, :] = 0
185
+ sine_2 = torch.roll(pure_sine, shifts=-1, dims=1)
186
+ uv_2 = torch.roll(uv, shifts=-1, dims=1)
187
+ uv_2[:, -1, :] = 0
188
+
189
+ loc = (pure_sine > sine_1) * (pure_sine > sine_2) \
190
+ * (uv_1 > 0) * (uv_2 > 0) * (uv > 0) \
191
+ + (uv_1 < 1) * (uv > 0)
192
+
193
+ # pulse train without noise
194
+ pulse_train = pure_sine * loc
195
+
196
+ # additive noise to pulse train
197
+ # note that noise from sinegen is zero in voiced regions
198
+ pulse_noise = torch.randn_like(pure_sine) * self.noise_std
199
+
200
+ # with additive noise on pulse, and unvoiced regions
201
+ pulse_train += pulse_noise * loc + pulse_noise * (1 - uv)
202
+ return pulse_train, sine_wav, uv, pulse_noise
203
+
204
+
205
+ class SignalsConv1d(torch.nn.Module):
206
+ """ Filtering input signal with time invariant filter
207
+ Note: FIRFilter conducted filtering given fixed FIR weight
208
+ SignalsConv1d convolves two signals
209
+ Note: this is based on torch.nn.functional.conv1d
210
+
211
+ """
212
+
213
+ def __init__(self):
214
+ super(SignalsConv1d, self).__init__()
215
+
216
+ def forward(self, signal, system_ir):
217
+ """ output = forward(signal, system_ir)
218
+
219
+ signal: (batchsize, length1, dim)
220
+ system_ir: (length2, dim)
221
+
222
+ output: (batchsize, length1, dim)
223
+ """
224
+ if signal.shape[-1] != system_ir.shape[-1]:
225
+ print("Error: SignalsConv1d expects shape:")
226
+ print("signal (batchsize, length1, dim)")
227
+ print("system_id (batchsize, length2, dim)")
228
+ print("But received signal: {:s}".format(str(signal.shape)))
229
+ print(" system_ir: {:s}".format(str(system_ir.shape)))
230
+ sys.exit(1)
231
+ padding_length = system_ir.shape[0] - 1
232
+ groups = signal.shape[-1]
233
+
234
+ # pad signal on the left
235
+ signal_pad = torch_nn_func.pad(signal.permute(0, 2, 1), \
236
+ (padding_length, 0))
237
+ # prepare system impulse response as (dim, 1, length2)
238
+ # also flip the impulse response
239
+ ir = torch.flip(system_ir.unsqueeze(1).permute(2, 1, 0), \
240
+ dims=[2])
241
+ # convolute
242
+ output = torch_nn_func.conv1d(signal_pad, ir, groups=groups)
243
+ return output.permute(0, 2, 1)
244
+
245
+
246
+ class CyclicNoiseGen_v1(torch.nn.Module):
247
+ """ CyclicnoiseGen_v1
248
+ Cyclic noise with a single parameter of beta.
249
+ Pytorch v1 implementation assumes f_t is also fixed
250
+ """
251
+
252
+ def __init__(self, samp_rate,
253
+ noise_std=0.003, voiced_threshold=0):
254
+ super(CyclicNoiseGen_v1, self).__init__()
255
+ self.samp_rate = samp_rate
256
+ self.noise_std = noise_std
257
+ self.voiced_threshold = voiced_threshold
258
+
259
+ self.l_pulse = PulseGen(samp_rate, pulse_amp=1.0,
260
+ noise_std=noise_std,
261
+ voiced_threshold=voiced_threshold)
262
+ self.l_conv = SignalsConv1d()
263
+
264
+ def noise_decay(self, beta, f0mean):
265
+ """ decayed_noise = noise_decay(beta, f0mean)
266
+ decayed_noise = n[t]exp(-t * f_mean / beta / samp_rate)
267
+
268
+ beta: (dim=1) or (batchsize=1, 1, dim=1)
269
+ f0mean (batchsize=1, 1, dim=1)
270
+
271
+ decayed_noise (batchsize=1, length, dim=1)
272
+ """
273
+ with torch.no_grad():
274
+ # exp(-1.0 n / T) < 0.01 => n > -log(0.01)*T = 4.60*T
275
+ # truncate the noise when decayed by -40 dB
276
+ length = 4.6 * self.samp_rate / f0mean
277
+ length = length.int()
278
+ time_idx = torch.arange(0, length, device=beta.device)
279
+ time_idx = time_idx.unsqueeze(0).unsqueeze(2)
280
+ time_idx = time_idx.repeat(beta.shape[0], 1, beta.shape[2])
281
+
282
+ noise = torch.randn(time_idx.shape, device=beta.device)
283
+
284
+ # due to Pytorch implementation, use f0_mean as the f0 factor
285
+ decay = torch.exp(-time_idx * f0mean / beta / self.samp_rate)
286
+ return noise * self.noise_std * decay
287
+
288
+ def forward(self, f0s, beta):
289
+ """ Producde cyclic-noise
290
+ """
291
+ # pulse train
292
+ pulse_train, sine_wav, uv, noise = self.l_pulse(f0s)
293
+ pure_pulse = pulse_train - noise
294
+
295
+ # decayed_noise (length, dim=1)
296
+ if (uv < 1).all():
297
+ # all unvoiced
298
+ cyc_noise = torch.zeros_like(sine_wav)
299
+ else:
300
+ f0mean = f0s[uv > 0].mean()
301
+
302
+ decayed_noise = self.noise_decay(beta, f0mean)[0, :, :]
303
+ # convolute
304
+ cyc_noise = self.l_conv(pure_pulse, decayed_noise)
305
+
306
+ # add noise in unvoiced segments
307
+ cyc_noise = cyc_noise + noise * (1.0 - uv)
308
+ return cyc_noise, pulse_train, sine_wav, uv, noise
309
+
310
+
311
+ class SineGen(torch.nn.Module):
312
+ """ Definition of sine generator
313
+ SineGen(samp_rate, harmonic_num = 0,
314
+ sine_amp = 0.1, noise_std = 0.003,
315
+ voiced_threshold = 0,
316
+ flag_for_pulse=False)
317
+
318
+ samp_rate: sampling rate in Hz
319
+ harmonic_num: number of harmonic overtones (default 0)
320
+ sine_amp: amplitude of sine waveform (default 0.1)
321
+ noise_std: std of Gaussian noise (default 0.003)
322
+ voiced_threshold: F0 threshold for U/V classification (default 0)
323
+ flag_for_pulse: this SineGen is used inside PulseGen (default False)
324
+
325
+ Note: when flag_for_pulse is True, the first time step of a voiced
326
+ segment is always sin(np.pi) or cos(0)
327
+ """
328
+
329
+ def __init__(self, samp_rate, harmonic_num=0,
330
+ sine_amp=0.1, noise_std=0.003,
331
+ voiced_threshold=0,
332
+ flag_for_pulse=False):
333
+ super(SineGen, self).__init__()
334
+ self.sine_amp = sine_amp
335
+ self.noise_std = noise_std
336
+ self.harmonic_num = harmonic_num
337
+ self.dim = self.harmonic_num + 1
338
+ self.sampling_rate = samp_rate
339
+ self.voiced_threshold = voiced_threshold
340
+ self.flag_for_pulse = flag_for_pulse
341
+
342
+ def _f02uv(self, f0):
343
+ # generate uv signal
344
+ uv = torch.ones_like(f0)
345
+ uv = uv * (f0 > self.voiced_threshold)
346
+ return uv
347
+
348
+ def _f02sine(self, f0_values):
349
+ """ f0_values: (batchsize, length, dim)
350
+ where dim indicates fundamental tone and overtones
351
+ """
352
+ # convert to F0 in rad. The integer part n can be ignored
353
+ # because 2 * np.pi * n doesn't affect phase
354
+ rad_values = (f0_values / self.sampling_rate) % 1
355
+
356
+ # initial phase noise (no noise for fundamental component)
357
+ rand_ini = torch.rand(f0_values.shape[0], f0_values.shape[2], \
358
+ device=f0_values.device)
359
+ rand_ini[:, 0] = 0
360
+ rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
361
+
362
+ # instantaneous phase sine[t] = sin(2*pi \sum_i=1 ^{t} rad)
363
+ if not self.flag_for_pulse:
364
+ # for normal case
365
+
366
+ # To prevent torch.cumsum numerical overflow,
367
+ # it is necessary to add -1 whenever \sum_k=1^n rad_value_k > 1.
368
+ # Buffer tmp_over_one_idx indicates the time step to add -1.
369
+ # This will not change F0 of sine because (x-1) * 2*pi = x * 2*pi
370
+ tmp_over_one = torch.cumsum(rad_values, 1) % 1
371
+ tmp_over_one_idx = (tmp_over_one[:, 1:, :] -
372
+ tmp_over_one[:, :-1, :]) < 0
373
+ cumsum_shift = torch.zeros_like(rad_values)
374
+ cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
375
+
376
+ sines = torch.sin(torch.cumsum(rad_values + cumsum_shift, dim=1)
377
+ * 2 * np.pi)
378
+ else:
379
+ # If necessary, make sure that the first time step of every
380
+ # voiced segments is sin(pi) or cos(0)
381
+ # This is used for pulse-train generation
382
+
383
+ # identify the last time step in unvoiced segments
384
+ uv = self._f02uv(f0_values)
385
+ uv_1 = torch.roll(uv, shifts=-1, dims=1)
386
+ uv_1[:, -1, :] = 1
387
+ u_loc = (uv < 1) * (uv_1 > 0)
388
+
389
+ # get the instantaneous phase
390
+ tmp_cumsum = torch.cumsum(rad_values, dim=1)
391
+ # different batch needs to be processed differently
392
+ for idx in range(f0_values.shape[0]):
393
+ temp_sum = tmp_cumsum[idx, u_loc[idx, :, 0], :]
394
+ temp_sum[1:, :] = temp_sum[1:, :] - temp_sum[0:-1, :]
395
+ # stores the accumulation of i.phase within
396
+ # each voiced segment
397
+ tmp_cumsum[idx, :, :] = 0
398
+ tmp_cumsum[idx, u_loc[idx, :, 0], :] = temp_sum
399
+
400
+ # rad_values - tmp_cumsum: remove the accumulation of i.phase
401
+ # within the previous voiced segment.
402
+ i_phase = torch.cumsum(rad_values - tmp_cumsum, dim=1)
403
+
404
+ # get the sines
405
+ sines = torch.cos(i_phase * 2 * np.pi)
406
+ return sines
407
+
408
+ def forward(self, f0):
409
+ """ sine_tensor, uv = forward(f0)
410
+ input F0: tensor(batchsize=1, length, dim=1)
411
+ f0 for unvoiced steps should be 0
412
+ output sine_tensor: tensor(batchsize=1, length, dim)
413
+ output uv: tensor(batchsize=1, length, 1)
414
+ """
415
+ with torch.no_grad():
416
+ f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, \
417
+ device=f0.device)
418
+ # fundamental component
419
+ f0_buf[:, :, 0] = f0[:, :, 0]
420
+ for idx in np.arange(self.harmonic_num):
421
+ # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic
422
+ f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (idx + 2)
423
+
424
+ # generate sine waveforms
425
+ sine_waves = self._f02sine(f0_buf) * self.sine_amp
426
+
427
+ # generate uv signal
428
+ # uv = torch.ones(f0.shape)
429
+ # uv = uv * (f0 > self.voiced_threshold)
430
+ uv = self._f02uv(f0)
431
+
432
+ # noise: for unvoiced should be similar to sine_amp
433
+ # std = self.sine_amp/3 -> max value ~ self.sine_amp
434
+ # . for voiced regions is self.noise_std
435
+ noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
436
+ noise = noise_amp * torch.randn_like(sine_waves)
437
+
438
+ # first: set the unvoiced part to 0 by uv
439
+ # then: additive noise
440
+ sine_waves = sine_waves * uv + noise
441
+ return sine_waves, uv, noise
442
+
443
+
444
+ class SourceModuleCycNoise_v1(torch.nn.Module):
445
+ """ SourceModuleCycNoise_v1
446
+ SourceModule(sampling_rate, noise_std=0.003, voiced_threshod=0)
447
+ sampling_rate: sampling_rate in Hz
448
+
449
+ noise_std: std of Gaussian noise (default: 0.003)
450
+ voiced_threshold: threshold to set U/V given F0 (default: 0)
451
+
452
+ cyc, noise, uv = SourceModuleCycNoise_v1(F0_upsampled, beta)
453
+ F0_upsampled (batchsize, length, 1)
454
+ beta (1)
455
+ cyc (batchsize, length, 1)
456
+ noise (batchsize, length, 1)
457
+ uv (batchsize, length, 1)
458
+ """
459
+
460
+ def __init__(self, sampling_rate, noise_std=0.003, voiced_threshod=0):
461
+ super(SourceModuleCycNoise_v1, self).__init__()
462
+ self.sampling_rate = sampling_rate
463
+ self.noise_std = noise_std
464
+ self.l_cyc_gen = CyclicNoiseGen_v1(sampling_rate, noise_std,
465
+ voiced_threshod)
466
+
467
+ def forward(self, f0_upsamped, beta):
468
+ """
469
+ cyc, noise, uv = SourceModuleCycNoise_v1(F0, beta)
470
+ F0_upsampled (batchsize, length, 1)
471
+ beta (1)
472
+ cyc (batchsize, length, 1)
473
+ noise (batchsize, length, 1)
474
+ uv (batchsize, length, 1)
475
+ """
476
+ # source for harmonic branch
477
+ cyc, pulse, sine, uv, add_noi = self.l_cyc_gen(f0_upsamped, beta)
478
+
479
+ # source for noise branch, in the same shape as uv
480
+ noise = torch.randn_like(uv) * self.noise_std / 3
481
+ return cyc, noise, uv
482
+
483
+
484
+ class SourceModuleHnNSF(torch.nn.Module):
485
+ """ SourceModule for hn-nsf
486
+ SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
487
+ add_noise_std=0.003, voiced_threshod=0)
488
+ sampling_rate: sampling_rate in Hz
489
+ harmonic_num: number of harmonic above F0 (default: 0)
490
+ sine_amp: amplitude of sine source signal (default: 0.1)
491
+ add_noise_std: std of additive Gaussian noise (default: 0.003)
492
+ note that amplitude of noise in unvoiced is decided
493
+ by sine_amp
494
+ voiced_threshold: threshold to set U/V given F0 (default: 0)
495
+
496
+ Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
497
+ F0_sampled (batchsize, length, 1)
498
+ Sine_source (batchsize, length, 1)
499
+ noise_source (batchsize, length 1)
500
+ uv (batchsize, length, 1)
501
+ """
502
+
503
+ def __init__(self, sampling_rate, harmonic_num=0, sine_amp=0.1,
504
+ add_noise_std=0.003, voiced_threshod=0):
505
+ super(SourceModuleHnNSF, self).__init__()
506
+
507
+ self.sine_amp = sine_amp
508
+ self.noise_std = add_noise_std
509
+
510
+ # to produce sine waveforms
511
+ self.l_sin_gen = SineGen(sampling_rate, harmonic_num,
512
+ sine_amp, add_noise_std, voiced_threshod)
513
+
514
+ # to merge source harmonics into a single excitation
515
+ self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
516
+ self.l_tanh = torch.nn.Tanh()
517
+
518
+ def forward(self, x):
519
+ """
520
+ Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
521
+ F0_sampled (batchsize, length, 1)
522
+ Sine_source (batchsize, length, 1)
523
+ noise_source (batchsize, length 1)
524
+ """
525
+ # source for harmonic branch
526
+ sine_wavs, uv, _ = self.l_sin_gen(x)
527
+ sine_merge = self.l_tanh(self.l_linear(sine_wavs))
528
+
529
+ # source for noise branch, in the same shape as uv
530
+ noise = torch.randn_like(uv) * self.sine_amp / 3
531
+ return sine_merge, noise, uv
532
+
533
+
534
+ if __name__ == '__main__':
535
+ source = SourceModuleCycNoise_v1(24000)
536
+ x = torch.randn(16, 25600, 1)
537
+
538
+
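
source.py provides NSF-style excitation generators; SourceModuleHnNSF merges the fundamental and its harmonics into a single sine excitation plus a noise branch. A rough sketch with made-up shapes (the import path is assumed from this repo's layout):

import torch
from sovits.vdecoder.parallel_wavegan.models.source import SourceModuleHnNSF

source = SourceModuleHnNSF(sampling_rate=24000, harmonic_num=8)
f0 = torch.abs(torch.randn(1, 2400, 1)) * 200      # fake upsampled F0 contour; 0 Hz = unvoiced
sine_merge, noise, uv = source(f0)                 # merged sine excitation, noise source, U/V mask
# sine_merge: (1, 2400, 1), usually transposed to (B, 1, T) before entering the vocoder decoder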
sovits/vdecoder/parallel_wavegan/optimizers/__init__.py ADDED
@@ -0,0 +1,2 @@
1
+ from torch.optim import * # NOQA
2
+ from .radam import * # NOQA
sovits/vdecoder/parallel_wavegan/optimizers/radam.py ADDED
@@ -0,0 +1,91 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ """RAdam optimizer.
4
+
5
+ This code is derived from https://github.com/LiyuanLucasLiu/RAdam.
6
+ """
7
+
8
+ import math
9
+ import torch
10
+
11
+ from torch.optim.optimizer import Optimizer
12
+
13
+
14
+ class RAdam(Optimizer):
15
+ """Rectified Adam optimizer."""
16
+
17
+ def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0):
18
+ """Initilize RAdam optimizer."""
19
+ defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay)
20
+ self.buffer = [[None, None, None] for ind in range(10)]
21
+ super(RAdam, self).__init__(params, defaults)
22
+
23
+ def __setstate__(self, state):
24
+ """Set state."""
25
+ super(RAdam, self).__setstate__(state)
26
+
27
+ def step(self, closure=None):
28
+ """Run one step."""
29
+ loss = None
30
+ if closure is not None:
31
+ loss = closure()
32
+
33
+ for group in self.param_groups:
34
+
35
+ for p in group['params']:
36
+ if p.grad is None:
37
+ continue
38
+ grad = p.grad.data.float()
39
+ if grad.is_sparse:
40
+ raise RuntimeError('RAdam does not support sparse gradients')
41
+
42
+ p_data_fp32 = p.data.float()
43
+
44
+ state = self.state[p]
45
+
46
+ if len(state) == 0:
47
+ state['step'] = 0
48
+ state['exp_avg'] = torch.zeros_like(p_data_fp32)
49
+ state['exp_avg_sq'] = torch.zeros_like(p_data_fp32)
50
+ else:
51
+ state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32)
52
+ state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32)
53
+
54
+ exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
55
+ beta1, beta2 = group['betas']
56
+
57
+ exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
58
+ exp_avg.mul_(beta1).add_(1 - beta1, grad)
59
+
60
+ state['step'] += 1
61
+ buffered = self.buffer[int(state['step'] % 10)]
62
+ if state['step'] == buffered[0]:
63
+ N_sma, step_size = buffered[1], buffered[2]
64
+ else:
65
+ buffered[0] = state['step']
66
+ beta2_t = beta2 ** state['step']
67
+ N_sma_max = 2 / (1 - beta2) - 1
68
+ N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t)
69
+ buffered[1] = N_sma
70
+
71
+ # more conservative since it's an approximated value
72
+ if N_sma >= 5:
73
+ step_size = math.sqrt(
74
+ (1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1 ** state['step']) # NOQA
75
+ else:
76
+ step_size = 1.0 / (1 - beta1 ** state['step'])
77
+ buffered[2] = step_size
78
+
79
+ if group['weight_decay'] != 0:
80
+ p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32)
81
+
82
+ # more conservative since it's an approximated value
83
+ if N_sma >= 5:
84
+ denom = exp_avg_sq.sqrt().add_(group['eps'])
85
+ p_data_fp32.addcdiv_(-step_size * group['lr'], exp_avg, denom)
86
+ else:
87
+ p_data_fp32.add_(-step_size * group['lr'], exp_avg)
88
+
89
+ p.data.copy_(p_data_fp32)
90
+
91
+ return loss
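
A minimal sketch of how the RAdam optimizer above is typically driven. Note that step() uses the legacy add_/addcmul_ call signatures, so this assumes a PyTorch version that still accepts them:

import torch
from sovits.vdecoder.parallel_wavegan.optimizers import RAdam

model = torch.nn.Linear(80, 80)
optimizer = RAdam(model.parameters(), lr=1e-3, weight_decay=0.0)

x = torch.randn(4, 80)
loss = (model(x) - x).pow(2).mean()                # dummy reconstruction loss
optimizer.zero_grad()
loss.backward()
optimizer.step()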
sovits/vdecoder/parallel_wavegan/stft_loss.py ADDED
@@ -0,0 +1,100 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ # Copyright 2019 Tomoki Hayashi
4
+ # MIT License (https://opensource.org/licenses/MIT)
5
+
6
+ """STFT-based Loss modules."""
7
+ import librosa
8
+ import torch
9
+
10
+ from .losses import LogSTFTMagnitudeLoss, SpectralConvergengeLoss, stft
11
+
12
+
13
+ class STFTLoss(torch.nn.Module):
14
+ """STFT loss module."""
15
+
16
+ def __init__(self, fft_size=1024, shift_size=120, win_length=600, window="hann_window",
17
+ use_mel_loss=False):
18
+ """Initialize STFT loss module."""
19
+ super(STFTLoss, self).__init__()
20
+ self.fft_size = fft_size
21
+ self.shift_size = shift_size
22
+ self.win_length = win_length
23
+ self.window = getattr(torch, window)(win_length)
24
+ self.spectral_convergenge_loss = SpectralConvergengeLoss()
25
+ self.log_stft_magnitude_loss = LogSTFTMagnitudeLoss()
26
+ self.use_mel_loss = use_mel_loss
27
+ self.mel_basis = None
28
+
29
+ def forward(self, x, y):
30
+ """Calculate forward propagation.
31
+
32
+ Args:
33
+ x (Tensor): Predicted signal (B, T).
34
+ y (Tensor): Groundtruth signal (B, T).
35
+
36
+ Returns:
37
+ Tensor: Spectral convergence loss value.
38
+ Tensor: Log STFT magnitude loss value.
39
+
40
+ """
41
+ x_mag = stft(x, self.fft_size, self.shift_size, self.win_length, self.window)
42
+ y_mag = stft(y, self.fft_size, self.shift_size, self.win_length, self.window)
43
+ if self.use_mel_loss:
44
+ if self.mel_basis is None:
45
+ self.mel_basis = torch.from_numpy(librosa.filters.mel(22050, self.fft_size, 80)).cuda().T
46
+ x_mag = x_mag @ self.mel_basis
47
+ y_mag = y_mag @ self.mel_basis
48
+
49
+ sc_loss = self.spectral_convergenge_loss(x_mag, y_mag)
50
+ mag_loss = self.log_stft_magnitude_loss(x_mag, y_mag)
51
+
52
+ return sc_loss, mag_loss
53
+
54
+
55
+ class MultiResolutionSTFTLoss(torch.nn.Module):
56
+ """Multi resolution STFT loss module."""
57
+
58
+ def __init__(self,
59
+ fft_sizes=[1024, 2048, 512],
60
+ hop_sizes=[120, 240, 50],
61
+ win_lengths=[600, 1200, 240],
62
+ window="hann_window",
63
+ use_mel_loss=False):
64
+ """Initialize Multi resolution STFT loss module.
65
+
66
+ Args:
67
+ fft_sizes (list): List of FFT sizes.
68
+ hop_sizes (list): List of hop sizes.
69
+ win_lengths (list): List of window lengths.
70
+ window (str): Window function type.
71
+
72
+ """
73
+ super(MultiResolutionSTFTLoss, self).__init__()
74
+ assert len(fft_sizes) == len(hop_sizes) == len(win_lengths)
75
+ self.stft_losses = torch.nn.ModuleList()
76
+ for fs, ss, wl in zip(fft_sizes, hop_sizes, win_lengths):
77
+ self.stft_losses += [STFTLoss(fs, ss, wl, window, use_mel_loss)]
78
+
79
+ def forward(self, x, y):
80
+ """Calculate forward propagation.
81
+
82
+ Args:
83
+ x (Tensor): Predicted signal (B, T).
84
+ y (Tensor): Groundtruth signal (B, T).
85
+
86
+ Returns:
87
+ Tensor: Multi resolution spectral convergence loss value.
88
+ Tensor: Multi resolution log STFT magnitude loss value.
89
+
90
+ """
91
+ sc_loss = 0.0
92
+ mag_loss = 0.0
93
+ for f in self.stft_losses:
94
+ sc_l, mag_l = f(x, y)
95
+ sc_loss += sc_l
96
+ mag_loss += mag_l
97
+ sc_loss /= len(self.stft_losses)
98
+ mag_loss /= len(self.stft_losses)
99
+
100
+ return sc_loss, mag_loss
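
Assuming the package-relative import at the top of this file resolves to the sibling losses package (which must provide stft, SpectralConvergengeLoss and LogSTFTMagnitudeLoss), the multi-resolution loss is used roughly like this:

import torch
from sovits.vdecoder.parallel_wavegan.stft_loss import MultiResolutionSTFTLoss

criterion = MultiResolutionSTFTLoss()              # defaults: FFT sizes 1024/2048/512
y_hat = torch.randn(2, 16000)                      # generated waveform (B, T)
y = torch.randn(2, 16000)                          # ground-truth waveform (B, T)
sc_loss, mag_loss = criterion(y_hat, y)
total = sc_loss + mag_loss                         # typically added to the adversarial loss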
sovits/vdecoder/parallel_wavegan/utils/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from .utils import * # NOQA
sovits/vdecoder/parallel_wavegan/utils/utils.py ADDED
@@ -0,0 +1,169 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ # Copyright 2019 Tomoki Hayashi
4
+ # MIT License (https://opensource.org/licenses/MIT)
5
+
6
+ """Utility functions."""
7
+
8
+ import fnmatch
9
+ import logging
10
+ import os
11
+ import sys
12
+
13
+ import h5py
14
+ import numpy as np
15
+
16
+
17
+ def find_files(root_dir, query="*.wav", include_root_dir=True):
18
+ """Find files recursively.
19
+
20
+ Args:
21
+ root_dir (str): Root directory to search.
22
+ query (str): Query to find.
23
+ include_root_dir (bool): If False, root_dir name is not included.
24
+
25
+ Returns:
26
+ list: List of found filenames.
27
+
28
+ """
29
+ files = []
30
+ for root, dirnames, filenames in os.walk(root_dir, followlinks=True):
31
+ for filename in fnmatch.filter(filenames, query):
32
+ files.append(os.path.join(root, filename))
33
+ if not include_root_dir:
34
+ files = [file_.replace(root_dir + "/", "") for file_ in files]
35
+
36
+ return files
37
+
38
+
39
+ def read_hdf5(hdf5_name, hdf5_path):
40
+ """Read hdf5 dataset.
41
+
42
+ Args:
43
+ hdf5_name (str): Filename of hdf5 file.
44
+ hdf5_path (str): Dataset name in hdf5 file.
45
+
46
+ Return:
47
+ any: Dataset values.
48
+
49
+ """
50
+ if not os.path.exists(hdf5_name):
51
+ logging.error(f"There is no such a hdf5 file ({hdf5_name}).")
52
+ sys.exit(1)
53
+
54
+ hdf5_file = h5py.File(hdf5_name, "r")
55
+
56
+ if hdf5_path not in hdf5_file:
57
+ logging.error(f"There is no such a data in hdf5 file. ({hdf5_path})")
58
+ sys.exit(1)
59
+
60
+ hdf5_data = hdf5_file[hdf5_path][()]
61
+ hdf5_file.close()
62
+
63
+ return hdf5_data
64
+
65
+
66
+ def write_hdf5(hdf5_name, hdf5_path, write_data, is_overwrite=True):
67
+ """Write dataset to hdf5.
68
+
69
+ Args:
70
+ hdf5_name (str): Hdf5 dataset filename.
71
+ hdf5_path (str): Dataset path in hdf5.
72
+ write_data (ndarray): Data to write.
73
+ is_overwrite (bool): Whether to overwrite dataset.
74
+
75
+ """
76
+ # convert to numpy array
77
+ write_data = np.array(write_data)
78
+
79
+ # check folder existence
80
+ folder_name, _ = os.path.split(hdf5_name)
81
+ if not os.path.exists(folder_name) and len(folder_name) != 0:
82
+ os.makedirs(folder_name)
83
+
84
+ # check hdf5 existence
85
+ if os.path.exists(hdf5_name):
86
+ # if already exists, open with r+ mode
87
+ hdf5_file = h5py.File(hdf5_name, "r+")
88
+ # check dataset existence
89
+ if hdf5_path in hdf5_file:
90
+ if is_overwrite:
91
+ logging.warning("Dataset in hdf5 file already exists. "
92
+ "recreate dataset in hdf5.")
93
+ hdf5_file.__delitem__(hdf5_path)
94
+ else:
95
+ logging.error("Dataset in hdf5 file already exists. "
96
+ "if you want to overwrite, please set is_overwrite = True.")
97
+ hdf5_file.close()
98
+ sys.exit(1)
99
+ else:
100
+ # if not exists, open with w mode
101
+ hdf5_file = h5py.File(hdf5_name, "w")
102
+
103
+ # write data to hdf5
104
+ hdf5_file.create_dataset(hdf5_path, data=write_data)
105
+ hdf5_file.flush()
106
+ hdf5_file.close()
107
+
108
+
109
+ class HDF5ScpLoader(object):
110
+ """Loader class for a fests.scp file of hdf5 file.
111
+
112
+ Examples:
113
+ key1 /some/path/a.h5:feats
114
+ key2 /some/path/b.h5:feats
115
+ key3 /some/path/c.h5:feats
116
+ key4 /some/path/d.h5:feats
117
+ ...
118
+ >>> loader = HDF5ScpLoader("hdf5.scp")
119
+ >>> array = loader["key1"]
120
+
121
+ key1 /some/path/a.h5
122
+ key2 /some/path/b.h5
123
+ key3 /some/path/c.h5
124
+ key4 /some/path/d.h5
125
+ ...
126
+ >>> loader = HDF5ScpLoader("hdf5.scp", "feats")
127
+ >>> array = loader["key1"]
128
+
129
+ """
130
+
131
+ def __init__(self, feats_scp, default_hdf5_path="feats"):
132
+ """Initialize HDF5 scp loader.
133
+
134
+ Args:
135
+ feats_scp (str): Kaldi-style feats.scp file with hdf5 format.
136
+ default_hdf5_path (str): Path in hdf5 file. If the scp contains the info, this is not used.
137
+
138
+ """
139
+ self.default_hdf5_path = default_hdf5_path
140
+ with open(feats_scp, encoding='utf-8') as f:
141
+ lines = [line.replace("\n", "") for line in f.readlines()]
142
+ self.data = {}
143
+ for line in lines:
144
+ key, value = line.split()
145
+ self.data[key] = value
146
+
147
+ def get_path(self, key):
148
+ """Get hdf5 file path for a given key."""
149
+ return self.data[key]
150
+
151
+ def __getitem__(self, key):
152
+ """Get ndarray for a given key."""
153
+ p = self.data[key]
154
+ if ":" in p:
155
+ return read_hdf5(*p.split(":"))
156
+ else:
157
+ return read_hdf5(p, self.default_hdf5_path)
158
+
159
+ def __len__(self):
160
+ """Return the length of the scp file."""
161
+ return len(self.data)
162
+
163
+ def __iter__(self):
164
+ """Return the iterator of the scp file."""
165
+ return iter(self.data)
166
+
167
+ def keys(self):
168
+ """Return the keys of the scp file."""
169
+ return self.data.keys()
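
A short sketch of the hdf5 helpers above; file names and dataset keys are illustrative only:

import numpy as np
from sovits.vdecoder.parallel_wavegan.utils import write_hdf5, read_hdf5, HDF5ScpLoader

mel = np.random.randn(100, 80).astype(np.float32)
write_hdf5("dump/utt1.h5", "feats", mel)           # creates the folder and dataset if needed
feats = read_hdf5("dump/utt1.h5", "feats")         # -> (100, 80) ndarray

# given a Kaldi-style scp line such as "utt1 dump/utt1.h5:feats"
# loader = HDF5ScpLoader("feats.scp")
# feats = loader["utt1"]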
vits/{tts_inferencer.py → vits_inferencer.py} RENAMED
@@ -31,7 +31,7 @@ def get_text(text, hps):
31
  text_norm = torch.LongTensor(text_norm)
32
  return text_norm
33
 
34
- class TTSInferencer:
35
  def __init__(self, hps_path, device="cpu"):
36
  print("init")
37
  self.device = torch.device(device)
 
31
  text_norm = torch.LongTensor(text_norm)
32
  return text_norm
33
 
34
+ class VitsInferencer:
35
  def __init__(self, hps_path, device="cpu"):
36
  print("init")
37
  self.device = torch.device(device)