maxmax20160403 committed on
Commit
c24b656
1 Parent(s): c2e61ce
app.py CHANGED
@@ -60,7 +60,7 @@ def compute_f0_nn(filename, device):
     periodicity = np.repeat(periodicity, 2, -1)  # 320 -> 160 * 2
     # CREPE was not trained on silent audio. some error on silent need filter.
     periodicity = torchcrepe.filter.median(periodicity, 9)
-    pitch = torchcrepe.filter.mean(pitch, 9)
+    pitch = torchcrepe.filter.mean(pitch, 3)
     pitch[periodicity < 0.1] = 0
     pitch = pitch.squeeze(0)
     return pitch
@@ -72,7 +72,7 @@ model = SynthesizerInfer(
     hp.data.filter_length // 2 + 1,
     hp.data.segment_size // hp.data.hop_length,
     hp)
-load_svc_model("vits_pretrain/sovits5.0-48k-debug.pth", model)
+load_svc_model("vits_pretrain/sovits5.0_bigvgan.pth", model)
 model.eval()
 model.to(device)
@@ -116,17 +116,17 @@ def svc_change(argswave, argsspk):
         has_audio = True
         if (out_index == 0):  # start frame
             cut_s = out_index
-            cut_s_48k = 0
+            cut_s_out = 0
         else:
             cut_s = out_index - hop_frame
-            cut_s_48k = hop_frame * hop_size
+            cut_s_out = hop_frame * hop_size

         if (out_index + out_chunk + hop_frame > all_frame):  # end frame
             cut_e = out_index + out_chunk
-            cut_e_48k = 0
+            cut_e_out = 0
         else:
             cut_e = out_index + out_chunk + hop_frame
-            cut_e_48k = -1 * hop_frame * hop_size
+            cut_e_out = -1 * hop_frame * hop_size

         sub_ppg = ppg[cut_s:cut_e, :].unsqueeze(0).to(device)
         sub_pit = pit[cut_s:cut_e].unsqueeze(0).to(device)
@@ -136,17 +136,17 @@ def svc_change(argswave, argsspk):
         sub_out = model.inference(sub_ppg, sub_pit, spk, sub_len, sub_har)
         sub_out = sub_out[0, 0].data.cpu().detach().numpy()

-        sub_out = sub_out[cut_s_48k:cut_e_48k]
+        sub_out = sub_out[cut_s_out:cut_e_out]
         out_audio.extend(sub_out)
         out_index = out_index + out_chunk

     if (out_index < all_frame):
         if (has_audio):
             cut_s = out_index - hop_frame
-            cut_s_48k = hop_frame * hop_size
+            cut_s_out = hop_frame * hop_size
         else:
             cut_s = 0
-            cut_s_48k = 0
+            cut_s_out = 0
         sub_ppg = ppg[cut_s:, :].unsqueeze(0).to(device)
         sub_pit = pit[cut_s:].unsqueeze(0).to(device)
         sub_len = torch.LongTensor([all_frame - cut_s]).to(device)
@@ -154,7 +154,7 @@ def svc_change(argswave, argsspk):
         sub_out = model.inference(sub_ppg, sub_pit, spk, sub_len, sub_har)
         sub_out = sub_out[0, 0].data.cpu().detach().numpy()

-        sub_out = sub_out[cut_s_48k:]
+        sub_out = sub_out[cut_s_out:]
         out_audio.extend(sub_out)
         out_audio = np.asarray(out_audio)
@@ -175,7 +175,7 @@ def svc_main(sid, input_audio):
     wav_path = "temp.wav"
     soundfile.write(wav_path, audio, 16000, format="wav")
     out_audio = svc_change(wav_path, f"configs/singers/singer00{sid}.npy")
-    return "Success", (48000, out_audio)
+    return "Success", (32000, out_audio)
@@ -183,7 +183,7 @@ with app:
     with gr.Tabs():
         with gr.TabItem("sovits 5.0"):
             gr.Markdown(value="""
-                Based on the open-source dataset: Multi-Singer
+                Final version, based on the open-source dataset: Multi-Singer

                 https://github.com/Multi-Singer/Multi-Singer.github.io
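Note on the chunked inference above: the renamed cut_s_out/cut_e_out indices implement overlap trimming, where each chunk is synthesized with hop_frame frames of warm-up context on each side and the matching hop_frame * hop_size samples are cut from the waveform before the chunks are joined. A minimal sketch of that arithmetic for an interior chunk, assuming illustrative values hop_frame=8, hop_size=320 and out_chunk=100 (the real values come from the model config):

import numpy as np

hop_frame, hop_size, out_chunk = 8, 320, 100     # illustrative values only
pad = hop_frame * hop_size                       # warm-up samples per side

# stand-in for model.inference output on an interior chunk
sub_out = np.zeros((out_chunk + 2 * hop_frame) * hop_size)
trimmed = sub_out[pad:-pad]                      # the cut_s_out / cut_e_out trim
assert trimmed.size == out_chunk * hop_size      # exactly out_chunk frames remain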
configs/base.yaml CHANGED
@@ -7,24 +7,24 @@ train:
   lr_decay: 0.999875
   eps: 1e-9
   batch_size: 8
-  c_stft: 5
-  c_mel: 2.5
-  c_kl: 1.0
+  c_stft: 9
+  c_mel: 1.
+  c_kl: 0.2
   port: 8001
   pretrain: ""
 #############################
 data:
   training_files: "files/train.txt"
   validation_files: "files/valid.txt"
-  segment_size: 12000 # WARNING: base on hop_length
+  segment_size: 8000 # WARNING: base on hop_length
   max_wav_value: 32768.0
-  sampling_rate: 48000
-  filter_length: 2048
-  hop_length: 480
-  win_length: 2048
-  mel_channels: 80
-  mel_fmin: 0.0
-  mel_fmax: 24000.0
+  sampling_rate: 32000
+  filter_length: 1024
+  hop_length: 320
+  win_length: 1024
+  mel_channels: 100
+  mel_fmin: 50.0
+  mel_fmax: 16000.0
 #############################
 vits:
   ppg_dim: 1024
@@ -36,9 +36,9 @@ vits:
 #############################
 gen:
   upsample_input: 192
-  upsample_rates: [6,5,4,2,2]
-  upsample_kernel_sizes: [20,15,8,4,4]
-  upsample_initial_channel: 256
+  upsample_rates: [5,4,4,2,2]
+  upsample_kernel_sizes: [15,8,8,4,4]
+  upsample_initial_channel: 320
   resblock_kernel_sizes: [3,7,11]
   resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]]
 #############################
@@ -50,13 +50,13 @@ mpd:
   lReLU_slope: 0.2
 #############################
 mrd:
-  resolutions: "[(1024, 120, 600), (2048, 240, 1200), (512, 50, 240)]" # (filter_length, hop_length, win_length)
+  resolutions: "[(1024, 120, 600), (2048, 240, 1200), (4096, 480, 2400), (512, 50, 240)]" # (filter_length, hop_length, win_length)
   use_spectral_norm: False
   lReLU_slope: 0.2
 #############################
 log:
   info_interval: 100
-  eval_interval: 5
+  eval_interval: 1
   save_interval: 5
   num_audio: 6
   pth_dir: 'chkpt'
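The data and gen sections have to stay consistent: the generator turns one spectral frame into hop_length samples, so the product of upsample_rates must equal hop_length. A quick check of the new 32 kHz values (a sketch, not part of the repo):

import numpy as np

sampling_rate, hop_length = 32000, 320
upsample_rates = [5, 4, 4, 2, 2]

assert np.prod(upsample_rates) == hop_length   # 5*4*4*2*2 = 320
print(sampling_rate / hop_length)              # 100.0 frames per second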
vits/data_utils.py CHANGED
@@ -27,8 +27,8 @@ class TextAudioSpeakerSet(torch.utils.data.Dataset):
     def _filter(self):
         lengths = []
         items_new = []
-        items_min = int(self.segment_size / self.hop_length * 2)  # 1 S
-        items_max = int(self.segment_size / self.hop_length * 9)  # 4.5 S
+        items_min = int(self.segment_size / self.hop_length * 4)  # 1 S
+        items_max = int(self.segment_size / self.hop_length * 16)  # 4 S
        for wavpath, spec, pitch, ppg, spk in self.items:
             if not os.path.isfile(wavpath):
                 continue
@@ -87,7 +87,7 @@ class TextAudioSpeakerSet(torch.utils.data.Dataset):
         spk = torch.FloatTensor(spk)

         len_pit = pit.size()[0]
-        len_ppg = ppg.size()[0]
+        len_ppg = ppg.size()[0] - 2  # for safe
         len_min = min(len_pit, len_ppg)
         len_wav = len_min * self.hop_length
@@ -255,6 +255,8 @@ class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler):
         for i in range(len(self.buckets)):
             bucket = self.buckets[i]
             len_bucket = len(bucket)
+            if (len_bucket == 0):
+                continue
             ids_bucket = indices[i]
             num_samples_bucket = self.num_samples_per_bucket[i]
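The new multipliers keep the length filter aligned with the 32 kHz config: segment_size / hop_length is now 8000 / 320 = 25 frames, so the bounds work out to 100 and 400 frames, i.e. 1 s and 4 s at 100 frames per second, matching the updated "# 1 S" and "# 4 S" comments. A quick arithmetic check:

segment_frames = 8000 // 320      # 25 frames per training segment
items_min = segment_frames * 4    # 100 frames -> 1 s
items_max = segment_frames * 16   # 400 frames -> 4 s
print(items_min, items_max)       # 100 400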
vits/models.py CHANGED
@@ -8,6 +8,7 @@ from vits import commons
 from vits import modules
 from vits.utils import f0_to_coarse
 from vits_decoder.generator import Generator
+from vits.modules_grl import SpeakerClassifier


 class TextEncoder(nn.Module):
@@ -44,7 +45,7 @@ class TextEncoder(nn.Module):
         stats = self.proj(x) * x_mask
         m, logs = torch.split(stats, self.out_channels, dim=1)
         z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
-        return z, m, logs, x_mask
+        return z, m, logs, x_mask, x


 class ResidualCouplingBlock(nn.Module):
@@ -151,6 +152,10 @@ class SynthesizerTrn(nn.Module):
             3,
             0.1,
         )
+        self.speaker_classifier = SpeakerClassifier(
+            hp.vits.hidden_channels,
+            hp.vits.spk_dim,
+        )
         self.enc_q = PosteriorEncoder(
             spec_channels,
             hp.vits.inter_channels,
@@ -171,8 +176,9 @@ class SynthesizerTrn(nn.Module):
         self.dec = Generator(hp=hp)

     def forward(self, ppg, pit, spec, spk, ppg_l, spec_l):
+        ppg = ppg + torch.randn_like(ppg)  # Perturbation
         g = self.emb_g(F.normalize(spk)).unsqueeze(-1)
-        z_p, m_p, logs_p, ppg_mask = self.enc_p(
+        z_p, m_p, logs_p, ppg_mask, x = self.enc_p(
             ppg, ppg_l, f0=f0_to_coarse(pit))
         z_q, m_q, logs_q, spec_mask = self.enc_q(spec, spec_l, g=g)

@@ -183,10 +189,13 @@ class SynthesizerTrn(nn.Module):
         # SNAC to flow
         z_f, logdet_f = self.flow(z_q, spec_mask, g=spk)
         z_r, logdet_r = self.flow(z_p, spec_mask, g=spk, reverse=True)
-        return audio, ids_slice, spec_mask, (z_f, z_r, z_p, m_p, logs_p, z_q, m_q, logs_q, logdet_f, logdet_r)
+        # speaker
+        spk_preds = self.speaker_classifier(x)
+        return audio, ids_slice, spec_mask, (z_f, z_r, z_p, m_p, logs_p, z_q, m_q, logs_q, logdet_f, logdet_r), spk_preds

     def infer(self, ppg, pit, spk, ppg_l):
-        z_p, m_p, logs_p, ppg_mask = self.enc_p(
+        ppg = ppg + torch.randn_like(ppg) * 0.0001  # Perturbation
+        z_p, m_p, logs_p, ppg_mask, x = self.enc_p(
             ppg, ppg_l, f0=f0_to_coarse(pit))
         z, _ = self.flow(z_p, ppg_mask, g=spk, reverse=True)
         o = self.dec(spk, z * ppg_mask, f0=pit)
@@ -233,7 +242,7 @@ class SynthesizerInfer(nn.Module):
         return self.dec.source2wav(source)

     def inference(self, ppg, pit, spk, ppg_l, source):
-        z_p, m_p, logs_p, ppg_mask = self.enc_p(
+        z_p, m_p, logs_p, ppg_mask, x = self.enc_p(
             ppg, ppg_l, f0=f0_to_coarse(pit))
         z, _ = self.flow(z_p, ppg_mask, g=spk, reverse=True)
         o = self.dec.inference(spk, z * ppg_mask, source)
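SynthesizerTrn.forward now also returns spk_preds from the GRL-backed SpeakerClassifier. One plausible way a training loop could consume it (a hedged sketch; the actual loss wiring lives in the training script, which is not part of this commit): because the classifier sits behind a gradient reversal layer, minimizing a speaker-matching loss on spk_preds trains the classifier to recognize the speaker while pushing the text-encoder features toward speaker independence.

import torch
import torch.nn.functional as F

def speaker_adversarial_loss(spk_preds, spk):
    # spk_preds: (B, spk_dim) from SynthesizerTrn.forward
    # spk:       (B, spk_dim) target speaker embedding
    # cosine-similarity matching is an assumed loss choice, not from this commit
    target = torch.ones(spk_preds.size(0), device=spk_preds.device)
    return F.cosine_embedding_loss(spk_preds, spk, target)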
vits/modules.py CHANGED
@@ -1,16 +1,7 @@
-import copy
-import math
-import numpy as np
-import scipy
 import torch
 from torch import nn
 from torch.nn import functional as F
-
-from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
-from torch.nn.utils import weight_norm, remove_weight_norm
-
 from vits import commons
-from vits.commons import init_weights, get_padding


 LRELU_SLOPE = 0.1
@@ -220,148 +211,6 @@ class WN(torch.nn.Module):
             torch.nn.utils.remove_weight_norm(l)


-class ResBlock1(torch.nn.Module):
-    def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
-        super(ResBlock1, self).__init__()
-        self.convs1 = nn.ModuleList(
-            [
-                weight_norm(
-                    Conv1d(
-                        channels,
-                        channels,
-                        kernel_size,
-                        1,
-                        dilation=dilation[0],
-                        padding=get_padding(kernel_size, dilation[0]),
-                    )
-                ),
-                weight_norm(
-                    Conv1d(
-                        channels,
-                        channels,
-                        kernel_size,
-                        1,
-                        dilation=dilation[1],
-                        padding=get_padding(kernel_size, dilation[1]),
-                    )
-                ),
-                weight_norm(
-                    Conv1d(
-                        channels,
-                        channels,
-                        kernel_size,
-                        1,
-                        dilation=dilation[2],
-                        padding=get_padding(kernel_size, dilation[2]),
-                    )
-                ),
-            ]
-        )
-        self.convs1.apply(init_weights)
-
-        self.convs2 = nn.ModuleList(
-            [
-                weight_norm(
-                    Conv1d(
-                        channels,
-                        channels,
-                        kernel_size,
-                        1,
-                        dilation=1,
-                        padding=get_padding(kernel_size, 1),
-                    )
-                ),
-                weight_norm(
-                    Conv1d(
-                        channels,
-                        channels,
-                        kernel_size,
-                        1,
-                        dilation=1,
-                        padding=get_padding(kernel_size, 1),
-                    )
-                ),
-                weight_norm(
-                    Conv1d(
-                        channels,
-                        channels,
-                        kernel_size,
-                        1,
-                        dilation=1,
-                        padding=get_padding(kernel_size, 1),
-                    )
-                ),
-            ]
-        )
-        self.convs2.apply(init_weights)
-
-    def forward(self, x, x_mask=None):
-        for c1, c2 in zip(self.convs1, self.convs2):
-            xt = F.leaky_relu(x, LRELU_SLOPE)
-            if x_mask is not None:
-                xt = xt * x_mask
-            xt = c1(xt)
-            xt = F.leaky_relu(xt, LRELU_SLOPE)
-            if x_mask is not None:
-                xt = xt * x_mask
-            xt = c2(xt)
-            x = xt + x
-        if x_mask is not None:
-            x = x * x_mask
-        return x
-
-    def remove_weight_norm(self):
-        for l in self.convs1:
-            remove_weight_norm(l)
-        for l in self.convs2:
-            remove_weight_norm(l)
-
-
-class ResBlock2(torch.nn.Module):
-    def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
-        super(ResBlock2, self).__init__()
-        self.convs = nn.ModuleList(
-            [
-                weight_norm(
-                    Conv1d(
-                        channels,
-                        channels,
-                        kernel_size,
-                        1,
-                        dilation=dilation[0],
-                        padding=get_padding(kernel_size, dilation[0]),
-                    )
-                ),
-                weight_norm(
-                    Conv1d(
-                        channels,
-                        channels,
-                        kernel_size,
-                        1,
-                        dilation=dilation[1],
-                        padding=get_padding(kernel_size, dilation[1]),
-                    )
-                ),
-            ]
-        )
-        self.convs.apply(init_weights)
-
-    def forward(self, x, x_mask=None):
-        for c in self.convs:
-            xt = F.leaky_relu(x, LRELU_SLOPE)
-            if x_mask is not None:
-                xt = xt * x_mask
-            xt = c(xt)
-            x = xt + x
-        if x_mask is not None:
-            x = x * x_mask
-        return x
-
-    def remove_weight_norm(self):
-        for l in self.convs:
-            remove_weight_norm(l)
-
-
 class Log(nn.Module):
     def forward(self, x, x_mask, reverse=False, **kwargs):
         if not reverse:
vits/modules_grl.py ADDED
@@ -0,0 +1,62 @@
+# Adapted from https://github.com/ubisoft/ubisoft-laforge-daft-exprt Apache License Version 2.0
+# Unsupervised Domain Adaptation by Backpropagation
+
+import torch
+import torch.nn as nn
+
+from torch.autograd import Function
+from torch.nn.utils import weight_norm
+
+
+class GradientReversalFunction(Function):
+    @staticmethod
+    def forward(ctx, x, lambda_):
+        ctx.lambda_ = lambda_
+        return x.clone()
+
+    @staticmethod
+    def backward(ctx, grads):
+        lambda_ = ctx.lambda_
+        lambda_ = grads.new_tensor(lambda_)
+        dx = -lambda_ * grads
+        return dx, None
+
+
+class GradientReversal(torch.nn.Module):
+    ''' Gradient Reversal Layer
+        Y. Ganin, V. Lempitsky,
+        "Unsupervised Domain Adaptation by Backpropagation",
+        in ICML, 2015.
+        The forward pass is the identity function.
+        In the backward pass, upstream gradients are multiplied by -lambda (i.e. gradients are reversed).
+    '''
+
+    def __init__(self, lambda_reversal=1):
+        super(GradientReversal, self).__init__()
+        self.lambda_ = lambda_reversal
+
+    def forward(self, x):
+        return GradientReversalFunction.apply(x, self.lambda_)
+
+
+class SpeakerClassifier(nn.Module):
+
+    def __init__(self, embed_dim, spk_dim):
+        super(SpeakerClassifier, self).__init__()
+        self.classifier = nn.Sequential(
+            GradientReversal(lambda_reversal=1),
+            weight_norm(nn.Conv1d(embed_dim, embed_dim, kernel_size=5, padding=2)),
+            nn.ReLU(),
+            weight_norm(nn.Conv1d(embed_dim, embed_dim, kernel_size=5, padding=2)),
+            nn.ReLU(),
+            weight_norm(nn.Conv1d(embed_dim, spk_dim, kernel_size=5, padding=2))
+        )
+
+    def forward(self, x):
+        ''' Forward function of Speaker Classifier:
+            x = (B, embed_dim, len)
+        '''
+        # pass through classifier
+        outputs = self.classifier(x)           # (B, spk_dim, len)
+        outputs = torch.mean(outputs, dim=-1)  # (B, spk_dim)
+        return outputs
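A quick numeric check of the new gradient reversal layer: the forward pass is the identity, while the backward pass multiplies incoming gradients by -lambda.

import torch
from vits.modules_grl import GradientReversal

x = torch.ones(3, requires_grad=True)
y = GradientReversal(lambda_reversal=1)(x).sum()  # forward is identity: y == 3
y.backward()
print(x.grad)  # tensor([-1., -1., -1.]) -- upstream gradient of 1 flipped to -1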
vits/utils.py CHANGED
@@ -1,10 +1,6 @@
-import os
-import argparse
-import numpy as np
 import torch
-
+import numpy as np
 from scipy.io.wavfile import read
-from omegaconf import OmegaConf

 MATPLOTLIB_FLAG = False
@@ -35,18 +31,3 @@ def f0_to_coarse(f0):
     assert f0_coarse.max() <= 255 and f0_coarse.min(
     ) >= 1, (f0_coarse.max(), f0_coarse.min())
     return f0_coarse
-
-
-def get_hparams(init=True):
-    parser = argparse.ArgumentParser()
-    parser.add_argument('-c', '--config', type=str, default="./configs/base.yaml",
-                        help='YAML file for configuration')
-    args = parser.parse_args()
-    hparams = OmegaConf.load(args.config)
-    model_dir = os.path.join("./logs", hparams.train.model)
-    if not os.path.exists(model_dir):
-        os.makedirs(model_dir)
-    config_save_path = os.path.join(model_dir, "config.json")
-    os.system(f"cp {args.config} {config_save_path}")
-    hparams.model_dir = model_dir
-    return hparams
vits_decoder/__init__.py CHANGED
@@ -0,0 +1 @@
+from .alias.act import SnakeAlias
vits_decoder/alias/act.py CHANGED
@@ -1,7 +1,12 @@
 # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
 # LICENSE is in incl_licenses directory.

+import torch
 import torch.nn as nn
+import torch.nn.functional as F
+
+from torch import sin, pow
+from torch.nn import Parameter
 from .resample import UpSample1d, DownSample1d


@@ -19,6 +24,102 @@ class Activation1d(nn.Module):
         self.upsample = UpSample1d(up_ratio, up_kernel_size)
         self.downsample = DownSample1d(down_ratio, down_kernel_size)

+    # x: [B,C,T]
+    def forward(self, x):
+        x = self.upsample(x)
+        x = self.act(x)
+        x = self.downsample(x)
+
+        return x
+
+
+class SnakeBeta(nn.Module):
+    '''
+    A modified Snake function which uses separate parameters for the magnitude of the periodic components
+    Shape:
+        - Input: (B, C, T)
+        - Output: (B, C, T), same shape as the input
+    Parameters:
+        - alpha - trainable parameter that controls frequency
+        - beta - trainable parameter that controls magnitude
+    References:
+        - This activation function is a modified version based on this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda:
+        https://arxiv.org/abs/2006.08195
+    Examples:
+        >>> a1 = snakebeta(256)
+        >>> x = torch.randn(256)
+        >>> x = a1(x)
+    '''
+
+    def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False):
+        '''
+        Initialization.
+        INPUT:
+            - in_features: shape of the input
+            - alpha - trainable parameter that controls frequency
+            - beta - trainable parameter that controls magnitude
+            alpha is initialized to 1 by default, higher values = higher-frequency.
+            beta is initialized to 1 by default, higher values = higher-magnitude.
+            alpha will be trained along with the rest of your model.
+        '''
+        super(SnakeBeta, self).__init__()
+        self.in_features = in_features
+        # initialize alpha
+        self.alpha_logscale = alpha_logscale
+        if self.alpha_logscale:  # log scale alphas initialized to zeros
+            self.alpha = Parameter(torch.zeros(in_features) * alpha)
+            self.beta = Parameter(torch.zeros(in_features) * alpha)
+        else:  # linear scale alphas initialized to ones
+            self.alpha = Parameter(torch.ones(in_features) * alpha)
+            self.beta = Parameter(torch.ones(in_features) * alpha)
+        self.alpha.requires_grad = alpha_trainable
+        self.beta.requires_grad = alpha_trainable
+        self.no_div_by_zero = 0.000000001
+
+    def forward(self, x):
+        '''
+        Forward pass of the function.
+        Applies the function to the input elementwise.
+        SnakeBeta = x + 1/b * sin^2 (xa)
+        '''
+        alpha = self.alpha.unsqueeze(
+            0).unsqueeze(-1)  # line up with x to [B, C, T]
+        beta = self.beta.unsqueeze(0).unsqueeze(-1)
+        if self.alpha_logscale:
+            alpha = torch.exp(alpha)
+            beta = torch.exp(beta)
+        x = x + (1.0 / (beta + self.no_div_by_zero)) * pow(sin(x * alpha), 2)
+        return x
+
+
+class Mish(nn.Module):
+    """
+    Mish activation function is proposed in "Mish: A Self
+    Regularized Non-Monotonic Neural Activation Function"
+    paper, https://arxiv.org/abs/1908.08681.
+    """
+
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, x):
+        return x * torch.tanh(F.softplus(x))
+
+
+class SnakeAlias(nn.Module):
+    def __init__(self,
+                 channels,
+                 up_ratio: int = 2,
+                 down_ratio: int = 2,
+                 up_kernel_size: int = 12,
+                 down_kernel_size: int = 12):
+        super().__init__()
+        self.up_ratio = up_ratio
+        self.down_ratio = down_ratio
+        self.act = SnakeBeta(channels, alpha_logscale=True)
+        self.upsample = UpSample1d(up_ratio, up_kernel_size)
+        self.downsample = DownSample1d(down_ratio, down_kernel_size)
+
     # x: [B,C,T]
     def forward(self, x):
         x = self.upsample(x)
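SnakeAlias wraps SnakeBeta (y = x + (1/beta) * sin^2(alpha * x), per channel) between a 2x upsample and a 2x downsample so the periodic nonlinearity does not alias; the time dimension is unchanged end to end. A small shape check (a sketch; it relies on resample.py from this package):

import torch
from vits_decoder.alias.act import SnakeAlias

act = SnakeAlias(channels=8)   # upsample 2x -> SnakeBeta -> downsample 2x
x = torch.randn(1, 8, 64)      # [B, C, T]
print(act(x).shape)            # torch.Size([1, 8, 64]) -- T is preserved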
vits_decoder/alias/activations.py DELETED
File without changes
vits_decoder/bigv.py CHANGED
@@ -1,14 +1,9 @@
 import torch
-import torch.nn.functional as F
 import torch.nn as nn

-from torch import nn, sin, pow
-from torch.nn import Parameter
 from torch.nn import Conv1d
 from torch.nn.utils import weight_norm, remove_weight_norm
-
-
-from .alias import *
+from .alias.act import SnakeAlias


 def init_weights(m, mean=0.0, std=0.01):
@@ -21,69 +16,9 @@ def get_padding(kernel_size, dilation=1):
     return int((kernel_size*dilation - dilation)/2)


-class SnakeBeta(nn.Module):
-    '''
-    A modified Snake function which uses separate parameters for the magnitude of the periodic components
-    Shape:
-        - Input: (B, C, T)
-        - Output: (B, C, T), same shape as the input
-    Parameters:
-        - alpha - trainable parameter that controls frequency
-        - beta - trainable parameter that controls magnitude
-    References:
-        - This activation function is a modified version based on this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda:
-        https://arxiv.org/abs/2006.08195
-    Examples:
-        >>> a1 = snakebeta(256)
-        >>> x = torch.randn(256)
-        >>> x = a1(x)
-    '''
-
-    def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False):
-        '''
-        Initialization.
-        INPUT:
-            - in_features: shape of the input
-            - alpha - trainable parameter that controls frequency
-            - beta - trainable parameter that controls magnitude
-            alpha is initialized to 1 by default, higher values = higher-frequency.
-            beta is initialized to 1 by default, higher values = higher-magnitude.
-            alpha will be trained along with the rest of your model.
-        '''
-        super(SnakeBeta, self).__init__()
-        self.in_features = in_features
-        # initialize alpha
-        self.alpha_logscale = alpha_logscale
-        if self.alpha_logscale:  # log scale alphas initialized to zeros
-            self.alpha = Parameter(torch.zeros(in_features) * alpha)
-            self.beta = Parameter(torch.zeros(in_features) * alpha)
-        else:  # linear scale alphas initialized to ones
-            self.alpha = Parameter(torch.ones(in_features) * alpha)
-            self.beta = Parameter(torch.ones(in_features) * alpha)
-        self.alpha.requires_grad = alpha_trainable
-        self.beta.requires_grad = alpha_trainable
-        self.no_div_by_zero = 0.000000001
-
-    def forward(self, x):
-        '''
-        Forward pass of the function.
-        Applies the function to the input elementwise.
-        SnakeBeta := x + 1/b * sin^2 (xa)
-        '''
-        alpha = self.alpha.unsqueeze(
-            0).unsqueeze(-1)  # line up with x to [B, C, T]
-        beta = self.beta.unsqueeze(0).unsqueeze(-1)
-        if self.alpha_logscale:
-            alpha = torch.exp(alpha)
-            beta = torch.exp(beta)
-        x = x + (1.0 / (beta + self.no_div_by_zero)) * pow(sin(x * alpha), 2)
-        return x
-
-
 class AMPBlock(torch.nn.Module):
-    def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)):
+    def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
         super(AMPBlock, self).__init__()
-        self.h = h
         self.convs1 = nn.ModuleList([
             weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
                                padding=get_padding(kernel_size, dilation[0]))),
@@ -109,9 +44,7 @@ class AMPBlock(torch.nn.Module):

         # periodic nonlinearity with snakebeta function and anti-aliasing
         self.activations = nn.ModuleList([
-            Activation1d(
-                activation=SnakeBeta(channels, alpha_logscale=True))
-            for _ in range(self.num_layers)
+            SnakeAlias(channels) for _ in range(self.num_layers)
         ])

     def forward(self, x):
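With SnakeBeta moved into vits_decoder/alias/act.py, AMPBlock no longer needs the hparams object and can be constructed standalone; being a residual block, it preserves the input shape. A small sketch:

import torch
from vits_decoder.bigv import AMPBlock

block = AMPBlock(channels=32, kernel_size=3, dilation=(1, 3, 5))
x = torch.randn(1, 32, 100)    # [B, C, T]
print(block(x).shape)          # torch.Size([1, 32, 100])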
vits_decoder/discriminator.py CHANGED
@@ -1,32 +1,39 @@
 import torch
 import torch.nn as nn

+from omegaconf import OmegaConf
+from .msd import ScaleDiscriminator
 from .mpd import MultiPeriodDiscriminator
 from .mrd import MultiResolutionDiscriminator
-from omegaconf import OmegaConf
+

 class Discriminator(nn.Module):
     def __init__(self, hp):
         super(Discriminator, self).__init__()
         self.MRD = MultiResolutionDiscriminator(hp)
         self.MPD = MultiPeriodDiscriminator(hp)
+        self.MSD = ScaleDiscriminator()

     def forward(self, x):
-        return self.MRD(x), self.MPD(x)
+        r = self.MRD(x)
+        p = self.MPD(x)
+        s = self.MSD(x)
+        return r + p + s
+

 if __name__ == '__main__':
-    hp = OmegaConf.load('../config/default.yaml')
+    hp = OmegaConf.load('../config/base.yaml')
     model = Discriminator(hp)

     x = torch.randn(3, 1, 16384)
     print(x.shape)

-    mrd_output, mpd_output = model(x)
-    for features, score in mpd_output:
+    output = model(x)
+    for features, score in output:
         for feat in features:
             print(feat.shape)
         print(score.shape)

-    pytorch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+    pytorch_total_params = sum(p.numel()
+                               for p in model.parameters() if p.requires_grad)
     print(pytorch_total_params)
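Since MRD, MPD and the new MSD each return a Python list of (feature_maps, score) tuples, "r + p + s" is plain list concatenation, and downstream GAN losses can iterate over one flat list regardless of which sub-discriminator produced each entry. A usage sketch (the config path is an assumption, taken relative to the repo root):

import torch
from omegaconf import OmegaConf
from vits_decoder.discriminator import Discriminator

hp = OmegaConf.load('configs/base.yaml')   # assumed path from repo root
disc = Discriminator(hp)
for feature_maps, score in disc(torch.randn(3, 1, 16384)):
    print(len(feature_maps), tuple(score.shape))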
vits_decoder/generator.py CHANGED
@@ -1,5 +1,6 @@
 import torch
 import torch.nn as nn
+import torch.nn.functional as F
 import numpy as np

 from torch.nn import Conv1d
@@ -8,8 +9,7 @@ from torch.nn.utils import weight_norm
 from torch.nn.utils import remove_weight_norm

 from .nsf import SourceModuleHnNSF
-from .bigv import init_weights, SnakeBeta, AMPBlock
-from .alias import Activation1d
+from .bigv import init_weights, AMPBlock, SnakeAlias


 class SpeakerAdapter(nn.Module):
@@ -57,24 +57,28 @@ class Generator(torch.nn.Module):
         # speaker adaper, 256 should change by what speaker encoder you use
         self.adapter = SpeakerAdapter(hp.vits.spk_dim, hp.gen.upsample_input)
         # pre conv
-        self.conv_pre = nn.utils.weight_norm(
-            Conv1d(hp.gen.upsample_input, hp.gen.upsample_initial_channel, 7, 1, padding=3))
+        self.conv_pre = Conv1d(hp.gen.upsample_input,
+                               hp.gen.upsample_initial_channel, 7, 1, padding=3)
         # nsf
         self.f0_upsamp = torch.nn.Upsample(
             scale_factor=np.prod(hp.gen.upsample_rates))
-        self.m_source = SourceModuleHnNSF()
+        self.m_source = SourceModuleHnNSF(sampling_rate=hp.data.sampling_rate)
         self.noise_convs = nn.ModuleList()
         # transposed conv-based upsamplers. does not apply anti-aliasing
         self.ups = nn.ModuleList()
         for i, (u, k) in enumerate(zip(hp.gen.upsample_rates, hp.gen.upsample_kernel_sizes)):
             # print(f'ups: {i} {k}, {u}, {(k - u) // 2}')
             # base
-            self.ups.append(nn.ModuleList([
-                weight_norm(ConvTranspose1d(hp.gen.upsample_initial_channel // (2 ** i),
-                                            hp.gen.upsample_initial_channel // (
-                                                2 ** (i + 1)),
-                                            k, u, padding=(k - u) // 2))
-            ]))
+            self.ups.append(
+                weight_norm(
+                    ConvTranspose1d(
+                        hp.gen.upsample_initial_channel // (2 ** i),
+                        hp.gen.upsample_initial_channel // (2 ** (i + 1)),
+                        k,
+                        u,
+                        padding=(k - u) // 2)
+                )
+            )
             # nsf
             if i + 1 < len(hp.gen.upsample_rates):
                 stride_f0 = np.prod(hp.gen.upsample_rates[i + 1:])
@@ -99,32 +103,30 @@ class Generator(torch.nn.Module):
         for i in range(len(self.ups)):
             ch = hp.gen.upsample_initial_channel // (2 ** (i + 1))
             for k, d in zip(hp.gen.resblock_kernel_sizes, hp.gen.resblock_dilation_sizes):
-                self.resblocks.append(AMPBlock(hp, ch, k, d))
+                self.resblocks.append(AMPBlock(ch, k, d))

         # post conv
-        activation_post = SnakeBeta(ch, alpha_logscale=True)
-        self.activation_post = Activation1d(activation=activation_post)
-        self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3))
-
+        self.activation_post = SnakeAlias(ch)
+        self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
         # weight initialization
-        for i in range(len(self.ups)):
-            self.ups[i].apply(init_weights)
-        self.conv_post.apply(init_weights)
+        self.ups.apply(init_weights)

     def forward(self, spk, x, f0):
+        # Perturbation
+        x = x + torch.randn_like(x)
         # adapter
         x = self.adapter(x, spk)
+        x = self.conv_pre(x)
+        x = x * torch.tanh(F.softplus(x))
         # nsf
         f0 = f0[:, None]
         f0 = self.f0_upsamp(f0).transpose(1, 2)
         har_source = self.m_source(f0)
         har_source = har_source.transpose(1, 2)
-        x = self.conv_pre(x)

         for i in range(self.num_upsamples):
             # upsampling
-            for i_up in range(len(self.ups[i])):
-                x = self.ups[i][i_up](x)
+            x = self.ups[i](x)
             # nsf
             x_source = self.noise_convs[i](har_source)
             x = x + x_source
@@ -145,12 +147,9 @@ class Generator(torch.nn.Module):

     def remove_weight_norm(self):
         for l in self.ups:
-            for l_i in l:
-                remove_weight_norm(l_i)
+            remove_weight_norm(l)
         for l in self.resblocks:
             l.remove_weight_norm()
-        remove_weight_norm(self.conv_pre)
-        remove_weight_norm(self.conv_post)

     def eval(self, inference=False):
         super(Generator, self).eval()
@@ -177,11 +176,11 @@ class Generator(torch.nn.Module):
         # adapter
         x = self.adapter(x, spk)
         x = self.conv_pre(x)
+        x = x * torch.tanh(F.softplus(x))

         for i in range(self.num_upsamples):
             # upsampling
-            for i_up in range(len(self.ups[i])):
-                x = self.ups[i][i_up](x)
+            x = self.ups[i](x)
             # nsf
             x_source = self.noise_convs[i](har_source)
             x = x + x_source
vits_decoder/med.py ADDED
@@ -0,0 +1,65 @@
+import torch
+import torchaudio
+import typing as T
+
+
+class MelspecDiscriminator(torch.nn.Module):
+    """mel spectrogram (frequency domain) discriminator"""
+
+    def __init__(self) -> None:
+        super().__init__()
+        self.SAMPLE_RATE = 48000
+        # mel filterbank transform
+        self._melspec = torchaudio.transforms.MelSpectrogram(
+            sample_rate=self.SAMPLE_RATE,
+            n_fft=2048,
+            win_length=int(0.025 * self.SAMPLE_RATE),
+            hop_length=int(0.010 * self.SAMPLE_RATE),
+            n_mels=128,
+            power=1,
+        )
+
+        # time-frequency 2D convolutions
+        kernel_sizes = [(7, 7), (4, 4), (4, 4), (4, 4)]
+        strides = [(1, 2), (1, 2), (1, 2), (1, 2)]
+        self._convs = torch.nn.ModuleList(
+            [
+                torch.nn.Sequential(
+                    torch.nn.Conv2d(
+                        in_channels=1 if i == 0 else 32,
+                        out_channels=64,
+                        kernel_size=k,
+                        stride=s,
+                        padding=(1, 2),
+                        bias=False,
+                    ),
+                    torch.nn.BatchNorm2d(num_features=64),
+                    torch.nn.GLU(dim=1),
+                )
+                for i, (k, s) in enumerate(zip(kernel_sizes, strides))
+            ]
+        )
+
+        # output adversarial projection
+        self._postnet = torch.nn.Conv2d(
+            in_channels=32,
+            out_channels=1,
+            kernel_size=(15, 3),
+            stride=(1, 2),
+        )
+
+    def forward(self, x: torch.Tensor) -> T.Tuple[torch.Tensor, T.List[torch.Tensor]]:
+        # apply the log-scale mel spectrogram transform
+        x = torch.log(self._melspec(x) + 1e-5)
+
+        # compute hidden layers and feature maps
+        f = []
+        for c in self._convs:
+            x = c(x)
+            f.append(x)
+
+        # apply the output projection and global average pooling
+        x = self._postnet(x)
+        x = x.mean(dim=[-2, -1])
+
+        return [(f, x)]
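One bookkeeping detail in MelspecDiscriminator: every Conv2d emits 64 channels, but GLU(dim=1) splits them into two halves and gates one with the other, leaving 32, which is why the later convs and the postnet take in_channels=32. Quick check:

import torch

glu = torch.nn.GLU(dim=1)
x = torch.randn(1, 64, 10, 10)
print(glu(x).shape)  # torch.Size([1, 32, 10, 10]) -- channel count halved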
vits_decoder/msd.py ADDED
@@ -0,0 +1,29 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.nn.utils import weight_norm
+
+
+class ScaleDiscriminator(torch.nn.Module):
+    def __init__(self):
+        super(ScaleDiscriminator, self).__init__()
+        self.convs = nn.ModuleList([
+            weight_norm(nn.Conv1d(1, 16, 15, 1, padding=7)),
+            weight_norm(nn.Conv1d(16, 64, 41, 4, groups=4, padding=20)),
+            weight_norm(nn.Conv1d(64, 256, 41, 4, groups=16, padding=20)),
+            weight_norm(nn.Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
+            weight_norm(nn.Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
+            weight_norm(nn.Conv1d(1024, 1024, 5, 1, padding=2)),
+        ])
+        self.conv_post = weight_norm(nn.Conv1d(1024, 1, 3, 1, padding=1))
+
+    def forward(self, x):
+        fmap = []
+        for l in self.convs:
+            x = l(x)
+            x = F.leaky_relu(x, 0.1)
+            fmap.append(x)
+        x = self.conv_post(x)
+        fmap.append(x)
+        x = torch.flatten(x, 1, -1)
+        return [(fmap, x)]
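The four stride-4 convolutions downsample by 256x overall, so ScaleDiscriminator emits roughly one logit per 256 input samples. Shape check (sketch):

import torch
from vits_decoder.msd import ScaleDiscriminator

(fmap, score), = ScaleDiscriminator()(torch.randn(3, 1, 16384))
print(score.shape)  # torch.Size([3, 64]) -- 16384 / 256 = 64 logits per item
print(len(fmap))    # 7 feature maps for a feature-matching loss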
vits_decoder/nsf.py CHANGED
@@ -356,34 +356,15 @@ class SourceModuleCycNoise_v1(torch.nn.Module):


 class SourceModuleHnNSF(torch.nn.Module):
-    """SourceModule for hn-nsf
-    SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
-                 add_noise_std=0.003, voiced_threshod=0)
-    sampling_rate: sampling_rate in Hz
-    harmonic_num: number of harmonic above F0 (default: 0)
-    sine_amp: amplitude of sine source signal (default: 0.1)
-    add_noise_std: std of additive Gaussian noise (default: 0.003)
-        note that amplitude of noise in unvoiced is decided
-        by sine_amp
-    voiced_threshold: threhold to set U/V given F0 (default: 0)
-
-    Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
-    F0_sampled (batchsize, length, 1)
-    Sine_source (batchsize, length, 1)
-    noise_source (batchsize, length 1)
-    uv (batchsize, length, 1)
-    """
-
     def __init__(
         self,
-        sampling_rate=48000,
-        harmonic_num=10,
+        sampling_rate=32000,
         sine_amp=0.1,
         add_noise_std=0.003,
         voiced_threshod=0,
     ):
         super(SourceModuleHnNSF, self).__init__()
-
+        harmonic_num = 10
         self.sine_amp = sine_amp
         self.noise_std = add_noise_std

@@ -393,17 +374,21 @@ class SourceModuleHnNSF(torch.nn.Module):
         )

         # to merge source harmonics into a single excitation
-        self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
         self.l_tanh = torch.nn.Tanh()
+        self.register_buffer('merge_w', torch.FloatTensor([[
+            0.2942, -0.2243, 0.0033, -0.0056, -0.0020, -0.0046,
+            0.0221, -0.0083, -0.0241, -0.0036, -0.0581]]))
+        self.register_buffer('merge_b', torch.FloatTensor([0.0008]))

     def forward(self, x):
         """
-        Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
+        Sine_source = SourceModuleHnNSF(F0_sampled)
         F0_sampled (batchsize, length, 1)
         Sine_source (batchsize, length, 1)
-        noise_source (batchsize, length 1)
         """
         # source for harmonic branch
         sine_wavs = self.l_sin_gen(x)
-        sine_merge = self.l_tanh(self.l_linear(sine_wavs))
+        sine_wavs = torch_nn_func.linear(
+            sine_wavs, self.merge_w) + self.merge_b
+        sine_merge = self.l_tanh(sine_wavs)
         return sine_merge
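The trainable Linear(harmonic_num + 1, 1) that mixed the 11 harmonic sine sources is replaced by fixed weights stored as buffers (presumably frozen from a trained model, so the harmonic mix no longer drifts during fine-tuning and still loads from the checkpoint). Functionally it remains an 11 -> 1 affine map followed by tanh; a standalone sketch using the same constants:

import torch
import torch.nn.functional as F

merge_w = torch.FloatTensor([[0.2942, -0.2243, 0.0033, -0.0056, -0.0020, -0.0046,
                              0.0221, -0.0083, -0.0241, -0.0036, -0.0581]])
merge_b = torch.FloatTensor([0.0008])

sine_wavs = torch.randn(2, 100, 11)              # (batch, length, harmonic_num + 1)
sine_merge = torch.tanh(F.linear(sine_wavs, merge_w) + merge_b)
print(sine_merge.shape)                          # torch.Size([2, 100, 1])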
vits_pretrain/{sovits5.0-48k-debug.pth → sovits5.0_bigvgan.pth} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b2c3e49cebb2968266659507c80a007375f49d616ee216cf084cb3f87e93083d
-size 67866609
+oid sha256:ffed3845044b8bef076d72272da19791e1344ad3b750a02d6e4980acf6cb0a0b
+size 74825605