XiaoHei Studio committed on
Commit cc44d2d
1 Parent(s): c008384

Upload 86 files

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full set.
Files changed (50)
  1. .gitattributes +4 -0
  2. results/audio_Shengshuyan_12key_sovits_pm.wav +3 -0
  3. results/audio_Shengshuyan_12key_sovits_pm_1.wav +3 -0
  4. results/tts_Shengshuyan_0key_sovits_pm.wav +0 -0
  5. results/tts_Shengshuyan_0key_sovits_pm_1.wav +0 -0
  6. results/tts_Shengshuyan_0key_sovits_pm_2.wav +0 -0
  7. results/tts_Shengshuyan_0key_sovits_pm_3.wav +0 -0
  8. results/tts_Shengshuyan_12key_sovits_pm.wav +0 -0
  9. results/tts_Shengshuyan_auto_sovits_pm.wav +0 -0
  10. results/tts_Shengshuyan_auto_sovits_pm_1.wav +0 -0
  11. results/tts_Shengshuyan_auto_sovits_pm_2.wav +3 -0
  12. results/tts_Shengshuyan_auto_sovits_pm_3.wav +0 -0
  13. results/tts_Shengshuyan_auto_sovits_pm_4.wav +0 -0
  14. results/vocals_Shengshuyan_0key_sovits_pm.wav +3 -0
  15. trained/put_trained_checkpoints_here +0 -0
  16. vdecoder/__init__.py +0 -0
  17. vdecoder/__pycache__/__init__.cpython-38.pyc +0 -0
  18. vdecoder/hifigan/__pycache__/env.cpython-38.pyc +0 -0
  19. vdecoder/hifigan/__pycache__/models.cpython-38.pyc +0 -0
  20. vdecoder/hifigan/__pycache__/utils.cpython-38.pyc +0 -0
  21. vdecoder/hifigan/env.py +15 -0
  22. vdecoder/hifigan/models.py +557 -0
  23. vdecoder/hifigan/nvSTFT.py +109 -0
  24. vdecoder/hifigan/utils.py +68 -0
  25. vdecoder/hifiganwithsnake/alias/__init__.py +6 -0
  26. vdecoder/hifiganwithsnake/alias/act.py +130 -0
  27. vdecoder/hifiganwithsnake/alias/filter.py +110 -0
  28. vdecoder/hifiganwithsnake/alias/resample.py +72 -0
  29. vdecoder/hifiganwithsnake/env.py +15 -0
  30. vdecoder/hifiganwithsnake/models.py +576 -0
  31. vdecoder/hifiganwithsnake/nvSTFT.py +109 -0
  32. vdecoder/hifiganwithsnake/utils.py +68 -0
  33. vdecoder/nsf_hifigan/__pycache__/env.cpython-38.pyc +0 -0
  34. vdecoder/nsf_hifigan/__pycache__/models.cpython-38.pyc +0 -0
  35. vdecoder/nsf_hifigan/__pycache__/nvSTFT.cpython-38.pyc +0 -0
  36. vdecoder/nsf_hifigan/__pycache__/utils.cpython-38.pyc +0 -0
  37. vdecoder/nsf_hifigan/env.py +15 -0
  38. vdecoder/nsf_hifigan/models.py +441 -0
  39. vdecoder/nsf_hifigan/nvSTFT.py +132 -0
  40. vdecoder/nsf_hifigan/utils.py +70 -0
  41. vencoder/CNHubertLarge.py +36 -0
  42. vencoder/ContentVec256L12_Onnx.py +33 -0
  43. vencoder/ContentVec256L9.py +38 -0
  44. vencoder/ContentVec256L9_Onnx.py +32 -0
  45. vencoder/ContentVec768L12.py +37 -0
  46. vencoder/ContentVec768L12_Onnx.py +33 -0
  47. vencoder/ContentVec768L9_Onnx.py +33 -0
  48. vencoder/DPHubert.py +29 -0
  49. vencoder/HubertSoft.py +28 -0
  50. vencoder/HubertSoft_Onnx.py +33 -0
.gitattributes CHANGED
@@ -21,3 +21,7 @@ pretrain/hubert-soft-0d54a1f4.pt filter=lfs diff=lfs merge=lfs -text
 pretrain/medium.pt filter=lfs diff=lfs merge=lfs -text
 pretrain/nsf_hifigan/model filter=lfs diff=lfs merge=lfs -text
 pretrain/rmvpe.pt filter=lfs diff=lfs merge=lfs -text
+results/audio_Shengshuyan_12key_sovits_pm_1.wav filter=lfs diff=lfs merge=lfs -text
+results/audio_Shengshuyan_12key_sovits_pm.wav filter=lfs diff=lfs merge=lfs -text
+results/tts_Shengshuyan_auto_sovits_pm_2.wav filter=lfs diff=lfs merge=lfs -text
+results/vocals_Shengshuyan_0key_sovits_pm.wav filter=lfs diff=lfs merge=lfs -text
results/audio_Shengshuyan_12key_sovits_pm.wav ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:802153e84b5b9415e88fead81f26dfa39276411458d1d7b46bfa158ae2268084
+size 1603520
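The pointer above is what Git LFS stores in place of the 1.6 MB wav itself: one "key value" line each for the spec version, the SHA-256 of the real payload, and its size in bytes. A minimal sketch of reading such a pointer, assuming the repo is checked out without LFS smudging (the helper name is ours):

import os

def read_lfs_pointer(path):
    """Parse a Git LFS pointer file ('key value' per line) into a dict."""
    fields = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            if key:
                fields[key] = value
    return fields

pointer = read_lfs_pointer("results/audio_Shengshuyan_12key_sovits_pm.wav")
print(pointer["oid"], int(pointer["size"]))  # sha256:8021... 1603520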
results/audio_Shengshuyan_12key_sovits_pm_1.wav ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c8b0907cc03021cdd9b396a5f25158e02ba36186733f03e1d7d9e59048dc1450
+size 1534724
results/tts_Shengshuyan_0key_sovits_pm.wav ADDED
Binary file (709 kB)

results/tts_Shengshuyan_0key_sovits_pm_1.wav ADDED
Binary file (301 kB)

results/tts_Shengshuyan_0key_sovits_pm_2.wav ADDED
Binary file (301 kB)

results/tts_Shengshuyan_0key_sovits_pm_3.wav ADDED
Binary file (423 kB)

results/tts_Shengshuyan_12key_sovits_pm.wav ADDED
Binary file (709 kB)

results/tts_Shengshuyan_auto_sovits_pm.wav ADDED
Binary file (553 kB)

results/tts_Shengshuyan_auto_sovits_pm_1.wav ADDED
Binary file (553 kB)
results/tts_Shengshuyan_auto_sovits_pm_2.wav ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:008ab4c60f5e1b31466e4b60d97ababbf8ec476c01d0e2c1c330972d72fda1fd
+size 1172752
results/tts_Shengshuyan_auto_sovits_pm_3.wav ADDED
Binary file (599 kB)

results/tts_Shengshuyan_auto_sovits_pm_4.wav ADDED
Binary file (286 kB)
results/vocals_Shengshuyan_0key_sovits_pm.wav ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d4057ce12f656c6325fa616e97b1fe82e392e3e0bafb518384c5f548dd549ca8
+size 18330192
trained/put_trained_checkpoints_here ADDED
File without changes
vdecoder/__init__.py ADDED
File without changes
vdecoder/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (158 Bytes)

vdecoder/hifigan/__pycache__/env.cpython-38.pyc ADDED
Binary file (819 Bytes)

vdecoder/hifigan/__pycache__/models.cpython-38.pyc ADDED
Binary file (16.4 kB)

vdecoder/hifigan/__pycache__/utils.cpython-38.pyc ADDED
Binary file (2.32 kB)
vdecoder/hifigan/env.py ADDED
@@ -0,0 +1,15 @@
+import os
+import shutil
+
+
+class AttrDict(dict):
+    def __init__(self, *args, **kwargs):
+        super(AttrDict, self).__init__(*args, **kwargs)
+        self.__dict__ = self
+
+
+def build_env(config, config_name, path):
+    t_path = os.path.join(path, config_name)
+    if config != t_path:
+        os.makedirs(path, exist_ok=True)
+        shutil.copyfile(config, os.path.join(path, config_name))
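AttrDict simply aliases the dict's storage into its attribute namespace, which is how the JSON config is later exposed as both h["key"] and h.key. A minimal sketch, with illustrative config values:

from vdecoder.hifigan.env import AttrDict

h = AttrDict({"sampling_rate": 44100, "upsample_rates": [8, 8, 2, 2]})
assert h.sampling_rate == h["sampling_rate"] == 44100  # attribute and key access hit the same storage
h.num_mels = 80                                        # new attributes show up as keys too
assert h["num_mels"] == 80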
vdecoder/hifigan/models.py ADDED
@@ -0,0 +1,557 @@
+import json
+import os
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.nn import AvgPool1d, Conv1d, Conv2d, ConvTranspose1d
+from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm
+
+from .env import AttrDict
+from .utils import get_padding, init_weights
+
+LRELU_SLOPE = 0.1
+
+
+def load_model(model_path, device='cuda'):
+    config_file = os.path.join(os.path.split(model_path)[0], 'config.json')
+    with open(config_file) as f:
+        data = f.read()
+
+    global h
+    json_config = json.loads(data)
+    h = AttrDict(json_config)
+
+    generator = Generator(h).to(device)
+
+    cp_dict = torch.load(model_path)
+    generator.load_state_dict(cp_dict['generator'])
+    generator.eval()
+    generator.remove_weight_norm()
+    del cp_dict
+    return generator, h
+
+
+class ResBlock1(torch.nn.Module):
+    def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)):
+        super(ResBlock1, self).__init__()
+        self.h = h
+        self.convs1 = nn.ModuleList([
+            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
+                               padding=get_padding(kernel_size, dilation[0]))),
+            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
+                               padding=get_padding(kernel_size, dilation[1]))),
+            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2],
+                               padding=get_padding(kernel_size, dilation[2])))
+        ])
+        self.convs1.apply(init_weights)
+
+        self.convs2 = nn.ModuleList([
+            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
+                               padding=get_padding(kernel_size, 1))),
+            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
+                               padding=get_padding(kernel_size, 1))),
+            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
+                               padding=get_padding(kernel_size, 1)))
+        ])
+        self.convs2.apply(init_weights)
+
+    def forward(self, x):
+        for c1, c2 in zip(self.convs1, self.convs2):
+            xt = F.leaky_relu(x, LRELU_SLOPE)
+            xt = c1(xt)
+            xt = F.leaky_relu(xt, LRELU_SLOPE)
+            xt = c2(xt)
+            x = xt + x
+        return x
+
+    def remove_weight_norm(self):
+        for l in self.convs1:
+            remove_weight_norm(l)
+        for l in self.convs2:
+            remove_weight_norm(l)
+
+
+class ResBlock2(torch.nn.Module):
+    def __init__(self, h, channels, kernel_size=3, dilation=(1, 3)):
+        super(ResBlock2, self).__init__()
+        self.h = h
+        self.convs = nn.ModuleList([
+            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
+                               padding=get_padding(kernel_size, dilation[0]))),
+            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
+                               padding=get_padding(kernel_size, dilation[1])))
+        ])
+        self.convs.apply(init_weights)
+
+    def forward(self, x):
+        for c in self.convs:
+            xt = F.leaky_relu(x, LRELU_SLOPE)
+            xt = c(xt)
+            x = xt + x
+        return x
+
+    def remove_weight_norm(self):
+        for l in self.convs:
+            remove_weight_norm(l)
+
+
+def padDiff(x):
+    return F.pad(F.pad(x, (0, 0, -1, 1), 'constant', 0) - x, (0, 0, 0, -1), 'constant', 0)
+
+
+class SineGen(torch.nn.Module):
+    """ Definition of sine generator
+    SineGen(samp_rate, harmonic_num = 0,
+            sine_amp = 0.1, noise_std = 0.003,
+            voiced_threshold = 0,
+            flag_for_pulse=False)
+    samp_rate: sampling rate in Hz
+    harmonic_num: number of harmonic overtones (default 0)
+    sine_amp: amplitude of sine waveform (default 0.1)
+    noise_std: std of Gaussian noise (default 0.003)
+    voiced_threshold: F0 threshold for U/V classification (default 0)
+    flag_for_pulse: this SineGen is used inside PulseGen (default False)
+    Note: when flag_for_pulse is True, the first time step of a voiced
+        segment is always sin(np.pi) or cos(0)
+    """
+
+    def __init__(self, samp_rate, harmonic_num=0,
+                 sine_amp=0.1, noise_std=0.003,
+                 voiced_threshold=0,
+                 flag_for_pulse=False):
+        super(SineGen, self).__init__()
+        self.sine_amp = sine_amp
+        self.noise_std = noise_std
+        self.harmonic_num = harmonic_num
+        self.dim = self.harmonic_num + 1
+        self.sampling_rate = samp_rate
+        self.voiced_threshold = voiced_threshold
+        self.flag_for_pulse = flag_for_pulse
+        self.onnx = False
+
+    def _f02uv(self, f0):
+        # generate uv signal
+        uv = (f0 > self.voiced_threshold).type(torch.float32)
+        return uv
+
+    def _f02sine(self, f0_values):
+        """ f0_values: (batchsize, length, dim)
+            where dim indicates fundamental tone and overtones
+        """
+        # convert to F0 in rad. The integer part n can be ignored
+        # because 2 * np.pi * n doesn't affect phase
+        rad_values = (f0_values / self.sampling_rate) % 1
+
+        # initial phase noise (no noise for fundamental component)
+        rand_ini = torch.rand(f0_values.shape[0], f0_values.shape[2],
+                              device=f0_values.device)
+        rand_ini[:, 0] = 0
+        rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
+
+        # instantaneous phase sine[t] = sin(2*pi \sum_i=1 ^{t} rad)
+        if not self.flag_for_pulse:
+            # for normal case
+
+            # To prevent torch.cumsum numerical overflow,
+            # it is necessary to add -1 whenever \sum_k=1^n rad_value_k > 1.
+            # Buffer tmp_over_one_idx indicates the time step to add -1.
+            # This will not change F0 of sine because (x-1) * 2*pi = x * 2*pi
+            tmp_over_one = torch.cumsum(rad_values, 1) % 1
+            tmp_over_one_idx = (padDiff(tmp_over_one)) < 0
+            cumsum_shift = torch.zeros_like(rad_values)
+            cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
+
+            sines = torch.sin(torch.cumsum(rad_values + cumsum_shift, dim=1)
+                              * 2 * np.pi)
+        else:
+            # If necessary, make sure that the first time step of every
+            # voiced segment is sin(pi) or cos(0)
+            # This is used for pulse-train generation
+
+            # identify the last time step in unvoiced segments
+            uv = self._f02uv(f0_values)
+            uv_1 = torch.roll(uv, shifts=-1, dims=1)
+            uv_1[:, -1, :] = 1
+            u_loc = (uv < 1) * (uv_1 > 0)
+
+            # get the instantaneous phase
+            tmp_cumsum = torch.cumsum(rad_values, dim=1)
+            # different batches need to be processed differently
+            for idx in range(f0_values.shape[0]):
+                temp_sum = tmp_cumsum[idx, u_loc[idx, :, 0], :]
+                temp_sum[1:, :] = temp_sum[1:, :] - temp_sum[0:-1, :]
+                # stores the accumulation of i.phase within
+                # each voiced segment
+                tmp_cumsum[idx, :, :] = 0
+                tmp_cumsum[idx, u_loc[idx, :, 0], :] = temp_sum
+
+            # rad_values - tmp_cumsum: remove the accumulation of i.phase
+            # within the previous voiced segment.
+            i_phase = torch.cumsum(rad_values - tmp_cumsum, dim=1)
+
+            # get the sines
+            sines = torch.cos(i_phase * 2 * np.pi)
+        return sines
+
+    def forward(self, f0, upp=None):
+        """ sine_tensor, uv = forward(f0)
+        input F0: tensor(batchsize=1, length, dim=1)
+            f0 for unvoiced steps should be 0
+        output sine_tensor: tensor(batchsize=1, length, dim)
+        output uv: tensor(batchsize=1, length, 1)
+        """
+        if self.onnx:
+            with torch.no_grad():
+                f0 = f0[:, None].transpose(1, 2)
+                f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device)
+                # fundamental component
+                f0_buf[:, :, 0] = f0[:, :, 0]
+                for idx in np.arange(self.harmonic_num):
+                    f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (
+                        idx + 2
+                    )  # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic
+                rad_values = (f0_buf / self.sampling_rate) % 1  # the % 1 means the n_har products cannot be optimized away afterwards
+                rand_ini = torch.rand(
+                    f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device
+                )
+                rand_ini[:, 0] = 0
+                rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
+                tmp_over_one = torch.cumsum(rad_values, 1)  # % 1  # a % 1 here would prevent further optimization of the cumsum below
+                tmp_over_one *= upp
+                tmp_over_one = F.interpolate(
+                    tmp_over_one.transpose(2, 1),
+                    scale_factor=upp,
+                    mode="linear",
+                    align_corners=True,
+                ).transpose(2, 1)
+                rad_values = F.interpolate(
+                    rad_values.transpose(2, 1), scale_factor=upp, mode="nearest"
+                ).transpose(
+                    2, 1
+                )
+                tmp_over_one %= 1
+                tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0
+                cumsum_shift = torch.zeros_like(rad_values)
+                cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
+                sine_waves = torch.sin(
+                    torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi
+                )
+                sine_waves = sine_waves * self.sine_amp
+                uv = self._f02uv(f0)
+                uv = F.interpolate(
+                    uv.transpose(2, 1), scale_factor=upp, mode="nearest"
+                ).transpose(2, 1)
+                noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
+                noise = noise_amp * torch.randn_like(sine_waves)
+                sine_waves = sine_waves * uv + noise
+                return sine_waves, uv, noise
+        else:
+            with torch.no_grad():
+                # fundamental component
+                fn = torch.multiply(f0, torch.FloatTensor([[range(1, self.harmonic_num + 2)]]).to(f0.device))
+
+                # generate sine waveforms
+                sine_waves = self._f02sine(fn) * self.sine_amp
+
+                # generate uv signal
+                # uv = torch.ones(f0.shape)
+                # uv = uv * (f0 > self.voiced_threshold)
+                uv = self._f02uv(f0)
+
+                # noise: for unvoiced should be similar to sine_amp
+                #        std = self.sine_amp/3 -> max value ~ self.sine_amp
+                #        for voiced regions is self.noise_std
+                noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
+                noise = noise_amp * torch.randn_like(sine_waves)
+
+                # first: set the unvoiced part to 0 by uv
+                # then: additive noise
+                sine_waves = sine_waves * uv + noise
+                return sine_waves, uv, noise
+
+
+class SourceModuleHnNSF(torch.nn.Module):
+    """ SourceModule for hn-nsf
+    SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
+                 add_noise_std=0.003, voiced_threshod=0)
+    sampling_rate: sampling_rate in Hz
+    harmonic_num: number of harmonics above F0 (default: 0)
+    sine_amp: amplitude of sine source signal (default: 0.1)
+    add_noise_std: std of additive Gaussian noise (default: 0.003)
+        note that amplitude of noise in unvoiced is decided
+        by sine_amp
+    voiced_threshold: threshold to set U/V given F0 (default: 0)
+    Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
+    F0_sampled (batchsize, length, 1)
+    Sine_source (batchsize, length, 1)
+    noise_source (batchsize, length 1)
+    uv (batchsize, length, 1)
+    """
+
+    def __init__(self, sampling_rate, harmonic_num=0, sine_amp=0.1,
+                 add_noise_std=0.003, voiced_threshod=0):
+        super(SourceModuleHnNSF, self).__init__()
+
+        self.sine_amp = sine_amp
+        self.noise_std = add_noise_std
+
+        # to produce sine waveforms
+        self.l_sin_gen = SineGen(sampling_rate, harmonic_num,
+                                 sine_amp, add_noise_std, voiced_threshod)
+
+        # to merge source harmonics into a single excitation
+        self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
+        self.l_tanh = torch.nn.Tanh()
+
+    def forward(self, x, upp=None):
+        """
+        Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
+        F0_sampled (batchsize, length, 1)
+        Sine_source (batchsize, length, 1)
+        noise_source (batchsize, length 1)
+        """
+        # source for harmonic branch
+        sine_wavs, uv, _ = self.l_sin_gen(x, upp)
+        sine_merge = self.l_tanh(self.l_linear(sine_wavs.to(self.l_linear.weight.dtype)))
+
+        # source for noise branch, in the same shape as uv
+        noise = torch.randn_like(uv) * self.sine_amp / 3
+        return sine_merge, noise, uv
+
+
+class Generator(torch.nn.Module):
+    def __init__(self, h):
+        super(Generator, self).__init__()
+        self.h = h
+
+        self.num_kernels = len(h["resblock_kernel_sizes"])
+        self.num_upsamples = len(h["upsample_rates"])
+        self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(h["upsample_rates"]))
+        self.m_source = SourceModuleHnNSF(
+            sampling_rate=h["sampling_rate"],
+            harmonic_num=8)
+        self.noise_convs = nn.ModuleList()
+        self.conv_pre = weight_norm(Conv1d(h["inter_channels"], h["upsample_initial_channel"], 7, 1, padding=3))
+        resblock = ResBlock1 if h["resblock"] == '1' else ResBlock2
+        self.ups = nn.ModuleList()
+        for i, (u, k) in enumerate(zip(h["upsample_rates"], h["upsample_kernel_sizes"])):
+            c_cur = h["upsample_initial_channel"] // (2 ** (i + 1))
+            self.ups.append(weight_norm(
+                ConvTranspose1d(h["upsample_initial_channel"] // (2 ** i), h["upsample_initial_channel"] // (2 ** (i + 1)),
+                                k, u, padding=(k - u + 1) // 2)))
+            if i + 1 < len(h["upsample_rates"]):
+                stride_f0 = np.prod(h["upsample_rates"][i + 1:])
+                self.noise_convs.append(Conv1d(
+                    1, c_cur, kernel_size=stride_f0 * 2, stride=stride_f0, padding=(stride_f0 + 1) // 2))
+            else:
+                self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1))
+        self.resblocks = nn.ModuleList()
+        for i in range(len(self.ups)):
+            ch = h["upsample_initial_channel"] // (2 ** (i + 1))
+            for j, (k, d) in enumerate(zip(h["resblock_kernel_sizes"], h["resblock_dilation_sizes"])):
+                self.resblocks.append(resblock(h, ch, k, d))
+
+        self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3))
+        self.ups.apply(init_weights)
+        self.conv_post.apply(init_weights)
+        self.cond = nn.Conv1d(h['gin_channels'], h['upsample_initial_channel'], 1)
+        self.upp = np.prod(h["upsample_rates"])
+        self.onnx = False
+
+    def OnnxExport(self):
+        self.onnx = True
+        self.m_source.l_sin_gen.onnx = True
+
+    def forward(self, x, f0, g=None):
+        # print(1,x.shape,f0.shape,f0[:, None].shape)
+        if not self.onnx:
+            f0 = self.f0_upsamp(f0[:, None]).transpose(1, 2)  # bs,n,t
+        # print(2,f0.shape)
+        har_source, noi_source, uv = self.m_source(f0, self.upp)
+        har_source = har_source.transpose(1, 2)
+        x = self.conv_pre(x)
+        x = x + self.cond(g)
+        # print(124,x.shape,har_source.shape)
+        for i in range(self.num_upsamples):
+            x = F.leaky_relu(x, LRELU_SLOPE)
+            # print(3,x.shape)
+            x = self.ups[i](x)
+            x_source = self.noise_convs[i](har_source)
+            # print(4,x_source.shape,har_source.shape,x.shape)
+            x = x + x_source
+            xs = None
+            for j in range(self.num_kernels):
+                if xs is None:
+                    xs = self.resblocks[i * self.num_kernels + j](x)
+                else:
+                    xs += self.resblocks[i * self.num_kernels + j](x)
+            x = xs / self.num_kernels
+        x = F.leaky_relu(x)
+        x = self.conv_post(x)
+        x = torch.tanh(x)
+
+        return x
+
+    def remove_weight_norm(self):
+        print('Removing weight norm...')
+        for l in self.ups:
+            remove_weight_norm(l)
+        for l in self.resblocks:
+            l.remove_weight_norm()
+        remove_weight_norm(self.conv_pre)
+        remove_weight_norm(self.conv_post)
+
+
+class DiscriminatorP(torch.nn.Module):
+    def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
+        super(DiscriminatorP, self).__init__()
+        self.period = period
+        norm_f = weight_norm if use_spectral_norm is False else spectral_norm
+        self.convs = nn.ModuleList([
+            norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
+            norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
+            norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
+            norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
+            norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(2, 0))),
+        ])
+        self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
+
+    def forward(self, x):
+        fmap = []
+
+        # 1d to 2d
+        b, c, t = x.shape
+        if t % self.period != 0:  # pad first
+            n_pad = self.period - (t % self.period)
+            x = F.pad(x, (0, n_pad), "reflect")
+            t = t + n_pad
+        x = x.view(b, c, t // self.period, self.period)
+
+        for l in self.convs:
+            x = l(x)
+            x = F.leaky_relu(x, LRELU_SLOPE)
+            fmap.append(x)
+        x = self.conv_post(x)
+        fmap.append(x)
+        x = torch.flatten(x, 1, -1)
+
+        return x, fmap
+
+
+class MultiPeriodDiscriminator(torch.nn.Module):
+    def __init__(self, periods=None):
+        super(MultiPeriodDiscriminator, self).__init__()
+        self.periods = periods if periods is not None else [2, 3, 5, 7, 11]
+        self.discriminators = nn.ModuleList()
+        for period in self.periods:
+            self.discriminators.append(DiscriminatorP(period))
+
+    def forward(self, y, y_hat):
+        y_d_rs = []
+        y_d_gs = []
+        fmap_rs = []
+        fmap_gs = []
+        for i, d in enumerate(self.discriminators):
+            y_d_r, fmap_r = d(y)
+            y_d_g, fmap_g = d(y_hat)
+            y_d_rs.append(y_d_r)
+            fmap_rs.append(fmap_r)
+            y_d_gs.append(y_d_g)
+            fmap_gs.append(fmap_g)
+
+        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
+
+
+class DiscriminatorS(torch.nn.Module):
+    def __init__(self, use_spectral_norm=False):
+        super(DiscriminatorS, self).__init__()
+        norm_f = weight_norm if use_spectral_norm is False else spectral_norm
+        self.convs = nn.ModuleList([
+            norm_f(Conv1d(1, 128, 15, 1, padding=7)),
+            norm_f(Conv1d(128, 128, 41, 2, groups=4, padding=20)),
+            norm_f(Conv1d(128, 256, 41, 2, groups=16, padding=20)),
+            norm_f(Conv1d(256, 512, 41, 4, groups=16, padding=20)),
+            norm_f(Conv1d(512, 1024, 41, 4, groups=16, padding=20)),
+            norm_f(Conv1d(1024, 1024, 41, 1, groups=16, padding=20)),
+            norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
+        ])
+        self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
+
+    def forward(self, x):
+        fmap = []
+        for l in self.convs:
+            x = l(x)
+            x = F.leaky_relu(x, LRELU_SLOPE)
+            fmap.append(x)
+        x = self.conv_post(x)
+        fmap.append(x)
+        x = torch.flatten(x, 1, -1)
+
+        return x, fmap
+
+
+class MultiScaleDiscriminator(torch.nn.Module):
+    def __init__(self):
+        super(MultiScaleDiscriminator, self).__init__()
+        self.discriminators = nn.ModuleList([
+            DiscriminatorS(use_spectral_norm=True),
+            DiscriminatorS(),
+            DiscriminatorS(),
+        ])
+        self.meanpools = nn.ModuleList([
+            AvgPool1d(4, 2, padding=2),
+            AvgPool1d(4, 2, padding=2)
+        ])
+
+    def forward(self, y, y_hat):
+        y_d_rs = []
+        y_d_gs = []
+        fmap_rs = []
+        fmap_gs = []
+        for i, d in enumerate(self.discriminators):
+            if i != 0:
+                y = self.meanpools[i - 1](y)
+                y_hat = self.meanpools[i - 1](y_hat)
+            y_d_r, fmap_r = d(y)
+            y_d_g, fmap_g = d(y_hat)
+            y_d_rs.append(y_d_r)
+            fmap_rs.append(fmap_r)
+            y_d_gs.append(y_d_g)
+            fmap_gs.append(fmap_g)
+
+        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
+
+
+def feature_loss(fmap_r, fmap_g):
+    loss = 0
+    for dr, dg in zip(fmap_r, fmap_g):
+        for rl, gl in zip(dr, dg):
+            loss += torch.mean(torch.abs(rl - gl))
+
+    return loss * 2
+
+
+def discriminator_loss(disc_real_outputs, disc_generated_outputs):
+    loss = 0
+    r_losses = []
+    g_losses = []
+    for dr, dg in zip(disc_real_outputs, disc_generated_outputs):
+        r_loss = torch.mean((1 - dr) ** 2)
+        g_loss = torch.mean(dg ** 2)
+        loss += (r_loss + g_loss)
+        r_losses.append(r_loss.item())
+        g_losses.append(g_loss.item())
+
+    return loss, r_losses, g_losses
+
+
+def generator_loss(disc_outputs):
+    loss = 0
+    gen_losses = []
+    for dg in disc_outputs:
+        l = torch.mean((1 - dg) ** 2)
+        gen_losses.append(l)
+        loss += l
+
+    return loss, gen_losses
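A minimal sketch of driving this vocoder end to end; the checkpoint path and tensor shapes are assumptions (load_model expects a config.json next to the checkpoint, and Generator reads inter_channels, gin_channels, sampling_rate and the upsample/resblock lists from it):

import torch

from vdecoder.hifigan.models import load_model

generator, h = load_model("path/to/generator.pth", device="cpu")  # hypothetical checkpoint
T = 100                                        # number of conditioning frames
mel = torch.randn(1, h["inter_channels"], T)   # (B, C, T) acoustic features
f0 = torch.full((1, T), 220.0)                 # frame-level F0 in Hz; 0 marks unvoiced
g = torch.zeros(1, h["gin_channels"], 1)       # speaker conditioning
with torch.no_grad():
    wav = generator(mel, f0, g=g)              # (1, 1, T * prod(h["upsample_rates"]))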
vdecoder/hifigan/nvSTFT.py ADDED
@@ -0,0 +1,109 @@
+import os
+
+import librosa
+import numpy as np
+import soundfile as sf
+import torch
+import torch.utils.data
+from librosa.filters import mel as librosa_mel_fn
+
+os.environ["LRU_CACHE_CAPACITY"] = "3"
+
+
+def load_wav_to_torch(full_path, target_sr=None, return_empty_on_exception=False):
+    sampling_rate = None
+    try:
+        data, sampling_rate = sf.read(full_path, always_2d=True)
+    except Exception as ex:
+        print(f"'{full_path}' failed to load.\nException:")
+        print(ex)
+        if return_empty_on_exception:
+            return [], sampling_rate or target_sr or 32000
+        else:
+            raise Exception(ex)
+
+    if len(data.shape) > 1:
+        data = data[:, 0]
+        # check that the audio file is longer than 2 samples (otherwise the slice operation was on the wrong dimension)
+        assert len(data) > 2
+
+    if np.issubdtype(data.dtype, np.integer):  # if audio data is type int
+        max_mag = -np.iinfo(data.dtype).min  # maximum magnitude = min possible value of intXX
+    else:  # if audio data is type fp32
+        max_mag = max(np.amax(data), -np.amin(data))
+        # data should be either 16-bit INT, 32-bit INT or [-1 to 1] float32
+        max_mag = (2 ** 31) + 1 if max_mag > (2 ** 15) else ((2 ** 15) + 1 if max_mag > 1.01 else 1.0)
+
+    data = torch.FloatTensor(data.astype(np.float32)) / max_mag
+
+    # resample will crash with inf/NaN inputs; return_empty_on_exception returns an empty array instead of raising
+    if (torch.isinf(data) | torch.isnan(data)).any() and return_empty_on_exception:
+        return [], sampling_rate or target_sr or 32000
+    if target_sr is not None and sampling_rate != target_sr:
+        data = torch.from_numpy(librosa.core.resample(data.numpy(), orig_sr=sampling_rate, target_sr=target_sr))
+        sampling_rate = target_sr
+
+    return data, sampling_rate
+
+
+def dynamic_range_compression(x, C=1, clip_val=1e-5):
+    return np.log(np.clip(x, a_min=clip_val, a_max=None) * C)
+
+
+def dynamic_range_decompression(x, C=1):
+    return np.exp(x) / C
+
+
+def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
+    return torch.log(torch.clamp(x, min=clip_val) * C)
+
+
+def dynamic_range_decompression_torch(x, C=1):
+    return torch.exp(x) / C
+
+
+class STFT():
+    def __init__(self, sr=22050, n_mels=80, n_fft=1024, win_size=1024, hop_length=256, fmin=20, fmax=11025, clip_val=1e-5):
+        self.target_sr = sr
+
+        self.n_mels = n_mels
+        self.n_fft = n_fft
+        self.win_size = win_size
+        self.hop_length = hop_length
+        self.fmin = fmin
+        self.fmax = fmax
+        self.clip_val = clip_val
+        self.mel_basis = {}
+        self.hann_window = {}
+
+    def get_mel(self, y, center=False):
+        sampling_rate = self.target_sr
+        n_mels = self.n_mels
+        n_fft = self.n_fft
+        win_size = self.win_size
+        hop_length = self.hop_length
+        fmin = self.fmin
+        fmax = self.fmax
+        clip_val = self.clip_val
+
+        if torch.min(y) < -1.:
+            print('min value is ', torch.min(y))
+        if torch.max(y) > 1.:
+            print('max value is ', torch.max(y))
+
+        # cache the mel basis and window per (fmax, device)
+        if str(fmax) + '_' + str(y.device) not in self.mel_basis:
+            mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax)
+            self.mel_basis[str(fmax) + '_' + str(y.device)] = torch.from_numpy(mel).float().to(y.device)
+            self.hann_window[str(y.device)] = torch.hann_window(self.win_size).to(y.device)
+
+        y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft - hop_length) / 2), int((n_fft - hop_length) / 2)), mode='reflect')
+        y = y.squeeze(1)
+
+        spec = torch.stft(y, n_fft, hop_length=hop_length, win_length=win_size, window=self.hann_window[str(y.device)],
+                          center=center, pad_mode='reflect', normalized=False, onesided=True)
+        # print(111,spec)
+        spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9))
+        # print(222,spec)
+        spec = torch.matmul(self.mel_basis[str(fmax) + '_' + str(y.device)], spec)
+        # print(333,spec)
+        spec = dynamic_range_compression_torch(spec, clip_val=clip_val)
+        # print(444,spec)
+        return spec
+
+    def __call__(self, audiopath):
+        audio, sr = load_wav_to_torch(audiopath, target_sr=self.target_sr)
+        spect = self.get_mel(audio.unsqueeze(0)).squeeze(0)
+        return spect
+
+
+stft = STFT()
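A minimal sketch of the mel front end above; the wav path and the 44.1 kHz settings are illustrative, and torch.stft is called the way this file assumes (the older signature without return_complex, matching the cpython-38 pycs in this commit):

import torch

from vdecoder.hifigan.nvSTFT import STFT

stft = STFT(sr=44100, n_mels=80, n_fft=2048, win_size=2048, hop_length=512, fmin=40, fmax=16000)
mel = stft("dataset/example.wav")                # load, resample to sr, then mel: (n_mels, frames)
mel_batch = stft.get_mel(torch.randn(1, 44100))  # or feed a waveform directly: (1, n_mels, frames)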
vdecoder/hifigan/utils.py ADDED
@@ -0,0 +1,68 @@
+import glob
+import os
+
+# matplotlib.use("Agg")
+import matplotlib.pylab as plt
+import torch
+from torch.nn.utils import weight_norm
+
+
+def plot_spectrogram(spectrogram):
+    fig, ax = plt.subplots(figsize=(10, 2))
+    im = ax.imshow(spectrogram, aspect="auto", origin="lower",
+                   interpolation='none')
+    plt.colorbar(im, ax=ax)
+
+    fig.canvas.draw()
+    plt.close()
+
+    return fig
+
+
+def init_weights(m, mean=0.0, std=0.01):
+    classname = m.__class__.__name__
+    if classname.find("Conv") != -1:
+        m.weight.data.normal_(mean, std)
+
+
+def apply_weight_norm(m):
+    classname = m.__class__.__name__
+    if classname.find("Conv") != -1:
+        weight_norm(m)
+
+
+def get_padding(kernel_size, dilation=1):
+    return int((kernel_size * dilation - dilation) / 2)
+
+
+def load_checkpoint(filepath, device):
+    assert os.path.isfile(filepath)
+    print("Loading '{}'".format(filepath))
+    checkpoint_dict = torch.load(filepath, map_location=device)
+    print("Complete.")
+    return checkpoint_dict
+
+
+def save_checkpoint(filepath, obj):
+    print("Saving checkpoint to {}".format(filepath))
+    torch.save(obj, filepath)
+    print("Complete.")
+
+
+def del_old_checkpoints(cp_dir, prefix, n_models=2):
+    pattern = os.path.join(cp_dir, prefix + '????????')
+    cp_list = glob.glob(pattern)  # get checkpoint paths
+    cp_list = sorted(cp_list)  # sort by iteration
+    if len(cp_list) > n_models:  # if more than n_models models are found
+        for cp in cp_list[:-n_models]:  # delete the oldest models other than the latest n_models
+            open(cp, 'w').close()  # empty file contents
+            os.unlink(cp)  # delete file (moves to trash when using Colab)
+
+
+def scan_checkpoint(cp_dir, prefix):
+    pattern = os.path.join(cp_dir, prefix + '????????')
+    cp_list = glob.glob(pattern)
+    if len(cp_list) == 0:
+        return None
+    return sorted(cp_list)[-1]
+
vdecoder/hifiganwithsnake/alias/__init__.py ADDED
@@ -0,0 +1,6 @@
+# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
+# LICENSE is in incl_licenses directory.
+
+from .act import *  # noqa: F403
+from .filter import *  # noqa: F403
+from .resample import *  # noqa: F403
vdecoder/hifiganwithsnake/alias/act.py ADDED
@@ -0,0 +1,130 @@
+# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
+# LICENSE is in incl_licenses directory.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch import pow, sin
+from torch.nn import Parameter
+
+from .resample import DownSample1d, UpSample1d
+
+
+class Activation1d(nn.Module):
+    def __init__(self,
+                 activation,
+                 up_ratio: int = 2,
+                 down_ratio: int = 2,
+                 up_kernel_size: int = 12,
+                 down_kernel_size: int = 12):
+        super().__init__()
+        self.up_ratio = up_ratio
+        self.down_ratio = down_ratio
+        self.act = activation
+        self.upsample = UpSample1d(up_ratio, up_kernel_size)
+        self.downsample = DownSample1d(down_ratio, down_kernel_size)
+
+    # x: [B,C,T]
+    def forward(self, x):
+        x = self.upsample(x)
+        x = self.act(x)
+        x = self.downsample(x)
+
+        return x
+
+
+class SnakeBeta(nn.Module):
+    '''
+    A modified Snake function which uses separate parameters for the magnitude of the periodic components
+    Shape:
+        - Input: (B, C, T)
+        - Output: (B, C, T), same shape as the input
+    Parameters:
+        - alpha - trainable parameter that controls frequency
+        - beta - trainable parameter that controls magnitude
+    References:
+        - This activation function is a modified version based on this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda:
+        https://arxiv.org/abs/2006.08195
+    Examples:
+        >>> a1 = SnakeBeta(256)
+        >>> x = torch.randn(1, 256, 100)
+        >>> x = a1(x)
+    '''
+
+    def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False):
+        '''
+        Initialization.
+        INPUT:
+            - in_features: shape of the input
+            - alpha - trainable parameter that controls frequency
+            - beta - trainable parameter that controls magnitude
+            alpha is initialized to 1 by default, higher values = higher-frequency.
+            beta is initialized to 1 by default, higher values = higher-magnitude.
+            alpha will be trained along with the rest of your model.
+        '''
+        super(SnakeBeta, self).__init__()
+        self.in_features = in_features
+        # initialize alpha
+        self.alpha_logscale = alpha_logscale
+        if self.alpha_logscale:  # log scale alphas initialized to zeros
+            self.alpha = Parameter(torch.zeros(in_features) * alpha)
+            self.beta = Parameter(torch.zeros(in_features) * alpha)
+        else:  # linear scale alphas initialized to ones
+            self.alpha = Parameter(torch.ones(in_features) * alpha)
+            self.beta = Parameter(torch.ones(in_features) * alpha)
+        self.alpha.requires_grad = alpha_trainable
+        self.beta.requires_grad = alpha_trainable
+        self.no_div_by_zero = 0.000000001
+
+    def forward(self, x):
+        '''
+        Forward pass of the function.
+        Applies the function to the input elementwise.
+        SnakeBeta = x + 1/b * sin^2 (xa)
+        '''
+        alpha = self.alpha.unsqueeze(
+            0).unsqueeze(-1)  # line up with x to [B, C, T]
+        beta = self.beta.unsqueeze(0).unsqueeze(-1)
+        if self.alpha_logscale:
+            alpha = torch.exp(alpha)
+            beta = torch.exp(beta)
+        x = x + (1.0 / (beta + self.no_div_by_zero)) * pow(sin(x * alpha), 2)
+        return x
+
+
+class Mish(nn.Module):
+    """
+    Mish activation function is proposed in "Mish: A Self
+    Regularized Non-Monotonic Neural Activation Function"
+    paper, https://arxiv.org/abs/1908.08681.
+    """
+
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, x):
+        return x * torch.tanh(F.softplus(x))
+
+
+class SnakeAlias(nn.Module):
+    def __init__(self,
+                 channels,
+                 up_ratio: int = 2,
+                 down_ratio: int = 2,
+                 up_kernel_size: int = 12,
+                 down_kernel_size: int = 12,
+                 C=None):
+        super().__init__()
+        self.up_ratio = up_ratio
+        self.down_ratio = down_ratio
+        self.act = SnakeBeta(channels, alpha_logscale=True)
+        self.upsample = UpSample1d(up_ratio, up_kernel_size, C)
+        self.downsample = DownSample1d(down_ratio, down_kernel_size, C)
+
+    # x: [B,C,T]
+    def forward(self, x, C=None):
+        x = self.upsample(x, C)
+        x = self.act(x)
+        x = self.downsample(x)
+
+        return x
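A minimal sketch of the anti-aliased Snake activation above; the shapes are illustrative. SnakeAlias upsamples 2x, applies SnakeBeta, then low-pass filters and downsamples 2x, so the time axis comes back at its original length:

import torch

from vdecoder.hifiganwithsnake.alias.act import SnakeAlias, SnakeBeta

x = torch.randn(2, 64, 256)               # (B, C, T)
act = SnakeBeta(64, alpha_logscale=True)
y = act(x)                                 # elementwise, same shape as x
snake = SnakeAlias(64)
z = snake(x)                               # up / act / down, also (2, 64, 256)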
vdecoder/hifiganwithsnake/alias/filter.py ADDED
@@ -0,0 +1,110 @@
+# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
+# LICENSE is in incl_licenses directory.
+
+import math
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+if 'sinc' in dir(torch):
+    sinc = torch.sinc
+else:
+    # This code is adopted from adefossez's julius.core.sinc under the MIT License
+    # https://adefossez.github.io/julius/julius/core.html
+    # LICENSE is in incl_licenses directory.
+    def sinc(x: torch.Tensor):
+        """
+        Implementation of sinc, i.e. sin(pi * x) / (pi * x)
+        __Warning__: Different from julius.sinc, the input is multiplied by `pi`!
+        """
+        return torch.where(x == 0,
+                           torch.tensor(1., device=x.device, dtype=x.dtype),
+                           torch.sin(math.pi * x) / math.pi / x)
+
+
+# This code is adopted from adefossez's julius.lowpass.LowPassFilters under the MIT License
+# https://adefossez.github.io/julius/julius/lowpass.html
+# LICENSE is in incl_licenses directory.
+def kaiser_sinc_filter1d(cutoff, half_width, kernel_size):  # return filter [1,1,kernel_size]
+    even = (kernel_size % 2 == 0)
+    half_size = kernel_size // 2
+
+    # For kaiser window
+    delta_f = 4 * half_width
+    A = 2.285 * (half_size - 1) * math.pi * delta_f + 7.95
+    if A > 50.:
+        beta = 0.1102 * (A - 8.7)
+    elif A >= 21.:
+        beta = 0.5842 * (A - 21) ** 0.4 + 0.07886 * (A - 21.)
+    else:
+        beta = 0.
+    window = torch.kaiser_window(kernel_size, beta=beta, periodic=False)
+
+    # ratio = 0.5/cutoff -> 2 * cutoff = 1 / ratio
+    if even:
+        time = (torch.arange(-half_size, half_size) + 0.5)
+    else:
+        time = torch.arange(kernel_size) - half_size
+    if cutoff == 0:
+        filter_ = torch.zeros_like(time)
+    else:
+        filter_ = 2 * cutoff * window * sinc(2 * cutoff * time)
+        # Normalize filter to have sum = 1, otherwise we will have a small leakage
+        # of the constant component in the input signal.
+        filter_ /= filter_.sum()
+    filter = filter_.view(1, 1, kernel_size)
+
+    return filter
+
+
+class LowPassFilter1d(nn.Module):
+    def __init__(self,
+                 cutoff=0.5,
+                 half_width=0.6,
+                 stride: int = 1,
+                 padding: bool = True,
+                 padding_mode: str = 'replicate',
+                 kernel_size: int = 12,
+                 C=None):
+        # kernel_size should be even number for stylegan3 setup,
+        # in this implementation, odd number is also possible.
+        super().__init__()
+        if cutoff < -0.:
+            raise ValueError("Minimum cutoff must be larger than zero.")
+        if cutoff > 0.5:
+            raise ValueError("A cutoff above 0.5 does not make sense.")
+        self.kernel_size = kernel_size
+        self.even = (kernel_size % 2 == 0)
+        self.pad_left = kernel_size // 2 - int(self.even)
+        self.pad_right = kernel_size // 2
+        self.stride = stride
+        self.padding = padding
+        self.padding_mode = padding_mode
+        filter = kaiser_sinc_filter1d(cutoff, half_width, kernel_size)
+        self.register_buffer("filter", filter)
+        # kept in a plain list so the prebuilt conv is not registered as a submodule
+        self.conv1d_block = None
+        if C is not None:
+            self.conv1d_block = [nn.Conv1d(C, C, kernel_size, stride=self.stride, groups=C, bias=False), ]
+            self.conv1d_block[0].weight = nn.Parameter(self.filter.expand(C, -1, -1))
+            self.conv1d_block[0].requires_grad_(False)
+
+    # input [B, C, T]
+    def forward(self, x):
+        if self.conv1d_block is None:
+            _, C, _ = x.shape
+
+            if self.padding:
+                x = F.pad(x, (self.pad_left, self.pad_right),
+                          mode=self.padding_mode)
+            out = F.conv1d(x, self.filter.expand(C, -1, -1),
+                           stride=self.stride, groups=C)
+        else:
+            # move the prebuilt conv to the input device on first use
+            if self.conv1d_block[0].weight.device != x.device:
+                self.conv1d_block[0] = self.conv1d_block[0].to(x.device)
+            if self.padding:
+                x = F.pad(x, (self.pad_left, self.pad_right),
+                          mode=self.padding_mode)
+            out = self.conv1d_block[0](x)
+
+        return out
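A minimal sketch of the Kaiser-windowed sinc low-pass above; the numbers are illustrative. cutoff and half_width are normalized frequencies (0.5 is Nyquist), and a stride greater than 1 decimates after filtering:

import torch

from vdecoder.hifiganwithsnake.alias.filter import LowPassFilter1d, kaiser_sinc_filter1d

taps = kaiser_sinc_filter1d(cutoff=0.25, half_width=0.3, kernel_size=12)
print(taps.shape, float(taps.sum()))        # torch.Size([1, 1, 12]), ~1.0 (normalized)
lpf = LowPassFilter1d(cutoff=0.25, half_width=0.3, stride=2, kernel_size=12)
y = lpf(torch.randn(2, 64, 256))            # (2, 64, 128): filtered, then decimated by 2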
vdecoder/hifiganwithsnake/alias/resample.py ADDED
@@ -0,0 +1,72 @@
+# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
+# LICENSE is in incl_licenses directory.
+
+import torch.nn as nn
+from torch.nn import functional as F
+
+from .filter import LowPassFilter1d, kaiser_sinc_filter1d
+
+
+class UpSample1d(nn.Module):
+    def __init__(self, ratio=2, kernel_size=None, C=None):
+        super().__init__()
+        self.ratio = ratio
+        self.kernel_size = int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size
+        self.stride = ratio
+        self.pad = self.kernel_size // ratio - 1
+        self.pad_left = self.pad * self.stride + (self.kernel_size - self.stride) // 2
+        self.pad_right = self.pad * self.stride + (self.kernel_size - self.stride + 1) // 2
+        filter = kaiser_sinc_filter1d(cutoff=0.5 / ratio,
+                                      half_width=0.6 / ratio,
+                                      kernel_size=self.kernel_size)
+        self.register_buffer("filter", filter)
+        # kept in a plain list so the prebuilt transposed conv is not registered as a submodule
+        self.conv_transpose1d_block = None
+        if C is not None:
+            self.conv_transpose1d_block = [nn.ConvTranspose1d(C,
+                                                              C,
+                                                              kernel_size=self.kernel_size,
+                                                              stride=self.stride,
+                                                              groups=C,
+                                                              bias=False
+                                                              ), ]
+            self.conv_transpose1d_block[0].weight = nn.Parameter(self.filter.expand(C, -1, -1).clone())
+            self.conv_transpose1d_block[0].requires_grad_(False)
+
+    # x: [B, C, T]
+    def forward(self, x, C=None):
+        if self.conv_transpose1d_block is None:
+            if C is None:
+                _, C, _ = x.shape
+            # print("snake.conv_t.in:",x.shape)
+            x = F.pad(x, (self.pad, self.pad), mode='replicate')
+            x = self.ratio * F.conv_transpose1d(
+                x, self.filter.expand(C, -1, -1), stride=self.stride, groups=C)
+            # print("snake.conv_t.out:",x.shape)
+            x = x[..., self.pad_left:-self.pad_right]
+        else:
+            # move the prebuilt transposed conv to the input device on first use
+            if self.conv_transpose1d_block[0].weight.device != x.device:
+                self.conv_transpose1d_block[0] = self.conv_transpose1d_block[0].to(x.device)
+            x = F.pad(x, (self.pad, self.pad), mode='replicate')
+            x = self.ratio * self.conv_transpose1d_block[0](x)
+            x = x[..., self.pad_left:-self.pad_right]
+        return x
+
+
+class DownSample1d(nn.Module):
+    def __init__(self, ratio=2, kernel_size=None, C=None):
+        super().__init__()
+        self.ratio = ratio
+        self.kernel_size = int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size
+        self.lowpass = LowPassFilter1d(cutoff=0.5 / ratio,
+                                       half_width=0.6 / ratio,
+                                       stride=ratio,
+                                       kernel_size=self.kernel_size,
+                                       C=C)
+
+    def forward(self, x):
+        xx = self.lowpass(x)
+
+        return xx
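A minimal sketch of the resampling pair above, with ratio 2 and the default kernel size. Passing C prebuilds grouped (transposed) convolutions so the filter is not re-expanded on every call; since they live in plain lists, they stay out of the module's state_dict:

import torch

from vdecoder.hifiganwithsnake.alias.resample import DownSample1d, UpSample1d

x = torch.randn(1, 8, 100)
up = UpSample1d(ratio=2, C=8)       # C prebuilds the grouped transposed conv
down = DownSample1d(ratio=2, C=8)
y = up(x)                           # (1, 8, 200)
z = down(y)                         # (1, 8, 100): back to the input length (not bit-exact)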
vdecoder/hifiganwithsnake/env.py ADDED
@@ -0,0 +1,15 @@
+import os
+import shutil
+
+
+class AttrDict(dict):
+    def __init__(self, *args, **kwargs):
+        super(AttrDict, self).__init__(*args, **kwargs)
+        self.__dict__ = self
+
+
+def build_env(config, config_name, path):
+    t_path = os.path.join(path, config_name)
+    if config != t_path:
+        os.makedirs(path, exist_ok=True)
+        shutil.copyfile(config, os.path.join(path, config_name))
vdecoder/hifiganwithsnake/models.py ADDED
@@ -0,0 +1,576 @@
+import json
+import os
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.nn import AvgPool1d, Conv1d, Conv2d, ConvTranspose1d
+from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm
+
+from vdecoder.hifiganwithsnake.alias.act import SnakeAlias
+
+from .env import AttrDict
+from .utils import get_padding, init_weights
+
+LRELU_SLOPE = 0.1
+
+
+def load_model(model_path, device='cuda'):
+    config_file = os.path.join(os.path.split(model_path)[0], 'config.json')
+    with open(config_file) as f:
+        data = f.read()
+
+    global h
+    json_config = json.loads(data)
+    h = AttrDict(json_config)
+
+    generator = Generator(h).to(device)
+
+    cp_dict = torch.load(model_path)
+    generator.load_state_dict(cp_dict['generator'])
+    generator.eval()
+    generator.remove_weight_norm()
+    del cp_dict
+    return generator, h
+
+
+class ResBlock1(torch.nn.Module):
+    def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5), C=None):
+        super(ResBlock1, self).__init__()
+        self.h = h
+        self.convs1 = nn.ModuleList([
+            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
+                               padding=get_padding(kernel_size, dilation[0]))),
+            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
+                               padding=get_padding(kernel_size, dilation[1]))),
+            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2],
+                               padding=get_padding(kernel_size, dilation[2])))
+        ])
+        self.convs1.apply(init_weights)
+
+        self.convs2 = nn.ModuleList([
+            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
+                               padding=get_padding(kernel_size, 1))),
+            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
+                               padding=get_padding(kernel_size, 1))),
+            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
+                               padding=get_padding(kernel_size, 1)))
+        ])
+        self.convs2.apply(init_weights)
+
+        self.num_layers = len(self.convs1) + len(self.convs2)
+        self.activations = nn.ModuleList([
+            SnakeAlias(channels, C=C) for _ in range(self.num_layers)
+        ])
+
+    def forward(self, x, DIM=None):
+        acts1, acts2 = self.activations[::2], self.activations[1::2]
+        for c1, c2, a1, a2 in zip(self.convs1, self.convs2, acts1, acts2):
+            xt = a1(x, DIM)
+            xt = c1(xt)
+            xt = a2(xt, DIM)
+            xt = c2(xt)
+            x = xt + x
+        return x
+
+    def remove_weight_norm(self):
+        for l in self.convs1:
+            remove_weight_norm(l)
+        for l in self.convs2:
+            remove_weight_norm(l)
+
+
+class ResBlock2(torch.nn.Module):
+    def __init__(self, h, channels, kernel_size=3, dilation=(1, 3), C=None):
+        super(ResBlock2, self).__init__()
+        self.h = h
+        self.convs = nn.ModuleList([
+            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
+                               padding=get_padding(kernel_size, dilation[0]))),
+            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
+                               padding=get_padding(kernel_size, dilation[1])))
+        ])
+        self.convs.apply(init_weights)
+
+        self.num_layers = len(self.convs)
+        self.activations = nn.ModuleList([
+            SnakeAlias(channels, C=C) for _ in range(self.num_layers)
+        ])
+
+    def forward(self, x, DIM=None):
+        for c, a in zip(self.convs, self.activations):
+            xt = a(x, DIM)
+            xt = c(xt)
+            x = xt + x
+        return x
+
+    def remove_weight_norm(self):
+        for l in self.convs:
+            remove_weight_norm(l)
+
+
+def padDiff(x):
+    return F.pad(F.pad(x, (0, 0, -1, 1), 'constant', 0) - x, (0, 0, 0, -1), 'constant', 0)
+
+
+class SineGen(torch.nn.Module):
+    """ Definition of sine generator
+    SineGen(samp_rate, harmonic_num = 0,
+            sine_amp = 0.1, noise_std = 0.003,
+            voiced_threshold = 0,
+            flag_for_pulse=False)
+    samp_rate: sampling rate in Hz
+    harmonic_num: number of harmonic overtones (default 0)
+    sine_amp: amplitude of sine waveform (default 0.1)
+    noise_std: std of Gaussian noise (default 0.003)
+    voiced_threshold: F0 threshold for U/V classification (default 0)
+    flag_for_pulse: this SineGen is used inside PulseGen (default False)
+    Note: when flag_for_pulse is True, the first time step of a voiced
+        segment is always sin(np.pi) or cos(0)
+    """
+
+    def __init__(self, samp_rate, harmonic_num=0,
+                 sine_amp=0.1, noise_std=0.003,
+                 voiced_threshold=0,
+                 flag_for_pulse=False):
+        super(SineGen, self).__init__()
+        self.sine_amp = sine_amp
+        self.noise_std = noise_std
+        self.harmonic_num = harmonic_num
+        self.dim = self.harmonic_num + 1
+        self.sampling_rate = samp_rate
+        self.voiced_threshold = voiced_threshold
+        self.flag_for_pulse = flag_for_pulse
+        self.onnx = False
+
+    def _f02uv(self, f0):
+        # generate uv signal
+        uv = (f0 > self.voiced_threshold).type(torch.float32)
+        return uv
+
+    def _f02sine(self, f0_values):
+        """ f0_values: (batchsize, length, dim)
+            where dim indicates fundamental tone and overtones
+        """
+        # convert to F0 in rad. The integer part n can be ignored
+        # because 2 * np.pi * n doesn't affect phase
+        rad_values = (f0_values / self.sampling_rate) % 1
+
+        # initial phase noise (no noise for fundamental component)
+        rand_ini = torch.rand(f0_values.shape[0], f0_values.shape[2],
+                              device=f0_values.device)
+        rand_ini[:, 0] = 0
+        rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
+
+        # instantaneous phase sine[t] = sin(2*pi \sum_i=1 ^{t} rad)
+        if not self.flag_for_pulse:
+            # for normal case
+
+            # To prevent torch.cumsum numerical overflow,
+            # it is necessary to add -1 whenever \sum_k=1^n rad_value_k > 1.
+            # Buffer tmp_over_one_idx indicates the time step to add -1.
+            # This will not change F0 of sine because (x-1) * 2*pi = x * 2*pi
+            tmp_over_one = torch.cumsum(rad_values, 1) % 1
+            tmp_over_one_idx = (padDiff(tmp_over_one)) < 0
+            cumsum_shift = torch.zeros_like(rad_values)
+            cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
+
+            sines = torch.sin(torch.cumsum(rad_values + cumsum_shift, dim=1)
+                              * 2 * np.pi)
+        else:
+            # If necessary, make sure that the first time step of every
+            # voiced segment is sin(pi) or cos(0)
+            # This is used for pulse-train generation
+
+            # identify the last time step in unvoiced segments
+            uv = self._f02uv(f0_values)
+            uv_1 = torch.roll(uv, shifts=-1, dims=1)
+            uv_1[:, -1, :] = 1
+            u_loc = (uv < 1) * (uv_1 > 0)
+
+            # get the instantaneous phase
+            tmp_cumsum = torch.cumsum(rad_values, dim=1)
+            # different batches need to be processed differently
+            for idx in range(f0_values.shape[0]):
+                temp_sum = tmp_cumsum[idx, u_loc[idx, :, 0], :]
+                temp_sum[1:, :] = temp_sum[1:, :] - temp_sum[0:-1, :]
+                # stores the accumulation of i.phase within
+                # each voiced segment
+                tmp_cumsum[idx, :, :] = 0
+                tmp_cumsum[idx, u_loc[idx, :, 0], :] = temp_sum
+
+            # rad_values - tmp_cumsum: remove the accumulation of i.phase
+            # within the previous voiced segment.
+            i_phase = torch.cumsum(rad_values - tmp_cumsum, dim=1)
+
+            # get the sines
+            sines = torch.cos(i_phase * 2 * np.pi)
+        return sines
+
+    def forward(self, f0, upp=None):
+        """ sine_tensor, uv = forward(f0)
+        input F0: tensor(batchsize=1, length, dim=1)
+            f0 for unvoiced steps should be 0
+        output sine_tensor: tensor(batchsize=1, length, dim)
+        output uv: tensor(batchsize=1, length, 1)
+        """
+
+        if self.onnx:
+            with torch.no_grad():
+                f0 = f0[:, None].transpose(1, 2)
+                f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device)
+                # fundamental component
+                f0_buf[:, :, 0] = f0[:, :, 0]
+                for idx in np.arange(self.harmonic_num):
+                    f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (
+                        idx + 2
+                    )  # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic
+                rad_values = (f0_buf / self.sampling_rate) % 1  # the % 1 means the n_har products cannot be optimized away afterwards
+                rand_ini = torch.rand(
+                    f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device
+                )
+                rand_ini[:, 0] = 0
+                rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
+                tmp_over_one = torch.cumsum(rad_values, 1)  # % 1  # a % 1 here would prevent further optimization of the cumsum below
+                tmp_over_one *= upp
+                tmp_over_one = F.interpolate(
+                    tmp_over_one.transpose(2, 1),
+                    scale_factor=upp,
+                    mode="linear",
+                    align_corners=True,
+                ).transpose(2, 1)
+                rad_values = F.interpolate(
+                    rad_values.transpose(2, 1), scale_factor=upp, mode="nearest"
+                ).transpose(
+                    2, 1
+                )
+                tmp_over_one %= 1
+                tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0
+                cumsum_shift = torch.zeros_like(rad_values)
+                cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
+                sine_waves = torch.sin(
+                    torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi
+                )
+                sine_waves = sine_waves * self.sine_amp
+                uv = self._f02uv(f0)
+                uv = F.interpolate(
+                    uv.transpose(2, 1), scale_factor=upp, mode="nearest"
+                ).transpose(2, 1)
+                noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
+                noise = noise_amp * torch.randn_like(sine_waves)
+                sine_waves = sine_waves * uv + noise
+                return sine_waves, uv, noise
+        else:
+            with torch.no_grad():
+                # fundamental component
+                fn = torch.multiply(f0, torch.FloatTensor([[range(1, self.harmonic_num + 2)]]).to(f0.device))
+
+                # generate sine waveforms
+                sine_waves = self._f02sine(fn) * self.sine_amp
+
+                # generate uv signal
+                # uv = torch.ones(f0.shape)
+                # uv = uv * (f0 > self.voiced_threshold)
+                uv = self._f02uv(f0)
+
+                # noise: for unvoiced should be similar to sine_amp
+                #        std = self.sine_amp/3 -> max value ~ self.sine_amp
+                #        for voiced regions is self.noise_std
+                noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
+                noise = noise_amp * torch.randn_like(sine_waves)
+
+                # first: set the unvoiced part to 0 by uv
+                # then: additive noise
+                sine_waves = sine_waves * uv + noise
+                return sine_waves, uv, noise
+
+
+class SourceModuleHnNSF(torch.nn.Module):
+    """ SourceModule for hn-nsf
+    SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
+                 add_noise_std=0.003, voiced_threshod=0)
+    sampling_rate: sampling_rate in Hz
+    harmonic_num: number of harmonics above F0 (default: 0)
+    sine_amp: amplitude of sine source signal (default: 0.1)
+    add_noise_std: std of additive Gaussian noise (default: 0.003)
+        note that amplitude of noise in unvoiced is decided
+        by sine_amp
+    voiced_threshold: threshold to set U/V given F0 (default: 0)
+    Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
+    F0_sampled (batchsize, length, 1)
+    Sine_source (batchsize, length, 1)
+    noise_source (batchsize, length 1)
+    uv (batchsize, length, 1)
+    """
+
+    def __init__(self, sampling_rate, harmonic_num=0, sine_amp=0.1,
+                 add_noise_std=0.003, voiced_threshod=0):
+        super(SourceModuleHnNSF, self).__init__()
+
+        self.sine_amp = sine_amp
+        self.noise_std = add_noise_std
+
+        # to produce sine waveforms
+        self.l_sin_gen = SineGen(sampling_rate, harmonic_num,
+                                 sine_amp, add_noise_std, voiced_threshod)
+
+        # to merge source harmonics into a single excitation
+        self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
+        self.l_tanh = torch.nn.Tanh()
+
+    def forward(self, x, upp=None):
+        """
+        Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
+        F0_sampled (batchsize, length, 1)
+        Sine_source (batchsize, length, 1)
+        noise_source (batchsize, length 1)
+        """
+        # source for harmonic branch
+        sine_wavs, uv, _ = self.l_sin_gen(x, upp)
+        sine_merge = self.l_tanh(self.l_linear(sine_wavs.to(self.l_linear.weight.dtype)))
+
+        # source for noise branch, in the same shape as uv
+        noise = torch.randn_like(uv) * self.sine_amp / 3
+        return sine_merge, noise, uv
+
+
+class Generator(torch.nn.Module):
+    def __init__(self, h):
+        super(Generator, self).__init__()
+        self.h = h
+
+        self.num_kernels = len(h["resblock_kernel_sizes"])
+        self.num_upsamples = len(h["upsample_rates"])
+        self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(h["upsample_rates"]))
345
+ self.m_source = SourceModuleHnNSF(
346
+ sampling_rate=h["sampling_rate"],
347
+ harmonic_num=8)
348
+ self.noise_convs = nn.ModuleList()
349
+ self.conv_pre = weight_norm(Conv1d(h["inter_channels"], h["upsample_initial_channel"], 7, 1, padding=3))
350
+ resblock = ResBlock1 if h["resblock"] == '1' else ResBlock2
351
+ self.ups = nn.ModuleList()
352
+ for i, (u, k) in enumerate(zip(h["upsample_rates"], h["upsample_kernel_sizes"])):
353
+ c_cur = h["upsample_initial_channel"] // (2 ** (i + 1))
354
+ self.ups.append(weight_norm(
355
+ ConvTranspose1d(h["upsample_initial_channel"] // (2 ** i), h["upsample_initial_channel"] // (2 ** (i + 1)),
356
+ k, u, padding=(k - u + 1) // 2)))
357
+ if i + 1 < len(h["upsample_rates"]): #
358
+ stride_f0 = np.prod(h["upsample_rates"][i + 1:])
359
+ self.noise_convs.append(Conv1d(
360
+ 1, c_cur, kernel_size=stride_f0 * 2, stride=stride_f0, padding=(stride_f0+ 1) // 2))
361
+ else:
362
+ self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1))
363
+ self.resblocks = nn.ModuleList()
364
+ self.snakes = nn.ModuleList()
365
+ for i in range(len(self.ups)):
366
+ ch = h["upsample_initial_channel"] // (2 ** (i + 1))
367
+ self.snakes.append(SnakeAlias(h["upsample_initial_channel"] // (2 ** (i)), C = h["upsample_initial_channel"] >> i))
368
+ for j, (k, d) in enumerate(zip(h["resblock_kernel_sizes"], h["resblock_dilation_sizes"])):
369
+ self.resblocks.append(resblock(h, ch, k, d, C = h["upsample_initial_channel"] >> (i + 1)))
370
+
371
+ self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3))
372
+ self.ups.apply(init_weights)
373
+ self.conv_post.apply(init_weights)
374
+ self.snake_post = SnakeAlias(ch, C = h["upsample_initial_channel"] >> len(self.ups))
375
+ self.cond = nn.Conv1d(h['gin_channels'], h['upsample_initial_channel'], 1)
376
+ self.upp = np.prod(h["upsample_rates"])
377
+ self.onnx = False
378
+
379
+ def OnnxExport(self):
380
+ self.onnx = True
381
+ self.m_source.l_sin_gen.onnx = True
382
+
383
+ def forward(self, x, f0, g=None):
384
+ # print(1,x.shape,f0.shape,f0[:, None].shape)
385
+ if not self.onnx:
386
+ f0 = self.f0_upsamp(f0[:, None]).transpose(1, 2) # bs,n,t
387
+ # print(2,f0.shape)
388
+ har_source, noi_source, uv = self.m_source(f0, self.upp)
389
+ har_source = har_source.transpose(1, 2)
390
+ x = self.conv_pre(x)
391
+ x = x + self.cond(g)
392
+ # print(124,x.shape,har_source.shape)
393
+ for i in range(self.num_upsamples):
394
+ # print(f"self.snakes.{i}.pre:", x.shape)
395
+ x = self.snakes[i](x)
396
+ # print(f"self.snakes.{i}.after:", x.shape)
397
+ x = self.ups[i](x)
398
+ x_source = self.noise_convs[i](har_source)
399
+ # print(4,x_source.shape,har_source.shape,x.shape)
400
+ x = x + x_source
401
+ xs = None
402
+ for j in range(self.num_kernels):
403
+ if xs is None:
404
+ xs = self.resblocks[i * self.num_kernels + j](x)
405
+ else:
406
+ xs += self.resblocks[i * self.num_kernels + j](x)
407
+ # print(f"self.resblocks.{i}.after:", xs.shape)
408
+ x = xs / self.num_kernels
409
+ x = self.snake_post(x)
410
+ x = self.conv_post(x)
411
+ x = torch.tanh(x)
412
+
413
+ return x
414
+
415
+ def remove_weight_norm(self):
416
+ print('Removing weight norm...')
417
+ for l in self.ups:
418
+ remove_weight_norm(l)
419
+ for l in self.resblocks:
420
+ l.remove_weight_norm()
421
+ remove_weight_norm(self.conv_pre)
422
+ remove_weight_norm(self.conv_post)
423
+
424
+
425
+ class DiscriminatorP(torch.nn.Module):
426
+ def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
427
+ super(DiscriminatorP, self).__init__()
428
+ self.period = period
429
+ norm_f = weight_norm if use_spectral_norm is False else spectral_norm
430
+ self.convs = nn.ModuleList([
431
+ norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
432
+ norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
433
+ norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
434
+ norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
435
+ norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(2, 0))),
436
+ ])
437
+ self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
438
+
439
+ def forward(self, x):
440
+ fmap = []
441
+
442
+ # 1d to 2d
443
+ b, c, t = x.shape
444
+ if t % self.period != 0: # pad first
445
+ n_pad = self.period - (t % self.period)
446
+ x = F.pad(x, (0, n_pad), "reflect")
447
+ t = t + n_pad
448
+ x = x.view(b, c, t // self.period, self.period)
449
+
450
+ for l in self.convs:
451
+ x = l(x)
452
+ x = F.leaky_relu(x, LRELU_SLOPE)
453
+ fmap.append(x)
454
+ x = self.conv_post(x)
455
+ fmap.append(x)
456
+ x = torch.flatten(x, 1, -1)
457
+
458
+ return x, fmap
459
+
460
+
461
+ class MultiPeriodDiscriminator(torch.nn.Module):
462
+ def __init__(self, periods=None):
463
+ super(MultiPeriodDiscriminator, self).__init__()
464
+ self.periods = periods if periods is not None else [2, 3, 5, 7, 11]
465
+ self.discriminators = nn.ModuleList()
466
+ for period in self.periods:
467
+ self.discriminators.append(DiscriminatorP(period))
468
+
469
+ def forward(self, y, y_hat):
470
+ y_d_rs = []
471
+ y_d_gs = []
472
+ fmap_rs = []
473
+ fmap_gs = []
474
+ for i, d in enumerate(self.discriminators):
475
+ y_d_r, fmap_r = d(y)
476
+ y_d_g, fmap_g = d(y_hat)
477
+ y_d_rs.append(y_d_r)
478
+ fmap_rs.append(fmap_r)
479
+ y_d_gs.append(y_d_g)
480
+ fmap_gs.append(fmap_g)
481
+
482
+ return y_d_rs, y_d_gs, fmap_rs, fmap_gs
483
+
484
+
485
+ class DiscriminatorS(torch.nn.Module):
486
+ def __init__(self, use_spectral_norm=False):
487
+ super(DiscriminatorS, self).__init__()
488
+ norm_f = weight_norm if use_spectral_norm is False else spectral_norm
489
+ self.convs = nn.ModuleList([
490
+ norm_f(Conv1d(1, 128, 15, 1, padding=7)),
491
+ norm_f(Conv1d(128, 128, 41, 2, groups=4, padding=20)),
492
+ norm_f(Conv1d(128, 256, 41, 2, groups=16, padding=20)),
493
+ norm_f(Conv1d(256, 512, 41, 4, groups=16, padding=20)),
494
+ norm_f(Conv1d(512, 1024, 41, 4, groups=16, padding=20)),
495
+ norm_f(Conv1d(1024, 1024, 41, 1, groups=16, padding=20)),
496
+ norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
497
+ ])
498
+ self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
499
+
500
+ def forward(self, x):
501
+ fmap = []
502
+ for l in self.convs:
503
+ x = l(x)
504
+ x = F.leaky_relu(x, LRELU_SLOPE)
505
+ fmap.append(x)
506
+ x = self.conv_post(x)
507
+ fmap.append(x)
508
+ x = torch.flatten(x, 1, -1)
509
+
510
+ return x, fmap
511
+
512
+
513
+ class MultiScaleDiscriminator(torch.nn.Module):
514
+ def __init__(self):
515
+ super(MultiScaleDiscriminator, self).__init__()
516
+ self.discriminators = nn.ModuleList([
517
+ DiscriminatorS(use_spectral_norm=True),
518
+ DiscriminatorS(),
519
+ DiscriminatorS(),
520
+ ])
521
+ self.meanpools = nn.ModuleList([
522
+ AvgPool1d(4, 2, padding=2),
523
+ AvgPool1d(4, 2, padding=2)
524
+ ])
525
+
526
+ def forward(self, y, y_hat):
527
+ y_d_rs = []
528
+ y_d_gs = []
529
+ fmap_rs = []
530
+ fmap_gs = []
531
+ for i, d in enumerate(self.discriminators):
532
+ if i != 0:
533
+ y = self.meanpools[i - 1](y)
534
+ y_hat = self.meanpools[i - 1](y_hat)
535
+ y_d_r, fmap_r = d(y)
536
+ y_d_g, fmap_g = d(y_hat)
537
+ y_d_rs.append(y_d_r)
538
+ fmap_rs.append(fmap_r)
539
+ y_d_gs.append(y_d_g)
540
+ fmap_gs.append(fmap_g)
541
+
542
+ return y_d_rs, y_d_gs, fmap_rs, fmap_gs
543
+
544
+
545
+ def feature_loss(fmap_r, fmap_g):
546
+ loss = 0
547
+ for dr, dg in zip(fmap_r, fmap_g):
548
+ for rl, gl in zip(dr, dg):
549
+ loss += torch.mean(torch.abs(rl - gl))
550
+
551
+ return loss * 2
552
+
553
+
554
+ def discriminator_loss(disc_real_outputs, disc_generated_outputs):
555
+ loss = 0
556
+ r_losses = []
557
+ g_losses = []
558
+ for dr, dg in zip(disc_real_outputs, disc_generated_outputs):
559
+ r_loss = torch.mean((1 - dr) ** 2)
560
+ g_loss = torch.mean(dg ** 2)
561
+ loss += (r_loss + g_loss)
562
+ r_losses.append(r_loss.item())
563
+ g_losses.append(g_loss.item())
564
+
565
+ return loss, r_losses, g_losses
566
+
567
+
568
+ def generator_loss(disc_outputs):
569
+ loss = 0
570
+ gen_losses = []
571
+ for dg in disc_outputs:
572
+ l = torch.mean((1 - dg) ** 2)
573
+ gen_losses.append(l)
574
+ loss += l
575
+
576
+ return loss, gen_losses
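
A minimal inference sketch for the snake-activation Generator above. The config values, feature shapes, and speaker embedding are assumptions chosen to match what __init__ and forward() read; SnakeAlias and the ResBlocks are defined earlier in this file.

    import torch
    h = {
        "resblock": "1",
        "resblock_kernel_sizes": [3, 7, 11],
        "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
        "upsample_rates": [8, 8, 2, 2],
        "upsample_kernel_sizes": [16, 16, 4, 4],
        "upsample_initial_channel": 512,
        "inter_channels": 192,   # assumed channel count of the content features
        "gin_channels": 256,     # assumed speaker-embedding size
        "sampling_rate": 44100,
    }
    gen = Generator(h).eval()
    x = torch.randn(1, 192, 100)       # frame-level content features (placeholder)
    f0 = torch.full((1, 100), 220.0)   # per-frame F0 in Hz (0 = unvoiced)
    g = torch.randn(1, 256, 1)         # speaker embedding consumed by self.cond
    with torch.no_grad():
        wav = gen(x, f0, g)            # (1, 1, 100 * prod(upsample_rates)) samples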
vdecoder/hifiganwithsnake/nvSTFT.py ADDED
@@ -0,0 +1,109 @@
+ import os
+
+ import librosa
+ import numpy as np
+ import soundfile as sf
+ import torch
+ import torch.utils.data
+ from librosa.filters import mel as librosa_mel_fn
+
+ os.environ["LRU_CACHE_CAPACITY"] = "3"
+
+ def load_wav_to_torch(full_path, target_sr=None, return_empty_on_exception=False):
+     sampling_rate = None
+     try:
+         data, sampling_rate = sf.read(full_path, always_2d=True)
+     except Exception as ex:
+         print(f"'{full_path}' failed to load.\nException:")
+         print(ex)
+         if return_empty_on_exception:
+             return [], sampling_rate or target_sr or 32000
+         else:
+             raise Exception(ex)
+
+     if len(data.shape) > 1:
+         data = data[:, 0]
+     assert len(data) > 2  # check duration is > 2 samples (otherwise the slice above was on the wrong dimension)
+
+     if np.issubdtype(data.dtype, np.integer):  # if audio data is type int
+         max_mag = -np.iinfo(data.dtype).min  # maximum magnitude = min possible value of intXX
+     else:  # if audio data is type fp32
+         max_mag = max(np.amax(data), -np.amin(data))
+         max_mag = (2**31) + 1 if max_mag > (2**15) else ((2**15) + 1 if max_mag > 1.01 else 1.0)  # data should be 16-bit INT, 32-bit INT or [-1, 1] float32
+
+     data = torch.FloatTensor(data.astype(np.float32)) / max_mag
+
+     if (torch.isinf(data) | torch.isnan(data)).any() and return_empty_on_exception:  # resample crashes on inf/NaN; return an empty array instead of raising
+         return [], sampling_rate or target_sr or 32000
+     if target_sr is not None and sampling_rate != target_sr:
+         data = torch.from_numpy(librosa.core.resample(data.numpy(), orig_sr=sampling_rate, target_sr=target_sr))
+         sampling_rate = target_sr
+
+     return data, sampling_rate
+
+ def dynamic_range_compression(x, C=1, clip_val=1e-5):
+     return np.log(np.clip(x, a_min=clip_val, a_max=None) * C)
+
+ def dynamic_range_decompression(x, C=1):
+     return np.exp(x) / C
+
+ def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
+     return torch.log(torch.clamp(x, min=clip_val) * C)
+
+ def dynamic_range_decompression_torch(x, C=1):
+     return torch.exp(x) / C
+
+ class STFT():
+     def __init__(self, sr=22050, n_mels=80, n_fft=1024, win_size=1024, hop_length=256, fmin=20, fmax=11025, clip_val=1e-5):
+         self.target_sr = sr
+
+         self.n_mels = n_mels
+         self.n_fft = n_fft
+         self.win_size = win_size
+         self.hop_length = hop_length
+         self.fmin = fmin
+         self.fmax = fmax
+         self.clip_val = clip_val
+         self.mel_basis = {}
+         self.hann_window = {}
+
+     def get_mel(self, y, center=False):
+         sampling_rate = self.target_sr
+         n_mels = self.n_mels
+         n_fft = self.n_fft
+         win_size = self.win_size
+         hop_length = self.hop_length
+         fmin = self.fmin
+         fmax = self.fmax
+         clip_val = self.clip_val
+
+         if torch.min(y) < -1.:
+             print('min value is ', torch.min(y))
+         if torch.max(y) > 1.:
+             print('max value is ', torch.max(y))
+
+         # cache per (fmax, device); the original checked the raw fmax against
+         # string keys and therefore always missed (same fix as nsf_hifigan/nvSTFT.py)
+         mel_basis_key = str(fmax) + '_' + str(y.device)
+         if mel_basis_key not in self.mel_basis:
+             mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax)
+             self.mel_basis[mel_basis_key] = torch.from_numpy(mel).float().to(y.device)
+             self.hann_window[str(y.device)] = torch.hann_window(self.win_size).to(y.device)
+
+         y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft - hop_length) / 2), int((n_fft - hop_length) / 2)), mode='reflect')
+         y = y.squeeze(1)
+
+         # return_complex made explicit for newer torch, matching nsf_hifigan/nvSTFT.py
+         spec = torch.stft(y, n_fft, hop_length=hop_length, win_length=win_size, window=self.hann_window[str(y.device)],
+                           center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False)
+         spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9))
+         spec = torch.matmul(self.mel_basis[mel_basis_key], spec)
+         spec = dynamic_range_compression_torch(spec, clip_val=clip_val)
+         return spec
+
+     def __call__(self, audiopath):
+         audio, sr = load_wav_to_torch(audiopath, target_sr=self.target_sr)
+         spect = self.get_mel(audio.unsqueeze(0)).squeeze(0)
+         return spect
+
+ stft = STFT()
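
A small usage sketch for the STFT helper above; the 44.1 kHz parameters are assumptions. get_mel takes a (batch, samples) waveform in [-1, 1] and returns a (batch, n_mels, frames) log-mel spectrogram:

    import torch
    stft_44k = STFT(sr=44100, n_mels=80, n_fft=2048, win_size=2048,
                    hop_length=512, fmin=40, fmax=16000)
    wav = torch.randn(1, 44100)    # one second of audio (placeholder data)
    mel = stft_44k.get_mel(wav)    # log-mel spectrogram, (1, 80, frames)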
vdecoder/hifiganwithsnake/utils.py ADDED
@@ -0,0 +1,68 @@
+ import glob
+ import os
+
+ # matplotlib.use("Agg")
+ import matplotlib.pylab as plt
+ import torch
+ from torch.nn.utils import weight_norm
+
+
+ def plot_spectrogram(spectrogram):
+     fig, ax = plt.subplots(figsize=(10, 2))
+     im = ax.imshow(spectrogram, aspect="auto", origin="lower",
+                    interpolation='none')
+     plt.colorbar(im, ax=ax)
+
+     fig.canvas.draw()
+     plt.close()
+
+     return fig
+
+
+ def init_weights(m, mean=0.0, std=0.01):
+     classname = m.__class__.__name__
+     if classname.find("Conv") != -1:
+         m.weight.data.normal_(mean, std)
+
+
+ def apply_weight_norm(m):
+     classname = m.__class__.__name__
+     if classname.find("Conv") != -1:
+         weight_norm(m)
+
+
+ def get_padding(kernel_size, dilation=1):
+     return int((kernel_size * dilation - dilation) / 2)
+
+
+ def load_checkpoint(filepath, device):
+     assert os.path.isfile(filepath)
+     print("Loading '{}'".format(filepath))
+     checkpoint_dict = torch.load(filepath, map_location=device)
+     print("Complete.")
+     return checkpoint_dict
+
+
+ def save_checkpoint(filepath, obj):
+     print("Saving checkpoint to {}".format(filepath))
+     torch.save(obj, filepath)
+     print("Complete.")
+
+
+ def del_old_checkpoints(cp_dir, prefix, n_models=2):
+     pattern = os.path.join(cp_dir, prefix + '????????')
+     cp_list = glob.glob(pattern)  # get checkpoint paths
+     cp_list = sorted(cp_list)  # sort by iteration
+     if len(cp_list) > n_models:  # if more than n_models checkpoints are found
+         for cp in cp_list[:-n_models]:  # delete all but the latest n_models
+             open(cp, 'w').close()  # empty the file contents
+             os.unlink(cp)  # delete the file (moves to trash when using Colab)
+
+
+ def scan_checkpoint(cp_dir, prefix):
+     pattern = os.path.join(cp_dir, prefix + '????????')
+     cp_list = glob.glob(pattern)
+     if len(cp_list) == 0:
+         return None
+     return sorted(cp_list)[-1]
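
A hedged example of the checkpoint helpers above (this module and the identical nsf_hifigan copy): the eight-character glob suggests zero-padded step numbers, so a path like logs/44k/g_00120000 is an assumption.

    ckpt_path = scan_checkpoint("logs/44k", "g_")   # newest g_???????? file, or None
    if ckpt_path is not None:
        state = load_checkpoint(ckpt_path, "cpu")   # dict previously written by save_checkpoint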
vdecoder/nsf_hifigan/__pycache__/env.cpython-38.pyc ADDED
Binary file (823 Bytes).
 
vdecoder/nsf_hifigan/__pycache__/models.cpython-38.pyc ADDED
Binary file (14.3 kB).
 
vdecoder/nsf_hifigan/__pycache__/nvSTFT.cpython-38.pyc ADDED
Binary file (4.36 kB).
 
vdecoder/nsf_hifigan/__pycache__/utils.cpython-38.pyc ADDED
Binary file (2.37 kB).
 
vdecoder/nsf_hifigan/env.py ADDED
@@ -0,0 +1,15 @@
+ import os
+ import shutil
+
+
+ class AttrDict(dict):
+     def __init__(self, *args, **kwargs):
+         super(AttrDict, self).__init__(*args, **kwargs)
+         self.__dict__ = self
+
+
+ def build_env(config, config_name, path):
+     t_path = os.path.join(path, config_name)
+     if config != t_path:
+         os.makedirs(path, exist_ok=True)
+         shutil.copyfile(config, os.path.join(path, config_name))
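
AttrDict aliases the instance dict, so config keys double as attributes (values here are placeholders):

    h = AttrDict({"sampling_rate": 44100, "num_mels": 128})
    assert h.sampling_rate == h["sampling_rate"] == 44100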
vdecoder/nsf_hifigan/models.py ADDED
@@ -0,0 +1,441 @@
+ import json
+ import os
+
+ import numpy as np
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from torch.nn import AvgPool1d, Conv1d, Conv2d, ConvTranspose1d
+ from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm
+
+ from .env import AttrDict
+ from .utils import get_padding, init_weights
+
+ LRELU_SLOPE = 0.1
+
+
+ def load_model(model_path, device='cuda'):
+     h = load_config(model_path)
+
+     generator = Generator(h).to(device)
+
+     cp_dict = torch.load(model_path, map_location=device)
+     generator.load_state_dict(cp_dict['generator'])
+     generator.eval()
+     generator.remove_weight_norm()
+     del cp_dict
+     return generator, h
+
+ def load_config(model_path):
+     config_file = os.path.join(os.path.split(model_path)[0], 'config.json')
+     with open(config_file) as f:
+         data = f.read()
+
+     json_config = json.loads(data)
+     h = AttrDict(json_config)
+     return h
+
+
+ class ResBlock1(torch.nn.Module):
+     def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)):
+         super(ResBlock1, self).__init__()
+         self.h = h
+         self.convs1 = nn.ModuleList([
+             weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
+                                padding=get_padding(kernel_size, dilation[0]))),
+             weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
+                                padding=get_padding(kernel_size, dilation[1]))),
+             weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2],
+                                padding=get_padding(kernel_size, dilation[2])))
+         ])
+         self.convs1.apply(init_weights)
+
+         self.convs2 = nn.ModuleList([
+             weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
+                                padding=get_padding(kernel_size, 1))),
+             weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
+                                padding=get_padding(kernel_size, 1))),
+             weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
+                                padding=get_padding(kernel_size, 1)))
+         ])
+         self.convs2.apply(init_weights)
+
+     def forward(self, x):
+         for c1, c2 in zip(self.convs1, self.convs2):
+             xt = F.leaky_relu(x, LRELU_SLOPE)
+             xt = c1(xt)
+             xt = F.leaky_relu(xt, LRELU_SLOPE)
+             xt = c2(xt)
+             x = xt + x
+         return x
+
+     def remove_weight_norm(self):
+         for l in self.convs1:
+             remove_weight_norm(l)
+         for l in self.convs2:
+             remove_weight_norm(l)
+
+
+ class ResBlock2(torch.nn.Module):
+     def __init__(self, h, channels, kernel_size=3, dilation=(1, 3)):
+         super(ResBlock2, self).__init__()
+         self.h = h
+         self.convs = nn.ModuleList([
+             weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
+                                padding=get_padding(kernel_size, dilation[0]))),
+             weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
+                                padding=get_padding(kernel_size, dilation[1])))
+         ])
+         self.convs.apply(init_weights)
+
+     def forward(self, x):
+         for c in self.convs:
+             xt = F.leaky_relu(x, LRELU_SLOPE)
+             xt = c(xt)
+             x = xt + x
+         return x
+
+     def remove_weight_norm(self):
+         for l in self.convs:
+             remove_weight_norm(l)
+
+
+ class SineGen(torch.nn.Module):
+     """ Definition of sine generator
+     SineGen(samp_rate, harmonic_num = 0,
+             sine_amp = 0.1, noise_std = 0.003,
+             voiced_threshold = 0,
+             flag_for_pulse=False)
+     samp_rate: sampling rate in Hz
+     harmonic_num: number of harmonic overtones (default 0)
+     sine_amp: amplitude of sine waveform (default 0.1)
+     noise_std: std of Gaussian noise (default 0.003)
+     voiced_threshold: F0 threshold for U/V classification (default 0)
+     flag_for_pulse: this SineGen is used inside PulseGen (default False)
+     Note: when flag_for_pulse is True, the first time step of a voiced
+     segment is always sin(np.pi) or cos(0)
+     """
+
+     def __init__(self, samp_rate, harmonic_num=0,
+                  sine_amp=0.1, noise_std=0.003,
+                  voiced_threshold=0):
+         super(SineGen, self).__init__()
+         self.sine_amp = sine_amp
+         self.noise_std = noise_std
+         self.harmonic_num = harmonic_num
+         self.dim = self.harmonic_num + 1
+         self.sampling_rate = samp_rate
+         self.voiced_threshold = voiced_threshold
+
+     def _f02uv(self, f0):
+         # generate uv signal
+         uv = torch.ones_like(f0)
+         uv = uv * (f0 > self.voiced_threshold)
+         return uv
+
+     @torch.no_grad()
+     def forward(self, f0, upp):
+         """ sine_tensor, uv = forward(f0)
+         input F0: tensor(batchsize=1, length, dim=1)
+                   f0 for unvoiced steps should be 0
+         output sine_tensor: tensor(batchsize=1, length, dim)
+         output uv: tensor(batchsize=1, length, 1)
+         """
+         f0 = f0.unsqueeze(-1)
+         fn = torch.multiply(f0, torch.arange(1, self.dim + 1, device=f0.device).reshape((1, 1, -1)))
+         rad_values = (fn / self.sampling_rate) % 1  # the % 1 means the n_har products cannot be optimized afterwards
+         rand_ini = torch.rand(fn.shape[0], fn.shape[2], device=fn.device)
+         rand_ini[:, 0] = 0
+         rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
+         is_half = rad_values.dtype is not torch.float32
+         tmp_over_one = torch.cumsum(rad_values.double(), 1)  # no % 1 here, or the cumsum below could no longer be optimized
+         if is_half:
+             tmp_over_one = tmp_over_one.half()
+         else:
+             tmp_over_one = tmp_over_one.float()
+         tmp_over_one *= upp
+         tmp_over_one = F.interpolate(
+             tmp_over_one.transpose(2, 1), scale_factor=upp,
+             mode='linear', align_corners=True
+         ).transpose(2, 1)
+         rad_values = F.interpolate(rad_values.transpose(2, 1), scale_factor=upp, mode='nearest').transpose(2, 1)
+         tmp_over_one %= 1
+         tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0
+         cumsum_shift = torch.zeros_like(rad_values)
+         cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
+         rad_values = rad_values.double()
+         cumsum_shift = cumsum_shift.double()
+         sine_waves = torch.sin(torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi)
+         if is_half:
+             sine_waves = sine_waves.half()
+         else:
+             sine_waves = sine_waves.float()
+         sine_waves = sine_waves * self.sine_amp
+         uv = self._f02uv(f0)
+         uv = F.interpolate(uv.transpose(2, 1), scale_factor=upp, mode='nearest').transpose(2, 1)
+         noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
+         noise = noise_amp * torch.randn_like(sine_waves)
+         sine_waves = sine_waves * uv + noise
+         return sine_waves, uv, noise
+
+
+ class SourceModuleHnNSF(torch.nn.Module):
+     """ SourceModule for hn-nsf
+     SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
+                  add_noise_std=0.003, voiced_threshod=0)
+     sampling_rate: sampling rate in Hz
+     harmonic_num: number of harmonics above F0 (default: 0)
+     sine_amp: amplitude of sine source signal (default: 0.1)
+     add_noise_std: std of additive Gaussian noise (default: 0.003)
+         note that amplitude of noise in unvoiced is decided
+         by sine_amp
+     voiced_threshold: threshold to set U/V given F0 (default: 0)
+     Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
+     F0_sampled (batchsize, length, 1)
+     Sine_source (batchsize, length, 1)
+     noise_source (batchsize, length, 1)
+     uv (batchsize, length, 1)
+     """
+
+     def __init__(self, sampling_rate, harmonic_num=0, sine_amp=0.1,
+                  add_noise_std=0.003, voiced_threshod=0):
+         super(SourceModuleHnNSF, self).__init__()
+
+         self.sine_amp = sine_amp
+         self.noise_std = add_noise_std
+
+         # to produce sine waveforms
+         self.l_sin_gen = SineGen(sampling_rate, harmonic_num,
+                                  sine_amp, add_noise_std, voiced_threshod)
+
+         # to merge source harmonics into a single excitation
+         self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
+         self.l_tanh = torch.nn.Tanh()
+
+     def forward(self, x, upp):
+         sine_wavs, uv, _ = self.l_sin_gen(x, upp)
+         sine_merge = self.l_tanh(self.l_linear(sine_wavs))
+         return sine_merge
+
+
+ class Generator(torch.nn.Module):
+     def __init__(self, h):
+         super(Generator, self).__init__()
+         self.h = h
+         self.num_kernels = len(h.resblock_kernel_sizes)
+         self.num_upsamples = len(h.upsample_rates)
+         self.m_source = SourceModuleHnNSF(
+             sampling_rate=h.sampling_rate,
+             harmonic_num=8
+         )
+         self.noise_convs = nn.ModuleList()
+         self.conv_pre = weight_norm(Conv1d(h.num_mels, h.upsample_initial_channel, 7, 1, padding=3))
+         resblock = ResBlock1 if h.resblock == '1' else ResBlock2
+
+         self.ups = nn.ModuleList()
+         for i, (u, k) in enumerate(zip(h.upsample_rates, h.upsample_kernel_sizes)):
+             c_cur = h.upsample_initial_channel // (2 ** (i + 1))
+             self.ups.append(weight_norm(
+                 ConvTranspose1d(h.upsample_initial_channel // (2 ** i), h.upsample_initial_channel // (2 ** (i + 1)),
+                                 k, u, padding=(k - u) // 2)))
+             if i + 1 < len(h.upsample_rates):
+                 stride_f0 = int(np.prod(h.upsample_rates[i + 1:]))
+                 self.noise_convs.append(Conv1d(
+                     1, c_cur, kernel_size=stride_f0 * 2, stride=stride_f0, padding=stride_f0 // 2))
+             else:
+                 self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1))
+         self.resblocks = nn.ModuleList()
+         ch = h.upsample_initial_channel
+         for i in range(len(self.ups)):
+             ch //= 2
+             for j, (k, d) in enumerate(zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes)):
+                 self.resblocks.append(resblock(h, ch, k, d))
+
+         self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3))
+         self.ups.apply(init_weights)
+         self.conv_post.apply(init_weights)
+         self.upp = int(np.prod(h.upsample_rates))
+
+     def forward(self, x, f0):
+         har_source = self.m_source(f0, self.upp).transpose(1, 2)
+         x = self.conv_pre(x)
+         for i in range(self.num_upsamples):
+             x = F.leaky_relu(x, LRELU_SLOPE)
+             x = self.ups[i](x)
+             x_source = self.noise_convs[i](har_source)
+             x = x + x_source
+             xs = None
+             for j in range(self.num_kernels):
+                 if xs is None:
+                     xs = self.resblocks[i * self.num_kernels + j](x)
+                 else:
+                     xs += self.resblocks[i * self.num_kernels + j](x)
+             x = xs / self.num_kernels
+         x = F.leaky_relu(x)
+         x = self.conv_post(x)
+         x = torch.tanh(x)
+
+         return x
+
+     def remove_weight_norm(self):
+         print('Removing weight norm...')
+         for l in self.ups:
+             remove_weight_norm(l)
+         for l in self.resblocks:
+             l.remove_weight_norm()
+         remove_weight_norm(self.conv_pre)
+         remove_weight_norm(self.conv_post)
+
+
+ class DiscriminatorP(torch.nn.Module):
+     def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
+         super(DiscriminatorP, self).__init__()
+         self.period = period
+         norm_f = weight_norm if use_spectral_norm is False else spectral_norm
+         self.convs = nn.ModuleList([
+             norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
+             norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
+             norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
+             norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
+             norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(2, 0))),
+         ])
+         self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
+
+     def forward(self, x):
+         fmap = []
+
+         # 1d to 2d
+         b, c, t = x.shape
+         if t % self.period != 0:  # pad first
+             n_pad = self.period - (t % self.period)
+             x = F.pad(x, (0, n_pad), "reflect")
+             t = t + n_pad
+         x = x.view(b, c, t // self.period, self.period)
+
+         for l in self.convs:
+             x = l(x)
+             x = F.leaky_relu(x, LRELU_SLOPE)
+             fmap.append(x)
+         x = self.conv_post(x)
+         fmap.append(x)
+         x = torch.flatten(x, 1, -1)
+
+         return x, fmap
+
+
+ class MultiPeriodDiscriminator(torch.nn.Module):
+     def __init__(self, periods=None):
+         super(MultiPeriodDiscriminator, self).__init__()
+         self.periods = periods if periods is not None else [2, 3, 5, 7, 11]
+         self.discriminators = nn.ModuleList()
+         for period in self.periods:
+             self.discriminators.append(DiscriminatorP(period))
+
+     def forward(self, y, y_hat):
+         y_d_rs = []
+         y_d_gs = []
+         fmap_rs = []
+         fmap_gs = []
+         for i, d in enumerate(self.discriminators):
+             y_d_r, fmap_r = d(y)
+             y_d_g, fmap_g = d(y_hat)
+             y_d_rs.append(y_d_r)
+             fmap_rs.append(fmap_r)
+             y_d_gs.append(y_d_g)
+             fmap_gs.append(fmap_g)
+
+         return y_d_rs, y_d_gs, fmap_rs, fmap_gs
+
+
+ class DiscriminatorS(torch.nn.Module):
+     def __init__(self, use_spectral_norm=False):
+         super(DiscriminatorS, self).__init__()
+         norm_f = weight_norm if use_spectral_norm is False else spectral_norm
+         self.convs = nn.ModuleList([
+             norm_f(Conv1d(1, 128, 15, 1, padding=7)),
+             norm_f(Conv1d(128, 128, 41, 2, groups=4, padding=20)),
+             norm_f(Conv1d(128, 256, 41, 2, groups=16, padding=20)),
+             norm_f(Conv1d(256, 512, 41, 4, groups=16, padding=20)),
+             norm_f(Conv1d(512, 1024, 41, 4, groups=16, padding=20)),
+             norm_f(Conv1d(1024, 1024, 41, 1, groups=16, padding=20)),
+             norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
+         ])
+         self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
+
+     def forward(self, x):
+         fmap = []
+         for l in self.convs:
+             x = l(x)
+             x = F.leaky_relu(x, LRELU_SLOPE)
+             fmap.append(x)
+         x = self.conv_post(x)
+         fmap.append(x)
+         x = torch.flatten(x, 1, -1)
+
+         return x, fmap
+
+
+ class MultiScaleDiscriminator(torch.nn.Module):
+     def __init__(self):
+         super(MultiScaleDiscriminator, self).__init__()
+         self.discriminators = nn.ModuleList([
+             DiscriminatorS(use_spectral_norm=True),
+             DiscriminatorS(),
+             DiscriminatorS(),
+         ])
+         self.meanpools = nn.ModuleList([
+             AvgPool1d(4, 2, padding=2),
+             AvgPool1d(4, 2, padding=2)
+         ])
+
+     def forward(self, y, y_hat):
+         y_d_rs = []
+         y_d_gs = []
+         fmap_rs = []
+         fmap_gs = []
+         for i, d in enumerate(self.discriminators):
+             if i != 0:
+                 y = self.meanpools[i - 1](y)
+                 y_hat = self.meanpools[i - 1](y_hat)
+             y_d_r, fmap_r = d(y)
+             y_d_g, fmap_g = d(y_hat)
+             y_d_rs.append(y_d_r)
+             fmap_rs.append(fmap_r)
+             y_d_gs.append(y_d_g)
+             fmap_gs.append(fmap_g)
+
+         return y_d_rs, y_d_gs, fmap_rs, fmap_gs
+
+
+ def feature_loss(fmap_r, fmap_g):
+     loss = 0
+     for dr, dg in zip(fmap_r, fmap_g):
+         for rl, gl in zip(dr, dg):
+             loss += torch.mean(torch.abs(rl - gl))
+
+     return loss * 2
+
+
+ def discriminator_loss(disc_real_outputs, disc_generated_outputs):
+     loss = 0
+     r_losses = []
+     g_losses = []
+     for dr, dg in zip(disc_real_outputs, disc_generated_outputs):
+         r_loss = torch.mean((1 - dr) ** 2)
+         g_loss = torch.mean(dg ** 2)
+         loss += (r_loss + g_loss)
+         r_losses.append(r_loss.item())
+         g_losses.append(g_loss.item())
+
+     return loss, r_losses, g_losses
+
+
+ def generator_loss(disc_outputs):
+     loss = 0
+     gen_losses = []
+     for dg in disc_outputs:
+         l = torch.mean((1 - dg) ** 2)
+         gen_losses.append(l)
+         loss += l
+
+     return loss, gen_losses
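
A minimal end-to-end sketch for this vocoder: load_model reads the config.json next to the checkpoint (pretrain/nsf_hifigan/model is the path tracked in .gitattributes), then forward turns a mel spectrogram plus frame-level F0 into a waveform. Shapes are assumptions:

    import torch
    generator, h = load_model("pretrain/nsf_hifigan/model", device="cpu")
    mel = torch.randn(1, h.num_mels, 100)   # (batch, n_mels, frames), placeholder data
    f0 = torch.full((1, 100), 220.0)        # per-frame F0 in Hz (0 = unvoiced)
    with torch.no_grad():
        wav = generator(mel, f0)            # (1, 1, frames * prod(upsample_rates))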
vdecoder/nsf_hifigan/nvSTFT.py ADDED
@@ -0,0 +1,132 @@
+ import os
+
+ import librosa
+ import numpy as np
+ import soundfile as sf
+ import torch
+ import torch.nn.functional as F
+ import torch.utils.data
+ from librosa.filters import mel as librosa_mel_fn
+
+ os.environ["LRU_CACHE_CAPACITY"] = "3"
+
+ def load_wav_to_torch(full_path, target_sr=None, return_empty_on_exception=False):
+     sampling_rate = None
+     try:
+         data, sampling_rate = sf.read(full_path, always_2d=True)
+     except Exception as ex:
+         print(f"'{full_path}' failed to load.\nException:")
+         print(ex)
+         if return_empty_on_exception:
+             return [], sampling_rate or target_sr or 48000
+         else:
+             raise Exception(ex)
+
+     if len(data.shape) > 1:
+         data = data[:, 0]
+     assert len(data) > 2  # check duration is > 2 samples (otherwise the slice above was on the wrong dimension)
+
+     if np.issubdtype(data.dtype, np.integer):  # if audio data is type int
+         max_mag = -np.iinfo(data.dtype).min  # maximum magnitude = min possible value of intXX
+     else:  # if audio data is type fp32
+         max_mag = max(np.amax(data), -np.amin(data))
+         max_mag = (2**31) + 1 if max_mag > (2**15) else ((2**15) + 1 if max_mag > 1.01 else 1.0)  # data should be 16-bit INT, 32-bit INT or [-1, 1] float32
+
+     data = torch.FloatTensor(data.astype(np.float32)) / max_mag
+
+     if (torch.isinf(data) | torch.isnan(data)).any() and return_empty_on_exception:  # resample crashes on inf/NaN; return an empty array instead of raising
+         return [], sampling_rate or target_sr or 48000
+     if target_sr is not None and sampling_rate != target_sr:
+         data = torch.from_numpy(librosa.core.resample(data.numpy(), orig_sr=sampling_rate, target_sr=target_sr))
+         sampling_rate = target_sr
+
+     return data, sampling_rate
+
+ def dynamic_range_compression(x, C=1, clip_val=1e-5):
+     return np.log(np.clip(x, a_min=clip_val, a_max=None) * C)
+
+ def dynamic_range_decompression(x, C=1):
+     return np.exp(x) / C
+
+ def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
+     return torch.log(torch.clamp(x, min=clip_val) * C)
+
+ def dynamic_range_decompression_torch(x, C=1):
+     return torch.exp(x) / C
+
+ class STFT():
+     def __init__(self, sr=22050, n_mels=80, n_fft=1024, win_size=1024, hop_length=256, fmin=20, fmax=11025, clip_val=1e-5):
+         self.target_sr = sr
+
+         self.n_mels = n_mels
+         self.n_fft = n_fft
+         self.win_size = win_size
+         self.hop_length = hop_length
+         self.fmin = fmin
+         self.fmax = fmax
+         self.clip_val = clip_val
+         self.mel_basis = {}
+         self.hann_window = {}
+
+     def get_mel(self, y, keyshift=0, speed=1, center=False):
+         sampling_rate = self.target_sr
+         n_mels = self.n_mels
+         n_fft = self.n_fft
+         win_size = self.win_size
+         hop_length = self.hop_length
+         fmin = self.fmin
+         fmax = self.fmax
+         clip_val = self.clip_val
+
+         factor = 2 ** (keyshift / 12)
+         n_fft_new = int(np.round(n_fft * factor))
+         win_size_new = int(np.round(win_size * factor))
+         hop_length_new = int(np.round(hop_length * speed))
+
+         if torch.min(y) < -1.:
+             print('min value is ', torch.min(y))
+         if torch.max(y) > 1.:
+             print('max value is ', torch.max(y))
+
+         mel_basis_key = str(fmax) + '_' + str(y.device)
+         if mel_basis_key not in self.mel_basis:
+             mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax)
+             self.mel_basis[mel_basis_key] = torch.from_numpy(mel).float().to(y.device)
+
+         keyshift_key = str(keyshift) + '_' + str(y.device)
+         if keyshift_key not in self.hann_window:
+             self.hann_window[keyshift_key] = torch.hann_window(win_size_new).to(y.device)
+
+         pad_left = (win_size_new - hop_length_new) // 2
+         pad_right = max((win_size_new - hop_length_new + 1) // 2, win_size_new - y.size(-1) - pad_left)
+         if pad_right < y.size(-1):
+             mode = 'reflect'
+         else:
+             mode = 'constant'
+         y = torch.nn.functional.pad(y.unsqueeze(1), (pad_left, pad_right), mode=mode)
+         y = y.squeeze(1)
+
+         spec = torch.stft(y, n_fft_new, hop_length=hop_length_new, win_length=win_size_new, window=self.hann_window[keyshift_key],
+                           center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False)
+         spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9))
+         if keyshift != 0:
+             size = n_fft // 2 + 1
+             resize = spec.size(1)
+             if resize < size:
+                 spec = F.pad(spec, (0, 0, 0, size - resize))
+             spec = spec[:, :size, :] * win_size / win_size_new
+
+         spec = torch.matmul(self.mel_basis[mel_basis_key], spec)
+         spec = dynamic_range_compression_torch(spec, clip_val=clip_val)
+         return spec
+
+     def __call__(self, audiopath):
+         audio, sr = load_wav_to_torch(audiopath, target_sr=self.target_sr)
+         spect = self.get_mel(audio.unsqueeze(0)).squeeze(0)
+         return spect
+
+ stft = STFT()
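
The keyshift path in get_mel scales the analysis window by 2**(keyshift/12) and then renormalizes the spectrum back to the base n_fft bins, so a shifted analysis stays comparable to the unshifted one. A sketch using the module-level stft instance and placeholder audio:

    import torch
    wav = torch.randn(1, 22050)              # one second at the default 22.05 kHz
    mel = stft.get_mel(wav)                  # baseline mel, (1, 80, frames)
    mel_up = stft.get_mel(wav, keyshift=2)   # analyzed as if pitched up two semitones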
vdecoder/nsf_hifigan/utils.py ADDED
@@ -0,0 +1,70 @@
+ import glob
+ import os
+
+ import matplotlib
+ import matplotlib.pylab as plt
+ import torch
+ from torch.nn.utils import weight_norm
+
+ matplotlib.use("Agg")
+
+
+ def plot_spectrogram(spectrogram):
+     fig, ax = plt.subplots(figsize=(10, 2))
+     im = ax.imshow(spectrogram, aspect="auto", origin="lower",
+                    interpolation='none')
+     plt.colorbar(im, ax=ax)
+
+     fig.canvas.draw()
+     plt.close()
+
+     return fig
+
+
+ def init_weights(m, mean=0.0, std=0.01):
+     classname = m.__class__.__name__
+     if classname.find("Conv") != -1:
+         m.weight.data.normal_(mean, std)
+
+
+ def apply_weight_norm(m):
+     classname = m.__class__.__name__
+     if classname.find("Conv") != -1:
+         weight_norm(m)
+
+
+ def get_padding(kernel_size, dilation=1):
+     return int((kernel_size * dilation - dilation) / 2)
+
+
+ def load_checkpoint(filepath, device):
+     assert os.path.isfile(filepath)
+     print("Loading '{}'".format(filepath))
+     checkpoint_dict = torch.load(filepath, map_location=device)
+     print("Complete.")
+     return checkpoint_dict
+
+
+ def save_checkpoint(filepath, obj):
+     print("Saving checkpoint to {}".format(filepath))
+     torch.save(obj, filepath)
+     print("Complete.")
+
+
+ def del_old_checkpoints(cp_dir, prefix, n_models=2):
+     pattern = os.path.join(cp_dir, prefix + '????????')
+     cp_list = glob.glob(pattern)  # get checkpoint paths
+     cp_list = sorted(cp_list)  # sort by iteration
+     if len(cp_list) > n_models:  # if more than n_models checkpoints are found
+         for cp in cp_list[:-n_models]:  # delete all but the latest n_models
+             open(cp, 'w').close()  # empty the file contents
+             os.unlink(cp)  # delete the file (moves to trash when using Colab)
+
+
+ def scan_checkpoint(cp_dir, prefix):
+     pattern = os.path.join(cp_dir, prefix + '????????')
+     cp_list = glob.glob(pattern)
+     if len(cp_list) == 0:
+         return None
+     return sorted(cp_list)[-1]
vencoder/CNHubertLarge.py ADDED
@@ -0,0 +1,36 @@
+ import torch
+ from fairseq import checkpoint_utils
+
+ from vencoder.encoder import SpeechEncoder
+
+
+ class CNHubertLarge(SpeechEncoder):
+     def __init__(self, vec_path="pretrain/chinese-hubert-large-fairseq-ckpt.pt", device=None):
+         super().__init__()
+         print("load model(s) from {}".format(vec_path))
+         self.hidden_dim = 1024
+         models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
+             [vec_path],
+             suffix="",
+         )
+         if device is None:
+             self.dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+         else:
+             self.dev = torch.device(device)
+         self.model = models[0].to(self.dev)
+         self.model.eval()
+
+     def encoder(self, wav):
+         feats = wav
+         if feats.dim() == 2:  # double channels
+             feats = feats.mean(-1)
+         assert feats.dim() == 1, feats.dim()
+         feats = feats.view(1, -1)
+         padding_mask = torch.BoolTensor(feats.shape).fill_(False)
+         inputs = {
+             "source": feats.to(wav.device),
+             "padding_mask": padding_mask.to(wav.device)
+         }
+         with torch.no_grad():
+             logits = self.model.extract_features(**inputs)
+         return logits[0].transpose(1, 2)
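
Each vencoder class follows the same pattern: encoder() takes a mono waveform tensor and returns (1, hidden_dim, frames) content features. A hedged sketch for the fairseq-backed encoder above, assuming the checkpoint exists and 16 kHz input:

    import torch
    enc = CNHubertLarge(device="cpu")
    wav = torch.randn(16000)      # ~1 s of mono audio (placeholder data)
    units = enc.encoder(wav)      # (1, 1024, frames)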
vencoder/ContentVec256L12_Onnx.py ADDED
@@ -0,0 +1,33 @@
+ import onnxruntime
+ import torch
+
+ from vencoder.encoder import SpeechEncoder
+
+
+ class ContentVec256L12_Onnx(SpeechEncoder):
+     def __init__(self, vec_path="pretrain/vec-256-layer-12.onnx", device=None):
+         super().__init__()
+         print("load model(s) from {}".format(vec_path))
+         self.hidden_dim = 256
+         if device is None:
+             self.dev = torch.device("cpu")
+         else:
+             self.dev = torch.device(device)
+
+         if device == 'cuda' or device == torch.device("cuda"):
+             providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
+         else:
+             providers = ['CPUExecutionProvider']
+
+         self.model = onnxruntime.InferenceSession(vec_path, providers=providers)
+
+     def encoder(self, wav):
+         feats = wav
+         if feats.dim() == 2:  # double channels
+             feats = feats.mean(-1)
+         assert feats.dim() == 1, feats.dim()
+         feats = feats.view(1, -1)
+         feats = feats.unsqueeze(0).cpu().detach().numpy()
+         onnx_input = {self.model.get_inputs()[0].name: feats}
+         logits = self.model.run(None, onnx_input)
+         return torch.tensor(logits[0]).transpose(1, 2).to(self.dev)
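
The ONNX variants keep the same interface but run through onnxruntime and move the result back to self.dev. A sketch under the same assumptions (the .onnx file from pretrain/ must exist):

    import torch
    enc = ContentVec256L12_Onnx(device="cpu")
    units = enc.encoder(torch.randn(16000))   # (1, 256, frames)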
vencoder/ContentVec256L9.py ADDED
@@ -0,0 +1,38 @@
+ import torch
+ from fairseq import checkpoint_utils
+
+ from vencoder.encoder import SpeechEncoder
+
+
+ class ContentVec256L9(SpeechEncoder):
+     def __init__(self, vec_path="pretrain/checkpoint_best_legacy_500.pt", device=None):
+         super().__init__()
+         print("load model(s) from {}".format(vec_path))
+         models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
+             [vec_path],
+             suffix="",
+         )
+         self.hidden_dim = 256
+         if device is None:
+             self.dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+         else:
+             self.dev = torch.device(device)
+         self.model = models[0].to(self.dev)
+         self.model.eval()
+
+     def encoder(self, wav):
+         feats = wav
+         if feats.dim() == 2:  # double channels
+             feats = feats.mean(-1)
+         assert feats.dim() == 1, feats.dim()
+         feats = feats.view(1, -1)
+         padding_mask = torch.BoolTensor(feats.shape).fill_(False)
+         inputs = {
+             "source": feats.to(wav.device),
+             "padding_mask": padding_mask.to(wav.device),
+             "output_layer": 9,  # layer 9
+         }
+         with torch.no_grad():
+             logits = self.model.extract_features(**inputs)
+             feats = self.model.final_proj(logits[0])
+         return feats.transpose(1, 2)
vencoder/ContentVec256L9_Onnx.py ADDED
@@ -0,0 +1,32 @@
+ import onnxruntime
+ import torch
+
+ from vencoder.encoder import SpeechEncoder
+
+
+ class ContentVec256L9_Onnx(SpeechEncoder):
+     def __init__(self, vec_path="pretrain/vec-256-layer-9.onnx", device=None):
+         super().__init__()
+         print("load model(s) from {}".format(vec_path))
+         self.hidden_dim = 256
+         if device is None:
+             self.dev = torch.device("cpu")
+         else:
+             self.dev = torch.device(device)
+         if device == 'cuda' or device == torch.device("cuda"):
+             providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
+         else:
+             # default to CPU, mirroring the other ONNX encoders (the original
+             # if/elif left `providers` unbound for any other device value)
+             providers = ['CPUExecutionProvider']
+         self.model = onnxruntime.InferenceSession(vec_path, providers=providers)
+
+     def encoder(self, wav):
+         feats = wav
+         if feats.dim() == 2:  # double channels
+             feats = feats.mean(-1)
+         assert feats.dim() == 1, feats.dim()
+         feats = feats.view(1, -1)
+         feats = feats.unsqueeze(0).cpu().detach().numpy()
+         onnx_input = {self.model.get_inputs()[0].name: feats}
+         logits = self.model.run(None, onnx_input)
+         return torch.tensor(logits[0]).transpose(1, 2).to(self.dev)
@@ -0,0 +1,37 @@
+ import torch
+ from fairseq import checkpoint_utils
+
+ from vencoder.encoder import SpeechEncoder
+
+
+ class ContentVec768L12(SpeechEncoder):
+     def __init__(self, vec_path="pretrain/checkpoint_best_legacy_500.pt", device=None):
+         super().__init__()
+         print("load model(s) from {}".format(vec_path))
+         self.hidden_dim = 768
+         models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
+             [vec_path],
+             suffix="",
+         )
+         if device is None:
+             self.dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+         else:
+             self.dev = torch.device(device)
+         self.model = models[0].to(self.dev)
+         self.model.eval()
+
+     def encoder(self, wav):
+         feats = wav
+         if feats.dim() == 2:  # double channels
+             feats = feats.mean(-1)
+         assert feats.dim() == 1, feats.dim()
+         feats = feats.view(1, -1)
+         padding_mask = torch.BoolTensor(feats.shape).fill_(False)
+         inputs = {
+             "source": feats.to(wav.device),
+             "padding_mask": padding_mask.to(wav.device),
+             "output_layer": 12,  # layer 12
+         }
+         with torch.no_grad():
+             logits = self.model.extract_features(**inputs)
+         return logits[0].transpose(1, 2)
vencoder/ContentVec768L12_Onnx.py ADDED
@@ -0,0 +1,33 @@
+ import onnxruntime
+ import torch
+
+ from vencoder.encoder import SpeechEncoder
+
+
+ class ContentVec768L12_Onnx(SpeechEncoder):
+     def __init__(self, vec_path="pretrain/vec-768-layer-12.onnx", device=None):
+         super().__init__()
+         print("load model(s) from {}".format(vec_path))
+         self.hidden_dim = 768
+         if device is None:
+             self.dev = torch.device("cpu")
+         else:
+             self.dev = torch.device(device)
+
+         if device == 'cuda' or device == torch.device("cuda"):
+             providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
+         else:
+             providers = ['CPUExecutionProvider']
+
+         self.model = onnxruntime.InferenceSession(vec_path, providers=providers)
+
+     def encoder(self, wav):
+         feats = wav
+         if feats.dim() == 2:  # double channels
+             feats = feats.mean(-1)
+         assert feats.dim() == 1, feats.dim()
+         feats = feats.view(1, -1)
+         feats = feats.unsqueeze(0).cpu().detach().numpy()
+         onnx_input = {self.model.get_inputs()[0].name: feats}
+         logits = self.model.run(None, onnx_input)
+         return torch.tensor(logits[0]).transpose(1, 2).to(self.dev)
vencoder/ContentVec768L9_Onnx.py ADDED
@@ -0,0 +1,33 @@
+ import onnxruntime
+ import torch
+
+ from vencoder.encoder import SpeechEncoder
+
+
+ class ContentVec768L9_Onnx(SpeechEncoder):
+     def __init__(self, vec_path="pretrain/vec-768-layer-9.onnx", device=None):
+         super().__init__()
+         print("load model(s) from {}".format(vec_path))
+         self.hidden_dim = 768
+         if device is None:
+             self.dev = torch.device("cpu")
+         else:
+             self.dev = torch.device(device)
+
+         if device == 'cuda' or device == torch.device("cuda"):
+             providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
+         else:
+             providers = ['CPUExecutionProvider']
+
+         self.model = onnxruntime.InferenceSession(vec_path, providers=providers)
+
+     def encoder(self, wav):
+         feats = wav
+         if feats.dim() == 2:  # double channels
+             feats = feats.mean(-1)
+         assert feats.dim() == 1, feats.dim()
+         feats = feats.view(1, -1)
+         feats = feats.unsqueeze(0).cpu().detach().numpy()
+         onnx_input = {self.model.get_inputs()[0].name: feats}
+         logits = self.model.run(None, onnx_input)
+         return torch.tensor(logits[0]).transpose(1, 2).to(self.dev)
vencoder/DPHubert.py ADDED
@@ -0,0 +1,29 @@
+ import torch
+
+ from vencoder.dphubert.model import wav2vec2_model
+ from vencoder.encoder import SpeechEncoder
+
+
+ class DPHubert(SpeechEncoder):
+     def __init__(self, vec_path="pretrain/DPHuBERT-sp0.75.pth", device=None):
+         super().__init__()
+         print("load model(s) from {}".format(vec_path))
+         if device is None:
+             self.dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+         else:
+             self.dev = torch.device(device)
+         ckpt = torch.load(vec_path)
+         self.hidden_dim = 768
+         self.model = wav2vec2_model(**ckpt["config"]).to(self.dev)
+         self.model.load_state_dict(ckpt["state_dict"], strict=False)
+
+     def encoder(self, wav):
+         feats = wav
+         if feats.dim() == 2:  # double channels
+             feats = feats.mean(-1)
+         assert feats.dim() == 1, feats.dim()
+         feats = feats[None, :]
+         with torch.no_grad():
+             with torch.inference_mode():
+                 units = self.model(feats)[0]
+                 return units.transpose(1, 2)
vencoder/HubertSoft.py ADDED
@@ -0,0 +1,28 @@
+ import torch
+
+ from vencoder.encoder import SpeechEncoder
+ from vencoder.hubert import hubert_model
+
+
+ class HubertSoft(SpeechEncoder):
+     def __init__(self, vec_path="pretrain/hubert-soft-0d54a1f4.pt", device=None):
+         super().__init__()
+         print("load model(s) from {}".format(vec_path))
+         hubert_soft = hubert_model.hubert_soft(vec_path)
+         if device is None:
+             self.dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+         else:
+             self.dev = torch.device(device)
+         self.hidden_dim = 256
+         self.model = hubert_soft.to(self.dev)
+
+     def encoder(self, wav):
+         feats = wav
+         if feats.dim() == 2:  # double channels
+             feats = feats.mean(-1)
+         assert feats.dim() == 1, feats.dim()
+         feats = feats[None, None, :]
+         with torch.no_grad():
+             with torch.inference_mode():
+                 units = self.model.units(feats)
+                 return units.transpose(1, 2)
vencoder/HubertSoft_Onnx.py ADDED
@@ -0,0 +1,33 @@
+ import onnxruntime
+ import torch
+
+ from vencoder.encoder import SpeechEncoder
+
+
+ class HubertSoft_Onnx(SpeechEncoder):
+     def __init__(self, vec_path="pretrain/hubert-soft.onnx", device=None):
+         super().__init__()
+         print("load model(s) from {}".format(vec_path))
+         self.hidden_dim = 256
+         if device is None:
+             self.dev = torch.device("cpu")
+         else:
+             self.dev = torch.device(device)
+
+         if device == 'cuda' or device == torch.device("cuda"):
+             providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
+         else:
+             providers = ['CPUExecutionProvider']
+
+         self.model = onnxruntime.InferenceSession(vec_path, providers=providers)
+
+     def encoder(self, wav):
+         feats = wav
+         if feats.dim() == 2:  # double channels
+             feats = feats.mean(-1)
+         assert feats.dim() == 1, feats.dim()
+         feats = feats.view(1, -1)
+         feats = feats.unsqueeze(0).cpu().detach().numpy()
+         onnx_input = {self.model.get_inputs()[0].name: feats}
+         logits = self.model.run(None, onnx_input)
+         return torch.tensor(logits[0]).transpose(1, 2).to(self.dev)
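
Since every class above subclasses SpeechEncoder with the same encoder() signature, the feature extractor is swappable; a sketch assuming the respective checkpoints are present:

    import torch
    from vencoder.HubertSoft import HubertSoft
    from vencoder.ContentVec768L12 import ContentVec768L12

    for Encoder in (HubertSoft, ContentVec768L12):
        enc = Encoder(device="cpu")
        units = enc.encoder(torch.randn(16000))
        print(Encoder.__name__, units.shape)   # (1, hidden_dim, frames)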