TIMBOVILL committed on
Commit
0afdd88
1 Parent(s): 73c7286

Upload 2 files

Browse files
Files changed (2) hide show
  1. rvc/lib/rmvpe.py +388 -0
  2. rvc/lib/utils.py +16 -0
rvc/lib/rmvpe.py ADDED
@@ -0,0 +1,388 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch.nn as nn
2
+ import torch, numpy as np
3
+ import torch.nn.functional as F
4
+ from librosa.filters import mel
5
+
6
+
7
class BiGRU(nn.Module):
    """Bidirectional, batch-first GRU that exposes only the output sequence.

    The wrapped ``nn.GRU`` is bidirectional, so the last dimension of the
    result is ``2 * hidden_features``. The final hidden state returned by the
    GRU is discarded.
    """

    def __init__(self, input_features, hidden_features, num_layers):
        super().__init__()
        recurrent_options = {
            "num_layers": num_layers,
            "batch_first": True,
            "bidirectional": True,
        }
        # The attribute name is part of the checkpoint layout ("gru.*" keys).
        self.gru = nn.GRU(input_features, hidden_features, **recurrent_options)

    def forward(self, x):
        """Run the GRU over ``x`` and return the per-step outputs only."""
        sequence_out, _last_hidden = self.gru(x)
        return sequence_out
20
+
21
+
22
class ConvBlockRes(nn.Module):
    """Residual block: two 3x3 conv + batch-norm + ReLU stages plus a skip path.

    When the channel count changes, the skip path goes through a 1x1
    convolution so both summands have ``out_channels`` channels; otherwise the
    input is added unchanged. Spatial size is preserved (stride 1, padding 1).
    """

    def __init__(self, in_channels, out_channels, momentum=0.01):
        super().__init__()

        def conv_bn_relu(stage_in):
            # One conv -> batch-norm -> ReLU stage; the conv has no bias
            # because it is directly followed by batch normalisation.
            return [
                nn.Conv2d(
                    in_channels=stage_in,
                    out_channels=out_channels,
                    kernel_size=(3, 3),
                    stride=(1, 1),
                    padding=(1, 1),
                    bias=False,
                ),
                nn.BatchNorm2d(out_channels, momentum=momentum),
                nn.ReLU(),
            ]

        # Module order inside the Sequential fixes the state_dict keys
        # (conv.0, conv.1, conv.3, conv.4), so it must not change.
        self.conv = nn.Sequential(
            *conv_bn_relu(in_channels),
            *conv_bn_relu(out_channels),
        )
        self.is_shortcut = in_channels != out_channels
        if self.is_shortcut:
            self.shortcut = nn.Conv2d(in_channels, out_channels, (1, 1))

    def forward(self, x):
        """Return ``conv(x)`` plus the (optionally projected) input."""
        features = self.conv(x)
        skip = self.shortcut(x) if self.is_shortcut else x
        return features + skip
58
+
59
+
60
class Encoder(nn.Module):
    """Down-sampling half of the U-Net.

    Input is batch-normalised, then passed through ``n_encoders``
    ``ResEncoderBlock`` stages. The channel count doubles and the tracked
    ``in_size`` halves after every stage.

    Attributes set here:
        latent_channels: ``[channels, size]`` recorded for every stage.
        out_size: ``in_size`` after all halvings.
        out_channel: channel count *after* the final doubling, i.e. twice the
            number of channels the last stage actually emits.
    """

    def __init__(
        self,
        in_channels,
        in_size,
        n_encoders,
        kernel_size,
        n_blocks,
        out_channels=16,
        momentum=0.01,
    ):
        super().__init__()
        self.n_encoders = n_encoders
        self.bn = nn.BatchNorm2d(in_channels, momentum=momentum)
        self.layers = nn.ModuleList()
        self.latent_channels = []

        stage_in, stage_out, stage_size = in_channels, out_channels, in_size
        for _stage in range(n_encoders):
            stage = ResEncoderBlock(
                stage_in, stage_out, kernel_size, n_blocks, momentum=momentum
            )
            self.layers.append(stage)
            self.latent_channels.append([stage_out, stage_size])
            # Prepare the next stage: twice the channels, half the size.
            stage_in, stage_out, stage_size = stage_out, stage_out * 2, stage_size // 2

        self.out_size = stage_size
        self.out_channel = stage_out

    def forward(self, x):
        """Return ``(pooled_output, skip_tensors)``.

        ``skip_tensors`` holds the un-pooled output of each stage in encoder
        order.
        """
        skip_tensors = []
        x = self.bn(x)
        for stage in self.layers:
            unpooled, x = stage(x)
            skip_tensors.append(unpooled)
        return x, skip_tensors
96
+
97
+
98
class ResEncoderBlock(nn.Module):
    """Stack of ``ConvBlockRes`` units with optional average pooling.

    With ``kernel_size=None`` no pooling layer is created and ``forward``
    returns a single tensor; otherwise it returns ``(features, pooled)``.
    """

    def __init__(
        self, in_channels, out_channels, kernel_size, n_blocks=1, momentum=0.01
    ):
        super().__init__()
        self.n_blocks = n_blocks
        self.kernel_size = kernel_size

        # The first unit changes the channel count; the rest keep it.
        units = [ConvBlockRes(in_channels, out_channels, momentum)]
        for _extra in range(n_blocks - 1):
            units.append(ConvBlockRes(out_channels, out_channels, momentum))
        self.conv = nn.ModuleList(units)

        if kernel_size is not None:
            self.pool = nn.AvgPool2d(kernel_size=kernel_size)

    def forward(self, x):
        """Apply the first ``n_blocks`` units, then pool if pooling is configured."""
        for index in range(self.n_blocks):
            x = self.conv[index](x)
        if self.kernel_size is None:
            return x
        return x, self.pool(x)
119
+
120
+
121
class Intermediate(nn.Module):
    """Bottleneck between encoder and decoder: pooling-free residual stages.

    The first stage maps ``in_channels`` to ``out_channels``; the remaining
    ``n_inters - 1`` stages keep ``out_channels``.
    """

    def __init__(self, in_channels, out_channels, n_inters, n_blocks, momentum=0.01):
        super().__init__()
        self.n_inters = n_inters

        # kernel_size=None disables pooling, so each stage returns one tensor.
        stages = [ResEncoderBlock(in_channels, out_channels, None, n_blocks, momentum)]
        for _extra in range(n_inters - 1):
            stages.append(
                ResEncoderBlock(out_channels, out_channels, None, n_blocks, momentum)
            )
        self.layers = nn.ModuleList(stages)

    def forward(self, x):
        """Feed ``x`` through the first ``n_inters`` stages in order."""
        for _index, stage in zip(range(self.n_inters), self.layers):
            x = stage(x)
        return x
138
+
139
+
140
class ResDecoderBlock(nn.Module):
    """One up-sampling stage of the U-Net decoder.

    A transposed convolution up-samples the input, the result is concatenated
    with the matching encoder skip tensor along the channel axis, and the
    concatenation is refined by ``n_blocks`` residual units.
    """

    def __init__(self, in_channels, out_channels, stride, n_blocks=1, momentum=0.01):
        super().__init__()
        self.n_blocks = n_blocks

        # Output padding is (0, 1) for a (1, 2) stride and (1, 1) for any
        # other stride value.
        if stride == (1, 2):
            upsample_padding = (0, 1)
        else:
            upsample_padding = (1, 1)

        upsample = nn.ConvTranspose2d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=(3, 3),
            stride=stride,
            padding=(1, 1),
            output_padding=upsample_padding,
            bias=False,
        )
        self.conv1 = nn.Sequential(
            upsample,
            nn.BatchNorm2d(out_channels, momentum=momentum),
            nn.ReLU(),
        )

        # After concatenation with the skip tensor the first unit sees
        # 2 * out_channels channels.
        refiners = [ConvBlockRes(out_channels * 2, out_channels, momentum)]
        for _extra in range(n_blocks - 1):
            refiners.append(ConvBlockRes(out_channels, out_channels, momentum))
        self.conv2 = nn.ModuleList(refiners)

    def forward(self, x, concat_tensor):
        """Up-sample ``x``, concatenate ``concat_tensor`` on dim 1, then refine."""
        merged = torch.cat((self.conv1(x), concat_tensor), dim=1)
        for index in range(self.n_blocks):
            merged = self.conv2[index](merged)
        return merged
169
+
170
+
171
class Decoder(nn.Module):
    """Up-sampling half of the U-Net: ``n_decoders`` stages, channels halved each time."""

    def __init__(self, in_channels, n_decoders, stride, n_blocks, momentum=0.01):
        super().__init__()
        self.layers = nn.ModuleList()
        self.n_decoders = n_decoders

        stage_in = in_channels
        for _stage in range(n_decoders):
            stage_out = stage_in // 2
            self.layers.append(
                ResDecoderBlock(stage_in, stage_out, stride, n_blocks, momentum)
            )
            stage_in = stage_out

    def forward(self, x, concat_tensors):
        """Decode ``x``, consuming ``concat_tensors`` from last to first."""
        for depth, stage in enumerate(self.layers):
            # The deepest encoder output pairs with the first decoder stage.
            x = stage(x, concat_tensors[-1 - depth])
        return x
187
+
188
+
189
class DeepUnet(nn.Module):
    """Encoder -> intermediate -> decoder U-Net with skip connections.

    The encoder is built with a fixed ``in_size`` of 128. ``kernel_size`` is
    used both as the encoder pooling kernel and as the decoder stride.
    """

    def __init__(
        self,
        kernel_size,
        n_blocks,
        en_de_layers=5,
        inter_layers=4,
        in_channels=1,
        en_out_channels=16,
    ):
        super().__init__()
        self.encoder = Encoder(
            in_channels=in_channels,
            in_size=128,
            n_encoders=en_de_layers,
            kernel_size=kernel_size,
            n_blocks=n_blocks,
            out_channels=en_out_channels,
        )
        # Encoder.out_channel is twice what the last encoder stage emits, so
        # the bottleneck takes half of it as input and widens to the full value.
        bottleneck_width = self.encoder.out_channel
        self.intermediate = Intermediate(
            bottleneck_width // 2,
            bottleneck_width,
            inter_layers,
            n_blocks,
        )
        self.decoder = Decoder(bottleneck_width, en_de_layers, kernel_size, n_blocks)

    def forward(self, x):
        """Run the full U-Net and return the decoder output."""
        encoded, skip_tensors = self.encoder(x)
        bottleneck = self.intermediate(encoded)
        return self.decoder(bottleneck, skip_tensors)
218
+
219
+
220
class E2E(nn.Module):
    """U-Net front end followed by a per-frame classifier over 360 bins.

    The U-Net output is reduced to 3 channels by a 3x3 convolution, flattened
    to ``3 * 128`` features per frame and mapped to 360 sigmoid activations.

    Args:
        n_blocks: residual units per U-Net stage.
        n_gru: number of bidirectional GRU layers in the head. When falsy
            (``0``/``None``) a plain linear head is used instead.
        kernel_size: pooling kernel / up-sampling stride of the U-Net.
        en_de_layers: number of encoder (and decoder) stages.
        inter_layers: number of bottleneck stages.
        in_channels: channels of the input image.
        en_out_channels: channels emitted by the first encoder stage, which
            is also the channel count of the U-Net output.
    """

    def __init__(
        self,
        n_blocks,
        n_gru,
        kernel_size,
        en_de_layers=5,
        inter_layers=4,
        in_channels=1,
        en_out_channels=16,
    ):
        super(E2E, self).__init__()
        self.unet = DeepUnet(
            kernel_size,
            n_blocks,
            en_de_layers,
            inter_layers,
            in_channels,
            en_out_channels,
        )
        self.cnn = nn.Conv2d(en_out_channels, 3, (3, 3), padding=(1, 1))
        if n_gru:
            self.fc = nn.Sequential(
                BiGRU(3 * 128, 256, n_gru),
                # 512 = 2 directions * 256 hidden units.
                nn.Linear(512, 360),
                nn.Dropout(0.25),
                nn.Sigmoid(),
            )
        else:
            # Without this branch ``self.fc`` was never created and forward()
            # failed with AttributeError for n_gru == 0.
            self.fc = nn.Sequential(
                nn.Linear(3 * 128, 360),
                nn.Dropout(0.25),
                nn.Sigmoid(),
            )

    def forward(self, mel):
        """Map a mel spectrogram to per-frame activations.

        The last two axes of ``mel`` are swapped and a channel axis is
        inserted at dim 1 before the U-Net.
        NOTE(review): presumably ``mel`` is (batch, 128 mel bins, frames) and
        the result is (batch, frames, 360) — confirm against callers.
        """
        mel = mel.transpose(-1, -2).unsqueeze(1)
        # (B, 3, T, F) -> (B, T, 3, F) -> (B, T, 3 * F)
        x = self.cnn(self.unet(mel)).transpose(1, 2).flatten(-2)
        x = self.fc(x)
        return x
254
+
255
+
256
class MelSpectrogram(torch.nn.Module):
    """Log-mel spectrogram extractor built on ``torch.stft``.

    The mel filter bank (HTK formula) is computed once and stored as the
    ``mel_basis`` buffer. Hann windows are cached per
    ``"<keyshift>_<device>"`` key in ``self.hann_window``.
    """

    def __init__(
        self,
        is_half,
        n_mel_channels,
        sampling_rate,
        win_length,
        hop_length,
        n_fft=None,
        mel_fmin=0,
        mel_fmax=None,
        clamp=1e-5,
    ):
        super().__init__()
        # The FFT size defaults to the window length.
        if n_fft is None:
            n_fft = win_length
        self.hann_window = {}
        filter_bank = mel(
            sr=sampling_rate,
            n_fft=n_fft,
            n_mels=n_mel_channels,
            fmin=mel_fmin,
            fmax=mel_fmax,
            htk=True,
        )
        self.register_buffer("mel_basis", torch.from_numpy(filter_bank).float())
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.win_length = win_length
        self.sampling_rate = sampling_rate
        self.n_mel_channels = n_mel_channels
        self.clamp = clamp
        self.is_half = is_half

    def forward(self, audio, keyshift=0, speed=1, center=True):
        """Return ``log(clamp(mel_basis @ |STFT(audio)|, min=self.clamp))``.

        ``keyshift`` (in semitones) scales the FFT and window sizes by
        ``2 ** (keyshift / 12)``; ``speed`` scales the hop length. With a
        non-zero ``keyshift`` the magnitude is padded/cropped back to
        ``n_fft // 2 + 1`` bins and rescaled by the window-length ratio.
        """
        pitch_factor = 2 ** (keyshift / 12)
        scaled_n_fft = int(np.round(self.n_fft * pitch_factor))
        scaled_win = int(np.round(self.win_length * pitch_factor))
        scaled_hop = int(np.round(self.hop_length * speed))

        cache_key = "_".join((str(keyshift), str(audio.device)))
        if cache_key not in self.hann_window:
            self.hann_window[cache_key] = torch.hann_window(scaled_win).to(
                audio.device
            )
        window = self.hann_window[cache_key]

        spectrum = torch.stft(
            audio,
            n_fft=scaled_n_fft,
            hop_length=scaled_hop,
            win_length=scaled_win,
            window=window,
            center=center,
            return_complex=True,
        )
        magnitude = torch.sqrt(spectrum.real.pow(2) + spectrum.imag.pow(2))

        if keyshift != 0:
            # Bring the frequency axis back to the size mel_basis expects.
            wanted_bins = self.n_fft // 2 + 1
            actual_bins = magnitude.size(1)
            if actual_bins < wanted_bins:
                magnitude = F.pad(magnitude, (0, 0, 0, wanted_bins - actual_bins))
            magnitude = magnitude[:, :wanted_bins, :] * self.win_length / scaled_win

        mel_output = torch.matmul(self.mel_basis, magnitude)
        if self.is_half == True:
            mel_output = mel_output.half()
        return torch.log(torch.clamp(mel_output, min=self.clamp))
321
+
322
+
323
class RMVPE:
    """Pitch (f0) estimator wrapping the ``E2E`` network and its mel front end.

    The mel extractor is configured for 128 mel bins, 16000 Hz sampling rate,
    a 1024-sample window and a 160-sample hop (30-8000 Hz band). The network
    emits 360 activations per frame; bin ``i`` corresponds to
    ``20 * i + 1997.3794084376191`` cents, and cents are converted to Hz as
    ``10 * 2 ** (cents / 1200)``.
    """

    def __init__(self, model_path, is_half, device=None):
        """Load the checkpoint at ``model_path`` and move everything to ``device``.

        Args:
            model_path: path to a state-dict checkpoint for ``E2E(4, 1, (2, 2))``.
            is_half: when equal to ``True`` the model and mel output use fp16.
            device: target device; defaults to CUDA when available, else CPU.
        """
        model = E2E(4, 1, (2, 2))
        # NOTE(security): torch.load unpickles the file; only load checkpoints
        # from a trusted source.
        ckpt = torch.load(model_path, map_location="cpu")
        model.load_state_dict(ckpt)
        model.eval()
        if is_half == True:
            model = model.half()
        self.resample_kernel = {}
        self.is_half = is_half
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = device
        self.mel_extractor = MelSpectrogram(
            is_half, 128, 16000, 1024, 160, None, 30, 8000
        ).to(device)
        self.model = model.to(device)
        cents_mapping = 20 * np.arange(360) + 1997.3794084376191
        # Padded by 4 on each side (368 entries) so a 9-bin window around any
        # peak stays in range.
        self.cents_mapping = np.pad(cents_mapping, (4, 4))

    def mel2hidden(self, mel):
        """Run the network on ``mel`` and return activations for the real frames.

        The frame axis (last axis) is right-padded to the next multiple of 32
        before inference and the padded frames are cut off again afterwards.
        """
        with torch.no_grad():
            n_frames = mel.shape[-1]
            n_pad = 32 * ((n_frames - 1) // 32 + 1) - n_frames
            # Reflect padding requires the pad to be smaller than the padded
            # dimension; very short clips would raise, so fall back to
            # constant (zero) padding for them.
            pad_mode = "reflect" if n_pad < n_frames else "constant"
            mel = F.pad(mel, (0, n_pad), mode=pad_mode)
            hidden = self.model(mel)
            return hidden[:, :n_frames]

    def decode(self, hidden, thred=0.03):
        """Convert per-frame activations to f0 in Hz (0 for unvoiced frames)."""
        cents_pred = self.to_local_average_cents(hidden, thred=thred)
        f0 = 10 * (2 ** (cents_pred / 1200))
        # cents == 0 marks "below threshold"; it maps to exactly 10 Hz above.
        f0[f0 == 10] = 0
        return f0

    def infer_from_audio(self, audio, thred=0.03):
        """Estimate f0 for a numpy waveform.

        NOTE(review): the mel extractor is built for 16000 Hz and no
        resampling happens here, so ``audio`` is presumably 16 kHz mono —
        confirm against callers.
        """
        audio = torch.from_numpy(audio).float().to(self.device).unsqueeze(0)
        mel = self.mel_extractor(audio, center=True)
        hidden = self.mel2hidden(mel)
        hidden = hidden.squeeze(0).cpu().numpy()
        if self.is_half == True:
            hidden = hidden.astype("float32")
        f0 = self.decode(hidden, thred=thred)
        return f0

    def to_local_average_cents(self, salience, thred=0.05):
        """Return the salience-weighted mean cents around each frame's peak.

        For every row of ``salience`` (frames x 360) a 9-bin window centred on
        the arg-max is averaged, weighted by salience. Frames whose maximum
        salience is ``<= thred`` yield 0. An input with zero frames yields an
        empty array.
        """
        # Shift by 4 to index into the padded arrays.
        center = np.argmax(salience, axis=1) + 4
        salience = np.pad(salience, ((0, 0), (4, 4)))
        # (frames, 9) matrix of column indices: center-4 .. center+4.
        window = center[:, None] + np.arange(-4, 5)[None, :]
        todo_salience = np.take_along_axis(salience, window, axis=1)
        todo_cents_mapping = self.cents_mapping[window]
        product_sum = np.sum(todo_salience * todo_cents_mapping, 1)
        weight_sum = np.sum(todo_salience, 1)
        # An all-zero window gives 0/0; the result is NaN exactly as before,
        # only the numpy warning is silenced. Such frames are zeroed below
        # for any non-negative threshold.
        with np.errstate(divide="ignore", invalid="ignore"):
            devided = product_sum / weight_sum
        maxx = np.max(salience, axis=1)
        devided[maxx <= thred] = 0
        return devided
rvc/lib/utils.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import ffmpeg
2
+ import numpy as np
3
+
4
+
5
def load_audio(file, sampling_rate):
    """Decode an audio file to a mono float32 numpy array via ffmpeg.

    Args:
        file: path to the audio file. Surrounding spaces, double quotes and
            newlines are stripped before use.
        sampling_rate: target sample rate passed to ffmpeg (``ar``).

    Returns:
        1-D ``numpy.float32`` array of samples (single channel).

    Raises:
        RuntimeError: if cleaning the path or running ffmpeg fails; the
            original exception is attached as ``__cause__``.
    """
    try:
        # Strip order is kept as-is: spaces, quotes, newlines, quotes, spaces.
        file = file.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
        out, _ = (
            ffmpeg.input(file, threads=0)
            .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sampling_rate)
            .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
        )
    except Exception as error:
        # Chain explicitly so the underlying failure is preserved as the cause.
        raise RuntimeError(f"Failed to load audio: {error}") from error

    return np.frombuffer(out, np.float32).flatten()