NeoPy committed
Commit b74f750 · verified · 1 Parent(s): a2848e3
Files changed (1)
  1. infer/lib/predictors/Generator.py +839 -0
infer/lib/predictors/Generator.py ADDED
@@ -0,0 +1,839 @@
import os
import re
import sys
import math
import torch
import parselmouth

import numba as nb
import numpy as np

from scipy.signal import medfilt
from librosa import yin, pyin, piptrack

sys.path.append(os.getcwd())

from main.library.predictors.CREPE.filter import mean, median
from main.library.predictors.WORLD.SWIPE import swipe, stonemask
from main.app.variables import config, configs, logger, translations
from main.library.utils import autotune_f0, proposal_f0_up_key, circular_write

@nb.jit(nopython=True)
def post_process(
    tf0,
    f0,
    f0_up_key,
    manual_x_pad,
    f0_mel_min,
    f0_mel_max,
    manual_f0 = None
):
    # Transpose the contour by f0_up_key semitones.
    f0 *= pow(2, f0_up_key / 12)

    # Splice a manually drawn f0 curve (columns: time, frequency) over the
    # detected contour, resampled onto the frame grid.
    if manual_f0 is not None:
        replace_f0 = np.interp(
            list(
                range(
                    np.round(
                        (manual_f0[:, 0].max() - manual_f0[:, 0].min()) * tf0 + 1
                    ).astype(np.int16)
                )
            ),
            manual_f0[:, 0] * 100,
            manual_f0[:, 1]
        )

        f0[
            manual_x_pad * tf0 : manual_x_pad * tf0 + len(replace_f0)
        ] = replace_f0[
            :f0[
                manual_x_pad * tf0 : manual_x_pad * tf0 + len(replace_f0)
            ].shape[0]
        ]

    # Quantize to coarse pitch bins 1..255 on the mel scale; unvoiced frames
    # (f0 = 0) end up in bin 1.
    f0_mel = 1127 * np.log(1 + f0 / 700)
    f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (f0_mel_max - f0_mel_min) + 1
    f0_mel[f0_mel <= 1] = 1
    f0_mel[f0_mel > 255] = 255

    return np.rint(f0_mel).astype(np.int32), f0
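
# Worked example of the quantization above: with the default bounds
# f0_min = 50 Hz and f0_max = 1100 Hz, the mel limits passed in are
# 1127 * ln(1 + 50/700) ~= 77.8 and 1127 * ln(1 + 1100/700) ~= 1064.4.
# A frame at 440 Hz maps to 1127 * ln(1 + 440/700) ~= 549.6, so its coarse
# bin is (549.6 - 77.8) * 254 / (1064.4 - 77.8) + 1 ~= 122.5, i.e. bin 122
# after rounding.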

def realtime_post_process(
    f0,
    pitch,
    pitchf,
    f0_up_key = 0,
    f0_mel_min = 50.0,
    f0_mel_max = 1100.0
):
    f0 *= 2 ** (f0_up_key / 12)

    # Same mel quantization as post_process, done in-place on the tensor.
    f0_mel = 1127.0 * (1.0 + f0 / 700.0).log()
    f0_mel = torch.clip((f0_mel - f0_mel_min) * 254 / (f0_mel_max - f0_mel_min) + 1, 1, 255, out=f0_mel)
    f0_coarse = torch.round(f0_mel, out=f0_mel).long()

    # When rolling pitch buffers are supplied, append the new frames to them
    # (circular_write presumably maintains a fixed-length sliding window);
    # otherwise start fresh buffers from this chunk.
    if pitch is not None and pitchf is not None:
        circular_write(f0_coarse, pitch)
        circular_write(f0, pitchf)
    else:
        pitch = f0_coarse
        pitchf = f0

    return pitch.unsqueeze(0), pitchf.unsqueeze(0)

class Generator:
    def __init__(
        self,
        sample_rate = 16000,
        hop_length = 160,
        f0_min = 50,
        f0_max = 1100,
        alpha = 0.5,
        is_half = False,
        device = "cpu",
        predictor_onnx = False,
        delete_predictor_onnx = True
    ):
        self.sample_rate = sample_rate
        self.hop_length = hop_length
        self.f0_min = f0_min
        self.f0_max = f0_max
        self.is_half = is_half
        self.device = device
        self.providers = config.providers
        self.predictor_onnx = predictor_onnx
        self.delete_predictor_onnx = delete_predictor_onnx
        self.window = 160
        self.batch_size = 512
        self.alpha = alpha
        # Equal-tempered note frequencies from G1 (49.00 Hz) to C6 (1046.50 Hz),
        # used by autotune_f0 as the grid of pitches to snap to.
        self.ref_freqs = [
            49.00, 51.91, 55.00, 58.27, 61.74, 65.41, 69.30, 73.42,
            77.78, 82.41, 87.31, 92.50, 98.00, 103.83, 110.00, 116.54,
            123.47, 130.81, 138.59, 146.83, 155.56, 164.81, 174.61, 185.00,
            196.00, 207.65, 220.00, 233.08, 246.94, 261.63, 277.18, 293.66,
            311.13, 329.63, 349.23, 369.99, 392.00, 415.30, 440.00, 466.16,
            493.88, 523.25, 554.37, 587.33, 622.25, 659.25, 698.46, 739.99,
            783.99, 830.61, 880.00, 932.33, 987.77, 1046.50
        ]

    def calculator(
        self,
        x_pad,
        f0_method,
        x,
        f0_up_key = 0,
        p_len = None,
        filter_radius = 3,
        f0_autotune = False,
        f0_autotune_strength = 1,
        manual_f0 = None,
        proposal_pitch = False,
        proposal_pitch_threshold = 255.0
    ):
        if p_len is None: p_len = x.shape[0] // self.window
        if "hybrid" in f0_method: logger.debug(translations["hybrid_calc"].format(f0_method=f0_method))

        compute_fn = (
            self.get_f0_hybrid if "hybrid" in f0_method else self.compute_f0
        )

        # Median/mean filters need an odd kernel, so even radii are bumped by one.
        f0 = compute_fn(
            f0_method,
            x,
            p_len,
            filter_radius if filter_radius % 2 != 0 else filter_radius + 1
        )

        if proposal_pitch:
            up_key = proposal_f0_up_key(
                f0,
                proposal_pitch_threshold,
                configs["limit_f0"]
            )

            logger.debug(translations["proposal_f0"].format(up_key=up_key))
            f0_up_key += up_key

        if f0_autotune:
            logger.debug(translations["startautotune"])

            f0 = autotune_f0(
                self.ref_freqs,
                f0,
                f0_autotune_strength
            )

        return post_process(
            self.sample_rate // self.window,
            f0,
            f0_up_key,
            x_pad,
            1127 * math.log(1 + self.f0_min / 700),
            1127 * math.log(1 + self.f0_max / 700),
            manual_f0
        )
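
    # With the defaults above, post_process receives
    # tf0 = sample_rate // window = 16000 // 160 = 100 frames per second,
    # which matches the factor of 100 applied to the manual_f0 time column.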

    def realtime_calculator(
        self,
        audio,
        f0_method,
        pitch,
        pitchf,
        f0_up_key = 0,
        filter_radius = 3,
        f0_autotune = False,
        f0_autotune_strength = 1,
        proposal_pitch = False,
        proposal_pitch_threshold = 255.0
    ):
        if torch.is_tensor(audio): audio = audio.cpu().numpy()
        p_len = audio.shape[0] // self.window

        f0 = self.compute_f0(
            f0_method,
            audio,
            p_len,
            filter_radius if filter_radius % 2 != 0 else filter_radius + 1
        )

        if f0_autotune:
            f0 = autotune_f0(
                self.ref_freqs,
                f0,
                f0_autotune_strength
            )

        if proposal_pitch:
            up_key = proposal_f0_up_key(
                f0,
                proposal_pitch_threshold,
                configs["limit_f0"]
            )

            f0_up_key += up_key

        # Pass mel-scale bounds so the realtime quantization matches the
        # offline path in calculator/post_process (passing the raw Hz bounds
        # here would shift the coarse bins slightly).
        return realtime_post_process(
            torch.from_numpy(f0).float().to(self.device),
            pitch,
            pitchf,
            f0_up_key,
            1127 * math.log(1 + self.f0_min / 700),
            1127 * math.log(1 + self.f0_max / 700)
        )

    def _resize_f0(self, x, target_len):
        if len(x) == target_len: return x

        # Treat near-zero (unvoiced) samples as gaps, linearly interpolate the
        # contour onto the target grid, then map the gaps back to 0.
        source = np.array(x)
        source[source < 0.001] = np.nan

        return np.nan_to_num(
            np.interp(
                np.arange(0, len(source) * target_len, len(source)) / target_len,
                np.arange(0, len(source)),
                source
            )
        )

    def compute_f0(self, f0_method, x, p_len, filter_radius):
        if "pm" in f0_method:
            f0 = self.get_f0_pm(
                x,
                p_len,
                filter_radius=filter_radius,
                mode=f0_method.split("-")[1]
            )
        elif f0_method.split("-")[0] in ["harvest", "dio"]:
            f0 = self.get_f0_pyworld(
                x,
                p_len,
                filter_radius,
                f0_method.split("-")[0],
                use_stonemask="stonemask" in f0_method
            )
        elif "crepe" in f0_method:
            split_f0 = f0_method.split("-")
            f0 = (
                self.get_f0_mangio_crepe(
                    x,
                    p_len,
                    split_f0[2]
                )
            ) if split_f0[0] == "mangio" else (
                self.get_f0_crepe(
                    x,
                    p_len,
                    split_f0[1],
                    filter_radius=filter_radius
                )
            )
        elif "fcpe" in f0_method:
            f0 = self.get_f0_fcpe(
                x,
                p_len,
                legacy="legacy" in f0_method and "previous" not in f0_method,
                previous="previous" in f0_method,
                filter_radius=filter_radius
            )
        elif "rmvpe" in f0_method:
            f0 = self.get_f0_rmvpe(
                x,
                p_len,
                clipping="clipping" in f0_method,
                filter_radius=filter_radius,
                hpa="hpa" in f0_method,
                previous="previous" in f0_method
            )
        elif f0_method in ["yin", "pyin", "piptrack"]:
            f0 = self.get_f0_librosa(
                x,
                p_len,
                mode=f0_method,
                filter_radius=filter_radius
            )
        elif "swipe" in f0_method:
            f0 = self.get_f0_swipe(
                x,
                p_len,
                filter_radius=filter_radius,
                use_stonemask="stonemask" in f0_method
            )
        elif "penn" in f0_method:
            f0 = (
                self.get_f0_mangio_penn(
                    x,
                    p_len
                )
            ) if f0_method.split("-")[0] == "mangio" else (
                self.get_f0_penn(
                    x,
                    p_len,
                    filter_radius=filter_radius
                )
            )
        elif "djcm" in f0_method:
            f0 = self.get_f0_djcm(
                x,
                p_len,
                clipping="clipping" in f0_method,
                svs="svs" in f0_method,
                filter_radius=filter_radius
            )
        elif "pesto" in f0_method:
            f0 = self.get_f0_pesto(
                x,
                p_len
            )
        elif "swift" in f0_method:
            f0 = self.get_f0_swift(
                x,
                p_len,
                filter_radius=filter_radius
            )
        else:
            raise ValueError(translations["option_not_valid"])

        if isinstance(f0, tuple): f0 = f0[0]
        if "medfilt" in f0_method or "svs" in f0_method: f0 = medfilt(f0, kernel_size=5)

        return f0
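
    # Method strings are matched by substring, so valid examples (reading
    # from the dispatch above) include "pm-ac", "pm-cc", "pm-shs", "harvest",
    # "harvest-stonemask", "dio", "mangio-crepe-full", "crepe-full", "fcpe",
    # "fcpe-legacy", "rmvpe", "rmvpe-clipping", "yin", "pyin", "piptrack",
    # "swipe", "penn", "mangio-penn", "djcm", "djcm-svs", "pesto" and
    # "swift"; a "medfilt" suffix requests the extra 5-tap median filter,
    # and "hybrid[a+b]" strings are handled by get_f0_hybrid below.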

    def get_f0_hybrid(self, methods_str, x, p_len, filter_radius):
        match = re.search(r"hybrid\[(.+)\]", methods_str)
        if not match: raise ValueError(translations["option_not_valid"])

        methods = [
            method.strip()
            for method in match.group(1).split("+")
        ]

        n = len(methods)
        f0_stack = []

        for method in methods:
            f0_stack.append(
                self._resize_f0(
                    self.compute_f0(
                        method,
                        x,
                        p_len,
                        filter_radius
                    ),
                    p_len
                )
            )

        f0_mix = np.zeros(p_len)

        if not f0_stack: return f0_mix
        if len(f0_stack) == 1: return f0_stack[0]

        # Squared-triangular weights peaking at position (1 - alpha) along the
        # method list, normalized to sum to 1.
        weights = (1 - np.abs(np.arange(n) / (n - 1) - (1 - self.alpha))) ** 2
        weights /= weights.sum()

        stacked = np.vstack(f0_stack)
        voiced_mask = np.any(stacked > 0, axis=0)

        # Blend with a weighted geometric mean on voiced frames; the 1e-6
        # floor keeps log() finite where a method reports 0 Hz.
        f0_mix[voiced_mask] = np.exp(
            np.nansum(
                np.log(stacked + 1e-6) * weights[:, None], axis=0
            )[voiced_mask]
        )

        return f0_mix
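
    # Worked example of the blend weights: with n = 3 methods and the default
    # alpha = 0.5, the positions are [0, 0.5, 1], so the raw weights are
    # [(1 - 0.5)^2, 1^2, (1 - 0.5)^2] = [0.25, 1.0, 0.25]; normalized they
    # become [1/6, 2/3, 1/6], i.e. the middle method dominates the mean.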

    def get_f0_pm(self, x, p_len, filter_radius=3, mode="ac"):
        time_step = self.window / self.sample_rate  # hop size in seconds

        pm = parselmouth.Sound(
            x,
            self.sample_rate
        )
        pm_fn = {
            "ac": pm.to_pitch_ac,
            "cc": pm.to_pitch_cc,
            "shs": pm.to_pitch_shs
        }.get(mode, pm.to_pitch_ac)

        pitch = (
            pm_fn(
                time_step=time_step,
                voicing_threshold=filter_radius / 10 * 2,
                pitch_floor=self.f0_min,
                pitch_ceiling=self.f0_max
            )
        ) if mode != "shs" else (
            pm_fn(
                time_step=time_step,
                minimum_pitch=self.f0_min,
                maximum_frequency_component=self.f0_max
            )
        )

        f0 = pitch.selected_array["frequency"]
        pad_size = (p_len - len(f0) + 1) // 2

        # Praat drops frames at the edges; pad symmetrically back to p_len.
        if pad_size > 0 or p_len - len(f0) - pad_size > 0:
            f0 = np.pad(
                f0,
                [[pad_size, p_len - len(f0) - pad_size]],
                mode="constant"
            )

        return f0

    def get_f0_mangio_crepe(self, x, p_len, model="full"):
        if not hasattr(self, "mangio_crepe"):
            from main.library.predictors.CREPE.CREPE import CREPE

            self.mangio_crepe = CREPE(
                os.path.join(
                    configs["predictors_path"],
                    f"crepe_{model}.{'onnx' if self.predictor_onnx else 'pth'}"
                ),
                model_size=model,
                hop_length=self.hop_length,
                batch_size=self.hop_length * 2,
                f0_min=self.f0_min,
                f0_max=self.f0_max,
                device=self.device,
                sample_rate=self.sample_rate,
                providers=self.providers,
                onnx=self.predictor_onnx,
                return_periodicity=False
            )

        x = x.astype(np.float32)
        x /= np.quantile(np.abs(x), 0.999)

        audio = torch.from_numpy(x).to(self.device, copy=True).unsqueeze(dim=0)
        if audio.ndim == 2 and audio.shape[0] > 1: audio = audio.mean(dim=0, keepdim=True).detach()

        f0 = self.mangio_crepe.compute_f0(audio.detach(), pad=True)
        if self.predictor_onnx and self.delete_predictor_onnx: del self.mangio_crepe.model, self.mangio_crepe

        return self._resize_f0(f0.squeeze(0).cpu().float().numpy(), p_len)

    def get_f0_crepe(self, x, p_len, model="full", filter_radius=3):
        if not hasattr(self, "crepe"):
            from main.library.predictors.CREPE.CREPE import CREPE

            self.crepe = CREPE(
                os.path.join(
                    configs["predictors_path"],
                    f"crepe_{model}.{'onnx' if self.predictor_onnx else 'pth'}"
                ),
                model_size=model,
                hop_length=self.window,
                batch_size=self.batch_size,
                f0_min=self.f0_min,
                f0_max=self.f0_max,
                device=self.device,
                sample_rate=self.sample_rate,
                providers=self.providers,
                onnx=self.predictor_onnx,
                return_periodicity=True
            )

        f0, pd = self.crepe.compute_f0(torch.tensor(np.copy(x))[None].float(), pad=True)
        if self.predictor_onnx and self.delete_predictor_onnx: del self.crepe.model, self.crepe

        f0, pd = mean(f0, filter_radius), median(pd, filter_radius)
        f0[pd < 0.1] = 0

        return self._resize_f0(f0[0].cpu().numpy(), p_len)

    def get_f0_fcpe(self, x, p_len, legacy=False, previous=False, filter_radius=3):
        if not hasattr(self, "fcpe"):
            from main.library.predictors.FCPE.FCPE import FCPE

            self.fcpe = FCPE(
                configs,
                os.path.join(
                    configs["predictors_path"],
                    (
                        "fcpe_legacy"
                        if legacy else
                        ("fcpe" if previous else "ddsp_200k")
                    ) + (".onnx" if self.predictor_onnx else ".pt")
                ),
                hop_length=self.hop_length,
                f0_min=self.f0_min,
                f0_max=self.f0_max,
                dtype=torch.float32,
                device=self.device,
                sample_rate=self.sample_rate,
                threshold=(
                    filter_radius / 100
                ) if legacy else (
                    filter_radius / 1000 * 2
                ),
                providers=self.providers,
                onnx=self.predictor_onnx,
                legacy=legacy
            )

        f0 = self.fcpe.compute_f0(x, p_len)
        if self.predictor_onnx and self.delete_predictor_onnx: del self.fcpe.fcpe.model, self.fcpe

        return f0

    def get_f0_rmvpe(self, x, p_len, clipping=False, filter_radius=3, hpa=False, previous=False):
        if not hasattr(self, "rmvpe"):
            from main.library.predictors.RMVPE.RMVPE import RMVPE

            self.rmvpe = RMVPE(
                os.path.join(
                    configs["predictors_path"],
                    (
                        (
                            "hpa-rmvpe-76000"
                            if previous else
                            "hpa-rmvpe-112000"
                        ) if hpa else "rmvpe"
                    ) + (".onnx" if self.predictor_onnx else ".pt")
                ),
                is_half=self.is_half,
                device=self.device,
                onnx=self.predictor_onnx,
                providers=self.providers,
                hpa=hpa
            )

        filter_radius = filter_radius / 100

        f0 = (
            self.rmvpe.infer_from_audio_with_pitch(
                x,
                thred=filter_radius,
                f0_min=self.f0_min,
                f0_max=self.f0_max
            )
        ) if clipping else (
            self.rmvpe.infer_from_audio(
                x,
                thred=filter_radius
            )
        )

        if self.predictor_onnx and self.delete_predictor_onnx: del self.rmvpe.model, self.rmvpe
        return self._resize_f0(f0, p_len)

    def get_f0_pyworld(self, x, p_len, filter_radius, model="harvest", use_stonemask=True):
        if not hasattr(self, "pw"):
            from main.library.predictors.WORLD.WORLD import PYWORLD

            self.pw = PYWORLD(
                os.path.join(configs["predictors_path"], "world"),
                os.path.join(configs["binary_path"], "world.bin")
            )

        x = x.astype(np.double)
        pw_fn = self.pw.harvest if model == "harvest" else self.pw.dio

        f0, t = pw_fn(
            x,
            fs=self.sample_rate,
            f0_ceil=self.f0_max,
            f0_floor=self.f0_min,
            frame_period=1000 * self.window / self.sample_rate
        )

        if use_stonemask:
            f0 = self.pw.stonemask(
                x,
                self.sample_rate,
                t,
                f0
            )

        if filter_radius > 2 and model == "harvest": f0 = medfilt(f0, filter_radius)
        elif model == "dio": f0 = np.round(f0, 1)

        return self._resize_f0(f0, p_len)

    def get_f0_swipe(self, x, p_len, filter_radius=3, use_stonemask=True):
        f0, t = swipe(
            x.astype(np.float32),
            self.sample_rate,
            f0_floor=self.f0_min,
            f0_ceil=self.f0_max,
            frame_period=1000 * self.window / self.sample_rate,
            sTHR=filter_radius / 10
        )

        if use_stonemask:
            f0 = stonemask(
                x,
                self.sample_rate,
                t,
                f0
            )

        return self._resize_f0(f0, p_len)

    def get_f0_librosa(self, x, p_len, mode="yin", filter_radius=3):
        if mode != "piptrack":
            is_yin = mode == "yin"
            f0_fn = yin if is_yin else pyin

            f0 = f0_fn(
                x.astype(np.float32),
                sr=self.sample_rate,
                fmin=self.f0_min,
                fmax=self.f0_max,
                hop_length=self.hop_length
            )

            # pyin returns (f0, voiced_flag, voiced_probabilities); keep f0.
            if not is_yin: f0 = f0[0]
        else:
            pitches, magnitudes = piptrack(
                y=x.astype(np.float32),
                sr=self.sample_rate,
                fmin=self.f0_min,
                fmax=self.f0_max,
                hop_length=self.hop_length,
                threshold=filter_radius / 10
            )

            # Keep, per frame, the pitch of the strongest magnitude bin.
            max_indexes = np.argmax(magnitudes, axis=0)
            f0 = pitches[max_indexes, range(magnitudes.shape[1])]

        return self._resize_f0(f0, p_len)

    def get_f0_penn(self, x, p_len, filter_radius=3):
        if not hasattr(self, "penn"):
            from main.library.predictors.PENN.PENN import PENN

            self.penn = PENN(
                os.path.join(
                    configs["predictors_path"],
                    f"fcn.{'onnx' if self.predictor_onnx else 'pt'}"
                ),
                hop_length=self.window // 2,
                batch_size=self.batch_size // 2,
                f0_min=self.f0_min,
                f0_max=self.f0_max,
                sample_rate=self.sample_rate,
                device=self.device,
                providers=self.providers,
                onnx=self.predictor_onnx,
            )

        f0, pd = self.penn.compute_f0(torch.tensor(np.copy(x))[None].float())

        if self.predictor_onnx and self.delete_predictor_onnx:
            del self.penn.model, self.penn.decoder
            del self.penn.resample_audio, self.penn

        f0, pd = mean(f0, filter_radius), median(pd, filter_radius)
        f0[pd < 0.1] = 0

        return self._resize_f0(f0[0].cpu().numpy(), p_len)

    def get_f0_mangio_penn(self, x, p_len):
        if not hasattr(self, "mangio_penn"):
            from main.library.predictors.PENN.PENN import PENN

            self.mangio_penn = PENN(
                os.path.join(
                    configs["predictors_path"],
                    f"fcn.{'onnx' if self.predictor_onnx else 'pt'}"
                ),
                hop_length=self.hop_length // 2,
                batch_size=self.hop_length,
                f0_min=self.f0_min,
                f0_max=self.f0_max,
                sample_rate=self.sample_rate,
                device=self.device,
                providers=self.providers,
                onnx=self.predictor_onnx,
                interp_unvoiced_at=0.1
            )

        x = x.astype(np.float32)
        x /= np.quantile(np.abs(x), 0.999)

        audio = torch.from_numpy(x).to(self.device, copy=True).unsqueeze(dim=0)
        if audio.ndim == 2 and audio.shape[0] > 1: audio = audio.mean(dim=0, keepdim=True).detach()

        f0 = self.mangio_penn.compute_f0(audio.detach())

        if self.predictor_onnx and self.delete_predictor_onnx:
            del self.mangio_penn.model, self.mangio_penn.decoder
            del self.mangio_penn.resample_audio, self.mangio_penn

        return self._resize_f0(f0.squeeze(0).cpu().float().numpy(), p_len)

    def get_f0_djcm(self, x, p_len, clipping=False, svs=False, filter_radius=3):
        if not hasattr(self, "djcm"):
            from main.library.predictors.DJCM.DJCM import DJCM

            self.djcm = DJCM(
                os.path.join(
                    configs["predictors_path"],
                    (
                        "djcm-svs"
                        if svs else
                        "djcm"
                    ) + (".onnx" if self.predictor_onnx else ".pt")
                ),
                is_half=self.is_half,
                device=self.device,
                onnx=self.predictor_onnx,
                svs=svs,
                providers=self.providers
            )

        filter_radius /= 10

        f0 = (
            self.djcm.infer_from_audio_with_pitch(
                x,
                thred=filter_radius,
                f0_min=self.f0_min,
                f0_max=self.f0_max
            )
        ) if clipping else (
            self.djcm.infer_from_audio(
                x,
                thred=filter_radius
            )
        )

        if self.predictor_onnx and self.delete_predictor_onnx: del self.djcm.model, self.djcm
        return self._resize_f0(f0, p_len)

    def get_f0_swift(self, x, p_len, filter_radius=3):
        if not hasattr(self, "swift"):
            from main.library.predictors.SWIFT.SWIFT import SWIFT

            self.swift = SWIFT(
                os.path.join(
                    configs["predictors_path"],
                    "swift.onnx"
                ),
                fmin=self.f0_min,
                fmax=self.f0_max,
                confidence_threshold=filter_radius / 4 + 0.137
            )

        pitch_hz, _, _ = self.swift.detect_from_array(x, self.sample_rate)
        return self._resize_f0(pitch_hz, p_len)

    def get_f0_pesto(self, x, p_len):
        if not hasattr(self, "pesto"):
            from main.library.predictors.PESTO.PESTO import PESTO

            self.pesto = PESTO(
                os.path.join(
                    configs["predictors_path"],
                    f"pesto.{'onnx' if self.predictor_onnx else 'pt'}"
                ),
                step_size=1000 * self.window / self.sample_rate,
                reduction="alwa",
                num_chunks=1,
                sample_rate=self.sample_rate,
                device=self.device,
                providers=self.providers,
                onnx=self.predictor_onnx
            )

        x = x.astype(np.float32)
        x /= np.quantile(np.abs(x), 0.999)

        audio = torch.from_numpy(x).to(self.device, copy=True).unsqueeze(dim=0)
        if audio.ndim == 2 and audio.shape[0] > 1: audio = audio.mean(dim=0, keepdim=True).detach()

        f0 = self.pesto.compute_f0(audio.detach())[0]
        if self.predictor_onnx and self.delete_predictor_onnx: del self.pesto.model, self.pesto

        return self._resize_f0(f0.squeeze(0).cpu().float().numpy(), p_len)
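
A minimal usage sketch of the offline path, assuming the surrounding project layout (configs, translations, predictor checkpoints) is installed; the method string and transpose value are illustrative only:

    import numpy as np
    from infer.lib.predictors.Generator import Generator

    generator = Generator(sample_rate=16000, hop_length=160, device="cpu")
    audio = np.random.randn(16000 * 2).astype(np.float32)  # 2 s placeholder signal

    # pitch: coarse mel bins 1..255, pitchf: continuous f0 in Hz,
    # both transposed up 2 semitones by f0_up_key.
    pitch, pitchf = generator.calculator(x_pad=1, f0_method="rmvpe", x=audio, f0_up_key=2)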