Update infer/lib/predictors/Generator.py
Browse files- infer/lib/predictors/Generator.py +14 -202
infer/lib/predictors/Generator.py
CHANGED
|
@@ -13,10 +13,10 @@ from librosa import yin, pyin, piptrack
|
|
| 13 |
|
| 14 |
sys.path.append(os.getcwd())
|
| 15 |
|
| 16 |
-
from
|
| 17 |
-
from
|
| 18 |
-
from
|
| 19 |
-
from
|
| 20 |
|
| 21 |
@nb.jit(nopython=True)
|
| 22 |
def post_process(
|
|
@@ -338,26 +338,7 @@ class Generator:
|
|
| 338 |
mode=f0_method,
|
| 339 |
filter_radius=filter_radius
|
| 340 |
)
|
| 341 |
-
|
| 342 |
-
f0 = self.get_f0_swipe(
|
| 343 |
-
x,
|
| 344 |
-
p_len,
|
| 345 |
-
filter_radius=filter_radius,
|
| 346 |
-
use_stonemask="stonemask" in f0_method
|
| 347 |
-
)
|
| 348 |
-
elif "penn" in f0_method:
|
| 349 |
-
f0 = (
|
| 350 |
-
self.get_f0_mangio_penn(
|
| 351 |
-
x,
|
| 352 |
-
p_len
|
| 353 |
-
)
|
| 354 |
-
) if f0_method.split("-")[0] == "mangio" else (
|
| 355 |
-
self.get_f0_penn(
|
| 356 |
-
x,
|
| 357 |
-
p_len,
|
| 358 |
-
filter_radius=filter_radius
|
| 359 |
-
)
|
| 360 |
-
)
|
| 361 |
elif "djcm" in f0_method:
|
| 362 |
f0 = self.get_f0_djcm(
|
| 363 |
x,
|
|
@@ -366,17 +347,7 @@ class Generator:
|
|
| 366 |
svs="svs" in f0_method,
|
| 367 |
filter_radius=filter_radius
|
| 368 |
)
|
| 369 |
-
|
| 370 |
-
f0 = self.get_f0_pesto(
|
| 371 |
-
x,
|
| 372 |
-
p_len
|
| 373 |
-
)
|
| 374 |
-
elif "swift" in f0_method:
|
| 375 |
-
f0 = self.get_f0_swift(
|
| 376 |
-
x,
|
| 377 |
-
p_len,
|
| 378 |
-
filter_radius=filter_radius
|
| 379 |
-
)
|
| 380 |
else:
|
| 381 |
raise ValueError(translations["option_not_valid"])
|
| 382 |
|
|
@@ -470,7 +441,7 @@ class Generator:
|
|
| 470 |
|
| 471 |
def get_f0_mangio_crepe(self, x, p_len, model="full"):
|
| 472 |
if not hasattr(self, "mangio_crepe"):
|
| 473 |
-
from
|
| 474 |
|
| 475 |
self.mangio_crepe = CREPE(
|
| 476 |
os.path.join(
|
|
@@ -502,7 +473,7 @@ class Generator:
|
|
| 502 |
|
| 503 |
def get_f0_crepe(self, x, p_len, model="full", filter_radius=3):
|
| 504 |
if not hasattr(self, "crepe"):
|
| 505 |
-
from
|
| 506 |
|
| 507 |
self.crepe = CREPE(
|
| 508 |
os.path.join(
|
|
@@ -531,7 +502,7 @@ class Generator:
|
|
| 531 |
|
| 532 |
def get_f0_fcpe(self, x, p_len, legacy=False, previous=False, filter_radius=3):
|
| 533 |
if not hasattr(self, "fcpe"):
|
| 534 |
-
from
|
| 535 |
|
| 536 |
self.fcpe = FCPE(
|
| 537 |
configs,
|
|
@@ -566,7 +537,7 @@ class Generator:
|
|
| 566 |
|
| 567 |
def get_f0_rmvpe(self, x, p_len, clipping=False, filter_radius=3, hpa=False, previous=False):
|
| 568 |
if not hasattr(self, "rmvpe"):
|
| 569 |
-
from
|
| 570 |
|
| 571 |
self.rmvpe = RMVPE(
|
| 572 |
os.path.join(
|
|
@@ -605,60 +576,6 @@ class Generator:
|
|
| 605 |
if self.predictor_onnx and self.delete_predictor_onnx: del self.rmvpe.model, self.rmvpe
|
| 606 |
return self._resize_f0(f0, p_len)
|
| 607 |
|
| 608 |
-
def get_f0_pyworld(self, x, p_len, filter_radius, model="harvest", use_stonemask=True):
|
| 609 |
-
if not hasattr(self, "pw"):
|
| 610 |
-
from main.library.predictors.WORLD.WORLD import PYWORLD
|
| 611 |
-
|
| 612 |
-
self.pw = PYWORLD(
|
| 613 |
-
os.path.join(configs["predictors_path"], "world"),
|
| 614 |
-
os.path.join(configs["binary_path"], "world.bin")
|
| 615 |
-
)
|
| 616 |
-
|
| 617 |
-
x = x.astype(np.double)
|
| 618 |
-
pw_fn = self.pw.harvest if model == "harvest" else self.pw.dio
|
| 619 |
-
|
| 620 |
-
f0, t = pw_fn(
|
| 621 |
-
x,
|
| 622 |
-
fs=self.sample_rate,
|
| 623 |
-
f0_ceil=self.f0_max,
|
| 624 |
-
f0_floor=self.f0_min,
|
| 625 |
-
frame_period=1000 * self.window / self.sample_rate
|
| 626 |
-
)
|
| 627 |
-
|
| 628 |
-
if use_stonemask:
|
| 629 |
-
f0 = self.pw.stonemask(
|
| 630 |
-
x,
|
| 631 |
-
self.sample_rate,
|
| 632 |
-
t,
|
| 633 |
-
f0
|
| 634 |
-
)
|
| 635 |
-
|
| 636 |
-
if filter_radius > 2 and model == "harvest": f0 = medfilt(f0, filter_radius)
|
| 637 |
-
elif model == "dio":
|
| 638 |
-
for index, pitch in enumerate(f0):
|
| 639 |
-
f0[index] = round(pitch, 1)
|
| 640 |
-
|
| 641 |
-
return self._resize_f0(f0, p_len)
|
| 642 |
-
|
| 643 |
-
def get_f0_swipe(self, x, p_len, filter_radius=3, use_stonemask=True):
|
| 644 |
-
f0, t = swipe(
|
| 645 |
-
x.astype(np.float32),
|
| 646 |
-
self.sample_rate,
|
| 647 |
-
f0_floor=self.f0_min,
|
| 648 |
-
f0_ceil=self.f0_max,
|
| 649 |
-
frame_period=1000 * self.window / self.sample_rate,
|
| 650 |
-
sTHR=filter_radius / 10
|
| 651 |
-
)
|
| 652 |
-
|
| 653 |
-
if use_stonemask:
|
| 654 |
-
f0 = stonemask(
|
| 655 |
-
x,
|
| 656 |
-
self.sample_rate,
|
| 657 |
-
t,
|
| 658 |
-
f0
|
| 659 |
-
)
|
| 660 |
-
|
| 661 |
-
return self._resize_f0(f0, p_len)
|
| 662 |
|
| 663 |
def get_f0_librosa(self, x, p_len, mode="yin", filter_radius=3):
|
| 664 |
if mode != "piptrack":
|
|
@@ -689,70 +606,8 @@ class Generator:
|
|
| 689 |
|
| 690 |
return self._resize_f0(f0, p_len)
|
| 691 |
|
| 692 |
-
|
| 693 |
-
|
| 694 |
-
from main.library.predictors.PENN.PENN import PENN
|
| 695 |
-
|
| 696 |
-
self.penn = PENN(
|
| 697 |
-
os.path.join(
|
| 698 |
-
configs["predictors_path"],
|
| 699 |
-
f"fcn.{'onnx' if self.predictor_onnx else 'pt'}"
|
| 700 |
-
),
|
| 701 |
-
hop_length=self.window // 2,
|
| 702 |
-
batch_size=self.batch_size // 2,
|
| 703 |
-
f0_min=self.f0_min,
|
| 704 |
-
f0_max=self.f0_max,
|
| 705 |
-
sample_rate=self.sample_rate,
|
| 706 |
-
device=self.device,
|
| 707 |
-
providers=self.providers,
|
| 708 |
-
onnx=self.predictor_onnx,
|
| 709 |
-
)
|
| 710 |
-
|
| 711 |
-
f0, pd = self.penn.compute_f0(torch.tensor(np.copy((x)))[None].float())
|
| 712 |
-
|
| 713 |
-
if self.predictor_onnx and self.delete_predictor_onnx:
|
| 714 |
-
del self.penn.model, self.penn.decoder
|
| 715 |
-
del self.penn.resample_audio, self.penn
|
| 716 |
-
|
| 717 |
-
f0, pd = mean(f0, filter_radius), median(pd, filter_radius)
|
| 718 |
-
f0[pd < 0.1] = 0
|
| 719 |
-
|
| 720 |
-
return self._resize_f0(f0[0].cpu().numpy(), p_len)
|
| 721 |
-
|
| 722 |
-
def get_f0_mangio_penn(self, x, p_len):
|
| 723 |
-
if not hasattr(self, "mangio_penn"):
|
| 724 |
-
from main.library.predictors.PENN.PENN import PENN
|
| 725 |
-
|
| 726 |
-
self.mangio_penn = PENN(
|
| 727 |
-
os.path.join(
|
| 728 |
-
configs["predictors_path"],
|
| 729 |
-
f"fcn.{'onnx' if self.predictor_onnx else 'pt'}"
|
| 730 |
-
),
|
| 731 |
-
hop_length=self.hop_length // 2,
|
| 732 |
-
batch_size=self.hop_length,
|
| 733 |
-
f0_min=self.f0_min,
|
| 734 |
-
f0_max=self.f0_max,
|
| 735 |
-
sample_rate=self.sample_rate,
|
| 736 |
-
device=self.device,
|
| 737 |
-
providers=self.providers,
|
| 738 |
-
onnx=self.predictor_onnx,
|
| 739 |
-
interp_unvoiced_at=0.1
|
| 740 |
-
)
|
| 741 |
-
|
| 742 |
-
x = x.astype(np.float32)
|
| 743 |
-
x /= np.quantile(np.abs(x), 0.999)
|
| 744 |
-
|
| 745 |
-
audio = torch.from_numpy(x).to(self.device, copy=True).unsqueeze(dim=0)
|
| 746 |
-
if audio.ndim == 2 and audio.shape[0] > 1: audio = audio.mean(dim=0, keepdim=True).detach()
|
| 747 |
-
|
| 748 |
-
f0 = self.mangio_penn.compute_f0(audio.detach())
|
| 749 |
-
|
| 750 |
-
if self.predictor_onnx and self.delete_predictor_onnx:
|
| 751 |
-
del self.mangio_penn.model, self.mangio_penn.decoder
|
| 752 |
-
del self.mangio_penn.resample_audio, self.mangio_penn
|
| 753 |
-
|
| 754 |
-
return self._resize_f0(f0.squeeze(0).cpu().float().numpy(), p_len)
|
| 755 |
-
|
| 756 |
def get_f0_djcm(self, x, p_len, clipping=False, svs=False, filter_radius=3):
|
| 757 |
if not hasattr(self, "djcm"):
|
| 758 |
from main.library.predictors.DJCM.DJCM import DJCM
|
|
@@ -792,48 +647,5 @@ class Generator:
|
|
| 792 |
if self.predictor_onnx and self.delete_predictor_onnx: del self.djcm.model, self.djcm
|
| 793 |
return self._resize_f0(f0, p_len)
|
| 794 |
|
| 795 |
-
|
| 796 |
-
|
| 797 |
-
from main.library.predictors.SWIFT.SWIFT import SWIFT
|
| 798 |
-
|
| 799 |
-
self.swift = SWIFT(
|
| 800 |
-
os.path.join(
|
| 801 |
-
configs["predictors_path"],
|
| 802 |
-
"swift.onnx"
|
| 803 |
-
),
|
| 804 |
-
fmin=self.f0_min,
|
| 805 |
-
fmax=self.f0_max,
|
| 806 |
-
confidence_threshold=filter_radius / 4 + 0.137
|
| 807 |
-
)
|
| 808 |
-
|
| 809 |
-
pitch_hz, _, _ = self.swift.detect_from_array(x, self.sample_rate)
|
| 810 |
-
return self._resize_f0(pitch_hz, p_len)
|
| 811 |
-
|
| 812 |
-
def get_f0_pesto(self, x, p_len):
|
| 813 |
-
if not hasattr(self, "pesto"):
|
| 814 |
-
from main.library.predictors.PESTO.PESTO import PESTO
|
| 815 |
-
|
| 816 |
-
self.pesto = PESTO(
|
| 817 |
-
os.path.join(
|
| 818 |
-
configs["predictors_path"],
|
| 819 |
-
f"pesto.{'onnx' if self.predictor_onnx else 'pt'}"
|
| 820 |
-
),
|
| 821 |
-
step_size=1000 * self.window / self.sample_rate,
|
| 822 |
-
reduction = "alwa",
|
| 823 |
-
num_chunks=1,
|
| 824 |
-
sample_rate=self.sample_rate,
|
| 825 |
-
device=self.device,
|
| 826 |
-
providers=self.providers,
|
| 827 |
-
onnx=self.predictor_onnx
|
| 828 |
-
)
|
| 829 |
-
|
| 830 |
-
x = x.astype(np.float32)
|
| 831 |
-
x /= np.quantile(np.abs(x), 0.999)
|
| 832 |
-
|
| 833 |
-
audio = torch.from_numpy(x).to(self.device, copy=True).unsqueeze(dim=0)
|
| 834 |
-
if audio.ndim == 2 and audio.shape[0] > 1: audio = audio.mean(dim=0, keepdim=True).detach()
|
| 835 |
-
|
| 836 |
-
f0 = self.pesto.compute_f0(audio.detach())[0]
|
| 837 |
-
if self.predictor_onnx and self.delete_predictor_onnx: del self.pesto.model, self.pesto
|
| 838 |
-
|
| 839 |
-
return self._resize_f0(f0.squeeze(0).cpu().float().numpy(), p_len)
|
|
|
|
| 13 |
|
| 14 |
sys.path.append(os.getcwd())
|
| 15 |
|
| 16 |
+
from infer.lib.predictors.CREPE.filter import mean, median
|
| 17 |
+
from infer.lib.predictors.WORLD.SWIPE import swipe, stonemask
|
| 18 |
+
from infer.lib.variables import config, configs, logger, translations
|
| 19 |
+
from infer.lib.utils import autotune_f0, proposal_f0_up_key, circular_write
|
| 20 |
|
| 21 |
@nb.jit(nopython=True)
|
| 22 |
def post_process(
|
|
|
|
| 338 |
mode=f0_method,
|
| 339 |
filter_radius=filter_radius
|
| 340 |
)
|
| 341 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 342 |
elif "djcm" in f0_method:
|
| 343 |
f0 = self.get_f0_djcm(
|
| 344 |
x,
|
|
|
|
| 347 |
svs="svs" in f0_method,
|
| 348 |
filter_radius=filter_radius
|
| 349 |
)
|
| 350 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 351 |
else:
|
| 352 |
raise ValueError(translations["option_not_valid"])
|
| 353 |
|
|
|
|
| 441 |
|
| 442 |
def get_f0_mangio_crepe(self, x, p_len, model="full"):
|
| 443 |
if not hasattr(self, "mangio_crepe"):
|
| 444 |
+
from infer.lib.predictors.CREPE.CREPE import CREPE
|
| 445 |
|
| 446 |
self.mangio_crepe = CREPE(
|
| 447 |
os.path.join(
|
|
|
|
| 473 |
|
| 474 |
def get_f0_crepe(self, x, p_len, model="full", filter_radius=3):
|
| 475 |
if not hasattr(self, "crepe"):
|
| 476 |
+
from infer.lib.predictors.CREPE.CREPE import CREPE
|
| 477 |
|
| 478 |
self.crepe = CREPE(
|
| 479 |
os.path.join(
|
|
|
|
| 502 |
|
| 503 |
def get_f0_fcpe(self, x, p_len, legacy=False, previous=False, filter_radius=3):
|
| 504 |
if not hasattr(self, "fcpe"):
|
| 505 |
+
from infer.lib.predictors.FCPE.FCPE import FCPE
|
| 506 |
|
| 507 |
self.fcpe = FCPE(
|
| 508 |
configs,
|
|
|
|
| 537 |
|
| 538 |
def get_f0_rmvpe(self, x, p_len, clipping=False, filter_radius=3, hpa=False, previous=False):
|
| 539 |
if not hasattr(self, "rmvpe"):
|
| 540 |
+
from infer.lib.predictors.RMVPE.RMVPE import RMVPE
|
| 541 |
|
| 542 |
self.rmvpe = RMVPE(
|
| 543 |
os.path.join(
|
|
|
|
| 576 |
if self.predictor_onnx and self.delete_predictor_onnx: del self.rmvpe.model, self.rmvpe
|
| 577 |
return self._resize_f0(f0, p_len)
|
| 578 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 579 |
|
| 580 |
def get_f0_librosa(self, x, p_len, mode="yin", filter_radius=3):
|
| 581 |
if mode != "piptrack":
|
|
|
|
| 606 |
|
| 607 |
return self._resize_f0(f0, p_len)
|
| 608 |
|
| 609 |
+
|
| 610 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 611 |
def get_f0_djcm(self, x, p_len, clipping=False, svs=False, filter_radius=3):
|
| 612 |
if not hasattr(self, "djcm"):
|
| 613 |
from main.library.predictors.DJCM.DJCM import DJCM
|
|
|
|
| 647 |
if self.predictor_onnx and self.delete_predictor_onnx: del self.djcm.model, self.djcm
|
| 648 |
return self._resize_f0(f0, p_len)
|
| 649 |
|
| 650 |
+
|
| 651 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|