MarcusSu1216 committed
Delete modules/enhancer.py
modules/enhancer.py
DELETED
@@ -1,105 +0,0 @@
import numpy as np
import torch
import torch.nn.functional as F
from vdecoder.nsf_hifigan.nvSTFT import STFT
from vdecoder.nsf_hifigan.models import load_model
from torchaudio.transforms import Resample


class Enhancer:
    def __init__(self, enhancer_type, enhancer_ckpt, device=None):
        if device is None:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.device = device

        if enhancer_type == 'nsf-hifigan':
            self.enhancer = NsfHifiGAN(enhancer_ckpt, device=self.device)
        else:
            raise ValueError(f" [x] Unknown enhancer: {enhancer_type}")

        self.resample_kernel = {}
        self.enhancer_sample_rate = self.enhancer.sample_rate()
        self.enhancer_hop_size = self.enhancer.hop_size()

    def enhance(self,
                audio,  # 1, T
                sample_rate,
                f0,  # 1, n_frames, 1
                hop_size,
                adaptive_key=0,
                silence_front=0):
        # enhancer start time
        start_frame = int(silence_front * sample_rate / hop_size)
        real_silence_front = start_frame * hop_size / sample_rate
        audio = audio[:, int(np.round(real_silence_front * sample_rate)):]
        f0 = f0[:, start_frame:, :]

        # adaptive parameters
        adaptive_factor = 2 ** (-adaptive_key / 12)
        adaptive_sample_rate = 100 * int(np.round(self.enhancer_sample_rate / adaptive_factor / 100))
        real_factor = self.enhancer_sample_rate / adaptive_sample_rate

        # resample the ddsp output
        if sample_rate == adaptive_sample_rate:
            audio_res = audio
        else:
            key_str = str(sample_rate) + str(adaptive_sample_rate)
            if key_str not in self.resample_kernel:
                self.resample_kernel[key_str] = Resample(sample_rate, adaptive_sample_rate, lowpass_filter_width=128).to(self.device)
            audio_res = self.resample_kernel[key_str](audio)

        n_frames = int(audio_res.size(-1) // self.enhancer_hop_size + 1)

        # resample f0
        f0_np = f0.squeeze(0).squeeze(-1).cpu().numpy()
        f0_np *= real_factor
        time_org = (hop_size / sample_rate) * np.arange(len(f0_np)) / real_factor
        time_frame = (self.enhancer_hop_size / self.enhancer_sample_rate) * np.arange(n_frames)
        f0_res = np.interp(time_frame, time_org, f0_np, left=f0_np[0], right=f0_np[-1])
        f0_res = torch.from_numpy(f0_res).unsqueeze(0).float().to(self.device)  # 1, n_frames

        # enhance
        enhanced_audio, enhancer_sample_rate = self.enhancer(audio_res, f0_res)

        # resample the enhanced output
        if adaptive_factor != 0:
            key_str = str(adaptive_sample_rate) + str(enhancer_sample_rate)
            if key_str not in self.resample_kernel:
                self.resample_kernel[key_str] = Resample(adaptive_sample_rate, enhancer_sample_rate, lowpass_filter_width=128).to(self.device)
            enhanced_audio = self.resample_kernel[key_str](enhanced_audio)

        # pad the silence frames
        if start_frame > 0:
            enhanced_audio = F.pad(enhanced_audio, (int(np.round(enhancer_sample_rate * real_silence_front)), 0))

        return enhanced_audio, enhancer_sample_rate


class NsfHifiGAN(torch.nn.Module):
    def __init__(self, model_path, device=None):
        super().__init__()
        if device is None:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.device = device
        print('| Load HifiGAN: ', model_path)
        self.model, self.h = load_model(model_path, device=self.device)

    def sample_rate(self):
        return self.h.sampling_rate

    def hop_size(self):
        return self.h.hop_size

    def forward(self, audio, f0):
        stft = STFT(
            self.h.sampling_rate,
            self.h.num_mels,
            self.h.n_fft,
            self.h.win_size,
            self.h.hop_size,
            self.h.fmin,
            self.h.fmax)
        with torch.no_grad():
            mel = stft.get_mel(audio)
            enhanced_audio = self.model(mel, f0[:, :mel.size(-1)]).view(-1)
        return enhanced_audio, self.h.sampling_rate
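For reference, the deleted Enhancer class was constructed and invoked roughly as in the minimal sketch below. The checkpoint path, the 44.1 kHz input rate, the hop size of 512, and the constant 220 Hz pitch track are all assumptions for illustration; none of them are specified by this commit, and the call only works with a real NSF-HiFiGAN checkpoint on disk.

import torch
from modules.enhancer import Enhancer  # the module removed by this commit

# Hypothetical checkpoint path -- the actual location depends on the local setup.
enhancer = Enhancer('nsf-hifigan', 'pretrain/nsf_hifigan/model', device='cpu')

audio = torch.zeros(1, 44100)        # 1, T: one second of audio at an assumed 44.1 kHz
f0 = torch.full((1, 87, 1), 220.0)   # 1, n_frames, 1: assumed constant 220 Hz pitch track
enhanced, sr = enhancer.enhance(audio, 44100, f0, hop_size=512)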