MarcusSu1216 committed on
Commit
52580f0
·
verified ·
1 Parent(s): 67e013d

Delete modules/enhancer.py

Browse files
Files changed (1) hide show
  1. modules/enhancer.py +0 -105
modules/enhancer.py DELETED
@@ -1,105 +0,0 @@
1
- import numpy as np
2
- import torch
3
- import torch.nn.functional as F
4
- from vdecoder.nsf_hifigan.nvSTFT import STFT
5
- from vdecoder.nsf_hifigan.models import load_model
6
- from torchaudio.transforms import Resample
7
-
8
class Enhancer:
    """Run audio through a vocoder-based enhancer, resampling around it.

    Only the ``'nsf-hifigan'`` enhancer type is supported; it is backed by
    ``NsfHifiGAN``. Resampling kernels are built on first use and cached in
    ``self.resample_kernel``, keyed by ``(orig_rate, new_rate)``.
    """

    def __init__(self, enhancer_type, enhancer_ckpt, device=None):
        """Build the enhancer model and read its sample rate and hop size.

        Args:
            enhancer_type: Name of the enhancer; must be ``'nsf-hifigan'``.
            enhancer_ckpt: Checkpoint path forwarded to ``NsfHifiGAN``.
            device: Target device. When ``None``, ``'cuda'`` is used if
                available, otherwise ``'cpu'``.

        Raises:
            ValueError: If ``enhancer_type`` is not a known enhancer.
        """
        if device is None:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.device = device

        if enhancer_type == 'nsf-hifigan':
            self.enhancer = NsfHifiGAN(enhancer_ckpt, device=self.device)
        else:
            raise ValueError(f" [x] Unknown enhancer: {enhancer_type}")

        self.resample_kernel = {}
        self.enhancer_sample_rate = self.enhancer.sample_rate()
        self.enhancer_hop_size = self.enhancer.hop_size()

    def _resample(self, audio, orig_rate, new_rate):
        """Resample ``audio`` from ``orig_rate`` to ``new_rate`` with a cached kernel."""
        if orig_rate == new_rate:
            return audio
        # A tuple key is unambiguous; concatenating the two numbers as
        # strings could map different rate pairs onto the same key.
        key = (orig_rate, new_rate)
        if key not in self.resample_kernel:
            self.resample_kernel[key] = Resample(
                orig_rate, new_rate, lowpass_filter_width=128
            ).to(self.device)
        return self.resample_kernel[key](audio)

    def enhance(self,
                audio,  # 1, T
                sample_rate,
                f0,  # 1, n_frames, 1
                hop_size,
                adaptive_key=0,
                silence_front=0):
        """Enhance ``audio`` with the wrapped enhancer.

        The leading ``silence_front`` seconds (rounded down to a whole number
        of frames) are cut off before enhancement and padded back as zeros
        afterwards. The audio is resampled to an "adaptive" rate derived from
        ``adaptive_key`` while f0 is scaled by the matching factor; the
        enhancer output is then resampled from that adaptive rate back to the
        enhancer's own sample rate.

        Args:
            audio: Input waveform, shape ``(1, T)``.
            sample_rate: Sample rate of ``audio``.
            f0: Pitch track, shape ``(1, n_frames, 1)``. Not modified.
            hop_size: Hop size, in samples of ``audio``, between f0 frames.
            adaptive_key: Key shift used as ``2 ** (-adaptive_key / 12)``
                (presumably semitones -- confirm with callers).
            silence_front: Seconds at the start of ``audio`` to skip.

        Returns:
            Tuple ``(enhanced_audio, enhancer_sample_rate)`` where the sample
            rate is the one reported by the enhancer call.
        """
        # Snap the skipped region to whole frames so audio and f0 stay aligned.
        start_frame = int(silence_front * sample_rate / hop_size)
        real_silence_front = start_frame * hop_size / sample_rate
        audio = audio[:, int(np.round(real_silence_front * sample_rate)):]
        f0 = f0[:, start_frame:, :]

        # Adaptive parameters: the adaptive rate is rounded to a multiple of
        # 100, and real_factor is the ratio that rounding actually produced.
        adaptive_factor = 2 ** (-adaptive_key / 12)
        adaptive_sample_rate = 100 * int(
            np.round(self.enhancer_sample_rate / adaptive_factor / 100)
        )
        real_factor = self.enhancer_sample_rate / adaptive_sample_rate

        # Resample the input to the adaptive rate.
        audio_res = self._resample(audio, sample_rate, adaptive_sample_rate)

        n_frames = int(audio_res.size(-1) // self.enhancer_hop_size + 1)

        # Resample f0 onto the enhancer's frame grid. Multiplying out of place
        # matters: on CPU the NumPy array shares memory with the caller's
        # tensor, so an in-place `*=` would silently rescale the caller's f0.
        f0_np = f0.squeeze(0).squeeze(-1).detach().cpu().numpy() * real_factor
        time_org = (hop_size / sample_rate) * np.arange(len(f0_np)) / real_factor
        time_frame = (
            (self.enhancer_hop_size / self.enhancer_sample_rate)
            * np.arange(n_frames)
        )
        f0_res = np.interp(
            time_frame, time_org, f0_np, left=f0_np[0], right=f0_np[-1]
        )
        f0_res = torch.from_numpy(f0_res).unsqueeze(0).float().to(self.device)  # 1, n_frames

        # Enhance.
        enhanced_audio, enhancer_sample_rate = self.enhancer(audio_res, f0_res)

        # Resample the enhanced output back to the enhancer rate. The previous
        # guard (`adaptive_factor != 0`) was always true because 2 ** x is
        # never zero; comparing the rates skips the step when nothing changes.
        enhanced_audio = self._resample(
            enhanced_audio, adaptive_sample_rate, enhancer_sample_rate
        )

        # Pad the skipped silence back onto the front.
        if start_frame > 0:
            pad_left = int(np.round(enhancer_sample_rate * real_silence_front))
            enhanced_audio = F.pad(enhanced_audio, (pad_left, 0))

        return enhanced_audio, enhancer_sample_rate
76
-
77
-
78
class NsfHifiGAN(torch.nn.Module):
    """Module wrapper around a model and its config returned by ``load_model``.

    The config object is stored as ``self.h``; its ``sampling_rate``,
    ``num_mels``, ``n_fft``, ``win_size``, ``hop_size``, ``fmin`` and ``fmax``
    attributes are read here.
    """

    def __init__(self, model_path, device=None):
        """Load the model and config found at ``model_path``.

        Args:
            model_path: Path handed to ``load_model``.
            device: Target device. When ``None``, ``'cuda'`` is used if
                available, otherwise ``'cpu'``.
        """
        super().__init__()
        if device is not None:
            self.device = device
        else:
            self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        print('| Load HifiGAN: ', model_path)
        model, config = load_model(model_path, device=self.device)
        self.model = model
        self.h = config

    def sample_rate(self):
        """Return the sampling rate stored in the loaded config."""
        config = self.h
        return config.sampling_rate

    def hop_size(self):
        """Return the hop size stored in the loaded config."""
        config = self.h
        return config.hop_size

    def forward(self, audio, f0):
        """Compute a mel spectrogram of ``audio`` and run the model on it.

        Args:
            audio: Waveform passed to ``STFT.get_mel``
                (presumably shape ``(1, T)`` -- confirm with callers).
            f0: Pitch track, indexed as ``f0[:, :n_mel_frames]``, so it is
                truncated along its second axis to the mel frame count.

        Returns:
            Tuple ``(enhanced_audio, sampling_rate)``; the audio is the model
            output flattened to one dimension.
        """
        config = self.h
        mel_extractor = STFT(
            config.sampling_rate,
            config.num_mels,
            config.n_fft,
            config.win_size,
            config.hop_size,
            config.fmin,
            config.fmax,
        )
        # Inference only: no gradients are tracked.
        with torch.no_grad():
            mel = mel_extractor.get_mel(audio)
            n_mel_frames = mel.size(-1)
            model_output = self.model(mel, f0[:, :n_mel_frames])
            enhanced_audio = model_output.view(-1)
        return enhanced_audio, config.sampling_rate