darksakura committed 46bab9a
Parent(s): d91d908
Upload 67 files
inference_main.py
CHANGED
@@ -29,7 +29,7 @@ def main():
     parser.add_argument('-cm', '--cluster_model_path', type=str, default="", help='Path to the cluster model or feature-retrieval index; leave empty to use the default path for the chosen scheme automatically, or fill in anything if no clustering or feature retrieval was trained')
     parser.add_argument('-cr', '--cluster_infer_ratio', type=float, default=0, help='Proportion of the clustering or feature-retrieval scheme, range 0-1; keep the default 0 if no cluster model or feature retrieval was trained')
     parser.add_argument('-lg', '--linear_gradient', type=float, default=0, help='Crossfade length between two audio slices, in seconds; adjust this value if forced slicing makes the vocals discontinuous, otherwise keep the default 0')
-    parser.add_argument('-f0p', '--f0_predictor', type=str, default="pm", help='F0 predictor to use, one of crepe, pm, dio, harvest, rmvpe; default is pm (note: crepe applies mean filtering to the raw F0)')
+    parser.add_argument('-f0p', '--f0_predictor', type=str, default="pm", help='F0 predictor to use, one of crepe, pm, dio, harvest, rmvpe, fcpe; default is pm (note: crepe applies mean filtering to the raw F0)')
     parser.add_argument('-eh', '--enhance', action='store_true', default=False, help='Whether to use the NSF_HIFIGAN enhancer; it can improve audio quality somewhat for models trained on small datasets, but has a negative effect on well-trained models; off by default')
     parser.add_argument('-shd', '--shallow_diffusion', action='store_true', default=False, help='Whether to use shallow diffusion, which can fix some electronic-sounding artifacts; off by default; the NSF_HIFIGAN enhancer is disabled while this option is on')
     parser.add_argument('-usm', '--use_spk_mix', action='store_true', default=False, help='Whether to use speaker mixing')
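The only functional change in this hunk is that fcpe is now listed as an accepted value of -f0p/--f0_predictor. Below is a minimal, self-contained sketch of parsing and validating such a flag; the F0_PREDICTORS list and the choices= restriction are illustrative additions and not part of the repository's parser, which only documents the options in the help string.

# Illustrative sketch only, not repository code.
import argparse

F0_PREDICTORS = ["crepe", "pm", "dio", "harvest", "rmvpe", "fcpe"]  # "fcpe" is the newly added option

parser = argparse.ArgumentParser()
parser.add_argument("-f0p", "--f0_predictor", type=str, default="pm",
                    choices=F0_PREDICTORS,  # explicit validation, added here for illustration
                    help="F0 predictor to use; default is pm")
args = parser.parse_args(["-f0p", "fcpe"])
print(args.f0_predictor)  # -> fcpe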
modules/F0Predictor/fcpe/model.py
CHANGED
@@ -170,7 +170,7 @@ class FCPEInfer:
         model.load_state_dict(ckpt['model'])
         model.eval()
         self.model = model
-        self.wav2mel = Wav2Mel(self.args)
+        self.wav2mel = Wav2Mel(self.args, dtype=self.dtype, device=self.device)
 
     @torch.no_grad()
     def __call__(self, audio, sr, threshold=0.05):
@@ -182,13 +182,15 @@ class FCPEInfer:
 
 
 class Wav2Mel:
-    def __init__(self, args, device=None):
+
+    def __init__(self, args, device=None, dtype=torch.float32):
         # self.args = args
         self.sampling_rate = args.mel.sampling_rate
         self.hop_size = args.mel.hop_size
         if device is None:
             device = 'cuda' if torch.cuda.is_available() else 'cpu'
         self.device = device
+        self.dtype = dtype
         self.stft = STFT(
             args.mel.sampling_rate,
             args.mel.num_mels,
@@ -205,14 +207,15 @@ class Wav2Mel:
         return mel
 
     def extract_mel(self, audio, sample_rate, keyshift=0, train=False):
+        audio = audio.to(self.dtype).to(self.device)
         # resample
         if sample_rate == self.sampling_rate:
             audio_res = audio
         else:
             key_str = str(sample_rate)
             if key_str not in self.resample_kernel:
-                self.resample_kernel[key_str] = Resample(sample_rate, self.sampling_rate,
-                                                         lowpass_filter_width=128).to(self.device)
+                self.resample_kernel[key_str] = Resample(sample_rate, self.sampling_rate, lowpass_filter_width=128)
+                self.resample_kernel[key_str] = self.resample_kernel[key_str].to(self.dtype).to(self.device)
             audio_res = self.resample_kernel[key_str](audio)
 
         # extract
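The fcpe/model.py changes thread an explicit dtype (and device) through Wav2Mel so that the cached Resample kernels and the incoming audio end up on the same device with the same precision. A standalone sketch of that caching pattern follows; KernelCache, target_sr and resample are hypothetical names used only for this illustration, not the repository's API, and the example assumes torchaudio is installed.

# Illustrative sketch only, not repository code.
import torch
from torchaudio.transforms import Resample


class KernelCache:
    def __init__(self, target_sr=16000, device=None, dtype=torch.float32):
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.target_sr = target_sr
        self.device = device
        self.dtype = dtype
        self.resample_kernel = {}

    def resample(self, audio, sample_rate):
        # Cast and move the audio first, as the diff does in extract_mel.
        audio = audio.to(self.dtype).to(self.device)
        if sample_rate == self.target_sr:
            return audio
        key = str(sample_rate)
        if key not in self.resample_kernel:
            # Build one kernel per input sample rate, then cast/move it once.
            kernel = Resample(sample_rate, self.target_sr, lowpass_filter_width=128)
            self.resample_kernel[key] = kernel.to(self.dtype).to(self.device)
        return self.resample_kernel[key](audio)


cache = KernelCache(target_sr=16000, dtype=torch.float32)
out = cache.resample(torch.randn(1, 44100), 44100)  # 1 s of 44.1 kHz audio resampled to 16 kHz
print(out.shape)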
modules/F0Predictor/rmvpe/inference.py
CHANGED
@@ -28,7 +28,7 @@ class RMVPE:
     def mel2hidden(self, mel):
         with torch.no_grad():
             n_frames = mel.shape[-1]
-            mel = F.pad(mel, (0, 32 * ((n_frames - 1) // 32 + 1) - n_frames), mode='reflect')
+            mel = F.pad(mel, (0, 32 * ((n_frames - 1) // 32 + 1) - n_frames), mode='constant')
             hidden = self.model(mel)
             return hidden[:, :n_frames]
 
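For reference, here is a self-contained sketch of the padding arithmetic in mel2hidden: the frame axis is padded up to the next multiple of 32 and the model output is trimmed back to the original length afterwards. The pad_to_multiple_of_32 helper and dummy_model are stand-ins invented for this example, not repository code. Note that PyTorch's 'reflect' padding requires the pad amount to be smaller than the padded dimension, whereas 'constant' padding has no such restriction, which is consistent with the switch made in this commit.

# Illustrative sketch only, not repository code.
import torch
import torch.nn.functional as F

def pad_to_multiple_of_32(mel):
    n_frames = mel.shape[-1]
    pad = 32 * ((n_frames - 1) // 32 + 1) - n_frames  # 0..31 extra frames
    return F.pad(mel, (0, pad), mode='constant'), n_frames

mel = torch.randn(1, 128, 45)                      # 45 frames -> padded to 64
padded, n_frames = pad_to_multiple_of_32(mel)
dummy_model = lambda x: x.transpose(-1, -2)        # stand-in returning (batch, frames, feats)
hidden = dummy_model(padded)[:, :n_frames]         # trim back to the original 45 frames
print(padded.shape, hidden.shape)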