Commit cb73098 — darksakura committed "Upload 4 files" (parent: c96445d)

Files changed:
- data_utils.py      +9  -9
- inference_main.py  +92 -74
- models.py          +56 -32
- train.py           +35 -37
data_utils.py
CHANGED
@@ -1,14 +1,13 @@
-import time
 import os
 import random
+
 import numpy as np
 import torch
 import torch.utils.data
 
-import modules.commons as commons
 import utils
-from modules.mel_processing import spectrogram_torch
-from utils import
+from modules.mel_processing import spectrogram_torch
+from utils import load_filepaths_and_text, load_wav_to_torch
 
 # import h5py
 
@@ -31,6 +30,7 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
         self.filter_length = hparams.data.filter_length
         self.hop_length = hparams.data.hop_length
         self.win_length = hparams.data.win_length
+        self.unit_interpolate_mode = hparams.data.unit_interpolate_mode
         self.sampling_rate = hparams.data.sampling_rate
         self.use_sr = hparams.train.use_sr
         self.spec_len = hparams.train.max_speclen
@@ -73,7 +73,7 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
         uv = torch.FloatTensor(np.array(uv,dtype=float))
 
         c = torch.load(filename+ ".soft.pt")
-        c = utils.repeat_expand_2d(c.squeeze(0), f0.shape[0])
+        c = utils.repeat_expand_2d(c.squeeze(0), f0.shape[0], mode=self.unit_interpolate_mode)
         if self.vol_emb:
             volume_path = filename + ".vol.npy"
             volume = np.load(volume_path)
@@ -86,7 +86,7 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
         assert abs(audio_norm.shape[1]-lmin * self.hop_length) < 3 * self.hop_length
         spec, c, f0, uv = spec[:, :lmin], c[:, :lmin], f0[:lmin], uv[:lmin]
         audio_norm = audio_norm[:, :lmin * self.hop_length]
-        if volume
+        if volume is not None:
             volume = volume[:lmin]
         return c, f0, spec, audio_norm, spk, uv, volume
 
@@ -95,7 +95,7 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
         # print("skip too short audio:", filename)
         # return None
 
-        if random.choice([True, False]) and self.vol_aug and volume
+        if random.choice([True, False]) and self.vol_aug and volume is not None:
             max_amp = float(torch.max(torch.abs(audio_norm))) + 1e-5
             max_shift = min(1, np.log10(1/max_amp))
             log10_vol_shift = random.uniform(-1, max_shift)
@@ -113,7 +113,7 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
         end = start + 790
         spec, c, f0, uv = spec[:, start:end], c[:, start:end], f0[start:end], uv[start:end]
         audio_norm = audio_norm[:, start * self.hop_length : end * self.hop_length]
-        if volume
+        if volume is not None:
             volume = volume[start:end]
         return c, f0, spec, audio_norm, spk, uv,volume
 
@@ -177,7 +177,7 @@ class TextAudioCollate:
             uv = row[5]
             uv_padded[i, :uv.size(0)] = uv
             volume = row[6]
-            if volume
+            if volume is not None:
                 volume_padded[i, :volume.size(0)] = volume
             else :
                 volume_padded = None
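Note on the new unit_interpolate_mode plumbing: the dataset now reads hparams.data.unit_interpolate_mode and passes it to utils.repeat_expand_2d, which stretches the content-encoder units so their frame count matches the f0 track. Below is a minimal sketch of what such a mode-aware helper could look like; the helper name and the specific mode strings ("nearest", "linear") are illustrative assumptions, not taken from this commit.

import torch
import torch.nn.functional as F

def repeat_expand_2d_sketch(content: torch.Tensor, target_len: int, mode: str = "nearest") -> torch.Tensor:
    # content: (channels, frames) unit features from the content encoder.
    # Stretch the time axis to target_len frames using the requested mode.
    # Hypothetical stand-in for utils.repeat_expand_2d; mode names are assumptions.
    return F.interpolate(content.unsqueeze(0), size=target_len, mode=mode).squeeze(0)

# Mirrors the call site in the diff:
# c = repeat_expand_2d_sketch(c.squeeze(0), f0.shape[0], mode=self.unit_interpolate_mode)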
inference_main.py
CHANGED
@@ -1,16 +1,10 @@
-import io
 import logging
-import time
-from pathlib import Path
 
-import librosa
-import matplotlib.pyplot as plt
-import numpy as np
 import soundfile
 
 from inference import infer_tool
-from inference import slicer
 from inference.infer_tool import Svc
+from spkmix import spk_mix_map
 
 logging.getLogger('numba').setLevel(logging.WARNING)
 chunks_dict = infer_tool.read_temp("inference/chunks_temp.json")
@@ -23,21 +17,33 @@ def main():
     parser = argparse.ArgumentParser(description='sovits4 inference')
 
     # Required settings
-    parser.add_argument('-m', '--model_path', type=str, default="logs/44k/
-    parser.add_argument('-c', '--config_path', type=str, default="
+    parser.add_argument('-m', '--model_path', type=str, default="logs/44k/G_37600.pth", help='model path')
+    parser.add_argument('-c', '--config_path', type=str, default="logs/44k/config.json", help='config file path')
     parser.add_argument('-cl', '--clip', type=float, default=0, help='force-slice the audio every N seconds; default 0 means automatic slicing; unit: seconds')
     parser.add_argument('-n', '--clean_names', type=str, nargs='+', default=["君の知らない物語-src.wav"], help='list of wav file names placed under the raw folder')
    parser.add_argument('-t', '--trans', type=int, nargs='+', default=[0], help='pitch shift, positive or negative (in semitones)')
-    parser.add_argument('-s', '--spk_list', type=str, nargs='+', default=['
-
+    parser.add_argument('-s', '--spk_list', type=str, nargs='+', default=['buyizi'], help='target speaker name(s) for synthesis')
+
     # Optional settings
-    parser.add_argument('-a', '--auto_predict_f0', action='store_true', default=False,help='automatically predict pitch for speech conversion; do not enable this for singing voice conversion or it will go badly off-key')
-    parser.add_argument('-cm', '--cluster_model_path', type=str, default="
-    parser.add_argument('-cr', '--cluster_infer_ratio', type=float, default=0, help='
+    parser.add_argument('-a', '--auto_predict_f0', action='store_true', default=False, help='automatically predict pitch for speech conversion; do not enable this for singing voice conversion or it will go badly off-key')
+    parser.add_argument('-cm', '--cluster_model_path', type=str, default="", help='path to the cluster model or feature-retrieval index; leave empty to use the default path for the chosen scheme; any value is fine if neither was trained')
+    parser.add_argument('-cr', '--cluster_infer_ratio', type=float, default=0, help='proportion of the cluster scheme or feature retrieval, range 0-1; keep the default 0 if neither was trained')
     parser.add_argument('-lg', '--linear_gradient', type=float, default=0, help='crossfade length between two audio slices; adjust this if forced slicing makes the vocals discontinuous, otherwise keep the default 0; unit: seconds')
-    parser.add_argument('-
-    parser.add_argument('-eh', '--enhance',
-
+    parser.add_argument('-f0p', '--f0_predictor', type=str, default="pm", help='F0 predictor: one of crepe, pm, dio, harvest, rmvpe; default pm (note: crepe applies mean filtering to the raw F0)')
+    parser.add_argument('-eh', '--enhance', action='store_true', default=False, help='whether to use the NSF_HIFIGAN enhancer; it can improve audio quality for models trained on small datasets but degrades well-trained models; off by default')
+    parser.add_argument('-shd', '--shallow_diffusion', action='store_true', default=False, help='whether to use shallow diffusion; it can fix some metallic artifacts; off by default; when enabled, the NSF_HIFIGAN enhancer is disabled')
+    parser.add_argument('-usm', '--use_spk_mix', action='store_true', default=False, help='whether to use speaker mixing')
+    parser.add_argument('-lea', '--loudness_envelope_adjustment', type=float, default=1, help='blend ratio for replacing the output loudness envelope with the input loudness envelope; the closer to 1, the more the output envelope is used')
+    parser.add_argument('-fr', '--feature_retrieval', action='store_true', default=False, help='whether to use feature retrieval; if enabled, the cluster model is disabled, and the cm and cr options become the retrieval index path and mix ratio')
+
+    # Shallow diffusion settings
+    parser.add_argument('-dm', '--diffusion_model_path', type=str, default="logs/44k/diffusion/model_0.pt", help='diffusion model path')
+    parser.add_argument('-dc', '--diffusion_config_path', type=str, default="logs/44k/diffusion/config.yaml", help='diffusion model config path')
+    parser.add_argument('-ks', '--k_step', type=int, default=100, help='number of diffusion steps; larger is closer to the pure diffusion model result; default 100')
+    parser.add_argument('-se', '--second_encoding', action='store_true', default=False, help='second encoding: re-encode the source audio before shallow diffusion; a hit-or-miss option that sometimes helps and sometimes hurts')
+    parser.add_argument('-od', '--only_diffusion', action='store_true', default=False, help='diffusion-only mode: the sovits model is not loaded and inference uses only the diffusion model')
+
+
     # Settings that normally don't need changing
     parser.add_argument('-sd', '--slice_db', type=int, default=-40, help='default -40; -30 for noisy audio, -50 for dry vocals with breaths kept')
     parser.add_argument('-d', '--device', type=str, default=None, help='inference device; None selects cpu or gpu automatically')
@@ -46,7 +52,9 @@ def main():
     parser.add_argument('-wf', '--wav_format', type=str, default='flac', help='audio output format')
     parser.add_argument('-lgr', '--linear_gradient_retain', type=float, default=0.75, help='after automatic slicing, the head and tail of each slice are discarded; this sets the proportion of the crossfade length to keep, range 0-1, open on the left and closed on the right')
     parser.add_argument('-eak', '--enhancer_adaptive_key', type=int, default=0, help='make the enhancer adapt to a higher pitch range (in semitones) | default 0')
-
+    parser.add_argument('-ft', '--f0_filter_threshold', type=float, default=0.05, help='F0 filtering threshold, only effective with crepe; range 0-1; lowering it reduces the chance of going off-key but increases muting')
+
+
     args = parser.parse_args()
 
     clean_names = args.clean_names
@@ -61,75 +69,85 @@ def main():
     clip = args.clip
     lg = args.linear_gradient
     lgr = args.linear_gradient_retain
-
+    f0p = args.f0_predictor
     enhance = args.enhance
     enhancer_adaptive_key = args.enhancer_adaptive_key
+    cr_threshold = args.f0_filter_threshold
+    diffusion_model_path = args.diffusion_model_path
+    diffusion_config_path = args.diffusion_config_path
+    k_step = args.k_step
+    only_diffusion = args.only_diffusion
+    shallow_diffusion = args.shallow_diffusion
+    use_spk_mix = args.use_spk_mix
+    second_encoding = args.second_encoding
+    loudness_envelope_adjustment = args.loudness_envelope_adjustment
 
-
-
+    if cluster_infer_ratio != 0:
+        if args.cluster_model_path == "":
+            if args.feature_retrieval:  # a ratio was given but no model path: pick the default path based on whether feature retrieval is used
+                args.cluster_model_path = "logs/44k/feature_and_index.pkl"
+            else:
+                args.cluster_model_path = "logs/44k/kmeans_10000.pt"
+    else:  # no ratio was given: clear the path either way to avoid loading the model later
+        args.cluster_model_path = ""
 
+    svc_model = Svc(args.model_path,
+                    args.config_path,
+                    args.device,
+                    args.cluster_model_path,
+                    enhance,
+                    diffusion_model_path,
+                    diffusion_config_path,
+                    shallow_diffusion,
+                    only_diffusion,
+                    use_spk_mix,
+                    args.feature_retrieval)
+
+    infer_tool.mkdir(["raw", "results"])
+
+    if len(spk_mix_map)<=1:
+        use_spk_mix = False
+    if use_spk_mix:
+        spk_list = [spk_mix_map]
+
     infer_tool.fill_a_to_b(trans, clean_names)
     for clean_name, tran in zip(clean_names, trans):
         raw_audio_path = f"raw/{clean_name}"
         if "." not in raw_audio_path:
             raw_audio_path += ".wav"
         infer_tool.format_wav(raw_audio_path)
-        wav_path = Path(raw_audio_path).with_suffix('.wav')
-        chunks = slicer.cut(wav_path, db_thresh=slice_db)
-        audio_data, audio_sr = slicer.chunks2audio(wav_path, chunks)
-        per_size = int(clip*audio_sr)
-        lg_size = int(lg*audio_sr)
-        lg_size_r = int(lg_size*lgr)
-        lg_size_c_l = (lg_size-lg_size_r)//2
-        lg_size_c_r = lg_size-lg_size_r-lg_size_c_l
-        lg = np.linspace(0,1,lg_size_r) if lg_size!=0 else 0
-
         for spk in spk_list:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            soundfile.write(raw_path, dat, audio_sr, format="wav")
-            raw_path.seek(0)
-            out_audio, out_sr = svc_model.infer(spk, tran, raw_path,
-                                                cluster_infer_ratio=cluster_infer_ratio,
-                                                auto_predict_f0=auto_predict_f0,
-                                                noice_scale=noice_scale,
-                                                F0_mean_pooling = F0_mean_pooling,
-                                                enhancer_adaptive_key = enhancer_adaptive_key
-                                                )
-            _audio = out_audio.cpu().numpy()
-            pad_len = int(svc_model.target_sample * pad_seconds)
-            _audio = _audio[pad_len:-pad_len]
-            _audio = infer_tool.pad_array(_audio, per_length)
-            if lg_size!=0 and k!=0:
-                lg1 = audio[-(lg_size_r+lg_size_c_r):-lg_size_c_r] if lgr != 1 else audio[-lg_size:]
-                lg2 = _audio[lg_size_c_l:lg_size_c_l+lg_size_r] if lgr != 1 else _audio[0:lg_size]
-                lg_pre = lg1*(1-lg)+lg2*lg
-                audio = audio[0:-(lg_size_r+lg_size_c_r)] if lgr != 1 else audio[0:-lg_size]
-                audio.extend(lg_pre)
-                _audio = _audio[lg_size_c_l+lg_size_r:] if lgr != 1 else _audio[lg_size:]
-            audio.extend(list(_audio))
+            kwarg = {
+                "raw_audio_path" : raw_audio_path,
+                "spk" : spk,
+                "tran" : tran,
+                "slice_db" : slice_db,
+                "cluster_infer_ratio" : cluster_infer_ratio,
+                "auto_predict_f0" : auto_predict_f0,
+                "noice_scale" : noice_scale,
+                "pad_seconds" : pad_seconds,
+                "clip_seconds" : clip,
+                "lg_num": lg,
+                "lgr_num" : lgr,
+                "f0_predictor" : f0p,
+                "enhancer_adaptive_key" : enhancer_adaptive_key,
+                "cr_threshold" : cr_threshold,
+                "k_step":k_step,
+                "use_spk_mix":use_spk_mix,
+                "second_encoding":second_encoding,
+                "loudness_envelope_adjustment":loudness_envelope_adjustment
+            }
+            audio = svc_model.slice_inference(**kwarg)
             key = "auto" if auto_predict_f0 else f"{tran}key"
             cluster_name = "" if cluster_infer_ratio == 0 else f"_{cluster_infer_ratio}"
-
+            isdiffusion = "sovits"
+            if shallow_diffusion :
+                isdiffusion = "sovdiff"
+            if only_diffusion :
+                isdiffusion = "diff"
+            if use_spk_mix:
+                spk = "spk_mix"
+            res_path = f'results/{clean_name}_{key}_{spk}{cluster_name}_{isdiffusion}_{f0p}.{wav_format}'
            soundfile.write(res_path, audio, svc_model.target_sample, format=wav_format)
            svc_model.clear_empty()
 
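For orientation, the rewritten main() boils down to the pattern below: build one Svc instance from the sovits and diffusion assets, then call slice_inference once per speaker. This is a trimmed sketch using only the constructor argument order and keyword names visible in the diff; the input file name is a placeholder and the numeric values (noice_scale, pad_seconds, and so on) are illustrative, not the script's actual defaults.

from inference.infer_tool import Svc

# Placeholder checkpoints and configs; substitute your own.
svc_model = Svc("logs/44k/G_37600.pth", "logs/44k/config.json", None, "",
                False,                               # enhance
                "logs/44k/diffusion/model_0.pt",
                "logs/44k/diffusion/config.yaml",
                True,                                # shallow_diffusion
                False,                               # only_diffusion
                False,                               # use_spk_mix
                False)                               # feature_retrieval

audio = svc_model.slice_inference(
    raw_audio_path="raw/example.wav",                # placeholder input under raw/
    spk="buyizi", tran=0, slice_db=-40,
    cluster_infer_ratio=0, auto_predict_f0=False, noice_scale=0.4,
    pad_seconds=0.5, clip_seconds=0, lg_num=0, lgr_num=0.75,
    f0_predictor="pm", enhancer_adaptive_key=0, cr_threshold=0.05,
    k_step=100, use_spk_mix=False, second_encoding=False,
    loudness_envelope_adjustment=1)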
models.py
CHANGED
@@ -1,21 +1,17 @@
-import copy
-import math
 import torch
 from torch import nn
+from torch.nn import Conv1d, Conv2d
 from torch.nn import functional as F
+from torch.nn.utils import spectral_norm, weight_norm
 
 import modules.attentions as attentions
 import modules.commons as commons
 import modules.modules as modules
-
-from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
-from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
-
 import utils
-from modules.commons import
-from vdecoder.hifigan.models import Generator
+from modules.commons import get_padding
 from utils import f0_to_coarse
 
+
 class ResidualCouplingBlock(nn.Module):
     def __init__(self,
                  channels,
@@ -126,7 +122,7 @@ class DiscriminatorP(torch.nn.Module):
         super(DiscriminatorP, self).__init__()
         self.period = period
         self.use_spectral_norm = use_spectral_norm
-        norm_f = weight_norm if use_spectral_norm
+        norm_f = weight_norm if use_spectral_norm is False else spectral_norm
         self.convs = nn.ModuleList([
             norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
             norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
@@ -161,7 +157,7 @@ class DiscriminatorP(torch.nn.Module):
 class DiscriminatorS(torch.nn.Module):
     def __init__(self, use_spectral_norm=False):
         super(DiscriminatorS, self).__init__()
-        norm_f = weight_norm if use_spectral_norm
+        norm_f = weight_norm if use_spectral_norm is False else spectral_norm
         self.convs = nn.ModuleList([
             norm_f(Conv1d(1, 16, 15, 1, padding=7)),
             norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
@@ -321,6 +317,10 @@ class SynthesizerTrn(nn.Module):
                  n_speakers,
                  sampling_rate=44100,
                  vol_embedding=False,
+                 vocoder_name = "nsf-hifigan",
+                 use_depthwise_conv = False,
+                 use_automatic_f0_prediction = True,
+                 n_flow_layer = 4,
                  **kwargs):
 
         super().__init__()
@@ -343,6 +343,8 @@ class SynthesizerTrn(nn.Module):
         self.ssl_dim = ssl_dim
         self.vol_embedding = vol_embedding
         self.emb_g = nn.Embedding(n_speakers, gin_channels)
+        self.use_depthwise_conv = use_depthwise_conv
+        self.use_automatic_f0_prediction = use_automatic_f0_prediction
         if vol_embedding:
             self.emb_vol = nn.Linear(1, hidden_channels)
 
@@ -367,20 +369,35 @@ class SynthesizerTrn(nn.Module):
             "upsample_initial_channel": upsample_initial_channel,
             "upsample_kernel_sizes": upsample_kernel_sizes,
             "gin_channels": gin_channels,
+            "use_depthwise_conv":use_depthwise_conv
         }
-
+
+        modules.set_Conv1dModel(self.use_depthwise_conv)
+
+        if vocoder_name == "nsf-hifigan":
+            from vdecoder.hifigan.models import Generator
+            self.dec = Generator(h=hps)
+        elif vocoder_name == "nsf-snake-hifigan":
+            from vdecoder.hifiganwithsnake.models import Generator
+            self.dec = Generator(h=hps)
+        else:
+            print("[?] Unkown vocoder: use default(nsf-hifigan)")
+            from vdecoder.hifigan.models import Generator
+            self.dec = Generator(h=hps)
+
         self.enc_q = Encoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels)
-        self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1,
-        self.
-
-
-
-
-
-
-
-
-
+        self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, n_flow_layer, gin_channels=gin_channels)
+        if self.use_automatic_f0_prediction:
+            self.f0_decoder = F0Decoder(
+                1,
+                hidden_channels,
+                filter_channels,
+                n_heads,
+                n_layers,
+                kernel_size,
+                p_dropout,
+                spk_channels=gin_channels
+            )
         self.emb_uv = nn.Embedding(2, hidden_channels)
         self.character_mix = False
 
@@ -395,17 +412,21 @@ class SynthesizerTrn(nn.Module):
         g = self.emb_g(g).transpose(1,2)
 
         # vol proj
-        vol = self.emb_vol(vol[:,:,None]).transpose(1,2) if vol
+        vol = self.emb_vol(vol[:,:,None]).transpose(1,2) if vol is not None and self.vol_embedding else 0
 
         # ssl prenet
         x_mask = torch.unsqueeze(commons.sequence_mask(c_lengths, c.size(2)), 1).to(c.dtype)
         x = self.pre(c) * x_mask + self.emb_uv(uv.long()).transpose(1,2) + vol
-
+
         # f0 predict
-
-
-
-
+        if self.use_automatic_f0_prediction:
+            lf0 = 2595. * torch.log10(1. + f0.unsqueeze(1) / 700.) / 500
+            norm_lf0 = utils.normalize_f0(lf0, x_mask, uv)
+            pred_lf0 = self.f0_decoder(x, norm_lf0, x_mask, spk_emb=g)
+        else:
+            lf0 = 0
+            norm_lf0 = 0
+            pred_lf0 = 0
         # encoder
         z_ptemp, m_p, logs_p, _ = self.enc_p(x, x_mask, f0=f0_to_coarse(f0))
         z, m_q, logs_q, spec_mask = self.enc_q(spec, spec_lengths, g=g)
@@ -419,6 +440,7 @@ class SynthesizerTrn(nn.Module):
 
         return o, ids_slice, spec_mask, (z, z_p, m_p, logs_p, m_q, logs_q), pred_lf0, norm_lf0, lf0
 
+    @torch.no_grad()
     def infer(self, c, f0, uv, g=None, noice_scale=0.35, seed=52468, predict_f0=False, vol = None):
 
         if c.device == torch.device("cuda"):
@@ -440,11 +462,13 @@ class SynthesizerTrn(nn.Module):
 
         x_mask = torch.unsqueeze(commons.sequence_mask(c_lengths, c.size(2)), 1).to(c.dtype)
         # vol proj
-        vol = self.emb_vol(vol[:,:,None]).transpose(1,2) if vol!=None and self.vol_embedding else 0
-
-        x = self.pre(c) * x_mask + self.emb_uv(uv.long()).transpose(1,2) + vol
 
-        if
+        vol = self.emb_vol(vol[:,:,None]).transpose(1,2) if vol is not None and self.vol_embedding else 0
+
+        x = self.pre(c) * x_mask + self.emb_uv(uv.long()).transpose(1, 2) + vol
+
+
+        if self.use_automatic_f0_prediction and predict_f0:
             lf0 = 2595. * torch.log10(1. + f0.unsqueeze(1) / 700.) / 500
             norm_lf0 = utils.normalize_f0(lf0, x_mask, uv, random_scale=False)
             pred_lf0 = self.f0_decoder(x, norm_lf0, x_mask, spk_emb=g)
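Two observations on the models.py changes: the new SynthesizerTrn keyword arguments (vocoder_name, use_depthwise_conv, use_automatic_f0_prediction, n_flow_layer) all carry defaults, so configs written before this commit appear to keep the previous nsf-hifigan setup; and both forward() and infer() map f0 onto a mel-style log scale, lf0 = 2595 * log10(1 + f0/700) / 500, before the F0 decoder sees it. A quick sanity check of that formula:

import math

f0 = 440.0  # A4, in Hz
lf0 = 2595.0 * math.log10(1.0 + f0 / 700.0) / 500.0
# lf0 comes out around 1.1 here; dividing the mel value by 500 keeps the
# F0 decoder's target roughly at unit scale for typical vocal pitches.
print(lf0)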
train.py
CHANGED
@@ -1,39 +1,30 @@
 import logging
 import multiprocessing
+import os
 import time
 
-logging.getLogger('matplotlib').setLevel(logging.WARNING)
-logging.getLogger('numba').setLevel(logging.WARNING)
-
-import os
-import json
-import argparse
-import itertools
-import math
 import torch
-
+import torch.distributed as dist
+import torch.multiprocessing as mp
+from torch.cuda.amp import GradScaler, autocast
 from torch.nn import functional as F
+from torch.nn.parallel import DistributedDataParallel as DDP
 from torch.utils.data import DataLoader
 from torch.utils.tensorboard import SummaryWriter
-import torch.multiprocessing as mp
-import torch.distributed as dist
-from torch.nn.parallel import DistributedDataParallel as DDP
-from torch.cuda.amp import autocast, GradScaler
 
 import modules.commons as commons
 import utils
-from data_utils import
+from data_utils import TextAudioCollate, TextAudioSpeakerLoader
 from models import (
-    SynthesizerTrn,
     MultiPeriodDiscriminator,
+    SynthesizerTrn,
 )
-from modules.losses import
-    kl_loss,
-    generator_loss, discriminator_loss, feature_loss
-)
-
+from modules.losses import discriminator_loss, feature_loss, generator_loss, kl_loss
 from modules.mel_processing import mel_spectrogram_torch, spec_to_mel_torch
 
+logging.getLogger('matplotlib').setLevel(logging.WARNING)
+logging.getLogger('numba').setLevel(logging.WARNING)
+
 torch.backends.cudnn.benchmark = True
 global_step = 0
 start_time = time.time()
@@ -61,7 +52,7 @@ def run(rank, n_gpus, hps):
     utils.check_git_hash(hps.model_dir)
     writer = SummaryWriter(log_dir=hps.model_dir)
     writer_eval = SummaryWriter(log_dir=os.path.join(hps.model_dir, "eval"))
-
+
     # for pytorch on win, backend use gloo
     dist.init_process_group(backend= 'gloo' if os.name == 'nt' else 'nccl', init_method='env://', world_size=n_gpus, rank=rank)
     torch.manual_seed(hps.train.seed)
@@ -108,7 +99,7 @@ def run(rank, n_gpus, hps):
         name=utils.latest_checkpoint_path(hps.model_dir, "D_*.pth")
         global_step=int(name[name.rfind("_")+1:name.rfind(".")])+1
         #global_step = (epoch_str - 1) * len(train_loader)
-    except:
+    except Exception:
         print("load old checkpoint failed...")
         epoch_str = 1
         global_step = 0
@@ -148,6 +139,8 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loade
    train_loader, eval_loader = loaders
    if writers is not None:
        writer, writer_eval = writers
+
+    half_type = torch.bfloat16 if hps.train.half_type=="bf16" else torch.float16
 
    # train_loader.batch_sampler.set_epoch(epoch)
    global global_step
@@ -169,8 +162,8 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loade
            hps.data.sampling_rate,
            hps.data.mel_fmin,
            hps.data.mel_fmax)
-
-        with autocast(enabled=hps.train.fp16_run):
+
+        with autocast(enabled=hps.train.fp16_run, dtype=half_type):
            y_hat, ids_slice, z_mask, \
            (z, z_p, m_p, logs_p, m_q, logs_q), pred_lf0, norm_lf0, lf0 = net_g(c, f0, uv, spec, g=g, c_lengths=lengths,
                                                                                spec_lengths=lengths,vol = volume)
@@ -191,25 +184,26 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loade
        # Discriminator
        y_d_hat_r, y_d_hat_g, _, _ = net_d(y, y_hat.detach())
 
-        with autocast(enabled=False):
+        with autocast(enabled=False, dtype=half_type):
            loss_disc, losses_disc_r, losses_disc_g = discriminator_loss(y_d_hat_r, y_d_hat_g)
            loss_disc_all = loss_disc
-
+
        optim_d.zero_grad()
        scaler.scale(loss_disc_all).backward()
        scaler.unscale_(optim_d)
        grad_norm_d = commons.clip_grad_value_(net_d.parameters(), None)
        scaler.step(optim_d)
+
 
-        with autocast(enabled=hps.train.fp16_run):
+        with autocast(enabled=hps.train.fp16_run, dtype=half_type):
            # Generator
            y_d_hat_r, y_d_hat_g, fmap_r, fmap_g = net_d(y, y_hat)
-            with autocast(enabled=False):
+            with autocast(enabled=False, dtype=half_type):
                loss_mel = F.l1_loss(y_mel, y_hat_mel) * hps.train.c_mel
                loss_kl = kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * hps.train.c_kl
                loss_fm = feature_loss(fmap_r, fmap_g)
                loss_gen, losses_gen = generator_loss(y_d_hat_g)
-                loss_lf0 = F.mse_loss(pred_lf0, lf0)
+                loss_lf0 = F.mse_loss(pred_lf0, lf0) if net_g.module.use_automatic_f0_prediction else 0
                loss_gen_all = loss_gen + loss_fm + loss_mel + loss_kl + loss_lf0
        optim_g.zero_grad()
        scaler.scale(loss_gen_all).backward()
@@ -241,13 +235,17 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loade
                image_dict = {
                    "slice/mel_org": utils.plot_spectrogram_to_numpy(y_mel[0].data.cpu().numpy()),
                    "slice/mel_gen": utils.plot_spectrogram_to_numpy(y_hat_mel[0].data.cpu().numpy()),
-                    "all/mel": utils.plot_spectrogram_to_numpy(mel[0].data.cpu().numpy())
-                    "all/lf0": utils.plot_data_to_numpy(lf0[0, 0, :].cpu().numpy(),
-                                                        pred_lf0[0, 0, :].detach().cpu().numpy()),
-                    "all/norm_lf0": utils.plot_data_to_numpy(lf0[0, 0, :].cpu().numpy(),
-                                                             norm_lf0[0, 0, :].detach().cpu().numpy())
+                    "all/mel": utils.plot_spectrogram_to_numpy(mel[0].data.cpu().numpy())
                }
 
+                if net_g.module.use_automatic_f0_prediction:
+                    image_dict.update({
+                        "all/lf0": utils.plot_data_to_numpy(lf0[0, 0, :].cpu().numpy(),
+                                                            pred_lf0[0, 0, :].detach().cpu().numpy()),
+                        "all/norm_lf0": utils.plot_data_to_numpy(lf0[0, 0, :].cpu().numpy(),
+                                                                 norm_lf0[0, 0, :].detach().cpu().numpy())
+                    })
+
                utils.summarize(
                    writer=writer,
                    global_step=global_step,
@@ -287,7 +285,7 @@ def evaluate(hps, generator, eval_loader, writer_eval):
            c = c[:1].cuda(0)
            f0 = f0[:1].cuda(0)
            uv= uv[:1].cuda(0)
-            if volume
+            if volume is not None:
                volume = volume[:1].cuda(0)
            mel = spec_to_mel_torch(
                spec,
@@ -314,7 +312,7 @@ def evaluate(hps, generator, eval_loader, writer_eval):
                f"gt/audio_{batch_idx}": y[0]
            })
        image_dict.update({
-
+            "gen/mel": utils.plot_spectrogram_to_numpy(y_hat_mel[0].cpu().numpy()),
            "gt/mel": utils.plot_spectrogram_to_numpy(mel[0].cpu().numpy())
        })
    utils.summarize(
@@ -328,4 +326,4 @@ def evaluate(hps, generator, eval_loader, writer_eval):
 
 
 if __name__ == "__main__":
-    main()
+    main()
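The half_type change means autocast no longer implies float16: hps.train.half_type selects bf16 or fp16 once, and the same dtype is passed to every autocast block in the step. Below is a minimal standalone sketch of that pattern under mixed precision; the helper name and the toy model are illustrative, and it assumes a CUDA device is available.

import torch
from torch.cuda.amp import GradScaler, autocast

def pick_half_type(half_type_name: str) -> torch.dtype:
    # Mirrors the diff: "bf16" selects bfloat16, anything else falls back to float16.
    return torch.bfloat16 if half_type_name == "bf16" else torch.float16

half_type = pick_half_type("bf16")      # in train.py this comes from hps.train.half_type
scaler = GradScaler(enabled=True)       # fp16_run in the real config

model = torch.nn.Linear(16, 1).cuda()   # toy stand-in for the generator
opt = torch.optim.AdamW(model.parameters(), lr=1e-4)
x = torch.randn(8, 16, device="cuda")

with autocast(enabled=True, dtype=half_type):
    loss = model(x).pow(2).mean()       # forward pass runs in the selected half precision

opt.zero_grad()
scaler.scale(loss).backward()           # same scale/unscale/clip/step sequence as the diff
scaler.unscale_(opt)
torch.nn.utils.clip_grad_value_(model.parameters(), 1.0)
scaler.step(opt)
scaler.update()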