smjain committed
Commit 1b83a0a
1 Parent(s): b9c2ec2

Upload 6 files

lib/infer_pack/modules/train/extract/extract_f0_print.py ADDED
import os
import sys
import traceback

import parselmouth

now_dir = os.getcwd()
sys.path.append(now_dir)
import logging

import numpy as np
import pyworld

from infer.lib.audio import load_audio

logging.getLogger("numba").setLevel(logging.WARNING)
from multiprocessing import Process

exp_dir = sys.argv[1]
f = open("%s/extract_f0_feature.log" % exp_dir, "a+")


def printt(strr):
    print(strr)
    f.write("%s\n" % strr)
    f.flush()


n_p = int(sys.argv[2])
f0method = sys.argv[3]


class FeatureInput(object):
    def __init__(self, samplerate=16000, hop_size=160):
        self.fs = samplerate
        self.hop = hop_size

        self.f0_bin = 256
        self.f0_max = 1100.0
        self.f0_min = 50.0
        self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700)
        self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700)

    def compute_f0(self, path, f0_method):
        x = load_audio(path, self.fs)
        p_len = x.shape[0] // self.hop
        if f0_method == "pm":
            time_step = 160 / 16000 * 1000
            f0_min = 50
            f0_max = 1100
            f0 = (
                parselmouth.Sound(x, self.fs)
                .to_pitch_ac(
                    time_step=time_step / 1000,
                    voicing_threshold=0.6,
                    pitch_floor=f0_min,
                    pitch_ceiling=f0_max,
                )
                .selected_array["frequency"]
            )
            pad_size = (p_len - len(f0) + 1) // 2
            if pad_size > 0 or p_len - len(f0) - pad_size > 0:
                f0 = np.pad(
                    f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant"
                )
        elif f0_method == "harvest":
            f0, t = pyworld.harvest(
                x.astype(np.double),
                fs=self.fs,
                f0_ceil=self.f0_max,
                f0_floor=self.f0_min,
                frame_period=1000 * self.hop / self.fs,
            )
            f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.fs)
        elif f0_method == "dio":
            f0, t = pyworld.dio(
                x.astype(np.double),
                fs=self.fs,
                f0_ceil=self.f0_max,
                f0_floor=self.f0_min,
                frame_period=1000 * self.hop / self.fs,
            )
            f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.fs)
        elif f0_method == "rmvpe":
            if not hasattr(self, "model_rmvpe"):
                from infer.lib.rmvpe import RMVPE

                print("Loading rmvpe model")
                self.model_rmvpe = RMVPE(
                    "assets/rmvpe/rmvpe.pt", is_half=False, device="cpu"
                )
            f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)
        return f0

    def coarse_f0(self, f0):
        f0_mel = 1127 * np.log(1 + f0 / 700)
        f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * (
            self.f0_bin - 2
        ) / (self.f0_mel_max - self.f0_mel_min) + 1

        # bin 1 is reserved for unvoiced frames (f0 == 0)
        f0_mel[f0_mel <= 1] = 1
        f0_mel[f0_mel > self.f0_bin - 1] = self.f0_bin - 1
        f0_coarse = np.rint(f0_mel).astype(int)
        assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, (
            f0_coarse.max(),
            f0_coarse.min(),
        )
        return f0_coarse

    def go(self, paths, f0_method):
        if len(paths) == 0:
            printt("no-f0-todo")
        else:
            printt("todo-f0-%s" % len(paths))
            n = max(len(paths) // 5, 1)  # print at most 5 progress messages per process
            for idx, (inp_path, opt_path1, opt_path2) in enumerate(paths):
                try:
                    if idx % n == 0:
                        printt("f0ing,now-%s,all-%s,-%s" % (idx, len(paths), inp_path))
                    if os.path.exists(opt_path1 + ".npy") and os.path.exists(
                        opt_path2 + ".npy"
                    ):
                        continue
                    featur_pit = self.compute_f0(inp_path, f0_method)
                    np.save(
                        opt_path2,
                        featur_pit,
                        allow_pickle=False,
                    )  # nsf
                    coarse_pit = self.coarse_f0(featur_pit)
                    np.save(
                        opt_path1,
                        coarse_pit,
                        allow_pickle=False,
                    )  # ori
                except Exception:
                    printt("f0fail-%s-%s-%s" % (idx, inp_path, traceback.format_exc()))


if __name__ == "__main__":
    # exp_dir = r"E:\codes\py39\dataset\mi-test"
    # n_p = 16
    # f = open("%s/log_extract_f0.log" % exp_dir, "w")
    printt(sys.argv)
    featureInput = FeatureInput()
    paths = []
    inp_root = "%s/1_16k_wavs" % (exp_dir)
    opt_root1 = "%s/2a_f0" % (exp_dir)
    opt_root2 = "%s/2b-f0nsf" % (exp_dir)

    os.makedirs(opt_root1, exist_ok=True)
    os.makedirs(opt_root2, exist_ok=True)
    for name in sorted(list(os.listdir(inp_root))):
        inp_path = "%s/%s" % (inp_root, name)
        if "spec" in inp_path:
            continue
        opt_path1 = "%s/%s" % (opt_root1, name)
        opt_path2 = "%s/%s" % (opt_root2, name)
        paths.append([inp_path, opt_path1, opt_path2])

    ps = []
    for i in range(n_p):
        p = Process(
            target=featureInput.go,
            args=(
                paths[i::n_p],
                f0method,
            ),
        )
        ps.append(p)
        p.start()
    for i in range(n_p):
        ps[i].join()
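
The script reads its configuration from the command line (experiment directory, process count, F0 method) and splits the file list across worker processes by stride slicing, so no coordination between processes is needed. A minimal sketch of that sharding, with hypothetical file names for illustration:

# Invocation (inferred from the sys.argv reads above):
#   python extract_f0_print.py <exp_dir> <n_p> <f0method>
paths = ["0_0.wav", "0_1.wav", "1_0.wav", "1_1.wav", "2_0.wav"]
n_p = 2
shards = [paths[i::n_p] for i in range(n_p)]
print(shards)  # [['0_0.wav', '1_0.wav', '2_0.wav'], ['0_1.wav', '1_1.wav']]

Each worker sees every n_p-th file, which balances the load evenly when files are of similar length.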
lib/infer_pack/modules/train/extract/extract_f0_rmvpe.py ADDED
import os
import sys
import traceback

import parselmouth

now_dir = os.getcwd()
sys.path.append(now_dir)
import logging

import numpy as np
import pyworld

from infer.lib.audio import load_audio

logging.getLogger("numba").setLevel(logging.WARNING)

n_part = int(sys.argv[1])
i_part = int(sys.argv[2])
i_gpu = sys.argv[3]
os.environ["CUDA_VISIBLE_DEVICES"] = str(i_gpu)
exp_dir = sys.argv[4]
# sys.argv entries are strings; parse the flag explicitly so that "False"
# is not treated as truthy when it reaches RMVPE(is_half=...)
is_half = sys.argv[5] == "True"
f = open("%s/extract_f0_feature.log" % exp_dir, "a+")


def printt(strr):
    print(strr)
    f.write("%s\n" % strr)
    f.flush()


class FeatureInput(object):
    def __init__(self, samplerate=16000, hop_size=160):
        self.fs = samplerate
        self.hop = hop_size

        self.f0_bin = 256
        self.f0_max = 1100.0
        self.f0_min = 50.0
        self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700)
        self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700)

    def compute_f0(self, path, f0_method):
        x = load_audio(path, self.fs)
        # p_len = x.shape[0] // self.hop
        if f0_method == "rmvpe":
            if not hasattr(self, "model_rmvpe"):
                from infer.lib.rmvpe import RMVPE

                print("Loading rmvpe model")
                self.model_rmvpe = RMVPE(
                    "assets/rmvpe/rmvpe.pt", is_half=is_half, device="cuda"
                )
            f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)
        return f0

    def coarse_f0(self, f0):
        f0_mel = 1127 * np.log(1 + f0 / 700)
        f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * (
            self.f0_bin - 2
        ) / (self.f0_mel_max - self.f0_mel_min) + 1

        # bin 1 is reserved for unvoiced frames (f0 == 0)
        f0_mel[f0_mel <= 1] = 1
        f0_mel[f0_mel > self.f0_bin - 1] = self.f0_bin - 1
        f0_coarse = np.rint(f0_mel).astype(int)
        assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, (
            f0_coarse.max(),
            f0_coarse.min(),
        )
        return f0_coarse

    def go(self, paths, f0_method):
        if len(paths) == 0:
            printt("no-f0-todo")
        else:
            printt("todo-f0-%s" % len(paths))
            n = max(len(paths) // 5, 1)  # print at most 5 progress messages per process
            for idx, (inp_path, opt_path1, opt_path2) in enumerate(paths):
                try:
                    if idx % n == 0:
                        printt("f0ing,now-%s,all-%s,-%s" % (idx, len(paths), inp_path))
                    if os.path.exists(opt_path1 + ".npy") and os.path.exists(
                        opt_path2 + ".npy"
                    ):
                        continue
                    featur_pit = self.compute_f0(inp_path, f0_method)
                    np.save(
                        opt_path2,
                        featur_pit,
                        allow_pickle=False,
                    )  # nsf
                    coarse_pit = self.coarse_f0(featur_pit)
                    np.save(
                        opt_path1,
                        coarse_pit,
                        allow_pickle=False,
                    )  # ori
                except Exception:
                    printt("f0fail-%s-%s-%s" % (idx, inp_path, traceback.format_exc()))


if __name__ == "__main__":
    # exp_dir = r"E:\codes\py39\dataset\mi-test"
    # n_p = 16
    # f = open("%s/log_extract_f0.log" % exp_dir, "w")
    printt(sys.argv)
    featureInput = FeatureInput()
    paths = []
    inp_root = "%s/1_16k_wavs" % (exp_dir)
    opt_root1 = "%s/2a_f0" % (exp_dir)
    opt_root2 = "%s/2b-f0nsf" % (exp_dir)

    os.makedirs(opt_root1, exist_ok=True)
    os.makedirs(opt_root2, exist_ok=True)
    for name in sorted(list(os.listdir(inp_root))):
        inp_path = "%s/%s" % (inp_root, name)
        if "spec" in inp_path:
            continue
        opt_path1 = "%s/%s" % (opt_root1, name)
        opt_path2 = "%s/%s" % (opt_root2, name)
        paths.append([inp_path, opt_path1, opt_path2])
    try:
        featureInput.go(paths[i_part::n_part], "rmvpe")
    except Exception:
        printt("f0_all_fail-%s" % (traceback.format_exc()))
    # ps = []
    # for i in range(n_p):
    #     p = Process(
    #         target=featureInput.go,
    #         args=(
    #             paths[i::n_p],
    #             f0method,
    #         ),
    #     )
    #     ps.append(p)
    #     p.start()
    # for i in range(n_p):
    #     ps[i].join()
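
Unlike the multiprocessing variant above, this script expects a parent process to launch one instance per GPU; each instance claims the disjoint slice paths[i_part::n_part]. A hedged sketch of such a launcher (the command-line shape is inferred from the sys.argv reads above; the GPU ids and "logs/my-exp" directory are hypothetical):

import subprocess

gpus = ["0", "1"]  # hypothetical GPU ids
n_part = len(gpus)
procs = [
    subprocess.Popen(
        ["python", "extract_f0_rmvpe.py",
         str(n_part), str(i), gpu, "logs/my-exp", "True"]
    )
    for i, gpu in enumerate(gpus)
]
for p in procs:
    p.wait()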
lib/infer_pack/modules/train/extract/extract_f0_rmvpe_dml.py ADDED
import os
import sys
import traceback

import parselmouth

now_dir = os.getcwd()
sys.path.append(now_dir)
import logging

import numpy as np
import pyworld

from infer.lib.audio import load_audio

logging.getLogger("numba").setLevel(logging.WARNING)

exp_dir = sys.argv[1]
import torch_directml

device = torch_directml.device(torch_directml.default_device())
f = open("%s/extract_f0_feature.log" % exp_dir, "a+")


def printt(strr):
    print(strr)
    f.write("%s\n" % strr)
    f.flush()


class FeatureInput(object):
    def __init__(self, samplerate=16000, hop_size=160):
        self.fs = samplerate
        self.hop = hop_size

        self.f0_bin = 256
        self.f0_max = 1100.0
        self.f0_min = 50.0
        self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700)
        self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700)

    def compute_f0(self, path, f0_method):
        x = load_audio(path, self.fs)
        # p_len = x.shape[0] // self.hop
        if f0_method == "rmvpe":
            if not hasattr(self, "model_rmvpe"):
                from infer.lib.rmvpe import RMVPE

                print("Loading rmvpe model")
                self.model_rmvpe = RMVPE(
                    "assets/rmvpe/rmvpe.pt", is_half=False, device=device
                )
            f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)
        return f0

    def coarse_f0(self, f0):
        f0_mel = 1127 * np.log(1 + f0 / 700)
        f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * (
            self.f0_bin - 2
        ) / (self.f0_mel_max - self.f0_mel_min) + 1

        # bin 1 is reserved for unvoiced frames (f0 == 0)
        f0_mel[f0_mel <= 1] = 1
        f0_mel[f0_mel > self.f0_bin - 1] = self.f0_bin - 1
        f0_coarse = np.rint(f0_mel).astype(int)
        assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, (
            f0_coarse.max(),
            f0_coarse.min(),
        )
        return f0_coarse

    def go(self, paths, f0_method):
        if len(paths) == 0:
            printt("no-f0-todo")
        else:
            printt("todo-f0-%s" % len(paths))
            n = max(len(paths) // 5, 1)  # print at most 5 progress messages per process
            for idx, (inp_path, opt_path1, opt_path2) in enumerate(paths):
                try:
                    if idx % n == 0:
                        printt("f0ing,now-%s,all-%s,-%s" % (idx, len(paths), inp_path))
                    if os.path.exists(opt_path1 + ".npy") and os.path.exists(
                        opt_path2 + ".npy"
                    ):
                        continue
                    featur_pit = self.compute_f0(inp_path, f0_method)
                    np.save(
                        opt_path2,
                        featur_pit,
                        allow_pickle=False,
                    )  # nsf
                    coarse_pit = self.coarse_f0(featur_pit)
                    np.save(
                        opt_path1,
                        coarse_pit,
                        allow_pickle=False,
                    )  # ori
                except Exception:
                    printt("f0fail-%s-%s-%s" % (idx, inp_path, traceback.format_exc()))


if __name__ == "__main__":
    # exp_dir = r"E:\codes\py39\dataset\mi-test"
    # n_p = 16
    # f = open("%s/log_extract_f0.log" % exp_dir, "w")
    printt(sys.argv)
    featureInput = FeatureInput()
    paths = []
    inp_root = "%s/1_16k_wavs" % (exp_dir)
    opt_root1 = "%s/2a_f0" % (exp_dir)
    opt_root2 = "%s/2b-f0nsf" % (exp_dir)

    os.makedirs(opt_root1, exist_ok=True)
    os.makedirs(opt_root2, exist_ok=True)
    for name in sorted(list(os.listdir(inp_root))):
        inp_path = "%s/%s" % (inp_root, name)
        if "spec" in inp_path:
            continue
        opt_path1 = "%s/%s" % (opt_root1, name)
        opt_path2 = "%s/%s" % (opt_root2, name)
        paths.append([inp_path, opt_path1, opt_path2])
    try:
        featureInput.go(paths, "rmvpe")
    except Exception:
        printt("f0_all_fail-%s" % (traceback.format_exc()))
    # ps = []
    # for i in range(n_p):
    #     p = Process(
    #         target=featureInput.go,
    #         args=(
    #             paths[i::n_p],
    #             f0method,
    #         ),
    #     )
    #     ps.append(p)
    #     p.start()
    # for i in range(n_p):
    #     ps[i].join()
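
All three extractors share the same coarse_f0 quantization: F0 in Hz is mapped to the mel scale and bucketed into 255 integer bins, with bin 1 reserved for unvoiced frames. A standalone sketch of that mapping, using the constants from the class:

import numpy as np

f0_min, f0_max, f0_bin = 50.0, 1100.0, 256
f0_mel_min = 1127 * np.log(1 + f0_min / 700)
f0_mel_max = 1127 * np.log(1 + f0_max / 700)

f0 = np.array([0.0, 50.0, 440.0, 1100.0])  # Hz; 0.0 = unvoiced
f0_mel = 1127 * np.log(1 + f0 / 700)
f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * (f0_bin - 2) / (
    f0_mel_max - f0_mel_min
) + 1
f0_mel[f0_mel <= 1] = 1
f0_mel[f0_mel > f0_bin - 1] = f0_bin - 1
print(np.rint(f0_mel).astype(int))  # [  1   1 122 255]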
lib/infer_pack/modules/train/extract_feature_print.py ADDED
import os
import sys
import traceback

os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"

device = sys.argv[1]
n_part = int(sys.argv[2])
i_part = int(sys.argv[3])
if len(sys.argv) == 6:
    exp_dir = sys.argv[4]
    version = sys.argv[5]
else:
    i_gpu = sys.argv[4]
    exp_dir = sys.argv[5]
    os.environ["CUDA_VISIBLE_DEVICES"] = str(i_gpu)
    version = sys.argv[6]
import fairseq
import numpy as np
import soundfile as sf
import torch
import torch.nn.functional as F

if "privateuseone" not in device:
    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda"
    elif torch.backends.mps.is_available():
        device = "mps"
else:
    import torch_directml

    device = torch_directml.device(torch_directml.default_device())

    def forward_dml(ctx, x, scale):
        ctx.scale = scale
        res = x.clone().detach()
        return res

    fairseq.modules.grad_multiply.GradMultiply.forward = forward_dml

f = open("%s/extract_f0_feature.log" % exp_dir, "a+")


def printt(strr):
    print(strr)
    f.write("%s\n" % strr)
    f.flush()


printt(sys.argv)
model_path = "assets/hubert/hubert_base.pt"

printt(exp_dir)
wavPath = "%s/1_16k_wavs" % exp_dir
outPath = (
    "%s/3_feature256" % exp_dir if version == "v1" else "%s/3_feature768" % exp_dir
)
os.makedirs(outPath, exist_ok=True)


# wave must be 16k, hop_size=320
def readwave(wav_path, normalize=False):
    wav, sr = sf.read(wav_path)
    assert sr == 16000
    feats = torch.from_numpy(wav).float()
    if feats.dim() == 2:  # stereo: average the two channels
        feats = feats.mean(-1)
    assert feats.dim() == 1, feats.dim()
    if normalize:
        with torch.no_grad():
            feats = F.layer_norm(feats, feats.shape)
    feats = feats.view(1, -1)
    return feats


# HuBERT model
printt("load model(s) from {}".format(model_path))
# verify that the HuBERT checkpoint exists before loading
if not os.access(model_path, os.F_OK):
    printt(
        "Error: Extracting is shut down because %s does not exist, you may download it from https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main"
        % model_path
    )
    exit(0)
models, saved_cfg, task = fairseq.checkpoint_utils.load_model_ensemble_and_task(
    [model_path],
    suffix="",
)
model = models[0]
model = model.to(device)
printt("move model to %s" % device)
if device not in ["mps", "cpu"]:
    model = model.half()
model.eval()

todo = sorted(list(os.listdir(wavPath)))[i_part::n_part]
n = max(1, len(todo) // 10)  # print at most 10 progress messages
if len(todo) == 0:
    printt("no-feature-todo")
else:
    printt("all-feature-%s" % len(todo))
    for idx, file in enumerate(todo):
        try:
            if file.endswith(".wav"):
                wav_path = "%s/%s" % (wavPath, file)
                out_path = "%s/%s" % (outPath, file.replace("wav", "npy"))

                if os.path.exists(out_path):
                    continue

                feats = readwave(wav_path, normalize=saved_cfg.task.normalize)
                padding_mask = torch.BoolTensor(feats.shape).fill_(False)
                inputs = {
                    "source": feats.half().to(device)
                    if device not in ["mps", "cpu"]
                    else feats.to(device),
                    "padding_mask": padding_mask.to(device),
                    "output_layer": 9 if version == "v1" else 12,
                }
                with torch.no_grad():
                    logits = model.extract_features(**inputs)
                    feats = (
                        model.final_proj(logits[0]) if version == "v1" else logits[0]
                    )

                feats = feats.squeeze(0).float().cpu().numpy()
                if np.isnan(feats).sum() == 0:
                    np.save(out_path, feats, allow_pickle=False)
                else:
                    printt("%s-contains nan" % file)
                if idx % n == 0:
                    printt("now-%s,all-%s,%s,%s" % (idx, len(todo), file, feats.shape))
        except Exception:
            printt(traceback.format_exc())
    printt("all-feature-done")
lib/infer_pack/modules/train/preprocess.py ADDED
import multiprocessing
import os
import sys
import traceback

from scipy import signal

now_dir = os.getcwd()
sys.path.append(now_dir)
print(sys.argv)
inp_root = sys.argv[1]
sr = int(sys.argv[2])
n_p = int(sys.argv[3])
exp_dir = sys.argv[4]
noparallel = sys.argv[5] == "True"
per = float(sys.argv[6])

import librosa
import numpy as np
from scipy.io import wavfile

from infer.lib.audio import load_audio
from infer.lib.slicer2 import Slicer

mutex = multiprocessing.Lock()
f = open("%s/preprocess.log" % exp_dir, "a+")


def println(strr):
    mutex.acquire()
    print(strr)
    f.write("%s\n" % strr)
    f.flush()
    mutex.release()


class PreProcess:
    def __init__(self, sr, exp_dir, per=3.0):
        self.slicer = Slicer(
            sr=sr,
            threshold=-42,
            min_length=1500,
            min_interval=400,
            hop_size=15,
            max_sil_kept=500,
        )
        self.sr = sr
        self.bh, self.ah = signal.butter(N=5, Wn=48, btype="high", fs=self.sr)
        self.per = per
        self.overlap = 0.3
        self.tail = self.per + self.overlap
        self.max = 0.9
        self.alpha = 0.75
        self.exp_dir = exp_dir
        self.gt_wavs_dir = "%s/0_gt_wavs" % exp_dir
        self.wavs16k_dir = "%s/1_16k_wavs" % exp_dir
        os.makedirs(self.exp_dir, exist_ok=True)
        os.makedirs(self.gt_wavs_dir, exist_ok=True)
        os.makedirs(self.wavs16k_dir, exist_ok=True)

    def norm_write(self, tmp_audio, idx0, idx1):
        tmp_max = np.abs(tmp_audio).max()
        if tmp_max > 2.5:
            print("%s-%s-%s-filtered" % (idx0, idx1, tmp_max))
            return
        tmp_audio = (tmp_audio / tmp_max * (self.max * self.alpha)) + (
            1 - self.alpha
        ) * tmp_audio
        wavfile.write(
            "%s/%s_%s.wav" % (self.gt_wavs_dir, idx0, idx1),
            self.sr,
            tmp_audio.astype(np.float32),
        )
        tmp_audio = librosa.resample(
            tmp_audio, orig_sr=self.sr, target_sr=16000
        )  # , res_type="soxr_vhq"
        wavfile.write(
            "%s/%s_%s.wav" % (self.wavs16k_dir, idx0, idx1),
            16000,
            tmp_audio.astype(np.float32),
        )

    def pipeline(self, path, idx0):
        try:
            audio = load_audio(path, self.sr)
            # a zero-phase digital filter would cause pre-ringing noise,
            # so a causal high-pass filter is used instead
            # audio = signal.filtfilt(self.bh, self.ah, audio)
            audio = signal.lfilter(self.bh, self.ah, audio)

            idx1 = 0
            for audio in self.slicer.slice(audio):
                i = 0
                while True:
                    start = int(self.sr * (self.per - self.overlap) * i)
                    i += 1
                    if len(audio[start:]) > self.tail * self.sr:
                        tmp_audio = audio[start : start + int(self.per * self.sr)]
                        self.norm_write(tmp_audio, idx0, idx1)
                        idx1 += 1
                    else:
                        tmp_audio = audio[start:]
                        idx1 += 1
                        break
                self.norm_write(tmp_audio, idx0, idx1)
            println("%s->Suc." % path)
        except Exception:
            println("%s->%s" % (path, traceback.format_exc()))

    def pipeline_mp(self, infos):
        for path, idx0 in infos:
            self.pipeline(path, idx0)

    def pipeline_mp_inp_dir(self, inp_root, n_p):
        try:
            infos = [
                ("%s/%s" % (inp_root, name), idx)
                for idx, name in enumerate(sorted(list(os.listdir(inp_root))))
            ]
            if noparallel:
                for i in range(n_p):
                    self.pipeline_mp(infos[i::n_p])
            else:
                ps = []
                for i in range(n_p):
                    p = multiprocessing.Process(
                        target=self.pipeline_mp, args=(infos[i::n_p],)
                    )
                    ps.append(p)
                    p.start()
                for i in range(n_p):
                    ps[i].join()
        except Exception:
            println("Fail. %s" % traceback.format_exc())


def preprocess_trainset(inp_root, sr, n_p, exp_dir, per):
    pp = PreProcess(sr, exp_dir, per)
    println("start preprocess")
    println(sys.argv)
    pp.pipeline_mp_inp_dir(inp_root, n_p)
    println("end preprocess")


if __name__ == "__main__":
    preprocess_trainset(inp_root, sr, n_p, exp_dir, per)
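
PreProcess.pipeline cuts each voiced slice into per-second training segments whose windows overlap by 0.3 s, writing whatever remains once it is shorter than per + overlap seconds. A sketch of the resulting window starts (per = 3.0 matches the constructor default; the 40000 Hz sample rate is an assumption for illustration):

sr, per, overlap = 40000, 3.0, 0.3
starts = [int(sr * (per - overlap) * i) for i in range(4)]
print(starts)  # [0, 108000, 216000, 324000] -> consecutive hops of 2.7 s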
lib/infer_pack/modules/train/train.py ADDED
import os
import sys
import logging

logger = logging.getLogger(__name__)

now_dir = os.getcwd()
sys.path.append(os.path.join(now_dir))

import datetime

from infer.lib.train import utils

hps = utils.get_hparams()
os.environ["CUDA_VISIBLE_DEVICES"] = hps.gpus.replace("-", ",")
n_gpus = len(hps.gpus.split("-"))
from random import randint, shuffle

import torch

try:
    import intel_extension_for_pytorch as ipex  # pylint: disable=import-error, unused-import

    if torch.xpu.is_available():
        from infer.modules.ipex import ipex_init
        from infer.modules.ipex.gradscaler import gradscaler_init
        from torch.xpu.amp import autocast

        GradScaler = gradscaler_init()
        ipex_init()
    else:
        from torch.cuda.amp import GradScaler, autocast
except Exception:
    from torch.cuda.amp import GradScaler, autocast

torch.backends.cudnn.deterministic = False
torch.backends.cudnn.benchmark = False
from time import sleep
from time import time as ttime

import torch.distributed as dist
import torch.multiprocessing as mp
from torch.nn import functional as F
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter

from infer.lib.infer_pack import commons
from infer.lib.train.data_utils import (
    DistributedBucketSampler,
    TextAudioCollate,
    TextAudioCollateMultiNSFsid,
    TextAudioLoader,
    TextAudioLoaderMultiNSFsid,
)

if hps.version == "v1":
    from infer.lib.infer_pack.models import MultiPeriodDiscriminator
    from infer.lib.infer_pack.models import SynthesizerTrnMs256NSFsid as RVC_Model_f0
    from infer.lib.infer_pack.models import (
        SynthesizerTrnMs256NSFsid_nono as RVC_Model_nof0,
    )
else:
    from infer.lib.infer_pack.models import (
        SynthesizerTrnMs768NSFsid as RVC_Model_f0,
        SynthesizerTrnMs768NSFsid_nono as RVC_Model_nof0,
        MultiPeriodDiscriminatorV2 as MultiPeriodDiscriminator,
    )

from infer.lib.train.losses import (
    discriminator_loss,
    feature_loss,
    generator_loss,
    kl_loss,
)
from infer.lib.train.mel_processing import mel_spectrogram_torch, spec_to_mel_torch
from infer.lib.train.process_ckpt import savee

global_step = 0


class EpochRecorder:
    def __init__(self):
        self.last_time = ttime()

    def record(self):
        now_time = ttime()
        elapsed_time = now_time - self.last_time
        self.last_time = now_time
        elapsed_time_str = str(datetime.timedelta(seconds=elapsed_time))
        current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        return f"[{current_time}] | ({elapsed_time_str})"


def main():
    n_gpus = torch.cuda.device_count()

    if not torch.cuda.is_available() and torch.backends.mps.is_available():
        n_gpus = 1
    if n_gpus < 1:
        # patch to unblock people without gpus. there is probably a better way.
        print("NO GPU DETECTED: falling back to CPU - this may take a while")
        n_gpus = 1
    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = str(randint(20000, 55555))
    children = []
    for i in range(n_gpus):
        subproc = mp.Process(
            target=run,
            args=(i, n_gpus, hps),
        )
        children.append(subproc)
        subproc.start()

    for i in range(n_gpus):
        children[i].join()


def run(
    rank,
    n_gpus,
    hps,
):
    global global_step
    if rank == 0:
        logger = utils.get_logger(hps.model_dir)
        logger.info(hps)
        # utils.check_git_hash(hps.model_dir)
        writer = SummaryWriter(log_dir=hps.model_dir)
        writer_eval = SummaryWriter(log_dir=os.path.join(hps.model_dir, "eval"))

    dist.init_process_group(
        backend="gloo", init_method="env://", world_size=n_gpus, rank=rank
    )
    torch.manual_seed(hps.train.seed)
    if torch.cuda.is_available():
        torch.cuda.set_device(rank)

    if hps.if_f0 == 1:
        train_dataset = TextAudioLoaderMultiNSFsid(hps.data.training_files, hps.data)
    else:
        train_dataset = TextAudioLoader(hps.data.training_files, hps.data)
    train_sampler = DistributedBucketSampler(
        train_dataset,
        hps.train.batch_size * n_gpus,
        # [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1200, 1400],  # 16s
        [100, 200, 300, 400, 500, 600, 700, 800, 900],  # 16s
        num_replicas=n_gpus,
        rank=rank,
        shuffle=True,
    )
    # The dataloader's workers may run out of shared memory; if so, raise the
    # shared-memory limit. num_workers was reduced from 8 to 4 for this reason.
    if hps.if_f0 == 1:
        collate_fn = TextAudioCollateMultiNSFsid()
    else:
        collate_fn = TextAudioCollate()
    train_loader = DataLoader(
        train_dataset,
        num_workers=4,
        shuffle=False,
        pin_memory=True,
        collate_fn=collate_fn,
        batch_sampler=train_sampler,
        persistent_workers=True,
        prefetch_factor=8,
    )
    if hps.if_f0 == 1:
        net_g = RVC_Model_f0(
            hps.data.filter_length // 2 + 1,
            hps.train.segment_size // hps.data.hop_length,
            **hps.model,
            is_half=hps.train.fp16_run,
            sr=hps.sample_rate,
        )
    else:
        net_g = RVC_Model_nof0(
            hps.data.filter_length // 2 + 1,
            hps.train.segment_size // hps.data.hop_length,
            **hps.model,
            is_half=hps.train.fp16_run,
        )
    if torch.cuda.is_available():
        net_g = net_g.cuda(rank)
    net_d = MultiPeriodDiscriminator(hps.model.use_spectral_norm)
    if torch.cuda.is_available():
        net_d = net_d.cuda(rank)
    optim_g = torch.optim.AdamW(
        net_g.parameters(),
        hps.train.learning_rate,
        betas=hps.train.betas,
        eps=hps.train.eps,
    )
    optim_d = torch.optim.AdamW(
        net_d.parameters(),
        hps.train.learning_rate,
        betas=hps.train.betas,
        eps=hps.train.eps,
    )
    # net_g = DDP(net_g, device_ids=[rank], find_unused_parameters=True)
    # net_d = DDP(net_d, device_ids=[rank], find_unused_parameters=True)
    if hasattr(torch, "xpu") and torch.xpu.is_available():
        pass
    elif torch.cuda.is_available():
        net_g = DDP(net_g, device_ids=[rank])
        net_d = DDP(net_d, device_ids=[rank])
    else:
        net_g = DDP(net_g)
        net_d = DDP(net_d)

    try:  # resume automatically if checkpoints exist
        _, _, _, epoch_str = utils.load_checkpoint(
            utils.latest_checkpoint_path(hps.model_dir, "D_*.pth"), net_d, optim_d
        )  # loading D usually succeeds
        if rank == 0:
            logger.info("loaded D")
        # _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"), net_g, optim_g, load_opt=0)
        _, _, _, epoch_str = utils.load_checkpoint(
            utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"), net_g, optim_g
        )
        global_step = (epoch_str - 1) * len(train_loader)
        # epoch_str = 1
        # global_step = 0
    except Exception:  # nothing to resume on a first run; load the pretrained weights
        # traceback.print_exc()
        epoch_str = 1
        global_step = 0
        if hps.pretrainG != "":
            if rank == 0:
                logger.info("loaded pretrained %s" % (hps.pretrainG))
            if hasattr(net_g, "module"):
                logger.info(
                    net_g.module.load_state_dict(
                        torch.load(hps.pretrainG, map_location="cpu")["model"]
                    )
                )  # the optimizer state is deliberately not loaded
            else:
                logger.info(
                    net_g.load_state_dict(
                        torch.load(hps.pretrainG, map_location="cpu")["model"]
                    )
                )  # the optimizer state is deliberately not loaded
        if hps.pretrainD != "":
            if rank == 0:
                logger.info("loaded pretrained %s" % (hps.pretrainD))
            if hasattr(net_d, "module"):
                logger.info(
                    net_d.module.load_state_dict(
                        torch.load(hps.pretrainD, map_location="cpu")["model"]
                    )
                )
            else:
                logger.info(
                    net_d.load_state_dict(
                        torch.load(hps.pretrainD, map_location="cpu")["model"]
                    )
                )

    scheduler_g = torch.optim.lr_scheduler.ExponentialLR(
        optim_g, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2
    )
    scheduler_d = torch.optim.lr_scheduler.ExponentialLR(
        optim_d, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2
    )

    scaler = GradScaler(enabled=hps.train.fp16_run)

    cache = []
    for epoch in range(epoch_str, hps.train.epochs + 1):
        if rank == 0:
            train_and_evaluate(
                rank,
                epoch,
                hps,
                [net_g, net_d],
                [optim_g, optim_d],
                [scheduler_g, scheduler_d],
                scaler,
                [train_loader, None],
                logger,
                [writer, writer_eval],
                cache,
            )
        else:
            train_and_evaluate(
                rank,
                epoch,
                hps,
                [net_g, net_d],
                [optim_g, optim_d],
                [scheduler_g, scheduler_d],
                scaler,
                [train_loader, None],
                None,
                None,
                cache,
            )
        scheduler_g.step()
        scheduler_d.step()


def train_and_evaluate(
    rank, epoch, hps, nets, optims, schedulers, scaler, loaders, logger, writers, cache
):
    net_g, net_d = nets
    optim_g, optim_d = optims
    train_loader, eval_loader = loaders
    if writers is not None:
        writer, writer_eval = writers

    train_loader.batch_sampler.set_epoch(epoch)
    global global_step

    net_g.train()
    net_d.train()

    # Prepare data iterator
    if hps.if_cache_data_in_gpu:
        # Use cache
        data_iterator = cache
        if cache == []:
            # Make new cache
            for batch_idx, info in enumerate(train_loader):
                # Unpack
                if hps.if_f0 == 1:
                    (
                        phone,
                        phone_lengths,
                        pitch,
                        pitchf,
                        spec,
                        spec_lengths,
                        wave,
                        wave_lengths,
                        sid,
                    ) = info
                else:
                    (
                        phone,
                        phone_lengths,
                        spec,
                        spec_lengths,
                        wave,
                        wave_lengths,
                        sid,
                    ) = info
                # Load on CUDA
                if torch.cuda.is_available():
                    phone = phone.cuda(rank, non_blocking=True)
                    phone_lengths = phone_lengths.cuda(rank, non_blocking=True)
                    if hps.if_f0 == 1:
                        pitch = pitch.cuda(rank, non_blocking=True)
                        pitchf = pitchf.cuda(rank, non_blocking=True)
                    sid = sid.cuda(rank, non_blocking=True)
                    spec = spec.cuda(rank, non_blocking=True)
                    spec_lengths = spec_lengths.cuda(rank, non_blocking=True)
                    wave = wave.cuda(rank, non_blocking=True)
                    wave_lengths = wave_lengths.cuda(rank, non_blocking=True)
                # Cache on list
                if hps.if_f0 == 1:
                    cache.append(
                        (
                            batch_idx,
                            (
                                phone,
                                phone_lengths,
                                pitch,
                                pitchf,
                                spec,
                                spec_lengths,
                                wave,
                                wave_lengths,
                                sid,
                            ),
                        )
                    )
                else:
                    cache.append(
                        (
                            batch_idx,
                            (
                                phone,
                                phone_lengths,
                                spec,
                                spec_lengths,
                                wave,
                                wave_lengths,
                                sid,
                            ),
                        )
                    )
        else:
            # Load shuffled cache
            shuffle(cache)
    else:
        # Loader
        data_iterator = enumerate(train_loader)

    # Run steps
    epoch_recorder = EpochRecorder()
    for batch_idx, info in data_iterator:
        # Data
        ## Unpack
        if hps.if_f0 == 1:
            (
                phone,
                phone_lengths,
                pitch,
                pitchf,
                spec,
                spec_lengths,
                wave,
                wave_lengths,
                sid,
            ) = info
        else:
            phone, phone_lengths, spec, spec_lengths, wave, wave_lengths, sid = info
        ## Load on CUDA
        if not hps.if_cache_data_in_gpu and torch.cuda.is_available():
            phone = phone.cuda(rank, non_blocking=True)
            phone_lengths = phone_lengths.cuda(rank, non_blocking=True)
            if hps.if_f0 == 1:
                pitch = pitch.cuda(rank, non_blocking=True)
                pitchf = pitchf.cuda(rank, non_blocking=True)
            sid = sid.cuda(rank, non_blocking=True)
            spec = spec.cuda(rank, non_blocking=True)
            spec_lengths = spec_lengths.cuda(rank, non_blocking=True)
            wave = wave.cuda(rank, non_blocking=True)
            # wave_lengths = wave_lengths.cuda(rank, non_blocking=True)

        # Calculate
        with autocast(enabled=hps.train.fp16_run):
            if hps.if_f0 == 1:
                (
                    y_hat,
                    ids_slice,
                    x_mask,
                    z_mask,
                    (z, z_p, m_p, logs_p, m_q, logs_q),
                ) = net_g(phone, phone_lengths, pitch, pitchf, spec, spec_lengths, sid)
            else:
                (
                    y_hat,
                    ids_slice,
                    x_mask,
                    z_mask,
                    (z, z_p, m_p, logs_p, m_q, logs_q),
                ) = net_g(phone, phone_lengths, spec, spec_lengths, sid)
            mel = spec_to_mel_torch(
                spec,
                hps.data.filter_length,
                hps.data.n_mel_channels,
                hps.data.sampling_rate,
                hps.data.mel_fmin,
                hps.data.mel_fmax,
            )
            y_mel = commons.slice_segments(
                mel, ids_slice, hps.train.segment_size // hps.data.hop_length
            )
            with autocast(enabled=False):
                y_hat_mel = mel_spectrogram_torch(
                    y_hat.float().squeeze(1),
                    hps.data.filter_length,
                    hps.data.n_mel_channels,
                    hps.data.sampling_rate,
                    hps.data.hop_length,
                    hps.data.win_length,
                    hps.data.mel_fmin,
                    hps.data.mel_fmax,
                )
            if hps.train.fp16_run:
                y_hat_mel = y_hat_mel.half()
            wave = commons.slice_segments(
                wave, ids_slice * hps.data.hop_length, hps.train.segment_size
            )  # slice

            # Discriminator
            y_d_hat_r, y_d_hat_g, _, _ = net_d(wave, y_hat.detach())
            with autocast(enabled=False):
                loss_disc, losses_disc_r, losses_disc_g = discriminator_loss(
                    y_d_hat_r, y_d_hat_g
                )
        optim_d.zero_grad()
        scaler.scale(loss_disc).backward()
        scaler.unscale_(optim_d)
        grad_norm_d = commons.clip_grad_value_(net_d.parameters(), None)
        scaler.step(optim_d)

        with autocast(enabled=hps.train.fp16_run):
            # Generator
            y_d_hat_r, y_d_hat_g, fmap_r, fmap_g = net_d(wave, y_hat)
            with autocast(enabled=False):
                loss_mel = F.l1_loss(y_mel, y_hat_mel) * hps.train.c_mel
                loss_kl = kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * hps.train.c_kl
                loss_fm = feature_loss(fmap_r, fmap_g)
                loss_gen, losses_gen = generator_loss(y_d_hat_g)
                loss_gen_all = loss_gen + loss_fm + loss_mel + loss_kl
        optim_g.zero_grad()
        scaler.scale(loss_gen_all).backward()
        scaler.unscale_(optim_g)
        grad_norm_g = commons.clip_grad_value_(net_g.parameters(), None)
        scaler.step(optim_g)
        scaler.update()

        if rank == 0:
            if global_step % hps.train.log_interval == 0:
                lr = optim_g.param_groups[0]["lr"]
                logger.info(
                    "Train Epoch: {} [{:.0f}%]".format(
                        epoch, 100.0 * batch_idx / len(train_loader)
                    )
                )
                # Clamp outliers so the TensorBoard curves stay readable
                if loss_mel > 75:
                    loss_mel = 75
                if loss_kl > 9:
                    loss_kl = 9

                logger.info([global_step, lr])
                logger.info(
                    f"loss_disc={loss_disc:.3f}, loss_gen={loss_gen:.3f}, loss_fm={loss_fm:.3f}, loss_mel={loss_mel:.3f}, loss_kl={loss_kl:.3f}"
                )
                scalar_dict = {
                    "loss/g/total": loss_gen_all,
                    "loss/d/total": loss_disc,
                    "learning_rate": lr,
                    "grad_norm_d": grad_norm_d,
                    "grad_norm_g": grad_norm_g,
                }
                scalar_dict.update(
                    {
                        "loss/g/fm": loss_fm,
                        "loss/g/mel": loss_mel,
                        "loss/g/kl": loss_kl,
                    }
                )

                scalar_dict.update(
                    {"loss/g/{}".format(i): v for i, v in enumerate(losses_gen)}
                )
                scalar_dict.update(
                    {"loss/d_r/{}".format(i): v for i, v in enumerate(losses_disc_r)}
                )
                scalar_dict.update(
                    {"loss/d_g/{}".format(i): v for i, v in enumerate(losses_disc_g)}
                )
                image_dict = {
                    "slice/mel_org": utils.plot_spectrogram_to_numpy(
                        y_mel[0].data.cpu().numpy()
                    ),
                    "slice/mel_gen": utils.plot_spectrogram_to_numpy(
                        y_hat_mel[0].data.cpu().numpy()
                    ),
                    "all/mel": utils.plot_spectrogram_to_numpy(
                        mel[0].data.cpu().numpy()
                    ),
                }
                utils.summarize(
                    writer=writer,
                    global_step=global_step,
                    images=image_dict,
                    scalars=scalar_dict,
                )
        global_step += 1
    # /Run steps

    if epoch % hps.save_every_epoch == 0 and rank == 0:
        if hps.if_latest == 0:
            utils.save_checkpoint(
                net_g,
                optim_g,
                hps.train.learning_rate,
                epoch,
                os.path.join(hps.model_dir, "G_{}.pth".format(global_step)),
            )
            utils.save_checkpoint(
                net_d,
                optim_d,
                hps.train.learning_rate,
                epoch,
                os.path.join(hps.model_dir, "D_{}.pth".format(global_step)),
            )
        else:
            utils.save_checkpoint(
                net_g,
                optim_g,
                hps.train.learning_rate,
                epoch,
                os.path.join(hps.model_dir, "G_{}.pth".format(2333333)),
            )
            utils.save_checkpoint(
                net_d,
                optim_d,
                hps.train.learning_rate,
                epoch,
                os.path.join(hps.model_dir, "D_{}.pth".format(2333333)),
            )
        if rank == 0 and hps.save_every_weights == "1":
            if hasattr(net_g, "module"):
                ckpt = net_g.module.state_dict()
            else:
                ckpt = net_g.state_dict()
            logger.info(
                "saving ckpt %s_e%s:%s"
                % (
                    hps.name,
                    epoch,
                    savee(
                        ckpt,
                        hps.sample_rate,
                        hps.if_f0,
                        hps.name + "_e%s_s%s" % (epoch, global_step),
                        epoch,
                        hps.version,
                        hps,
                    ),
                )
            )

    if rank == 0:
        logger.info("====> Epoch: {} {}".format(epoch, epoch_recorder.record()))
    if epoch >= hps.total_epoch and rank == 0:
        logger.info("Training is done. The program is closed.")

        if hasattr(net_g, "module"):
            ckpt = net_g.module.state_dict()
        else:
            ckpt = net_g.state_dict()
        logger.info(
            "saving final ckpt:%s"
            % (
                savee(
                    ckpt, hps.sample_rate, hps.if_f0, hps.name, epoch, hps.version, hps
                )
            )
        )
        sleep(1)
        os._exit(2333333)


if __name__ == "__main__":
    torch.multiprocessing.set_start_method("spawn")
    main()
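
train_and_evaluate runs a classic two-optimizer GAN step under a single GradScaler: the discriminator loss is scaled, unscaled, clipped, and stepped first on the detached generator output, then the generator loss goes through the same sequence, and scaler.update() is called exactly once per iteration. A minimal self-contained sketch of that pattern (toy modules standing in for the project's networks; enabled=False keeps it runnable on CPU):

import torch

net_g = torch.nn.Linear(4, 4)   # stand-in generator
net_d = torch.nn.Linear(4, 1)   # stand-in discriminator
optim_g = torch.optim.AdamW(net_g.parameters(), 1e-4)
optim_d = torch.optim.AdamW(net_d.parameters(), 1e-4)
scaler = torch.cuda.amp.GradScaler(enabled=False)  # enabled=hps.train.fp16_run in the script

x = torch.randn(8, 4)
y_hat = net_g(x)

# Discriminator step on the detached generator output
loss_d = net_d(y_hat.detach()).pow(2).mean()
optim_d.zero_grad()
scaler.scale(loss_d).backward()
scaler.unscale_(optim_d)
scaler.step(optim_d)

# Generator step through the (updated) discriminator
loss_g = (net_d(y_hat) - 1).pow(2).mean()
optim_g.zero_grad()
scaler.scale(loss_g).backward()
scaler.unscale_(optim_g)
scaler.step(optim_g)
scaler.update()  # once per iteration, after both optimizer steps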