Altadmin committed
Commit 890b83f
1 Parent(s): f4dd36d

Upload 38 files
Dockerfile ADDED
@@ -0,0 +1,13 @@
+ # syntax=docker/dockerfile:1
+
+ FROM python:3.10-bullseye
+
+ EXPOSE 7865
+
+ WORKDIR /app
+
+ COPY . .
+
+ RUN pip3 install -r requirements.txt
+
+ CMD ["python3", "infer-web.py"]
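For reference, serving this image typically amounts to `docker build -t rvc .` (the `rvc` tag is an arbitrary example) followed by `docker run -p 7865:7865 rvc`, which publishes the Gradio port exposed above. Note that `requirements.txt` must be present in the build context for the `RUN pip3 install` step to succeed.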
MDXNet.py ADDED
@@ -0,0 +1,274 @@
+ import soundfile as sf
+ import torch, pdb, time, argparse, os, warnings, sys, librosa
+ import numpy as np
+ import onnxruntime as ort
+ from scipy.io.wavfile import write
+ from tqdm import tqdm
+ import torch.nn as nn
+
+ dim_c = 4
+
+
+ class Conv_TDF_net_trim:
+     def __init__(
+         self, device, model_name, target_name, L, dim_f, dim_t, n_fft, hop=1024
+     ):
+         super(Conv_TDF_net_trim, self).__init__()
+
+         self.dim_f = dim_f
+         self.dim_t = 2**dim_t
+         self.n_fft = n_fft
+         self.hop = hop
+         self.n_bins = self.n_fft // 2 + 1
+         self.chunk_size = hop * (self.dim_t - 1)
+         self.window = torch.hann_window(window_length=self.n_fft, periodic=True).to(
+             device
+         )
+         self.target_name = target_name
+         self.blender = "blender" in model_name
+
+         out_c = dim_c * 4 if target_name == "*" else dim_c
+         self.freq_pad = torch.zeros(
+             [1, out_c, self.n_bins - self.dim_f, self.dim_t]
+         ).to(device)
+
+         self.n = L // 2
+
+     def stft(self, x):
+         x = x.reshape([-1, self.chunk_size])
+         x = torch.stft(
+             x,
+             n_fft=self.n_fft,
+             hop_length=self.hop,
+             window=self.window,
+             center=True,
+             return_complex=True,
+         )
+         x = torch.view_as_real(x)
+         x = x.permute([0, 3, 1, 2])
+         x = x.reshape([-1, 2, 2, self.n_bins, self.dim_t]).reshape(
+             [-1, dim_c, self.n_bins, self.dim_t]
+         )
+         return x[:, :, : self.dim_f]
+
+     def istft(self, x, freq_pad=None):
+         freq_pad = (
+             self.freq_pad.repeat([x.shape[0], 1, 1, 1])
+             if freq_pad is None
+             else freq_pad
+         )
+         x = torch.cat([x, freq_pad], -2)
+         c = 4 * 2 if self.target_name == "*" else 2
+         x = x.reshape([-1, c, 2, self.n_bins, self.dim_t]).reshape(
+             [-1, 2, self.n_bins, self.dim_t]
+         )
+         x = x.permute([0, 2, 3, 1])
+         x = x.contiguous()
+         x = torch.view_as_complex(x)
+         x = torch.istft(
+             x, n_fft=self.n_fft, hop_length=self.hop, window=self.window, center=True
+         )
+         return x.reshape([-1, c, self.chunk_size])
+
+
+ def get_models(device, dim_f, dim_t, n_fft):
+     return Conv_TDF_net_trim(
+         device=device,
+         model_name="Conv-TDF",
+         target_name="vocals",
+         L=11,
+         dim_f=dim_f,
+         dim_t=dim_t,
+         n_fft=n_fft,
+     )
+
+
+ warnings.filterwarnings("ignore")
+ cpu = torch.device("cpu")
+ if torch.cuda.is_available():
+     device = torch.device("cuda:0")
+ elif torch.backends.mps.is_available():
+     device = torch.device("mps")
+ else:
+     device = torch.device("cpu")
+
+
+ class Predictor:
+     def __init__(self, args):
+         self.args = args
+         self.model_ = get_models(
+             device=cpu, dim_f=args.dim_f, dim_t=args.dim_t, n_fft=args.n_fft
+         )
+         self.model = ort.InferenceSession(
+             os.path.join(args.onnx, self.model_.target_name + ".onnx"),
+             providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
+         )
+         print("onnx load done")
+
+     def demix(self, mix):
+         samples = mix.shape[-1]
+         margin = self.args.margin
+         chunk_size = self.args.chunks * 44100
+         assert not margin == 0, "margin cannot be zero!"
+         if margin > chunk_size:
+             margin = chunk_size
+
+         segmented_mix = {}
+
+         if self.args.chunks == 0 or samples < chunk_size:
+             chunk_size = samples
+
+         counter = -1
+         for skip in range(0, samples, chunk_size):
+             counter += 1
+
+             s_margin = 0 if counter == 0 else margin
+             end = min(skip + chunk_size + margin, samples)
+
+             start = skip - s_margin
+
+             segmented_mix[skip] = mix[:, start:end].copy()
+             if end == samples:
+                 break
+
+         sources = self.demix_base(segmented_mix, margin_size=margin)
+         """
+         mix: (2, big_sample)
+         segmented_mix: offset -> (2, small_sample)
+         sources: (1, 2, big_sample)
+         """
+         return sources
+
+     def demix_base(self, mixes, margin_size):
+         chunked_sources = []
+         progress_bar = tqdm(total=len(mixes))
+         progress_bar.set_description("Processing")
+         for mix in mixes:
+             cmix = mixes[mix]
+             sources = []
+             n_sample = cmix.shape[1]
+             model = self.model_
+             trim = model.n_fft // 2
+             gen_size = model.chunk_size - 2 * trim
+             pad = gen_size - n_sample % gen_size
+             mix_p = np.concatenate(
+                 (np.zeros((2, trim)), cmix, np.zeros((2, pad)), np.zeros((2, trim))), 1
+             )
+             mix_waves = []
+             i = 0
+             while i < n_sample + pad:
+                 waves = np.array(mix_p[:, i : i + model.chunk_size])
+                 mix_waves.append(waves)
+                 i += gen_size
+             mix_waves = torch.tensor(mix_waves, dtype=torch.float32).to(cpu)
+             with torch.no_grad():
+                 _ort = self.model
+                 spek = model.stft(mix_waves)
+                 if self.args.denoise:
+                     spec_pred = (
+                         -_ort.run(None, {"input": -spek.cpu().numpy()})[0] * 0.5
+                         + _ort.run(None, {"input": spek.cpu().numpy()})[0] * 0.5
+                     )
+                     tar_waves = model.istft(torch.tensor(spec_pred))
+                 else:
+                     tar_waves = model.istft(
+                         torch.tensor(_ort.run(None, {"input": spek.cpu().numpy()})[0])
+                     )
+                 tar_signal = (
+                     tar_waves[:, :, trim:-trim]
+                     .transpose(0, 1)
+                     .reshape(2, -1)
+                     .numpy()[:, :-pad]
+                 )
+
+                 start = 0 if mix == 0 else margin_size
+                 end = None if mix == list(mixes.keys())[::-1][0] else -margin_size
+                 if margin_size == 0:
+                     end = None
+                 sources.append(tar_signal[:, start:end])
+
+                 progress_bar.update(1)
+
+             chunked_sources.append(sources)
+         _sources = np.concatenate(chunked_sources, axis=-1)
+         # del self.model
+         progress_bar.close()
+         return _sources
+
+     def prediction(self, m, vocal_root, others_root, format):
+         os.makedirs(vocal_root, exist_ok=True)
+         os.makedirs(others_root, exist_ok=True)
+         basename = os.path.basename(m)
+         mix, rate = librosa.load(m, mono=False, sr=44100)
+         if mix.ndim == 1:
+             mix = np.asfortranarray([mix, mix])
+         mix = mix.T
+         sources = self.demix(mix.T)
+         opt = sources[0].T
+         if format in ["wav", "flac"]:
+             sf.write(
+                 "%s/%s_main_vocal.%s" % (vocal_root, basename, format), mix - opt, rate
+             )
+             sf.write("%s/%s_others.%s" % (others_root, basename, format), opt, rate)
+         else:
+             path_vocal = "%s/%s_main_vocal.wav" % (vocal_root, basename)
+             path_other = "%s/%s_others.wav" % (others_root, basename)
+             sf.write(path_vocal, mix - opt, rate)
+             sf.write(path_other, opt, rate)
+             if os.path.exists(path_vocal):
+                 os.system(
+                     "ffmpeg -i %s -vn %s -q:a 2 -y"
+                     % (path_vocal, path_vocal[:-4] + ".%s" % format)
+                 )
+             if os.path.exists(path_other):
+                 os.system(
+                     "ffmpeg -i %s -vn %s -q:a 2 -y"
+                     % (path_other, path_other[:-4] + ".%s" % format)
+                 )
+
+
+ class MDXNetDereverb:
+     def __init__(self, chunks):
+         self.onnx = "uvr5_weights/onnx_dereverb_By_FoxJoy"
+         self.shifts = 10  # 'Predict with randomised equivariant stabilisation'
+         self.mixing = "min_mag"  # ['default','min_mag','max_mag']
+         self.chunks = chunks
+         self.margin = 44100
+         self.dim_t = 9
+         self.dim_f = 3072
+         self.n_fft = 6144
+         self.denoise = True
+         self.pred = Predictor(self)
+
+     def _path_audio_(self, input, vocal_root, others_root, format):
+         self.pred.prediction(input, vocal_root, others_root, format)
+
+
+ if __name__ == "__main__":
+     dereverb = MDXNetDereverb(15)
+     from time import time as ttime
+
+     t0 = ttime()
+     dereverb._path_audio_(
+         "雪雪伴奏对消HP5.wav",
+         "vocal",
+         "others",
+         "wav",  # output format (the original call omitted this required argument)
+     )
+     t1 = ttime()
+     print(t1 - t0)
+
+
+ """
+ runtime\python.exe MDXNet.py
+
+ 6G card (VRAM before -> after):
+ 15/9: 0.8G -> 6.8G
+ 14:   0.8G -> 6.5G
+ 25:   out of memory
+
+ half15:  0.7G -> 6.6G, 22.69s
+ fp32-15: 0.7G -> 6.6G, 20.85s
+ """
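A minimal usage sketch of the dereverb wrapper above, mirroring the file's own `__main__` demo. The input filename and output directories are illustrative, and the FoxJoy ONNX weights must already be under uvr5_weights/onnx_dereverb_By_FoxJoy:

    from MDXNet import MDXNetDereverb

    dereverb = MDXNetDereverb(15)  # chunks=15 fits a ~6 GB GPU per the notes above
    # writes <vocal>/input.wav_main_vocal.wav and <others>/input.wav_others.wav
    dereverb._path_audio_("input.wav", "vocal", "others", "wav")
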
RVC.py ADDED
@@ -0,0 +1,1318 @@
+ import os
+ import shutil
+ import requests
+ import sys
+ import tempfile
+ import soundfile as sf
+
+ os.environ["CUDA_VISIBLE_DEVICES"] = ""
+ now_dir = os.getcwd()
+ sys.path.append(now_dir)
+ import traceback, pdb
+ import warnings
+
+ import numpy as np
+ import torch
+ from pydub import AudioSegment
+
+ os.environ["OPENBLAS_NUM_THREADS"] = "1"
+ os.environ["no_proxy"] = "localhost, 127.0.0.1, ::1"
+ import logging
+ import threading
+ from random import shuffle
+ from subprocess import Popen
+ from time import sleep
+
+ import faiss
+ import ffmpeg
+ import gradio as gr
+ from config import Config
+ from fairseq import checkpoint_utils
+ from i18n import I18nAuto
+ from infer_pack.models import (
+     SynthesizerTrnMs256NSFsid,
+     SynthesizerTrnMs256NSFsid_nono,
+     SynthesizerTrnMs768NSFsid,
+     SynthesizerTrnMs768NSFsid_nono,
+ )
+ from infer_pack.models_onnx import SynthesizerTrnMsNSFsidM
+ from infer_uvr5 import _audio_pre_, _audio_pre_new
+ from MDXNet import MDXNetDereverb
+ from my_utils import load_audio
+ from train.process_ckpt import change_info, extract_small_model, merge, show_info
+ from vc_infer_pipeline import VC
+ from sklearn.cluster import MiniBatchKMeans
+
+ logging.getLogger("numba").setLevel(logging.WARNING)
+
+
+ tmp = os.path.join(now_dir, "TEMP")
+ shutil.rmtree(tmp, ignore_errors=True)
+ shutil.rmtree("%s/runtime/Lib/site-packages/infer_pack" % (now_dir), ignore_errors=True)
+ shutil.rmtree("%s/runtime/Lib/site-packages/uvr5_pack" % (now_dir), ignore_errors=True)
+ os.makedirs(tmp, exist_ok=True)
+ os.makedirs(os.path.join(now_dir, "logs"), exist_ok=True)
+ os.makedirs(os.path.join(now_dir, "weights"), exist_ok=True)
+ os.environ["TEMP"] = tmp
+ warnings.filterwarnings("ignore")
+ torch.manual_seed(114514)
+
+
+ config = Config()
+ i18n = I18nAuto()
+ i18n.print()
+ # check for NVIDIA GPUs usable for training and accelerated inference
+ ngpu = torch.cuda.device_count()
+ gpu_infos = []
+ mem = []
+ if_gpu_ok = False
+
+ if torch.cuda.is_available() or ngpu != 0:
+     for i in range(ngpu):
+         gpu_name = torch.cuda.get_device_name(i)
+         if any(
+             value in gpu_name.upper()
+             for value in [
+                 "10",
+                 "16",
+                 "20",
+                 "30",
+                 "40",
+                 "A2",
+                 "A3",
+                 "A4",
+                 "P4",
+                 "A50",
+                 "500",
+                 "A60",
+                 "70",
+                 "80",
+                 "90",
+                 "M4",
+                 "T4",
+                 "TITAN",
+             ]
+         ):
+             # A10#A100#V100#A40#P40#M40#K80#A4500
+             if_gpu_ok = True  # at least one usable NVIDIA GPU
+             gpu_infos.append("%s\t%s" % (i, gpu_name))
+             mem.append(
+                 int(
+                     torch.cuda.get_device_properties(i).total_memory
+                     / 1024
+                     / 1024
+                     / 1024
+                     + 0.4
+                 )
+             )
+ if if_gpu_ok and len(gpu_infos) > 0:
+     gpu_info = "\n".join(gpu_infos)
+     default_batch_size = 1
+ else:
+     gpu_info = i18n("很遗憾您这没有能用的显卡来支持您训练")  # "no usable GPU to support training"
+     default_batch_size = 1
+ gpus = "-".join([i[0] for i in gpu_infos])
+
+
+ class ToolButton(gr.Button, gr.components.FormComponent):
+     """Small button with single emoji as text, fits inside gradio forms"""
+
+     def __init__(self, **kwargs):
+         super().__init__(variant="tool", **kwargs)
+
+     def get_block_name(self):
+         return "button"
+
+
+ hubert_model = None
+
+
+ def load_hubert():
+     global hubert_model
+     models, _, _ = checkpoint_utils.load_model_ensemble_and_task(
+         ["hubert_base.pt"],
+         suffix="",
+     )
+     hubert_model = models[0]
+     hubert_model = hubert_model.to(config.device)
+     if config.is_half:
+         hubert_model = hubert_model.half()
+     else:
+         hubert_model = hubert_model.float()
+     hubert_model.eval()
+
+
+ weight_root = "weights"
+ weight_uvr5_root = "uvr5_weights"
+ index_root = "logs"
+ names = []
+ for name in os.listdir(weight_root):
+     if name.endswith(".pth"):
+         names.append(name)
+ index_paths = []
+ for root, dirs, files in os.walk(index_root, topdown=False):
+     for name in files:
+         if name.endswith(".index") and "trained" not in name:
+             index_paths.append("%s/%s" % (root, name))
+ uvr5_names = []
+ for name in os.listdir(weight_uvr5_root):
+     if name.endswith(".pth") or "onnx" in name:
+         uvr5_names.append(name.replace(".pth", ""))
+
+
+ def vc_single(
+     sid,
+     input_audio_path,
+     f0_up_key,
+     f0_file,
+     f0_method,
+     file_index,
+     file_index2,
+     # file_big_npy,
+     index_rate,
+     filter_radius,
+     resample_sr,
+     rms_mix_rate,
+     protect,
+     song,
+ ):  # spk_item, input_audio0, vc_transform0, f0_file, f0method0
+     global tgt_sr, net_g, vc, hubert_model, version
+     logging.info(f0_up_key)
+
+     if input_audio_path is None:
+         return "You need to upload an audio", None
+
+     f0_up_key = int(f0_up_key)
+     try:
+         if not input_audio_path.lower().endswith(".wav"):
+             # Download the audio file
+             response = requests.get(input_audio_path, stream=True)
+             response.raise_for_status()
+             logging.info("after response")
+
+             # Create a temporary file and save the downloaded audio content
+             with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
+                 path = temp_file.name
+                 logging.info("Temporary file path: %s", path)
+                 temp_file.write(response.content)
+
+             # Convert the audio file to WAV format using pydub
+             audio = AudioSegment.from_file(path)
+             output_file_path = os.path.splitext(path)[0] + ".wav"
+             audio.export(output_file_path, format="wav")
+
+             # Use the converted WAV file as the new temporary file
+             path = output_file_path
+
+         else:
+             # The input audio URL is already pointing to a WAV file, so use it as-is
+             response = requests.get(input_audio_path)
+             response.raise_for_status()
+
+             # Create a temporary file and save the downloaded audio content
+             with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
+                 path = temp_file.name
+                 logging.info("Temporary file path: %s", path)
+                 temp_file.write(response.content)
+
+         if song == True:
+             uvr(path)
+             path = f"{os.getcwd()}/opt/vocal.wav"
+
+         audio = load_audio(path, 16000)
+         temp_file.close()
+         audio_max = np.abs(audio).max() / 0.95
+         if audio_max > 1:
+             audio = audio.astype(float)
+             audio /= audio_max
+         times = [0, 0, 0]
+         if not hubert_model:
+             load_hubert()
+         if_f0 = cpt.get("f0", 1)
+         file_index = (
+             (
+                 file_index.strip(" ")
+                 .strip('"')
+                 .strip("\n")
+                 .strip('"')
+                 .strip(" ")
+                 .replace("trained", "added")
+             )
+             if file_index != ""
+             else file_index2
+         )  # auto-replace "trained" with "added" so a mistyped index path still works
+         logging.info("before vc pipeline")
+         logging.info(vc)
+         audio_opt = vc.pipeline(
+             hubert_model,
+             net_g,
+             sid,
+             audio,
+             input_audio_path,
+             times,
+             f0_up_key,
+             f0_method,
+             file_index,
+             # file_big_npy,
+             index_rate,
+             if_f0,
+             filter_radius,
+             tgt_sr,
+             resample_sr,
+             rms_mix_rate,
+             version,
+             protect,
+             f0_file=f0_file,
+         )
+         logging.info("after vc pipeline")
+
+         logging.info(f0_up_key)
+
+         # chained comparison: tgt_sr != resample_sr and resample_sr >= 16000
+         if tgt_sr != resample_sr >= 16000:
+             tgt_sr = resample_sr
+         index_info = (
+             "Using index:%s." % file_index
+             if os.path.exists(file_index)
+             else "Index not used."
+         )
+         return "Success.\n %s\nTime:\n npy:%ss, f0:%ss, infer:%ss" % (
+             index_info,
+             times[0],
+             times[1],
+             times[2],
+         ), (tgt_sr, audio_opt)
+     except:
+         info = traceback.format_exc()
+         logging.info(info)
+         return info, (None, None)
+
+
+ def vc_multi(
+     sid,
+     dir_path,
+     opt_root,
+     paths,
+     f0_up_key,
+     f0_method,
+     file_index,
+     file_index2,
+     # file_big_npy,
+     index_rate,
+     filter_radius,
+     resample_sr,
+     rms_mix_rate,
+     protect,
+     format1,
+ ):
+     try:
+         dir_path = (
+             dir_path.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
+         )  # strip stray spaces, quotes and newlines from a pasted path
+         opt_root = opt_root.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
+         os.makedirs(opt_root, exist_ok=True)
+         try:
+             if dir_path != "":
+                 paths = [os.path.join(dir_path, name) for name in os.listdir(dir_path)]
+             else:
+                 paths = [path.name for path in paths]
+         except:
+             traceback.print_exc()
+             paths = [path.name for path in paths]
+         infos = []
+         for path in paths:
+             info, opt = vc_single(
+                 sid,
+                 path,
+                 f0_up_key,
+                 None,
+                 f0_method,
+                 file_index,
+                 file_index2,
+                 # file_big_npy,
+                 index_rate,
+                 filter_radius,
+                 resample_sr,
+                 rms_mix_rate,
+                 protect,
+                 False,  # song; vc_single requires this argument (the original call omitted it)
+             )
+             if "Success" in info:
+                 try:
+                     tgt_sr, audio_opt = opt
+                     if format1 in ["wav", "flac"]:
+                         sf.write(
+                             "%s/%s.%s" % (opt_root, os.path.basename(path), format1),
+                             audio_opt,
+                             tgt_sr,
+                         )
+                     else:
+                         path = "%s/%s.wav" % (opt_root, os.path.basename(path))
+                         sf.write(
+                             path,
+                             audio_opt,
+                             tgt_sr,
+                         )
+                         if os.path.exists(path):
+                             os.system(
+                                 "ffmpeg -i %s -vn %s -q:a 2 -y"
+                                 % (path, path[:-4] + ".%s" % format1)
+                             )
+                 except:
+                     info += traceback.format_exc()
+             infos.append("%s->%s" % (os.path.basename(path), info))
+             yield "\n".join(infos)
+         yield "\n".join(infos)
+     except:
+         yield traceback.format_exc()
+
+
+ def uvr(inp_root, model_name=uvr5_names[0], save_root_vocal="opt", paths=None, save_root_ins="opt", agg=10, format0="wav"):
+     try:
+         func = _audio_pre_
+         pre_fun = func(
+             agg=int(agg),
+             model_path=os.path.join(weight_uvr5_root, model_name + ".pth"),
+             device=config.device,
+             is_half=config.is_half,
+         )
+         pre_fun._path_audio_(inp_root, save_root_ins, save_root_vocal, format0)
+     finally:
+         try:
+             del pre_fun.model
+             del pre_fun
+         except:
+             traceback.print_exc()
+         logging.info("clean_empty_cache")
+         if torch.cuda.is_available():
+             torch.cuda.empty_cache()
+
+
+ # only one voice model can be active per tab
+ def get_vc(sid, to_return_protect0, to_return_protect1):
+     global n_spk, tgt_sr, net_g, vc, cpt, version
+     if sid == "" or sid == []:
+         global hubert_model
+         if hubert_model is not None:
+             # with polling, check whether sid switched from a loaded model to none
+             logging.info("clean_empty_cache")
+             del net_g, n_spk, vc, hubert_model, tgt_sr  # ,cpt
+             hubert_model = net_g = n_spk = vc = tgt_sr = None
+             if torch.cuda.is_available():
+                 torch.cuda.empty_cache()
+             ### without the re-instantiation below, the memory is not fully released
+             if_f0 = cpt.get("f0", 1)
+             version = cpt.get("version", "v1")
+             if version == "v1":
+                 if if_f0 == 1:
+                     net_g = SynthesizerTrnMs256NSFsid(
+                         *cpt["config"], is_half=config.is_half
+                     )
+                 else:
+                     net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
+             elif version == "v2":
+                 if if_f0 == 1:
+                     net_g = SynthesizerTrnMs768NSFsid(
+                         *cpt["config"], is_half=config.is_half
+                     )
+                 else:
+                     net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"])
+             del net_g, cpt
+             if torch.cuda.is_available():
+                 torch.cuda.empty_cache()
+             cpt = None
+         return {"visible": False, "__type__": "update"}
+     person = "%s/%s" % (weight_root, sid)
+     logging.info("loading %s" % person)
+     cpt = torch.load(person, map_location="cpu")
+     tgt_sr = cpt["config"][-1]
+     cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]  # n_spk
+     if_f0 = cpt.get("f0", 1)
+     if if_f0 == 0:
+         to_return_protect0 = to_return_protect1 = {
+             "visible": False,
+             "value": 0.5,
+             "__type__": "update",
+         }
+     else:
+         to_return_protect0 = {
+             "visible": True,
+             "value": to_return_protect0,
+             "__type__": "update",
+         }
+         to_return_protect1 = {
+             "visible": True,
+             "value": to_return_protect1,
+             "__type__": "update",
+         }
+     version = cpt.get("version", "v1")
+     if version == "v1":
+         if if_f0 == 1:
+             net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=config.is_half)
+         else:
+             net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
+     elif version == "v2":
+         if if_f0 == 1:
+             net_g = SynthesizerTrnMs768NSFsid(*cpt["config"], is_half=config.is_half)
+         else:
+             net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"])
+     del net_g.enc_q
+     logging.info(net_g.load_state_dict(cpt["weight"], strict=False))
+     net_g.eval().to(config.device)
+     if config.is_half:
+         net_g = net_g.half()
+     else:
+         net_g = net_g.float()
+     vc = VC(tgt_sr, config)
+     n_spk = cpt["config"][-3]
+     return (
+         {"visible": True, "maximum": n_spk, "__type__": "update"},
+         to_return_protect0,
+         to_return_protect1,
+     )
+
+
+ def change_choices():
+     names = []
+     for name in os.listdir(weight_root):
+         if name.endswith(".pth"):
+             names.append(name)
+     index_paths = []
+     for root, dirs, files in os.walk(index_root, topdown=False):
+         for name in files:
+             if name.endswith(".index") and "trained" not in name:
+                 index_paths.append("%s/%s" % (root, name))
+     return {"choices": sorted(names), "__type__": "update"}, {
+         "choices": sorted(index_paths),
+         "__type__": "update",
+     }
+
+
+ def clean():
+     return {"value": "", "__type__": "update"}
+
+
+ sr_dict = {
+     "32k": 32000,
+     "40k": 40000,
+     "48k": 48000,
+ }
+
+
+ def if_done(done, p):
+     while 1:
+         if p.poll() is None:
+             sleep(0.5)
+         else:
+             break
+     done[0] = True
+
+
+ def if_done_multi(done, ps):
+     while 1:
+         # poll() == None means a process is still running;
+         # keep waiting while any process is unfinished
+         flag = 1
+         for p in ps:
+             if p.poll() is None:
+                 flag = 0
+                 sleep(0.5)
+                 break
+         if flag == 1:
+             break
+     done[0] = True
+
+
+ def preprocess_dataset(trainset_dir, exp_dir, sr, n_p):
+     sr = sr_dict[sr]
+     os.makedirs("%s/logs/%s" % (now_dir, exp_dir), exist_ok=True)
+     f = open("%s/logs/%s/preprocess.log" % (now_dir, exp_dir), "w")
+     f.close()
+     cmd = (
+         config.python_cmd
+         + " trainset_preprocess_pipeline_print.py %s %s %s %s/logs/%s "
+         % (trainset_dir, sr, n_p, now_dir, exp_dir)
+         + str(config.noparallel)
+     )
+     logging.info(cmd)
+     p = Popen(cmd, shell=True)  # , stdin=PIPE, stdout=PIPE,stderr=PIPE,cwd=now_dir
+     # gradio only reads Popen output once the process has finished, so poll the
+     # log file on a timer instead of reading the pipe directly
+     done = [False]
+     threading.Thread(
+         target=if_done,
+         args=(
+             done,
+             p,
+         ),
+     ).start()
+     while 1:
+         with open("%s/logs/%s/preprocess.log" % (now_dir, exp_dir), "r") as f:
+             yield (f.read())
+         sleep(1)
+         if done[0]:
+             break
+     with open("%s/logs/%s/preprocess.log" % (now_dir, exp_dir), "r") as f:
+         log = f.read()
+     logging.info(log)
+     yield log
+
+
+ # but2.click(extract_f0,[gpus6,np7,f0method8,if_f0_3,trainset_dir4],[info2])
+ def extract_f0_feature(gpus, n_p, f0method, if_f0, exp_dir, version19):
+     gpus = gpus.split("-")
+     os.makedirs("%s/logs/%s" % (now_dir, exp_dir), exist_ok=True)
+     f = open("%s/logs/%s/extract_f0_feature.log" % (now_dir, exp_dir), "w")
+     f.close()
+     if if_f0:
+         cmd = config.python_cmd + " extract_f0_print.py %s/logs/%s %s %s" % (
+             now_dir,
+             exp_dir,
+             n_p,
+             f0method,
+         )
+         logging.info(cmd)
+         p = Popen(cmd, shell=True, cwd=now_dir)  # , stdin=PIPE, stdout=PIPE,stderr=PIPE
+         # poll the log file on a timer (see preprocess_dataset for why)
+         done = [False]
+         threading.Thread(
+             target=if_done,
+             args=(
+                 done,
+                 p,
+             ),
+         ).start()
+         while 1:
+             with open(
+                 "%s/logs/%s/extract_f0_feature.log" % (now_dir, exp_dir), "r"
+             ) as f:
+                 yield (f.read())
+             sleep(1)
+             if done[0]:
+                 break
+         with open("%s/logs/%s/extract_f0_feature.log" % (now_dir, exp_dir), "r") as f:
+             log = f.read()
+         logging.info(log)
+         yield log
+     #### spawn one process per part/GPU
+     """
+     n_part=int(sys.argv[1])
+     i_part=int(sys.argv[2])
+     i_gpu=sys.argv[3]
+     exp_dir=sys.argv[4]
+     os.environ["CUDA_VISIBLE_DEVICES"]=str(i_gpu)
+     """
+     leng = len(gpus)
+     ps = []
+     for idx, n_g in enumerate(gpus):
+         cmd = (
+             config.python_cmd
+             + " extract_feature_print.py %s %s %s %s %s/logs/%s %s"
+             % (
+                 config.device,
+                 leng,
+                 idx,
+                 n_g,
+                 now_dir,
+                 exp_dir,
+                 version19,
+             )
+         )
+         logging.info(cmd)
+         p = Popen(
+             cmd, shell=True, cwd=now_dir
+         )  # , shell=True, stdin=PIPE, stdout=PIPE, stderr=PIPE, cwd=now_dir
+         ps.append(p)
+     # poll the log file on a timer (see preprocess_dataset for why)
+     done = [False]
+     threading.Thread(
+         target=if_done_multi,
+         args=(
+             done,
+             ps,
+         ),
+     ).start()
+     while 1:
+         with open("%s/logs/%s/extract_f0_feature.log" % (now_dir, exp_dir), "r") as f:
+             yield (f.read())
+         sleep(1)
+         if done[0]:
+             break
+     with open("%s/logs/%s/extract_f0_feature.log" % (now_dir, exp_dir), "r") as f:
+         log = f.read()
+     logging.info(log)
+     yield log
+
+
+ def change_sr2(sr2, if_f0_3, version19):
+     path_str = "" if version19 == "v1" else "_v2"
+     f0_str = "f0" if if_f0_3 else ""
+     if_pretrained_generator_exist = os.access(
+         "pretrained%s/%sG%s.pth" % (path_str, f0_str, sr2), os.F_OK
+     )
+     if_pretrained_discriminator_exist = os.access(
+         "pretrained%s/%sD%s.pth" % (path_str, f0_str, sr2), os.F_OK
+     )
+     if not if_pretrained_generator_exist:
+         logging.info(
+             "pretrained%s/%sG%s.pth" % (path_str, f0_str, sr2)
+             + " not exist, will not use pretrained model"
+         )
+     if not if_pretrained_discriminator_exist:
+         logging.info(
+             "pretrained%s/%sD%s.pth" % (path_str, f0_str, sr2)
+             + " not exist, will not use pretrained model"
+         )
+     return (
+         "pretrained%s/%sG%s.pth" % (path_str, f0_str, sr2)
+         if if_pretrained_generator_exist
+         else "",
+         "pretrained%s/%sD%s.pth" % (path_str, f0_str, sr2)
+         if if_pretrained_discriminator_exist
+         else "",
+     )
+
+
+ def change_version19(sr2, if_f0_3, version19):
+     path_str = "" if version19 == "v1" else "_v2"
+     if sr2 == "32k" and version19 == "v1":
+         sr2 = "40k"
+     to_return_sr2 = (
+         {"choices": ["40k", "48k"], "__type__": "update", "value": sr2}
+         if version19 == "v1"
+         else {"choices": ["40k", "48k", "32k"], "__type__": "update", "value": sr2}
+     )
+     f0_str = "f0" if if_f0_3 else ""
+     if_pretrained_generator_exist = os.access(
+         "pretrained%s/%sG%s.pth" % (path_str, f0_str, sr2), os.F_OK
+     )
+     if_pretrained_discriminator_exist = os.access(
+         "pretrained%s/%sD%s.pth" % (path_str, f0_str, sr2), os.F_OK
+     )
+     if not if_pretrained_generator_exist:
+         logging.info(
+             "pretrained%s/%sG%s.pth" % (path_str, f0_str, sr2)
+             + " not exist, will not use pretrained model"
+         )
+     if not if_pretrained_discriminator_exist:
+         logging.info(
+             "pretrained%s/%sD%s.pth" % (path_str, f0_str, sr2)
+             + " not exist, will not use pretrained model"
+         )
+     return (
+         "pretrained%s/%sG%s.pth" % (path_str, f0_str, sr2)
+         if if_pretrained_generator_exist
+         else "",
+         "pretrained%s/%sD%s.pth" % (path_str, f0_str, sr2)
+         if if_pretrained_discriminator_exist
+         else "",
+         to_return_sr2,
+     )
+
+
+ def change_f0(if_f0_3, sr2, version19):  # f0method8, pretrained_G14, pretrained_D15
+     path_str = "" if version19 == "v1" else "_v2"
+     if_pretrained_generator_exist = os.access(
+         "pretrained%s/f0G%s.pth" % (path_str, sr2), os.F_OK
+     )
+     if_pretrained_discriminator_exist = os.access(
+         "pretrained%s/f0D%s.pth" % (path_str, sr2), os.F_OK
+     )
+     if not if_pretrained_generator_exist:
+         logging.info(
+             "pretrained%s/f0G%s.pth" % (path_str, sr2)
+             + " not exist, will not use pretrained model"
+         )
+     if not if_pretrained_discriminator_exist:
+         logging.info(
+             "pretrained%s/f0D%s.pth" % (path_str, sr2)
+             + " not exist, will not use pretrained model"
+         )
+     if if_f0_3:
+         return (
+             {"visible": True, "__type__": "update"},
+             "pretrained%s/f0G%s.pth" % (path_str, sr2)
+             if if_pretrained_generator_exist
+             else "",
+             "pretrained%s/f0D%s.pth" % (path_str, sr2)
+             if if_pretrained_discriminator_exist
+             else "",
+         )
+     return (
+         {"visible": False, "__type__": "update"},
+         ("pretrained%s/G%s.pth" % (path_str, sr2))
+         if if_pretrained_generator_exist
+         else "",
+         ("pretrained%s/D%s.pth" % (path_str, sr2))
+         if if_pretrained_discriminator_exist
+         else "",
+     )
+
+
+ # but3.click(click_train,[exp_dir1,sr2,if_f0_3,save_epoch10,total_epoch11,batch_size12,if_save_latest13,pretrained_G14,pretrained_D15,gpus16])
+ def click_train(
+     exp_dir1,
+     sr2,
+     if_f0_3,
+     spk_id5,
+     save_epoch10,
+     total_epoch11,
+     batch_size12,
+     if_save_latest13,
+     pretrained_G14,
+     pretrained_D15,
+     gpus16,
+     if_cache_gpu17,
+     if_save_every_weights18,
+     version19,
+ ):
+     # build the filelist
+     exp_dir = "%s/logs/%s" % (now_dir, exp_dir1)
+     os.makedirs(exp_dir, exist_ok=True)
+     gt_wavs_dir = "%s/0_gt_wavs" % (exp_dir)
+     feature_dir = (
+         "%s/3_feature256" % (exp_dir)
+         if version19 == "v1"
+         else "%s/3_feature768" % (exp_dir)
+     )
+     if if_f0_3:
+         f0_dir = "%s/2a_f0" % (exp_dir)
+         f0nsf_dir = "%s/2b-f0nsf" % (exp_dir)
+         names = (
+             set([name.split(".")[0] for name in os.listdir(gt_wavs_dir)])
+             & set([name.split(".")[0] for name in os.listdir(feature_dir)])
+             & set([name.split(".")[0] for name in os.listdir(f0_dir)])
+             & set([name.split(".")[0] for name in os.listdir(f0nsf_dir)])
+         )
+     else:
+         names = set([name.split(".")[0] for name in os.listdir(gt_wavs_dir)]) & set(
+             [name.split(".")[0] for name in os.listdir(feature_dir)]
+         )
+     opt = []
+     for name in names:
+         if if_f0_3:
+             opt.append(
+                 "%s/%s.wav|%s/%s.npy|%s/%s.wav.npy|%s/%s.wav.npy|%s"
+                 % (
+                     gt_wavs_dir.replace("\\", "\\\\"),
+                     name,
+                     feature_dir.replace("\\", "\\\\"),
+                     name,
+                     f0_dir.replace("\\", "\\\\"),
+                     name,
+                     f0nsf_dir.replace("\\", "\\\\"),
+                     name,
+                     spk_id5,
+                 )
+             )
+         else:
+             opt.append(
+                 "%s/%s.wav|%s/%s.npy|%s"
+                 % (
+                     gt_wavs_dir.replace("\\", "\\\\"),
+                     name,
+                     feature_dir.replace("\\", "\\\\"),
+                     name,
+                     spk_id5,
+                 )
+             )
+     fea_dim = 256 if version19 == "v1" else 768
+     if if_f0_3:
+         for _ in range(2):
+             opt.append(
+                 "%s/logs/mute/0_gt_wavs/mute%s.wav|%s/logs/mute/3_feature%s/mute.npy|%s/logs/mute/2a_f0/mute.wav.npy|%s/logs/mute/2b-f0nsf/mute.wav.npy|%s"
+                 % (now_dir, sr2, now_dir, fea_dim, now_dir, now_dir, spk_id5)
+             )
+     else:
+         for _ in range(2):
+             opt.append(
+                 "%s/logs/mute/0_gt_wavs/mute%s.wav|%s/logs/mute/3_feature%s/mute.npy|%s"
+                 % (now_dir, sr2, now_dir, fea_dim, spk_id5)
+             )
+     shuffle(opt)
+     with open("%s/filelist.txt" % exp_dir, "w") as f:
+         f.write("\n".join(opt))
+     logging.info("write filelist done")
+     # no config file needs to be generated
+     # cmd = python_cmd + " train_nsf_sim_cache_sid_load_pretrain.py -e mi-test -sr 40k -f0 1 -bs 4 -g 0 -te 10 -se 5 -pg pretrained/f0G40k.pth -pd pretrained/f0D40k.pth -l 1 -c 0"
+     logging.info("use gpus: %s", gpus16)
+     if pretrained_G14 == "":
+         logging.info("no pretrained Generator")
+     if pretrained_D15 == "":
+         logging.info("no pretrained Discriminator")
+     if gpus16:
+         cmd = (
+             config.python_cmd
+             + " train_nsf_sim_cache_sid_load_pretrain.py -e %s -sr %s -f0 %s -bs %s -g %s -te %s -se %s %s %s -l %s -c %s -sw %s -v %s"
+             % (
+                 exp_dir1,
+                 sr2,
+                 1 if if_f0_3 else 0,
+                 batch_size12,
+                 gpus16,
+                 total_epoch11,
+                 save_epoch10,
+                 "-pg %s" % pretrained_G14 if pretrained_G14 != "" else "",
+                 "-pd %s" % pretrained_D15 if pretrained_D15 != "" else "",
+                 1 if if_save_latest13 == i18n("是") else 0,
+                 1 if if_cache_gpu17 == i18n("是") else 0,
+                 1 if if_save_every_weights18 == i18n("是") else 0,
+                 version19,
+             )
+         )
+     else:
+         cmd = (
+             config.python_cmd
+             + " train_nsf_sim_cache_sid_load_pretrain.py -e %s -sr %s -f0 %s -bs %s -te %s -se %s %s %s -l %s -c %s -sw %s -v %s"
+             % (
+                 exp_dir1,
+                 sr2,
+                 1 if if_f0_3 else 0,
+                 batch_size12,
+                 total_epoch11,
+                 save_epoch10,
+                 "-pg %s" % pretrained_G14 if pretrained_G14 != "" else "",
+                 "-pd %s" % pretrained_D15 if pretrained_D15 != "" else "",
+                 1 if if_save_latest13 == i18n("是") else 0,
+                 1 if if_cache_gpu17 == i18n("是") else 0,
+                 1 if if_save_every_weights18 == i18n("是") else 0,
+                 version19,
+             )
+         )
+     logging.info(cmd)
+     p = Popen(cmd, shell=True, cwd=now_dir)
+     p.wait()
+     return "Training finished. See the console log or train.log in the experiment folder."
+
+
+ # but4.click(train_index, [exp_dir1], info3)
+ def train_index(exp_dir1, version19):
+     exp_dir = "%s/logs/%s" % (now_dir, exp_dir1)
+     os.makedirs(exp_dir, exist_ok=True)
+     feature_dir = (
+         "%s/3_feature256" % (exp_dir)
+         if version19 == "v1"
+         else "%s/3_feature768" % (exp_dir)
+     )
+     if not os.path.exists(feature_dir):
+         return "Please run feature extraction first!"
+     listdir_res = list(os.listdir(feature_dir))
+     if len(listdir_res) == 0:
+         return "Please run feature extraction first!"
+     infos = []
+     npys = []
+     for name in sorted(listdir_res):
+         phone = np.load("%s/%s" % (feature_dir, name))
+         npys.append(phone)
+     big_npy = np.concatenate(npys, 0)
+     big_npy_idx = np.arange(big_npy.shape[0])
+     np.random.shuffle(big_npy_idx)
+     big_npy = big_npy[big_npy_idx]
+     if big_npy.shape[0] > 2e5:
+         # if(1):
+         infos.append("Trying kmeans: reducing %s features to 10k centers." % big_npy.shape[0])
+         yield "\n".join(infos)
+         try:
+             big_npy = (
+                 MiniBatchKMeans(
+                     n_clusters=10000,
+                     verbose=True,
+                     batch_size=256 * config.n_cpu,
+                     compute_labels=False,
+                     init="random",
+                 )
+                 .fit(big_npy)
+                 .cluster_centers_
+             )
+         except:
+             info = traceback.format_exc()
+             logging.info(info)
+             infos.append(info)
+             yield "\n".join(infos)
+
+     np.save("%s/total_fea.npy" % exp_dir, big_npy)
+     n_ivf = min(int(16 * np.sqrt(big_npy.shape[0])), big_npy.shape[0] // 39)
+     infos.append("%s,%s" % (big_npy.shape, n_ivf))
+     yield "\n".join(infos)
+     index = faiss.index_factory(256 if version19 == "v1" else 768, "IVF%s,Flat" % n_ivf)
+     # index = faiss.index_factory(256 if version19 == "v1" else 768, "IVF%s,PQ128x4fs,RFlat" % n_ivf)
+     infos.append("training")
+     yield "\n".join(infos)
+     index_ivf = faiss.extract_index_ivf(index)  #
+     index_ivf.nprobe = 1
+     index.train(big_npy)
+     faiss.write_index(
+         index,
+         "%s/trained_IVF%s_Flat_nprobe_%s_%s_%s.index"
+         % (exp_dir, n_ivf, index_ivf.nprobe, exp_dir1, version19),
+     )
+     # faiss.write_index(index, '%s/trained_IVF%s_Flat_FastScan_%s.index' % (exp_dir, n_ivf, version19))
+     infos.append("adding")
+     yield "\n".join(infos)
+     batch_size_add = 8192
+     for i in range(0, big_npy.shape[0], batch_size_add):
+         index.add(big_npy[i : i + batch_size_add])
+     faiss.write_index(
+         index,
+         "%s/added_IVF%s_Flat_nprobe_%s_%s_%s.index"
+         % (exp_dir, n_ivf, index_ivf.nprobe, exp_dir1, version19),
+     )
+     infos.append(
+         "Successfully built index: added_IVF%s_Flat_nprobe_%s_%s_%s.index"
+         % (n_ivf, index_ivf.nprobe, exp_dir1, version19)
+     )
+     # faiss.write_index(index, '%s/added_IVF%s_Flat_FastScan_%s.index' % (exp_dir, n_ivf, version19))
+     # infos.append("Successfully built index: added_IVF%s_Flat_FastScan_%s.index" % (n_ivf, version19))
+     yield "\n".join(infos)
+
+
+ # but5.click(train1key, [exp_dir1, sr2, if_f0_3, trainset_dir4, spk_id5, gpus6, np7, f0method8, save_epoch10, total_epoch11, batch_size12, if_save_latest13, pretrained_G14, pretrained_D15, gpus16, if_cache_gpu17], info3)
+ def train1key(
+     exp_dir1,
+     sr2,
+     if_f0_3,
+     trainset_dir4,
+     spk_id5,
+     np7,
+     f0method8,
+     save_epoch10,
+     total_epoch11,
+     batch_size12,
+     if_save_latest13,
+     pretrained_G14,
+     pretrained_D15,
+     gpus16,
+     if_cache_gpu17,
+     if_save_every_weights18,
+     version19,
+ ):
+     infos = []
+
+     def get_info_str(strr):
+         infos.append(strr)
+         return "\n".join(infos)
+
+     model_log_dir = "%s/logs/%s" % (now_dir, exp_dir1)
+     preprocess_log_path = "%s/preprocess.log" % model_log_dir
+     extract_f0_feature_log_path = "%s/extract_f0_feature.log" % model_log_dir
+     gt_wavs_dir = "%s/0_gt_wavs" % model_log_dir
+     feature_dir = (
+         "%s/3_feature256" % model_log_dir
+         if version19 == "v1"
+         else "%s/3_feature768" % model_log_dir
+     )
+
+     os.makedirs(model_log_dir, exist_ok=True)
+     ######### step 1: preprocess the dataset
+     open(preprocess_log_path, "w").close()
+     cmd = (
+         config.python_cmd
+         + " trainset_preprocess_pipeline_print.py %s %s %s %s "
+         % (trainset_dir4, sr_dict[sr2], np7, model_log_dir)
+         + str(config.noparallel)
+     )
+     yield get_info_str(i18n("step1:正在处理数据"))
+     yield get_info_str(cmd)
+     p = Popen(cmd, shell=True)
+     p.wait()
+     with open(preprocess_log_path, "r") as f:
+         logging.info(f.read())
+     ######### step 2a: extract pitch
+     open(extract_f0_feature_log_path, "w").close()
+     if if_f0_3:
+         yield get_info_str("step2a: extracting pitch")
+         cmd = config.python_cmd + " extract_f0_print.py %s %s %s" % (
+             model_log_dir,
+             np7,
+             f0method8,
+         )
+         yield get_info_str(cmd)
+         p = Popen(cmd, shell=True, cwd=now_dir)
+         p.wait()
+         with open(extract_f0_feature_log_path, "r") as f:
+             logging.info(f.read())
+     else:
+         yield get_info_str(i18n("step2a:无需提取音高"))
+     ####### step 2b: extract features
+     yield get_info_str(i18n("step2b:正在提取特征"))
+     gpus = gpus16.split("-")
+     leng = len(gpus)
+     ps = []
+     for idx, n_g in enumerate(gpus):
+         cmd = config.python_cmd + " extract_feature_print.py %s %s %s %s %s %s" % (
+             config.device,
+             leng,
+             idx,
+             n_g,
+             model_log_dir,
+             version19,
+         )
+         yield get_info_str(cmd)
+         p = Popen(
+             cmd, shell=True, cwd=now_dir
+         )  # , shell=True, stdin=PIPE, stdout=PIPE, stderr=PIPE, cwd=now_dir
+         ps.append(p)
+     for p in ps:
+         p.wait()
+     with open(extract_f0_feature_log_path, "r") as f:
+         logging.info(f.read())
+     ####### step 3a: train the model
+     yield get_info_str(i18n("step3a:正在训练模型"))
+     # build the filelist
+     if if_f0_3:
+         f0_dir = "%s/2a_f0" % model_log_dir
+         f0nsf_dir = "%s/2b-f0nsf" % model_log_dir
+         names = (
+             set([name.split(".")[0] for name in os.listdir(gt_wavs_dir)])
+             & set([name.split(".")[0] for name in os.listdir(feature_dir)])
+             & set([name.split(".")[0] for name in os.listdir(f0_dir)])
+             & set([name.split(".")[0] for name in os.listdir(f0nsf_dir)])
+         )
+     else:
+         names = set([name.split(".")[0] for name in os.listdir(gt_wavs_dir)]) & set(
+             [name.split(".")[0] for name in os.listdir(feature_dir)]
+         )
+     opt = []
+     for name in names:
+         if if_f0_3:
+             opt.append(
+                 "%s/%s.wav|%s/%s.npy|%s/%s.wav.npy|%s/%s.wav.npy|%s"
+                 % (
+                     gt_wavs_dir.replace("\\", "\\\\"),
+                     name,
+                     feature_dir.replace("\\", "\\\\"),
+                     name,
+                     f0_dir.replace("\\", "\\\\"),
+                     name,
+                     f0nsf_dir.replace("\\", "\\\\"),
+                     name,
+                     spk_id5,
+                 )
+             )
+         else:
+             opt.append(
+                 "%s/%s.wav|%s/%s.npy|%s"
+                 % (
+                     gt_wavs_dir.replace("\\", "\\\\"),
+                     name,
+                     feature_dir.replace("\\", "\\\\"),
+                     name,
+                     spk_id5,
+                 )
+             )
+     fea_dim = 256 if version19 == "v1" else 768
+     if if_f0_3:
+         for _ in range(2):
+             opt.append(
+                 "%s/logs/mute/0_gt_wavs/mute%s.wav|%s/logs/mute/3_feature%s/mute.npy|%s/logs/mute/2a_f0/mute.wav.npy|%s/logs/mute/2b-f0nsf/mute.wav.npy|%s"
+                 % (now_dir, sr2, now_dir, fea_dim, now_dir, now_dir, spk_id5)
+             )
+     else:
+         for _ in range(2):
+             opt.append(
+                 "%s/logs/mute/0_gt_wavs/mute%s.wav|%s/logs/mute/3_feature%s/mute.npy|%s"
+                 % (now_dir, sr2, now_dir, fea_dim, spk_id5)
+             )
+     shuffle(opt)
+     with open("%s/filelist.txt" % model_log_dir, "w") as f:
+         f.write("\n".join(opt))
+     yield get_info_str("write filelist done")
+     if gpus16:
+         cmd = (
+             config.python_cmd
+             + " train_nsf_sim_cache_sid_load_pretrain.py -e %s -sr %s -f0 %s -bs %s -g %s -te %s -se %s %s %s -l %s -c %s -sw %s -v %s"
+             % (
+                 exp_dir1,
+                 sr2,
+                 1 if if_f0_3 else 0,
+                 batch_size12,
+                 gpus16,
+                 total_epoch11,
+                 save_epoch10,
+                 "-pg %s" % pretrained_G14 if pretrained_G14 != "" else "",
+                 "-pd %s" % pretrained_D15 if pretrained_D15 != "" else "",
+                 1 if if_save_latest13 == i18n("是") else 0,
+                 1 if if_cache_gpu17 == i18n("是") else 0,
+                 1 if if_save_every_weights18 == i18n("是") else 0,
+                 version19,
+             )
+         )
+     else:
+         cmd = (
+             config.python_cmd
+             + " train_nsf_sim_cache_sid_load_pretrain.py -e %s -sr %s -f0 %s -bs %s -te %s -se %s %s %s -l %s -c %s -sw %s -v %s"
+             % (
+                 exp_dir1,
+                 sr2,
+                 1 if if_f0_3 else 0,
+                 batch_size12,
+                 total_epoch11,
+                 save_epoch10,
+                 "-pg %s" % pretrained_G14 if pretrained_G14 != "" else "",
+                 "-pd %s" % pretrained_D15 if pretrained_D15 != "" else "",
+                 1 if if_save_latest13 == i18n("是") else 0,
+                 1 if if_cache_gpu17 == i18n("是") else 0,
+                 1 if if_save_every_weights18 == i18n("是") else 0,
+                 version19,
+             )
+         )
+     yield get_info_str(cmd)
+     p = Popen(cmd, shell=True, cwd=now_dir)
+     p.wait()
+     yield get_info_str(i18n("训练结束, 您可查看控制台训练日志或实验文件夹下的train.log"))
+     ####### step 3b: train the index
+     npys = []
+     listdir_res = list(os.listdir(feature_dir))
+     for name in sorted(listdir_res):
+         phone = np.load("%s/%s" % (feature_dir, name))
+         npys.append(phone)
+     big_npy = np.concatenate(npys, 0)
+
+     big_npy_idx = np.arange(big_npy.shape[0])
+     np.random.shuffle(big_npy_idx)
+     big_npy = big_npy[big_npy_idx]
+
+     if big_npy.shape[0] > 2e5:
+         # if(1):
+         info = "Trying kmeans: reducing %s features to 10k centers." % big_npy.shape[0]
+         logging.info(info)
+         yield get_info_str(info)
+         try:
+             big_npy = (
+                 MiniBatchKMeans(
+                     n_clusters=10000,
+                     verbose=True,
+                     batch_size=256 * config.n_cpu,
+                     compute_labels=False,
+                     init="random",
+                 )
+                 .fit(big_npy)
+                 .cluster_centers_
+             )
+         except:
+             info = traceback.format_exc()
+             logging.info(info)
+             yield get_info_str(info)
+
+     np.save("%s/total_fea.npy" % model_log_dir, big_npy)
+
+     # n_ivf = big_npy.shape[0] // 39
+     n_ivf = min(int(16 * np.sqrt(big_npy.shape[0])), big_npy.shape[0] // 39)
+     yield get_info_str("%s,%s" % (big_npy.shape, n_ivf))
+     index = faiss.index_factory(256 if version19 == "v1" else 768, "IVF%s,Flat" % n_ivf)
+     yield get_info_str("training index")
+     index_ivf = faiss.extract_index_ivf(index)  #
+     index_ivf.nprobe = 1
+     index.train(big_npy)
+     faiss.write_index(
+         index,
+         "%s/trained_IVF%s_Flat_nprobe_%s_%s_%s.index"
+         % (model_log_dir, n_ivf, index_ivf.nprobe, exp_dir1, version19),
+     )
+     yield get_info_str("adding index")
+     batch_size_add = 8192
+     for i in range(0, big_npy.shape[0], batch_size_add):
+         index.add(big_npy[i : i + batch_size_add])
+     faiss.write_index(
+         index,
+         "%s/added_IVF%s_Flat_nprobe_%s_%s_%s.index"
+         % (model_log_dir, n_ivf, index_ivf.nprobe, exp_dir1, version19),
+     )
+     yield get_info_str(
+         "Successfully built index: added_IVF%s_Flat_nprobe_%s_%s_%s.index"
+         % (n_ivf, index_ivf.nprobe, exp_dir1, version19)
+     )
+     yield get_info_str(i18n("全流程结束!"))
+
+
+ # ckpt_path2.change(change_info_,[ckpt_path2],[sr__,if_f0__])
+ def change_info_(ckpt_path):
+     if not os.path.exists(ckpt_path.replace(os.path.basename(ckpt_path), "train.log")):
+         return {"__type__": "update"}, {"__type__": "update"}, {"__type__": "update"}
+     try:
+         with open(
+             ckpt_path.replace(os.path.basename(ckpt_path), "train.log"), "r"
+         ) as f:
+             info = eval(f.read().strip("\n").split("\n")[0].split("\t")[-1])
+             sr, f0 = info["sample_rate"], info["if_f0"]
+             version = "v2" if ("version" in info and info["version"] == "v2") else "v1"
+             return sr, str(f0), version
+     except:
+         traceback.print_exc()
+         return {"__type__": "update"}, {"__type__": "update"}, {"__type__": "update"}
+
+
+ def export_onnx(ModelPath, ExportedPath):
+     cpt = torch.load(ModelPath, map_location="cpu")
+     cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]
+     vec_channels = 256 if cpt.get("version", "v1") == "v1" else 768
+
+     test_phone = torch.rand(1, 200, vec_channels)  # hidden units
+     test_phone_lengths = torch.tensor([200]).long()  # hidden unit length (appears unused)
+     test_pitch = torch.randint(size=(1, 200), low=5, high=255)  # f0, in Hz
+     test_pitchf = torch.rand(1, 200)  # nsf f0
+     test_ds = torch.LongTensor([0])  # speaker id
+     test_rnd = torch.rand(1, 192, 200)  # noise (adds a random factor)
+
+     device = "cpu"  # device used for export (does not affect inference)
+
+     net_g = SynthesizerTrnMsNSFsidM(
+         *cpt["config"], is_half=False, version=cpt.get("version", "v1")
+     )  # fp32 export (fp16 support in C++ would require manual memory reordering, so it is skipped for now)
+     net_g.load_state_dict(cpt["weight"], strict=False)
+     input_names = ["phone", "phone_lengths", "pitch", "pitchf", "ds", "rnd"]
+     output_names = [
+         "audio",
+     ]
+     # net_g.construct_spkmixmap(n_speaker)  # multi-speaker mix-track export
+     torch.onnx.export(
+         net_g,
+         (
+             test_phone.to(device),
+             test_phone_lengths.to(device),
+             test_pitch.to(device),
+             test_pitchf.to(device),
+             test_ds.to(device),
+             test_rnd.to(device),
+         ),
+         ExportedPath,
+         dynamic_axes={
+             "phone": [1],
+             "pitch": [1],
+             "pitchf": [1],
+             "rnd": [2],
+         },
+         do_constant_folding=False,
+         opset_version=13,
+         verbose=False,
+         input_names=input_names,
+         output_names=output_names,
+     )
+     return "Finished"
+
RVC_class.py ADDED
@@ -0,0 +1,85 @@
+ import RVC
+ from scipy.io import wavfile
+ import numpy as np
+ import os
+ import uuid
+ import io
+ import requests
+
+
+ class VoiceConverter:
+     def __init__(self):
+         self.models = RVC.names
+
+     def single_run(self, input_audio, model_name, vc_transform, song, opt_input="opt", f0_method="crepe",
+                    filter_radius=3, file_index1="", file_index2="", index_rate1=0.75,
+                    resample_sr=0, rms_mix_rate=0.25, protect=0.33, f0_file=None):
+
+         RVC.get_vc(model_name, protect, protect)
+         spk_item = 0
+
+         vc_output1, vc_output2 = RVC.vc_single(
+             spk_item,
+             input_audio,
+             vc_transform,
+             f0_file,
+             f0_method,
+             file_index1,
+             file_index2,
+             index_rate1,
+             filter_radius,
+             resample_sr,
+             rms_mix_rate,
+             protect,
+             song,
+         )
+
+         random_string = str(uuid.uuid4())
+         filename = os.path.basename(input_audio)
+         name, extension = os.path.splitext(filename)
+         new_file_name = f"{name}_{random_string}{extension}"
+
+         sample_rate, audio_data = vc_output2
+         url = self.upload_audio(audio_data, sample_rate, new_file_name)
+
+         return url
+
+     def upload_audio(self, audio_data, sample_rate, filename):
+         try:
+             url = "https://tmpfiles.org/api/v1/upload"
+             # Convert audio data to WAV format in memory
+             wav_bytes = io.BytesIO()
+             scaled_audio_data = np.int16(audio_data)
+             wavfile.write(wav_bytes, sample_rate, scaled_audio_data)
+             wav_bytes.seek(0)
+
+             files = {'file': (filename, wav_bytes)}
+
+             response = requests.post(url, files=files)
+             response.raise_for_status()
+
+             return response.json()
+         except Exception as e:
+             raise RuntimeError(f"Failed to upload audio: {e}")
+
+     def uvr(self, dir_wav_input, wav_inputs=None, model_choose=RVC.uvr5_names[0],
+             opt_vocal_root="opt", opt_ins_root="opt", format0="wav"):
+
+         agg = 10
+         # RVC.uvr takes the input path first, then the model name
+         # (the original call passed them the other way around)
+         vc_output4 = RVC.uvr(
+             dir_wav_input,
+             model_choose,
+             opt_vocal_root,
+             wav_inputs,
+             opt_ins_root,
+             agg,
+             format0,
+         )
+
+         # RVC.uvr returns None in this repo, so guard before iterating
+         if vc_output4:
+             for value in vc_output4:
+                 print(value)
+
+
+ # converter.uvr('/home/teewhy/Downloads/around_the_world-atc.wav')
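A hypothetical end-to-end call of the class above; the model name and input URL are illustrative, and the model is expected to live under `weights/` per RVC's `weight_root`:

    converter = VoiceConverter()
    url = converter.single_run(
        "https://example.com/song.mp3",  # hypothetical source audio URL
        "mymodel.pth",                   # hypothetical model under weights/
        0,      # vc_transform: pitch shift in semitones
        True,   # song: separate vocals with UVR before conversion
    )
    print(url)  # JSON response returned by upload_audio (tmpfiles.org)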
RVC_functions.py ADDED
@@ -0,0 +1,76 @@
+ import RVC
+ from scipy.io import wavfile
+ import numpy as np
+ import os
+ import uuid
+ import io
+ import requests
+
+
+ def get_models():
+     return RVC.names
+
+
+ def single_run(input_audio0, model_name, vc_transform, opt_input="opt", f0method0="pm", filter_radius0=3, file_index1="", file_index2="",
+                index_rate1=0.75, resample_sr0=0, rms_mix_rate0=0.25, protect0=0.33, f0_file=None):
+
+     RVC.get_vc(model_name, protect0, protect0)
+     spk_item = 0
+     # file_index2 = sorted(index_paths)
+     # f0_file = gr.File(label=i18n("F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调"))
+
+     vc_output1, vc_output2 = RVC.vc_single(
+         spk_item,
+         input_audio0,
+         vc_transform,
+         f0_file,
+         f0method0,
+         file_index1,
+         file_index2,
+         index_rate1,
+         filter_radius0,
+         resample_sr0,
+         rms_mix_rate0,
+         protect0,
+         False,  # song; vc_single requires this argument (False assumed: no separation)
+     )
+     random_string = str(uuid.uuid4())
+     filename = os.path.basename(input_audio0)
+     name, extension = os.path.splitext(filename)
+     new_file_name = f"{name}_{random_string}{extension}"
+
+     sample_rate, audio_data = vc_output2
+     # scaled_audio_data = np.int16(audio_data)
+     # wavfile.write(new_file_name, sample_rate, scaled_audio_data)
+     # response = requests.post(f"https://filebin.net/dajeii61xk3c4oxi/{new_file_name}", files=files)
+     url = "https://filebin.net/dajeii61xk3c4oxi"
+     print(upload_audio(url, audio_data, sample_rate, new_file_name))
+
+     return f"{url}/{new_file_name}"
+
+
+ def upload_audio(url, audio_data, sample_rate, filename):
+     try:
+         # Convert audio data to WAV format in memory
+         wav_bytes = io.BytesIO()
+         scaled_audio_data = np.int16(audio_data)
+         wavfile.write(wav_bytes, sample_rate, scaled_audio_data)
+         wav_bytes.seek(0)
+
+         files = {'file': (filename, wav_bytes)}
+
+         response = requests.post(f"{url}/{filename}", files=files)
+
+         response.raise_for_status()
+
+         return response.json()
+     except Exception as e:
+         raise RuntimeError(f"Failed to upload audio: {e}")
+
+
+ def uvr(dir_wav_input, wav_inputs=None, model_choose=RVC.uvr5_names[0], opt_vocal_root="opt", opt_ins_root="opt", format0="wav"):
+
+     vc_output4 = RVC.uvr(
+         dir_wav_input,
+     )
+     # RVC.uvr returns None in this repo, so guard before iterating
+     if vc_output4:
+         for value in vc_output4:
+             print(value)
+
+
+ print(single_run("https://www.learningcontainer.com/wp-content/uploads/2020/02/Kalimba.mp3", "mymodelbilawal.pth", -2))
Retrieval_based_Voice_Conversion_WebUI.ipynb ADDED
@@ -0,0 +1,381 @@
+ {
+ "nbformat": 4,
+ "nbformat_minor": 0,
+ "metadata": {
+ "colab": {
+ "private_outputs": true,
+ "provenance": []
+ },
+ "kernelspec": {
+ "name": "python3",
+ "display_name": "Python 3"
+ },
+ "language_info": {
+ "name": "python"
+ },
+ "accelerator": "GPU",
+ "gpuClass": "standard"
+ },
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "source": [
+ "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/main/Retrieval_based_Voice_Conversion_WebUI.ipynb)"
+ ],
+ "metadata": {
+ "id": "ZFFCx5J80SGa"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "GmFP6bN9dvOq"
+ },
+ "outputs": [],
+ "source": [
+ "#@title Check the GPU\n",
+ "!nvidia-smi"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "#@title Install dependencies\n",
+ "!apt-get -y install build-essential python3-dev ffmpeg\n",
+ "!pip3 install --upgrade setuptools wheel\n",
+ "!pip3 install --upgrade pip\n",
+ "!pip3 install faiss-cpu==1.7.2 fairseq gradio==3.14.0 ffmpeg ffmpeg-python praat-parselmouth pyworld numpy==1.23.5 numba==0.56.4 librosa==0.9.2"
+ ],
+ "metadata": {
+ "id": "wjddIFr1oS3W"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "#@title Clone the repository\n",
+ "\n",
+ "!git clone --depth=1 -b stable https://github.com/fumiama/Retrieval-based-Voice-Conversion-WebUI\n",
+ "%cd /content/Retrieval-based-Voice-Conversion-WebUI\n",
+ "!mkdir -p pretrained uvr5_weights"
+ ],
+ "metadata": {
+ "id": "ge_97mfpgqTm"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "#@title Update the repository (usually not needed)\n",
+ "!git pull"
+ ],
+ "metadata": {
+ "id": "BLDEZADkvlw1"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "#@title Install aria2\n",
+ "!apt -y install -qq aria2"
+ ],
+ "metadata": {
+ "id": "pqE0PrnuRqI2"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "#@title Download the pretrained base models\n",
+ "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/D32k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/pretrained -o D32k.pth\n",
+ "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/D40k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/pretrained -o D40k.pth\n",
+ "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/D48k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/pretrained -o D48k.pth\n",
+ "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/G32k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/pretrained -o G32k.pth\n",
+ "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/G40k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/pretrained -o G40k.pth\n",
+ "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/G48k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/pretrained -o G48k.pth\n",
+ "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0D32k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/pretrained -o f0D32k.pth\n",
+ "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0D40k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/pretrained -o f0D40k.pth\n",
+ "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0D48k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/pretrained -o f0D48k.pth\n",
+ "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0G32k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/pretrained -o f0G32k.pth\n",
+ "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0G40k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/pretrained -o f0G40k.pth\n",
+ "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0G48k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/pretrained -o f0G48k.pth"
+ ],
+ "metadata": {
+ "id": "UG3XpUwEomUz"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "#@title Download the vocal separation models\n",
+ "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP2-人声vocals+非人声instrumentals.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/uvr5_weights -o HP2-人声vocals+非人声instrumentals.pth\n",
+ "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP5-主旋律人声vocals+其他instrumentals.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/uvr5_weights -o HP5-主旋律人声vocals+其他instrumentals.pth"
+ ],
+ "metadata": {
+ "id": "HugjmZqZRuiF"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "#@title Download hubert_base\n",
+ "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/hubert_base.pt -d /content/Retrieval-based-Voice-Conversion-WebUI -o hubert_base.pt"
+ ],
+ "metadata": {
+ "id": "2RCaT9FTR0ej"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "#@title Mount Google Drive\n",
+ "\n",
+ "from google.colab import drive\n",
+ "drive.mount('/content/drive')"
+ ],
+ "metadata": {
+ "id": "jwu07JgqoFON"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "#@title Load the packed dataset from Google Drive into /content/dataset\n",
+ "\n",
+ "#@markdown Dataset location\n",
+ "DATASET = \"/content/drive/MyDrive/dataset/lulu20230327_32k.zip\" #@param {type:\"string\"}\n",
+ "\n",
+ "!mkdir -p /content/dataset\n",
+ "!unzip -d /content/dataset -B {DATASET}"
+ ],
+ "metadata": {
+ "id": "Mwk7Q0Loqzjx"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "#@title Rename duplicate filenames in the dataset\n",
+ "!ls -a /content/dataset/\n",
+ "!rename 's/(\\w+)\\.(\\w+)~(\\d*)/$1_$3.$2/' /content/dataset/*.*~*"
+ ],
+ "metadata": {
+ "id": "PDlFxWHWEynD"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "#@title Launch the web UI\n",
+ "%cd /content/Retrieval-based-Voice-Conversion-WebUI\n",
+ "# %load_ext tensorboard\n",
+ "# %tensorboard --logdir /content/Retrieval-based-Voice-Conversion-WebUI/logs\n",
+ "!python3 infer-web.py --colab --pycmd python3"
+ ],
+ "metadata": {
+ "id": "7vh6vphDwO0b"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "#@title Manually back up trained model files to Google Drive\n",
+ "#@markdown Check the model filenames under the logs folder yourself and edit the filenames at the end of the commands below\n",
+ "\n",
+ "#@markdown Model name\n",
+ "MODELNAME = \"lulu\" #@param {type:\"string\"}\n",
+ "#@markdown Model epoch\n",
+ "MODELEPOCH = 9600 #@param {type:\"integer\"}\n",
+ "\n",
+ "!cp /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/G_{MODELEPOCH}.pth /content/drive/MyDrive/{MODELNAME}_D_{MODELEPOCH}.pth\n",
+ "!cp /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/D_{MODELEPOCH}.pth /content/drive/MyDrive/{MODELNAME}_G_{MODELEPOCH}.pth\n",
+ "!cp /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/added_*.index /content/drive/MyDrive/\n",
+ "!cp /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/total_*.npy /content/drive/MyDrive/\n",
+ "\n",
+ "!cp /content/Retrieval-based-Voice-Conversion-WebUI/weights/{MODELNAME}.pth /content/drive/MyDrive/{MODELNAME}{MODELEPOCH}.pth"
+ ],
+ "metadata": {
+ "id": "FgJuNeAwx5Y_"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "#@title Restore pth files from Google Drive\n",
+ "#@markdown Check the model filenames under the logs folder yourself and edit the filenames at the end of the commands below\n",
+ "\n",
+ "#@markdown Model name\n",
+ "MODELNAME = \"lulu\" #@param {type:\"string\"}\n",
+ "#@markdown Model epoch\n",
+ "MODELEPOCH = 7500 #@param {type:\"integer\"}\n",
+ "\n",
+ "!mkdir -p /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}\n",
+ "\n",
+ "!cp /content/drive/MyDrive/{MODELNAME}_D_{MODELEPOCH}.pth /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/G_{MODELEPOCH}.pth\n",
+ "!cp /content/drive/MyDrive/{MODELNAME}_G_{MODELEPOCH}.pth /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/D_{MODELEPOCH}.pth\n",
+ "!cp /content/drive/MyDrive/*.index /content/\n",
+ "!cp /content/drive/MyDrive/*.npy /content/\n",
+ "!cp /content/drive/MyDrive/{MODELNAME}{MODELEPOCH}.pth /content/Retrieval-based-Voice-Conversion-WebUI/weights/{MODELNAME}.pth"
+ ],
+ "metadata": {
+ "id": "OVQoLQJXS7WX"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "#@title Manual preprocessing (not recommended)\n",
+ "#@markdown Model name\n",
+ "MODELNAME = \"lulu\" #@param {type:\"string\"}\n",
+ "#@markdown Sample rate\n",
+ "BITRATE = 48000 #@param {type:\"integer\"}\n",
+ "#@markdown Number of processes\n",
+ "THREADCOUNT = 8 #@param {type:\"integer\"}\n",
+ "\n",
+ "!python3 trainset_preprocess_pipeline_print.py /content/dataset {BITRATE} {THREADCOUNT} logs/{MODELNAME} True\n"
+ ],
+ "metadata": {
+ "id": "ZKAyuKb9J6dz"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "#@title Manual feature extraction (not recommended)\n",
+ "#@markdown Model name\n",
+ "MODELNAME = \"lulu\" #@param {type:\"string\"}\n",
+ "#@markdown Number of processes\n",
+ "THREADCOUNT = 8 #@param {type:\"integer\"}\n",
+ "#@markdown Pitch extraction algorithm\n",
+ "ALGO = \"harvest\" #@param {type:\"string\"}\n",
+ "\n",
+ "!python3 extract_f0_print.py logs/{MODELNAME} {THREADCOUNT} {ALGO}\n",
+ "\n",
+ "!python3 extract_feature_print.py cpu 1 0 0 logs/{MODELNAME}\n"
+ ],
+ "metadata": {
+ "id": "CrxJqzAUKmPJ"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "#@title Manual training (not recommended)\n",
+ "#@markdown Model name\n",
+ "MODELNAME = \"lulu\" #@param {type:\"string\"}\n",
+ "#@markdown GPU(s) to use\n",
+ "USEGPU = \"0\" #@param {type:\"string\"}\n",
+ "#@markdown Batch size\n",
+ "BATCHSIZE = 32 #@param {type:\"integer\"}\n",
+ "#@markdown Stop at epoch\n",
+ "MODELEPOCH = 3200 #@param {type:\"integer\"}\n",
+ "#@markdown Save interval in epochs\n",
+ "EPOCHSAVE = 100 #@param {type:\"integer\"}\n",
+ "#@markdown Sample rate\n",
+ "MODELSAMPLE = \"48k\" #@param {type:\"string\"}\n",
+ "#@markdown Cache the training set\n",
+ "CACHEDATA = 1 #@param {type:\"integer\"}\n",
+ "#@markdown Keep only the latest ckpt file\n",
+ "ONLYLATEST = 0 #@param {type:\"integer\"}\n",
+ "\n",
+ "!python3 train_nsf_sim_cache_sid_load_pretrain.py -e {MODELNAME} -sr {MODELSAMPLE} -f0 1 -bs {BATCHSIZE} -g {USEGPU} -te {MODELEPOCH} -se {EPOCHSAVE} -pg pretrained/f0G{MODELSAMPLE}.pth -pd pretrained/f0D{MODELSAMPLE}.pth -l {ONLYLATEST} -c {CACHEDATA}\n"
+ ],
+ "metadata": {
+ "id": "IMLPLKOaKj58"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "#@title Delete all other pth files, keeping only the selected one (careful: read the code first)\n",
+ "#@markdown Model name\n",
+ "MODELNAME = \"lulu\" #@param {type:\"string\"}\n",
+ "#@markdown Selected model epoch\n",
+ "MODELEPOCH = 9600 #@param {type:\"integer\"}\n",
+ "\n",
+ "!echo \"Backing up the selected model...\"\n",
+ "!cp /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/G_{MODELEPOCH}.pth /content/{MODELNAME}_D_{MODELEPOCH}.pth\n",
+ "!cp /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/D_{MODELEPOCH}.pth /content/{MODELNAME}_G_{MODELEPOCH}.pth\n",
+ "\n",
+ "!echo \"Deleting...\"\n",
+ "!ls /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}\n",
+ "!rm /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/*.pth\n",
+ "\n",
+ "!echo \"Restoring the selected model...\"\n",
+ "!mv /content/{MODELNAME}_D_{MODELEPOCH}.pth /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/G_{MODELEPOCH}.pth\n",
+ "!mv /content/{MODELNAME}_G_{MODELEPOCH}.pth /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/D_{MODELEPOCH}.pth\n",
+ "\n",
+ "!echo \"Deletion finished\"\n",
+ "!ls /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}"
+ ],
+ "metadata": {
+ "id": "haYA81hySuDl"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "#@title Remove all files under the project, keeping only the selected model (careful: read the code first)\n",
+ "#@markdown Model name\n",
+ "MODELNAME = \"lulu\" #@param {type:\"string\"}\n",
+ "#@markdown Selected model epoch\n",
+ "MODELEPOCH = 9600 #@param {type:\"integer\"}\n",
+ "\n",
+ "!echo \"Backing up the selected model...\"\n",
+ "!cp /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/G_{MODELEPOCH}.pth /content/{MODELNAME}_D_{MODELEPOCH}.pth\n",
+ "!cp /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/D_{MODELEPOCH}.pth /content/{MODELNAME}_G_{MODELEPOCH}.pth\n",
+ "\n",
+ "!echo \"Deleting...\"\n",
+ "!ls /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}\n",
+ "!rm -rf /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/*\n",
+ "\n",
+ "!echo \"Restoring the selected model...\"\n",
+ "!mv /content/{MODELNAME}_D_{MODELEPOCH}.pth /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/G_{MODELEPOCH}.pth\n",
+ "!mv /content/{MODELNAME}_G_{MODELEPOCH}.pth /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/D_{MODELEPOCH}.pth\n",
+ "\n",
+ "!echo \"Deletion finished\"\n",
+ "!ls /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}"
+ ],
+ "metadata": {
+ "id": "QhSiPTVPoIRh"
+ },
+ "execution_count": null,
+ "outputs": []
+ }
+ ]
+ }
Retrieval_based_Voice_Conversion_WebUI_v2.ipynb ADDED
@@ -0,0 +1,401 @@
+ {
+ "cells": [
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "ZFFCx5J80SGa"
+ },
+ "source": [
+ "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/main/Retrieval_based_Voice_Conversion_WebUI_v2.ipynb)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "GmFP6bN9dvOq"
+ },
+ "outputs": [],
+ "source": [
+ "#@title Check the GPU\n",
+ "!nvidia-smi"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "wjddIFr1oS3W"
+ },
+ "outputs": [],
+ "source": [
+ "#@title Install dependencies\n",
+ "!apt-get -y install build-essential python3-dev ffmpeg\n",
+ "!pip3 install --upgrade setuptools wheel\n",
+ "!pip3 install --upgrade pip\n",
+ "!pip3 install faiss-cpu==1.7.2 fairseq gradio==3.14.0 ffmpeg ffmpeg-python praat-parselmouth pyworld numpy==1.23.5 numba==0.56.4 librosa==0.9.2"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "ge_97mfpgqTm"
+ },
+ "outputs": [],
+ "source": [
+ "#@title Clone the repository\n",
+ "\n",
+ "!mkdir Retrieval-based-Voice-Conversion-WebUI\n",
+ "%cd /content/Retrieval-based-Voice-Conversion-WebUI\n",
+ "!git init\n",
+ "!git remote add origin https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI.git\n",
+ "!git fetch origin cfd984812804ddc9247d65b14c82cd32e56c1133 --depth=1\n",
+ "!git reset --hard FETCH_HEAD"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "BLDEZADkvlw1"
+ },
+ "outputs": [],
+ "source": [
+ "#@title Update the repository (usually not needed)\n",
+ "!git pull"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "pqE0PrnuRqI2"
+ },
+ "outputs": [],
+ "source": [
+ "#@title Install aria2\n",
+ "!apt -y install -qq aria2"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "UG3XpUwEomUz"
+ },
+ "outputs": [],
+ "source": [
+ "#@title Download the pretrained base models\n",
+ "\n",
+ "# v1\n",
+ "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/D32k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/pretrained -o D32k.pth\n",
+ "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/D40k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/pretrained -o D40k.pth\n",
+ "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/D48k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/pretrained -o D48k.pth\n",
+ "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/G32k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/pretrained -o G32k.pth\n",
+ "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/G40k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/pretrained -o G40k.pth\n",
+ "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/G48k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/pretrained -o G48k.pth\n",
+ "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0D32k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/pretrained -o f0D32k.pth\n",
+ "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0D40k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/pretrained -o f0D40k.pth\n",
+ "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0D48k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/pretrained -o f0D48k.pth\n",
+ "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0G32k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/pretrained -o f0G32k.pth\n",
+ "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0G40k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/pretrained -o f0G40k.pth\n",
+ "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0G48k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/pretrained -o f0G48k.pth\n",
+ "\n",
+ "# v2\n",
+ "# !aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/D32k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/pretrained_v2 -o D32k.pth\n",
+ "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/D40k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/pretrained_v2 -o D40k.pth\n",
+ "# !aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/D48k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/pretrained_v2 -o D48k.pth\n",
+ "# !aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/G32k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/pretrained_v2 -o G32k.pth\n",
+ "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/G40k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/pretrained_v2 -o G40k.pth\n",
+ "# !aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/G48k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/pretrained_v2 -o G48k.pth\n",
+ "# !aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/f0D32k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/pretrained_v2 -o f0D32k.pth\n",
+ "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/f0D40k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/pretrained_v2 -o f0D40k.pth\n",
+ "# !aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/f0D48k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/pretrained_v2 -o f0D48k.pth\n",
+ "# !aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/f0G32k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/pretrained_v2 -o f0G32k.pth\n",
+ "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/f0G40k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/pretrained_v2 -o f0G40k.pth\n",
+ "# !aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/f0G48k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/pretrained_v2 -o f0G48k.pth"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "HugjmZqZRuiF"
+ },
+ "outputs": [],
+ "source": [
+ "#@title Download the vocal separation models\n",
+ "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP2-人声vocals+非人声instrumentals.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/uvr5_weights -o HP2-人声vocals+非人声instrumentals.pth\n",
+ "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP5-主旋律人声vocals+其他instrumentals.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/uvr5_weights -o HP5-主旋律人声vocals+其他instrumentals.pth"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "2RCaT9FTR0ej"
+ },
+ "outputs": [],
+ "source": [
+ "#@title Download hubert_base\n",
+ "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/hubert_base.pt -d /content/Retrieval-based-Voice-Conversion-WebUI -o hubert_base.pt"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "jwu07JgqoFON"
+ },
+ "outputs": [],
+ "source": [
+ "#@title Mount Google Drive\n",
+ "\n",
+ "from google.colab import drive\n",
+ "drive.mount('/content/drive')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "Mwk7Q0Loqzjx"
+ },
+ "outputs": [],
+ "source": [
+ "#@title Load the packed dataset from Google Drive into /content/dataset\n",
+ "\n",
+ "#@markdown Dataset location\n",
+ "DATASET = \"/content/drive/MyDrive/dataset/lulu20230327_32k.zip\" #@param {type:\"string\"}\n",
+ "\n",
+ "!mkdir -p /content/dataset\n",
+ "!unzip -d /content/dataset -B {DATASET}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "PDlFxWHWEynD"
+ },
+ "outputs": [],
+ "source": [
+ "#@title Rename duplicate filenames in the dataset\n",
+ "!ls -a /content/dataset/\n",
+ "!rename 's/(\\w+)\\.(\\w+)~(\\d*)/$1_$3.$2/' /content/dataset/*.*~*"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "7vh6vphDwO0b"
+ },
+ "outputs": [],
+ "source": [
+ "#@title Launch the web UI\n",
+ "%cd /content/Retrieval-based-Voice-Conversion-WebUI\n",
+ "# %load_ext tensorboard\n",
+ "# %tensorboard --logdir /content/Retrieval-based-Voice-Conversion-WebUI/logs\n",
+ "!python3 infer-web.py --colab --pycmd python3"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "FgJuNeAwx5Y_"
+ },
+ "outputs": [],
+ "source": [
+ "#@title Manually back up trained model files to Google Drive\n",
+ "#@markdown Check the model filenames under the logs folder yourself and edit the filenames at the end of the commands below\n",
+ "\n",
+ "#@markdown Model name\n",
+ "MODELNAME = \"lulu\" #@param {type:\"string\"}\n",
+ "#@markdown Model epoch\n",
+ "MODELEPOCH = 9600 #@param {type:\"integer\"}\n",
+ "\n",
+ "!cp /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/G_{MODELEPOCH}.pth /content/drive/MyDrive/{MODELNAME}_D_{MODELEPOCH}.pth\n",
+ "!cp /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/D_{MODELEPOCH}.pth /content/drive/MyDrive/{MODELNAME}_G_{MODELEPOCH}.pth\n",
+ "!cp /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/added_*.index /content/drive/MyDrive/\n",
+ "!cp /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/total_*.npy /content/drive/MyDrive/\n",
+ "\n",
+ "!cp /content/Retrieval-based-Voice-Conversion-WebUI/weights/{MODELNAME}.pth /content/drive/MyDrive/{MODELNAME}{MODELEPOCH}.pth"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "OVQoLQJXS7WX"
+ },
+ "outputs": [],
+ "source": [
+ "#@title Restore pth files from Google Drive\n",
+ "#@markdown Check the model filenames under the logs folder yourself and edit the filenames at the end of the commands below\n",
+ "\n",
+ "#@markdown Model name\n",
+ "MODELNAME = \"lulu\" #@param {type:\"string\"}\n",
+ "#@markdown Model epoch\n",
+ "MODELEPOCH = 7500 #@param {type:\"integer\"}\n",
+ "\n",
+ "!mkdir -p /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}\n",
+ "\n",
+ "!cp /content/drive/MyDrive/{MODELNAME}_D_{MODELEPOCH}.pth /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/G_{MODELEPOCH}.pth\n",
+ "!cp /content/drive/MyDrive/{MODELNAME}_G_{MODELEPOCH}.pth /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/D_{MODELEPOCH}.pth\n",
+ "!cp /content/drive/MyDrive/*.index /content/\n",
+ "!cp /content/drive/MyDrive/*.npy /content/\n",
+ "!cp /content/drive/MyDrive/{MODELNAME}{MODELEPOCH}.pth /content/Retrieval-based-Voice-Conversion-WebUI/weights/{MODELNAME}.pth"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "ZKAyuKb9J6dz"
+ },
+ "outputs": [],
+ "source": [
+ "#@title Manual preprocessing (not recommended)\n",
+ "#@markdown Model name\n",
+ "MODELNAME = \"lulu\" #@param {type:\"string\"}\n",
+ "#@markdown Sample rate\n",
+ "BITRATE = 48000 #@param {type:\"integer\"}\n",
+ "#@markdown Number of processes\n",
+ "THREADCOUNT = 8 #@param {type:\"integer\"}\n",
+ "\n",
+ "!python3 trainset_preprocess_pipeline_print.py /content/dataset {BITRATE} {THREADCOUNT} logs/{MODELNAME} True\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "CrxJqzAUKmPJ"
+ },
+ "outputs": [],
+ "source": [
+ "#@title Manual feature extraction (not recommended)\n",
+ "#@markdown Model name\n",
+ "MODELNAME = \"lulu\" #@param {type:\"string\"}\n",
+ "#@markdown Number of processes\n",
+ "THREADCOUNT = 8 #@param {type:\"integer\"}\n",
+ "#@markdown Pitch extraction algorithm\n",
+ "ALGO = \"harvest\" #@param {type:\"string\"}\n",
+ "\n",
+ "!python3 extract_f0_print.py logs/{MODELNAME} {THREADCOUNT} {ALGO}\n",
+ "\n",
+ "!python3 extract_feature_print.py cpu 1 0 0 logs/{MODELNAME}\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "IMLPLKOaKj58"
+ },
+ "outputs": [],
+ "source": [
+ "#@title Manual training (not recommended)\n",
+ "#@markdown Model name\n",
+ "MODELNAME = \"lulu\" #@param {type:\"string\"}\n",
+ "#@markdown GPU(s) to use\n",
+ "USEGPU = \"0\" #@param {type:\"string\"}\n",
+ "#@markdown Batch size\n",
+ "BATCHSIZE = 32 #@param {type:\"integer\"}\n",
+ "#@markdown Stop at epoch\n",
+ "MODELEPOCH = 3200 #@param {type:\"integer\"}\n",
+ "#@markdown Save interval in epochs\n",
+ "EPOCHSAVE = 100 #@param {type:\"integer\"}\n",
+ "#@markdown Sample rate\n",
+ "MODELSAMPLE = \"48k\" #@param {type:\"string\"}\n",
+ "#@markdown Cache the training set\n",
+ "CACHEDATA = 1 #@param {type:\"integer\"}\n",
+ "#@markdown Keep only the latest ckpt file\n",
+ "ONLYLATEST = 0 #@param {type:\"integer\"}\n",
+ "\n",
+ "!python3 train_nsf_sim_cache_sid_load_pretrain.py -e {MODELNAME} -sr {MODELSAMPLE} -f0 1 -bs {BATCHSIZE} -g {USEGPU} -te {MODELEPOCH} -se {EPOCHSAVE} -pg pretrained/f0G{MODELSAMPLE}.pth -pd pretrained/f0D{MODELSAMPLE}.pth -l {ONLYLATEST} -c {CACHEDATA}\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "haYA81hySuDl"
+ },
+ "outputs": [],
+ "source": [
+ "#@title Delete all other pth files, keeping only the selected one (careful: read the code first)\n",
+ "#@markdown Model name\n",
+ "MODELNAME = \"lulu\" #@param {type:\"string\"}\n",
+ "#@markdown Selected model epoch\n",
+ "MODELEPOCH = 9600 #@param {type:\"integer\"}\n",
+ "\n",
+ "!echo \"Backing up the selected model...\"\n",
+ "!cp /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/G_{MODELEPOCH}.pth /content/{MODELNAME}_D_{MODELEPOCH}.pth\n",
+ "!cp /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/D_{MODELEPOCH}.pth /content/{MODELNAME}_G_{MODELEPOCH}.pth\n",
+ "\n",
+ "!echo \"Deleting...\"\n",
+ "!ls /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}\n",
+ "!rm /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/*.pth\n",
+ "\n",
+ "!echo \"Restoring the selected model...\"\n",
+ "!mv /content/{MODELNAME}_D_{MODELEPOCH}.pth /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/G_{MODELEPOCH}.pth\n",
+ "!mv /content/{MODELNAME}_G_{MODELEPOCH}.pth /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/D_{MODELEPOCH}.pth\n",
+ "\n",
+ "!echo \"Deletion finished\"\n",
+ "!ls /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "QhSiPTVPoIRh"
+ },
+ "outputs": [],
+ "source": [
+ "#@title Remove all files under the project, keeping only the selected model (careful: read the code first)\n",
+ "#@markdown Model name\n",
+ "MODELNAME = \"lulu\" #@param {type:\"string\"}\n",
+ "#@markdown Selected model epoch\n",
+ "MODELEPOCH = 9600 #@param {type:\"integer\"}\n",
+ "\n",
+ "!echo \"Backing up the selected model...\"\n",
+ "!cp /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/G_{MODELEPOCH}.pth /content/{MODELNAME}_D_{MODELEPOCH}.pth\n",
+ "!cp /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/D_{MODELEPOCH}.pth /content/{MODELNAME}_G_{MODELEPOCH}.pth\n",
+ "\n",
+ "!echo \"Deleting...\"\n",
+ "!ls /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}\n",
+ "!rm -rf /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/*\n",
+ "\n",
+ "!echo \"Restoring the selected model...\"\n",
+ "!mv /content/{MODELNAME}_D_{MODELEPOCH}.pth /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/G_{MODELEPOCH}.pth\n",
+ "!mv /content/{MODELNAME}_G_{MODELEPOCH}.pth /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/D_{MODELEPOCH}.pth\n",
+ "\n",
+ "!echo \"Deletion finished\"\n",
+ "!ls /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}"
+ ]
+ }
+ ],
+ "metadata": {
+ "accelerator": "GPU",
+ "colab": {
+ "private_outputs": true,
+ "provenance": []
+ },
+ "gpuClass": "standard",
+ "kernelspec": {
+ "display_name": "Python 3",
+ "name": "python3"
+ },
+ "language_info": {
+ "name": "python"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+ }
app.py ADDED
@@ -0,0 +1,322 @@
+ import io
+ import os
+ import torch
+
+ # os.system("wget -P cvec/ https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/hubert_base.pt")
+ import gradio as gr
+ import librosa
+ import numpy as np
+ import soundfile
+ import logging
+ from fairseq import checkpoint_utils
+ from my_utils import load_audio
+ from vc_infer_pipeline import VC
+ import traceback
+ from config import Config
+ from infer_pack.models import (
+     SynthesizerTrnMs256NSFsid,
+     SynthesizerTrnMs256NSFsid_nono,
+     SynthesizerTrnMs768NSFsid,
+     SynthesizerTrnMs768NSFsid_nono,
+ )
+ from i18n import I18nAuto
+
+ logging.getLogger("numba").setLevel(logging.WARNING)
+ logging.getLogger("markdown_it").setLevel(logging.WARNING)
+ logging.getLogger("urllib3").setLevel(logging.WARNING)
+ logging.getLogger("matplotlib").setLevel(logging.WARNING)
+
+ i18n = I18nAuto()
+ i18n.print()
+
+ config = Config()
+
+ weight_root = "weights"
+ weight_uvr5_root = "uvr5_weights"
+ index_root = "logs"
+ names = []
+ hubert_model = None
+ for name in os.listdir(weight_root):
+     if name.endswith(".pth"):
+         names.append(name)
+ index_paths = []
+ for root, dirs, files in os.walk(index_root, topdown=False):
+     for name in files:
+         if name.endswith(".index") and "trained" not in name:
+             index_paths.append("%s/%s" % (root, name))
+
+
+ def get_vc(sid):
+     global n_spk, tgt_sr, net_g, vc, cpt, version
+     if sid == "" or sid == []:
+         global hubert_model
+         if hubert_model is not None:  # polling may switch sid from a loaded model to none, so check for that here
+             print("clean_empty_cache")
+             del net_g, n_spk, vc, hubert_model, tgt_sr  # ,cpt
+             hubert_model = net_g = n_spk = vc = hubert_model = tgt_sr = None
+             if torch.cuda.is_available():
+                 torch.cuda.empty_cache()
+             ### the rebuild-and-delete below is needed, otherwise the cache is not cleaned completely
+             if_f0 = cpt.get("f0", 1)
+             version = cpt.get("version", "v1")
+             if version == "v1":
+                 if if_f0 == 1:
+                     net_g = SynthesizerTrnMs256NSFsid(
+                         *cpt["config"], is_half=config.is_half
+                     )
+                 else:
+                     net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
+             elif version == "v2":
+                 if if_f0 == 1:
+                     net_g = SynthesizerTrnMs768NSFsid(
+                         *cpt["config"], is_half=config.is_half
+                     )
+                 else:
+                     net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"])
+             del net_g, cpt
+             if torch.cuda.is_available():
+                 torch.cuda.empty_cache()
+             cpt = None
+         return {"visible": False, "__type__": "update"}
+     person = "%s/%s" % (weight_root, sid)
+     print("loading %s" % person)
+     cpt = torch.load(person, map_location="cpu")
+     tgt_sr = cpt["config"][-1]
+     cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]  # n_spk
+     if_f0 = cpt.get("f0", 1)
+     version = cpt.get("version", "v1")
+     if version == "v1":
+         if if_f0 == 1:
+             net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=config.is_half)
+         else:
+             net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
+     elif version == "v2":
+         if if_f0 == 1:
+             net_g = SynthesizerTrnMs768NSFsid(*cpt["config"], is_half=config.is_half)
+         else:
+             net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"])
+     del net_g.enc_q
+     print(net_g.load_state_dict(cpt["weight"], strict=False))
+     net_g.eval().to(config.device)
+     if config.is_half:
+         net_g = net_g.half()
+     else:
+         net_g = net_g.float()
+     vc = VC(tgt_sr, config)
+     n_spk = cpt["config"][-3]
+     return {"visible": True, "maximum": n_spk, "__type__": "update"}
+
+
+ def load_hubert():
+     global hubert_model
+     models, _, _ = checkpoint_utils.load_model_ensemble_and_task(
+         ["hubert_base.pt"],
+         suffix="",
+     )
+     hubert_model = models[0]
+     hubert_model = hubert_model.to(config.device)
+     if config.is_half:
+         hubert_model = hubert_model.half()
+     else:
+         hubert_model = hubert_model.float()
+     hubert_model.eval()
+
+
+ def vc_single(
+     sid,
+     input_audio_path,
+     f0_up_key,
+     f0_file,
+     f0_method,
+     file_index,
+     file_index2,
+     # file_big_npy,
+     index_rate,
+     filter_radius,
+     resample_sr,
+     rms_mix_rate,
+     protect,
+ ):  # spk_item, input_audio0, vc_transform0,f0_file,f0method0
+     global tgt_sr, net_g, vc, hubert_model, version
+     if input_audio_path is None:
+         return "You need to upload an audio", None
+     f0_up_key = int(f0_up_key)
+     try:
+         audio = input_audio_path[1] / 32768.0
+         if len(audio.shape) == 2:
+             audio = np.mean(audio, -1)
+         audio = librosa.resample(audio, orig_sr=input_audio_path[0], target_sr=16000)
+         audio_max = np.abs(audio).max() / 0.95
+         if audio_max > 1:
+             audio /= audio_max
+         times = [0, 0, 0]
+         if hubert_model is None:
+             load_hubert()
+         if_f0 = cpt.get("f0", 1)
+         file_index = (
+             (
+                 file_index.strip(" ")
+                 .strip('"')
+                 .strip("\n")
+                 .strip('"')
+                 .strip(" ")
+                 .replace("trained", "added")
+             )
+             if file_index != ""
+             else file_index2
+         )  # guard against user typos: automatically swap "trained" for "added"
+         # file_big_npy = (
+         #     file_big_npy.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
+         # )
+         audio_opt = vc.pipeline(
+             hubert_model,
+             net_g,
+             sid,
+             audio,
+             input_audio_path,
+             times,
+             f0_up_key,
+             f0_method,
+             file_index,
+             # file_big_npy,
+             index_rate,
+             if_f0,
+             filter_radius,
+             tgt_sr,
+             resample_sr,
+             rms_mix_rate,
+             version,
+             protect,
+             f0_file=f0_file,
+         )
+         if resample_sr >= 16000 and tgt_sr != resample_sr:
+             tgt_sr = resample_sr
+         index_info = (
+             "Using index:%s." % file_index
+             if os.path.exists(file_index)
+             else "Index not used."
+         )
+         return "Success.\n %s\nTime:\n npy:%ss, f0:%ss, infer:%ss" % (
+             index_info,
+             times[0],
+             times[1],
+             times[2],
+         ), (tgt_sr, audio_opt)
+     except:
+         info = traceback.format_exc()
+         print(info)
+         return info, (None, None)
+
+
+ app = gr.Blocks()
+ with app:
+     with gr.Tabs():
+         with gr.TabItem("Online demo"):
+             gr.Markdown(
+                 value="""
+ RVC online demo
+ """
+             )
+             sid = gr.Dropdown(label=i18n("推理音色"), choices=sorted(names))
+             with gr.Column():
+                 spk_item = gr.Slider(
+                     minimum=0,
+                     maximum=2333,
+                     step=1,
+                     label=i18n("请选择说话人id"),
+                     value=0,
+                     visible=False,
+                     interactive=True,
+                 )
+             sid.change(
+                 fn=get_vc,
+                 inputs=[sid],
+                 outputs=[spk_item],
+             )
+             gr.Markdown(
+                 value=i18n("男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ")
+             )
+             vc_input3 = gr.Audio(label="Upload audio (shorter than 90 seconds)")
+             vc_transform0 = gr.Number(label=i18n("变调(整数, 半音数量, 升八度12降八度-12)"), value=0)
+             f0method0 = gr.Radio(
+                 label=i18n("选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU"),
+                 choices=["pm", "harvest", "crepe"],
+                 value="pm",
+                 interactive=True,
+             )
+             filter_radius0 = gr.Slider(
+                 minimum=0,
+                 maximum=7,
+                 label=i18n(">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音"),
+                 value=3,
+                 step=1,
+                 interactive=True,
+             )
+             with gr.Column():
+                 file_index1 = gr.Textbox(
+                     label=i18n("特征检索库文件路径,为空则使用下拉的选择结果"),
+                     value="",
+                     interactive=False,
+                     visible=False,
+                 )
+                 file_index2 = gr.Dropdown(
+                     label=i18n("自动检测index路径,下拉式选择(dropdown)"),
+                     choices=sorted(index_paths),
+                     interactive=True,
+                 )
+             index_rate1 = gr.Slider(
+                 minimum=0,
+                 maximum=1,
+                 label=i18n("检索特征占比"),
+                 value=0.88,
+                 interactive=True,
+             )
+             resample_sr0 = gr.Slider(
+                 minimum=0,
+                 maximum=48000,
+                 label=i18n("后处理重采样至最终采样率,0为不进行重采样"),
+                 value=0,
+                 step=1,
+                 interactive=True,
+             )
+             rms_mix_rate0 = gr.Slider(
+                 minimum=0,
+                 maximum=1,
+                 label=i18n("输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络"),
+                 value=1,
+                 interactive=True,
+             )
+             protect0 = gr.Slider(
+                 minimum=0,
+                 maximum=0.5,
+                 label=i18n("保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果"),
+                 value=0.33,
+                 step=0.01,
+                 interactive=True,
+             )
+             f0_file = gr.File(label=i18n("F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调"))
+             but0 = gr.Button(i18n("转换"), variant="primary")
+             vc_output1 = gr.Textbox(label=i18n("输出信息"))
+             vc_output2 = gr.Audio(label=i18n("输出音频(右下角三个点,点了可以下载)"))
+             but0.click(
+                 vc_single,
+                 [
+                     spk_item,
+                     vc_input3,
+                     vc_transform0,
+                     f0_file,
+                     f0method0,
+                     file_index1,
+                     file_index2,
+                     # file_big_npy1,
+                     index_rate1,
+                     filter_radius0,
+                     resample_sr0,
+                     rms_mix_rate0,
+                     protect0,
+                 ],
+                 [vc_output1, vc_output2],
+             )
+
+
+ app.launch()
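A minimal sketch of driving vc_single from app.py without the Gradio UI, assuming hubert_base.pt and a model under weights/ are present. gr.Audio hands vc_single a (sample_rate, int16 ndarray) tuple, which is reproduced here with scipy (an import this sketch adds; the model name is a placeholder):

    from scipy.io import wavfile

    get_vc("mymodel.pth")                 # load a model from weights/ (name assumed)
    sr, data = wavfile.read("input.wav")  # int16 PCM, as gr.Audio delivers it
    msg, (out_sr, out) = vc_single(
        0, (sr, data), 0, None, "pm", "", "", 0.88, 3, 0, 1, 0.33
    )
    print(msg)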
config.py ADDED
@@ -0,0 +1,123 @@
+ import argparse
+ import torch
+ from multiprocessing import cpu_count
+
+
+ def config_file_change_fp32():
+     for config_file in ["32k.json", "40k.json", "48k.json"]:
+         with open(f"configs/{config_file}", "r") as f:
+             strr = f.read().replace("true", "false")
+         with open(f"configs/{config_file}", "w") as f:
+             f.write(strr)
+     with open("trainset_preprocess_pipeline_print.py", "r") as f:
+         strr = f.read().replace("3.7", "3.0")
+     with open("trainset_preprocess_pipeline_print.py", "w") as f:
+         f.write(strr)
+
+
+ class Config:
+     def __init__(self):
+         self.device = "cuda:0"
+         self.is_half = True
+         self.n_cpu = 0
+         self.gpu_name = None
+         self.gpu_mem = None
+
+         self.python_cmd = "python3"
+         self.listen_port = 7865
+         self.iscolab = False
+         self.noparallel = False
+         self.noautoopen = False
+
+         self.x_pad, self.x_query, self.x_center, self.x_max = self.device_config()
+
+     @staticmethod
+     def arg_parse() -> tuple:
+         parser = argparse.ArgumentParser()
+         parser.add_argument("--port", type=int, default=7865, help="Listen port")
+         parser.add_argument(
+             "--pycmd", type=str, default="python3", help="Python command"
+         )
+         parser.add_argument("--colab", action="store_true", help="Launch in colab")
+         parser.add_argument(
+             "--noparallel", action="store_true", help="Disable parallel processing"
+         )
+         parser.add_argument(
+             "--noautoopen",
+             action="store_true",
+             help="Do not open in browser automatically",
+         )
+         cmd_opts = parser.parse_args()
+
+         cmd_opts.port = cmd_opts.port if 0 <= cmd_opts.port <= 65535 else 7865
+
+         return (
+             cmd_opts.pycmd,
+             cmd_opts.port,
+             cmd_opts.colab,
+             cmd_opts.noparallel,
+             cmd_opts.noautoopen,
+         )
+
+     def device_config(self) -> tuple:
+         if torch.cuda.is_available():
+             i_device = int(self.device.split(":")[-1])
+             self.gpu_name = torch.cuda.get_device_name(i_device)
+             if (
+                 ("16" in self.gpu_name and "V100" not in self.gpu_name.upper())
+                 or "P40" in self.gpu_name.upper()
+                 or "1060" in self.gpu_name
+                 or "1070" in self.gpu_name
+                 or "1080" in self.gpu_name
+             ):
+                 print("16-series/10-series GPUs and the P40 are forced to single precision")
+                 self.is_half = False
+                 config_file_change_fp32()
+             else:
+                 self.gpu_name = None
+             self.gpu_mem = int(
+                 torch.cuda.get_device_properties(i_device).total_memory
+                 / 1024
+                 / 1024
+                 / 1024
+                 + 0.4
+             )
+             if self.gpu_mem <= 4:
+                 with open("trainset_preprocess_pipeline_print.py", "r") as f:
+                     strr = f.read().replace("3.7", "3.0")
+                 with open("trainset_preprocess_pipeline_print.py", "w") as f:
+                     f.write(strr)
+         elif torch.backends.mps.is_available():
+             print("No supported NVIDIA GPU found, using MPS for inference")
+             self.device = "mps"
+             self.is_half = False
+             config_file_change_fp32()
+         else:
+             print("No supported NVIDIA GPU found, using CPU for inference")
+             self.device = "cpu"
+             self.is_half = False
+             config_file_change_fp32()
+
+         if self.n_cpu == 0:
+             self.n_cpu = cpu_count()
+
+         if self.is_half:
+             # config for 6 GB VRAM
+             x_pad = 3
+             x_query = 10
+             x_center = 60
+             x_max = 65
+         else:
+             # config for 5 GB VRAM
+             x_pad = 1
+             x_query = 6
+             x_center = 38
+             x_max = 41
+
+         if self.gpu_mem is not None and self.gpu_mem <= 4:
+             x_pad = 1
+             x_query = 5
+             x_center = 30
+             x_max = 32
+
+         return x_pad, x_query, x_center, x_max
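As a quick illustration of how the rest of the codebase consumes this class (a sketch; the printed values depend on the machine it runs on):

    config = Config()                      # device_config() runs inside __init__
    print(config.device, config.is_half)   # e.g. "cuda:0 True" on a supported GPU
    print(config.x_pad, config.x_query, config.x_center, config.x_max)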
envfilescheck.bat ADDED
@@ -0,0 +1,348 @@
+ @echo off && chcp 65001
+
+ echo working dir is %cd%
+ echo checking for the required downloader aria2.
+ echo=
+ dir /a:d/b | findstr "aria2" > flag.txt
+ findstr "aria2" flag.txt >nul
+ if %errorlevel% ==0 (
+ echo aria2 checked.
+ echo=
+ ) else (
+ echo failed. please download aria2 from its webpage!
+ echo unzip it and put it in this directory!
+ timeout /T 5
+ start https://github.com/aria2/aria2/releases/tag/release-1.36.0
+ echo=
+ goto end
+ )
+
+ echo envfiles check starting.
+ echo=
+
+ for /f %%x in ('findstr /i /c:"aria2" "flag.txt"') do (set aria2=%%x)&goto endSch
+ :endSch
+
+ set d32=f0D32k.pth
+ set d40=f0D40k.pth
+ set d48=f0D48k.pth
+ set g32=f0G32k.pth
+ set g40=f0G40k.pth
+ set g48=f0G48k.pth
+
+ set d40v2=f0D40k.pth
+ set g40v2=f0G40k.pth
+
+ set dld32=https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0D32k.pth
+ set dld40=https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0D40k.pth
+ set dld48=https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0D48k.pth
+ set dlg32=https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0G32k.pth
+ set dlg40=https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0G40k.pth
+ set dlg48=https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0G48k.pth
+
+ set dld40v2=https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/f0D40k.pth
+ set dlg40v2=https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/f0G40k.pth
+
+ set hp2_all=HP2_all_vocals.pth
+ set hp3_all=HP3_all_vocals.pth
+ set hp5_only=HP5_only_main_vocal.pth
+ set VR_DeEchoAggressive=VR-DeEchoAggressive.pth
+ set VR_DeEchoDeReverb=VR-DeEchoDeReverb.pth
+ set VR_DeEchoNormal=VR-DeEchoNormal.pth
+ set onnx_dereverb=vocals.onnx
+
+ set dlhp2_all=https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP2_all_vocals.pth
+ set dlhp3_all=https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP3_all_vocals.pth
+ set dlhp5_only=https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP5_only_main_vocal.pth
+ set dlVR_DeEchoAggressive=https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/VR-DeEchoAggressive.pth
+ set dlVR_DeEchoDeReverb=https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/VR-DeEchoDeReverb.pth
+ set dlVR_DeEchoNormal=https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/VR-DeEchoNormal.pth
+ set dlonnx_dereverb=https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/onnx_dereverb_By_FoxJoy/vocals.onnx
+
+ set hb=hubert_base.pt
+
+ set dlhb=https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/hubert_base.pt
+
+ echo dir check start.
+ echo=
+
+ if exist "%~dp0pretrained" (
+ echo dir .\pretrained checked.
+ ) else (
+ echo failed. generating dir .\pretrained.
+ mkdir pretrained
+ )
+ if exist "%~dp0pretrained_v2" (
+ echo dir .\pretrained_v2 checked.
+ ) else (
+ echo failed. generating dir .\pretrained_v2.
+ mkdir pretrained_v2
+ )
+ if exist "%~dp0uvr5_weights" (
+ echo dir .\uvr5_weights checked.
+ ) else (
+ echo failed. generating dir .\uvr5_weights.
+ mkdir uvr5_weights
+ )
+ if exist "%~dp0uvr5_weights\onnx_dereverb_By_FoxJoy" (
+ echo dir .\uvr5_weights\onnx_dereverb_By_FoxJoy checked.
+ ) else (
+ echo failed. generating dir .\uvr5_weights\onnx_dereverb_By_FoxJoy.
+ mkdir uvr5_weights\onnx_dereverb_By_FoxJoy
+ )
+
+ echo=
+ echo dir check finished.
+
+ echo=
+ echo required files check start.
+
+ echo checking D32k.pth
+ if exist "%~dp0pretrained\D32k.pth" (
+ echo D32k.pth in .\pretrained checked.
+ echo=
+ ) else (
+ echo failed. starting download from huggingface.
+ %~dp0%aria2%\aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/D32k.pth -d %~dp0pretrained -o D32k.pth
+ if exist "%~dp0pretrained\D32k.pth" (echo download successful.) else (echo please try again!
+ echo=)
+ )
+ echo checking D40k.pth
+ if exist "%~dp0pretrained\D40k.pth" (
+ echo D40k.pth in .\pretrained checked.
+ echo=
+ ) else (
+ echo failed. starting download from huggingface.
+ %~dp0%aria2%\aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/D40k.pth -d %~dp0pretrained -o D40k.pth
+ if exist "%~dp0pretrained\D40k.pth" (echo download successful.) else (echo please try again!
+ echo=)
+ )
+ echo checking D40k.pth
+ if exist "%~dp0pretrained_v2\D40k.pth" (
+ echo D40k.pth in .\pretrained_v2 checked.
+ echo=
+ ) else (
+ echo failed. starting download from huggingface.
+ %~dp0%aria2%\aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/D40k.pth -d %~dp0pretrained_v2 -o D40k.pth
+ if exist "%~dp0pretrained_v2\D40k.pth" (echo download successful.) else (echo please try again!
+ echo=)
+ )
+ echo checking D48k.pth
+ if exist "%~dp0pretrained\D48k.pth" (
+ echo D48k.pth in .\pretrained checked.
+ echo=
+ ) else (
+ echo failed. starting download from huggingface.
+ %~dp0%aria2%\aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/D48k.pth -d %~dp0pretrained -o D48k.pth
+ if exist "%~dp0pretrained\D48k.pth" (echo download successful.) else (echo please try again!
+ echo=)
+ )
+ echo checking G32k.pth
+ if exist "%~dp0pretrained\G32k.pth" (
+ echo G32k.pth in .\pretrained checked.
+ echo=
+ ) else (
+ echo failed. starting download from huggingface.
+ %~dp0%aria2%\aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/G32k.pth -d %~dp0pretrained -o G32k.pth
+ if exist "%~dp0pretrained\G32k.pth" (echo download successful.) else (echo please try again!
+ echo=)
+ )
+ echo checking G40k.pth
+ if exist "%~dp0pretrained\G40k.pth" (
+ echo G40k.pth in .\pretrained checked.
+ echo=
+ ) else (
+ echo failed. starting download from huggingface.
+ %~dp0%aria2%\aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/G40k.pth -d %~dp0pretrained -o G40k.pth
+ if exist "%~dp0pretrained\G40k.pth" (echo download successful.) else (echo please try again!
+ echo=)
+ )
+ echo checking G40k.pth
+ if exist "%~dp0pretrained_v2\G40k.pth" (
+ echo G40k.pth in .\pretrained_v2 checked.
+ echo=
+ ) else (
+ echo failed. starting download from huggingface.
+ %~dp0%aria2%\aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/G40k.pth -d %~dp0pretrained_v2 -o G40k.pth
+ if exist "%~dp0pretrained_v2\G40k.pth" (echo download successful.) else (echo please try again!
+ echo=)
+ )
+ echo checking G48k.pth
+ if exist "%~dp0pretrained\G48k.pth" (
+ echo G48k.pth in .\pretrained checked.
+ echo=
+ ) else (
+ echo failed. starting download from huggingface.
+ %~dp0%aria2%\aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/G48k.pth -d %~dp0pretrained -o G48k.pth
+ if exist "%~dp0pretrained\G48k.pth" (echo download successful.) else (echo please try again!
+ echo=)
+ )
+
+ echo checking %d32%
+ if exist "%~dp0pretrained\%d32%" (
+ echo %d32% in .\pretrained checked.
+ echo=
+ ) else (
+ echo failed. starting download from huggingface.
+ %~dp0%aria2%\aria2c --console-log-level=error -c -x 16 -s 16 -k 1M %dld32% -d %~dp0pretrained -o %d32%
+ if exist "%~dp0pretrained\%d32%" (echo download successful.) else (echo please try again!
+ echo=)
+ )
+ echo checking %d40%
+ if exist "%~dp0pretrained\%d40%" (
+ echo %d40% in .\pretrained checked.
+ echo=
+ ) else (
+ echo failed. starting download from huggingface.
+ %~dp0%aria2%\aria2c --console-log-level=error -c -x 16 -s 16 -k 1M %dld40% -d %~dp0pretrained -o %d40%
+ if exist "%~dp0pretrained\%d40%" (echo download successful.) else (echo please try again!
+ echo=)
+ )
+ echo checking %d40v2%
+ if exist "%~dp0pretrained_v2\%d40v2%" (
+ echo %d40v2% in .\pretrained_v2 checked.
+ echo=
+ ) else (
+ echo failed. starting download from huggingface.
+ %~dp0%aria2%\aria2c --console-log-level=error -c -x 16 -s 16 -k 1M %dld40v2% -d %~dp0pretrained_v2 -o %d40v2%
+ if exist "%~dp0pretrained_v2\%d40v2%" (echo download successful.) else (echo please try again!
+ echo=)
+ )
+ echo checking %d48%
+ if exist "%~dp0pretrained\%d48%" (
+ echo %d48% in .\pretrained checked.
+ echo=
+ ) else (
+ echo failed. starting download from huggingface.
+ %~dp0%aria2%\aria2c --console-log-level=error -c -x 16 -s 16 -k 1M %dld48% -d %~dp0pretrained -o %d48%
+ if exist "%~dp0pretrained\%d48%" (echo download successful.) else (echo please try again!
+ echo=)
+ )
+ echo checking %g32%
+ if exist "%~dp0pretrained\%g32%" (
+ echo %g32% in .\pretrained checked.
+ echo=
+ ) else (
+ echo failed. starting download from huggingface.
+ %~dp0%aria2%\aria2c --console-log-level=error -c -x 16 -s 16 -k 1M %dlg32% -d %~dp0pretrained -o %g32%
+ if exist "%~dp0pretrained\%g32%" (echo download successful.) else (echo please try again!
+ echo=)
+ )
+ echo checking %g40%
+ if exist "%~dp0pretrained\%g40%" (
+ echo %g40% in .\pretrained checked.
+ echo=
+ ) else (
+ echo failed. starting download from huggingface.
+ %~dp0%aria2%\aria2c --console-log-level=error -c -x 16 -s 16 -k 1M %dlg40% -d %~dp0pretrained -o %g40%
+ if exist "%~dp0pretrained\%g40%" (echo download successful.) else (echo please try again!
+ echo=)
+ )
+ echo checking %g40v2%
+ if exist "%~dp0pretrained_v2\%g40v2%" (
+ echo %g40v2% in .\pretrained_v2 checked.
+ echo=
+ ) else (
+ echo failed. starting download from huggingface.
+ %~dp0%aria2%\aria2c --console-log-level=error -c -x 16 -s 16 -k 1M %dlg40v2% -d %~dp0pretrained_v2 -o %g40v2%
+ if exist "%~dp0pretrained_v2\%g40v2%" (echo download successful.) else (echo please try again!
+ echo=)
+ )
+ echo checking %g48%
+ if exist "%~dp0pretrained\%g48%" (
+ echo %g48% in .\pretrained checked.
+ echo=
+ ) else (
+ echo failed. starting download from huggingface.
257
+ %~dp0%aria2%\aria2c --console-log-level=error -c -x 16 -s 16 -k 1M %dlg48% -d %~dp0\pretrained -o %g48%
258
+ if exist "%~dp0pretrained\%g48%" (echo download successful.) else (echo please try again!
259
+ echo=)
260
+ )
261
+
262
+ echo checking %hp2_all%
263
+ if exist "%~dp0uvr5_weights\%hp2_all%" (
264
+ echo %hp2_all% in .\uvr5_weights checked.
265
+ echo=
266
+ ) else (
267
+ echo failed. starting download from huggingface.
268
+ %~dp0%aria2%\aria2c --console-log-level=error -c -x 16 -s 16 -k 1M %dlhp2_all% -d %~dp0\uvr5_weights -o %hp2_all%
269
+ if exist "%~dp0uvr5_weights\%hp2_all%" (echo download successful.) else (echo please try again!
270
+ echo=)
271
+ )
272
+ echo checking %hp3_all%
273
+ if exist "%~dp0uvr5_weights\%hp3_all%" (
274
+ echo %hp3_all% in .\uvr5_weights checked.
275
+ echo=
276
+ ) else (
277
+ echo failed. starting download from huggingface.
278
+ %~dp0%aria2%\aria2c --console-log-level=error -c -x 16 -s 16 -k 1M %dlhp3_all% -d %~dp0\uvr5_weights -o %hp3_all%
279
+ if exist "%~dp0uvr5_weights\%hp3_all%" (echo download successful.) else (echo please try again!
280
+ echo=)
281
+ )
282
+ echo checking %hp5_only%
283
+ if exist "%~dp0uvr5_weights\%hp5_only%" (
284
+ echo %hp5_only% in .\uvr5_weights checked.
285
+ echo=
286
+ ) else (
287
+ echo failed. starting download from huggingface.
288
+ %~dp0%aria2%\aria2c --console-log-level=error -c -x 16 -s 16 -k 1M %dlhp5_only% -d %~dp0\uvr5_weights -o %hp5_only%
289
+ if exist "%~dp0uvr5_weights\%hp5_only%" (echo download successful.) else (echo please try again!
290
+ echo=)
291
+ )
292
+ echo checking %VR_DeEchoAggressive%
293
+ if exist "%~dp0uvr5_weights\%VR_DeEchoAggressive%" (
294
+ echo %VR_DeEchoAggressive% in .\uvr5_weights checked.
295
+ echo=
296
+ ) else (
297
+ echo failed. starting download from huggingface.
298
+ %~dp0%aria2%\aria2c --console-log-level=error -c -x 16 -s 16 -k 1M %dlVR_DeEchoAggressive% -d %~dp0\uvr5_weights -o %VR_DeEchoAggressive%
299
+ if exist "%~dp0uvr5_weights\%VR_DeEchoAggressive%" (echo download successful.) else (echo please try again!
300
+ echo=)
301
+ )
302
+ echo checking %VR_DeEchoDeReverb%
303
+ if exist "%~dp0uvr5_weights\%VR_DeEchoDeReverb%" (
304
+ echo %VR_DeEchoDeReverb% in .\uvr5_weights checked.
305
+ echo=
306
+ ) else (
307
+ echo failed. starting download from huggingface.
308
+ %~dp0%aria2%\aria2c --console-log-level=error -c -x 16 -s 16 -k 1M %dlVR_DeEchoDeReverb% -d %~dp0\uvr5_weights -o %VR_DeEchoDeReverb%
309
+ if exist "%~dp0uvr5_weights\%VR_DeEchoDeReverb%" (echo download successful.) else (echo please try again!
310
+ echo=)
311
+ )
312
+ echo checking %VR_DeEchoNormal%
313
+ if exist "%~dp0uvr5_weights\%VR_DeEchoNormal%" (
314
+ echo %VR_DeEchoNormal% in .\uvr5_weights checked.
315
+ echo=
316
+ ) else (
317
+ echo failed. starting download from huggingface.
318
+ %~dp0%aria2%\aria2c --console-log-level=error -c -x 16 -s 16 -k 1M %dlVR_DeEchoNormal% -d %~dp0\uvr5_weights -o %VR_DeEchoNormal%
319
+ if exist "%~dp0uvr5_weights\%VR_DeEchoNormal%" (echo download successful.) else (echo please try again!
320
+ echo=)
321
+ )
322
+ echo checking %onnx_dereverb%
323
+ if exist "%~dp0uvr5_weights\onnx_dereverb_By_FoxJoy\%onnx_dereverb%" (
324
+ echo %onnx_dereverb% in .\uvr5_weights\onnx_dereverb_By_FoxJoy checked.
325
+ echo=
326
+ ) else (
327
+ echo failed. starting download from huggingface.
328
+ %~dp0%aria2%\aria2c --console-log-level=error -c -x 16 -s 16 -k 1M %dlonnx_dereverb% -d %~dp0\uvr5_weights\onnx_dereverb_By_FoxJoy -o %onnx_dereverb%
329
+ if exist "%~dp0uvr5_weights\onnx_dereverb_By_FoxJoy\%onnx_dereverb%" (echo download successful.) else (echo please try again!
330
+ echo=)
331
+ )
332
+
333
+ echo checking %hb%
334
+ if exist "%~dp0%hb%" (
335
+ echo %hb% in .\pretrained checked.
336
+ echo=
337
+ ) else (
338
+ echo failed. starting download from huggingface.
339
+ %~dp0%aria2%\aria2c --console-log-level=error -c -x 16 -s 16 -k 1M %dlhb% -d %~dp0 -o %hb%
340
+ if exist "%~dp0%hb%" (echo download successful.) else (echo please try again!
341
+ echo=)
342
+ )
343
+
344
+ echo required files check finished.
345
+ echo envfiles check complete.
346
+ pause
347
+ :end
348
+ del flag.txt
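
Note: the script above repeats one idiom per file: test whether the file exists, otherwise fetch it with aria2c and test again. A minimal Python sketch of the same check-then-download idiom (not part of this commit; `fetch` is a hypothetical helper, and urllib is single-stream where aria2c uses 16 connections):

    import os
    import urllib.request

    def fetch(url: str, dest_dir: str, name: str) -> None:
        """Download url into dest_dir/name if it is not already present, then re-check."""
        path = os.path.join(dest_dir, name)
        if os.path.exists(path):
            print(f"{name} in {dest_dir} checked.")
            return
        print("failed. starting download from huggingface.")
        os.makedirs(dest_dir, exist_ok=True)
        urllib.request.urlretrieve(url, path)  # aria2c adds resume + parallel segments
        print("download successful." if os.path.exists(path) else "please try again!")

    fetch(
        "https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/D32k.pth",
        "pretrained",
        "D32k.pth",
    )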
export_onnx.py ADDED
@@ -0,0 +1,54 @@
+ from infer_pack.models_onnx import SynthesizerTrnMsNSFsidM
+ import torch
+
+ if __name__ == "__main__":
+     MoeVS = True  # whether the model is for MoeVoiceStudio (formerly MoeSS)
+
+     ModelPath = "Shiroha/shiroha.pth"  # model path
+     ExportedPath = "model.onnx"  # output path
+     hidden_channels = 256  # hidden_channels, in preparation for the 768-dim Vec
+     cpt = torch.load(ModelPath, map_location="cpu")
+     cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]  # n_spk
+     print(*cpt["config"])
+
+     test_phone = torch.rand(1, 200, hidden_channels)  # hidden units
+     test_phone_lengths = torch.tensor([200]).long()  # hidden-unit length (appears unused)
+     test_pitch = torch.randint(size=(1, 200), low=5, high=255)  # base f0 (unit: Hz)
+     test_pitchf = torch.rand(1, 200)  # NSF base f0
+     test_ds = torch.LongTensor([0])  # speaker ID
+     test_rnd = torch.rand(1, 192, 200)  # noise (adds a random factor)
+
+     device = "cpu"  # device used for export (does not affect how the model is used)
+
+     net_g = SynthesizerTrnMsNSFsidM(
+         *cpt["config"], is_half=False
+     )  # fp32 export (fp16 in C++ would require manually rearranging memory, so fp16 is not used for now)
+     net_g.load_state_dict(cpt["weight"], strict=False)
+     input_names = ["phone", "phone_lengths", "pitch", "pitchf", "ds", "rnd"]
+     output_names = [
+         "audio",
+     ]
+     # net_g.construct_spkmixmap(n_speaker)  # export with a multi-speaker mix track
+     torch.onnx.export(
+         net_g,
+         (
+             test_phone.to(device),
+             test_phone_lengths.to(device),
+             test_pitch.to(device),
+             test_pitchf.to(device),
+             test_ds.to(device),
+             test_rnd.to(device),
+         ),
+         ExportedPath,
+         dynamic_axes={
+             "phone": [1],
+             "pitch": [1],
+             "pitchf": [1],
+             "rnd": [2],
+         },
+         do_constant_folding=False,
+         opset_version=16,
+         verbose=False,
+         input_names=input_names,
+         output_names=output_names,
+     )
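
Note: a hedged sketch (not part of this commit) of consuming the exported graph with onnxruntime. The input names, dtypes, and shapes mirror the dummy tensors in the export script above; the time axes are dynamic, and `hidden_channels` would differ for a 768-dim model:

    import numpy as np
    import onnxruntime as ort

    sess = ort.InferenceSession("model.onnx", providers=["CPUExecutionProvider"])
    n_frames, hidden_channels = 200, 256  # feature dim must match the exported model
    audio = sess.run(
        ["audio"],
        {
            "phone": np.random.rand(1, n_frames, hidden_channels).astype(np.float32),
            "phone_lengths": np.array([n_frames], dtype=np.int64),
            "pitch": np.random.randint(5, 255, (1, n_frames), dtype=np.int64),
            "pitchf": np.random.rand(1, n_frames).astype(np.float32),
            "ds": np.array([0], dtype=np.int64),
            "rnd": np.random.rand(1, 192, n_frames).astype(np.float32),
        },
    )[0]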
extract_f0_print.py ADDED
@@ -0,0 +1,160 @@
+ import os, traceback, sys, parselmouth
+
+ now_dir = os.getcwd()
+ sys.path.append(now_dir)
+ from my_utils import load_audio
+ import pyworld
+ from scipy.io import wavfile
+ import numpy as np, logging
+
+ logging.getLogger("numba").setLevel(logging.WARNING)
+ from multiprocessing import Process
+
+ exp_dir = sys.argv[1]
+ f = open("%s/extract_f0_feature.log" % exp_dir, "a+")
+
+
+ def printt(strr):
+     print(strr)
+     f.write("%s\n" % strr)
+     f.flush()
+
+
+ n_p = int(sys.argv[2])
+ f0method = sys.argv[3]
+
+
+ class FeatureInput(object):
+     def __init__(self, samplerate=16000, hop_size=160):
+         self.fs = samplerate
+         self.hop = hop_size
+
+         self.f0_bin = 256
+         self.f0_max = 1100.0
+         self.f0_min = 50.0
+         self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700)
+         self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700)
+
+     def compute_f0(self, path, f0_method):
+         x = load_audio(path, self.fs)
+         p_len = x.shape[0] // self.hop
+         if f0_method == "pm":
+             time_step = 160 / 16000 * 1000
+             f0_min = 50
+             f0_max = 1100
+             f0 = (
+                 parselmouth.Sound(x, self.fs)
+                 .to_pitch_ac(
+                     time_step=time_step / 1000,
+                     voicing_threshold=0.6,
+                     pitch_floor=f0_min,
+                     pitch_ceiling=f0_max,
+                 )
+                 .selected_array["frequency"]
+             )
+             pad_size = (p_len - len(f0) + 1) // 2
+             if pad_size > 0 or p_len - len(f0) - pad_size > 0:
+                 f0 = np.pad(
+                     f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant"
+                 )
+         elif f0_method == "harvest":
+             f0, t = pyworld.harvest(
+                 x.astype(np.double),
+                 fs=self.fs,
+                 f0_ceil=self.f0_max,
+                 f0_floor=self.f0_min,
+                 frame_period=1000 * self.hop / self.fs,
+             )
+             f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.fs)
+         elif f0_method == "dio":
+             f0, t = pyworld.dio(
+                 x.astype(np.double),
+                 fs=self.fs,
+                 f0_ceil=self.f0_max,
+                 f0_floor=self.f0_min,
+                 frame_period=1000 * self.hop / self.fs,
+             )
+             f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.fs)
+         return f0
+
+     def coarse_f0(self, f0):
+         f0_mel = 1127 * np.log(1 + f0 / 700)
+         f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * (
+             self.f0_bin - 2
+         ) / (self.f0_mel_max - self.f0_mel_min) + 1
+
+         # use 0 or 1
+         f0_mel[f0_mel <= 1] = 1
+         f0_mel[f0_mel > self.f0_bin - 1] = self.f0_bin - 1
+         f0_coarse = np.rint(f0_mel).astype(int)
+         assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, (
+             f0_coarse.max(),
+             f0_coarse.min(),
+         )
+         return f0_coarse
+
+     def go(self, paths, f0_method):
+         if len(paths) == 0:
+             printt("no-f0-todo")
+         else:
+             printt("todo-f0-%s" % len(paths))
+             n = max(len(paths) // 5, 1)  # each process prints at most 5 progress lines
+             for idx, (inp_path, opt_path1, opt_path2) in enumerate(paths):
+                 try:
+                     if idx % n == 0:
+                         printt("f0ing,now-%s,all-%s,-%s" % (idx, len(paths), inp_path))
+                     if (
+                         os.path.exists(opt_path1 + ".npy") == True
+                         and os.path.exists(opt_path2 + ".npy") == True
+                     ):
+                         continue
+                     featur_pit = self.compute_f0(inp_path, f0_method)
+                     np.save(
+                         opt_path2,
+                         featur_pit,
+                         allow_pickle=False,
+                     )  # nsf
+                     coarse_pit = self.coarse_f0(featur_pit)
+                     np.save(
+                         opt_path1,
+                         coarse_pit,
+                         allow_pickle=False,
+                     )  # ori
+                 except:
+                     printt("f0fail-%s-%s-%s" % (idx, inp_path, traceback.format_exc()))
+
+
+ if __name__ == "__main__":
+     # exp_dir=r"E:\codes\py39\dataset\mi-test"
+     # n_p=16
+     # f = open("%s/log_extract_f0.log"%exp_dir, "w")
+     printt(sys.argv)
+     featureInput = FeatureInput()
+     paths = []
+     inp_root = "%s/1_16k_wavs" % (exp_dir)
+     opt_root1 = "%s/2a_f0" % (exp_dir)
+     opt_root2 = "%s/2b-f0nsf" % (exp_dir)
+
+     os.makedirs(opt_root1, exist_ok=True)
+     os.makedirs(opt_root2, exist_ok=True)
+     for name in sorted(list(os.listdir(inp_root))):
+         inp_path = "%s/%s" % (inp_root, name)
+         if "spec" in inp_path:
+             continue
+         opt_path1 = "%s/%s" % (opt_root1, name)
+         opt_path2 = "%s/%s" % (opt_root2, name)
+         paths.append([inp_path, opt_path1, opt_path2])
+
+     ps = []
+     for i in range(n_p):
+         p = Process(
+             target=featureInput.go,
+             args=(
+                 paths[i::n_p],
+                 f0method,
+             ),
+         )
+         ps.append(p)
+         p.start()
+     for i in range(n_p):
+         ps[i].join()
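
Note: the script is driven entirely by positional arguments, and `coarse_f0` quantizes Hz values onto a mel scale in integer bins 1..255. An illustrative invocation and a worked example of the bin mapping, using the constants defined above (values here are approximate, and the experiment directory name is hypothetical):

    # python extract_f0_print.py logs/my-experiment 4 harvest
    #   sys.argv[1] = experiment dir containing 1_16k_wavs/
    #   sys.argv[2] = number of worker processes
    #   sys.argv[3] = f0 method: "pm", "harvest" or "dio"

    import numpy as np

    f0 = np.array([0.0, 50.0, 440.0, 1100.0])          # Hz per frame
    f0_mel = 1127 * np.log(1 + f0 / 700)
    f0_mel_min = 1127 * np.log(1 + 50 / 700)           # ~77.8
    f0_mel_max = 1127 * np.log(1 + 1100 / 700)         # ~1064.4
    f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
        f0_mel_max - f0_mel_min
    ) + 1
    f0_mel = np.clip(f0_mel, 1, 255)
    print(np.rint(f0_mel).astype(int))  # unvoiced frames (0 Hz) land in bin 1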
extract_feature_print.py ADDED
@@ -0,0 +1,123 @@
+ import os, sys, traceback
+
+ # device=sys.argv[1]
+ n_part = int(sys.argv[2])
+ i_part = int(sys.argv[3])
+ if len(sys.argv) == 6:  # argv: script, device, n_part, i_part, exp_dir, version
+     exp_dir = sys.argv[4]
+     version = sys.argv[5]
+ else:
+     i_gpu = sys.argv[4]
+     exp_dir = sys.argv[5]
+     os.environ["CUDA_VISIBLE_DEVICES"] = str(i_gpu)
+     version = sys.argv[6]
+ import torch
+ import torch.nn.functional as F
+ import soundfile as sf
+ import numpy as np
+ from fairseq import checkpoint_utils
+
+ if torch.cuda.is_available():
+     device = "cuda"
+ elif torch.backends.mps.is_available():
+     device = "mps"
+ else:
+     device = "cpu"
+
+ f = open("%s/extract_f0_feature.log" % exp_dir, "a+")
+
+
+ def printt(strr):
+     print(strr)
+     f.write("%s\n" % strr)
+     f.flush()
+
+
+ printt(sys.argv)
+ model_path = "hubert_base.pt"
+
+ printt(exp_dir)
+ wavPath = "%s/1_16k_wavs" % exp_dir
+ outPath = (
+     "%s/3_feature256" % exp_dir if version == "v1" else "%s/3_feature768" % exp_dir
+ )
+ os.makedirs(outPath, exist_ok=True)
+
+
+ # wave must be 16k, hop_size=320
+ def readwave(wav_path, normalize=False):
+     wav, sr = sf.read(wav_path)
+     assert sr == 16000
+     feats = torch.from_numpy(wav).float()
+     if feats.dim() == 2:  # stereo: average the two channels
+         feats = feats.mean(-1)
+     assert feats.dim() == 1, feats.dim()
+     if normalize:
+         with torch.no_grad():
+             feats = F.layer_norm(feats, feats.shape)
+     feats = feats.view(1, -1)
+     return feats
+
+
+ # HuBERT model
+ printt("load model(s) from {}".format(model_path))
+ # bail out if the hubert model does not exist
+ if os.access(model_path, os.F_OK) == False:
+     printt(
+         "Error: Extracting is shut down because %s does not exist, you may download it from https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main"
+         % model_path
+     )
+     exit(0)
+ models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
+     [model_path],
+     suffix="",
+ )
+ model = models[0]
+ model = model.to(device)
+ printt("move model to %s" % device)
+ if device not in ["mps", "cpu"]:
+     model = model.half()
+ model.eval()
+
+ todo = sorted(list(os.listdir(wavPath)))[i_part::n_part]
+ n = max(1, len(todo) // 10)  # print at most ten progress lines
+ if len(todo) == 0:
+     printt("no-feature-todo")
+ else:
+     printt("all-feature-%s" % len(todo))
+     for idx, file in enumerate(todo):
+         try:
+             if file.endswith(".wav"):
+                 wav_path = "%s/%s" % (wavPath, file)
+                 out_path = "%s/%s" % (outPath, file.replace("wav", "npy"))
+
+                 if os.path.exists(out_path):
+                     continue
+
+                 feats = readwave(wav_path, normalize=saved_cfg.task.normalize)
+                 padding_mask = torch.BoolTensor(feats.shape).fill_(False)
+                 inputs = {
+                     "source": feats.half().to(device)
+                     if device not in ["mps", "cpu"]
+                     else feats.to(device),
+                     "padding_mask": padding_mask.to(device),
+                     "output_layer": 9 if version == "v1" else 12,  # layer 9 for v1, 12 for v2
+                 }
+                 with torch.no_grad():
+                     logits = model.extract_features(**inputs)
+                     feats = (
+                         model.final_proj(logits[0]) if version == "v1" else logits[0]
+                     )
+
+                 feats = feats.squeeze(0).float().cpu().numpy()
+                 if np.isnan(feats).sum() == 0:
+                     np.save(out_path, feats, allow_pickle=False)
+                 else:
+                     printt("%s-contains nan" % file)
+                 if idx % n == 0:
+                     printt("now-%s,all-%s,%s,%s" % (idx, len(todo), file, feats.shape))
+         except:
+             printt(traceback.format_exc())
+     printt("all-feature-done")
extract_locale.py ADDED
@@ -0,0 +1,31 @@
+ import json
+ import re
+
+ # Define regular expression patterns
+ pattern = r"""i18n\([\s\n\t]*(["'][^"']+["'])[\s\n\t]*\)"""
+
+ # Initialize the dictionary to store key-value pairs
+ data = {}
+
+
+ def process(fn: str):
+     global data
+     with open(fn, "r", encoding="utf-8") as f:
+         contents = f.read()
+         matches = re.findall(pattern, contents)
+         for key in matches:
+             key = eval(key)
+             print("extract:", key)
+             data[key] = key
+
+
+ print("processing infer-web.py")
+ process("infer-web.py")
+
+ print("processing gui.py")
+ process("gui.py")
+
+ # Save as a JSON file
+ with open("./i18n/zh_CN.json", "w", encoding="utf-8") as f:
+     json.dump(data, f, ensure_ascii=False, indent=4)
+     f.write("\n")
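
Note: an illustrative round trip for the regex above (the source line is hypothetical, though the key appears in gui.py). The capture keeps the quotes, so `eval` is used to strip them:

    import json
    import re

    pattern = r"""i18n\([\s\n\t]*(["'][^"']+["'])[\s\n\t]*\)"""
    source = 'button = sg.Button(i18n("开始音频转换"))'  # hypothetical line from gui.py
    matches = re.findall(pattern, source)
    key = eval(matches[0])  # '"开始音频转换"' -> 开始音频转换
    print(json.dumps({key: key}, ensure_ascii=False, indent=4))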
ffmpeg ADDED
Binary file (302 kB).
go-realtime-gui.bat ADDED
@@ -0,0 +1,2 @@
+ runtime\python.exe gui.py
+ pause
go-web.bat ADDED
@@ -0,0 +1,2 @@
+ runtime\python.exe infer-web.py --pycmd runtime\python.exe --port 7897
+ pause
gui.py ADDED
@@ -0,0 +1,698 @@
+ """
+ Updates after 0416:
+     use the half setting from config
+     rebuild the npy instead of asking the user to fill it in
+     v2 support
+     support for models without f0
+     fixes
+
+ int16:
+     added support for running without an index
+     f0 algorithm changed to harvest (by all appearances the only thing that affects CPU usage), but quality suffers without this change
+ """
+ import os, sys, traceback, re
+
+ import json
+
+ now_dir = os.getcwd()
+ sys.path.append(now_dir)
+ from config import Config
+
+ Config = Config()
+ import PySimpleGUI as sg
+ import sounddevice as sd
+ import noisereduce as nr
+ import numpy as np
+ from fairseq import checkpoint_utils
+ import librosa, torch, pyworld, faiss, time, threading
+ import torch.nn.functional as F
+ import torchaudio.transforms as tat
+ import scipy.signal as signal
+
+
+ # import matplotlib.pyplot as plt
+ from infer_pack.models import (
+     SynthesizerTrnMs256NSFsid,
+     SynthesizerTrnMs256NSFsid_nono,
+     SynthesizerTrnMs768NSFsid,
+     SynthesizerTrnMs768NSFsid_nono,
+ )
+ from i18n import I18nAuto
+
+ i18n = I18nAuto()
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ current_dir = os.getcwd()
+
+
+ class RVC:
+     def __init__(
+         self, key, hubert_path, pth_path, index_path, npy_path, index_rate
+     ) -> None:
+         """
+         Initialization
+         """
+         try:
+             self.f0_up_key = key
+             self.time_step = 160 / 16000 * 1000
+             self.f0_min = 50
+             self.f0_max = 1100
+             self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700)
+             self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700)
+             self.sr = 16000
+             self.window = 160
+             if index_rate != 0:
+                 self.index = faiss.read_index(index_path)
+                 # self.big_npy = np.load(npy_path)
+                 self.big_npy = self.index.reconstruct_n(0, self.index.ntotal)
+                 print("index search enabled")
+             self.index_rate = index_rate
+             model_path = hubert_path
+             print("load model(s) from {}".format(model_path))
+             models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
+                 [model_path],
+                 suffix="",
+             )
+             self.model = models[0]
+             self.model = self.model.to(device)
+             if Config.is_half:
+                 self.model = self.model.half()
+             else:
+                 self.model = self.model.float()
+             self.model.eval()
+             cpt = torch.load(pth_path, map_location="cpu")
+             self.tgt_sr = cpt["config"][-1]
+             cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]  # n_spk
+             self.if_f0 = cpt.get("f0", 1)
+             self.version = cpt.get("version", "v1")
+             if self.version == "v1":
+                 if self.if_f0 == 1:
+                     self.net_g = SynthesizerTrnMs256NSFsid(
+                         *cpt["config"], is_half=Config.is_half
+                     )
+                 else:
+                     self.net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
+             elif self.version == "v2":
+                 if self.if_f0 == 1:
+                     self.net_g = SynthesizerTrnMs768NSFsid(
+                         *cpt["config"], is_half=Config.is_half
+                     )
+                 else:
+                     self.net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"])
+             del self.net_g.enc_q
+             print(self.net_g.load_state_dict(cpt["weight"], strict=False))
+             self.net_g.eval().to(device)
+             if Config.is_half:
+                 self.net_g = self.net_g.half()
+             else:
+                 self.net_g = self.net_g.float()
+         except:
+             print(traceback.format_exc())
+
+     def get_f0(self, x, f0_up_key, inp_f0=None):
+         x_pad = 1
+         f0_min = 50
+         f0_max = 1100
+         f0_mel_min = 1127 * np.log(1 + f0_min / 700)
+         f0_mel_max = 1127 * np.log(1 + f0_max / 700)
+         f0, t = pyworld.harvest(
+             x.astype(np.double),
+             fs=self.sr,
+             f0_ceil=f0_max,
+             f0_floor=f0_min,
+             frame_period=10,
+         )
+         f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.sr)
+         f0 = signal.medfilt(f0, 3)
+         f0 *= pow(2, f0_up_key / 12)
+         # with open("test.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
+         tf0 = self.sr // self.window  # number of f0 points per second
+         if inp_f0 is not None:
+             delta_t = np.round(
+                 (inp_f0[:, 0].max() - inp_f0[:, 0].min()) * tf0 + 1
+             ).astype("int16")
+             replace_f0 = np.interp(
+                 list(range(delta_t)), inp_f0[:, 0] * 100, inp_f0[:, 1]
+             )
+             shape = f0[x_pad * tf0 : x_pad * tf0 + len(replace_f0)].shape[0]
+             f0[x_pad * tf0 : x_pad * tf0 + len(replace_f0)] = replace_f0[:shape]
+         # with open("test_opt.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
+         f0bak = f0.copy()
+         f0_mel = 1127 * np.log(1 + f0 / 700)
+         f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
+             f0_mel_max - f0_mel_min
+         ) + 1
+         f0_mel[f0_mel <= 1] = 1
+         f0_mel[f0_mel > 255] = 255
+         f0_coarse = np.rint(f0_mel).astype(int)  # np.int is deprecated in newer numpy
+         return f0_coarse, f0bak  # 1-0
+
+     def infer(self, feats: torch.Tensor) -> np.ndarray:
+         """
+         Inference function
+         """
+         audio = feats.clone().cpu().numpy()
+         assert feats.dim() == 1, feats.dim()
+         feats = feats.view(1, -1)
+         padding_mask = torch.BoolTensor(feats.shape).fill_(False)
+         if Config.is_half:
+             feats = feats.half()
+         else:
+             feats = feats.float()
+         inputs = {
+             "source": feats.to(device),
+             "padding_mask": padding_mask.to(device),
+             "output_layer": 9 if self.version == "v1" else 12,
+         }
+         torch.cuda.synchronize()
+         with torch.no_grad():
+             logits = self.model.extract_features(**inputs)
+             feats = (
+                 self.model.final_proj(logits[0]) if self.version == "v1" else logits[0]
+             )
+
+         #### index-based retrieval
+         try:
+             if (
+                 hasattr(self, "index")
+                 and hasattr(self, "big_npy")
+                 and self.index_rate != 0
+             ):
+                 npy = feats[0].cpu().numpy().astype("float32")
+                 score, ix = self.index.search(npy, k=8)
+                 weight = np.square(1 / score)
+                 weight /= weight.sum(axis=1, keepdims=True)
+                 npy = np.sum(self.big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)
+                 if Config.is_half:
+                     npy = npy.astype("float16")
+                 feats = (
+                     torch.from_numpy(npy).unsqueeze(0).to(device) * self.index_rate
+                     + (1 - self.index_rate) * feats
+                 )
+             else:
+                 print("index search FAIL or disabled")
+         except:
+             traceback.print_exc()
+             print("index search FAIL")
+         feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
+         torch.cuda.synchronize()
+         print(feats.shape)
+         if self.if_f0 == 1:
+             pitch, pitchf = self.get_f0(audio, self.f0_up_key)
+             p_len = min(feats.shape[1], 13000, pitch.shape[0])  # larger values blow up GPU memory
+         else:
+             pitch, pitchf = None, None
+             p_len = min(feats.shape[1], 13000)  # larger values blow up GPU memory
+         torch.cuda.synchronize()
+         # print(feats.shape,pitch.shape)
+         feats = feats[:, :p_len, :]
+         if self.if_f0 == 1:
+             pitch = pitch[:p_len]
+             pitchf = pitchf[:p_len]
+             pitch = torch.LongTensor(pitch).unsqueeze(0).to(device)
+             pitchf = torch.FloatTensor(pitchf).unsqueeze(0).to(device)
+         p_len = torch.LongTensor([p_len]).to(device)
+         ii = 0  # sid
+         sid = torch.LongTensor([ii]).to(device)
+         with torch.no_grad():
+             if self.if_f0 == 1:
+                 infered_audio = (
+                     self.net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0]
+                     .data.cpu()
+                     .float()
+                 )
+             else:
+                 infered_audio = (
+                     self.net_g.infer(feats, p_len, sid)[0][0, 0].data.cpu().float()
+                 )
+         torch.cuda.synchronize()
+         return infered_audio
+
+
+ class GUIConfig:
+     def __init__(self) -> None:
+         self.hubert_path: str = ""
+         self.pth_path: str = ""
+         self.index_path: str = ""
+         self.npy_path: str = ""
+         self.pitch: int = 12
+         self.samplerate: int = 44100
+         self.block_time: float = 1.0  # s
+         self.buffer_num: int = 1
+         self.threhold: int = -30
+         self.crossfade_time: float = 0.08
+         self.extra_time: float = 0.04
+         self.I_noise_reduce = False
+         self.O_noise_reduce = False
+         self.index_rate = 0.3
+
+
+ class GUI:
+     def __init__(self) -> None:
+         self.config = GUIConfig()
+         self.flag_vc = False
+
+         self.launcher()
+
+     def load(self):
+         input_devices, output_devices, _, _ = self.get_devices()
+         try:
+             with open("values1.json", "r") as j:
+                 data = json.load(j)
+         except:
+             with open("values1.json", "w") as j:
+                 data = {
+                     "pth_path": " ",
+                     "index_path": " ",
+                     "sg_input_device": input_devices[sd.default.device[0]],
+                     "sg_output_device": output_devices[sd.default.device[1]],
+                     "threhold": "-45",
+                     "pitch": "0",
+                     "index_rate": "0",
+                     "block_time": "1",
+                     "crossfade_length": "0.04",
+                     "extra_time": "1",
+                 }
+         return data
+
+     def launcher(self):
+         data = self.load()
+         sg.theme("LightBlue3")
+         input_devices, output_devices, _, _ = self.get_devices()
+         layout = [
+             [
+                 sg.Frame(
+                     title=i18n("加载模型"),
+                     layout=[
+                         [
+                             sg.Input(
+                                 default_text="hubert_base.pt",
+                                 key="hubert_path",
+                                 disabled=True,
+                             ),
+                             sg.FileBrowse(
+                                 i18n("Hubert模型"),
+                                 initial_folder=os.path.join(os.getcwd()),
+                                 file_types=((". pt"),),
+                             ),
+                         ],
+                         [
+                             sg.Input(
+                                 default_text=data.get("pth_path", ""),
+                                 key="pth_path",
+                             ),
+                             sg.FileBrowse(
+                                 i18n("选择.pth文件"),
+                                 initial_folder=os.path.join(os.getcwd(), "weights"),
+                                 file_types=((". pth"),),
+                             ),
+                         ],
+                         [
+                             sg.Input(
+                                 default_text=data.get("index_path", ""),
+                                 key="index_path",
+                             ),
+                             sg.FileBrowse(
+                                 i18n("选择.index文件"),
+                                 initial_folder=os.path.join(os.getcwd(), "logs"),
+                                 file_types=((". index"),),
+                             ),
+                         ],
+                         [
+                             sg.Input(
+                                 default_text="你不需要填写这个You don't need write this.",
+                                 key="npy_path",
+                                 disabled=True,
+                             ),
+                             sg.FileBrowse(
+                                 i18n("选择.npy文件"),
+                                 initial_folder=os.path.join(os.getcwd(), "logs"),
+                                 file_types=((". npy"),),
+                             ),
+                         ],
+                     ],
+                 )
+             ],
+             [
+                 sg.Frame(
+                     layout=[
+                         [
+                             sg.Text(i18n("输入设备")),
+                             sg.Combo(
+                                 input_devices,
+                                 key="sg_input_device",
+                                 default_value=data.get("sg_input_device", ""),
+                             ),
+                         ],
+                         [
+                             sg.Text(i18n("输出设备")),
+                             sg.Combo(
+                                 output_devices,
+                                 key="sg_output_device",
+                                 default_value=data.get("sg_output_device", ""),
+                             ),
+                         ],
+                     ],
+                     title=i18n("音频设备(请使用同种类驱动)"),
+                 )
+             ],
+             [
+                 sg.Frame(
+                     layout=[
+                         [
+                             sg.Text(i18n("响应阈值")),
+                             sg.Slider(
+                                 range=(-60, 0),
+                                 key="threhold",
+                                 resolution=1,
+                                 orientation="h",
+                                 default_value=data.get("threhold", ""),
+                             ),
+                         ],
+                         [
+                             sg.Text(i18n("音调设置")),
+                             sg.Slider(
+                                 range=(-24, 24),
+                                 key="pitch",
+                                 resolution=1,
+                                 orientation="h",
+                                 default_value=data.get("pitch", ""),
+                             ),
+                         ],
+                         [
+                             sg.Text(i18n("Index Rate")),
+                             sg.Slider(
+                                 range=(0.0, 1.0),
+                                 key="index_rate",
+                                 resolution=0.01,
+                                 orientation="h",
+                                 default_value=data.get("index_rate", ""),
+                             ),
+                         ],
+                     ],
+                     title=i18n("常规设置"),
+                 ),
+                 sg.Frame(
+                     layout=[
+                         [
+                             sg.Text(i18n("采样长度")),
+                             sg.Slider(
+                                 range=(0.1, 3.0),
+                                 key="block_time",
+                                 resolution=0.1,
+                                 orientation="h",
+                                 default_value=data.get("block_time", ""),
+                             ),
+                         ],
+                         [
+                             sg.Text(i18n("淡入淡出长度")),
+                             sg.Slider(
+                                 range=(0.01, 0.15),
+                                 key="crossfade_length",
+                                 resolution=0.01,
+                                 orientation="h",
+                                 default_value=data.get("crossfade_length", ""),
+                             ),
+                         ],
+                         [
+                             sg.Text(i18n("额外推理时长")),
+                             sg.Slider(
+                                 range=(0.05, 3.00),
+                                 key="extra_time",
+                                 resolution=0.01,
+                                 orientation="h",
+                                 default_value=data.get("extra_time", ""),
+                             ),
+                         ],
+                         [
+                             sg.Checkbox(i18n("输入降噪"), key="I_noise_reduce"),
+                             sg.Checkbox(i18n("输出降噪"), key="O_noise_reduce"),
+                         ],
+                     ],
+                     title=i18n("性能设置"),
+                 ),
+             ],
+             [
+                 sg.Button(i18n("开始音频转换"), key="start_vc"),
+                 sg.Button(i18n("停止音频转换"), key="stop_vc"),
+                 sg.Text(i18n("推理时间(ms):")),
+                 sg.Text("0", key="infer_time"),
+             ],
+         ]
+         self.window = sg.Window("RVC - GUI", layout=layout)
+         self.event_handler()
+
+     def event_handler(self):
+         while True:
+             event, values = self.window.read()
+             if event == sg.WINDOW_CLOSED:
+                 self.flag_vc = False
+                 exit()
+             if event == "start_vc" and self.flag_vc == False:
+                 if self.set_values(values) == True:
+                     print("using_cuda:" + str(torch.cuda.is_available()))
+                     self.start_vc()
+                     settings = {
+                         "pth_path": values["pth_path"],
+                         "index_path": values["index_path"],
+                         "sg_input_device": values["sg_input_device"],
+                         "sg_output_device": values["sg_output_device"],
+                         "threhold": values["threhold"],
+                         "pitch": values["pitch"],
+                         "index_rate": values["index_rate"],
+                         "block_time": values["block_time"],
+                         "crossfade_length": values["crossfade_length"],
+                         "extra_time": values["extra_time"],
+                     }
+                     with open("values1.json", "w") as j:
+                         json.dump(settings, j)
+             if event == "stop_vc" and self.flag_vc == True:
+                 self.flag_vc = False
+
+     def set_values(self, values):
+         if len(values["pth_path"].strip()) == 0:
+             sg.popup(i18n("请选择pth文件"))
+             return False
+         if len(values["index_path"].strip()) == 0:
+             sg.popup(i18n("请选择index文件"))
+             return False
+         pattern = re.compile("[^\x00-\x7F]+")
+         if pattern.findall(values["hubert_path"]):
+             sg.popup(i18n("hubert模型路径不可包含中文"))
+             return False
+         if pattern.findall(values["pth_path"]):
+             sg.popup(i18n("pth文件路径不可包含中文"))
+             return False
+         if pattern.findall(values["index_path"]):
+             sg.popup(i18n("index文件路径不可包含中文"))
+             return False
+         self.set_devices(values["sg_input_device"], values["sg_output_device"])
+         self.config.hubert_path = os.path.join(current_dir, "hubert_base.pt")
+         self.config.pth_path = values["pth_path"]
+         self.config.index_path = values["index_path"]
+         self.config.npy_path = values["npy_path"]
+         self.config.threhold = values["threhold"]
+         self.config.pitch = values["pitch"]
+         self.config.block_time = values["block_time"]
+         self.config.crossfade_time = values["crossfade_length"]
+         self.config.extra_time = values["extra_time"]
+         self.config.I_noise_reduce = values["I_noise_reduce"]
+         self.config.O_noise_reduce = values["O_noise_reduce"]
+         self.config.index_rate = values["index_rate"]
+         return True
+
+     def start_vc(self):
+         torch.cuda.empty_cache()
+         self.flag_vc = True
+         self.block_frame = int(self.config.block_time * self.config.samplerate)
+         self.crossfade_frame = int(self.config.crossfade_time * self.config.samplerate)
+         self.sola_search_frame = int(0.012 * self.config.samplerate)
+         self.delay_frame = int(0.01 * self.config.samplerate)  # reserve 0.01 s of look-ahead
+         self.extra_frame = int(self.config.extra_time * self.config.samplerate)
+         self.rvc = None
+         self.rvc = RVC(
+             self.config.pitch,
+             self.config.hubert_path,
+             self.config.pth_path,
+             self.config.index_path,
+             self.config.npy_path,
+             self.config.index_rate,
+         )
+         self.input_wav: np.ndarray = np.zeros(
+             self.extra_frame
+             + self.crossfade_frame
+             + self.sola_search_frame
+             + self.block_frame,
+             dtype="float32",
+         )
+         self.output_wav: torch.Tensor = torch.zeros(
+             self.block_frame, device=device, dtype=torch.float32
+         )
+         self.sola_buffer: torch.Tensor = torch.zeros(
+             self.crossfade_frame, device=device, dtype=torch.float32
+         )
+         self.fade_in_window: torch.Tensor = torch.linspace(
+             0.0, 1.0, steps=self.crossfade_frame, device=device, dtype=torch.float32
+         )
+         self.fade_out_window: torch.Tensor = 1 - self.fade_in_window
+         self.resampler1 = tat.Resample(
+             orig_freq=self.config.samplerate, new_freq=16000, dtype=torch.float32
+         )
+         self.resampler2 = tat.Resample(
+             orig_freq=self.rvc.tgt_sr,
+             new_freq=self.config.samplerate,
+             dtype=torch.float32,
+         )
+         thread_vc = threading.Thread(target=self.soundinput)
+         thread_vc.start()
+
+     def soundinput(self):
+         """
+         Receive audio input
+         """
+         with sd.Stream(
+             callback=self.audio_callback,
+             blocksize=self.block_frame,
+             samplerate=self.config.samplerate,
+             dtype="float32",
+         ):
+             while self.flag_vc:
+                 time.sleep(self.config.block_time)
+                 print("Audio block passed.")
+         print("ENDing VC")
+
+     def audio_callback(
+         self, indata: np.ndarray, outdata: np.ndarray, frames, times, status
+     ):
+         """
+         Audio processing
+         """
+         start_time = time.perf_counter()
+         indata = librosa.to_mono(indata.T)
+         if self.config.I_noise_reduce:
+             indata[:] = nr.reduce_noise(y=indata, sr=self.config.samplerate)
+
+         """noise gate"""
+         frame_length = 2048
+         hop_length = 1024
+         rms = librosa.feature.rms(
+             y=indata, frame_length=frame_length, hop_length=hop_length
+         )
+         db_threhold = librosa.amplitude_to_db(rms, ref=1.0)[0] < self.config.threhold
+         # print(rms.shape,db.shape,db)
+         for i in range(db_threhold.shape[0]):
+             if db_threhold[i]:
+                 indata[i * hop_length : (i + 1) * hop_length] = 0
+         self.input_wav[:] = np.append(self.input_wav[self.block_frame :], indata)
+
+         # infer
+         print("input_wav:" + str(self.input_wav.shape))
+         # print('infered_wav:'+str(infer_wav.shape))
+         infer_wav: torch.Tensor = self.resampler2(
+             self.rvc.infer(self.resampler1(torch.from_numpy(self.input_wav)))
+         )[-self.crossfade_frame - self.sola_search_frame - self.block_frame :].to(
+             device
+         )
+         print("infer_wav:" + str(infer_wav.shape))
+
+         # SOLA algorithm from https://github.com/yxlllc/DDSP-SVC
+         cor_nom = F.conv1d(
+             infer_wav[None, None, : self.crossfade_frame + self.sola_search_frame],
+             self.sola_buffer[None, None, :],
+         )
+         cor_den = torch.sqrt(
+             F.conv1d(
+                 infer_wav[None, None, : self.crossfade_frame + self.sola_search_frame]
+                 ** 2,
+                 torch.ones(1, 1, self.crossfade_frame, device=device),
+             )
+             + 1e-8
+         )
+         sola_offset = torch.argmax(cor_nom[0, 0] / cor_den[0, 0])
+         print("sola offset: " + str(int(sola_offset)))
+
+         # crossfade
+         self.output_wav[:] = infer_wav[sola_offset : sola_offset + self.block_frame]
+         self.output_wav[: self.crossfade_frame] *= self.fade_in_window
+         self.output_wav[: self.crossfade_frame] += self.sola_buffer[:]
+         if sola_offset < self.sola_search_frame:
+             self.sola_buffer[:] = (
+                 infer_wav[
+                     -self.sola_search_frame
+                     - self.crossfade_frame
+                     + sola_offset : -self.sola_search_frame
+                     + sola_offset
+                 ]
+                 * self.fade_out_window
+             )
+         else:
+             self.sola_buffer[:] = (
+                 infer_wav[-self.crossfade_frame :] * self.fade_out_window
+             )
+
+         if self.config.O_noise_reduce:
+             outdata[:] = np.tile(
+                 nr.reduce_noise(
+                     y=self.output_wav[:].cpu().numpy(), sr=self.config.samplerate
+                 ),
+                 (2, 1),
+             ).T
+         else:
+             outdata[:] = self.output_wav[:].repeat(2, 1).t().cpu().numpy()
+         total_time = time.perf_counter() - start_time
+         self.window["infer_time"].update(int(total_time * 1000))
+         print("infer time:" + str(total_time))
+
+     def get_devices(self, update: bool = True):
+         """Get the list of audio devices"""
+         if update:
+             sd._terminate()
+             sd._initialize()
+         devices = sd.query_devices()
+         hostapis = sd.query_hostapis()
+         for hostapi in hostapis:
+             for device_idx in hostapi["devices"]:
+                 devices[device_idx]["hostapi_name"] = hostapi["name"]
+         input_devices = [
+             f"{d['name']} ({d['hostapi_name']})"
+             for d in devices
+             if d["max_input_channels"] > 0
+         ]
+         output_devices = [
+             f"{d['name']} ({d['hostapi_name']})"
+             for d in devices
+             if d["max_output_channels"] > 0
+         ]
+         input_devices_indices = [
+             d["index"] if "index" in d else d["name"]
+             for d in devices
+             if d["max_input_channels"] > 0
+         ]
+         output_devices_indices = [
+             d["index"] if "index" in d else d["name"]
+             for d in devices
+             if d["max_output_channels"] > 0
+         ]
+         return (
+             input_devices,
+             output_devices,
+             input_devices_indices,
+             output_devices_indices,
+         )
+
+     def set_devices(self, input_device, output_device):
+         """Set the input and output devices"""
+         (
+             input_devices,
+             output_devices,
+             input_device_indices,
+             output_device_indices,
+         ) = self.get_devices()
+         sd.default.device[0] = input_device_indices[input_devices.index(input_device)]
+         sd.default.device[1] = output_device_indices[
+             output_devices.index(output_device)
+         ]
+         print("input device:" + str(sd.default.device[0]) + ":" + str(input_device))
+         print("output device:" + str(sd.default.device[1]) + ":" + str(output_device))
+
+
+ gui = GUI()
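
Note: the real-time loop splices consecutive blocks with SOLA, searching a small window for the best-aligned offset via normalized cross-correlation and then crossfading. A stripped-down sketch of that offset search on toy tensors, mirroring the cor_nom/cor_den computation above (illustrative sizes only, not part of this commit):

    import torch
    import torch.nn.functional as F

    crossfade, search = 8, 4
    infer_wav = torch.randn(64)           # stand-in for one inferred block
    sola_buffer = torch.randn(crossfade)  # faded tail of the previous block

    # normalized cross-correlation over the first crossfade+search samples
    head = infer_wav[None, None, : crossfade + search]
    cor_nom = F.conv1d(head, sola_buffer[None, None, :])
    cor_den = torch.sqrt(F.conv1d(head**2, torch.ones(1, 1, crossfade)) + 1e-8)
    sola_offset = torch.argmax(cor_nom[0, 0] / cor_den[0, 0])
    print(int(sola_offset))  # offset in [0, search] where the blocks line up best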
host.json ADDED
@@ -0,0 +1,15 @@
+ {
+     "version": "2.0",
+     "logging": {
+         "applicationInsights": {
+             "samplingSettings": {
+                 "isEnabled": true,
+                 "excludedTypes": "Request"
+             }
+         }
+     },
+     "extensionBundle": {
+         "id": "Microsoft.Azure.Functions.ExtensionBundle",
+         "version": "[4.*, 5.0.0)"
+     }
+ }
hubert_base.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f54b40fd2802423a5643779c4861af1e9ee9c1564dc9d32f54f20b5ffba7db96
+ size 189507909
i18n.py ADDED
@@ -0,0 +1,28 @@
+ import locale
+ import json
+ import os
+
+
+ def load_language_list(language):
+     with open(f"./i18n/{language}.json", "r", encoding="utf-8") as f:
+         language_list = json.load(f)
+     return language_list
+
+
+ class I18nAuto:
+     def __init__(self, language=None):
+         if language in ["Auto", None]:
+             language = locale.getdefaultlocale()[
+                 0
+             ]  # getlocale can't identify the system's language ((None, None))
+         if not os.path.exists(f"./i18n/{language}.json"):
+             language = "en_US"
+         self.language = language
+         # print("Use Language:", language)
+         self.language_map = load_language_list(language)
+
+     def __call__(self, key):
+         return self.language_map.get(key, key)
+
+     def print(self):
+         print("Use Language:", self.language)
infer-web.py ADDED
@@ -0,0 +1,1999 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import shutil
3
+ import sys
4
+ # os.environ["CUDA_VISIBLE_DEVICES"] = ""
5
+ now_dir = os.getcwd()
6
+ sys.path.append(now_dir)
7
+ import traceback, pdb
8
+ import warnings
9
+
10
+ import numpy as np
11
+ import torch
12
+
13
+ os.environ['OPENBLAS_NUM_THREADS'] = '1'
14
+ os.environ["no_proxy"] = "localhost, 127.0.0.1, ::1"
15
+ import logging
16
+ import threading
17
+ from random import shuffle
18
+ from subprocess import Popen
19
+ from time import sleep
20
+
21
+ import faiss
22
+ import ffmpeg
23
+ import gradio as gr
24
+ import soundfile as sf
25
+ from config import Config
26
+ from fairseq import checkpoint_utils
27
+ from i18n import I18nAuto
28
+ from infer_pack.models import (
29
+ SynthesizerTrnMs256NSFsid,
30
+ SynthesizerTrnMs256NSFsid_nono,
31
+ SynthesizerTrnMs768NSFsid,
32
+ SynthesizerTrnMs768NSFsid_nono,
33
+ )
34
+ from infer_pack.models_onnx import SynthesizerTrnMsNSFsidM
35
+ from infer_uvr5 import _audio_pre_, _audio_pre_new
36
+ from MDXNet import MDXNetDereverb
37
+ from my_utils import load_audio
38
+ from train.process_ckpt import change_info, extract_small_model, merge, show_info
39
+ from vc_infer_pipeline import VC
40
+ from sklearn.cluster import MiniBatchKMeans
41
+
42
+ logging.getLogger("numba").setLevel(logging.WARNING)
43
+
44
+
45
+ tmp = os.path.join(now_dir, "TEMP")
46
+ shutil.rmtree(tmp, ignore_errors=True)
47
+ shutil.rmtree("%s/runtime/Lib/site-packages/infer_pack" % (now_dir), ignore_errors=True)
48
+ shutil.rmtree("%s/runtime/Lib/site-packages/uvr5_pack" % (now_dir), ignore_errors=True)
49
+ os.makedirs(tmp, exist_ok=True)
50
+ os.makedirs(os.path.join(now_dir, "logs"), exist_ok=True)
51
+ os.makedirs(os.path.join(now_dir, "weights"), exist_ok=True)
52
+ os.environ["TEMP"] = tmp
53
+ warnings.filterwarnings("ignore")
54
+ torch.manual_seed(114514)
55
+
56
+
57
+ config = Config()
58
+ i18n = I18nAuto()
59
+ i18n.print()
60
+ # 判断是否有能用来训练和加速推理的N卡
61
+ ngpu = torch.cuda.device_count()
62
+ gpu_infos = []
63
+ mem = []
64
+ if_gpu_ok = False
65
+
66
+ if torch.cuda.is_available() or ngpu != 0:
67
+ for i in range(ngpu):
68
+ gpu_name = torch.cuda.get_device_name(i)
69
+ if any(
70
+ value in gpu_name.upper()
71
+ for value in [
72
+ "10",
73
+ "16",
74
+ "20",
75
+ "30",
76
+ "40",
77
+ "A2",
78
+ "A3",
79
+ "A4",
80
+ "P4",
81
+ "A50",
82
+ "500",
83
+ "A60",
84
+ "70",
85
+ "80",
86
+ "90",
87
+ "M4",
88
+ "T4",
89
+ "TITAN",
90
+ ]
91
+ ):
92
+ # A10#A100#V100#A40#P40#M40#K80#A4500
93
+ if_gpu_ok = True # 至少有一张能用的N卡
94
+ gpu_infos.append("%s\t%s" % (i, gpu_name))
95
+ mem.append(
96
+ int(
97
+ torch.cuda.get_device_properties(i).total_memory
98
+ / 1024
99
+ / 1024
100
+ / 1024
101
+ + 0.4
102
+ )
103
+ )
104
+ if if_gpu_ok and len(gpu_infos) > 0:
105
+ gpu_info = "\n".join(gpu_infos)
106
+ default_batch_size = 1
107
+ else:
108
+ gpu_info = i18n("很遗憾您这没有能用的显卡来支持您训练")
109
+ default_batch_size = 1
110
+ gpus = "-".join([i[0] for i in gpu_infos])
111
+
112
+
113
+ class ToolButton(gr.Button, gr.components.FormComponent):
114
+ """Small button with single emoji as text, fits inside gradio forms"""
115
+
116
+ def __init__(self, **kwargs):
117
+ super().__init__(variant="tool", **kwargs)
118
+
119
+ def get_block_name(self):
120
+ return "button"
121
+
122
+
123
+ hubert_model = None
124
+
125
+
126
+ def load_hubert():
127
+ global hubert_model
128
+ models, _, _ = checkpoint_utils.load_model_ensemble_and_task(
129
+ ["hubert_base.pt"],
130
+ suffix="",
131
+ )
132
+ hubert_model = models[0]
133
+ hubert_model = hubert_model.to(config.device)
134
+ if config.is_half:
135
+ hubert_model = hubert_model.half()
136
+ else:
137
+ hubert_model = hubert_model.float()
138
+ hubert_model.eval()
139
+
140
+
141
+ weight_root = "weights"
142
+ weight_uvr5_root = "uvr5_weights"
143
+ index_root = "logs"
144
+ names = []
145
+ for name in os.listdir(weight_root):
146
+ if name.endswith(".pth"):
147
+ names.append(name)
148
+ index_paths = []
149
+ for root, dirs, files in os.walk(index_root, topdown=False):
150
+ for name in files:
151
+ if name.endswith(".index") and "trained" not in name:
152
+ index_paths.append("%s/%s" % (root, name))
153
+ uvr5_names = []
154
+ for name in os.listdir(weight_uvr5_root):
155
+ if name.endswith(".pth") or "onnx" in name:
156
+ uvr5_names.append(name.replace(".pth", ""))
157
+
158
+
159
+ def vc_single(
160
+ sid,
161
+ input_audio_path,
162
+ f0_up_key,
163
+ f0_file,
164
+ f0_method,
165
+ file_index,
166
+ file_index2,
167
+ # file_big_npy,
168
+ index_rate,
169
+ filter_radius,
170
+ resample_sr,
171
+ rms_mix_rate,
172
+ protect,
173
+ ): # spk_item, input_audio0, vc_transform0,f0_file,f0method0
174
+ global tgt_sr, net_g, vc, hubert_model, version
175
+ if input_audio_path is None:
176
+ return "You need to upload an audio", None
177
+ f0_up_key = int(f0_up_key)
178
+ try:
179
+ audio = load_audio(input_audio_path, 16000)
180
+ audio_max = np.abs(audio).max() / 0.95
181
+ if audio_max > 1:
182
+ audio /= audio_max
183
+ times = [0, 0, 0]
184
+ if not hubert_model:
185
+ load_hubert()
186
+ if_f0 = cpt.get("f0", 1)
187
+ file_index = (
188
+ (
189
+ file_index.strip(" ")
190
+ .strip('"')
191
+ .strip("\n")
192
+ .strip('"')
193
+ .strip(" ")
194
+ .replace("trained", "added")
195
+ )
196
+ if file_index != ""
197
+ else file_index2
198
+ ) # 防止小白写错,自动帮他替换掉
199
+ # file_big_npy = (
200
+ # file_big_npy.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
201
+ # )
202
+ audio_opt = vc.pipeline(
203
+ hubert_model,
204
+ net_g,
205
+ sid,
206
+ audio,
207
+ input_audio_path,
208
+ times,
209
+ f0_up_key,
210
+ f0_method,
211
+ file_index,
212
+ # file_big_npy,
213
+ index_rate,
214
+ if_f0,
215
+ filter_radius,
216
+ tgt_sr,
217
+ resample_sr,
218
+ rms_mix_rate,
219
+ version,
220
+ protect,
221
+ f0_file=f0_file,
222
+ )
223
+ if tgt_sr != resample_sr >= 16000:
224
+ tgt_sr = resample_sr
225
+ index_info = (
226
+ "Using index:%s." % file_index
227
+ if os.path.exists(file_index)
228
+ else "Index not used."
229
+ )
230
+ return "Success.\n %s\nTime:\n npy:%ss, f0:%ss, infer:%ss" % (
231
+ index_info,
232
+ times[0],
233
+ times[1],
234
+ times[2],
235
+ ), (tgt_sr, audio_opt)
236
+ except:
237
+ info = traceback.format_exc()
238
+ print(info)
239
+ return info, (None, None)
240
+
241
+
242
+ def vc_multi(
243
+ sid,
244
+ dir_path,
245
+ opt_root,
246
+ paths,
247
+ f0_up_key,
248
+ f0_method,
249
+ file_index,
250
+ file_index2,
251
+ # file_big_npy,
252
+ index_rate,
253
+ filter_radius,
254
+ resample_sr,
255
+ rms_mix_rate,
256
+ protect,
257
+ format1,
258
+ ):
259
+ try:
260
+ dir_path = (
261
+ dir_path.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
262
+ ) # 防止小白拷路径头尾带了空格和"和回车
263
+ opt_root = opt_root.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
264
+ os.makedirs(opt_root, exist_ok=True)
265
+ try:
266
+ if dir_path != "":
267
+ paths = [os.path.join(dir_path, name) for name in os.listdir(dir_path)]
268
+ else:
269
+ paths = [path.name for path in paths]
270
+ except:
271
+ traceback.print_exc()
272
+ paths = [path.name for path in paths]
273
+ infos = []
274
+ for path in paths:
275
+ info, opt = vc_single(
276
+ sid,
277
+ path,
278
+ f0_up_key,
279
+ None,
280
+ f0_method,
281
+ file_index,
282
+ file_index2,
283
+ # file_big_npy,
284
+ index_rate,
285
+ filter_radius,
286
+ resample_sr,
287
+ rms_mix_rate,
288
+ protect,
289
+ )
290
+ if "Success" in info:
291
+ try:
292
+ tgt_sr, audio_opt = opt
293
+ if format1 in ["wav", "flac"]:
294
+ sf.write(
295
+ "%s/%s.%s" % (opt_root, os.path.basename(path), format1),
296
+ audio_opt,
297
+ tgt_sr,
298
+ )
299
+ else:
300
+ path = "%s/%s.wav" % (opt_root, os.path.basename(path))
301
+ sf.write(
302
+ path,
303
+ audio_opt,
304
+ tgt_sr,
305
+ )
306
+ if os.path.exists(path):
307
+ os.system(
308
+ "ffmpeg -i %s -vn %s -q:a 2 -y"
309
+ % (path, path[:-4] + ".%s" % format1)
310
+ )
311
+ except:
312
+ info += traceback.format_exc()
313
+ infos.append("%s->%s" % (os.path.basename(path), info))
314
+ yield "\n".join(infos)
315
+ yield "\n".join(infos)
316
+ except:
317
+ yield traceback.format_exc()
318
+
319
+
320
+ def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format0):
321
+ infos = []
322
+ try:
323
+ inp_root = inp_root.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
324
+ save_root_vocal = (
325
+ save_root_vocal.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
326
+ )
327
+ save_root_ins = (
328
+ save_root_ins.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
329
+ )
330
+ if model_name == "onnx_dereverb_By_FoxJoy":
331
+ pre_fun = MDXNetDereverb(15)
332
+ else:
333
+ func = _audio_pre_ if "DeEcho" not in model_name else _audio_pre_new
334
+ pre_fun = func(
335
+ agg=int(agg),
336
+ model_path=os.path.join(weight_uvr5_root, model_name + ".pth"),
337
+ device=config.device,
338
+ is_half=config.is_half,
339
+ )
340
+ if inp_root != "":
341
+ paths = [os.path.join(inp_root, name) for name in os.listdir(inp_root)]
342
+ else:
343
+ paths = [path.name for path in paths]
344
+ for path in paths:
345
+ inp_path = os.path.join(inp_root, path)
346
+ need_reformat = 1
347
+ done = 0
348
+ try:
349
+ info = ffmpeg.probe(inp_path, cmd="ffprobe")
350
+ if (
351
+ info["streams"][0]["channels"] == 2
352
+ and info["streams"][0]["sample_rate"] == "44100"
353
+ ):
354
+ need_reformat = 0
355
+ pre_fun._path_audio_(
356
+ inp_path, save_root_ins, save_root_vocal, format0
357
+ )
358
+ done = 1
359
+ except:
360
+ need_reformat = 1
361
+ traceback.print_exc()
362
+ if need_reformat == 1:
363
+ tmp_path = "%s/%s.reformatted.wav" % (tmp, os.path.basename(inp_path))
364
+ os.system(
365
+ "ffmpeg -i %s -vn -acodec pcm_s16le -ac 2 -ar 44100 %s -y"
366
+ % (inp_path, tmp_path)
367
+ )
368
+ inp_path = tmp_path
369
+ try:
370
+ if done == 0:
371
+ pre_fun._path_audio_(
372
+ inp_path, save_root_ins, save_root_vocal, format0
373
+ )
374
+ infos.append("%s->Success" % (os.path.basename(inp_path)))
375
+ yield "\n".join(infos)
376
+ except:
377
+ infos.append(
378
+ "%s->%s" % (os.path.basename(inp_path), traceback.format_exc())
379
+ )
380
+ yield "\n".join(infos)
381
+ except:
382
+ infos.append(traceback.format_exc())
383
+ yield "\n".join(infos)
384
+ finally:
385
+ try:
386
+ if model_name == "onnx_dereverb_By_FoxJoy":
387
+ del pre_fun.pred.model
388
+ del pre_fun.pred.model_
389
+ else:
390
+ del pre_fun.model
391
+ del pre_fun
392
+ except:
393
+ traceback.print_exc()
394
+ print("clean_empty_cache")
395
+ if torch.cuda.is_available():
396
+ torch.cuda.empty_cache()
397
+ yield "\n".join(infos)
398
+
399
+
400
+ # only one voice model may be active per tab at a time
401
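+ # Loads the selected .pth voice model, rebuilds the matching synthesizer (v1/v2, f0 or not),
+ # and returns UI updates; an empty sid unloads everything to free VRAM.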
+ def get_vc(sid, to_return_protect0, to_return_protect1):
402
+ global n_spk, tgt_sr, net_g, vc, cpt, version
403
+ if sid == "" or sid == []:
404
+ global hubert_model
405
+ if hubert_model is not None: # polling may re-enter here; detect sid switching from a loaded model to none
406
+ print("clean_empty_cache")
407
+ del net_g, n_spk, vc, hubert_model, tgt_sr # cpt is deliberately kept
408
+ hubert_model = net_g = n_spk = vc = tgt_sr = None
409
+ if torch.cuda.is_available():
410
+ torch.cuda.empty_cache()
411
+ ### the shuffle below is required, otherwise GPU memory is not fully released
412
+ if_f0 = cpt.get("f0", 1)
413
+ version = cpt.get("version", "v1")
414
+ if version == "v1":
415
+ if if_f0 == 1:
416
+ net_g = SynthesizerTrnMs256NSFsid(
417
+ *cpt["config"], is_half=config.is_half
418
+ )
419
+ else:
420
+ net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
421
+ elif version == "v2":
422
+ if if_f0 == 1:
423
+ net_g = SynthesizerTrnMs768NSFsid(
424
+ *cpt["config"], is_half=config.is_half
425
+ )
426
+ else:
427
+ net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"])
428
+ del net_g, cpt
429
+ if torch.cuda.is_available():
430
+ torch.cuda.empty_cache()
431
+ cpt = None
432
+ return {"visible": False, "__type__": "update"}
433
+ person = "%s/%s" % (weight_root, sid)
434
+ print("loading %s" % person)
435
+ cpt = torch.load(person, map_location="cpu")
436
+ tgt_sr = cpt["config"][-1]
437
+ cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] # n_spk
438
+ if_f0 = cpt.get("f0", 1)
439
+ if if_f0 == 0:
440
+ to_return_protect0 = to_return_protect1 = {
441
+ "visible": False,
442
+ "value": 0.5,
443
+ "__type__": "update",
444
+ }
445
+ else:
446
+ to_return_protect0 = {
447
+ "visible": True,
448
+ "value": to_return_protect0,
449
+ "__type__": "update",
450
+ }
451
+ to_return_protect1 = {
452
+ "visible": True,
453
+ "value": to_return_protect1,
454
+ "__type__": "update",
455
+ }
456
+ version = cpt.get("version", "v1")
457
+ if version == "v1":
458
+ if if_f0 == 1:
459
+ net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=config.is_half)
460
+ else:
461
+ net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
462
+ elif version == "v2":
463
+ if if_f0 == 1:
464
+ net_g = SynthesizerTrnMs768NSFsid(*cpt["config"], is_half=config.is_half)
465
+ else:
466
+ net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"])
467
+ del net_g.enc_q
468
+ print(net_g.load_state_dict(cpt["weight"], strict=False))
469
+ net_g.eval().to(config.device)
470
+ if config.is_half:
471
+ net_g = net_g.half()
472
+ else:
473
+ net_g = net_g.float()
474
+ vc = VC(tgt_sr, config)
475
+ n_spk = cpt["config"][-3]
476
+ return (
477
+ {"visible": True, "maximum": n_spk, "__type__": "update"},
478
+ to_return_protect0,
479
+ to_return_protect1,
480
+ )
481
+
482
+
483
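+ # Rescans weight_root for .pth models and index_root for usable .index files to refresh the dropdowns.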
+ def change_choices():
484
+ names = []
485
+ for name in os.listdir(weight_root):
486
+ if name.endswith(".pth"):
487
+ names.append(name)
488
+ index_paths = []
489
+ for root, dirs, files in os.walk(index_root, topdown=False):
490
+ for name in files:
491
+ if name.endswith(".index") and "trained" not in name:
492
+ index_paths.append("%s/%s" % (root, name))
493
+ return {"choices": sorted(names), "__type__": "update"}, {
494
+ "choices": sorted(index_paths),
495
+ "__type__": "update",
496
+ }
497
+
498
+
499
+ def clean():
500
+ return {"value": "", "__type__": "update"}
501
+
502
+
503
+ sr_dict = {
504
+ "32k": 32000,
505
+ "40k": 40000,
506
+ "48k": 48000,
507
+ }
508
+
509
+
510
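+ # Watcher helper: blocks until subprocess p exits, then flips the shared done flag.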
+ def if_done(done, p):
511
+ while 1:
512
+ if p.poll() is None:
513
+ sleep(0.5)
514
+ else:
515
+ break
516
+ done[0] = True
517
+
518
+
519
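+ # Like if_done, but waits until every process in ps has exited before setting the flag.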
+ def if_done_multi(done, ps):
520
+ while 1:
521
+ # poll() == None means the process has not finished
522
+ # keep looping while any process is still running
523
+ flag = 1
524
+ for p in ps:
525
+ if p.poll() is None:
526
+ flag = 0
527
+ sleep(0.5)
528
+ break
529
+ if flag == 1:
530
+ break
531
+ done[0] = True
532
+
533
+
534
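+ # Runs the preprocessing script as a subprocess and streams its log file back to the UI,
+ # polling once per second until the watcher thread reports completion.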
+ def preprocess_dataset(trainset_dir, exp_dir, sr, n_p):
535
+ sr = sr_dict[sr]
536
+ os.makedirs("%s/logs/%s" % (now_dir, exp_dir), exist_ok=True)
537
+ f = open("%s/logs/%s/preprocess.log" % (now_dir, exp_dir), "w")
538
+ f.close()
539
+ cmd = (
540
+ config.python_cmd
541
+ + " trainset_preprocess_pipeline_print.py %s %s %s %s/logs/%s "
542
+ % (trainset_dir, sr, n_p, now_dir, exp_dir)
543
+ + str(config.noparallel)
544
+ )
545
+ print(cmd)
546
+ p = Popen(cmd, shell=True) # , stdin=PIPE, stdout=PIPE,stderr=PIPE,cwd=now_dir
547
+ ### Gradio only surfaces Popen output once the process exits, so we poll a log file on a timer instead
548
+ done = [False]
549
+ threading.Thread(
550
+ target=if_done,
551
+ args=(
552
+ done,
553
+ p,
554
+ ),
555
+ ).start()
556
+ while 1:
557
+ with open("%s/logs/%s/preprocess.log" % (now_dir, exp_dir), "r") as f:
558
+ yield (f.read())
559
+ sleep(1)
560
+ if done[0]:
561
+ break
562
+ with open("%s/logs/%s/preprocess.log" % (now_dir, exp_dir), "r") as f:
563
+ log = f.read()
564
+ print(log)
565
+ yield log
566
+
567
+
568
+ # but2.click(extract_f0,[gpus6,np7,f0method8,if_f0_3,trainset_dir4],[info2])
569
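+ # Extracts pitch (optionally, on CPU) and then features on each listed GPU in parallel
+ # subprocesses, streaming the shared log file back to the UI.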
+ def extract_f0_feature(gpus, n_p, f0method, if_f0, exp_dir, version19):
570
+ gpus = gpus.split("-")
571
+ os.makedirs("%s/logs/%s" % (now_dir, exp_dir), exist_ok=True)
572
+ f = open("%s/logs/%s/extract_f0_feature.log" % (now_dir, exp_dir), "w")
573
+ f.close()
574
+ if if_f0:
575
+ cmd = config.python_cmd + " extract_f0_print.py %s/logs/%s %s %s" % (
576
+ now_dir,
577
+ exp_dir,
578
+ n_p,
579
+ f0method,
580
+ )
581
+ print(cmd)
582
+ p = Popen(cmd, shell=True, cwd=now_dir) # , stdin=PIPE, stdout=PIPE,stderr=PIPE
583
+ ### Gradio only surfaces Popen output once the process exits, so we poll a log file on a timer instead
584
+ done = [False]
585
+ threading.Thread(
586
+ target=if_done,
587
+ args=(
588
+ done,
589
+ p,
590
+ ),
591
+ ).start()
592
+ while 1:
593
+ with open(
594
+ "%s/logs/%s/extract_f0_feature.log" % (now_dir, exp_dir), "r"
595
+ ) as f:
596
+ yield (f.read())
597
+ sleep(1)
598
+ if done[0]:
599
+ break
600
+ with open("%s/logs/%s/extract_f0_feature.log" % (now_dir, exp_dir), "r") as f:
601
+ log = f.read()
602
+ print(log)
603
+ yield log
604
+ #### spawn a separate subprocess for each part (one per GPU)
605
+ """
606
+ n_part=int(sys.argv[1])
607
+ i_part=int(sys.argv[2])
608
+ i_gpu=sys.argv[3]
609
+ exp_dir=sys.argv[4]
610
+ os.environ["CUDA_VISIBLE_DEVICES"]=str(i_gpu)
611
+ """
612
+ leng = len(gpus)
613
+ ps = []
614
+ for idx, n_g in enumerate(gpus):
615
+ cmd = (
616
+ config.python_cmd
617
+ + " extract_feature_print.py %s %s %s %s %s/logs/%s %s"
618
+ % (
619
+ config.device,
620
+ leng,
621
+ idx,
622
+ n_g,
623
+ now_dir,
624
+ exp_dir,
625
+ version19,
626
+ )
627
+ )
628
+ print(cmd)
629
+ p = Popen(
630
+ cmd, shell=True, cwd=now_dir
631
+ ) # , shell=True, stdin=PIPE, stdout=PIPE, stderr=PIPE, cwd=now_dir
632
+ ps.append(p)
633
+ ### Gradio only surfaces Popen output once the process exits, so we poll a log file on a timer instead
634
+ done = [False]
635
+ threading.Thread(
636
+ target=if_done_multi,
637
+ args=(
638
+ done,
639
+ ps,
640
+ ),
641
+ ).start()
642
+ while 1:
643
+ with open("%s/logs/%s/extract_f0_feature.log" % (now_dir, exp_dir), "r") as f:
644
+ yield (f.read())
645
+ sleep(1)
646
+ if done[0]:
647
+ break
648
+ with open("%s/logs/%s/extract_f0_feature.log" % (now_dir, exp_dir), "r") as f:
649
+ log = f.read()
650
+ print(log)
651
+ yield log
652
+
653
+
654
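+ # Maps the chosen sample rate / f0 setting / version to pretrained G and D checkpoint paths,
+ # falling back to empty strings when the files are missing.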
+ def change_sr2(sr2, if_f0_3, version19):
655
+ path_str = "" if version19 == "v1" else "_v2"
656
+ f0_str = "f0" if if_f0_3 else ""
657
+ if_pretrained_generator_exist = os.access(
658
+ "pretrained%s/%sG%s.pth" % (path_str, f0_str, sr2), os.F_OK
659
+ )
660
+ if_pretrained_discriminator_exist = os.access(
661
+ "pretrained%s/%sD%s.pth" % (path_str, f0_str, sr2), os.F_OK
662
+ )
663
+ if not if_pretrained_generator_exist:
664
+ print(
665
+ "pretrained%s/%sG%s.pth" % (path_str, f0_str, sr2),
666
+ "not exist, will not use pretrained model",
667
+ )
668
+ if not if_pretrained_discriminator_exist:
669
+ print(
670
+ "pretrained%s/%sD%s.pth" % (path_str, f0_str, sr2),
671
+ "not exist, will not use pretrained model",
672
+ )
673
+ return (
674
+ "pretrained%s/%sG%s.pth" % (path_str, f0_str, sr2)
675
+ if if_pretrained_generator_exist
676
+ else "",
677
+ "pretrained%s/%sD%s.pth" % (path_str, f0_str, sr2)
678
+ if if_pretrained_discriminator_exist
679
+ else "",
680
+ )
681
+
682
+
683
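+ # v1 does not offer 32k here, so switching versions also updates the sample-rate choices
+ # before resolving the pretrained checkpoint paths.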
+ def change_version19(sr2, if_f0_3, version19):
684
+ path_str = "" if version19 == "v1" else "_v2"
685
+ if sr2 == "32k" and version19 == "v1":
686
+ sr2 = "40k"
687
+ to_return_sr2 = (
688
+ {"choices": ["40k", "48k"], "__type__": "update", "value": sr2}
689
+ if version19 == "v1"
690
+ else {"choices": ["40k", "48k", "32k"], "__type__": "update", "value": sr2}
691
+ )
692
+ f0_str = "f0" if if_f0_3 else ""
693
+ if_pretrained_generator_exist = os.access(
694
+ "pretrained%s/%sG%s.pth" % (path_str, f0_str, sr2), os.F_OK
695
+ )
696
+ if_pretrained_discriminator_exist = os.access(
697
+ "pretrained%s/%sD%s.pth" % (path_str, f0_str, sr2), os.F_OK
698
+ )
699
+ if not if_pretrained_generator_exist:
700
+ print(
701
+ "pretrained%s/%sG%s.pth" % (path_str, f0_str, sr2),
702
+ "not exist, will not use pretrained model",
703
+ )
704
+ if not if_pretrained_discriminator_exist:
705
+ print(
706
+ "pretrained%s/%sD%s.pth" % (path_str, f0_str, sr2),
707
+ "not exist, will not use pretrained model",
708
+ )
709
+ return (
710
+ "pretrained%s/%sG%s.pth" % (path_str, f0_str, sr2)
711
+ if if_pretrained_generator_exist
712
+ else "",
713
+ "pretrained%s/%sD%s.pth" % (path_str, f0_str, sr2)
714
+ if if_pretrained_discriminator_exist
715
+ else "",
716
+ to_return_sr2,
717
+ )
718
+
719
+
720
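+ # Toggles the pitch-related controls and swaps between f0 and non-f0 pretrained checkpoints.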
+ def change_f0(if_f0_3, sr2, version19): # f0method8,pretrained_G14,pretrained_D15
721
+ path_str = "" if version19 == "v1" else "_v2"
722
+ if_pretrained_generator_exist = os.access(
723
+ "pretrained%s/f0G%s.pth" % (path_str, sr2), os.F_OK
724
+ )
725
+ if_pretrained_discriminator_exist = os.access(
726
+ "pretrained%s/f0D%s.pth" % (path_str, sr2), os.F_OK
727
+ )
728
+ if not if_pretrained_generator_exist:
729
+ print(
730
+ "pretrained%s/f0G%s.pth" % (path_str, sr2),
731
+ "not exist, will not use pretrained model",
732
+ )
733
+ if not if_pretrained_discriminator_exist:
734
+ print(
735
+ "pretrained%s/f0D%s.pth" % (path_str, sr2),
736
+ "not exist, will not use pretrained model",
737
+ )
738
+ if if_f0_3:
739
+ return (
740
+ {"visible": True, "__type__": "update"},
741
+ "pretrained%s/f0G%s.pth" % (path_str, sr2)
742
+ if if_pretrained_generator_exist
743
+ else "",
744
+ "pretrained%s/f0D%s.pth" % (path_str, sr2)
745
+ if if_pretrained_discriminator_exist
746
+ else "",
747
+ )
748
+ return (
749
+ {"visible": False, "__type__": "update"},
750
+ ("pretrained%s/G%s.pth" % (path_str, sr2))
751
+ if if_pretrained_generator_exist
752
+ else "",
753
+ ("pretrained%s/D%s.pth" % (path_str, sr2))
754
+ if if_pretrained_discriminator_exist
755
+ else "",
756
+ )
757
+
758
+
759
+ # but3.click(click_train,[exp_dir1,sr2,if_f0_3,save_epoch10,total_epoch11,batch_size12,if_save_latest13,pretrained_G14,pretrained_D15,gpus16])
760
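+ # Writes the training filelist (ground-truth wavs, features, and optional f0 data, plus two
+ # mute entries), builds the training command, and blocks until the subprocess finishes.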
+ def click_train(
761
+ exp_dir1,
762
+ sr2,
763
+ if_f0_3,
764
+ spk_id5,
765
+ save_epoch10,
766
+ total_epoch11,
767
+ batch_size12,
768
+ if_save_latest13,
769
+ pretrained_G14,
770
+ pretrained_D15,
771
+ gpus16,
772
+ if_cache_gpu17,
773
+ if_save_every_weights18,
774
+ version19,
775
+ ):
776
+ # build the training filelist
777
+ exp_dir = "%s/logs/%s" % (now_dir, exp_dir1)
778
+ os.makedirs(exp_dir, exist_ok=True)
779
+ gt_wavs_dir = "%s/0_gt_wavs" % (exp_dir)
780
+ feature_dir = (
781
+ "%s/3_feature256" % (exp_dir)
782
+ if version19 == "v1"
783
+ else "%s/3_feature768" % (exp_dir)
784
+ )
785
+ if if_f0_3:
786
+ f0_dir = "%s/2a_f0" % (exp_dir)
787
+ f0nsf_dir = "%s/2b-f0nsf" % (exp_dir)
788
+ names = (
789
+ set([name.split(".")[0] for name in os.listdir(gt_wavs_dir)])
790
+ & set([name.split(".")[0] for name in os.listdir(feature_dir)])
791
+ & set([name.split(".")[0] for name in os.listdir(f0_dir)])
792
+ & set([name.split(".")[0] for name in os.listdir(f0nsf_dir)])
793
+ )
794
+ else:
795
+ names = set([name.split(".")[0] for name in os.listdir(gt_wavs_dir)]) & set(
796
+ [name.split(".")[0] for name in os.listdir(feature_dir)]
797
+ )
798
+ opt = []
799
+ for name in names:
800
+ if if_f0_3:
801
+ opt.append(
802
+ "%s/%s.wav|%s/%s.npy|%s/%s.wav.npy|%s/%s.wav.npy|%s"
803
+ % (
804
+ gt_wavs_dir.replace("\\", "\\\\"),
805
+ name,
806
+ feature_dir.replace("\\", "\\\\"),
807
+ name,
808
+ f0_dir.replace("\\", "\\\\"),
809
+ name,
810
+ f0nsf_dir.replace("\\", "\\\\"),
811
+ name,
812
+ spk_id5,
813
+ )
814
+ )
815
+ else:
816
+ opt.append(
817
+ "%s/%s.wav|%s/%s.npy|%s"
818
+ % (
819
+ gt_wavs_dir.replace("\\", "\\\\"),
820
+ name,
821
+ feature_dir.replace("\\", "\\\\"),
822
+ name,
823
+ spk_id5,
824
+ )
825
+ )
826
+ fea_dim = 256 if version19 == "v1" else 768
827
+ if if_f0_3:
828
+ for _ in range(2):
829
+ opt.append(
830
+ "%s/logs/mute/0_gt_wavs/mute%s.wav|%s/logs/mute/3_feature%s/mute.npy|%s/logs/mute/2a_f0/mute.wav.npy|%s/logs/mute/2b-f0nsf/mute.wav.npy|%s"
831
+ % (now_dir, sr2, now_dir, fea_dim, now_dir, now_dir, spk_id5)
832
+ )
833
+ else:
834
+ for _ in range(2):
835
+ opt.append(
836
+ "%s/logs/mute/0_gt_wavs/mute%s.wav|%s/logs/mute/3_feature%s/mute.npy|%s"
837
+ % (now_dir, sr2, now_dir, fea_dim, spk_id5)
838
+ )
839
+ shuffle(opt)
840
+ with open("%s/filelist.txt" % exp_dir, "w") as f:
841
+ f.write("\n".join(opt))
842
+ print("write filelist done")
843
+ # config generation: no longer needed, handled inside the training script
844
+ # cmd = python_cmd + " train_nsf_sim_cache_sid_load_pretrain.py -e mi-test -sr 40k -f0 1 -bs 4 -g 0 -te 10 -se 5 -pg pretrained/f0G40k.pth -pd pretrained/f0D40k.pth -l 1 -c 0"
845
+ print("use gpus:", gpus16)
846
+ if pretrained_G14 == "":
847
+ print("no pretrained Generator")
848
+ if pretrained_D15 == "":
849
+ print("no pretrained Discriminator")
850
+ if gpus16:
851
+ cmd = (
852
+ config.python_cmd
853
+ + " train_nsf_sim_cache_sid_load_pretrain.py -e %s -sr %s -f0 %s -bs %s -g %s -te %s -se %s %s %s -l %s -c %s -sw %s -v %s"
854
+ % (
855
+ exp_dir1,
856
+ sr2,
857
+ 1 if if_f0_3 else 0,
858
+ batch_size12,
859
+ gpus16,
860
+ total_epoch11,
861
+ save_epoch10,
862
+ "-pg %s" % pretrained_G14 if pretrained_G14 != "" else "",
863
+ "-pd %s" % pretrained_D15 if pretrained_D15 != "" else "",
864
+ 1 if if_save_latest13 == i18n("是") else 0,
865
+ 1 if if_cache_gpu17 == i18n("是") else 0,
866
+ 1 if if_save_every_weights18 == i18n("是") else 0,
867
+ version19,
868
+ )
869
+ )
870
+ else:
871
+ cmd = (
872
+ config.python_cmd
873
+ + " train_nsf_sim_cache_sid_load_pretrain.py -e %s -sr %s -f0 %s -bs %s -te %s -se %s %s %s -l %s -c %s -sw %s -v %s"
874
+ % (
875
+ exp_dir1,
876
+ sr2,
877
+ 1 if if_f0_3 else 0,
878
+ batch_size12,
879
+ total_epoch11,
880
+ save_epoch10,
881
+ "-pg %s" % pretrained_G14 if pretrained_G14 != "" else "\b",
882
+ "-pd %s" % pretrained_D15 if pretrained_D15 != "" else "\b",
883
+ 1 if if_save_latest13 == i18n("是") else 0,
884
+ 1 if if_cache_gpu17 == i18n("是") else 0,
885
+ 1 if if_save_every_weights18 == i18n("是") else 0,
886
+ version19,
887
+ )
888
+ )
889
+ print(cmd)
890
+ p = Popen(cmd, shell=True, cwd=now_dir)
891
+ p.wait()
892
+ return "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log"
893
+
894
+
895
+ # but4.click(train_index, [exp_dir1], info3)
896
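+ # Builds a FAISS IVF index over the extracted features; very large feature sets (>2e5 rows)
+ # are first reduced to 10k MiniBatchKMeans centers, then the index is trained and filled in batches.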
+ def train_index(exp_dir1, version19):
897
+ exp_dir = "%s/logs/%s" % (now_dir, exp_dir1)
898
+ os.makedirs(exp_dir, exist_ok=True)
899
+ feature_dir = (
900
+ "%s/3_feature256" % (exp_dir)
901
+ if version19 == "v1"
902
+ else "%s/3_feature768" % (exp_dir)
903
+ )
904
+ if not os.path.exists(feature_dir):
905
+ return "请先进行特征提取!"
906
+ listdir_res = list(os.listdir(feature_dir))
907
+ if len(listdir_res) == 0:
908
+ return "请先进行特征提取!"
909
+ infos = []
910
+ npys = []
911
+ for name in sorted(listdir_res):
912
+ phone = np.load("%s/%s" % (feature_dir, name))
913
+ npys.append(phone)
914
+ big_npy = np.concatenate(npys, 0)
915
+ big_npy_idx = np.arange(big_npy.shape[0])
916
+ np.random.shuffle(big_npy_idx)
917
+ big_npy = big_npy[big_npy_idx]
918
+ if big_npy.shape[0] > 2e5:
919
+ # if(1):
920
+ infos.append("Trying doing kmeans %s shape to 10k centers." % big_npy.shape[0])
921
+ yield "\n".join(infos)
922
+ try:
923
+ big_npy = (
924
+ MiniBatchKMeans(
925
+ n_clusters=10000,
926
+ verbose=True,
927
+ batch_size=256 * config.n_cpu,
928
+ compute_labels=False,
929
+ init="random",
930
+ )
931
+ .fit(big_npy)
932
+ .cluster_centers_
933
+ )
934
+ except:
935
+ info = traceback.format_exc()
936
+ print(info)
937
+ infos.append(info)
938
+ yield "\n".join(infos)
939
+
940
+ np.save("%s/total_fea.npy" % exp_dir, big_npy)
941
+ n_ivf = min(int(16 * np.sqrt(big_npy.shape[0])), big_npy.shape[0] // 39)
942
+ infos.append("%s,%s" % (big_npy.shape, n_ivf))
943
+ yield "\n".join(infos)
944
+ index = faiss.index_factory(256 if version19 == "v1" else 768, "IVF%s,Flat" % n_ivf)
945
+ # index = faiss.index_factory(256if version19=="v1"else 768, "IVF%s,PQ128x4fs,RFlat"%n_ivf)
946
+ infos.append("training")
947
+ yield "\n".join(infos)
948
+ index_ivf = faiss.extract_index_ivf(index) #
949
+ index_ivf.nprobe = 1
950
+ index.train(big_npy)
951
+ faiss.write_index(
952
+ index,
953
+ "%s/trained_IVF%s_Flat_nprobe_%s_%s_%s.index"
954
+ % (exp_dir, n_ivf, index_ivf.nprobe, exp_dir1, version19),
955
+ )
956
+ # faiss.write_index(index, '%s/trained_IVF%s_Flat_FastScan_%s.index'%(exp_dir,n_ivf,version19))
957
+ infos.append("adding")
958
+ yield "\n".join(infos)
959
+ batch_size_add = 8192
960
+ for i in range(0, big_npy.shape[0], batch_size_add):
961
+ index.add(big_npy[i : i + batch_size_add])
962
+ faiss.write_index(
963
+ index,
964
+ "%s/added_IVF%s_Flat_nprobe_%s_%s_%s.index"
965
+ % (exp_dir, n_ivf, index_ivf.nprobe, exp_dir1, version19),
966
+ )
967
+ infos.append(
968
+ "成功构建索引,added_IVF%s_Flat_nprobe_%s_%s_%s.index"
969
+ % (n_ivf, index_ivf.nprobe, exp_dir1, version19)
970
+ )
971
+ # faiss.write_index(index, '%s/added_IVF%s_Flat_FastScan_%s.index'%(exp_dir,n_ivf,version19))
972
+ # infos.append("成功构建索引,added_IVF%s_Flat_FastScan_%s.index"%(n_ivf,version19))
973
+ yield "\n".join(infos)
974
+
975
+
976
+ # but5.click(train1key, [exp_dir1, sr2, if_f0_3, trainset_dir4, spk_id5, gpus6, np7, f0method8, save_epoch10, total_epoch11, batch_size12, if_save_latest13, pretrained_G14, pretrained_D15, gpus16, if_cache_gpu17], info3)
977
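+ # One-click pipeline: preprocess, extract pitch/features, train the model, then build the
+ # index, yielding accumulated status text at each step.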
+ def train1key(
978
+ exp_dir1,
979
+ sr2,
980
+ if_f0_3,
981
+ trainset_dir4,
982
+ spk_id5,
983
+ np7,
984
+ f0method8,
985
+ save_epoch10,
986
+ total_epoch11,
987
+ batch_size12,
988
+ if_save_latest13,
989
+ pretrained_G14,
990
+ pretrained_D15,
991
+ gpus16,
992
+ if_cache_gpu17,
993
+ if_save_every_weights18,
994
+ version19,
995
+ ):
996
+ infos = []
997
+
998
+ def get_info_str(strr):
999
+ infos.append(strr)
1000
+ return "\n".join(infos)
1001
+
1002
+ model_log_dir = "%s/logs/%s" % (now_dir, exp_dir1)
1003
+ preprocess_log_path = "%s/preprocess.log" % model_log_dir
1004
+ extract_f0_feature_log_path = "%s/extract_f0_feature.log" % model_log_dir
1005
+ gt_wavs_dir = "%s/0_gt_wavs" % model_log_dir
1006
+ feature_dir = (
1007
+ "%s/3_feature256" % model_log_dir
1008
+ if version19 == "v1"
1009
+ else "%s/3_feature768" % model_log_dir
1010
+ )
1011
+
1012
+ os.makedirs(model_log_dir, exist_ok=True)
1013
+ ######### step1: preprocess the data
1014
+ open(preprocess_log_path, "w").close()
1015
+ cmd = (
1016
+ config.python_cmd
1017
+ + " trainset_preprocess_pipeline_print.py %s %s %s %s "
1018
+ % (trainset_dir4, sr_dict[sr2], np7, model_log_dir)
1019
+ + str(config.noparallel)
1020
+ )
1021
+ yield get_info_str(i18n("step1:正在处理数据"))
1022
+ yield get_info_str(cmd)
1023
+ p = Popen(cmd, shell=True)
1024
+ p.wait()
1025
+ with open(preprocess_log_path, "r") as f:
1026
+ print(f.read())
1027
+ ######### step2a: extract pitch
1028
+ open(extract_f0_feature_log_path, "w").close()
1029
+ if if_f0_3:
1030
+ yield get_info_str("step2a:正在提取音高")
1031
+ cmd = config.python_cmd + " extract_f0_print.py %s %s %s" % (
1032
+ model_log_dir,
1033
+ np7,
1034
+ f0method8,
1035
+ )
1036
+ yield get_info_str(cmd)
1037
+ p = Popen(cmd, shell=True, cwd=now_dir)
1038
+ p.wait()
1039
+ with open(extract_f0_feature_log_path, "r") as f:
1040
+ print(f.read())
1041
+ else:
1042
+ yield get_info_str(i18n("step2a:无需提取音高"))
1043
+ ####### step2b: extract features
1044
+ yield get_info_str(i18n("step2b:正在提取特征"))
1045
+ gpus = gpus16.split("-")
1046
+ leng = len(gpus)
1047
+ ps = []
1048
+ for idx, n_g in enumerate(gpus):
1049
+ cmd = config.python_cmd + " extract_feature_print.py %s %s %s %s %s %s" % (
1050
+ config.device,
1051
+ leng,
1052
+ idx,
1053
+ n_g,
1054
+ model_log_dir,
1055
+ version19,
1056
+ )
1057
+ yield get_info_str(cmd)
1058
+ p = Popen(
1059
+ cmd, shell=True, cwd=now_dir
1060
+ ) # , shell=True, stdin=PIPE, stdout=PIPE, stderr=PIPE, cwd=now_dir
1061
+ ps.append(p)
1062
+ for p in ps:
1063
+ p.wait()
1064
+ with open(extract_f0_feature_log_path, "r") as f:
1065
+ print(f.read())
1066
+ #######step3a:训练模型
1067
+ yield get_info_str(i18n("step3a:正在训练模型"))
1068
+ # 生成filelist
1069
+ if if_f0_3:
1070
+ f0_dir = "%s/2a_f0" % model_log_dir
1071
+ f0nsf_dir = "%s/2b-f0nsf" % model_log_dir
1072
+ names = (
1073
+ set([name.split(".")[0] for name in os.listdir(gt_wavs_dir)])
1074
+ & set([name.split(".")[0] for name in os.listdir(feature_dir)])
1075
+ & set([name.split(".")[0] for name in os.listdir(f0_dir)])
1076
+ & set([name.split(".")[0] for name in os.listdir(f0nsf_dir)])
1077
+ )
1078
+ else:
1079
+ names = set([name.split(".")[0] for name in os.listdir(gt_wavs_dir)]) & set(
1080
+ [name.split(".")[0] for name in os.listdir(feature_dir)]
1081
+ )
1082
+ opt = []
1083
+ for name in names:
1084
+ if if_f0_3:
1085
+ opt.append(
1086
+ "%s/%s.wav|%s/%s.npy|%s/%s.wav.npy|%s/%s.wav.npy|%s"
1087
+ % (
1088
+ gt_wavs_dir.replace("\\", "\\\\"),
1089
+ name,
1090
+ feature_dir.replace("\\", "\\\\"),
1091
+ name,
1092
+ f0_dir.replace("\\", "\\\\"),
1093
+ name,
1094
+ f0nsf_dir.replace("\\", "\\\\"),
1095
+ name,
1096
+ spk_id5,
1097
+ )
1098
+ )
1099
+ else:
1100
+ opt.append(
1101
+ "%s/%s.wav|%s/%s.npy|%s"
1102
+ % (
1103
+ gt_wavs_dir.replace("\\", "\\\\"),
1104
+ name,
1105
+ feature_dir.replace("\\", "\\\\"),
1106
+ name,
1107
+ spk_id5,
1108
+ )
1109
+ )
1110
+ fea_dim = 256 if version19 == "v1" else 768
1111
+ if if_f0_3:
1112
+ for _ in range(2):
1113
+ opt.append(
1114
+ "%s/logs/mute/0_gt_wavs/mute%s.wav|%s/logs/mute/3_feature%s/mute.npy|%s/logs/mute/2a_f0/mute.wav.npy|%s/logs/mute/2b-f0nsf/mute.wav.npy|%s"
1115
+ % (now_dir, sr2, now_dir, fea_dim, now_dir, now_dir, spk_id5)
1116
+ )
1117
+ else:
1118
+ for _ in range(2):
1119
+ opt.append(
1120
+ "%s/logs/mute/0_gt_wavs/mute%s.wav|%s/logs/mute/3_feature%s/mute.npy|%s"
1121
+ % (now_dir, sr2, now_dir, fea_dim, spk_id5)
1122
+ )
1123
+ shuffle(opt)
1124
+ with open("%s/filelist.txt" % model_log_dir, "w") as f:
1125
+ f.write("\n".join(opt))
1126
+ yield get_info_str("write filelist done")
1127
+ if gpus16:
1128
+ cmd = (
1129
+ config.python_cmd
1130
+ + " train_nsf_sim_cache_sid_load_pretrain.py -e %s -sr %s -f0 %s -bs %s -g %s -te %s -se %s %s %s -l %s -c %s -sw %s -v %s"
1131
+ % (
1132
+ exp_dir1,
1133
+ sr2,
1134
+ 1 if if_f0_3 else 0,
1135
+ batch_size12,
1136
+ gpus16,
1137
+ total_epoch11,
1138
+ save_epoch10,
1139
+ "-pg %s" % pretrained_G14 if pretrained_G14 != "" else "",
1140
+ "-pd %s" % pretrained_D15 if pretrained_D15 != "" else "",
1141
+ 1 if if_save_latest13 == i18n("是") else 0,
1142
+ 1 if if_cache_gpu17 == i18n("是") else 0,
1143
+ 1 if if_save_every_weights18 == i18n("是") else 0,
1144
+ version19,
1145
+ )
1146
+ )
1147
+ else:
1148
+ cmd = (
1149
+ config.python_cmd
1150
+ + " train_nsf_sim_cache_sid_load_pretrain.py -e %s -sr %s -f0 %s -bs %s -te %s -se %s %s %s -l %s -c %s -sw %s -v %s"
1151
+ % (
1152
+ exp_dir1,
1153
+ sr2,
1154
+ 1 if if_f0_3 else 0,
1155
+ batch_size12,
1156
+ total_epoch11,
1157
+ save_epoch10,
1158
+ "-pg %s" % pretrained_G14 if pretrained_G14 != "" else "",
1159
+ "-pd %s" % pretrained_D15 if pretrained_D15 != "" else "",
1160
+ 1 if if_save_latest13 == i18n("是") else 0,
1161
+ 1 if if_cache_gpu17 == i18n("是") else 0,
1162
+ 1 if if_save_every_weights18 == i18n("是") else 0,
1163
+ version19,
1164
+ )
1165
+ )
1166
+ yield get_info_str(cmd)
1167
+ p = Popen(cmd, shell=True, cwd=now_dir)
1168
+ p.wait()
1169
+ yield get_info_str(i18n("训练结束, 您可查看控制台训练日志或实验文件夹下的train.log"))
1170
+ ####### step3b: train the index
1171
+ npys = []
1172
+ listdir_res = list(os.listdir(feature_dir))
1173
+ for name in sorted(listdir_res):
1174
+ phone = np.load("%s/%s" % (feature_dir, name))
1175
+ npys.append(phone)
1176
+ big_npy = np.concatenate(npys, 0)
1177
+
1178
+ big_npy_idx = np.arange(big_npy.shape[0])
1179
+ np.random.shuffle(big_npy_idx)
1180
+ big_npy = big_npy[big_npy_idx]
1181
+
1182
+ if big_npy.shape[0] > 2e5:
1183
+ # if(1):
1184
+ info = "Trying doing kmeans %s shape to 10k centers." % big_npy.shape[0]
1185
+ print(info)
1186
+ yield get_info_str(info)
1187
+ try:
1188
+ big_npy = (
1189
+ MiniBatchKMeans(
1190
+ n_clusters=10000,
1191
+ verbose=True,
1192
+ batch_size=256 * config.n_cpu,
1193
+ compute_labels=False,
1194
+ init="random",
1195
+ )
1196
+ .fit(big_npy)
1197
+ .cluster_centers_
1198
+ )
1199
+ except:
1200
+ info = traceback.format_exc()
1201
+ print(info)
1202
+ yield get_info_str(info)
1203
+
1204
+ np.save("%s/total_fea.npy" % model_log_dir, big_npy)
1205
+
1206
+ # n_ivf = big_npy.shape[0] // 39
1207
+ n_ivf = min(int(16 * np.sqrt(big_npy.shape[0])), big_npy.shape[0] // 39)
1208
+ yield get_info_str("%s,%s" % (big_npy.shape, n_ivf))
1209
+ index = faiss.index_factory(256 if version19 == "v1" else 768, "IVF%s,Flat" % n_ivf)
1210
+ yield get_info_str("training index")
1211
+ index_ivf = faiss.extract_index_ivf(index) #
1212
+ index_ivf.nprobe = 1
1213
+ index.train(big_npy)
1214
+ faiss.write_index(
1215
+ index,
1216
+ "%s/trained_IVF%s_Flat_nprobe_%s_%s_%s.index"
1217
+ % (model_log_dir, n_ivf, index_ivf.nprobe, exp_dir1, version19),
1218
+ )
1219
+ yield get_info_str("adding index")
1220
+ batch_size_add = 8192
1221
+ for i in range(0, big_npy.shape[0], batch_size_add):
1222
+ index.add(big_npy[i : i + batch_size_add])
1223
+ faiss.write_index(
1224
+ index,
1225
+ "%s/added_IVF%s_Flat_nprobe_%s_%s_%s.index"
1226
+ % (model_log_dir, n_ivf, index_ivf.nprobe, exp_dir1, version19),
1227
+ )
1228
+ yield get_info_str(
1229
+ "成功构建索引, added_IVF%s_Flat_nprobe_%s_%s_%s.index"
1230
+ % (n_ivf, index_ivf.nprobe, exp_dir1, version19)
1231
+ )
1232
+ yield get_info_str(i18n("全流程结束!"))
1233
+
1234
+
1235
+ # ckpt_path2.change(change_info_,[ckpt_path2],[sr__,if_f0__])
1236
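+ # Reads sample rate, f0 flag, and version from the train.log next to the checkpoint to prefill the form.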
+ def change_info_(ckpt_path):
1237
+ if not os.path.exists(ckpt_path.replace(os.path.basename(ckpt_path), "train.log")):
1238
+ return {"__type__": "update"}, {"__type__": "update"}, {"__type__": "update"}
1239
+ try:
1240
+ with open(
1241
+ ckpt_path.replace(os.path.basename(ckpt_path), "train.log"), "r"
1242
+ ) as f:
1243
+ info = eval(f.read().strip("\n").split("\n")[0].split("\t")[-1])
1244
+ sr, f0 = info["sample_rate"], info["if_f0"]
1245
+ version = "v2" if ("version" in info and info["version"] == "v2") else "v1"
1246
+ return sr, str(f0), version
1247
+ except:
1248
+ traceback.print_exc()
1249
+ return {"__type__": "update"}, {"__type__": "update"}, {"__type__": "update"}
1250
+
1251
+
1252
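+ # Exports a checkpoint to ONNX with dummy inputs and dynamic time axes; fp32 only (see comment below).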
+ def export_onnx(ModelPath, ExportedPath):
1253
+ cpt = torch.load(ModelPath, map_location="cpu")
1254
+ cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]
1255
+ vec_channels = 256 if cpt.get("version", "v1") == "v1" else 768
1256
+
1257
+ test_phone = torch.rand(1, 200, vec_channels) # hidden unit
1258
+ test_phone_lengths = torch.tensor([200]).long() # hidden unit length (seemingly unused)
1260
+ test_pitch = torch.randint(size=(1, 200), low=5, high=255) # coarse f0 (Hz)
1261
+ test_pitchf = torch.rand(1, 200) # NSF f0
1262
+ test_ds = torch.LongTensor([0]) # speaker ID
1263
+ test_rnd = torch.rand(1, 192, 200) # noise (adds a random factor)
1263
+
1264
+ device = "cpu" # 导出时设备(不影响使用模型)
1265
+
1266
+ net_g = SynthesizerTrnMsNSFsidM(
1267
+ *cpt["config"], is_half=False, version=cpt.get("version", "v1")
1268
+ ) # fp32 export (fp16 support in C++ would require manually re-laying out memory, so it is skipped for now)
1269
+ net_g.load_state_dict(cpt["weight"], strict=False)
1270
+ input_names = ["phone", "phone_lengths", "pitch", "pitchf", "ds", "rnd"]
1271
+ output_names = [
1272
+ "audio",
1273
+ ]
1274
+ # net_g.construct_spkmixmap(n_speaker)  # export with a multi-speaker mixing track
1275
+ torch.onnx.export(
1276
+ net_g,
1277
+ (
1278
+ test_phone.to(device),
1279
+ test_phone_lengths.to(device),
1280
+ test_pitch.to(device),
1281
+ test_pitchf.to(device),
1282
+ test_ds.to(device),
1283
+ test_rnd.to(device),
1284
+ ),
1285
+ ExportedPath,
1286
+ dynamic_axes={
1287
+ "phone": [1],
1288
+ "pitch": [1],
1289
+ "pitchf": [1],
1290
+ "rnd": [2],
1291
+ },
1292
+ do_constant_folding=False,
1293
+ opset_version=13,
1294
+ verbose=False,
1295
+ input_names=input_names,
1296
+ output_names=output_names,
1297
+ )
1298
+ return "Finished"
1299
+
1300
+
1301
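+ # Gradio UI: tabs for inference, vocal separation, training, ckpt tools, ONNX export, and FAQ.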
+ with gr.Blocks() as app:
1302
+ gr.Markdown(
1303
+ value=i18n(
1304
+ "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责. <br>如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录<b>使用需遵守的协议-LICENSE.txt</b>."
1305
+ )
1306
+ )
1307
+ with gr.Tabs():
1308
+ with gr.TabItem(i18n("模型推理")):
1309
+ with gr.Row():
1310
+ sid0 = gr.Dropdown(label=i18n("推理音色"), choices=sorted(names))
1311
+ refresh_button = gr.Button(i18n("刷新音色列表和索引路径"), variant="primary")
1312
+ clean_button = gr.Button(i18n("卸载音色省显存"), variant="primary")
1313
+ spk_item = gr.Slider(
1314
+ minimum=0,
1315
+ maximum=2333,
1316
+ step=1,
1317
+ label=i18n("请选择说话人id"),
1318
+ value=0,
1319
+ visible=False,
1320
+ interactive=True,
1321
+ )
1322
+ clean_button.click(fn=clean, inputs=[], outputs=[sid0])
1323
+ with gr.Group():
1324
+ gr.Markdown(
1325
+ value=i18n("男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ")
1326
+ )
1327
+ with gr.Row():
1328
+ with gr.Column():
1329
+ vc_transform0 = gr.Number(
1330
+ label=i18n("变调(整数, 半音数量, 升八度12降八度-12)"), value=0
1331
+ )
1332
+ input_audio0 = gr.Textbox(
1333
+ label=i18n("输入待处理音频文件路径(默认是正确格式示例)"),
1334
+ value="E:\\codes\\py39\\test-20230416b\\todo-songs\\冬之花clip1.wav",
1335
+ )
1336
+ f0method0 = gr.Radio(
1337
+ label=i18n(
1338
+ "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU"
1339
+ ),
1340
+ choices=["pm", "harvest", "crepe"],
1341
+ value="pm",
1342
+ interactive=True,
1343
+ )
1344
+ filter_radius0 = gr.Slider(
1345
+ minimum=0,
1346
+ maximum=7,
1347
+ label=i18n(">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音"),
1348
+ value=3,
1349
+ step=1,
1350
+ interactive=True,
1351
+ )
1352
+ with gr.Column():
1353
+ file_index1 = gr.Textbox(
1354
+ label=i18n("特征检索库文件路径,为空则使用下拉的选择结果"),
1355
+ value="",
1356
+ interactive=True,
1357
+ )
1358
+ file_index2 = gr.Dropdown(
1359
+ label=i18n("自动检测index路径,下拉式选择(dropdown)"),
1360
+ choices=sorted(index_paths),
1361
+ interactive=True,
1362
+ )
1363
+ refresh_button.click(
1364
+ fn=change_choices, inputs=[], outputs=[sid0, file_index2]
1365
+ )
1366
+ # file_big_npy1 = gr.Textbox(
1367
+ # label=i18n("特征文件路径"),
1368
+ # value="E:\\codes\py39\\vits_vc_gpu_train\\logs\\mi-test-1key\\total_fea.npy",
1369
+ # interactive=True,
1370
+ # )
1371
+ index_rate1 = gr.Slider(
1372
+ minimum=0,
1373
+ maximum=1,
1374
+ label=i18n("检索特征占比"),
1375
+ value=0.75,
1376
+ interactive=True,
1377
+ )
1378
+ with gr.Column():
1379
+ resample_sr0 = gr.Slider(
1380
+ minimum=0,
1381
+ maximum=48000,
1382
+ label=i18n("后处理重采样至最终采样率,0为不进行重采样"),
1383
+ value=0,
1384
+ step=1,
1385
+ interactive=True,
1386
+ )
1387
+ rms_mix_rate0 = gr.Slider(
1388
+ minimum=0,
1389
+ maximum=1,
1390
+ label=i18n("输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络"),
1391
+ value=0.25,
1392
+ interactive=True,
1393
+ )
1394
+ protect0 = gr.Slider(
1395
+ minimum=0,
1396
+ maximum=0.5,
1397
+ label=i18n(
1398
+ "保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果"
1399
+ ),
1400
+ value=0.33,
1401
+ step=0.01,
1402
+ interactive=True,
1403
+ )
1404
+ f0_file = gr.File(label=i18n("F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调"))
1405
+ but0 = gr.Button(i18n("转换"), variant="primary")
1406
+ with gr.Row():
1407
+ vc_output1 = gr.Textbox(label=i18n("输出信息"))
1408
+ vc_output2 = gr.Audio(label=i18n("输出音频(右下角三个点,点了可以下载)"))
1409
+ but0.click(
1410
+ vc_single,
1411
+ [
1412
+ spk_item,
1413
+ input_audio0,
1414
+ vc_transform0,
1415
+ f0_file,
1416
+ f0method0,
1417
+ file_index1,
1418
+ file_index2,
1419
+ # file_big_npy1,
1420
+ index_rate1,
1421
+ filter_radius0,
1422
+ resample_sr0,
1423
+ rms_mix_rate0,
1424
+ protect0,
1425
+ ],
1426
+ [vc_output1, vc_output2],
1427
+ )
1428
+ with gr.Group():
1429
+ gr.Markdown(
1430
+ value=i18n("批量转换, 输入待转换音频文件夹, 或上传多个音频文件, 在指定文件夹(默认opt)下输出转换的音频. ")
1431
+ )
1432
+ with gr.Row():
1433
+ with gr.Column():
1434
+ vc_transform1 = gr.Number(
1435
+ label=i18n("变调(整数, 半音数量, 升八度12降八度-12)"), value=0
1436
+ )
1437
+ opt_input = gr.Textbox(label=i18n("指定输出文件夹"), value="opt")
1438
+ f0method1 = gr.Radio(
1439
+ label=i18n(
1440
+ "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU"
1441
+ ),
1442
+ choices=["pm", "harvest", "crepe"],
1443
+ value="pm",
1444
+ interactive=True,
1445
+ )
1446
+ filter_radius1 = gr.Slider(
1447
+ minimum=0,
1448
+ maximum=7,
1449
+ label=i18n(">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音"),
1450
+ value=3,
1451
+ step=1,
1452
+ interactive=True,
1453
+ )
1454
+ with gr.Column():
1455
+ file_index3 = gr.Textbox(
1456
+ label=i18n("特征检索库文件路径,为空则使用下拉的选择结果"),
1457
+ value="",
1458
+ interactive=True,
1459
+ )
1460
+ file_index4 = gr.Dropdown(
1461
+ label=i18n("自动检测index路径,下拉式选择(dropdown)"),
1462
+ choices=sorted(index_paths),
1463
+ interactive=True,
1464
+ )
1465
+ refresh_button.click(
1466
+ fn=lambda: change_choices()[1],
1467
+ inputs=[],
1468
+ outputs=file_index4,
1469
+ )
1470
+ # file_big_npy2 = gr.Textbox(
1471
+ # label=i18n("特征文件路径"),
1472
+ # value="E:\\codes\\py39\\vits_vc_gpu_train\\logs\\mi-test-1key\\total_fea.npy",
1473
+ # interactive=True,
1474
+ # )
1475
+ index_rate2 = gr.Slider(
1476
+ minimum=0,
1477
+ maximum=1,
1478
+ label=i18n("检索特征占比"),
1479
+ value=1,
1480
+ interactive=True,
1481
+ )
1482
+ with gr.Column():
1483
+ resample_sr1 = gr.Slider(
1484
+ minimum=0,
1485
+ maximum=48000,
1486
+ label=i18n("后处理重采样至最终采样率,0为不进行重采样"),
1487
+ value=0,
1488
+ step=1,
1489
+ interactive=True,
1490
+ )
1491
+ rms_mix_rate1 = gr.Slider(
1492
+ minimum=0,
1493
+ maximum=1,
1494
+ label=i18n("输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络"),
1495
+ value=1,
1496
+ interactive=True,
1497
+ )
1498
+ protect1 = gr.Slider(
1499
+ minimum=0,
1500
+ maximum=0.5,
1501
+ label=i18n(
1502
+ "保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果"
1503
+ ),
1504
+ value=0.33,
1505
+ step=0.01,
1506
+ interactive=True,
1507
+ )
1508
+ with gr.Column():
1509
+ dir_input = gr.Textbox(
1510
+ label=i18n("输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)"),
1511
+ value="E:\codes\py39\\test-20230416b\\todo-songs",
1512
+ )
1513
+ inputs = gr.File(
1514
+ file_count="multiple", label=i18n("也可批量输入音频文件, 二选一, 优先读文件夹")
1515
+ )
1516
+ with gr.Row():
1517
+ format1 = gr.Radio(
1518
+ label=i18n("导出文件格式"),
1519
+ choices=["wav", "flac", "mp3", "m4a"],
1520
+ value="flac",
1521
+ interactive=True,
1522
+ )
1523
+ but1 = gr.Button(i18n("转换"), variant="primary")
1524
+ vc_output3 = gr.Textbox(label=i18n("输出信息"))
1525
+ but1.click(
1526
+ vc_multi,
1527
+ [
1528
+ spk_item,
1529
+ dir_input,
1530
+ opt_input,
1531
+ inputs,
1532
+ vc_transform1,
1533
+ f0method1,
1534
+ file_index3,
1535
+ file_index4,
1536
+ # file_big_npy2,
1537
+ index_rate2,
1538
+ filter_radius1,
1539
+ resample_sr1,
1540
+ rms_mix_rate1,
1541
+ protect1,
1542
+ format1,
1543
+ ],
1544
+ [vc_output3],
1545
+ )
1546
+ sid0.change(
1547
+ fn=get_vc,
1548
+ inputs=[sid0, protect0, protect1],
1549
+ outputs=[spk_item, protect0, protect1],
1550
+ )
1551
+ with gr.TabItem(i18n("伴奏人声分离&去混响&去回声")):
1552
+ with gr.Group():
1553
+ gr.Markdown(
1554
+ value=i18n(
1555
+ "人声伴奏分离批量处理, 使用UVR5模型。 <br>"
1556
+ "合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。 <br>"
1557
+ "模型分为三类: <br>"
1558
+ "1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点; <br>"
1559
+ "2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型; <br> "
1560
+ "3、去混响、去延迟模型(by FoxJoy):<br>"
1561
+ "  (1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;<br>"
1562
+ "&emsp;(234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。<br>"
1563
+ "去混响/去延迟,附:<br>"
1564
+ "1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;<br>"
1565
+ "2、MDX-Net-Dereverb模型挺慢的;<br>"
1566
+ "3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。"
1567
+ )
1568
+ )
1569
+ with gr.Row():
1570
+ with gr.Column():
1571
+ dir_wav_input = gr.Textbox(
1572
+ label=i18n("输入待处理音频文件夹路径"),
1573
+ value="E:\\codes\\py39\\test-20230416b\\todo-songs\\todo-songs",
1574
+ )
1575
+ wav_inputs = gr.File(
1576
+ file_count="multiple", label=i18n("也可批量输入音频文件, 二选一, 优先读文件夹")
1577
+ )
1578
+ with gr.Column():
1579
+ model_choose = gr.Dropdown(label=i18n("模型"), choices=uvr5_names)
1580
+ agg = gr.Slider(
1581
+ minimum=0,
1582
+ maximum=20,
1583
+ step=1,
1584
+ label="人声提取激进程度",
1585
+ value=10,
1586
+ interactive=True,
1587
+ visible=False, # not exposed for adjustment yet
1588
+ )
1589
+ opt_vocal_root = gr.Textbox(
1590
+ label=i18n("指定输出主人声文件夹"), value="opt"
1591
+ )
1592
+ opt_ins_root = gr.Textbox(
1593
+ label=i18n("指定输出非主人声文件夹"), value="opt"
1594
+ )
1595
+ format0 = gr.Radio(
1596
+ label=i18n("导出文件格式"),
1597
+ choices=["wav", "flac", "mp3", "m4a"],
1598
+ value="flac",
1599
+ interactive=True,
1600
+ )
1601
+ but2 = gr.Button(i18n("转换"), variant="primary")
1602
+ vc_output4 = gr.Textbox(label=i18n("输出信息"))
1603
+ but2.click(
1604
+ uvr,
1605
+ [
1606
+ model_choose,
1607
+ dir_wav_input,
1608
+ opt_vocal_root,
1609
+ wav_inputs,
1610
+ opt_ins_root,
1611
+ agg,
1612
+ format0,
1613
+ ],
1614
+ [vc_output4],
1615
+ )
1616
+ with gr.TabItem(i18n("训练")):
1617
+ gr.Markdown(
1618
+ value=i18n(
1619
+ "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. "
1620
+ )
1621
+ )
1622
+ with gr.Row():
1623
+ exp_dir1 = gr.Textbox(label=i18n("输入实验名"), value="mi-test")
1624
+ sr2 = gr.Radio(
1625
+ label=i18n("目标采样率"),
1626
+ choices=["40k", "48k"],
1627
+ value="40k",
1628
+ interactive=True,
1629
+ )
1630
+ if_f0_3 = gr.Radio(
1631
+ label=i18n("模型是否带音高指导(唱歌一定要, 语音可以不要)"),
1632
+ choices=[True, False],
1633
+ value=True,
1634
+ interactive=True,
1635
+ )
1636
+ version19 = gr.Radio(
1637
+ label=i18n("版本"),
1638
+ choices=["v1", "v2"],
1639
+ value="v1",
1640
+ interactive=True,
1641
+ visible=True,
1642
+ )
1643
+ np7 = gr.Slider(
1644
+ minimum=0,
1645
+ maximum=config.n_cpu,
1646
+ step=1,
1647
+ label=i18n("提取音高和处理数据使用的CPU进程数"),
1648
+ value=int(np.ceil(config.n_cpu / 1.5)),
1649
+ interactive=True,
1650
+ )
1651
+ with gr.Group(): # single-speaker for now; support for up to 4 speakers planned # data processing
1652
+ gr.Markdown(
1653
+ value=i18n(
1654
+ "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. "
1655
+ )
1656
+ )
1657
+ with gr.Row():
1658
+ trainset_dir4 = gr.Textbox(
1659
+ label=i18n("输入训练文件夹路径"), value="E:\\语音音频+标注\\米津玄师\\src"
1660
+ )
1661
+ spk_id5 = gr.Slider(
1662
+ minimum=0,
1663
+ maximum=4,
1664
+ step=1,
1665
+ label=i18n("请指定说话人id"),
1666
+ value=0,
1667
+ interactive=True,
1668
+ )
1669
+ but1 = gr.Button(i18n("处理数据"), variant="primary")
1670
+ info1 = gr.Textbox(label=i18n("输出信息"), value="")
1671
+ but1.click(
1672
+ preprocess_dataset, [trainset_dir4, exp_dir1, sr2, np7], [info1]
1673
+ )
1674
+ with gr.Group():
1675
+ gr.Markdown(value=i18n("step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)"))
1676
+ with gr.Row():
1677
+ with gr.Column():
1678
+ gpus6 = gr.Textbox(
1679
+ label=i18n("以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2"),
1680
+ value=gpus,
1681
+ interactive=True,
1682
+ )
1683
+ gpu_info9 = gr.Textbox(label=i18n("显卡信息"), value=gpu_info)
1684
+ with gr.Column():
1685
+ f0method8 = gr.Radio(
1686
+ label=i18n(
1687
+ "选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢"
1688
+ ),
1689
+ choices=["pm", "harvest", "dio"],
1690
+ value="harvest",
1691
+ interactive=True,
1692
+ )
1693
+ but2 = gr.Button(i18n("特征提取"), variant="primary")
1694
+ info2 = gr.Textbox(label=i18n("输出信息"), value="", max_lines=8)
1695
+ but2.click(
1696
+ extract_f0_feature,
1697
+ [gpus6, np7, f0method8, if_f0_3, exp_dir1, version19],
1698
+ [info2],
1699
+ )
1700
+ with gr.Group():
1701
+ gr.Markdown(value=i18n("step3: 填写训练设置, 开始训练模型和索引"))
1702
+ with gr.Row():
1703
+ save_epoch10 = gr.Slider(
1704
+ minimum=0,
1705
+ maximum=50,
1706
+ step=1,
1707
+ label=i18n("保存频率save_every_epoch"),
1708
+ value=5,
1709
+ interactive=True,
1710
+ )
1711
+ total_epoch11 = gr.Slider(
1712
+ minimum=0,
1713
+ maximum=1000,
1714
+ step=1,
1715
+ label=i18n("总训练轮数total_epoch"),
1716
+ value=20,
1717
+ interactive=True,
1718
+ )
1719
+ batch_size12 = gr.Slider(
1720
+ minimum=1,
1721
+ maximum=40,
1722
+ step=1,
1723
+ label=i18n("每张显卡的batch_size"),
1724
+ value=default_batch_size,
1725
+ interactive=True,
1726
+ )
1727
+ if_save_latest13 = gr.Radio(
1728
+ label=i18n("是否仅保存最新的ckpt文件以节省硬盘空间"),
1729
+ choices=[i18n("是"), i18n("否")],
1730
+ value=i18n("否"),
1731
+ interactive=True,
1732
+ )
1733
+ if_cache_gpu17 = gr.Radio(
1734
+ label=i18n(
1735
+ "是否缓存所有训练集至显存. 10min以下小数据可缓存以加速训练, 大数据缓存会炸显存也加不了多少速"
1736
+ ),
1737
+ choices=[i18n("是"), i18n("否")],
1738
+ value=i18n("否"),
1739
+ interactive=True,
1740
+ )
1741
+ if_save_every_weights18 = gr.Radio(
1742
+ label=i18n("是否在每次保存时间点将最终小模型保存至weights文件夹"),
1743
+ choices=[i18n("是"), i18n("否")],
1744
+ value=i18n("否"),
1745
+ interactive=True,
1746
+ )
1747
+ with gr.Row():
1748
+ pretrained_G14 = gr.Textbox(
1749
+ label=i18n("加载预训练底模G路径"),
1750
+ value="pretrained/f0G40k.pth",
1751
+ interactive=True,
1752
+ )
1753
+ pretrained_D15 = gr.Textbox(
1754
+ label=i18n("加载预训练底模D路径"),
1755
+ value="pretrained/f0D40k.pth",
1756
+ interactive=True,
1757
+ )
1758
+ sr2.change(
1759
+ change_sr2,
1760
+ [sr2, if_f0_3, version19],
1761
+ [pretrained_G14, pretrained_D15],
1762
+ )
1763
+ version19.change(
1764
+ change_version19,
1765
+ [sr2, if_f0_3, version19],
1766
+ [pretrained_G14, pretrained_D15, sr2],
1767
+ )
1768
+ if_f0_3.change(
1769
+ change_f0,
1770
+ [if_f0_3, sr2, version19],
1771
+ [f0method8, pretrained_G14, pretrained_D15],
1772
+ )
1773
+ gpus16 = gr.Textbox(
1774
+ label=i18n("以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2"),
1775
+ value=gpus,
1776
+ interactive=True,
1777
+ )
1778
+ but3 = gr.Button(i18n("训练模型"), variant="primary")
1779
+ but4 = gr.Button(i18n("训练特征索引"), variant="primary")
1780
+ but5 = gr.Button(i18n("一键训练"), variant="primary")
1781
+ info3 = gr.Textbox(label=i18n("输出信息"), value="", max_lines=10)
1782
+ but3.click(
1783
+ click_train,
1784
+ [
1785
+ exp_dir1,
1786
+ sr2,
1787
+ if_f0_3,
1788
+ spk_id5,
1789
+ save_epoch10,
1790
+ total_epoch11,
1791
+ batch_size12,
1792
+ if_save_latest13,
1793
+ pretrained_G14,
1794
+ pretrained_D15,
1795
+ gpus16,
1796
+ if_cache_gpu17,
1797
+ if_save_every_weights18,
1798
+ version19,
1799
+ ],
1800
+ info3,
1801
+ )
1802
+ but4.click(train_index, [exp_dir1, version19], info3)
1803
+ but5.click(
1804
+ train1key,
1805
+ [
1806
+ exp_dir1,
1807
+ sr2,
1808
+ if_f0_3,
1809
+ trainset_dir4,
1810
+ spk_id5,
1811
+ np7,
1812
+ f0method8,
1813
+ save_epoch10,
1814
+ total_epoch11,
1815
+ batch_size12,
1816
+ if_save_latest13,
1817
+ pretrained_G14,
1818
+ pretrained_D15,
1819
+ gpus16,
1820
+ if_cache_gpu17,
1821
+ if_save_every_weights18,
1822
+ version19,
1823
+ ],
1824
+ info3,
1825
+ )
1826
+
1827
+ with gr.TabItem(i18n("ckpt处理")):
1828
+ with gr.Group():
1829
+ gr.Markdown(value=i18n("模型融合, 可用于测试音色融合"))
1830
+ with gr.Row():
1831
+ ckpt_a = gr.Textbox(label=i18n("A模型路径"), value="", interactive=True)
1832
+ ckpt_b = gr.Textbox(label=i18n("B模型路径"), value="", interactive=True)
1833
+ alpha_a = gr.Slider(
1834
+ minimum=0,
1835
+ maximum=1,
1836
+ label=i18n("A模型权重"),
1837
+ value=0.5,
1838
+ interactive=True,
1839
+ )
1840
+ with gr.Row():
1841
+ sr_ = gr.Radio(
1842
+ label=i18n("目标采样率"),
1843
+ choices=["40k", "48k"],
1844
+ value="40k",
1845
+ interactive=True,
1846
+ )
1847
+ if_f0_ = gr.Radio(
1848
+ label=i18n("模型是否带音高指导"),
1849
+ choices=[i18n("是"), i18n("否")],
1850
+ value=i18n("是"),
1851
+ interactive=True,
1852
+ )
1853
+ info__ = gr.Textbox(
1854
+ label=i18n("要置入的模型信息"), value="", max_lines=8, interactive=True
1855
+ )
1856
+ name_to_save0 = gr.Textbox(
1857
+ label=i18n("保存的模型名不带后缀"),
1858
+ value="",
1859
+ max_lines=1,
1860
+ interactive=True,
1861
+ )
1862
+ version_2 = gr.Radio(
1863
+ label=i18n("模型版本型号"),
1864
+ choices=["v1", "v2"],
1865
+ value="v1",
1866
+ interactive=True,
1867
+ )
1868
+ with gr.Row():
1869
+ but6 = gr.Button(i18n("融合"), variant="primary")
1870
+ info4 = gr.Textbox(label=i18n("输出信息"), value="", max_lines=8)
1871
+ but6.click(
1872
+ merge,
1873
+ [
1874
+ ckpt_a,
1875
+ ckpt_b,
1876
+ alpha_a,
1877
+ sr_,
1878
+ if_f0_,
1879
+ info__,
1880
+ name_to_save0,
1881
+ version_2,
1882
+ ],
1883
+ info4,
1884
+ ) # def merge(path1,path2,alpha1,sr,f0,info):
1885
+ with gr.Group():
1886
+ gr.Markdown(value=i18n("修改模型信息(仅支持weights文件夹下提取的小模型文件)"))
1887
+ with gr.Row():
1888
+ ckpt_path0 = gr.Textbox(
1889
+ label=i18n("模型路径"), value="", interactive=True
1890
+ )
1891
+ info_ = gr.Textbox(
1892
+ label=i18n("要改的模型信息"), value="", max_lines=8, interactive=True
1893
+ )
1894
+ name_to_save1 = gr.Textbox(
1895
+ label=i18n("保存的文件名, 默认空为和源文件同名"),
1896
+ value="",
1897
+ max_lines=8,
1898
+ interactive=True,
1899
+ )
1900
+ with gr.Row():
1901
+ but7 = gr.Button(i18n("修改"), variant="primary")
1902
+ info5 = gr.Textbox(label=i18n("输出信息"), value="", max_lines=8)
1903
+ but7.click(change_info, [ckpt_path0, info_, name_to_save1], info5)
1904
+ with gr.Group():
1905
+ gr.Markdown(value=i18n("查看模型信息(仅支持weights文件夹下提取的小模型文件)"))
1906
+ with gr.Row():
1907
+ ckpt_path1 = gr.Textbox(
1908
+ label=i18n("模型路径"), value="", interactive=True
1909
+ )
1910
+ but8 = gr.Button(i18n("查看"), variant="primary")
1911
+ info6 = gr.Textbox(label=i18n("输出信息"), value="", max_lines=8)
1912
+ but8.click(show_info, [ckpt_path1], info6)
1913
+ with gr.Group():
1914
+ gr.Markdown(
1915
+ value=i18n(
1916
+ "模型提取(输入logs文件夹下大文件模型路径),适用于训一半不想训了模型没有自动提取保存小文件模型,或者想测试中间模型的情况"
1917
+ )
1918
+ )
1919
+ with gr.Row():
1920
+ ckpt_path2 = gr.Textbox(
1921
+ label=i18n("模型路径"),
1922
+ value="E:\\codes\\py39\\logs\\mi-test_f0_48k\\G_23333.pth",
1923
+ interactive=True,
1924
+ )
1925
+ save_name = gr.Textbox(
1926
+ label=i18n("保存名"), value="", interactive=True
1927
+ )
1928
+ sr__ = gr.Radio(
1929
+ label=i18n("目标采样率"),
1930
+ choices=["32k", "40k", "48k"],
1931
+ value="40k",
1932
+ interactive=True,
1933
+ )
1934
+ if_f0__ = gr.Radio(
1935
+ label=i18n("模型是否带音高指导,1是0否"),
1936
+ choices=["1", "0"],
1937
+ value="1",
1938
+ interactive=True,
1939
+ )
1940
+ version_1 = gr.Radio(
1941
+ label=i18n("模型版本型号"),
1942
+ choices=["v1", "v2"],
1943
+ value="v2",
1944
+ interactive=True,
1945
+ )
1946
+ info___ = gr.Textbox(
1947
+ label=i18n("要置入的模型信息"), value="", max_lines=8, interactive=True
1948
+ )
1949
+ but9 = gr.Button(i18n("提取"), variant="primary")
1950
+ info7 = gr.Textbox(label=i18n("输出信息"), value="", max_lines=8)
1951
+ ckpt_path2.change(
1952
+ change_info_, [ckpt_path2], [sr__, if_f0__, version_1]
1953
+ )
1954
+ but9.click(
1955
+ extract_small_model,
1956
+ [ckpt_path2, save_name, sr__, if_f0__, info___, version_1],
1957
+ info7,
1958
+ )
1959
+
1960
+ with gr.TabItem(i18n("Onnx导出")):
1961
+ with gr.Row():
1962
+ ckpt_dir = gr.Textbox(label=i18n("RVC模型路径"), value="", interactive=True)
1963
+ with gr.Row():
1964
+ onnx_dir = gr.Textbox(
1965
+ label=i18n("Onnx输出路径"), value="", interactive=True
1966
+ )
1967
+ with gr.Row():
1968
+ infoOnnx = gr.Label(label="info")
1969
+ with gr.Row():
1970
+ butOnnx = gr.Button(i18n("导出Onnx模型"), variant="primary")
1971
+ butOnnx.click(export_onnx, [ckpt_dir, onnx_dir], infoOnnx)
1972
+
1973
+ tab_faq = i18n("常见问题解答")
1974
+ with gr.TabItem(tab_faq):
1975
+ try:
1976
+ if tab_faq == "常见问题解答":
1977
+ with open("docs/faq.md", "r", encoding="utf8") as f:
1978
+ info = f.read()
1979
+ else:
1980
+ with open("docs/faq_en.md", "r", encoding="utf8") as f:
1981
+ info = f.read()
1982
+ gr.Markdown(value=info)
1983
+ except:
1984
+ gr.Markdown(traceback.format_exc())
1985
+
1986
+ # with gr.TabItem(i18n("招募音高曲线前端编辑器")):
1987
+ # gr.Markdown(value=i18n("加开发群联系我xxxxx"))
1988
+ # with gr.TabItem(i18n("点击查看交流、问题反馈群号")):
1989
+ # gr.Markdown(value=i18n("xxxxx"))
1990
+
1991
+ if config.iscolab:
1992
+ app.queue(concurrency_count=511, max_size=1022).launch(share=True)
1993
+ else:
1994
+ app.queue(concurrency_count=511, max_size=1022).launch(
1995
+ server_name="0.0.0.0",
1996
+ inbrowser=not config.noautoopen,
1997
+ server_port=config.listen_port,
1998
+ quiet=True,
1999
+ )
infer_batch_rvc.py ADDED
@@ -0,0 +1,220 @@
1
+ """
2
+ v1
3
+ runtime\python.exe myinfer-v2-0528.py 0 "E:\codes\py39\RVC-beta\todo-songs" "E:\codes\py39\logs\mi-test\added_IVF677_Flat_nprobe_7.index" harvest "E:\codes\py39\RVC-beta\output" "E:\codes\py39\test-20230416b\weights\mi-test.pth" 0.66 cuda:0 True 3 0 1 0.33
4
+ v2
5
+ runtime\python.exe myinfer-v2-0528.py 0 "E:\codes\py39\RVC-beta\todo-songs" "E:\codes\py39\test-20230416b\logs\mi-test-v2\aadded_IVF677_Flat_nprobe_1_v2.index" harvest "E:\codes\py39\RVC-beta\output_v2" "E:\codes\py39\test-20230416b\weights\mi-test-v2.pth" 0.66 cuda:0 True 3 0 1 0.33
6
+ """
7
+ import os, sys, pdb, torch
8
+ os.environ["CUDA_VISIBLE_DEVICES"] = ""
9
+ now_dir = os.getcwd()
10
+ sys.path.append(now_dir)
11
+ import argparse
12
+ import glob
13
+ import sys
14
+ import torch
15
+ import tqdm as tq
16
+ from multiprocessing import cpu_count
17
+
18
+
19
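+ # Minimal runtime config: detects device/precision and derives padding/window sizes from available VRAM.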
+ class Config:
20
+ def __init__(self, device, is_half):
21
+ self.device = device
22
+ self.is_half = is_half
23
+ self.n_cpu = 0
24
+ self.gpu_name = None
25
+ self.gpu_mem = None
26
+ self.x_pad, self.x_query, self.x_center, self.x_max = self.device_config()
27
+
28
+ def device_config(self) -> tuple:
29
+ if torch.cuda.is_available():
30
+ i_device = int(self.device.split(":")[-1])
31
+ self.gpu_name = torch.cuda.get_device_name(i_device)
32
+ if (
33
+ ("16" in self.gpu_name and "V100" not in self.gpu_name.upper())
34
+ or "P40" in self.gpu_name.upper()
35
+ or "1060" in self.gpu_name
36
+ or "1070" in self.gpu_name
37
+ or "1080" in self.gpu_name
38
+ ):
39
+ print("16系/10系显卡和P40强制单精度")
40
+ self.is_half = False
41
+ for config_file in ["32k.json", "40k.json", "48k.json"]:
42
+ with open(f"configs/{config_file}", "r") as f:
43
+ strr = f.read().replace("true", "false")
44
+ with open(f"configs/{config_file}", "w") as f:
45
+ f.write(strr)
46
+ with open("trainset_preprocess_pipeline_print.py", "r") as f:
47
+ strr = f.read().replace("3.7", "3.0")
48
+ with open("trainset_preprocess_pipeline_print.py", "w") as f:
49
+ f.write(strr)
50
+ else:
51
+ self.gpu_name = None
52
+ self.gpu_mem = int(
53
+ torch.cuda.get_device_properties(i_device).total_memory
54
+ / 1024
55
+ / 1024
56
+ / 1024
57
+ + 0.4
58
+ )
59
+ if self.gpu_mem <= 4:
60
+ with open("trainset_preprocess_pipeline_print.py", "r") as f:
61
+ strr = f.read().replace("3.7", "3.0")
62
+ with open("trainset_preprocess_pipeline_print.py", "w") as f:
63
+ f.write(strr)
64
+ elif torch.backends.mps.is_available():
65
+ print("没有发现支持的N卡, 使用MPS进行推理")
66
+ self.device = "mps"
67
+ else:
68
+ print("没有发现支持的N卡, 使用CPU进行推理")
69
+ self.device = "cpu"
70
+ self.is_half = False # half precision is not usable on CPU
71
+
72
+ if self.n_cpu == 0:
73
+ self.n_cpu = cpu_count()
74
+
75
+ if self.is_half:
76
+ # settings for ~6 GB VRAM
77
+ x_pad = 3
78
+ x_query = 10
79
+ x_center = 60
80
+ x_max = 65
81
+ else:
82
+ # settings for ~5 GB VRAM
83
+ x_pad = 1
84
+ x_query = 6
85
+ x_center = 38
86
+ x_max = 41
87
+
88
+ if self.gpu_mem is not None and self.gpu_mem <= 4:
89
+ x_pad = 1
90
+ x_query = 5
91
+ x_center = 30
92
+ x_max = 32
93
+
94
+ return x_pad, x_query, x_center, x_max
95
+
96
+
97
+ f0up_key = sys.argv[1] # transpose in semitones, e.g. -12.0
98
+ input_path = sys.argv[2]
99
+ index_path = "" # index lookup disabled: the index path from argv[3] is captured below but never used
100
+ abc = sys.argv[3]
101
+ f0method = sys.argv[4] # harvest or pm
102
+ opt_path = sys.argv[5]
103
+ model_path = sys.argv[6]
104
+ index_rate = float(sys.argv[7])
105
+ device = sys.argv[8]
106
+ device = None # hard-coded override: the CLI device argument is ignored
107
+ is_half = bool(sys.argv[9]) # note: bool() of any non-empty string is True
108
+ is_half = None # hard-coded override: the CLI is_half argument is ignored
109
+ filter_radius = int(sys.argv[10])
110
+ resample_sr = int(sys.argv[11])
111
+ rms_mix_rate = float(sys.argv[12])
112
+ protect = float(sys.argv[13])
113
+ print(sys.argv)
114
+ config = Config(device, is_half)
115
+ now_dir = os.getcwd()
116
+ sys.path.append(now_dir)
117
+ from vc_infer_pipeline import VC
118
+ from infer_pack.models import (
119
+ SynthesizerTrnMs256NSFsid,
120
+ SynthesizerTrnMs256NSFsid_nono,
121
+ SynthesizerTrnMs768NSFsid,
122
+ SynthesizerTrnMs768NSFsid_nono,
123
+ )
124
+ from my_utils import load_audio
125
+ from fairseq import checkpoint_utils
126
+ from scipy.io import wavfile
127
+
128
+ hubert_model = None
129
+
130
+
131
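+ # Lazily loads the HuBERT content encoder from hubert_base.pt the first time it is needed.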
+ def load_hubert():
132
+ global hubert_model
133
+ models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
134
+ ["hubert_base.pt"],
135
+ suffix="",
136
+ )
137
+ hubert_model = models[0]
138
+ hubert_model = hubert_model.to(device)
139
+ if is_half:
140
+ hubert_model = hubert_model.half()
141
+ else:
142
+ hubert_model = hubert_model.float()
143
+ hubert_model.eval()
144
+
145
+
146
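+ # Single-file conversion: loads audio at 16 kHz and runs the full VC pipeline with the globals set above.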
+ def vc_single(sid, input_audio, f0_up_key, f0_file, f0_method, file_index, index_rate):
147
+ global tgt_sr, net_g, vc, hubert_model, version
148
+ if input_audio is None:
149
+ return "You need to upload an audio", None
150
+ f0_up_key = int(f0_up_key)
151
+ audio = load_audio(input_audio, 16000)
152
+ times = [0, 0, 0]
153
+ if hubert_model is None:
154
+ load_hubert()
155
+ if_f0 = cpt.get("f0", 1)
156
+ # audio_opt=vc.pipeline(hubert_model,net_g,sid,audio,times,f0_up_key,f0_method,file_index,file_big_npy,index_rate,if_f0,f0_file=f0_file)
157
+ audio_opt = vc.pipeline(
158
+ hubert_model,
159
+ net_g,
160
+ sid,
161
+ audio,
162
+ input_audio,
163
+ times,
164
+ f0_up_key,
165
+ f0_method,
166
+ file_index,
167
+ index_rate,
168
+ if_f0,
169
+ filter_radius,
170
+ tgt_sr,
171
+ resample_sr,
172
+ rms_mix_rate,
173
+ version,
174
+ protect,
175
+ f0_file=f0_file,
176
+ )
177
+ print(times)
178
+ return audio_opt
179
+
180
+
181
+ def get_vc(model_path):
182
+ global n_spk, tgt_sr, net_g, vc, cpt, device, is_half, version
183
+ print("loading pth %s" % model_path)
184
+ cpt = torch.load(model_path, map_location="cpu")
185
+ tgt_sr = cpt["config"][-1]
186
+ cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] # n_spk
187
+ if_f0 = cpt.get("f0", 1)
188
+ version = cpt.get("version", "v1")
189
+ if version == "v1":
190
+ if if_f0 == 1:
191
+ net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=is_half)
192
+ else:
193
+ net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
194
+ elif version == "v2":
195
+ if if_f0 == 1: #
196
+ net_g = SynthesizerTrnMs768NSFsid(*cpt["config"], is_half=is_half)
197
+ else:
198
+ net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"])
199
+ del net_g.enc_q
200
+ print(net_g.load_state_dict(cpt["weight"], strict=False)) # oddly, state is not cleaned up properly without this line
201
+ net_g.eval().to(device)
202
+ if is_half:
203
+ net_g = net_g.half()
204
+ else:
205
+ net_g = net_g.float()
206
+ vc = VC(tgt_sr, config)
207
+ n_spk = cpt["config"][-3]
208
+ # return {"visible": True,"maximum": n_spk, "__type__": "update"}
209
+
210
+
211
+ get_vc(model_path)
212
+ audios = os.listdir(input_path)
213
+ for file in tq.tqdm(audios):
214
+ if file.endswith(".wav"):
215
+ file_path = input_path + "/" + file
216
+ wav_opt = vc_single(
217
+ 0, file_path, f0up_key, None, f0method, index_path, index_rate
218
+ )
219
+ out_path = opt_path + "/" + file
220
+ wavfile.write(out_path, tgt_sr, wav_opt)
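
For reference, a hypothetical invocation of the batch script above (the script file name is an assumption; the thirteen positional arguments map to sys.argv[1] through sys.argv[13] in the order parsed above; note that the device and is_half values are read but then overridden to None):

python infer_batch.py 0 ./audios none harvest ./results ./weights/model.pth 0.66 cuda:0 True 3 0 1 0.33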
infer_uvr5.py ADDED
@@ -0,0 +1,364 @@
+ import os, sys, torch, warnings, pdb
+
+ now_dir = os.getcwd()
+ sys.path.append(now_dir)
+ from json import load as ll
+
+ warnings.filterwarnings("ignore")
+ import librosa
+ import importlib
+ import numpy as np
+ import hashlib, math
+ from tqdm import tqdm
+ from uvr5_pack.lib_v5 import spec_utils
+ from uvr5_pack.utils import _get_name_params, inference
+ from uvr5_pack.lib_v5.model_param_init import ModelParameters
+ import soundfile as sf
+ from uvr5_pack.lib_v5.nets_new import CascadedNet
+ from uvr5_pack.lib_v5 import nets_61968KB as nets
+
+
+ class _audio_pre_:
+     def __init__(self, agg, model_path, device, is_half):
+         self.model_path = model_path
+         self.device = device
+         self.data = {
+             # Processing Options
+             "postprocess": False,
+             "tta": False,
+             # Constants
+             "window_size": 512,
+             "agg": agg,
+             "high_end_process": "mirroring",
+         }
+         mp = ModelParameters("uvr5_pack/lib_v5/modelparams/4band_v2.json")
+         model = nets.CascadedASPPNet(mp.param["bins"] * 2)
+         cpk = torch.load(model_path, map_location="cpu")
+         model.load_state_dict(cpk)
+         model.eval()
+         if is_half:
+             model = model.half().to(device)
+         else:
+             model = model.to(device)
+
+         self.mp = mp
+         self.model = model
+
+     def _path_audio_(self, music_file, ins_root=None, vocal_root=None, format="flac"):
+         if ins_root is None and vocal_root is None:
+             return "No save root."
+
+         if ins_root is not None:
+             os.makedirs(ins_root, exist_ok=True)
+         if vocal_root is not None:
+             os.makedirs(vocal_root, exist_ok=True)
+         X_wave, y_wave, X_spec_s, y_spec_s = {}, {}, {}, {}
+         bands_n = len(self.mp.param["band"])
+         # print(bands_n)
+         for d in range(bands_n, 0, -1):
+             bp = self.mp.param["band"][d]
+             if d == bands_n:  # high-end band
+                 (
+                     X_wave[d],
+                     _,
+                 ) = librosa.core.load(
+                     music_file,
+                     bp["sr"],
+                     False,
+                     dtype=np.float32,
+                     res_type=bp["res_type"],
+                 )
+                 if X_wave[d].ndim == 1:
+                     X_wave[d] = np.asfortranarray([X_wave[d], X_wave[d]])
+             else:  # lower bands
+                 X_wave[d] = librosa.core.resample(
+                     X_wave[d + 1],
+                     self.mp.param["band"][d + 1]["sr"],
+                     bp["sr"],
+                     res_type=bp["res_type"],
+                 )
+             # Stft of wave source
+             X_spec_s[d] = spec_utils.wave_to_spectrogram_mt(
+                 X_wave[d],
+                 bp["hl"],
+                 bp["n_fft"],
+                 self.mp.param["mid_side"],
+                 self.mp.param["mid_side_b2"],
+                 self.mp.param["reverse"],
+             )
+             # pdb.set_trace()
+             if d == bands_n and self.data["high_end_process"] != "none":
+                 input_high_end_h = (bp["n_fft"] // 2 - bp["crop_stop"]) + (
+                     self.mp.param["pre_filter_stop"] - self.mp.param["pre_filter_start"]
+                 )
+                 input_high_end = X_spec_s[d][
+                     :, bp["n_fft"] // 2 - input_high_end_h : bp["n_fft"] // 2, :
+                 ]
+
+         X_spec_m = spec_utils.combine_spectrograms(X_spec_s, self.mp)
+         aggressive_set = float(self.data["agg"] / 100)
+         aggressiveness = {
+             "value": aggressive_set,
+             "split_bin": self.mp.param["band"][1]["crop_stop"],
+         }
+         with torch.no_grad():
+             pred, X_mag, X_phase = inference(
+                 X_spec_m, self.device, self.model, aggressiveness, self.data
+             )
+         # Postprocess
+         if self.data["postprocess"]:
+             pred_inv = np.clip(X_mag - pred, 0, np.inf)
+             pred = spec_utils.mask_silence(pred, pred_inv)
+         y_spec_m = pred * X_phase
+         v_spec_m = X_spec_m - y_spec_m
+
+         if ins_root is not None:
+             if self.data["high_end_process"].startswith("mirroring"):
+                 input_high_end_ = spec_utils.mirroring(
+                     self.data["high_end_process"], y_spec_m, input_high_end, self.mp
+                 )
+                 wav_instrument = spec_utils.cmb_spectrogram_to_wave(
+                     y_spec_m, self.mp, input_high_end_h, input_high_end_
+                 )
+             else:
+                 wav_instrument = spec_utils.cmb_spectrogram_to_wave(y_spec_m, self.mp)
+             print("instruments done")
+             if format in ["wav", "flac"]:
+                 sf.write(
+                     os.path.join(
+                         ins_root,
+                         "instrument.{}".format(format),
+                     ),
+                     (np.array(wav_instrument) * 32768).astype("int16"),
+                     self.mp.param["sr"],
+                 )
+             else:
+                 path = os.path.join(ins_root, "instrument.wav")
+                 sf.write(
+                     path,
+                     (np.array(wav_instrument) * 32768).astype("int16"),
+                     self.mp.param["sr"],
+                 )
+                 if os.path.exists(path):
+                     os.system(
+                         "ffmpeg -i %s -vn %s -q:a 2 -y"
+                         % (path, path[:-4] + ".%s" % format)
+                     )
+         if vocal_root is not None:
+             if self.data["high_end_process"].startswith("mirroring"):
+                 input_high_end_ = spec_utils.mirroring(
+                     self.data["high_end_process"], v_spec_m, input_high_end, self.mp
+                 )
+                 wav_vocals = spec_utils.cmb_spectrogram_to_wave(
+                     v_spec_m, self.mp, input_high_end_h, input_high_end_
+                 )
+             else:
+                 wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp)
+             print("vocals done")
+             if format in ["wav", "flac"]:
+                 sf.write(
+                     os.path.join(
+                         vocal_root,
+                         "vocal.{}".format(format),
+                     ),
+                     (np.array(wav_vocals) * 32768).astype("int16"),
+                     self.mp.param["sr"],
+                 )
+             else:
+                 path = os.path.join(vocal_root, "vocal.wav")
+                 sf.write(
+                     path,
+                     (np.array(wav_vocals) * 32768).astype("int16"),
+                     self.mp.param["sr"],
+                 )
+                 if os.path.exists(path):
+                     os.system(
+                         "ffmpeg -i %s -vn %s -q:a 2 -y"
+                         % (path, path[:-4] + ".%s" % format)
+                     )
+
+
+ class _audio_pre_new:
+     def __init__(self, agg, model_path, device, is_half):
+         print("_audio_pre_new")
+         self.model_path = model_path
+         self.device = device
+         self.data = {
+             # Processing Options
+             "postprocess": False,
+             "tta": False,
+             # Constants
+             "window_size": 512,
+             "agg": agg,
+             "high_end_process": "mirroring",
+         }
+         mp = ModelParameters("uvr5_pack/lib_v5/modelparams/4band_v3.json")
+         nout = 64 if "DeReverb" in model_path else 48
+         model = CascadedNet(mp.param["bins"] * 2, nout)
+         cpk = torch.load(model_path, map_location="cpu")
+         model.load_state_dict(cpk)
+         model.eval()
+         if is_half:
+             model = model.half().to(device)
+         else:
+             model = model.to(device)
+
+         self.mp = mp
+         self.model = model
+
+     def _path_audio_(
+         self, music_file, vocal_root=None, ins_root=None, format="flac"
+     ):  # for the three VR models, vocal and ins are swapped
+         if ins_root is None and vocal_root is None:
+             return "No save root."
+         name = os.path.basename(music_file)
+         if ins_root is not None:
+             os.makedirs(ins_root, exist_ok=True)
+         if vocal_root is not None:
+             os.makedirs(vocal_root, exist_ok=True)
+         X_wave, y_wave, X_spec_s, y_spec_s = {}, {}, {}, {}
+         bands_n = len(self.mp.param["band"])
+         # print(bands_n)
+         for d in range(bands_n, 0, -1):
+             bp = self.mp.param["band"][d]
+             if d == bands_n:  # high-end band
+                 (
+                     X_wave[d],
+                     _,
+                 ) = librosa.core.load(  # in theory librosa may misread some audio; reading via ffmpeg would be safer, but it was dropped as too much hassle
+                     music_file,
+                     bp["sr"],
+                     False,
+                     dtype=np.float32,
+                     res_type=bp["res_type"],
+                 )
+                 if X_wave[d].ndim == 1:
+                     X_wave[d] = np.asfortranarray([X_wave[d], X_wave[d]])
+             else:  # lower bands
+                 X_wave[d] = librosa.core.resample(
+                     X_wave[d + 1],
+                     self.mp.param["band"][d + 1]["sr"],
+                     bp["sr"],
+                     res_type=bp["res_type"],
+                 )
+             # Stft of wave source
+             X_spec_s[d] = spec_utils.wave_to_spectrogram_mt(
+                 X_wave[d],
+                 bp["hl"],
+                 bp["n_fft"],
+                 self.mp.param["mid_side"],
+                 self.mp.param["mid_side_b2"],
+                 self.mp.param["reverse"],
+             )
+             # pdb.set_trace()
+             if d == bands_n and self.data["high_end_process"] != "none":
+                 input_high_end_h = (bp["n_fft"] // 2 - bp["crop_stop"]) + (
+                     self.mp.param["pre_filter_stop"] - self.mp.param["pre_filter_start"]
+                 )
+                 input_high_end = X_spec_s[d][
+                     :, bp["n_fft"] // 2 - input_high_end_h : bp["n_fft"] // 2, :
+                 ]
+
+         X_spec_m = spec_utils.combine_spectrograms(X_spec_s, self.mp)
+         aggressive_set = float(self.data["agg"] / 100)
+         aggressiveness = {
+             "value": aggressive_set,
+             "split_bin": self.mp.param["band"][1]["crop_stop"],
+         }
+         with torch.no_grad():
+             pred, X_mag, X_phase = inference(
+                 X_spec_m, self.device, self.model, aggressiveness, self.data
+             )
+         # Postprocess
+         if self.data["postprocess"]:
+             pred_inv = np.clip(X_mag - pred, 0, np.inf)
+             pred = spec_utils.mask_silence(pred, pred_inv)
+         y_spec_m = pred * X_phase
+         v_spec_m = X_spec_m - y_spec_m
+
+         if ins_root is not None:
+             if self.data["high_end_process"].startswith("mirroring"):
+                 input_high_end_ = spec_utils.mirroring(
+                     self.data["high_end_process"], y_spec_m, input_high_end, self.mp
+                 )
+                 wav_instrument = spec_utils.cmb_spectrogram_to_wave(
+                     y_spec_m, self.mp, input_high_end_h, input_high_end_
+                 )
+             else:
+                 wav_instrument = spec_utils.cmb_spectrogram_to_wave(y_spec_m, self.mp)
+             print("%s instruments done" % name)
+             if format in ["wav", "flac"]:
+                 sf.write(
+                     os.path.join(
+                         ins_root,
+                         "instrument_{}_{}.{}".format(name, self.data["agg"], format),
+                     ),
+                     (np.array(wav_instrument) * 32768).astype("int16"),
+                     self.mp.param["sr"],
+                 )
+             else:
+                 path = os.path.join(
+                     ins_root, "instrument_{}_{}.wav".format(name, self.data["agg"])
+                 )
+                 sf.write(
+                     path,
+                     (np.array(wav_instrument) * 32768).astype("int16"),
+                     self.mp.param["sr"],
+                 )
+                 if os.path.exists(path):
+                     os.system(
+                         "ffmpeg -i %s -vn %s -q:a 2 -y"
+                         % (path, path[:-4] + ".%s" % format)
+                     )
+         if vocal_root is not None:
+             if self.data["high_end_process"].startswith("mirroring"):
+                 input_high_end_ = spec_utils.mirroring(
+                     self.data["high_end_process"], v_spec_m, input_high_end, self.mp
+                 )
+                 wav_vocals = spec_utils.cmb_spectrogram_to_wave(
+                     v_spec_m, self.mp, input_high_end_h, input_high_end_
+                 )
+             else:
+                 wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp)
+             print("%s vocals done" % name)
+             if format in ["wav", "flac"]:
+                 sf.write(
+                     os.path.join(
+                         vocal_root,
+                         "vocal_{}_{}.{}".format(name, self.data["agg"], format),
+                     ),
+                     (np.array(wav_vocals) * 32768).astype("int16"),
+                     self.mp.param["sr"],
+                 )
+             else:
+                 path = os.path.join(
+                     vocal_root, "vocal_{}_{}.wav".format(name, self.data["agg"])
+                 )
+                 sf.write(
+                     path,
+                     (np.array(wav_vocals) * 32768).astype("int16"),
+                     self.mp.param["sr"],
+                 )
+                 if os.path.exists(path):
+                     os.system(
+                         "ffmpeg -i %s -vn %s -q:a 2 -y"
+                         % (path, path[:-4] + ".%s" % format)
+                     )
+
+
+ if __name__ == "__main__":
+     device = "cuda"
+     is_half = True
+     # model_path = "uvr5_weights/2_HP-UVR.pth"
+     # model_path = "uvr5_weights/VR-DeEchoDeReverb.pth"
+     # model_path = "uvr5_weights/VR-DeEchoNormal.pth"
+     model_path = "models/mymodelimran.pth"
+     # pre_fun = _audio_pre_(model_path=model_path, device=device, is_half=True, agg=10)
+     pre_fun = _audio_pre_new(model_path=model_path, device=device, is_half=is_half, agg=10)
+     audio_path = "audios/abc.mp3"
+     save_path = "results"
+     pre_fun._path_audio_(audio_path, save_path, save_path)
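
A minimal separation sketch using the classes above; the paths are assumptions, and the model file is the 2_HP-UVR.pth weight referenced in the commented-out lines of the __main__ block. For the DeEcho/DeReverb models, _audio_pre_new must be used instead, and its vocal_root/ins_root arguments are swapped relative to _audio_pre_:

pre_fun = _audio_pre_(agg=10, model_path="uvr5_weights/2_HP-UVR.pth", device="cuda", is_half=True)
pre_fun._path_audio_("audios/song.wav", ins_root="results/instrument", vocal_root="results/vocal", format="wav")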
local.settings.json ADDED
@@ -0,0 +1,8 @@
+ {
+   "IsEncrypted": false,
+   "Values": {
+     "AzureWebJobsStorage": "",
+     "FUNCTIONS_WORKER_RUNTIME": "python",
+     "AzureWebJobs.AltVoiceClone.Disabled": "true"
+   }
+ }
main.py ADDED
@@ -0,0 +1,41 @@
+ import functions_framework
+ import os
+ import logging
+
+
+ @functions_framework.http
+ def hello_http(request):
+     request_args = request.args
+
+     # Extract parameters from request
+     audio_file = request_args.get('audio_file')
+     model_name = request_args.get('model_name')
+     transform = request_args.get('transform')
+     song = request_args.get('song')
+
+     # Check if any parameter is None
+     if any(param is None for param in [audio_file, model_name, transform, song]):
+         return "Please provide all the required arguments: audio_file, model_name, transform, song."
+     else:
+         is_song = song.lower() == "true"
+         transform = int(transform)
+         import RVC_class
+
+         # Create an instance of VoiceConverter
+         converter = RVC_class.VoiceConverter()
+
+         # Call single_run method and get the result
+         result = converter.single_run(audio_file, model_name, transform, is_song)
+
+         return result
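
A minimal sketch of calling the deployed function over HTTP; the host URL and parameter values are assumptions, and only the query-string keys come from the handler above:

import requests

resp = requests.get(
    "https://<function-host>/hello_http",  # hypothetical endpoint
    params={
        "audio_file": "audios/abc.wav",
        "model_name": "mymodel",
        "transform": 0,
        "song": "true",
    },
)
print(resp.text)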
my_utils.py ADDED
@@ -0,0 +1,35 @@
+ import ffmpeg
+ import numpy as np
+ import requests
+ import logging
+ import wave
+ import librosa
+
+
+ def load_audio(file_path, sr):
+     # Reads a PCM WAV file with the standard-library wave module.
+     # Assumptions baked in: 16-bit samples (np.int16) and mono input
+     # (multi-channel frames would stay interleaved).
+     try:
+         with wave.open(file_path, 'rb') as audio_file:
+             channels = audio_file.getnchannels()
+             sample_width = audio_file.getsampwidth()
+             frame_rate = audio_file.getframerate()
+             frames = audio_file.readframes(audio_file.getnframes())
+
+             audio_data = np.frombuffer(frames, dtype=np.int16)
+             # Convert to float and normalize to the range [-1, 1]
+             audio_data = audio_data.astype(np.float32) / np.iinfo(np.int16).max
+
+             # Resample the audio if the sample rate is different
+             if frame_rate != sr:
+                 audio_data = librosa.resample(audio_data, orig_sr=frame_rate, target_sr=sr)
+
+             # Perform any required audio processing or conversion
+             # ...
+
+     except Exception as e:
+         raise RuntimeError(f"Failed to load audio: {e}")
+
+     return audio_data
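
A minimal usage sketch (the file path is an assumption): load_audio returns a 1-D float32 array normalized to [-1, 1], resampled to the 16 kHz rate the inference pipeline expects.

from my_utils import load_audio

audio = load_audio("audios/sample.wav", 16000)
print(audio.shape, audio.dtype)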
onnx_inference_demo.py ADDED
@@ -0,0 +1,20 @@
+ import soundfile
+ from infer_pack.onnx_inference import OnnxRVC
+
+ hop_size = 512
+ sampling_rate = 40000  # sampling rate
+ f0_up_key = 0  # pitch shift in semitones
+ sid = 0  # speaker ID
+ f0_method = "dio"  # F0 extraction algorithm
+ model_path = "ShirohaRVC.onnx"  # full path to the model
+ vec_name = "vec-256-layer-9"  # expanded internally to f"pretrained/{vec_name}.onnx"; an ONNX vec model is required
+ wav_path = "123.wav"  # input path or BytesIO instance
+ out_path = "out.wav"  # output path or BytesIO instance
+
+ model = OnnxRVC(
+     model_path, vec_path=vec_name, sr=sampling_rate, hop_size=hop_size, device="cuda"
+ )
+
+ audio = model.inference(wav_path, sid, f0_method=f0_method, f0_up_key=f0_up_key)
+
+ soundfile.write(out_path, audio, sampling_rate)
poetry.lock ADDED
The diff for this file is too large to render. See raw diff
 
pyproject.toml ADDED
@@ -0,0 +1,61 @@
+ [tool.poetry]
+ name = "rvc-beta"
+ version = "0.1.0"
+ description = ""
+ authors = ["lj1995"]
+ license = "MIT"
+
+ [tool.poetry.dependencies]
+ python = "^3.8"
+ torch = "^2.0.0"
+ torchaudio = "^2.0.1"
+ Cython = "^0.29.34"
+ gradio = "^3.24.1"
+ future = "^0.18.3"
+ pydub = "^0.25.1"
+ soundfile = "^0.12.1"
+ ffmpeg-python = "^0.2.0"
+ tensorboardX = "^2.6"
+ functorch = "^2.0.0"
+ fairseq = "^0.12.2"
+ faiss-cpu = "^1.7.2"
+ Jinja2 = "^3.1.2"
+ json5 = "^0.9.11"
+ librosa = "0.9.2"
+ llvmlite = "0.39.0"
+ Markdown = "^3.4.3"
+ matplotlib = "^3.7.1"
+ matplotlib-inline = "^0.1.6"
+ numba = "0.56.4"
+ numpy = "1.23.5"
+ scipy = "1.9.3"
+ praat-parselmouth = "^0.4.3"
+ Pillow = "9.3.0"
+ pyworld = "^0.3.2"
+ resampy = "^0.4.2"
+ scikit-learn = "^1.2.2"
+ starlette = "^0.27.0"
+ tensorboard = "^2.12.1"
+ tensorboard-data-server = "^0.7.0"
+ tensorboard-plugin-wit = "^1.8.1"
+ torchgen = "^0.0.1"
+ tqdm = "^4.65.0"
+ tornado = "^6.3"
+ Werkzeug = "^2.2.3"
+ uc-micro-py = "^1.0.1"
+ sympy = "^1.11.1"
+ tabulate = "^0.9.0"
+ PyYAML = "^6.0"
+ pyasn1 = "^0.4.8"
+ pyasn1-modules = "^0.2.8"
+ fsspec = "^2023.3.0"
+ absl-py = "^1.4.0"
+ audioread = "^3.0.0"
+ uvicorn = "^0.21.1"
+ colorama = "^0.4.6"
+
+ [tool.poetry.dev-dependencies]
+
+ [build-system]
+ requires = ["poetry-core>=1.0.0"]
+ build-backend = "poetry.core.masonry.api"
requirements-win-for-realtime_vc_gui.txt ADDED
@@ -0,0 +1,28 @@
+ # 1. Install torch from pytorch.org:
+ # torch 2.0 with cuda 11.8
+ # pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
+ # torch 1.11.0 with cuda 11.3
+ # pip install torch==1.11.0+cu113 torchvision==0.12.0+cu113 torchaudio==0.11.0 --extra-index-url https://download.pytorch.org/whl/cu113
+ einops
+ fairseq
+ flask
+ flask_cors
+ gin
+ gin_config
+ librosa
+ local_attention
+ matplotlib
+ praat-parselmouth
+ pyworld
+ PyYAML
+ resampy
+ scikit_learn
+ scipy
+ SoundFile
+ tensorboard
+ tqdm
+ wave
+ PySimpleGUI
+ sounddevice
+ gradio
+ noisereduce
requirements.txt ADDED
@@ -0,0 +1,55 @@
+ # DO NOT include azure-functions-worker in this file
+ # The Python Worker is managed by Azure Functions platform
+ # Manually managing azure-functions-worker may cause unexpected issues
+
+ functions-framework
+ joblib>=1.1.0
+ numba==0.56.4
+ numpy==1.23.5
+ scipy==1.9.3
+ librosa==0.9.1
+ llvmlite==0.39.0
+ fairseq==0.12.2
+ faiss-cpu==1.7.3
+ gradio==3.14.0
+ Cython
+ pydub>=0.25.1
+ soundfile>=0.12.1
+ ffmpeg-python>=0.2.0
+ tensorboardX
+ Jinja2>=3.1.2
+ json5
+ Markdown
+ matplotlib>=3.7.0
+ matplotlib-inline>=0.1.3
+ praat-parselmouth>=0.4.2
+ Pillow>=9.1.1
+ resampy>=0.4.2
+ scikit-learn
+ starlette>=0.25.0
+ tensorboard
+ tensorboard-data-server
+ tensorboard-plugin-wit
+ torchgen>=0.0.1
+ torch==2.0.0
+ tqdm>=4.63.1
+ tornado>=6.1
+ Werkzeug>=2.2.3
+ uc-micro-py>=1.0.1
+ sympy>=1.11.1
+ tabulate>=0.8.10
+ PyYAML>=6.0
+ pyasn1>=0.4.8
+ pyasn1-modules>=0.2.8
+ fsspec>=2022.11.0
+ absl-py>=1.2.0
+ audioread
+ uvicorn>=0.21.1
+ colorama>=0.4.5
+ pyworld>=0.3.2
+ httpx==0.23.0
+ onnxruntime-gpu
+ torchcrepe==0.0.20
+ wave
+ requests
slicer2.py ADDED
@@ -0,0 +1,260 @@
+ import numpy as np
+
+
+ # This function is obtained from librosa.
+ def get_rms(
+     y,
+     frame_length=2048,
+     hop_length=512,
+     pad_mode="constant",
+ ):
+     padding = (int(frame_length // 2), int(frame_length // 2))
+     y = np.pad(y, padding, mode=pad_mode)
+
+     axis = -1
+     # put our new within-frame axis at the end for now
+     out_strides = y.strides + tuple([y.strides[axis]])
+     # Reduce the shape on the framing axis
+     x_shape_trimmed = list(y.shape)
+     x_shape_trimmed[axis] -= frame_length - 1
+     out_shape = tuple(x_shape_trimmed) + tuple([frame_length])
+     xw = np.lib.stride_tricks.as_strided(y, shape=out_shape, strides=out_strides)
+     if axis < 0:
+         target_axis = axis - 1
+     else:
+         target_axis = axis + 1
+     xw = np.moveaxis(xw, -1, target_axis)
+     # Downsample along the target axis
+     slices = [slice(None)] * xw.ndim
+     slices[axis] = slice(0, None, hop_length)
+     x = xw[tuple(slices)]
+
+     # Calculate power
+     power = np.mean(np.abs(x) ** 2, axis=-2, keepdims=True)
+
+     return np.sqrt(power)
+
+
+ class Slicer:
+     def __init__(
+         self,
+         sr: int,
+         threshold: float = -40.0,
+         min_length: int = 5000,
+         min_interval: int = 300,
+         hop_size: int = 20,
+         max_sil_kept: int = 5000,
+     ):
+         if not min_length >= min_interval >= hop_size:
+             raise ValueError(
+                 "The following condition must be satisfied: min_length >= min_interval >= hop_size"
+             )
+         if not max_sil_kept >= hop_size:
+             raise ValueError(
+                 "The following condition must be satisfied: max_sil_kept >= hop_size"
+             )
+         min_interval = sr * min_interval / 1000
+         self.threshold = 10 ** (threshold / 20.0)
+         self.hop_size = round(sr * hop_size / 1000)
+         self.win_size = min(round(min_interval), 4 * self.hop_size)
+         self.min_length = round(sr * min_length / 1000 / self.hop_size)
+         self.min_interval = round(min_interval / self.hop_size)
+         self.max_sil_kept = round(sr * max_sil_kept / 1000 / self.hop_size)
+
+     def _apply_slice(self, waveform, begin, end):
+         if len(waveform.shape) > 1:
+             return waveform[
+                 :, begin * self.hop_size : min(waveform.shape[1], end * self.hop_size)
+             ]
+         else:
+             return waveform[
+                 begin * self.hop_size : min(waveform.shape[0], end * self.hop_size)
+             ]
+
+     # @timeit
+     def slice(self, waveform):
+         if len(waveform.shape) > 1:
+             samples = waveform.mean(axis=0)
+         else:
+             samples = waveform
+         if samples.shape[0] <= self.min_length:
+             return [waveform]
+         rms_list = get_rms(
+             y=samples, frame_length=self.win_size, hop_length=self.hop_size
+         ).squeeze(0)
+         sil_tags = []
+         silence_start = None
+         clip_start = 0
+         for i, rms in enumerate(rms_list):
+             # Keep looping while frame is silent.
+             if rms < self.threshold:
+                 # Record start of silent frames.
+                 if silence_start is None:
+                     silence_start = i
+                 continue
+             # Keep looping while frame is not silent and silence start has not been recorded.
+             if silence_start is None:
+                 continue
+             # Clear recorded silence start if interval is not enough or clip is too short
+             is_leading_silence = silence_start == 0 and i > self.max_sil_kept
+             need_slice_middle = (
+                 i - silence_start >= self.min_interval
+                 and i - clip_start >= self.min_length
+             )
+             if not is_leading_silence and not need_slice_middle:
+                 silence_start = None
+                 continue
+             # Need slicing. Record the range of silent frames to be removed.
+             if i - silence_start <= self.max_sil_kept:
+                 pos = rms_list[silence_start : i + 1].argmin() + silence_start
+                 if silence_start == 0:
+                     sil_tags.append((0, pos))
+                 else:
+                     sil_tags.append((pos, pos))
+                 clip_start = pos
+             elif i - silence_start <= self.max_sil_kept * 2:
+                 pos = rms_list[
+                     i - self.max_sil_kept : silence_start + self.max_sil_kept + 1
+                 ].argmin()
+                 pos += i - self.max_sil_kept
+                 pos_l = (
+                     rms_list[
+                         silence_start : silence_start + self.max_sil_kept + 1
+                     ].argmin()
+                     + silence_start
+                 )
+                 pos_r = (
+                     rms_list[i - self.max_sil_kept : i + 1].argmin()
+                     + i
+                     - self.max_sil_kept
+                 )
+                 if silence_start == 0:
+                     sil_tags.append((0, pos_r))
+                     clip_start = pos_r
+                 else:
+                     sil_tags.append((min(pos_l, pos), max(pos_r, pos)))
+                     clip_start = max(pos_r, pos)
+             else:
+                 pos_l = (
+                     rms_list[
+                         silence_start : silence_start + self.max_sil_kept + 1
+                     ].argmin()
+                     + silence_start
+                 )
+                 pos_r = (
+                     rms_list[i - self.max_sil_kept : i + 1].argmin()
+                     + i
+                     - self.max_sil_kept
+                 )
+                 if silence_start == 0:
+                     sil_tags.append((0, pos_r))
+                 else:
+                     sil_tags.append((pos_l, pos_r))
+                 clip_start = pos_r
+             silence_start = None
+         # Deal with trailing silence.
+         total_frames = rms_list.shape[0]
+         if (
+             silence_start is not None
+             and total_frames - silence_start >= self.min_interval
+         ):
+             silence_end = min(total_frames, silence_start + self.max_sil_kept)
+             pos = rms_list[silence_start : silence_end + 1].argmin() + silence_start
+             sil_tags.append((pos, total_frames + 1))
+         # Apply and return slices.
+         if len(sil_tags) == 0:
+             return [waveform]
+         else:
+             chunks = []
+             if sil_tags[0][0] > 0:
+                 chunks.append(self._apply_slice(waveform, 0, sil_tags[0][0]))
+             for i in range(len(sil_tags) - 1):
+                 chunks.append(
+                     self._apply_slice(waveform, sil_tags[i][1], sil_tags[i + 1][0])
+                 )
+             if sil_tags[-1][1] < total_frames:
+                 chunks.append(
+                     self._apply_slice(waveform, sil_tags[-1][1], total_frames)
+                 )
+             return chunks
+
+
+ def main():
+     import os.path
+     from argparse import ArgumentParser
+
+     import librosa
+     import soundfile
+
+     parser = ArgumentParser()
+     parser.add_argument("audio", type=str, help="The audio to be sliced")
+     parser.add_argument(
+         "--out", type=str, help="Output directory of the sliced audio clips"
+     )
+     parser.add_argument(
+         "--db_thresh",
+         type=float,
+         required=False,
+         default=-40,
+         help="The dB threshold for silence detection",
+     )
+     parser.add_argument(
+         "--min_length",
+         type=int,
+         required=False,
+         default=5000,
+         help="The minimum milliseconds required for each sliced audio clip",
+     )
+     parser.add_argument(
+         "--min_interval",
+         type=int,
+         required=False,
+         default=300,
+         help="The minimum milliseconds for a silence part to be sliced",
+     )
+     parser.add_argument(
+         "--hop_size",
+         type=int,
+         required=False,
+         default=10,
+         help="Frame length in milliseconds",
+     )
+     parser.add_argument(
+         "--max_sil_kept",
+         type=int,
+         required=False,
+         default=500,
+         help="The maximum silence length kept around the sliced clip, presented in milliseconds",
+     )
+     args = parser.parse_args()
+     out = args.out
+     if out is None:
+         out = os.path.dirname(os.path.abspath(args.audio))
+     audio, sr = librosa.load(args.audio, sr=None, mono=False)
+     slicer = Slicer(
+         sr=sr,
+         threshold=args.db_thresh,
+         min_length=args.min_length,
+         min_interval=args.min_interval,
+         hop_size=args.hop_size,
+         max_sil_kept=args.max_sil_kept,
+     )
+     chunks = slicer.slice(audio)
+     if not os.path.exists(out):
+         os.makedirs(out)
+     for i, chunk in enumerate(chunks):
+         if len(chunk.shape) > 1:
+             chunk = chunk.T
+         soundfile.write(
+             os.path.join(
+                 out,
+                 "%s_%d.wav"
+                 % (os.path.basename(args.audio).rsplit(".", maxsplit=1)[0], i),
+             ),
+             chunk,
+             sr,
+         )
+
+
+ if __name__ == "__main__":
+     main()
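
A minimal programmatic sketch mirroring what main() does; the input file name is an assumption:

import librosa
import soundfile

audio, sr = librosa.load("vocals.wav", sr=None, mono=False)
slicer = Slicer(sr=sr, threshold=-40.0, min_length=5000, min_interval=300,
                hop_size=10, max_sil_kept=500)
for i, chunk in enumerate(slicer.slice(audio)):
    if len(chunk.shape) > 1:
        chunk = chunk.T  # soundfile expects (frames, channels)
    soundfile.write("chunk_%d.wav" % i, chunk, sr)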
temp.py ADDED
@@ -0,0 +1,1392 @@
1
+ import os
2
+ import shutil
3
+ import sys
4
+ import tempfile
5
+ os.environ["CUDA_VISIBLE_DEVICES"] = ""
6
+ now_dir = os.getcwd()
7
+ sys.path.append(now_dir)
8
+ import traceback, pdb
9
+ import warnings
10
+
11
+ import numpy as np
12
+ import torch
13
+
14
+ os.environ['OPENBLAS_NUM_THREADS'] = '1'
15
+ os.environ["no_proxy"] = "localhost, 127.0.0.1, ::1"
16
+ import logging
17
+ import threading
18
+ from random import shuffle
19
+ from subprocess import Popen
20
+ from time import sleep
21
+
22
+ import faiss
23
+ import ffmpeg
24
+ import gradio as gr
25
+ import soundfile as sf
26
+ from config import Config
27
+ from fairseq import checkpoint_utils
28
+ from i18n import I18nAuto
29
+ from infer_pack.models import (
30
+ SynthesizerTrnMs256NSFsid,
31
+ SynthesizerTrnMs256NSFsid_nono,
32
+ SynthesizerTrnMs768NSFsid,
33
+ SynthesizerTrnMs768NSFsid_nono,
34
+ )
35
+ from infer_pack.models_onnx import SynthesizerTrnMsNSFsidM
36
+ from infer_uvr5 import _audio_pre_, _audio_pre_new
37
+ from MDXNet import MDXNetDereverb
38
+ from my_utils import load_audio
39
+ from train.process_ckpt import change_info, extract_small_model, merge, show_info
40
+ from vc_infer_pipeline import VC
41
+ from sklearn.cluster import MiniBatchKMeans
42
+
43
+ logging.getLogger("numba").setLevel(logging.WARNING)
44
+
45
+
46
+ tmp = os.path.join(now_dir, "TEMP")
47
+ shutil.rmtree(tmp, ignore_errors=True)
48
+ shutil.rmtree("%s/runtime/Lib/site-packages/infer_pack" % (now_dir), ignore_errors=True)
49
+ shutil.rmtree("%s/runtime/Lib/site-packages/uvr5_pack" % (now_dir), ignore_errors=True)
50
+ os.makedirs(tmp, exist_ok=True)
51
+ os.makedirs(os.path.join(now_dir, "logs"), exist_ok=True)
52
+ os.makedirs(os.path.join(now_dir, "weights"), exist_ok=True)
53
+ os.environ["TEMP"] = tmp
54
+ warnings.filterwarnings("ignore")
55
+ torch.manual_seed(114514)
56
+ from scipy.io import wavfile
57
+
58
+ config = Config()
59
+ i18n = I18nAuto()
60
+ i18n.print()
61
+ # 判断是否有能用来训练和加速推理的N卡
62
+ ngpu = torch.cuda.device_count()
63
+ gpu_infos = []
64
+ mem = []
65
+ if_gpu_ok = False
66
+
67
+ if torch.cuda.is_available() or ngpu != 0:
68
+ for i in range(ngpu):
69
+ gpu_name = torch.cuda.get_device_name(i)
70
+ if any(
71
+ value in gpu_name.upper()
72
+ for value in [
73
+ "10",
74
+ "16",
75
+ "20",
76
+ "30",
77
+ "40",
78
+ "A2",
79
+ "A3",
80
+ "A4",
81
+ "P4",
82
+ "A50",
83
+ "500",
84
+ "A60",
85
+ "70",
86
+ "80",
87
+ "90",
88
+ "M4",
89
+ "T4",
90
+ "TITAN",
91
+ ]
92
+ ):
93
+ # A10#A100#V100#A40#P40#M40#K80#A4500
94
+ if_gpu_ok = True # 至少有一张能用的N卡
95
+ gpu_infos.append("%s\t%s" % (i, gpu_name))
96
+ mem.append(
97
+ int(
98
+ torch.cuda.get_device_properties(i).total_memory
99
+ / 1024
100
+ / 1024
101
+ / 1024
102
+ + 0.4
103
+ )
104
+ )
105
+ if if_gpu_ok and len(gpu_infos) > 0:
106
+ gpu_info = "\n".join(gpu_infos)
107
+ default_batch_size = 1
108
+ else:
109
+ gpu_info = i18n("很遗憾您这没有能用的显卡来支持您训练")
110
+ default_batch_size = 1
111
+ gpus = "-".join([i[0] for i in gpu_infos])
112
+
113
+
114
+ class ToolButton(gr.Button, gr.components.FormComponent):
115
+ """Small button with single emoji as text, fits inside gradio forms"""
116
+
117
+ def __init__(self, **kwargs):
118
+ super().__init__(variant="tool", **kwargs)
119
+
120
+ def get_block_name(self):
121
+ return "button"
122
+
123
+
124
+ hubert_model = None
125
+
126
+
127
+ def load_hubert():
128
+ global hubert_model
129
+ models, _, _ = checkpoint_utils.load_model_ensemble_and_task(
130
+ ["hubert_base.pt"],
131
+ suffix="",
132
+ )
133
+ hubert_model = models[0]
134
+ hubert_model = hubert_model.to(config.device)
135
+ if config.is_half:
136
+ hubert_model = hubert_model.half()
137
+ else:
138
+ hubert_model = hubert_model.float()
139
+ hubert_model.eval()
140
+
141
+
142
+ weight_root = "weights"
143
+ weight_uvr5_root = "uvr5_weights"
144
+ index_root = "logs"
145
+ names = []
146
+ for name in os.listdir(weight_root):
147
+ if name.endswith(".pth"):
148
+ names.append(name)
149
+ index_paths = []
150
+ for root, dirs, files in os.walk(index_root, topdown=False):
151
+ for name in files:
152
+ if name.endswith(".index") and "trained" not in name:
153
+ index_paths.append("%s/%s" % (root, name))
154
+ uvr5_names = []
155
+ for name in os.listdir(weight_uvr5_root):
156
+ if name.endswith(".pth") or "onnx" in name:
157
+ uvr5_names.append(name.replace(".pth", ""))
158
+
159
+
160
+ def vc_single(
161
+ sid,
162
+ input_audio_path,
163
+ f0_up_key,
164
+ f0_file,
165
+ f0_method,
166
+ file_index,
167
+ file_index2,
168
+ # file_big_npy,
169
+ index_rate,
170
+ filter_radius,
171
+ resample_sr,
172
+ rms_mix_rate,
173
+ protect,
174
+ ): # spk_item, input_audio0, vc_transform0,f0_file,f0method0
175
+ global tgt_sr, net_g, vc, hubert_model, version
176
+ print(f0_up_key)
177
+
178
+ if input_audio_path is None:
179
+ return "You need to upload an audio", None
180
+ print("input_audio_path: ", input_audio_path)
181
+ print("f0_up_key: ", f0_up_key)
182
+ f0_up_key = int(f0_up_key)
183
+ try:
184
+ audio = load_audio(input_audio_path, 16000)
185
+ audio_max = np.abs(audio).max() / 0.95
186
+ if audio_max > 1:
187
+ audio /= audio_max
188
+ times = [0, 0, 0]
189
+ if not hubert_model:
190
+ load_hubert()
191
+ if_f0 = cpt.get("f0", 1)
192
+ file_index = (
193
+ (
194
+ file_index.strip(" ")
195
+ .strip('"')
196
+ .strip("\n")
197
+ .strip('"')
198
+ .strip(" ")
199
+ .replace("trained", "added")
200
+ )
201
+ if file_index != ""
202
+ else file_index2
203
+ ) # 防止小白写错,自动帮他替换掉
204
+ # file_big_npy = (
205
+ # file_big_npy.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
206
+ # )
207
+ audio_opt = vc.pipeline(
208
+ hubert_model,
209
+ net_g,
210
+ sid,
211
+ audio,
212
+ input_audio_path,
213
+ times,
214
+ f0_up_key,
215
+ f0_method,
216
+ file_index,
217
+ # file_big_npy,
218
+ index_rate,
219
+ if_f0,
220
+ filter_radius,
221
+ tgt_sr,
222
+ resample_sr,
223
+ rms_mix_rate,
224
+ version,
225
+ protect,
226
+ f0_file=f0_file,
227
+ )
228
+ print(f0_up_key)
229
+
230
+ if tgt_sr != resample_sr >= 16000:
231
+ tgt_sr = resample_sr
232
+ index_info = (
233
+ "Using index:%s." % file_index
234
+ if os.path.exists(file_index)
235
+ else "Index not used."
236
+ )
237
+ return "Success.\n %s\nTime:\n npy:%ss, f0:%ss, infer:%ss" % (
238
+ index_info,
239
+ times[0],
240
+ times[1],
241
+ times[2],
242
+ ), (tgt_sr, audio_opt)
243
+ except:
244
+ info = traceback.format_exc()
245
+ print(info)
246
+ return info, (None, None)
247
+
248
+
249
+ def vc_multi(
250
+ sid,
251
+ dir_path,
252
+ opt_root,
253
+ paths,
254
+ f0_up_key,
255
+ f0_method,
256
+ file_index,
257
+ file_index2,
258
+ # file_big_npy,
259
+ index_rate,
260
+ filter_radius,
261
+ resample_sr,
262
+ rms_mix_rate,
263
+ protect,
264
+ format1,
265
+ ):
266
+ try:
267
+ print(f0_up_key)
268
+
269
+ dir_path = (
270
+ dir_path.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
271
+ ) # 防止小白拷路径头尾带了空格和"和回车
272
+ opt_root = opt_root.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
273
+ os.makedirs(opt_root, exist_ok=True)
274
+ try:
275
+ if dir_path != "":
276
+ paths = [os.path.join(dir_path, name) for name in os.listdir(dir_path)]
277
+ else:
278
+ paths = [path.name for path in paths]
279
+ except:
280
+ traceback.print_exc()
281
+ paths = [path.name for path in paths]
282
+ infos = []
283
+ for path in paths:
284
+ info, opt = vc_single(
285
+ sid,
286
+ path,
287
+ f0_up_key,
288
+ None,
289
+ f0_method,
290
+ file_index,
291
+ file_index2,
292
+ # file_big_npy,
293
+ index_rate,
294
+ filter_radius,
295
+ resample_sr,
296
+ rms_mix_rate,
297
+ protect,
298
+ )
299
+ if "Success" in info:
300
+ try:
301
+ tgt_sr, audio_opt = opt
302
+ if format1 in ["wav", "flac"]:
303
+ sf.write(
304
+ "%s/%s.%s" % (opt_root, os.path.basename(path), format1),
305
+ audio_opt,
306
+ tgt_sr,
307
+ )
308
+ else:
309
+ path = "%s/%s.wav" % (opt_root, os.path.basename(path))
310
+ sf.write(
311
+ path,
312
+ audio_opt,
313
+ tgt_sr,
314
+ )
315
+ if os.path.exists(path):
316
+ os.system(
317
+ "ffmpeg -i %s -vn %s -q:a 2 -y"
318
+ % (path, path[:-4] + ".%s" % format1)
319
+ )
320
+ except:
321
+ info += traceback.format_exc()
322
+ infos.append("%s->%s" % (os.path.basename(path), info))
323
+ yield "\n".join(infos)
324
+ yield "\n".join(infos)
325
+ except:
326
+ yield traceback.format_exc()
327
+
328
+
329
+ def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format0):
330
+ infos = []
331
+ try:
332
+ inp_root = inp_root.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
333
+ save_root_vocal = (
334
+ save_root_vocal.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
335
+ )
336
+ save_root_ins = (
337
+ save_root_ins.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
338
+ )
339
+ if model_name == "onnx_dereverb_By_FoxJoy":
340
+ pre_fun = MDXNetDereverb(15)
341
+ else:
342
+ func = _audio_pre_ if "DeEcho" not in model_name else _audio_pre_new
343
+ pre_fun = func(
344
+ agg=int(agg),
345
+ model_path=os.path.join(weight_uvr5_root, model_name + ".pth"),
346
+ device=config.device,
347
+ is_half=config.is_half,
348
+ )
349
+ if inp_root != "":
350
+ paths = [os.path.join(inp_root, name) for name in os.listdir(inp_root)]
351
+ else:
352
+ paths = [path.name for path in paths]
353
+ for path in paths:
354
+ inp_path = os.path.join(inp_root, path)
355
+ need_reformat = 1
356
+ done = 0
357
+ try:
358
+ info = ffmpeg.probe(inp_path, cmd="ffprobe")
359
+ if (
360
+ info["streams"][0]["channels"] == 2
361
+ and info["streams"][0]["sample_rate"] == "44100"
362
+ ):
363
+ need_reformat = 0
364
+ pre_fun._path_audio_(
365
+ inp_path, save_root_ins, save_root_vocal, format0
366
+ )
367
+ done = 1
368
+ except:
369
+ need_reformat = 1
370
+ traceback.print_exc()
371
+ if need_reformat == 1:
372
+ tmp_path = "%s/%s.reformatted.wav" % (tmp, os.path.basename(inp_path))
373
+ os.system(
374
+ "ffmpeg -i %s -vn -acodec pcm_s16le -ac 2 -ar 44100 %s -y"
375
+ % (inp_path, tmp_path)
376
+ )
377
+ inp_path = tmp_path
378
+ try:
379
+ if done == 0:
380
+ pre_fun._path_audio_(
381
+ inp_path, save_root_ins, save_root_vocal, format0
382
+ )
383
+ infos.append("%s->Success" % (os.path.basename(inp_path)))
384
+ yield "\n".join(infos)
385
+ except:
386
+ infos.append(
387
+ "%s->%s" % (os.path.basename(inp_path), traceback.format_exc())
388
+ )
389
+ yield "\n".join(infos)
390
+ except:
391
+ infos.append(traceback.format_exc())
392
+ yield "\n".join(infos)
393
+ finally:
394
+ try:
395
+ if model_name == "onnx_dereverb_By_FoxJoy":
396
+ del pre_fun.pred.model
397
+ del pre_fun.pred.model_
398
+ else:
399
+ del pre_fun.model
400
+ del pre_fun
401
+ except:
402
+ traceback.print_exc()
403
+ print("clean_empty_cache")
404
+ if torch.cuda.is_available():
405
+ torch.cuda.empty_cache()
406
+ yield "\n".join(infos)
407
+
408
+
409
+ # 一个选项卡全局只能有一个音色
410
+ def get_vc(sid, to_return_protect0, to_return_protect1):
411
+ global n_spk, tgt_sr, net_g, vc, cpt, version
412
+ if sid == "" or sid == []:
413
+ global hubert_model
414
+ if hubert_model is not None: # 考虑到轮询, 需要加个判断看是否 sid 是由有模型切换到无模型的
415
+ print("clean_empty_cache")
416
+ del net_g, n_spk, vc, hubert_model, tgt_sr # ,cpt
417
+ hubert_model = net_g = n_spk = vc = hubert_model = tgt_sr = None
418
+ if torch.cuda.is_available():
419
+ torch.cuda.empty_cache()
420
+ ###楼下不这么折腾清理不干净
421
+ if_f0 = cpt.get("f0", 1)
422
+ version = cpt.get("version", "v1")
423
+ if version == "v1":
424
+ if if_f0 == 1:
425
+ net_g = SynthesizerTrnMs256NSFsid(
426
+ *cpt["config"], is_half=config.is_half
427
+ )
428
+ else:
429
+ net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
430
+ elif version == "v2":
431
+ if if_f0 == 1:
432
+ net_g = SynthesizerTrnMs768NSFsid(
433
+ *cpt["config"], is_half=config.is_half
434
+ )
435
+ else:
436
+ net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"])
437
+ del net_g, cpt
438
+ if torch.cuda.is_available():
439
+ torch.cuda.empty_cache()
440
+ cpt = None
441
+ return {"visible": False, "__type__": "update"}
442
+ person = "%s/%s" % (weight_root, sid)
443
+ print("loading %s" % person)
444
+ cpt = torch.load(person, map_location="cpu")
445
+ tgt_sr = cpt["config"][-1]
446
+ cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] # n_spk
447
+ if_f0 = cpt.get("f0", 1)
448
+ if if_f0 == 0:
449
+ to_return_protect0 = to_return_protect1 = {
450
+ "visible": False,
451
+ "value": 0.5,
452
+ "__type__": "update",
453
+ }
454
+ else:
455
+ to_return_protect0 = {
456
+ "visible": True,
457
+ "value": to_return_protect0,
458
+ "__type__": "update",
459
+ }
460
+ to_return_protect1 = {
461
+ "visible": True,
462
+ "value": to_return_protect1,
463
+ "__type__": "update",
464
+ }
465
+ version = cpt.get("version", "v1")
466
+ if version == "v1":
467
+ if if_f0 == 1:
468
+ net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=config.is_half)
469
+ else:
470
+ net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
471
+ elif version == "v2":
472
+ if if_f0 == 1:
473
+ net_g = SynthesizerTrnMs768NSFsid(*cpt["config"], is_half=config.is_half)
474
+ else:
475
+ net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"])
476
+ del net_g.enc_q
477
+ print(net_g.load_state_dict(cpt["weight"], strict=False))
478
+ net_g.eval().to(config.device)
479
+ if config.is_half:
480
+ net_g = net_g.half()
481
+ else:
482
+ net_g = net_g.float()
483
+ vc = VC(tgt_sr, config)
484
+ n_spk = cpt["config"][-3]
485
+ return (
486
+ {"visible": True, "maximum": n_spk, "__type__": "update"},
487
+ to_return_protect0,
488
+ to_return_protect1,
489
+ )
490
+
491
+
492
+ def change_choices():
493
+ names = []
494
+ for name in os.listdir(weight_root):
495
+ if name.endswith(".pth"):
496
+ names.append(name)
497
+ index_paths = []
498
+ for root, dirs, files in os.walk(index_root, topdown=False):
499
+ for name in files:
500
+ if name.endswith(".index") and "trained" not in name:
501
+ index_paths.append("%s/%s" % (root, name))
502
+ return {"choices": sorted(names), "__type__": "update"}, {
503
+ "choices": sorted(index_paths),
504
+ "__type__": "update",
505
+ }
506
+
507
+
508
+ def clean():
509
+ return {"value": "", "__type__": "update"}
510
+
511
+
512
+ sr_dict = {
513
+ "32k": 32000,
514
+ "40k": 40000,
515
+ "48k": 48000,
516
+ }
517
+
518
+
519
+ def if_done(done, p):
520
+ while 1:
521
+ if p.poll() is None:
522
+ sleep(0.5)
523
+ else:
524
+ break
525
+ done[0] = True
526
+
527
+
528
+ def if_done_multi(done, ps):
529
+ while 1:
530
+ # poll==None代表进程未结束
531
+ # 只要有一个进程未结束都不停
532
+ flag = 1
533
+ for p in ps:
534
+ if p.poll() is None:
535
+ flag = 0
536
+ sleep(0.5)
537
+ break
538
+ if flag == 1:
539
+ break
540
+ done[0] = True
541
+
542
+
543
+ def preprocess_dataset(trainset_dir, exp_dir, sr, n_p):
544
+ sr = sr_dict[sr]
545
+ os.makedirs("%s/logs/%s" % (now_dir, exp_dir), exist_ok=True)
546
+ f = open("%s/logs/%s/preprocess.log" % (now_dir, exp_dir), "w")
547
+ f.close()
548
+ cmd = (
549
+ config.python_cmd
550
+ + " trainset_preprocess_pipeline_print.py %s %s %s %s/logs/%s "
551
+ % (trainset_dir, sr, n_p, now_dir, exp_dir)
552
+ + str(config.noparallel)
553
+ )
554
+ print(cmd)
555
+ p = Popen(cmd, shell=True) # , stdin=PIPE, stdout=PIPE,stderr=PIPE,cwd=now_dir
556
+ ###煞笔gr, popen read都非得全跑完了再一次性读取, 不用gr就正常读一句输出一句;只能额外弄出一个文本流定时读
557
+ done = [False]
558
+ threading.Thread(
559
+ target=if_done,
560
+ args=(
561
+ done,
562
+ p,
563
+ ),
564
+ ).start()
565
+ while 1:
566
+ with open("%s/logs/%s/preprocess.log" % (now_dir, exp_dir), "r") as f:
567
+ yield (f.read())
568
+ sleep(1)
569
+ if done[0]:
570
+ break
571
+ with open("%s/logs/%s/preprocess.log" % (now_dir, exp_dir), "r") as f:
572
+ log = f.read()
573
+ print(log)
574
+ yield log
575
+
576
+
577
+ # but2.click(extract_f0,[gpus6,np7,f0method8,if_f0_3,trainset_dir4],[info2])
578
+ def extract_f0_feature(gpus, n_p, f0method, if_f0, exp_dir, version19):
579
+ gpus = gpus.split("-")
580
+ os.makedirs("%s/logs/%s" % (now_dir, exp_dir), exist_ok=True)
581
+ f = open("%s/logs/%s/extract_f0_feature.log" % (now_dir, exp_dir), "w")
582
+ f.close()
583
+ if if_f0:
584
+ cmd = config.python_cmd + " extract_f0_print.py %s/logs/%s %s %s" % (
585
+ now_dir,
586
+ exp_dir,
587
+ n_p,
588
+ f0method,
589
+ )
590
+ print(cmd)
591
+ p = Popen(cmd, shell=True, cwd=now_dir) # , stdin=PIPE, stdout=PIPE,stderr=PIPE
592
+ ###煞笔gr, popen read都非得全跑完了再一次性读取, 不用gr就正常读一句输出一句;只能额外弄出一个文本流定时读
593
+ done = [False]
594
+ threading.Thread(
595
+ target=if_done,
596
+ args=(
597
+ done,
598
+ p,
599
+ ),
600
+ ).start()
601
+ while 1:
602
+ with open(
603
+ "%s/logs/%s/extract_f0_feature.log" % (now_dir, exp_dir), "r"
604
+ ) as f:
605
+ yield (f.read())
606
+ sleep(1)
607
+ if done[0]:
608
+ break
609
+ with open("%s/logs/%s/extract_f0_feature.log" % (now_dir, exp_dir), "r") as f:
610
+ log = f.read()
611
+ print(log)
612
+ yield log
613
+ ####对不同part分别开多进程
614
+ """
615
+ n_part=int(sys.argv[1])
616
+ i_part=int(sys.argv[2])
617
+ i_gpu=sys.argv[3]
618
+ exp_dir=sys.argv[4]
619
+ os.environ["CUDA_VISIBLE_DEVICES"]=str(i_gpu)
620
+ """
621
+ leng = len(gpus)
622
+ ps = []
623
+ for idx, n_g in enumerate(gpus):
624
+ cmd = (
625
+ config.python_cmd
626
+ + " extract_feature_print.py %s %s %s %s %s/logs/%s %s"
627
+ % (
628
+ config.device,
629
+ leng,
630
+ idx,
631
+ n_g,
632
+ now_dir,
633
+ exp_dir,
634
+ version19,
635
+ )
636
+ )
637
+ print(cmd)
638
+ p = Popen(
639
+ cmd, shell=True, cwd=now_dir
640
+ ) # , shell=True, stdin=PIPE, stdout=PIPE, stderr=PIPE, cwd=now_dir
641
+ ps.append(p)
642
+ ###煞笔gr, popen read都非得全跑完了再一次性读取, 不用gr就正常读一句输出一句;只能额外弄出一个文本流定时读
643
+ done = [False]
644
+ threading.Thread(
645
+ target=if_done_multi,
646
+ args=(
647
+ done,
648
+ ps,
649
+ ),
650
+ ).start()
651
+ while 1:
652
+ with open("%s/logs/%s/extract_f0_feature.log" % (now_dir, exp_dir), "r") as f:
653
+ yield (f.read())
654
+ sleep(1)
655
+ if done[0]:
656
+ break
657
+ with open("%s/logs/%s/extract_f0_feature.log" % (now_dir, exp_dir), "r") as f:
658
+ log = f.read()
659
+ print(log)
660
+ yield log
661
+
662
+
663
+ def change_sr2(sr2, if_f0_3, version19):
664
+ path_str = "" if version19 == "v1" else "_v2"
665
+ f0_str = "f0" if if_f0_3 else ""
666
+ if_pretrained_generator_exist = os.access(
667
+ "pretrained%s/%sG%s.pth" % (path_str, f0_str, sr2), os.F_OK
668
+ )
669
+ if_pretrained_discriminator_exist = os.access(
670
+ "pretrained%s/%sD%s.pth" % (path_str, f0_str, sr2), os.F_OK
671
+ )
672
+ if not if_pretrained_generator_exist:
673
+ print(
674
+ "pretrained%s/%sG%s.pth" % (path_str, f0_str, sr2),
675
+ "not exist, will not use pretrained model",
676
+ )
677
+ if not if_pretrained_discriminator_exist:
678
+ print(
679
+ "pretrained%s/%sD%s.pth" % (path_str, f0_str, sr2),
680
+ "not exist, will not use pretrained model",
681
+ )
682
+ return (
683
+ "pretrained%s/%sG%s.pth" % (path_str, f0_str, sr2)
684
+ if if_pretrained_generator_exist
685
+ else "",
686
+ "pretrained%s/%sD%s.pth" % (path_str, f0_str, sr2)
687
+ if if_pretrained_discriminator_exist
688
+ else "",
689
+ )
690
+
691
+
692
+ def change_version19(sr2, if_f0_3, version19):
693
+ path_str = "" if version19 == "v1" else "_v2"
694
+ if sr2 == "32k" and version19 == "v1":
695
+ sr2 = "40k"
696
+ to_return_sr2 = (
697
+ {"choices": ["40k", "48k"], "__type__": "update", "value": sr2}
698
+ if version19 == "v1"
699
+ else {"choices": ["40k", "48k", "32k"], "__type__": "update", "value": sr2}
700
+ )
701
+ f0_str = "f0" if if_f0_3 else ""
702
+ if_pretrained_generator_exist = os.access(
703
+ "pretrained%s/%sG%s.pth" % (path_str, f0_str, sr2), os.F_OK
704
+ )
705
+ if_pretrained_discriminator_exist = os.access(
706
+ "pretrained%s/%sD%s.pth" % (path_str, f0_str, sr2), os.F_OK
707
+ )
708
+ if not if_pretrained_generator_exist:
709
+ print(
710
+ "pretrained%s/%sG%s.pth" % (path_str, f0_str, sr2),
711
+ "not exist, will not use pretrained model",
712
+ )
713
+ if not if_pretrained_discriminator_exist:
714
+ print(
715
+ "pretrained%s/%sD%s.pth" % (path_str, f0_str, sr2),
716
+ "not exist, will not use pretrained model",
717
+ )
718
+ return (
719
+ "pretrained%s/%sG%s.pth" % (path_str, f0_str, sr2)
720
+ if if_pretrained_generator_exist
721
+ else "",
722
+ "pretrained%s/%sD%s.pth" % (path_str, f0_str, sr2)
723
+ if if_pretrained_discriminator_exist
724
+ else "",
725
+ to_return_sr2,
726
+ )
727
+
728
+
729
+ def change_f0(if_f0_3, sr2, version19): # f0method8,pretrained_G14,pretrained_D15
730
+ path_str = "" if version19 == "v1" else "_v2"
731
+ if_pretrained_generator_exist = os.access(
732
+ "pretrained%s/f0G%s.pth" % (path_str, sr2), os.F_OK
733
+ )
734
+ if_pretrained_discriminator_exist = os.access(
735
+ "pretrained%s/f0D%s.pth" % (path_str, sr2), os.F_OK
736
+ )
737
+ if not if_pretrained_generator_exist:
738
+ print(
739
+ "pretrained%s/f0G%s.pth" % (path_str, sr2),
740
+ "not exist, will not use pretrained model",
741
+ )
742
+ if not if_pretrained_discriminator_exist:
743
+ print(
744
+ "pretrained%s/f0D%s.pth" % (path_str, sr2),
745
+ "not exist, will not use pretrained model",
746
+ )
747
+ if if_f0_3:
748
+ return (
749
+ {"visible": True, "__type__": "update"},
750
+ "pretrained%s/f0G%s.pth" % (path_str, sr2)
751
+ if if_pretrained_generator_exist
752
+ else "",
753
+ "pretrained%s/f0D%s.pth" % (path_str, sr2)
754
+ if if_pretrained_discriminator_exist
755
+ else "",
756
+ )
757
+ return (
758
+ {"visible": False, "__type__": "update"},
759
+ ("pretrained%s/G%s.pth" % (path_str, sr2))
760
+ if if_pretrained_generator_exist
761
+ else "",
762
+ ("pretrained%s/D%s.pth" % (path_str, sr2))
763
+ if if_pretrained_discriminator_exist
764
+ else "",
765
+ )
766
+
767
+
768
+ # but3.click(click_train,[exp_dir1,sr2,if_f0_3,save_epoch10,total_epoch11,batch_size12,if_save_latest13,pretrained_G14,pretrained_D15,gpus16])
769
+ def click_train(
770
+ exp_dir1,
771
+ sr2,
772
+ if_f0_3,
773
+ spk_id5,
774
+ save_epoch10,
775
+ total_epoch11,
776
+ batch_size12,
777
+ if_save_latest13,
778
+ pretrained_G14,
779
+ pretrained_D15,
780
+ gpus16,
781
+ if_cache_gpu17,
782
+ if_save_every_weights18,
783
+ version19,
784
+ ):
785
+ # 生成filelist
786
+ exp_dir = "%s/logs/%s" % (now_dir, exp_dir1)
787
+ os.makedirs(exp_dir, exist_ok=True)
788
+ gt_wavs_dir = "%s/0_gt_wavs" % (exp_dir)
789
+ feature_dir = (
790
+ "%s/3_feature256" % (exp_dir)
791
+ if version19 == "v1"
792
+ else "%s/3_feature768" % (exp_dir)
793
+ )
794
+ if if_f0_3:
795
+ f0_dir = "%s/2a_f0" % (exp_dir)
796
+ f0nsf_dir = "%s/2b-f0nsf" % (exp_dir)
797
+ names = (
798
+ set([name.split(".")[0] for name in os.listdir(gt_wavs_dir)])
799
+ & set([name.split(".")[0] for name in os.listdir(feature_dir)])
800
+ & set([name.split(".")[0] for name in os.listdir(f0_dir)])
801
+ & set([name.split(".")[0] for name in os.listdir(f0nsf_dir)])
802
+ )
803
+ else:
804
+ names = set([name.split(".")[0] for name in os.listdir(gt_wavs_dir)]) & set(
805
+ [name.split(".")[0] for name in os.listdir(feature_dir)]
806
+ )
807
+ opt = []
808
+ for name in names:
809
+ if if_f0_3:
810
+ opt.append(
811
+ "%s/%s.wav|%s/%s.npy|%s/%s.wav.npy|%s/%s.wav.npy|%s"
812
+ % (
813
+ gt_wavs_dir.replace("\\", "\\\\"),
814
+ name,
815
+ feature_dir.replace("\\", "\\\\"),
816
+ name,
817
+ f0_dir.replace("\\", "\\\\"),
818
+ name,
819
+ f0nsf_dir.replace("\\", "\\\\"),
820
+ name,
821
+ spk_id5,
822
+ )
823
+ )
824
+ else:
825
+ opt.append(
826
+ "%s/%s.wav|%s/%s.npy|%s"
827
+ % (
828
+ gt_wavs_dir.replace("\\", "\\\\"),
829
+ name,
830
+ feature_dir.replace("\\", "\\\\"),
831
+ name,
832
+ spk_id5,
833
+ )
834
+ )
835
+ fea_dim = 256 if version19 == "v1" else 768
836
+ if if_f0_3:
837
+ for _ in range(2):
838
+ opt.append(
839
+ "%s/logs/mute/0_gt_wavs/mute%s.wav|%s/logs/mute/3_feature%s/mute.npy|%s/logs/mute/2a_f0/mute.wav.npy|%s/logs/mute/2b-f0nsf/mute.wav.npy|%s"
840
+ % (now_dir, sr2, now_dir, fea_dim, now_dir, now_dir, spk_id5)
841
+ )
842
+ else:
843
+ for _ in range(2):
844
+ opt.append(
845
+ "%s/logs/mute/0_gt_wavs/mute%s.wav|%s/logs/mute/3_feature%s/mute.npy|%s"
846
+ % (now_dir, sr2, now_dir, fea_dim, spk_id5)
847
+ )
848
+ shuffle(opt)
849
+ with open("%s/filelist.txt" % exp_dir, "w") as f:
850
+ f.write("\n".join(opt))
851
+ print("write filelist done")
852
+ # 生成config#无需生成config
853
+ # cmd = python_cmd + " train_nsf_sim_cache_sid_load_pretrain.py -e mi-test -sr 40k -f0 1 -bs 4 -g 0 -te 10 -se 5 -pg pretrained/f0G40k.pth -pd pretrained/f0D40k.pth -l 1 -c 0"
854
+ print("use gpus:", gpus16)
855
+ if pretrained_G14 == "":
856
+ print("no pretrained Generator")
857
+ if pretrained_D15 == "":
858
+ print("no pretrained Discriminator")
859
+ if gpus16:
860
+ cmd = (
861
+ config.python_cmd
862
+ + " train_nsf_sim_cache_sid_load_pretrain.py -e %s -sr %s -f0 %s -bs %s -g %s -te %s -se %s %s %s -l %s -c %s -sw %s -v %s"
863
+ % (
864
+ exp_dir1,
865
+ sr2,
866
+ 1 if if_f0_3 else 0,
867
+ batch_size12,
868
+ gpus16,
869
+ total_epoch11,
870
+ save_epoch10,
871
+ "-pg %s" % pretrained_G14 if pretrained_G14 != "" else "",
872
+ "-pd %s" % pretrained_D15 if pretrained_D15 != "" else "",
873
+ 1 if if_save_latest13 == i18n("是") else 0,
874
+ 1 if if_cache_gpu17 == i18n("是") else 0,
875
+ 1 if if_save_every_weights18 == i18n("是") else 0,
876
+ version19,
877
+ )
878
+ )
879
+ else:
880
+ cmd = (
881
+ config.python_cmd
882
+ + " train_nsf_sim_cache_sid_load_pretrain.py -e %s -sr %s -f0 %s -bs %s -te %s -se %s %s %s -l %s -c %s -sw %s -v %s"
883
+ % (
884
+ exp_dir1,
885
+ sr2,
886
+ 1 if if_f0_3 else 0,
887
+ batch_size12,
888
+ total_epoch11,
889
+ save_epoch10,
890
+ "-pg %s" % pretrained_G14 if pretrained_G14 != "" else "\b",
891
+ "-pd %s" % pretrained_D15 if pretrained_D15 != "" else "\b",
892
+ 1 if if_save_latest13 == i18n("是") else 0,
893
+ 1 if if_cache_gpu17 == i18n("是") else 0,
894
+ 1 if if_save_every_weights18 == i18n("是") else 0,
895
+ version19,
896
+ )
897
+ )
898
+ print(cmd)
899
+ p = Popen(cmd, shell=True, cwd=now_dir)
900
+ p.wait()
901
+ return "训练结束, 您可查看控制台训练日志或实验文件夹下的train.log"
902
+
903
+
904
+ # but4.click(train_index, [exp_dir1], info3)
905
+ def train_index(exp_dir1, version19):
906
+ exp_dir = "%s/logs/%s" % (now_dir, exp_dir1)
907
+ os.makedirs(exp_dir, exist_ok=True)
908
+ feature_dir = (
909
+ "%s/3_feature256" % (exp_dir)
910
+ if version19 == "v1"
911
+ else "%s/3_feature768" % (exp_dir)
912
+ )
913
+ if not os.path.exists(feature_dir):
914
+ return "请先进行特征提取!"
915
+ listdir_res = list(os.listdir(feature_dir))
916
+ if len(listdir_res) == 0:
917
+ return "请先进行特征提取!"
918
+ infos = []
919
+ npys = []
920
+ for name in sorted(listdir_res):
921
+ phone = np.load("%s/%s" % (feature_dir, name))
922
+ npys.append(phone)
923
+ big_npy = np.concatenate(npys, 0)
924
+ big_npy_idx = np.arange(big_npy.shape[0])
925
+ np.random.shuffle(big_npy_idx)
926
+ big_npy = big_npy[big_npy_idx]
927
+ if big_npy.shape[0] > 2e5:
928
+ # if(1):
929
+ infos.append("Trying doing kmeans %s shape to 10k centers." % big_npy.shape[0])
930
+ yield "\n".join(infos)
931
+ try:
932
+ big_npy = (
933
+ MiniBatchKMeans(
934
+ n_clusters=10000,
935
+ verbose=True,
936
+ batch_size=256 * config.n_cpu,
937
+ compute_labels=False,
938
+ init="random",
939
+ )
940
+ .fit(big_npy)
941
+ .cluster_centers_
942
+ )
943
+ except:
944
+ info = traceback.format_exc()
945
+ print(info)
946
+ infos.append(info)
947
+ yield "\n".join(infos)
948
+
949
+ np.save("%s/total_fea.npy" % exp_dir, big_npy)
950
+ n_ivf = min(int(16 * np.sqrt(big_npy.shape[0])), big_npy.shape[0] // 39)
951
+ infos.append("%s,%s" % (big_npy.shape, n_ivf))
952
+ yield "\n".join(infos)
953
+ index = faiss.index_factory(256 if version19 == "v1" else 768, "IVF%s,Flat" % n_ivf)
954
+ # index = faiss.index_factory(256if version19=="v1"else 768, "IVF%s,PQ128x4fs,RFlat"%n_ivf)
955
+ infos.append("training")
956
+ yield "\n".join(infos)
957
+ index_ivf = faiss.extract_index_ivf(index) #
958
+ index_ivf.nprobe = 1
959
+ index.train(big_npy)
960
+ faiss.write_index(
961
+ index,
962
+ "%s/trained_IVF%s_Flat_nprobe_%s_%s_%s.index"
963
+ % (exp_dir, n_ivf, index_ivf.nprobe, exp_dir1, version19),
964
+ )
965
+ # faiss.write_index(index, '%s/trained_IVF%s_Flat_FastScan_%s.index'%(exp_dir,n_ivf,version19))
966
+ infos.append("adding")
967
+ yield "\n".join(infos)
968
+ batch_size_add = 8192
969
+ for i in range(0, big_npy.shape[0], batch_size_add):
970
+ index.add(big_npy[i : i + batch_size_add])
971
+ faiss.write_index(
972
+ index,
973
+ "%s/added_IVF%s_Flat_nprobe_%s_%s_%s.index"
974
+ % (exp_dir, n_ivf, index_ivf.nprobe, exp_dir1, version19),
975
+ )
976
+ infos.append(
977
+ "成功构建索引,added_IVF%s_Flat_nprobe_%s_%s_%s.index"
978
+ % (n_ivf, index_ivf.nprobe, exp_dir1, version19)
979
+ )
980
+ # faiss.write_index(index, '%s/added_IVF%s_Flat_FastScan_%s.index'%(exp_dir,n_ivf,version19))
981
+ # infos.append("成功构建索引,added_IVF%s_Flat_FastScan_%s.index"%(n_ivf,version19))
982
+ yield "\n".join(infos)
983
+
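+ # Hedged sketch (not part of the original commit): at inference time the saved
+ # index can be loaded and queried with faiss, mirroring vc_infer_pipeline.py below:
+ # index = faiss.read_index("%s/added_IVF%s_Flat_nprobe_1_%s_%s.index" % (exp_dir, n_ivf, exp_dir1, version19))
+ # score, ix = index.search(query_feats.astype("float32"), k=8)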
984
+
985
+ # but5.click(train1key, [exp_dir1, sr2, if_f0_3, trainset_dir4, spk_id5, gpus6, np7, f0method8, save_epoch10, total_epoch11, batch_size12, if_save_latest13, pretrained_G14, pretrained_D15, gpus16, if_cache_gpu17], info3)
986
+ def train1key(
987
+ exp_dir1,
988
+ sr2,
989
+ if_f0_3,
990
+ trainset_dir4,
991
+ spk_id5,
992
+ np7,
993
+ f0method8,
994
+ save_epoch10,
995
+ total_epoch11,
996
+ batch_size12,
997
+ if_save_latest13,
998
+ pretrained_G14,
999
+ pretrained_D15,
1000
+ gpus16,
1001
+ if_cache_gpu17,
1002
+ if_save_every_weights18,
1003
+ version19,
1004
+ ):
1005
+ infos = []
1006
+
1007
+ def get_info_str(strr):
1008
+ infos.append(strr)
1009
+ return "\n".join(infos)
1010
+
1011
+ model_log_dir = "%s/logs/%s" % (now_dir, exp_dir1)
1012
+ preprocess_log_path = "%s/preprocess.log" % model_log_dir
1013
+ extract_f0_feature_log_path = "%s/extract_f0_feature.log" % model_log_dir
1014
+ gt_wavs_dir = "%s/0_gt_wavs" % model_log_dir
1015
+ feature_dir = (
1016
+ "%s/3_feature256" % model_log_dir
1017
+ if version19 == "v1"
1018
+ else "%s/3_feature768" % model_log_dir
1019
+ )
1020
+
1021
+ os.makedirs(model_log_dir, exist_ok=True)
1022
+ #########step1: process the dataset
1023
+ open(preprocess_log_path, "w").close()
1024
+ cmd = (
1025
+ config.python_cmd
1026
+ + " trainset_preprocess_pipeline_print.py %s %s %s %s "
1027
+ % (trainset_dir4, sr_dict[sr2], np7, model_log_dir)
1028
+ + str(config.noparallel)
1029
+ )
1030
+ yield get_info_str(i18n("step1:正在处理数据"))
1031
+ yield get_info_str(cmd)
1032
+ p = Popen(cmd, shell=True)
1033
+ p.wait()
1034
+ with open(preprocess_log_path, "r") as f:
1035
+ print(f.read())
1036
+ #########step2a: extract pitch
1037
+ open(extract_f0_feature_log_path, "w")
1038
+ if if_f0_3:
1039
+ yield get_info_str("step2a:正在提取音高")
1040
+ cmd = config.python_cmd + " extract_f0_print.py %s %s %s" % (
1041
+ model_log_dir,
1042
+ np7,
1043
+ f0method8,
1044
+ )
1045
+ yield get_info_str(cmd)
1046
+ p = Popen(cmd, shell=True, cwd=now_dir)
1047
+ p.wait()
1048
+ with open(extract_f0_feature_log_path, "r") as f:
1049
+ print(f.read())
1050
+ else:
1051
+ yield get_info_str(i18n("step2a:无需提取音高"))
1052
+ #######step2b: extract features
1053
+ yield get_info_str(i18n("step2b:正在提取特征"))
1054
+ gpus = gpus16.split("-")
1055
+ leng = len(gpus)
1056
+ ps = []
1057
+ for idx, n_g in enumerate(gpus):
1058
+ cmd = config.python_cmd + " extract_feature_print.py %s %s %s %s %s %s" % (
1059
+ config.device,
1060
+ leng,
1061
+ idx,
1062
+ n_g,
1063
+ model_log_dir,
1064
+ version19,
1065
+ )
1066
+ yield get_info_str(cmd)
1067
+ p = Popen(
1068
+ cmd, shell=True, cwd=now_dir
1069
+ ) # , shell=True, stdin=PIPE, stdout=PIPE, stderr=PIPE, cwd=now_dir
1070
+ ps.append(p)
1071
+ for p in ps:
1072
+ p.wait()
1073
+ with open(extract_f0_feature_log_path, "r") as f:
1074
+ print(f.read())
1075
+ #######step3a: train the model
1076
+ yield get_info_str(i18n("step3a:正在训练模型"))
1077
+ # generate filelist
1078
+ if if_f0_3:
1079
+ f0_dir = "%s/2a_f0" % model_log_dir
1080
+ f0nsf_dir = "%s/2b-f0nsf" % model_log_dir
1081
+ names = (
1082
+ set([name.split(".")[0] for name in os.listdir(gt_wavs_dir)])
1083
+ & set([name.split(".")[0] for name in os.listdir(feature_dir)])
1084
+ & set([name.split(".")[0] for name in os.listdir(f0_dir)])
1085
+ & set([name.split(".")[0] for name in os.listdir(f0nsf_dir)])
1086
+ )
1087
+ else:
1088
+ names = set([name.split(".")[0] for name in os.listdir(gt_wavs_dir)]) & set(
1089
+ [name.split(".")[0] for name in os.listdir(feature_dir)]
1090
+ )
1091
+ opt = []
1092
+ for name in names:
1093
+ if if_f0_3:
1094
+ opt.append(
1095
+ "%s/%s.wav|%s/%s.npy|%s/%s.wav.npy|%s/%s.wav.npy|%s"
1096
+ % (
1097
+ gt_wavs_dir.replace("\\", "\\\\"),
1098
+ name,
1099
+ feature_dir.replace("\\", "\\\\"),
1100
+ name,
1101
+ f0_dir.replace("\\", "\\\\"),
1102
+ name,
1103
+ f0nsf_dir.replace("\\", "\\\\"),
1104
+ name,
1105
+ spk_id5,
1106
+ )
1107
+ )
1108
+ else:
1109
+ opt.append(
1110
+ "%s/%s.wav|%s/%s.npy|%s"
1111
+ % (
1112
+ gt_wavs_dir.replace("\\", "\\\\"),
1113
+ name,
1114
+ feature_dir.replace("\\", "\\\\"),
1115
+ name,
1116
+ spk_id5,
1117
+ )
1118
+ )
1119
+ fea_dim = 256 if version19 == "v1" else 768
1120
+ if if_f0_3:
1121
+ for _ in range(2):
1122
+ opt.append(
1123
+ "%s/logs/mute/0_gt_wavs/mute%s.wav|%s/logs/mute/3_feature%s/mute.npy|%s/logs/mute/2a_f0/mute.wav.npy|%s/logs/mute/2b-f0nsf/mute.wav.npy|%s"
1124
+ % (now_dir, sr2, now_dir, fea_dim, now_dir, now_dir, spk_id5)
1125
+ )
1126
+ else:
1127
+ for _ in range(2):
1128
+ opt.append(
1129
+ "%s/logs/mute/0_gt_wavs/mute%s.wav|%s/logs/mute/3_feature%s/mute.npy|%s"
1130
+ % (now_dir, sr2, now_dir, fea_dim, spk_id5)
1131
+ )
1132
+ shuffle(opt)
1133
+ with open("%s/filelist.txt" % model_log_dir, "w") as f:
1134
+ f.write("\n".join(opt))
1135
+ yield get_info_str("write filelist done")
1136
+ if gpus16:
1137
+ cmd = (
1138
+ config.python_cmd
1139
+ + " train_nsf_sim_cache_sid_load_pretrain.py -e %s -sr %s -f0 %s -bs %s -g %s -te %s -se %s %s %s -l %s -c %s -sw %s -v %s"
1140
+ % (
1141
+ exp_dir1,
1142
+ sr2,
1143
+ 1 if if_f0_3 else 0,
1144
+ batch_size12,
1145
+ gpus16,
1146
+ total_epoch11,
1147
+ save_epoch10,
1148
+ "-pg %s" % pretrained_G14 if pretrained_G14 != "" else "",
1149
+ "-pd %s" % pretrained_D15 if pretrained_D15 != "" else "",
1150
+ 1 if if_save_latest13 == i18n("是") else 0,
1151
+ 1 if if_cache_gpu17 == i18n("是") else 0,
1152
+ 1 if if_save_every_weights18 == i18n("是") else 0,
1153
+ version19,
1154
+ )
1155
+ )
1156
+ else:
1157
+ cmd = (
1158
+ config.python_cmd
1159
+ + " train_nsf_sim_cache_sid_load_pretrain.py -e %s -sr %s -f0 %s -bs %s -te %s -se %s %s %s -l %s -c %s -sw %s -v %s"
1160
+ % (
1161
+ exp_dir1,
1162
+ sr2,
1163
+ 1 if if_f0_3 else 0,
1164
+ batch_size12,
1165
+ total_epoch11,
1166
+ save_epoch10,
1167
+ "-pg %s" % pretrained_G14 if pretrained_G14 != "" else "",
1168
+ "-pd %s" % pretrained_D15 if pretrained_D15 != "" else "",
1169
+ 1 if if_save_latest13 == i18n("是") else 0,
1170
+ 1 if if_cache_gpu17 == i18n("是") else 0,
1171
+ 1 if if_save_every_weights18 == i18n("是") else 0,
1172
+ version19,
1173
+ )
1174
+ )
1175
+ yield get_info_str(cmd)
1176
+ p = Popen(cmd, shell=True, cwd=now_dir)
1177
+ p.wait()
1178
+ yield get_info_str(i18n("训练结束, 您可查看控制台训练日志或实验文件夹下的train.log"))
1179
+ #######step3b: train the index
1180
+ npys = []
1181
+ listdir_res = list(os.listdir(feature_dir))
1182
+ for name in sorted(listdir_res):
1183
+ phone = np.load("%s/%s" % (feature_dir, name))
1184
+ npys.append(phone)
1185
+ big_npy = np.concatenate(npys, 0)
1186
+
1187
+ big_npy_idx = np.arange(big_npy.shape[0])
1188
+ np.random.shuffle(big_npy_idx)
1189
+ big_npy = big_npy[big_npy_idx]
1190
+
1191
+ if big_npy.shape[0] > 2e5:
1192
+ # if(1):
1193
+ info = "Trying doing kmeans %s shape to 10k centers." % big_npy.shape[0]
1194
+ print(info)
1195
+ yield get_info_str(info)
1196
+ try:
1197
+ big_npy = (
1198
+ MiniBatchKMeans(
1199
+ n_clusters=10000,
1200
+ verbose=True,
1201
+ batch_size=256 * config.n_cpu,
1202
+ compute_labels=False,
1203
+ init="random",
1204
+ )
1205
+ .fit(big_npy)
1206
+ .cluster_centers_
1207
+ )
1208
+ except:
1209
+ info = traceback.format_exc()
1210
+ print(info)
1211
+ yield get_info_str(info)
1212
+
1213
+ np.save("%s/total_fea.npy" % model_log_dir, big_npy)
1214
+
1215
+ # n_ivf = big_npy.shape[0] // 39
1216
+ n_ivf = min(int(16 * np.sqrt(big_npy.shape[0])), big_npy.shape[0] // 39)
1217
+ yield get_info_str("%s,%s" % (big_npy.shape, n_ivf))
1218
+ index = faiss.index_factory(256 if version19 == "v1" else 768, "IVF%s,Flat" % n_ivf)
1219
+ yield get_info_str("training index")
1220
+ index_ivf = faiss.extract_index_ivf(index) #
1221
+ index_ivf.nprobe = 1
1222
+ index.train(big_npy)
1223
+ faiss.write_index(
1224
+ index,
1225
+ "%s/trained_IVF%s_Flat_nprobe_%s_%s_%s.index"
1226
+ % (model_log_dir, n_ivf, index_ivf.nprobe, exp_dir1, version19),
1227
+ )
1228
+ yield get_info_str("adding index")
1229
+ batch_size_add = 8192
1230
+ for i in range(0, big_npy.shape[0], batch_size_add):
1231
+ index.add(big_npy[i : i + batch_size_add])
1232
+ faiss.write_index(
1233
+ index,
1234
+ "%s/added_IVF%s_Flat_nprobe_%s_%s_%s.index"
1235
+ % (model_log_dir, n_ivf, index_ivf.nprobe, exp_dir1, version19),
1236
+ )
1237
+ yield get_info_str(
1238
+ "成功构建索引, added_IVF%s_Flat_nprobe_%s_%s_%s.index"
1239
+ % (n_ivf, index_ivf.nprobe, exp_dir1, version19)
1240
+ )
1241
+ yield get_info_str(i18n("全流程结束!"))
1242
+
1243
+
1244
+ # ckpt_path2.change(change_info_,[ckpt_path2],[sr__,if_f0__])
1245
+ def change_info_(ckpt_path):
1246
+ if not os.path.exists(ckpt_path.replace(os.path.basename(ckpt_path), "train.log")):
1247
+ return {"__type__": "update"}, {"__type__": "update"}, {"__type__": "update"}
1248
+ try:
1249
+ with open(
1250
+ ckpt_path.replace(os.path.basename(ckpt_path), "train.log"), "r"
1251
+ ) as f:
1252
+ info = eval(f.read().strip("\n").split("\n")[0].split("\t")[-1])
1253
+ sr, f0 = info["sample_rate"], info["if_f0"]
1254
+ version = "v2" if ("version" in info and info["version"] == "v2") else "v1"
1255
+ return sr, str(f0), version
1256
+ except:
1257
+ traceback.print_exc()
1258
+ return {"__type__": "update"}, {"__type__": "update"}, {"__type__": "update"}
1259
+
1260
+
1261
+ def export_onnx(ModelPath, ExportedPath):
1262
+ cpt = torch.load(ModelPath, map_location="cpu")
1263
+ cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]
1264
+ vec_channels = 256 if cpt.get("version", "v1") == "v1" else 768
1265
+
1266
+ test_phone = torch.rand(1, 200, vec_channels) # hidden unit
1267
+ test_phone_lengths = torch.tensor([200]).long() # hidden unit length (seemingly unused)
1268
+ test_pitch = torch.randint(size=(1, 200), low=5, high=255) # f0 (in Hz)
1269
+ test_pitchf = torch.rand(1, 200) # NSF f0
1270
+ test_ds = torch.LongTensor([0]) # speaker ID
1271
+ test_rnd = torch.rand(1, 192, 200) # noise (adds a random factor)
1272
+
1273
+ device = "cpu" # 导出时设备(不影响使用模型)
1274
+
1275
+ net_g = SynthesizerTrnMsNSFsidM(
1276
+ *cpt["config"], is_half=False, version=cpt.get("version", "v1")
1277
+ ) # fp32 export (fp16 support in C++ would require manual memory re-layout, so fp16 is not used for now)
1278
+ net_g.load_state_dict(cpt["weight"], strict=False)
1279
+ input_names = ["phone", "phone_lengths", "pitch", "pitchf", "ds", "rnd"]
1280
+ output_names = [
1281
+ "audio",
1282
+ ]
1283
+ # net_g.construct_spkmixmap(n_speaker)  # multi-speaker mix-track export
1284
+ torch.onnx.export(
1285
+ net_g,
1286
+ (
1287
+ test_phone.to(device),
1288
+ test_phone_lengths.to(device),
1289
+ test_pitch.to(device),
1290
+ test_pitchf.to(device),
1291
+ test_ds.to(device),
1292
+ test_rnd.to(device),
1293
+ ),
1294
+ ExportedPath,
1295
+ dynamic_axes={
1296
+ "phone": [1],
1297
+ "pitch": [1],
1298
+ "pitchf": [1],
1299
+ "rnd": [2],
1300
+ },
1301
+ do_constant_folding=False,
1302
+ opset_version=13,
1303
+ verbose=False,
1304
+ input_names=input_names,
1305
+ output_names=output_names,
1306
+ )
1307
+ return "Finished"
1308
+
1309
+ # sid0: Inferencing voice/ model name
1310
+ # f0_up_key: Transpose (integer, number of semitones, raise by an octave: 12, lower by an octave: -12):
1311
+ # opt_root: output folder path
1312
+ # f0method: pitch extraction algorithm ('pm': faster extraction but lower-quality speech; 'harvest': better bass but extremely slow; 'crepe': better quality but GPU intensive)
1313
+ # filter_radius: If >=3: apply median filtering to the harvested pitch results. The value represents the filter radius and can reduce breathiness.
1314
+ # file_index: Path to the feature index file.
1315
+ # Auto-detect index path
1316
+ # index_rate: Search feature ratio (controls accent strength, too high has artifacting): minimum=0, maximum=1
1317
+ # resample_sr: Resample the output audio in post-processing to the final sample rate. Set to 0 for no resampling (time consuming)
1318
+ # rms_mix_rate: Adjust the volume envelope scaling. Closer to 0, the more it mimics the volume of the original vocals. Can help mask noise and make volume sound more natural when set relatively low. Closer to 1 will be more of a consistently loud volume:
1319
+ # protect: Protect voiceless consonants and breath sounds to prevent artifacts such as tearing in electronic music. Set to 0.5 to disable. Decrease the value to increase protection, but it may reduce indexing accuracy
1320
+ # dir_path: Enter the path of the audio folder to be processed (copy it from the address bar of the file manager)
1321
+ # format1: choices=["wav", "flac", "mp3", "m4a"]
1322
+
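+ # Hedged usage sketch for the parameters documented above (assumed file names,
+ # not part of the original commit):
+ # run("my_model.pth", "input_vocals.wav", f0_up_key=-12, f0_method="harvest",
+ #     index_rate=0.7, protect=0.33, format1="flac")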
1323
+ def run(sid0, paths, dir_path=None, f0_up_key=0, opt_root="opt", f0_method="pm", filter_radius=3, file_index="", file_index2=None, index_rate=1, resample_sr=0, rms_mix_rate=1, protect=0.33, format1="wav"):
1324
+ if not dir_path and not paths:
1325
+ return "must provide either dir_input or file path"
1326
+ if paths:  # fixed: the original `paths != None or paths != ''` was always true
1327
+ temp_files = [file_to_tempfile(paths)]  # renamed so it no longer shadows the tempfile module
1328
+ print(paths)
1329
+ print(protect)
1330
+ get_vc(sid0, protect, protect)
1331
+
1332
+ vc_output3 = vc_multi(
1333
+ 0, # sid: 0
1334
+ dir_path, # dir_path:
1335
+ opt_root, # opt_root: opt
1336
+ temp_files, # paths: [<tempfile._TemporaryFileWrapper object at 0x7f42c7dbb970>]
1337
+ f0_up_key, # f0_up_key: -12.0
1338
+ f0_method, # f0_method: pm
1339
+ file_index, # file_index:
1340
+ file_index2, # file_index2:
1341
+ index_rate, # index_rate: 1
1342
+ filter_radius, # filter_radius: 3
1343
+ resample_sr, # resample_sr: 0
1344
+ rms_mix_rate, # rms_mix_rate: 1
1345
+ protect, # protect: 0.33
1346
+ format1 # format1: wav
1347
+ )
1348
+ out_path = paths
1349
+ wavfile.write(out_path, tgt_sr, vc_output3)
1350
+
1351
+ return vc_output3
1352
+
1353
+ def get_models():
1354
+ return names
1355
+
1356
+
1357
+ def file_to_tempfile(file_path):
1358
+ with open(file_path, 'rb') as file:
1359
+ temp_file = tempfile.TemporaryFile()
1360
+ temp_file.write(file.read())
1361
+ temp_file.seek(0)
1362
+ return temp_file
1363
+
1364
+ print(run('mymodelimran.pth', '/home/teewhy/Desktop/RVC/Retrieval-based-Voice-Conversion-WebUI/opt/abcxot47ylz.mp3.mp3'))
1365
test.py ADDED
@@ -0,0 +1,5 @@
1
+ import RVC_class
2
+ converter = RVC_class.VoiceConverter()
3
+ result = converter.single_run('https://tmpfiles.org/dl/1669357/recordonline-voice-recorder.com2.wav', 'imran_khan.pth', -12, False)
4
+
5
+ print(result)
train_nsf_sim_cache_sid_load_pretrain.py ADDED
@@ -0,0 +1,595 @@
1
+ import sys, os
2
+
3
+ now_dir = os.getcwd()
4
+ sys.path.append(os.path.join(now_dir))
5
+ sys.path.append(os.path.join(now_dir, "train"))
6
+ import utils
7
+ import datetime
8
+
9
+ hps = utils.get_hparams()
10
+ os.environ["CUDA_VISIBLE_DEVICES"] = hps.gpus.replace("-", ",")
11
+ n_gpus = len(hps.gpus.split("-"))
12
+ from random import shuffle, randint
13
+ import traceback, json, argparse, itertools, math, torch, pdb
14
+
15
+ torch.backends.cudnn.deterministic = False
16
+ torch.backends.cudnn.benchmark = False
17
+ from torch import nn, optim
18
+ from torch.nn import functional as F
19
+ from torch.utils.data import DataLoader
20
+ from torch.utils.tensorboard import SummaryWriter
21
+ import torch.multiprocessing as mp
22
+ import torch.distributed as dist
23
+ from torch.nn.parallel import DistributedDataParallel as DDP
24
+ from torch.cuda.amp import autocast, GradScaler
25
+ from infer_pack import commons
26
+ from time import sleep
27
+ from time import time as ttime
28
+ from data_utils import (
29
+ TextAudioLoaderMultiNSFsid,
30
+ TextAudioLoader,
31
+ TextAudioCollateMultiNSFsid,
32
+ TextAudioCollate,
33
+ DistributedBucketSampler,
34
+ )
35
+
36
+ if hps.version == "v1":
37
+ from infer_pack.models import (
38
+ SynthesizerTrnMs256NSFsid as RVC_Model_f0,
39
+ SynthesizerTrnMs256NSFsid_nono as RVC_Model_nof0,
40
+ MultiPeriodDiscriminator,
41
+ )
42
+ else:
43
+ from infer_pack.models import (
44
+ SynthesizerTrnMs768NSFsid as RVC_Model_f0,
45
+ SynthesizerTrnMs768NSFsid_nono as RVC_Model_nof0,
46
+ MultiPeriodDiscriminatorV2 as MultiPeriodDiscriminator,
47
+ )
48
+ from losses import generator_loss, discriminator_loss, feature_loss, kl_loss
49
+ from mel_processing import mel_spectrogram_torch, spec_to_mel_torch
50
+ from process_ckpt import savee
51
+
52
+ global_step = 0
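+ # Hedged CLI sketch (mirrors the cmd strings assembled by the web UI code above,
+ # not part of the original commit):
+ # python train_nsf_sim_cache_sid_load_pretrain.py -e mi-test -sr 40k -f0 1 -bs 4 -g 0 \
+ #     -te 10 -se 5 -pg pretrained/f0G40k.pth -pd pretrained/f0D40k.pth -l 1 -c 0 -sw 0 -v v1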
53
+
54
+
55
+ class EpochRecorder:
56
+ def __init__(self):
57
+ self.last_time = ttime()
58
+
59
+ def record(self):
60
+ now_time = ttime()
61
+ elapsed_time = now_time - self.last_time
62
+ self.last_time = now_time
63
+ elapsed_time_str = str(datetime.timedelta(seconds=elapsed_time))
64
+ current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
65
+ return f"[{current_time}] | ({elapsed_time_str})"
66
+
67
+
68
+ def main():
69
+ n_gpus = torch.cuda.device_count()
70
+ if not torch.cuda.is_available() and torch.backends.mps.is_available():
71
+ n_gpus = 1
72
+ os.environ["MASTER_ADDR"] = "localhost"
73
+ os.environ["MASTER_PORT"] = str(randint(20000, 55555))
74
+ children = []
75
+ for i in range(n_gpus):
76
+ subproc = mp.Process(
77
+ target=run,
78
+ args=(
79
+ i,
80
+ n_gpus,
81
+ hps,
82
+ ),
83
+ )
84
+ children.append(subproc)
85
+ subproc.start()
86
+
87
+ for i in range(n_gpus):
88
+ children[i].join()
89
+
90
+
91
+ def run(rank, n_gpus, hps):
92
+ global global_step
93
+ if rank == 0:
94
+ logger = utils.get_logger(hps.model_dir)
95
+ logger.info(hps)
96
+ # utils.check_git_hash(hps.model_dir)
97
+ writer = SummaryWriter(log_dir=hps.model_dir)
98
+ writer_eval = SummaryWriter(log_dir=os.path.join(hps.model_dir, "eval"))
99
+
100
+ dist.init_process_group(
101
+ backend="gloo", init_method="env://", world_size=n_gpus, rank=rank
102
+ )
103
+ torch.manual_seed(hps.train.seed)
104
+ if torch.cuda.is_available():
105
+ torch.cuda.set_device(rank)
106
+
107
+ if hps.if_f0 == 1:
108
+ train_dataset = TextAudioLoaderMultiNSFsid(hps.data.training_files, hps.data)
109
+ else:
110
+ train_dataset = TextAudioLoader(hps.data.training_files, hps.data)
111
+ train_sampler = DistributedBucketSampler(
112
+ train_dataset,
113
+ hps.train.batch_size * n_gpus,
114
+ # [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1200,1400], # 16s
115
+ [100, 200, 300, 400, 500, 600, 700, 800, 900], # 16s
116
+ num_replicas=n_gpus,
117
+ rank=rank,
118
+ shuffle=True,
119
+ )
120
+ # It is possible that dataloader's workers are out of shared memory. Please try to raise your shared memory limit.
121
+ # num_workers=8 -> num_workers=4
122
+ if hps.if_f0 == 1:
123
+ collate_fn = TextAudioCollateMultiNSFsid()
124
+ else:
125
+ collate_fn = TextAudioCollate()
126
+ train_loader = DataLoader(
127
+ train_dataset,
128
+ num_workers=4,
129
+ shuffle=False,
130
+ pin_memory=True,
131
+ collate_fn=collate_fn,
132
+ batch_sampler=train_sampler,
133
+ persistent_workers=True,
134
+ prefetch_factor=8,
135
+ )
136
+ if hps.if_f0 == 1:
137
+ net_g = RVC_Model_f0(
138
+ hps.data.filter_length // 2 + 1,
139
+ hps.train.segment_size // hps.data.hop_length,
140
+ **hps.model,
141
+ is_half=hps.train.fp16_run,
142
+ sr=hps.sample_rate,
143
+ )
144
+ else:
145
+ net_g = RVC_Model_nof0(
146
+ hps.data.filter_length // 2 + 1,
147
+ hps.train.segment_size // hps.data.hop_length,
148
+ **hps.model,
149
+ is_half=hps.train.fp16_run,
150
+ )
151
+ if torch.cuda.is_available():
152
+ net_g = net_g.cuda(rank)
153
+ net_d = MultiPeriodDiscriminator(hps.model.use_spectral_norm)
154
+ if torch.cuda.is_available():
155
+ net_d = net_d.cuda(rank)
156
+ optim_g = torch.optim.AdamW(
157
+ net_g.parameters(),
158
+ hps.train.learning_rate,
159
+ betas=hps.train.betas,
160
+ eps=hps.train.eps,
161
+ )
162
+ optim_d = torch.optim.AdamW(
163
+ net_d.parameters(),
164
+ hps.train.learning_rate,
165
+ betas=hps.train.betas,
166
+ eps=hps.train.eps,
167
+ )
168
+ # net_g = DDP(net_g, device_ids=[rank], find_unused_parameters=True)
169
+ # net_d = DDP(net_d, device_ids=[rank], find_unused_parameters=True)
170
+ if torch.cuda.is_available():
171
+ net_g = DDP(net_g, device_ids=[rank])
172
+ net_d = DDP(net_d, device_ids=[rank])
173
+ else:
174
+ net_g = DDP(net_g)
175
+ net_d = DDP(net_d)
176
+
177
+ try: # auto-resume if a checkpoint can be loaded
178
+ _, _, _, epoch_str = utils.load_checkpoint(
179
+ utils.latest_checkpoint_path(hps.model_dir, "D_*.pth"), net_d, optim_d
180
+ ) # loading D usually works fine
181
+ if rank == 0:
182
+ logger.info("loaded D")
183
+ # _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"), net_g, optim_g,load_opt=0)
184
+ _, _, _, epoch_str = utils.load_checkpoint(
185
+ utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"), net_g, optim_g
186
+ )
187
+ global_step = (epoch_str - 1) * len(train_loader)
188
+ # epoch_str = 1
189
+ # global_step = 0
190
+ except: # if nothing can be loaded on the first run, load the pretrained weights
191
+ # traceback.print_exc()
192
+ epoch_str = 1
193
+ global_step = 0
194
+ if hps.pretrainG != "":
195
+ if rank == 0:
196
+ logger.info("loaded pretrained %s" % (hps.pretrainG))
197
+ print(
198
+ net_g.module.load_state_dict(
199
+ torch.load(hps.pretrainG, map_location="cpu")["model"]
200
+ )
201
+ ) ## test: do not load the optimizer
202
+ if hps.pretrainD != "":
203
+ if rank == 0:
204
+ logger.info("loaded pretrained %s" % (hps.pretrainD))
205
+ print(
206
+ net_d.module.load_state_dict(
207
+ torch.load(hps.pretrainD, map_location="cpu")["model"]
208
+ )
209
+ )
210
+
211
+ scheduler_g = torch.optim.lr_scheduler.ExponentialLR(
212
+ optim_g, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2
213
+ )
214
+ scheduler_d = torch.optim.lr_scheduler.ExponentialLR(
215
+ optim_d, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2
216
+ )
217
+
218
+ scaler = GradScaler(enabled=hps.train.fp16_run)
219
+
220
+ cache = []
221
+ for epoch in range(epoch_str, hps.train.epochs + 1):
222
+ if rank == 0:
223
+ train_and_evaluate(
224
+ rank,
225
+ epoch,
226
+ hps,
227
+ [net_g, net_d],
228
+ [optim_g, optim_d],
229
+ [scheduler_g, scheduler_d],
230
+ scaler,
231
+ [train_loader, None],
232
+ logger,
233
+ [writer, writer_eval],
234
+ cache,
235
+ )
236
+ else:
237
+ train_and_evaluate(
238
+ rank,
239
+ epoch,
240
+ hps,
241
+ [net_g, net_d],
242
+ [optim_g, optim_d],
243
+ [scheduler_g, scheduler_d],
244
+ scaler,
245
+ [train_loader, None],
246
+ None,
247
+ None,
248
+ cache,
249
+ )
250
+ scheduler_g.step()
251
+ scheduler_d.step()
252
+
253
+
254
+ def train_and_evaluate(
255
+ rank, epoch, hps, nets, optims, schedulers, scaler, loaders, logger, writers, cache
256
+ ):
257
+ net_g, net_d = nets
258
+ optim_g, optim_d = optims
259
+ train_loader, eval_loader = loaders
260
+ if writers is not None:
261
+ writer, writer_eval = writers
262
+
263
+ train_loader.batch_sampler.set_epoch(epoch)
264
+ global global_step
265
+
266
+ net_g.train()
267
+ net_d.train()
268
+
269
+ # Prepare data iterator
270
+ if hps.if_cache_data_in_gpu == True:
271
+ # Use Cache
272
+ data_iterator = cache
273
+ if cache == []:
274
+ # Make new cache
275
+ for batch_idx, info in enumerate(train_loader):
276
+ # Unpack
277
+ if hps.if_f0 == 1:
278
+ (
279
+ phone,
280
+ phone_lengths,
281
+ pitch,
282
+ pitchf,
283
+ spec,
284
+ spec_lengths,
285
+ wave,
286
+ wave_lengths,
287
+ sid,
288
+ ) = info
289
+ else:
290
+ (
291
+ phone,
292
+ phone_lengths,
293
+ spec,
294
+ spec_lengths,
295
+ wave,
296
+ wave_lengths,
297
+ sid,
298
+ ) = info
299
+ # Load on CUDA
300
+ if torch.cuda.is_available():
301
+ phone = phone.cuda(rank, non_blocking=True)
302
+ phone_lengths = phone_lengths.cuda(rank, non_blocking=True)
303
+ if hps.if_f0 == 1:
304
+ pitch = pitch.cuda(rank, non_blocking=True)
305
+ pitchf = pitchf.cuda(rank, non_blocking=True)
306
+ sid = sid.cuda(rank, non_blocking=True)
307
+ spec = spec.cuda(rank, non_blocking=True)
308
+ spec_lengths = spec_lengths.cuda(rank, non_blocking=True)
309
+ wave = wave.cuda(rank, non_blocking=True)
310
+ wave_lengths = wave_lengths.cuda(rank, non_blocking=True)
311
+ # Cache on list
312
+ if hps.if_f0 == 1:
313
+ cache.append(
314
+ (
315
+ batch_idx,
316
+ (
317
+ phone,
318
+ phone_lengths,
319
+ pitch,
320
+ pitchf,
321
+ spec,
322
+ spec_lengths,
323
+ wave,
324
+ wave_lengths,
325
+ sid,
326
+ ),
327
+ )
328
+ )
329
+ else:
330
+ cache.append(
331
+ (
332
+ batch_idx,
333
+ (
334
+ phone,
335
+ phone_lengths,
336
+ spec,
337
+ spec_lengths,
338
+ wave,
339
+ wave_lengths,
340
+ sid,
341
+ ),
342
+ )
343
+ )
344
+ else:
345
+ # Load shuffled cache
346
+ shuffle(cache)
347
+ else:
348
+ # Loader
349
+ data_iterator = enumerate(train_loader)
350
+
351
+ # Run steps
352
+ epoch_recorder = EpochRecorder()
353
+ for batch_idx, info in data_iterator:
354
+ # Data
355
+ ## Unpack
356
+ if hps.if_f0 == 1:
357
+ (
358
+ phone,
359
+ phone_lengths,
360
+ pitch,
361
+ pitchf,
362
+ spec,
363
+ spec_lengths,
364
+ wave,
365
+ wave_lengths,
366
+ sid,
367
+ ) = info
368
+ else:
369
+ phone, phone_lengths, spec, spec_lengths, wave, wave_lengths, sid = info
370
+ ## Load on CUDA
371
+ if (hps.if_cache_data_in_gpu == False) and torch.cuda.is_available():
372
+ phone = phone.cuda(rank, non_blocking=True)
373
+ phone_lengths = phone_lengths.cuda(rank, non_blocking=True)
374
+ if hps.if_f0 == 1:
375
+ pitch = pitch.cuda(rank, non_blocking=True)
376
+ pitchf = pitchf.cuda(rank, non_blocking=True)
377
+ sid = sid.cuda(rank, non_blocking=True)
378
+ spec = spec.cuda(rank, non_blocking=True)
379
+ spec_lengths = spec_lengths.cuda(rank, non_blocking=True)
380
+ wave = wave.cuda(rank, non_blocking=True)
381
+ # wave_lengths = wave_lengths.cuda(rank, non_blocking=True)
382
+
383
+ # Calculate
384
+ with autocast(enabled=hps.train.fp16_run):
385
+ if hps.if_f0 == 1:
386
+ (
387
+ y_hat,
388
+ ids_slice,
389
+ x_mask,
390
+ z_mask,
391
+ (z, z_p, m_p, logs_p, m_q, logs_q),
392
+ ) = net_g(phone, phone_lengths, pitch, pitchf, spec, spec_lengths, sid)
393
+ else:
394
+ (
395
+ y_hat,
396
+ ids_slice,
397
+ x_mask,
398
+ z_mask,
399
+ (z, z_p, m_p, logs_p, m_q, logs_q),
400
+ ) = net_g(phone, phone_lengths, spec, spec_lengths, sid)
401
+ mel = spec_to_mel_torch(
402
+ spec,
403
+ hps.data.filter_length,
404
+ hps.data.n_mel_channels,
405
+ hps.data.sampling_rate,
406
+ hps.data.mel_fmin,
407
+ hps.data.mel_fmax,
408
+ )
409
+ y_mel = commons.slice_segments(
410
+ mel, ids_slice, hps.train.segment_size // hps.data.hop_length
411
+ )
412
+ with autocast(enabled=False):
413
+ y_hat_mel = mel_spectrogram_torch(
414
+ y_hat.float().squeeze(1),
415
+ hps.data.filter_length,
416
+ hps.data.n_mel_channels,
417
+ hps.data.sampling_rate,
418
+ hps.data.hop_length,
419
+ hps.data.win_length,
420
+ hps.data.mel_fmin,
421
+ hps.data.mel_fmax,
422
+ )
423
+ if hps.train.fp16_run == True:
424
+ y_hat_mel = y_hat_mel.half()
425
+ wave = commons.slice_segments(
426
+ wave, ids_slice * hps.data.hop_length, hps.train.segment_size
427
+ ) # slice
428
+
429
+ # Discriminator
430
+ y_d_hat_r, y_d_hat_g, _, _ = net_d(wave, y_hat.detach())
431
+ with autocast(enabled=False):
432
+ loss_disc, losses_disc_r, losses_disc_g = discriminator_loss(
433
+ y_d_hat_r, y_d_hat_g
434
+ )
435
+ optim_d.zero_grad()
436
+ scaler.scale(loss_disc).backward()
437
+ scaler.unscale_(optim_d)
438
+ grad_norm_d = commons.clip_grad_value_(net_d.parameters(), None)
439
+ scaler.step(optim_d)
440
+
441
+ with autocast(enabled=hps.train.fp16_run):
442
+ # Generator
443
+ y_d_hat_r, y_d_hat_g, fmap_r, fmap_g = net_d(wave, y_hat)
444
+ with autocast(enabled=False):
445
+ loss_mel = F.l1_loss(y_mel, y_hat_mel) * hps.train.c_mel
446
+ loss_kl = kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * hps.train.c_kl
447
+ loss_fm = feature_loss(fmap_r, fmap_g)
448
+ loss_gen, losses_gen = generator_loss(y_d_hat_g)
449
+ loss_gen_all = loss_gen + loss_fm + loss_mel + loss_kl
450
+ optim_g.zero_grad()
451
+ scaler.scale(loss_gen_all).backward()
452
+ scaler.unscale_(optim_g)
453
+ grad_norm_g = commons.clip_grad_value_(net_g.parameters(), None)
454
+ scaler.step(optim_g)
455
+ scaler.update()
456
+
457
+ if rank == 0:
458
+ if global_step % hps.train.log_interval == 0:
459
+ lr = optim_g.param_groups[0]["lr"]
460
+ logger.info(
461
+ "Train Epoch: {} [{:.0f}%]".format(
462
+ epoch, 100.0 * batch_idx / len(train_loader)
463
+ )
464
+ )
465
+ # Amor For Tensorboard display
466
+ if loss_mel > 75:
467
+ loss_mel = 75
468
+ if loss_kl > 9:
469
+ loss_kl = 9
470
+
471
+ logger.info([global_step, lr])
472
+ logger.info(
473
+ f"loss_disc={loss_disc:.3f}, loss_gen={loss_gen:.3f}, loss_fm={loss_fm:.3f},loss_mel={loss_mel:.3f}, loss_kl={loss_kl:.3f}"
474
+ )
475
+ scalar_dict = {
476
+ "loss/g/total": loss_gen_all,
477
+ "loss/d/total": loss_disc,
478
+ "learning_rate": lr,
479
+ "grad_norm_d": grad_norm_d,
480
+ "grad_norm_g": grad_norm_g,
481
+ }
482
+ scalar_dict.update(
483
+ {
484
+ "loss/g/fm": loss_fm,
485
+ "loss/g/mel": loss_mel,
486
+ "loss/g/kl": loss_kl,
487
+ }
488
+ )
489
+
490
+ scalar_dict.update(
491
+ {"loss/g/{}".format(i): v for i, v in enumerate(losses_gen)}
492
+ )
493
+ scalar_dict.update(
494
+ {"loss/d_r/{}".format(i): v for i, v in enumerate(losses_disc_r)}
495
+ )
496
+ scalar_dict.update(
497
+ {"loss/d_g/{}".format(i): v for i, v in enumerate(losses_disc_g)}
498
+ )
499
+ image_dict = {
500
+ "slice/mel_org": utils.plot_spectrogram_to_numpy(
501
+ y_mel[0].data.cpu().numpy()
502
+ ),
503
+ "slice/mel_gen": utils.plot_spectrogram_to_numpy(
504
+ y_hat_mel[0].data.cpu().numpy()
505
+ ),
506
+ "all/mel": utils.plot_spectrogram_to_numpy(
507
+ mel[0].data.cpu().numpy()
508
+ ),
509
+ }
510
+ utils.summarize(
511
+ writer=writer,
512
+ global_step=global_step,
513
+ images=image_dict,
514
+ scalars=scalar_dict,
515
+ )
516
+ global_step += 1
517
+ # /Run steps
518
+
519
+ if epoch % hps.save_every_epoch == 0 and rank == 0:
520
+ if hps.if_latest == 0:
521
+ utils.save_checkpoint(
522
+ net_g,
523
+ optim_g,
524
+ hps.train.learning_rate,
525
+ epoch,
526
+ os.path.join(hps.model_dir, "G_{}.pth".format(global_step)),
527
+ )
528
+ utils.save_checkpoint(
529
+ net_d,
530
+ optim_d,
531
+ hps.train.learning_rate,
532
+ epoch,
533
+ os.path.join(hps.model_dir, "D_{}.pth".format(global_step)),
534
+ )
535
+ else:
536
+ utils.save_checkpoint(
537
+ net_g,
538
+ optim_g,
539
+ hps.train.learning_rate,
540
+ epoch,
541
+ os.path.join(hps.model_dir, "G_{}.pth".format(2333333)),
542
+ )
543
+ utils.save_checkpoint(
544
+ net_d,
545
+ optim_d,
546
+ hps.train.learning_rate,
547
+ epoch,
548
+ os.path.join(hps.model_dir, "D_{}.pth".format(2333333)),
549
+ )
550
+ if rank == 0 and hps.save_every_weights == "1":
551
+ if hasattr(net_g, "module"):
552
+ ckpt = net_g.module.state_dict()
553
+ else:
554
+ ckpt = net_g.state_dict()
555
+ logger.info(
556
+ "saving ckpt %s_e%s:%s"
557
+ % (
558
+ hps.name,
559
+ epoch,
560
+ savee(
561
+ ckpt,
562
+ hps.sample_rate,
563
+ hps.if_f0,
564
+ hps.name + "_e%s_s%s" % (epoch, global_step),
565
+ epoch,
566
+ hps.version,
567
+ hps,
568
+ ),
569
+ )
570
+ )
571
+
572
+ if rank == 0:
573
+ logger.info("====> Epoch: {} {}".format(epoch, epoch_recorder.record()))
574
+ if epoch >= hps.total_epoch and rank == 0:
575
+ logger.info("Training is done. The program is closed.")
576
+
577
+ if hasattr(net_g, "module"):
578
+ ckpt = net_g.module.state_dict()
579
+ else:
580
+ ckpt = net_g.state_dict()
581
+ logger.info(
582
+ "saving final ckpt:%s"
583
+ % (
584
+ savee(
585
+ ckpt, hps.sample_rate, hps.if_f0, hps.name, epoch, hps.version, hps
586
+ )
587
+ )
588
+ )
589
+ sleep(1)
590
+ os._exit(2333333)
591
+
592
+
593
+ if __name__ == "__main__":
594
+ torch.multiprocessing.set_start_method("spawn")
595
+ main()
trainset_preprocess_pipeline_print.py ADDED
@@ -0,0 +1,139 @@
1
+ import sys, os, multiprocessing
2
+ from scipy import signal
3
+
4
+ now_dir = os.getcwd()
5
+ sys.path.append(now_dir)
6
+
7
+ inp_root = sys.argv[1]
8
+ sr = int(sys.argv[2])
9
+ n_p = int(sys.argv[3])
10
+ exp_dir = sys.argv[4]
11
+ noparallel = sys.argv[5] == "True"
12
+ import numpy as np, traceback
13
+ from slicer2 import Slicer
14
+ import librosa
15
+ from scipy.io import wavfile
17
+ from my_utils import load_audio
18
+
19
+ mutex = multiprocessing.Lock()
20
+ f = open("%s/preprocess.log" % exp_dir, "a+")
21
+
22
+
23
+ def println(strr):
24
+ mutex.acquire()
25
+ print(strr)
26
+ f.write("%s\n" % strr)
27
+ f.flush()
28
+ mutex.release()
29
+
30
+
31
+ class PreProcess:
32
+ def __init__(self, sr, exp_dir):
33
+ self.slicer = Slicer(
34
+ sr=sr,
35
+ threshold=-42,
36
+ min_length=1500,
37
+ min_interval=400,
38
+ hop_size=15,
39
+ max_sil_kept=500,
40
+ )
41
+ self.sr = sr
42
+ self.bh, self.ah = signal.butter(N=5, Wn=48, btype="high", fs=self.sr)
43
+ self.per = 3.0
44
+ self.overlap = 0.3
45
+ self.tail = self.per + self.overlap
46
+ self.max = 0.9
47
+ self.alpha = 0.75
48
+ self.exp_dir = exp_dir
49
+ self.gt_wavs_dir = "%s/0_gt_wavs" % exp_dir
50
+ self.wavs16k_dir = "%s/1_16k_wavs" % exp_dir
51
+ os.makedirs(self.exp_dir, exist_ok=True)
52
+ os.makedirs(self.gt_wavs_dir, exist_ok=True)
53
+ os.makedirs(self.wavs16k_dir, exist_ok=True)
54
+
55
+ def norm_write(self, tmp_audio, idx0, idx1):
56
+ tmp_max = np.abs(tmp_audio).max()
57
+ if tmp_max > 2.5:
58
+ print("%s-%s-%s-filtered" % (idx0, idx1, tmp_max))
59
+ return
60
+ tmp_audio = (tmp_audio / tmp_max * (self.max * self.alpha)) + (
61
+ 1 - self.alpha
62
+ ) * tmp_audio
63
+ wavfile.write(
64
+ "%s/%s_%s.wav" % (self.gt_wavs_dir, idx0, idx1),
65
+ self.sr,
66
+ tmp_audio.astype(np.float32),
67
+ )
68
+ tmp_audio = librosa.resample(
69
+ tmp_audio, orig_sr=self.sr, target_sr=16000
70
+ ) # , res_type="soxr_vhq"
71
+ wavfile.write(
72
+ "%s/%s_%s.wav" % (self.wavs16k_dir, idx0, idx1),
73
+ 16000,
74
+ tmp_audio.astype(np.float32),
75
+ )
76
+
77
+ def pipeline(self, path, idx0):
78
+ try:
79
+ audio = load_audio(path, self.sr)
80
+ # zero phased digital filter cause pre-ringing noise...
81
+ # audio = signal.filtfilt(self.bh, self.ah, audio)
82
+ audio = signal.lfilter(self.bh, self.ah, audio)
83
+
84
+ idx1 = 0
85
+ for audio in self.slicer.slice(audio):
86
+ i = 0
87
+ while 1:
88
+ start = int(self.sr * (self.per - self.overlap) * i)
89
+ i += 1
90
+ if len(audio[start:]) > self.tail * self.sr:
91
+ tmp_audio = audio[start : start + int(self.per * self.sr)]
92
+ self.norm_write(tmp_audio, idx0, idx1)
93
+ idx1 += 1
94
+ else:
95
+ tmp_audio = audio[start:]
96
+ idx1 += 1
97
+ break
98
+ self.norm_write(tmp_audio, idx0, idx1)
99
+ println("%s->Suc." % path)
100
+ except:
101
+ println("%s->%s" % (path, traceback.format_exc()))
102
+
103
+ def pipeline_mp(self, infos):
104
+ for path, idx0 in infos:
105
+ self.pipeline(path, idx0)
106
+
107
+ def pipeline_mp_inp_dir(self, inp_root, n_p):
108
+ try:
109
+ infos = [
110
+ ("%s/%s" % (inp_root, name), idx)
111
+ for idx, name in enumerate(sorted(list(os.listdir(inp_root))))
112
+ ]
113
+ if noparallel:
114
+ for i in range(n_p):
115
+ self.pipeline_mp(infos[i::n_p])
116
+ else:
117
+ ps = []
118
+ for i in range(n_p):
119
+ p = multiprocessing.Process(
120
+ target=self.pipeline_mp, args=(infos[i::n_p],)
121
+ )
122
+ ps.append(p)
123
+ p.start()
124
+ for i in range(n_p):
125
+ ps[i].join()
126
+ except:
127
+ println("Fail. %s" % traceback.format_exc())
128
+
129
+
130
+ def preprocess_trainset(inp_root, sr, n_p, exp_dir):
131
+ pp = PreProcess(sr, exp_dir)
132
+ println("start preprocess")
133
+ println(sys.argv)
134
+ pp.pipeline_mp_inp_dir(inp_root, n_p)
135
+ println("end preprocess")
136
+
137
+
138
+ if __name__ == "__main__":
139
+ preprocess_trainset(inp_root, sr, n_p, exp_dir)
vc_infer_pipeline.py ADDED
@@ -0,0 +1,449 @@
1
+ import numpy as np, parselmouth, torch, pdb
2
+ from time import time as ttime
3
+ import torch.nn.functional as F
4
+ import scipy.signal as signal
5
+ import pyworld, os, traceback, faiss, librosa, torchcrepe
7
+ from functools import lru_cache
8
+ import logging
9
+ bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000)
10
+
11
+ input_audio_path2wav = {}
12
+
13
+
14
+ @lru_cache
15
+ def cache_harvest_f0(input_audio_path, fs, f0max, f0min, frame_period):
16
+ audio = input_audio_path2wav[input_audio_path]
17
+ f0, t = pyworld.harvest(
18
+ audio,
19
+ fs=fs,
20
+ f0_ceil=f0max,
21
+ f0_floor=f0min,
22
+ frame_period=frame_period,
23
+ )
24
+ f0 = pyworld.stonemask(audio, f0, t, fs)
25
+ return f0
26
+
27
+
28
+ def change_rms(data1, sr1, data2, sr2, rate): # 1 = input audio, 2 = output audio, rate = weight of 2
29
+ # print(data1.max(),data2.max())
30
+ rms1 = librosa.feature.rms(
31
+ y=data1, frame_length=sr1 // 2 * 2, hop_length=sr1 // 2
32
+ ) # one point every half second
33
+ rms2 = librosa.feature.rms(y=data2, frame_length=sr2 // 2 * 2, hop_length=sr2 // 2)
34
+ rms1 = torch.from_numpy(rms1)
35
+ rms1 = F.interpolate(
36
+ rms1.unsqueeze(0), size=data2.shape[0], mode="linear"
37
+ ).squeeze()
38
+ rms2 = torch.from_numpy(rms2)
39
+ rms2 = F.interpolate(
40
+ rms2.unsqueeze(0), size=data2.shape[0], mode="linear"
41
+ ).squeeze()
42
+ rms2 = torch.max(rms2, torch.zeros_like(rms2) + 1e-6)
43
+ data2 *= (
44
+ torch.pow(rms1, torch.tensor(1 - rate))
45
+ * torch.pow(rms2, torch.tensor(rate - 1))
46
+ ).numpy()
47
+ return data2
48
+
49
+
50
+ class VC(object):
51
+ def __init__(self, tgt_sr, config):
52
+ self.x_pad, self.x_query, self.x_center, self.x_max, self.is_half = (
53
+ config.x_pad,
54
+ config.x_query,
55
+ config.x_center,
56
+ config.x_max,
57
+ config.is_half,
58
+ )
59
+ self.sr = 16000 # hubert input sample rate
60
+ self.window = 160 # samples per frame
61
+ self.t_pad = self.sr * self.x_pad # padding added before/after each segment
62
+ self.t_pad_tgt = tgt_sr * self.x_pad
63
+ self.t_pad2 = self.t_pad * 2
64
+ self.t_query = self.sr * self.x_query # query window before/after each cut point
65
+ self.t_center = self.sr * self.x_center # spacing between cut points
66
+ self.t_max = self.sr * self.x_max # duration threshold below which no cut-point search is done
67
+ self.device = config.device
68
+
69
+ def get_f0(
70
+ self,
71
+ input_audio_path,
72
+ x,
73
+ p_len,
74
+ f0_up_key,
75
+ f0_method,
76
+ filter_radius,
77
+ inp_f0=None,
78
+ ):
79
+ logging.info("start get_f0")
80
+ global input_audio_path2wav
81
+ time_step = self.window / self.sr * 1000
82
+ f0_min = 50
83
+ f0_max = 1100
84
+ f0_mel_min = 1127 * np.log(1 + f0_min / 700)
85
+ f0_mel_max = 1127 * np.log(1 + f0_max / 700)
86
+ logging.info("before if_method == pm")
87
+ if f0_method == "pm":
88
+ f0 = (
89
+ parselmouth.Sound(x, self.sr)
90
+ .to_pitch_ac(
91
+ time_step=time_step / 1000,
92
+ voicing_threshold=0.6,
93
+ pitch_floor=f0_min,
94
+ pitch_ceiling=f0_max,
95
+ )
96
+ .selected_array["frequency"]
97
+ )
98
+ pad_size = (p_len - len(f0) + 1) // 2
99
+ if pad_size > 0 or p_len - len(f0) - pad_size > 0:
100
+ f0 = np.pad(
101
+ f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant"
102
+ )
103
+ elif f0_method == "harvest":
104
+ input_audio_path2wav[input_audio_path] = x.astype(np.double)
105
+ f0 = cache_harvest_f0(input_audio_path, self.sr, f0_max, f0_min, 10)
106
+ if filter_radius > 2:
107
+ f0 = signal.medfilt(f0, 3)
108
+ elif f0_method == "crepe":
109
+ logging.info("inside elif if_method == crepe")
110
+ model = "full"
111
+ # Pick a batch size that doesn't cause memory errors on your gpu
112
+ batch_size = 512
113
+ # Compute pitch using first gpu
114
+ audio = torch.tensor(np.copy(x))[None].float()
115
+ logging.info("before torchcrepe.predict")
116
+ f0, pd = torchcrepe.predict(
117
+ audio,
118
+ self.sr,
119
+ self.window,
120
+ f0_min,
121
+ f0_max,
122
+ model,
123
+ batch_size=batch_size,
124
+ device=self.device,
125
+ return_periodicity=True,
126
+ )
127
+ logging.info("after torchcrepe.predict")
128
+ pd = torchcrepe.filter.median(pd, 3)
129
+ f0 = torchcrepe.filter.mean(f0, 3)
130
+ f0[pd < 0.1] = 0
131
+ f0 = f0[0].cpu().numpy()
132
+ f0 *= pow(2, f0_up_key / 12)
133
+ # with open("test.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
134
+ tf0 = self.sr // self.window # f0 points per second
135
+ if inp_f0 is not None:
136
+ delta_t = np.round(
137
+ (inp_f0[:, 0].max() - inp_f0[:, 0].min()) * tf0 + 1
138
+ ).astype("int16")
139
+ replace_f0 = np.interp(
140
+ list(range(delta_t)), inp_f0[:, 0] * 100, inp_f0[:, 1]
141
+ )
142
+ shape = f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)].shape[0]
143
+ f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)] = replace_f0[
144
+ :shape
145
+ ]
146
+ # with open("test_opt.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
147
+ f0bak = f0.copy()
148
+ f0_mel = 1127 * np.log(1 + f0 / 700)
149
+ f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
150
+ f0_mel_max - f0_mel_min
151
+ ) + 1
152
+ f0_mel[f0_mel <= 1] = 1
153
+ f0_mel[f0_mel > 255] = 255
154
+ f0_coarse = np.rint(f0_mel).astype(int)  # np.int is removed in modern NumPy
155
+ return f0_coarse, f0bak # 1-0
156
+
157
+ def vc(
158
+ self,
159
+ model,
160
+ net_g,
161
+ sid,
162
+ audio0,
163
+ pitch,
164
+ pitchf,
165
+ times,
166
+ index,
167
+ big_npy,
168
+ index_rate,
169
+ version,
170
+ protect,
171
+ ): # ,file_index,file_big_npy
172
+ feats = torch.from_numpy(audio0)
173
+ if self.is_half:
174
+ feats = feats.half()
175
+ else:
176
+ feats = feats.float()
177
+ if feats.dim() == 2: # double channels
178
+ feats = feats.mean(-1)
179
+ assert feats.dim() == 1, feats.dim()
180
+ feats = feats.view(1, -1)
181
+ padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False)
182
+
183
+ inputs = {
184
+ "source": feats.to(self.device),
185
+ "padding_mask": padding_mask,
186
+ "output_layer": 9 if version == "v1" else 12,
187
+ }
188
+ t0 = ttime()
189
+ with torch.no_grad():
190
+ logits = model.extract_features(**inputs)
191
+ feats = model.final_proj(logits[0]) if version == "v1" else logits[0]
192
+ if protect < 0.5 and pitch != None and pitchf != None:
193
+ feats0 = feats.clone()
194
+ if (
195
+ index is not None
196
+ and big_npy is not None
197
+ and index_rate != 0
198
+ ):
199
+ npy = feats[0].cpu().numpy()
200
+ if self.is_half:
201
+ npy = npy.astype("float32")
202
+
203
+ # _, I = index.search(npy, 1)
204
+ # npy = big_npy[I.squeeze()]
205
+
206
+ score, ix = index.search(npy, k=8)
207
+ weight = np.square(1 / score)
208
+ weight /= weight.sum(axis=1, keepdims=True)
209
+ npy = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)
210
+
211
+ if self.is_half:
212
+ npy = npy.astype("float16")
213
+ feats = (
214
+ torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate
215
+ + (1 - index_rate) * feats
216
+ )
217
+
218
+ feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
219
+ if protect < 0.5 and pitch != None and pitchf != None:
220
+ feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute(
221
+ 0, 2, 1
222
+ )
223
+ t1 = ttime()
224
+ p_len = audio0.shape[0] // self.window
225
+ if feats.shape[1] < p_len:
226
+ p_len = feats.shape[1]
227
+ if pitch != None and pitchf != None:
228
+ pitch = pitch[:, :p_len]
229
+ pitchf = pitchf[:, :p_len]
230
+
231
+ if protect < 0.5 and pitch != None and pitchf != None:
232
+ pitchff = pitchf.clone()
233
+ pitchff[pitchf > 0] = 1
234
+ pitchff[pitchf < 1] = protect
235
+ pitchff = pitchff.unsqueeze(-1)
236
+ feats = feats * pitchff + feats0 * (1 - pitchff)
237
+ feats = feats.to(feats0.dtype)
238
+ p_len = torch.tensor([p_len], device=self.device).long()
239
+ with torch.no_grad():
240
+ if pitch != None and pitchf != None:
241
+ audio1 = (
242
+ (net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0])
243
+ .data.cpu()
244
+ .float()
245
+ .numpy()
246
+ )
247
+ else:
248
+ audio1 = (
249
+ (net_g.infer(feats, p_len, sid)[0][0, 0]).data.cpu().float().numpy()
250
+ )
251
+ del feats, p_len, padding_mask
252
+ if torch.cuda.is_available():
253
+ torch.cuda.empty_cache()
254
+ t2 = ttime()
255
+ times[0] += t1 - t0
256
+ times[2] += t2 - t1
257
+ return audio1
258
+
259
+ def pipeline(
260
+ self,
261
+ model,
262
+ net_g,
263
+ sid,
264
+ audio,
265
+ input_audio_path,
266
+ times,
267
+ f0_up_key,
268
+ f0_method,
269
+ file_index,
270
+ # file_big_npy,
271
+ index_rate,
272
+ if_f0,
273
+ filter_radius,
274
+ tgt_sr,
275
+ resample_sr,
276
+ rms_mix_rate,
277
+ version,
278
+ protect,
279
+ f0_file=None,
280
+ ):
281
+ logging.info("pipeline starts")
282
+ if (
283
+ file_index != ""
284
+ # and file_big_npy != ""
285
+ # and os.path.exists(file_big_npy) == True
286
+ and os.path.exists(file_index) == True
287
+ and index_rate != 0
288
+ ):
289
+ try:
290
+ index = faiss.read_index(file_index)
291
+ # big_npy = np.load(file_big_npy)
292
+ big_npy = index.reconstruct_n(0, index.ntotal)
293
+ except:
294
+ traceback.print_exc()
295
+ index = big_npy = None
296
+ else:
297
+ index = big_npy = None
298
+ logging.info("index: ")
299
+ audio = signal.filtfilt(bh, ah, audio)
300
+ audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect")
301
+ opt_ts = []
302
+ if audio_pad.shape[0] > self.t_max:
303
+ audio_sum = np.zeros_like(audio)
304
+ for i in range(self.window):
305
+ audio_sum += audio_pad[i : i - self.window]
306
+ for t in range(self.t_center, audio.shape[0], self.t_center):
307
+ opt_ts.append(
308
+ t
309
+ - self.t_query
310
+ + np.where(
311
+ np.abs(audio_sum[t - self.t_query : t + self.t_query])
312
+ == np.abs(audio_sum[t - self.t_query : t + self.t_query]).min()
313
+ )[0][0]
314
+ )
315
+ logging.info("opt_ts: ")
316
+ s = 0
317
+ audio_opt = []
318
+ t = None
319
+ t1 = ttime()
320
+ audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect")
321
+ p_len = audio_pad.shape[0] // self.window
322
+ inp_f0 = None
323
+ if hasattr(f0_file, "name") == True:
324
+ try:
325
+ with open(f0_file.name, "r") as f:
326
+ lines = f.read().strip("\n").split("\n")
327
+ inp_f0 = []
328
+ for line in lines:
329
+ inp_f0.append([float(i) for i in line.split(",")])
330
+ inp_f0 = np.array(inp_f0, dtype="float32")
331
+ except:
332
+ traceback.print_exc()
333
+ sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
334
+ logging.info("sid: ")
335
+ pitch, pitchf = None, None
336
+ logging.info("if_f0: ")
337
+ if if_f0 == 1:
338
+ logging.info("inside if_f0: ")
339
+ pitch, pitchf = self.get_f0(
340
+ input_audio_path,
341
+ audio_pad,
342
+ p_len,
343
+ f0_up_key,
344
+ f0_method,
345
+ filter_radius,
346
+ inp_f0,
347
+ )
348
+ logging.info("after get_f0")
349
+ pitch = pitch[:p_len]
350
+ pitchf = pitchf[:p_len]
351
+ if self.device == "mps":
352
+ pitchf = pitchf.astype(np.float32)
353
+ logging.info("12345")
354
+ pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()
355
+ pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float()
356
+ logging.info("6789")
357
+ logging.info("after if_f0: ")
358
+ t2 = ttime()
359
+ times[1] += t2 - t1
360
+ for t in opt_ts:
361
+ t = t // self.window * self.window
362
+ if if_f0 == 1:
363
+ audio_opt.append(
364
+ self.vc(
365
+ model,
366
+ net_g,
367
+ sid,
368
+ audio_pad[s : t + self.t_pad2 + self.window],
369
+ pitch[:, s // self.window : (t + self.t_pad2) // self.window],
370
+ pitchf[:, s // self.window : (t + self.t_pad2) // self.window],
371
+ times,
372
+ index,
373
+ big_npy,
374
+ index_rate,
375
+ version,
376
+ protect,
377
+ )[self.t_pad_tgt : -self.t_pad_tgt]
378
+ )
379
+ else:
380
+ audio_opt.append(
381
+ self.vc(
382
+ model,
383
+ net_g,
384
+ sid,
385
+ audio_pad[s : t + self.t_pad2 + self.window],
386
+ None,
387
+ None,
388
+ times,
389
+ index,
390
+ big_npy,
391
+ index_rate,
392
+ version,
393
+ protect,
394
+ )[self.t_pad_tgt : -self.t_pad_tgt]
395
+ )
396
+ s = t
397
+
398
+ if if_f0 == 1:
399
+ audio_opt.append(
400
+ self.vc(
401
+ model,
402
+ net_g,
403
+ sid,
404
+ audio_pad[t:],
405
+ pitch[:, t // self.window :] if t is not None else pitch,
406
+ pitchf[:, t // self.window :] if t is not None else pitchf,
407
+ times,
408
+ index,
409
+ big_npy,
410
+ index_rate,
411
+ version,
412
+ protect,
413
+ )[self.t_pad_tgt : -self.t_pad_tgt]
414
+ )
415
+ else:
416
+ audio_opt.append(
417
+ self.vc(
418
+ model,
419
+ net_g,
420
+ sid,
421
+ audio_pad[t:],
422
+ None,
423
+ None,
424
+ times,
425
+ index,
426
+ big_npy,
427
+ index_rate,
428
+ version,
429
+ protect,
430
+ )[self.t_pad_tgt : -self.t_pad_tgt]
431
+ )
432
+ audio_opt = np.concatenate(audio_opt)
433
+
434
+
435
+ if rms_mix_rate != 1:
436
+ audio_opt = change_rms(audio, 16000, audio_opt, tgt_sr, rms_mix_rate)
437
+ if resample_sr >= 16000 and tgt_sr != resample_sr:
438
+ audio_opt = librosa.resample(
439
+ audio_opt, orig_sr=tgt_sr, target_sr=resample_sr
440
+ )
441
+ audio_max = np.abs(audio_opt).max() / 0.99
442
+ max_int16 = 32768
443
+ if audio_max > 1:
444
+ max_int16 /= audio_max
445
+ audio_opt = (audio_opt * max_int16).astype(np.int16)
446
+ del pitch, pitchf, sid
447
+ if torch.cuda.is_available():
448
+ torch.cuda.empty_cache()
449
+ return audio_opt