Aitron Emper committed
Commit 1a7d583
Parent(s): dc8f793
Upload 74 files
Browse files
- app.py +53 -15
- core.py +784 -304
- rvc/configs/config.py +6 -7
- rvc/infer/infer.py +126 -63
- rvc/infer/pipeline.py +625 -0
- rvc/lib/FCPEF0Predictor.py +1036 -0
- rvc/lib/infer_pack/models.py +18 -18
- rvc/lib/tools/analyzer.py +76 -0
- rvc/lib/tools/gdown.py +15 -8
- rvc/lib/tools/launch_tensorboard.py +2 -1
- rvc/lib/tools/model_download.py +167 -72
- rvc/lib/tools/prerequisites_download.py +47 -36
- rvc/lib/tools/pretrained_selector.py +1 -1
- rvc/lib/tools/split_audio.py +7 -5
- rvc/lib/utils.py +7 -5
- rvc/train/extract/extract_feature_print.py +3 -0
- rvc/train/process/extract_index.py +5 -2
- rvc/train/process/extract_model.py +17 -7
- rvc/train/process/extract_small_model.py +175 -0
- rvc/train/process/model_blender.py +63 -0
- rvc/train/process/model_information.py +33 -0
- rvc/train/train.py +68 -38
- rvc/train/utils.py +18 -43
- tabs/download/download.py +37 -1
- tabs/extra/analyzer/analyzer.py +15 -68
- tabs/extra/extra.py +2 -2
- tabs/extra/model_information.py +4 -2
- tabs/extra/processing/processing.py +10 -114
- tabs/inference/inference.py +234 -62
- tabs/plugins/plugins_core.py +37 -17
- tabs/report/report.py +1 -1
- tabs/settings/fake_gpu.py +55 -0
- tabs/settings/flask_server.py +43 -0
- tabs/settings/lang.py +57 -0
- tabs/settings/presence.py +23 -10
- tabs/settings/restart.py +39 -0
- tabs/settings/themes.py +4 -1
- tabs/settings/version.py +24 -0
- tabs/train/train.py +392 -17
- tabs/tts/tts.py +147 -138
- tabs/voice_blender/voice_blender.py +99 -0
app.py
CHANGED
@@ -13,24 +13,35 @@ from tabs.extra.extra import extra_tab
 from tabs.report.report import report_tab
 from tabs.download.download import download_tab
 from tabs.tts.tts import tts_tab
-from tabs.
+from tabs.voice_blender.voice_blender import voice_blender_tab
+from tabs.settings.presence import presence_tab, load_config_presence
+from tabs.settings.flask_server import flask_server_tab
+from tabs.settings.fake_gpu import fake_gpu_tab, gpu_available, load_fake_gpu
 from tabs.settings.themes import theme_tab
 from tabs.plugins.plugins import plugins_tab
+from tabs.settings.version import version_tab
+from tabs.settings.lang import lang_tab
+from tabs.settings.restart import restart_tab
 
 # Assets
 import assets.themes.loadThemes as loadThemes
 from assets.i18n.i18n import I18nAuto
 import assets.installation_checker as installation_checker
 from assets.discord_presence import RPCManager
-
+from assets.flask.server import start_flask, load_config_flask
+from core import run_prerequisites_script
+
+run_prerequisites_script("False", "True", "True", "True")
 
-delete_models.start_infinite_loop()
 i18n = I18nAuto()
-
+if load_config_presence() == True:
+    RPCManager.start_presence()
 installation_checker.check_installation()
 logging.getLogger("uvicorn").disabled = True
 logging.getLogger("fairseq").disabled = True
-
+if load_config_flask() == True:
+    print("Starting Flask server")
+    start_flask()
 
 my_applio = loadThemes.load_json()
 if my_applio:
@@ -53,17 +64,24 @@ with gr.Blocks(theme=my_applio, title="Applio") as Applio:
     with gr.Tab(i18n("Inference")):
         inference_tab()
 
-
-
+    with gr.Tab(i18n("Train")):
+        if gpu_available() or load_fake_gpu():
+            train_tab()
+        else:
+            gr.Markdown(
+                i18n(
+                    "Training is currently unsupported due to the absence of a GPU. To activate the training tab, navigate to the settings tab and enable the 'Fake GPU' option."
+                )
+            )
 
     with gr.Tab(i18n("TTS")):
         tts_tab()
 
-    with gr.Tab(i18n("
-
+    with gr.Tab(i18n("Voice Blender")):
+        voice_blender_tab()
 
-
-
+    with gr.Tab(i18n("Plugins")):
+        plugins_tab()
 
     with gr.Tab(i18n("Download")):
         download_tab()
@@ -71,10 +89,30 @@ with gr.Blocks(theme=my_applio, title="Applio") as Applio:
     with gr.Tab(i18n("Report a Bug")):
         report_tab()
 
-
-
-
+    with gr.Tab(i18n("Extra")):
+        extra_tab()
+
+    with gr.Tab(i18n("Settings")):
+        presence_tab()
+        flask_server_tab()
+        if not gpu_available():
+            fake_gpu_tab()
+        theme_tab()
+        version_tab()
+        lang_tab()
+        restart_tab()
 
 
 if __name__ == "__main__":
-
+    port = 6969
+    if "--port" in sys.argv:
+        port_index = sys.argv.index("--port") + 1
+        if port_index < len(sys.argv):
+            port = int(sys.argv[port_index])
+
+    Applio.launch(
+        favicon_path="assets/ICON.ico",
+        share="--share" in sys.argv,
+        inbrowser="--open" in sys.argv,
+        server_port=port,
+    )
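The launcher block added to app.py above reads optional command-line flags (--port, --share, --open) before calling Applio.launch. A hypothetical invocation, assuming only the flags and the default port 6969 shown in this diff:

python app.py --port 6969 --open
python app.py --share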
core.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1 |
import os
|
2 |
import sys
|
|
|
3 |
import argparse
|
4 |
import subprocess
|
5 |
|
@@ -7,26 +8,32 @@ now_dir = os.getcwd()
|
|
7 |
sys.path.append(now_dir)
|
8 |
|
9 |
from rvc.configs.config import Config
|
10 |
-
from rvc.lib.tools.validators import (
|
11 |
-
validate_sampling_rate,
|
12 |
-
validate_f0up_key,
|
13 |
-
validate_f0method,
|
14 |
-
validate_true_false,
|
15 |
-
validate_tts_voices,
|
16 |
-
)
|
17 |
|
|
|
18 |
from rvc.train.extract.preparing_files import generate_config, generate_filelist
|
19 |
from rvc.lib.tools.pretrained_selector import pretrained_selector
|
20 |
|
21 |
-
from rvc.
|
22 |
-
from rvc.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
|
24 |
config = Config()
|
25 |
current_script_directory = os.path.dirname(os.path.realpath(__file__))
|
26 |
logs_path = os.path.join(current_script_directory, "logs")
|
27 |
-
|
28 |
-
|
29 |
-
)
|
|
|
|
|
|
|
30 |
|
31 |
|
32 |
# Infer
|
@@ -34,31 +41,41 @@ def run_infer_script(
|
|
34 |
f0up_key,
|
35 |
filter_radius,
|
36 |
index_rate,
|
|
|
|
|
37 |
hop_length,
|
38 |
f0method,
|
39 |
input_path,
|
40 |
output_path,
|
41 |
-
|
42 |
index_path,
|
43 |
split_audio,
|
|
|
|
|
|
|
|
|
44 |
):
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
str(hop_length),
|
53 |
f0method,
|
54 |
input_path,
|
55 |
output_path,
|
56 |
-
|
57 |
index_path,
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
|
|
|
|
|
|
|
|
|
|
62 |
|
63 |
|
64 |
# Batch infer
|
@@ -66,16 +83,20 @@ def run_batch_infer_script(
|
|
66 |
f0up_key,
|
67 |
filter_radius,
|
68 |
index_rate,
|
|
|
|
|
69 |
hop_length,
|
70 |
f0method,
|
71 |
input_folder,
|
72 |
output_folder,
|
73 |
-
|
74 |
index_path,
|
75 |
split_audio,
|
|
|
|
|
|
|
|
|
76 |
):
|
77 |
-
infer_script_path = os.path.join("rvc", "infer", "infer.py")
|
78 |
-
|
79 |
audio_files = [
|
80 |
f for f in os.listdir(input_folder) if f.endswith((".mp3", ".wav", ".flac"))
|
81 |
]
|
@@ -93,21 +114,24 @@ def run_batch_infer_script(
|
|
93 |
)
|
94 |
print(f"Inferring {input_path}...")
|
95 |
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
|
|
|
|
|
|
111 |
|
112 |
return f"Files from {input_folder} inferred successfully."
|
113 |
|
@@ -119,15 +143,21 @@ def run_tts_script(
|
|
119 |
f0up_key,
|
120 |
filter_radius,
|
121 |
index_rate,
|
|
|
|
|
122 |
hop_length,
|
123 |
f0method,
|
124 |
output_tts_path,
|
125 |
output_rvc_path,
|
126 |
-
|
127 |
index_path,
|
|
|
|
|
|
|
|
|
|
|
128 |
):
|
129 |
tts_script_path = os.path.join("rvc", "lib", "tools", "tts.py")
|
130 |
-
infer_script_path = os.path.join("rvc", "infer", "infer.py")
|
131 |
|
132 |
if os.path.exists(output_tts_path):
|
133 |
os.remove(output_tts_path)
|
@@ -139,23 +169,30 @@ def run_tts_script(
|
|
139 |
tts_voice,
|
140 |
output_tts_path,
|
141 |
]
|
|
|
142 |
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
f0method,
|
151 |
output_tts_path,
|
152 |
output_rvc_path,
|
153 |
-
|
154 |
index_path,
|
155 |
-
|
156 |
-
|
157 |
-
|
158 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
159 |
|
160 |
|
161 |
# Preprocess
|
@@ -165,20 +202,25 @@ def run_preprocess_script(model_name, dataset_path, sampling_rate):
|
|
165 |
command = [
|
166 |
"python",
|
167 |
preprocess_script_path,
|
168 |
-
|
169 |
-
|
170 |
-
|
171 |
-
|
|
|
|
|
|
|
|
|
|
|
172 |
]
|
173 |
|
174 |
-
os.makedirs(os.path.join(logs_path,
|
175 |
subprocess.run(command)
|
176 |
return f"Model {model_name} preprocessed successfully."
|
177 |
|
178 |
|
179 |
# Extract
|
180 |
def run_extract_script(model_name, rvc_version, f0method, hop_length, sampling_rate):
|
181 |
-
model_path = os.path.join(logs_path,
|
182 |
extract_f0_script_path = os.path.join(
|
183 |
"rvc", "train", "extract", "extract_f0_print.py"
|
184 |
)
|
@@ -189,20 +231,30 @@ def run_extract_script(model_name, rvc_version, f0method, hop_length, sampling_r
|
|
189 |
command_1 = [
|
190 |
"python",
|
191 |
extract_f0_script_path,
|
192 |
-
|
193 |
-
|
194 |
-
|
|
|
|
|
|
|
|
|
|
|
195 |
]
|
196 |
command_2 = [
|
197 |
"python",
|
198 |
extract_feature_script_path,
|
199 |
-
|
200 |
-
|
201 |
-
|
202 |
-
|
203 |
-
|
204 |
-
|
205 |
-
|
|
|
|
|
|
|
|
|
|
|
206 |
]
|
207 |
subprocess.run(command_1)
|
208 |
subprocess.run(command_2)
|
@@ -224,6 +276,8 @@ def run_train_script(
|
|
224 |
batch_size,
|
225 |
gpu,
|
226 |
pitch_guidance,
|
|
|
|
|
227 |
pretrained,
|
228 |
custom_pretrained,
|
229 |
g_pretrained_path=None,
|
@@ -232,6 +286,7 @@ def run_train_script(
|
|
232 |
f0 = 1 if str(pitch_guidance) == "True" else 0
|
233 |
latest = 1 if str(save_only_latest) == "True" else 0
|
234 |
save_every = 1 if str(save_every_weights) == "True" else 0
|
|
|
235 |
|
236 |
if str(pretrained) == "True":
|
237 |
if str(custom_pretrained) == "False":
|
@@ -248,33 +303,42 @@ def run_train_script(
|
|
248 |
train_script_path = os.path.join("rvc", "train", "train.py")
|
249 |
command = [
|
250 |
"python",
|
251 |
-
|
252 |
-
|
253 |
-
|
254 |
-
|
255 |
-
|
256 |
-
|
257 |
-
|
258 |
-
|
259 |
-
|
260 |
-
|
261 |
-
|
262 |
-
|
263 |
-
|
264 |
-
|
265 |
-
|
266 |
-
|
267 |
-
|
268 |
-
|
269 |
-
|
270 |
-
|
271 |
-
|
272 |
-
|
273 |
-
|
274 |
-
|
275 |
-
|
276 |
-
|
277 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
278 |
]
|
279 |
|
280 |
subprocess.run(command)
|
@@ -284,11 +348,11 @@ def run_train_script(
|
|
284 |
|
285 |
# Index
|
286 |
def run_index_script(model_name, rvc_version):
|
287 |
-
index_script_path = os.path.join("rvc", "train", "
|
288 |
command = [
|
289 |
"python",
|
290 |
index_script_path,
|
291 |
-
os.path.join(logs_path,
|
292 |
rvc_version,
|
293 |
]
|
294 |
|
@@ -296,38 +360,66 @@ def run_index_script(model_name, rvc_version):
|
|
296 |
return f"Index file for {model_name} generated successfully."
|
297 |
|
298 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
299 |
# Model information
|
300 |
def run_model_information_script(pth_path):
|
301 |
print(model_information(pth_path))
|
302 |
|
303 |
|
304 |
-
# Model
|
305 |
-
def
|
306 |
-
|
|
|
307 |
|
308 |
|
309 |
# Tensorboard
|
310 |
def run_tensorboard_script():
|
311 |
-
|
312 |
-
"rvc", "lib", "tools", "launch_tensorboard.py"
|
313 |
-
)
|
314 |
-
command = [
|
315 |
-
"python",
|
316 |
-
tensorboard_script_path,
|
317 |
-
]
|
318 |
-
subprocess.run(command)
|
319 |
|
320 |
|
321 |
# Download
|
322 |
def run_download_script(model_link):
|
323 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
324 |
command = [
|
325 |
-
"
|
326 |
-
|
327 |
-
|
|
|
|
|
|
|
328 |
]
|
329 |
subprocess.run(command)
|
330 |
-
return f"Model downloaded successfully."
|
331 |
|
332 |
|
333 |
# Parse arguments
|
@@ -342,48 +434,108 @@ def parse_arguments():
|
|
342 |
# Parser for 'infer' mode
|
343 |
infer_parser = subparsers.add_parser("infer", help="Run inference")
|
344 |
infer_parser.add_argument(
|
345 |
-
"f0up_key",
|
346 |
-
type=
|
347 |
-
help="Value for f0up_key
|
|
|
|
|
348 |
)
|
349 |
infer_parser.add_argument(
|
350 |
-
"filter_radius",
|
351 |
type=str,
|
352 |
-
help="Value for filter_radius
|
|
|
|
|
353 |
)
|
354 |
infer_parser.add_argument(
|
355 |
-
"index_rate",
|
356 |
type=str,
|
357 |
-
help="Value for index_rate
|
|
|
|
|
358 |
)
|
359 |
infer_parser.add_argument(
|
360 |
-
"
|
361 |
type=str,
|
362 |
-
help="Value for
|
|
|
|
|
363 |
)
|
364 |
infer_parser.add_argument(
|
365 |
-
"
|
366 |
-
type=
|
367 |
-
help="Value for
|
|
|
|
|
368 |
)
|
369 |
infer_parser.add_argument(
|
370 |
-
"
|
|
|
|
|
|
|
|
|
371 |
)
|
372 |
infer_parser.add_argument(
|
373 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
374 |
)
|
375 |
infer_parser.add_argument(
|
376 |
-
"
|
|
|
|
|
|
|
|
|
377 |
)
|
378 |
infer_parser.add_argument(
|
379 |
-
"
|
380 |
type=str,
|
381 |
-
help="
|
|
|
|
|
382 |
)
|
383 |
infer_parser.add_argument(
|
384 |
-
"
|
385 |
type=str,
|
386 |
-
help="Enable
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
387 |
)
|
388 |
|
389 |
# Parser for 'batch_infer' mode
|
@@ -391,229 +543,454 @@ def parse_arguments():
|
|
391 |
"batch_infer", help="Run batch inference"
|
392 |
)
|
393 |
batch_infer_parser.add_argument(
|
394 |
-
"f0up_key",
|
395 |
-
type=
|
396 |
-
help="Value for f0up_key
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
397 |
)
|
398 |
batch_infer_parser.add_argument(
|
399 |
-
"
|
400 |
type=str,
|
401 |
-
help="Value for
|
|
|
|
|
402 |
)
|
403 |
batch_infer_parser.add_argument(
|
404 |
-
"
|
405 |
type=str,
|
406 |
-
help="Value for
|
|
|
|
|
407 |
)
|
408 |
batch_infer_parser.add_argument(
|
409 |
-
"
|
410 |
type=str,
|
411 |
-
help="Value for
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
412 |
)
|
413 |
batch_infer_parser.add_argument(
|
414 |
-
"
|
415 |
-
type=
|
416 |
-
help="
|
417 |
)
|
418 |
batch_infer_parser.add_argument(
|
419 |
-
"
|
|
|
|
|
|
|
|
|
420 |
)
|
421 |
batch_infer_parser.add_argument(
|
422 |
-
"
|
|
|
|
|
|
|
|
|
423 |
)
|
424 |
batch_infer_parser.add_argument(
|
425 |
-
"
|
|
|
|
|
|
|
|
|
426 |
)
|
427 |
batch_infer_parser.add_argument(
|
428 |
-
"
|
429 |
type=str,
|
430 |
-
help="
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
431 |
)
|
432 |
|
433 |
# Parser for 'tts' mode
|
434 |
tts_parser = subparsers.add_parser("tts", help="Run TTS")
|
435 |
tts_parser.add_argument(
|
436 |
-
"tts_text",
|
437 |
type=str,
|
438 |
-
help="Text to be synthesized
|
439 |
)
|
440 |
tts_parser.add_argument(
|
441 |
-
"tts_voice",
|
442 |
-
type=
|
443 |
-
help="Voice to be used
|
|
|
444 |
)
|
445 |
tts_parser.add_argument(
|
446 |
-
"f0up_key",
|
447 |
-
type=
|
448 |
-
help="Value for f0up_key
|
|
|
|
|
449 |
)
|
450 |
tts_parser.add_argument(
|
451 |
-
"filter_radius",
|
452 |
type=str,
|
453 |
-
help="Value for filter_radius
|
|
|
|
|
454 |
)
|
455 |
tts_parser.add_argument(
|
456 |
-
"index_rate",
|
457 |
type=str,
|
458 |
-
help="Value for index_rate
|
|
|
|
|
459 |
)
|
460 |
tts_parser.add_argument(
|
461 |
-
"
|
462 |
type=str,
|
463 |
-
help="Value for
|
|
|
|
|
464 |
)
|
465 |
tts_parser.add_argument(
|
466 |
-
"
|
467 |
-
type=
|
468 |
-
help="Value for
|
|
|
|
|
469 |
)
|
470 |
tts_parser.add_argument(
|
471 |
-
"
|
|
|
|
|
|
|
|
|
472 |
)
|
473 |
tts_parser.add_argument(
|
474 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
475 |
)
|
476 |
tts_parser.add_argument(
|
477 |
-
"
|
|
|
|
|
|
|
|
|
478 |
)
|
479 |
tts_parser.add_argument(
|
480 |
-
"
|
481 |
type=str,
|
482 |
-
help="
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
483 |
)
|
484 |
|
485 |
# Parser for 'preprocess' mode
|
486 |
preprocess_parser = subparsers.add_parser("preprocess", help="Run preprocessing")
|
|
|
487 |
preprocess_parser.add_argument(
|
488 |
-
"
|
489 |
-
)
|
490 |
-
preprocess_parser.add_argument(
|
491 |
-
"dataset_path",
|
492 |
type=str,
|
493 |
-
help="Path to the dataset
|
494 |
)
|
495 |
preprocess_parser.add_argument(
|
496 |
-
"sampling_rate",
|
497 |
-
type=
|
498 |
-
help="Sampling rate
|
|
|
499 |
)
|
500 |
|
501 |
# Parser for 'extract' mode
|
502 |
extract_parser = subparsers.add_parser("extract", help="Run extract")
|
503 |
extract_parser.add_argument(
|
504 |
-
"model_name",
|
505 |
type=str,
|
506 |
-
help="Name of the model
|
507 |
)
|
508 |
extract_parser.add_argument(
|
509 |
-
"rvc_version",
|
510 |
type=str,
|
511 |
-
help="Version of the model
|
|
|
|
|
512 |
)
|
513 |
extract_parser.add_argument(
|
514 |
-
"f0method",
|
515 |
-
type=
|
516 |
-
help="Value for f0method
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
517 |
)
|
518 |
extract_parser.add_argument(
|
519 |
-
"hop_length",
|
520 |
type=str,
|
521 |
-
help="Value for hop_length
|
|
|
|
|
522 |
)
|
523 |
extract_parser.add_argument(
|
524 |
-
"sampling_rate",
|
525 |
-
type=
|
526 |
-
help="Sampling rate
|
|
|
527 |
)
|
528 |
|
529 |
# Parser for 'train' mode
|
530 |
train_parser = subparsers.add_parser("train", help="Run training")
|
531 |
train_parser.add_argument(
|
532 |
-
"model_name",
|
533 |
type=str,
|
534 |
-
help="Name of the model
|
535 |
)
|
536 |
train_parser.add_argument(
|
537 |
-
"rvc_version",
|
538 |
type=str,
|
539 |
-
help="Version of the model
|
|
|
|
|
540 |
)
|
541 |
train_parser.add_argument(
|
542 |
-
"save_every_epoch",
|
543 |
type=str,
|
544 |
help="Save every epoch",
|
|
|
545 |
)
|
546 |
train_parser.add_argument(
|
547 |
-
"save_only_latest",
|
548 |
type=str,
|
549 |
help="Save weight only at last epoch",
|
|
|
|
|
550 |
)
|
551 |
train_parser.add_argument(
|
552 |
-
"save_every_weights",
|
553 |
type=str,
|
554 |
help="Save weight every epoch",
|
|
|
|
|
555 |
)
|
556 |
train_parser.add_argument(
|
557 |
-
"total_epoch",
|
558 |
type=str,
|
559 |
help="Total epoch",
|
|
|
|
|
560 |
)
|
561 |
train_parser.add_argument(
|
562 |
-
"sampling_rate",
|
563 |
-
type=
|
564 |
-
help="Sampling rate
|
|
|
565 |
)
|
566 |
train_parser.add_argument(
|
567 |
-
"batch_size",
|
568 |
type=str,
|
569 |
help="Batch size",
|
|
|
|
|
570 |
)
|
571 |
train_parser.add_argument(
|
572 |
-
"gpu",
|
573 |
type=str,
|
574 |
-
help="GPU number
|
|
|
|
|
575 |
)
|
576 |
train_parser.add_argument(
|
577 |
-
"pitch_guidance",
|
578 |
-
type=
|
579 |
-
help="Pitch guidance
|
|
|
|
|
580 |
)
|
581 |
train_parser.add_argument(
|
582 |
-
"pretrained",
|
583 |
-
type=
|
584 |
-
help="Pretrained
|
|
|
|
|
585 |
)
|
586 |
train_parser.add_argument(
|
587 |
-
"custom_pretrained",
|
588 |
-
type=
|
589 |
-
help="Custom pretrained
|
|
|
|
|
590 |
)
|
591 |
train_parser.add_argument(
|
592 |
-
"g_pretrained_path",
|
593 |
type=str,
|
594 |
nargs="?",
|
595 |
default=None,
|
596 |
-
help="Path to the pretrained G file
|
597 |
)
|
598 |
train_parser.add_argument(
|
599 |
-
"d_pretrained_path",
|
600 |
type=str,
|
601 |
nargs="?",
|
602 |
default=None,
|
603 |
-
help="Path to the pretrained D file
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
604 |
)
|
605 |
|
606 |
# Parser for 'index' mode
|
607 |
index_parser = subparsers.add_parser("index", help="Generate index file")
|
608 |
index_parser.add_argument(
|
609 |
-
"model_name",
|
610 |
type=str,
|
611 |
-
help="Name of the model
|
612 |
)
|
613 |
index_parser.add_argument(
|
614 |
-
"rvc_version",
|
615 |
type=str,
|
616 |
-
help="Version of the model
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
617 |
)
|
618 |
|
619 |
# Parser for 'model_information' mode
|
@@ -621,27 +998,36 @@ def parse_arguments():
|
|
621 |
"model_information", help="Print model information"
|
622 |
)
|
623 |
model_information_parser.add_argument(
|
624 |
-
"pth_path",
|
625 |
type=str,
|
626 |
-
help="Path to the .pth file
|
627 |
)
|
628 |
|
629 |
-
# Parser for '
|
630 |
-
|
631 |
-
|
632 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
633 |
type=str,
|
634 |
-
help="
|
635 |
)
|
636 |
-
|
637 |
-
"
|
638 |
type=str,
|
639 |
-
help="Path to the
|
640 |
)
|
641 |
-
|
642 |
-
"
|
643 |
type=str,
|
644 |
-
help="
|
|
|
|
|
645 |
)
|
646 |
|
647 |
# Parser for 'tensorboard' mode
|
@@ -650,11 +1036,57 @@ def parse_arguments():
|
|
650 |
# Parser for 'download' mode
|
651 |
download_parser = subparsers.add_parser("download", help="Download models")
|
652 |
download_parser.add_argument(
|
653 |
-
"model_link",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
654 |
type=str,
|
655 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
656 |
)
|
657 |
|
|
|
|
|
|
|
|
|
|
|
658 |
return parser.parse_args()
|
659 |
|
660 |
|
@@ -668,95 +1100,143 @@ def main():
|
|
668 |
try:
|
669 |
if args.mode == "infer":
|
670 |
run_infer_script(
|
671 |
-
args.f0up_key,
|
672 |
-
args.filter_radius,
|
673 |
-
args.index_rate,
|
674 |
-
args.
|
675 |
-
args.
|
676 |
-
args.
|
677 |
-
args.
|
678 |
-
args.
|
679 |
-
args.
|
680 |
-
args.
|
|
|
|
|
|
|
|
|
|
|
|
|
681 |
)
|
682 |
elif args.mode == "batch_infer":
|
683 |
run_batch_infer_script(
|
684 |
-
args.f0up_key,
|
685 |
-
args.filter_radius,
|
686 |
-
args.index_rate,
|
687 |
-
args.
|
688 |
-
args.
|
689 |
-
args.
|
690 |
-
args.
|
691 |
-
args.
|
692 |
-
args.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
693 |
)
|
694 |
elif args.mode == "tts":
|
695 |
run_tts_script(
|
696 |
-
args.tts_text,
|
697 |
-
args.tts_voice,
|
698 |
-
args.f0up_key,
|
699 |
-
args.filter_radius,
|
700 |
-
args.index_rate,
|
701 |
-
args.
|
702 |
-
args.
|
703 |
-
args.
|
704 |
-
args.
|
705 |
-
args.
|
706 |
-
args.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
707 |
)
|
708 |
elif args.mode == "preprocess":
|
709 |
run_preprocess_script(
|
710 |
-
args.model_name,
|
711 |
-
args.dataset_path,
|
712 |
str(args.sampling_rate),
|
713 |
)
|
714 |
-
|
715 |
elif args.mode == "extract":
|
716 |
run_extract_script(
|
717 |
-
args.model_name,
|
718 |
-
args.rvc_version,
|
719 |
-
args.f0method,
|
720 |
-
args.hop_length,
|
721 |
-
args.sampling_rate,
|
722 |
)
|
723 |
elif args.mode == "train":
|
724 |
run_train_script(
|
725 |
-
args.model_name,
|
726 |
-
args.rvc_version,
|
727 |
-
args.save_every_epoch,
|
728 |
-
args.save_only_latest,
|
729 |
-
args.save_every_weights,
|
730 |
-
args.total_epoch,
|
731 |
-
args.sampling_rate,
|
732 |
-
args.batch_size,
|
733 |
-
args.gpu,
|
734 |
-
args.pitch_guidance,
|
735 |
-
args.pretrained,
|
736 |
-
args.custom_pretrained,
|
737 |
-
args.g_pretrained_path,
|
738 |
-
args.d_pretrained_path,
|
|
|
|
|
739 |
)
|
740 |
elif args.mode == "index":
|
741 |
run_index_script(
|
742 |
-
args.model_name,
|
743 |
-
args.rvc_version,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
744 |
)
|
745 |
elif args.mode == "model_information":
|
746 |
run_model_information_script(
|
747 |
-
args.pth_path,
|
748 |
)
|
749 |
-
elif args.mode == "
|
750 |
-
|
751 |
-
args.model_name,
|
752 |
-
args.pth_path_1,
|
753 |
-
args.pth_path_2,
|
|
|
754 |
)
|
755 |
elif args.mode == "tensorboard":
|
756 |
run_tensorboard_script()
|
757 |
elif args.mode == "download":
|
758 |
run_download_script(
|
759 |
-
args.model_link,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
760 |
)
|
761 |
except Exception as error:
|
762 |
print(f"Error: {error}")
|
|
|
1 |
import os
|
2 |
import sys
|
3 |
+
import json
|
4 |
import argparse
|
5 |
import subprocess
|
6 |
|
|
|
8 |
sys.path.append(now_dir)
|
9 |
|
10 |
from rvc.configs.config import Config
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
11 |
|
12 |
+
from rvc.lib.tools.prerequisites_download import prequisites_download_pipeline
|
13 |
from rvc.train.extract.preparing_files import generate_config, generate_filelist
|
14 |
from rvc.lib.tools.pretrained_selector import pretrained_selector
|
15 |
|
16 |
+
from rvc.train.process.model_blender import model_blender
|
17 |
+
from rvc.train.process.model_information import model_information
|
18 |
+
from rvc.train.process.extract_small_model import extract_small_model
|
19 |
+
|
20 |
+
from rvc.infer.infer import infer_pipeline
|
21 |
+
|
22 |
+
from rvc.lib.tools.analyzer import analyze_audio
|
23 |
+
|
24 |
+
from rvc.lib.tools.launch_tensorboard import launch_tensorboard_pipeline
|
25 |
+
|
26 |
+
from rvc.lib.tools.model_download import model_download_pipeline
|
27 |
|
28 |
config = Config()
|
29 |
current_script_directory = os.path.dirname(os.path.realpath(__file__))
|
30 |
logs_path = os.path.join(current_script_directory, "logs")
|
31 |
+
|
32 |
+
# Get TTS Voices
|
33 |
+
with open(os.path.join("rvc", "lib", "tools", "tts_voices.json"), "r") as f:
|
34 |
+
voices_data = json.load(f)
|
35 |
+
|
36 |
+
locales = list({voice["Locale"] for voice in voices_data})
|
37 |
|
38 |
|
39 |
# Infer
|
|
|
41 |
f0up_key,
|
42 |
filter_radius,
|
43 |
index_rate,
|
44 |
+
rms_mix_rate,
|
45 |
+
protect,
|
46 |
hop_length,
|
47 |
f0method,
|
48 |
input_path,
|
49 |
output_path,
|
50 |
+
pth_path,
|
51 |
index_path,
|
52 |
split_audio,
|
53 |
+
f0autotune,
|
54 |
+
clean_audio,
|
55 |
+
clean_strength,
|
56 |
+
export_format,
|
57 |
):
|
58 |
+
infer_pipeline(
|
59 |
+
f0up_key,
|
60 |
+
filter_radius,
|
61 |
+
index_rate,
|
62 |
+
rms_mix_rate,
|
63 |
+
protect,
|
64 |
+
hop_length,
|
|
|
65 |
f0method,
|
66 |
input_path,
|
67 |
output_path,
|
68 |
+
pth_path,
|
69 |
index_path,
|
70 |
+
split_audio,
|
71 |
+
f0autotune,
|
72 |
+
clean_audio,
|
73 |
+
clean_strength,
|
74 |
+
export_format,
|
75 |
+
)
|
76 |
+
return f"File {input_path} inferred successfully.", output_path.replace(
|
77 |
+
".wav", f".{export_format.lower()}"
|
78 |
+
)
|
79 |
|
80 |
|
81 |
# Batch infer
|
|
|
83 |
f0up_key,
|
84 |
filter_radius,
|
85 |
index_rate,
|
86 |
+
rms_mix_rate,
|
87 |
+
protect,
|
88 |
hop_length,
|
89 |
f0method,
|
90 |
input_folder,
|
91 |
output_folder,
|
92 |
+
pth_path,
|
93 |
index_path,
|
94 |
split_audio,
|
95 |
+
f0autotune,
|
96 |
+
clean_audio,
|
97 |
+
clean_strength,
|
98 |
+
export_format,
|
99 |
):
|
|
|
|
|
100 |
audio_files = [
|
101 |
f for f in os.listdir(input_folder) if f.endswith((".mp3", ".wav", ".flac"))
|
102 |
]
|
|
|
114 |
)
|
115 |
print(f"Inferring {input_path}...")
|
116 |
|
117 |
+
infer_pipeline(
|
118 |
+
f0up_key,
|
119 |
+
filter_radius,
|
120 |
+
index_rate,
|
121 |
+
rms_mix_rate,
|
122 |
+
protect,
|
123 |
+
hop_length,
|
124 |
+
f0method,
|
125 |
+
input_path,
|
126 |
+
output_path,
|
127 |
+
pth_path,
|
128 |
+
index_path,
|
129 |
+
split_audio,
|
130 |
+
f0autotune,
|
131 |
+
clean_audio,
|
132 |
+
clean_strength,
|
133 |
+
export_format,
|
134 |
+
)
|
135 |
|
136 |
return f"Files from {input_folder} inferred successfully."
|
137 |
|
|
|
143 |
f0up_key,
|
144 |
filter_radius,
|
145 |
index_rate,
|
146 |
+
rms_mix_rate,
|
147 |
+
protect,
|
148 |
hop_length,
|
149 |
f0method,
|
150 |
output_tts_path,
|
151 |
output_rvc_path,
|
152 |
+
pth_path,
|
153 |
index_path,
|
154 |
+
split_audio,
|
155 |
+
f0autotune,
|
156 |
+
clean_audio,
|
157 |
+
clean_strength,
|
158 |
+
export_format,
|
159 |
):
|
160 |
tts_script_path = os.path.join("rvc", "lib", "tools", "tts.py")
|
|
|
161 |
|
162 |
if os.path.exists(output_tts_path):
|
163 |
os.remove(output_tts_path)
|
|
|
169 |
tts_voice,
|
170 |
output_tts_path,
|
171 |
]
|
172 |
+
subprocess.run(command_tts)
|
173 |
|
174 |
+
infer_pipeline(
|
175 |
+
f0up_key,
|
176 |
+
filter_radius,
|
177 |
+
index_rate,
|
178 |
+
rms_mix_rate,
|
179 |
+
protect,
|
180 |
+
hop_length,
|
181 |
f0method,
|
182 |
output_tts_path,
|
183 |
output_rvc_path,
|
184 |
+
pth_path,
|
185 |
index_path,
|
186 |
+
split_audio,
|
187 |
+
f0autotune,
|
188 |
+
clean_audio,
|
189 |
+
clean_strength,
|
190 |
+
export_format,
|
191 |
+
)
|
192 |
+
|
193 |
+
return f"Text {tts_text} synthesized successfully.", output_rvc_path.replace(
|
194 |
+
".wav", f".{export_format.lower()}"
|
195 |
+
)
|
196 |
|
197 |
|
198 |
# Preprocess
|
|
|
202 |
command = [
|
203 |
"python",
|
204 |
preprocess_script_path,
|
205 |
+
*map(
|
206 |
+
str,
|
207 |
+
[
|
208 |
+
os.path.join(logs_path, model_name),
|
209 |
+
dataset_path,
|
210 |
+
sampling_rate,
|
211 |
+
per,
|
212 |
+
],
|
213 |
+
),
|
214 |
]
|
215 |
|
216 |
+
os.makedirs(os.path.join(logs_path, model_name), exist_ok=True)
|
217 |
subprocess.run(command)
|
218 |
return f"Model {model_name} preprocessed successfully."
|
219 |
|
220 |
|
221 |
# Extract
|
222 |
def run_extract_script(model_name, rvc_version, f0method, hop_length, sampling_rate):
|
223 |
+
model_path = os.path.join(logs_path, model_name)
|
224 |
extract_f0_script_path = os.path.join(
|
225 |
"rvc", "train", "extract", "extract_f0_print.py"
|
226 |
)
|
|
|
231 |
command_1 = [
|
232 |
"python",
|
233 |
extract_f0_script_path,
|
234 |
+
*map(
|
235 |
+
str,
|
236 |
+
[
|
237 |
+
model_path,
|
238 |
+
f0method,
|
239 |
+
hop_length,
|
240 |
+
],
|
241 |
+
),
|
242 |
]
|
243 |
command_2 = [
|
244 |
"python",
|
245 |
extract_feature_script_path,
|
246 |
+
*map(
|
247 |
+
str,
|
248 |
+
[
|
249 |
+
config.device,
|
250 |
+
"1",
|
251 |
+
"0",
|
252 |
+
"0",
|
253 |
+
model_path,
|
254 |
+
rvc_version,
|
255 |
+
"True",
|
256 |
+
],
|
257 |
+
),
|
258 |
]
|
259 |
subprocess.run(command_1)
|
260 |
subprocess.run(command_2)
|
|
|
276 |
batch_size,
|
277 |
gpu,
|
278 |
pitch_guidance,
|
279 |
+
overtraining_detector,
|
280 |
+
overtraining_threshold,
|
281 |
pretrained,
|
282 |
custom_pretrained,
|
283 |
g_pretrained_path=None,
|
|
|
286 |
f0 = 1 if str(pitch_guidance) == "True" else 0
|
287 |
latest = 1 if str(save_only_latest) == "True" else 0
|
288 |
save_every = 1 if str(save_every_weights) == "True" else 0
|
289 |
+
detector = 1 if str(overtraining_detector) == "True" else 0
|
290 |
|
291 |
if str(pretrained) == "True":
|
292 |
if str(custom_pretrained) == "False":
|
|
|
303 |
train_script_path = os.path.join("rvc", "train", "train.py")
|
304 |
command = [
|
305 |
"python",
|
306 |
+
train_script_path,
|
307 |
+
*map(
|
308 |
+
str,
|
309 |
+
[
|
310 |
+
"-se",
|
311 |
+
save_every_epoch,
|
312 |
+
"-te",
|
313 |
+
total_epoch,
|
314 |
+
"-pg",
|
315 |
+
pg,
|
316 |
+
"-pd",
|
317 |
+
pd,
|
318 |
+
"-sr",
|
319 |
+
sampling_rate,
|
320 |
+
"-bs",
|
321 |
+
batch_size,
|
322 |
+
"-g",
|
323 |
+
gpu,
|
324 |
+
"-e",
|
325 |
+
os.path.join(logs_path, model_name),
|
326 |
+
"-v",
|
327 |
+
rvc_version,
|
328 |
+
"-l",
|
329 |
+
latest,
|
330 |
+
"-c",
|
331 |
+
"0",
|
332 |
+
"-sw",
|
333 |
+
save_every,
|
334 |
+
"-f0",
|
335 |
+
f0,
|
336 |
+
"-od",
|
337 |
+
detector,
|
338 |
+
"-ot",
|
339 |
+
overtraining_threshold,
|
340 |
+
],
|
341 |
+
),
|
342 |
]
|
343 |
|
344 |
subprocess.run(command)
|
|
|
348 |
|
349 |
# Index
|
350 |
def run_index_script(model_name, rvc_version):
|
351 |
+
index_script_path = os.path.join("rvc", "train", "process", "extract_index.py")
|
352 |
command = [
|
353 |
"python",
|
354 |
index_script_path,
|
355 |
+
os.path.join(logs_path, model_name),
|
356 |
rvc_version,
|
357 |
]
|
358 |
|
|
|
360 |
return f"Index file for {model_name} generated successfully."
|
361 |
|
362 |
|
363 |
+
# Model extract
|
364 |
+
def run_model_extract_script(
|
365 |
+
pth_path, model_name, sampling_rate, pitch_guidance, rvc_version, epoch, step
|
366 |
+
):
|
367 |
+
f0 = 1 if str(pitch_guidance) == "True" else 0
|
368 |
+
extract_small_model(
|
369 |
+
pth_path, model_name, sampling_rate, f0, rvc_version, epoch, step
|
370 |
+
)
|
371 |
+
return f"Model {model_name} extracted successfully."
|
372 |
+
|
373 |
+
|
374 |
# Model information
|
375 |
def run_model_information_script(pth_path):
|
376 |
print(model_information(pth_path))
|
377 |
|
378 |
|
379 |
+
# Model blender
|
380 |
+
def run_model_blender_script(model_name, pth_path_1, pth_path_2, ratio):
|
381 |
+
message, model_blended = model_blender(model_name, pth_path_1, pth_path_2, ratio)
|
382 |
+
return message, model_blended
|
383 |
|
384 |
|
385 |
# Tensorboard
|
386 |
def run_tensorboard_script():
|
387 |
+
launch_tensorboard_pipeline()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
388 |
|
389 |
|
390 |
# Download
|
391 |
def run_download_script(model_link):
|
392 |
+
model_download_pipeline(model_link)
|
393 |
+
return f"Model downloaded successfully."
|
394 |
+
|
395 |
+
|
396 |
+
# Prerequisites
|
397 |
+
def run_prerequisites_script(pretraineds_v1, pretraineds_v2, models, exe):
|
398 |
+
prequisites_download_pipeline(pretraineds_v1, pretraineds_v2, models, exe)
|
399 |
+
return "Prerequisites installed successfully."
|
400 |
+
|
401 |
+
|
402 |
+
# Audio analyzer
|
403 |
+
def run_audio_analyzer_script(input_path, save_plot_path="logs/audio_analysis.png"):
|
404 |
+
audio_info, plot_path = analyze_audio(input_path, save_plot_path)
|
405 |
+
print(
|
406 |
+
f"Audio info of {input_path}: {audio_info}",
|
407 |
+
f"Audio file {input_path} analyzed successfully. Plot saved at: {plot_path}",
|
408 |
+
)
|
409 |
+
return audio_info, plot_path
|
410 |
+
|
411 |
+
|
412 |
+
# API
|
413 |
+
def run_api_script(ip, port):
|
414 |
command = [
|
415 |
+
"env/Scripts/uvicorn.exe" if os.name == "nt" else "uvicorn",
|
416 |
+
"api:app",
|
417 |
+
"--host",
|
418 |
+
ip,
|
419 |
+
"--port",
|
420 |
+
port,
|
421 |
]
|
422 |
subprocess.run(command)
|
|
|
423 |
|
424 |
|
425 |
# Parse arguments
|
|
|
434 |
# Parser for 'infer' mode
|
435 |
infer_parser = subparsers.add_parser("infer", help="Run inference")
|
436 |
infer_parser.add_argument(
|
437 |
+
"--f0up_key",
|
438 |
+
type=str,
|
439 |
+
help="Value for f0up_key",
|
440 |
+
choices=[str(i) for i in range(-24, 25)],
|
441 |
+
default="0",
|
442 |
)
|
443 |
infer_parser.add_argument(
|
444 |
+
"--filter_radius",
|
445 |
type=str,
|
446 |
+
help="Value for filter_radius",
|
447 |
+
choices=[str(i) for i in range(11)],
|
448 |
+
default="3",
|
449 |
)
|
450 |
infer_parser.add_argument(
|
451 |
+
"--index_rate",
|
452 |
type=str,
|
453 |
+
help="Value for index_rate",
|
454 |
+
choices=[str(i / 10) for i in range(11)],
|
455 |
+
default="0.3",
|
456 |
)
|
457 |
infer_parser.add_argument(
|
458 |
+
"--rms_mix_rate",
|
459 |
type=str,
|
460 |
+
help="Value for rms_mix_rate",
|
461 |
+
choices=[str(i / 10) for i in range(11)],
|
462 |
+
default="1",
|
463 |
)
|
464 |
infer_parser.add_argument(
|
465 |
+
"--protect",
|
466 |
+
type=str,
|
467 |
+
help="Value for protect",
|
468 |
+
choices=[str(i / 10) for i in range(6)],
|
469 |
+
default="0.33",
|
470 |
)
|
471 |
infer_parser.add_argument(
|
472 |
+
"--hop_length",
|
473 |
+
type=str,
|
474 |
+
help="Value for hop_length",
|
475 |
+
choices=[str(i) for i in range(1, 513)],
|
476 |
+
default="128",
|
477 |
)
|
478 |
infer_parser.add_argument(
|
479 |
+
"--f0method",
|
480 |
+
type=str,
|
481 |
+
help="Value for f0method",
|
482 |
+
choices=[
|
483 |
+
"pm",
|
484 |
+
"harvest",
|
485 |
+
"dio",
|
486 |
+
"crepe",
|
487 |
+
"crepe-tiny",
|
488 |
+
"rmvpe",
|
489 |
+
"fcpe",
|
490 |
+
"hybrid[crepe+rmvpe]",
|
491 |
+
"hybrid[crepe+fcpe]",
|
492 |
+
"hybrid[rmvpe+fcpe]",
|
493 |
+
"hybrid[crepe+rmvpe+fcpe]",
|
494 |
+
],
|
495 |
+
default="rmvpe",
|
496 |
+
)
|
497 |
+
infer_parser.add_argument("--input_path", type=str, help="Input path")
|
498 |
+
infer_parser.add_argument("--output_path", type=str, help="Output path")
|
499 |
+
infer_parser.add_argument("--pth_path", type=str, help="Path to the .pth file")
|
500 |
+
infer_parser.add_argument(
|
501 |
+
"--index_path",
|
502 |
+
type=str,
|
503 |
+
help="Path to the .index file",
|
504 |
)
|
505 |
infer_parser.add_argument(
|
506 |
+
"--split_audio",
|
507 |
+
type=str,
|
508 |
+
help="Enable split audio",
|
509 |
+
choices=["True", "False"],
|
510 |
+
default="False",
|
511 |
)
|
512 |
infer_parser.add_argument(
|
513 |
+
"--f0autotune",
|
514 |
type=str,
|
515 |
+
help="Enable autotune",
|
516 |
+
choices=["True", "False"],
|
517 |
+
default="False",
|
518 |
)
|
519 |
infer_parser.add_argument(
|
520 |
+
"--clean_audio",
|
521 |
type=str,
|
522 |
+
help="Enable clean audio",
|
523 |
+
choices=["True", "False"],
|
524 |
+
default="False",
|
525 |
+
)
|
526 |
+
infer_parser.add_argument(
|
527 |
+
"--clean_strength",
|
528 |
+
type=str,
|
529 |
+
help="Value for clean_strength",
|
530 |
+
choices=[str(i / 10) for i in range(11)],
|
531 |
+
default="0.7",
|
532 |
+
)
|
533 |
+
infer_parser.add_argument(
|
534 |
+
"--export_format",
|
535 |
+
type=str,
|
536 |
+
help="Export format",
|
537 |
+
choices=["WAV", "MP3", "FLAC", "OGG", "M4A"],
|
538 |
+
default="WAV",
|
539 |
)
|
540 |
|
541 |
# Parser for 'batch_infer' mode
|
|
|
543 |
"batch_infer", help="Run batch inference"
|
544 |
)
|
545 |
batch_infer_parser.add_argument(
|
546 |
+
"--f0up_key",
|
547 |
+
type=str,
|
548 |
+
help="Value for f0up_key",
|
549 |
+
choices=[str(i) for i in range(-24, 25)],
|
550 |
+
default="0",
|
551 |
+
)
|
552 |
+
batch_infer_parser.add_argument(
|
553 |
+
"--filter_radius",
|
554 |
+
type=str,
|
555 |
+
help="Value for filter_radius",
|
556 |
+
choices=[str(i) for i in range(11)],
|
557 |
+
default="3",
|
558 |
+
)
|
559 |
+
batch_infer_parser.add_argument(
|
560 |
+
"--index_rate",
|
561 |
+
type=str,
|
562 |
+
help="Value for index_rate",
|
563 |
+
choices=[str(i / 10) for i in range(11)],
|
564 |
+
default="0.3",
|
565 |
+
)
|
566 |
+
batch_infer_parser.add_argument(
|
567 |
+
"--rms_mix_rate",
|
568 |
+
type=str,
|
569 |
+
help="Value for rms_mix_rate",
|
570 |
+
choices=[str(i / 10) for i in range(11)],
|
571 |
+
default="1",
|
572 |
)
|
573 |
batch_infer_parser.add_argument(
|
574 |
+
"--protect",
|
575 |
type=str,
|
576 |
+
help="Value for protect",
|
577 |
+
choices=[str(i / 10) for i in range(6)],
|
578 |
+
default="0.33",
|
579 |
)
|
580 |
batch_infer_parser.add_argument(
|
581 |
+
"--hop_length",
|
582 |
type=str,
|
583 |
+
help="Value for hop_length",
|
584 |
+
choices=[str(i) for i in range(1, 513)],
|
585 |
+
default="128",
|
586 |
)
|
587 |
batch_infer_parser.add_argument(
|
588 |
+
"--f0method",
|
589 |
type=str,
|
590 |
+
help="Value for f0method",
|
591 |
+
choices=[
|
592 |
+
"pm",
|
593 |
+
"harvest",
|
594 |
+
"dio",
|
595 |
+
"crepe",
|
596 |
+
"crepe-tiny",
|
597 |
+
"rmvpe",
|
598 |
+
"fcpe",
|
599 |
+
"hybrid[crepe+rmvpe]",
|
600 |
+
"hybrid[crepe+fcpe]",
|
601 |
+
"hybrid[rmvpe+fcpe]",
|
602 |
+
"hybrid[crepe+rmvpe+fcpe]",
|
603 |
+
],
|
604 |
+
default="rmvpe",
|
605 |
+
)
|
606 |
+
batch_infer_parser.add_argument("--input_folder", type=str, help="Input folder")
|
607 |
+
batch_infer_parser.add_argument("--output_folder", type=str, help="Output folder")
|
608 |
+
batch_infer_parser.add_argument(
|
609 |
+
"--pth_path", type=str, help="Path to the .pth file"
|
610 |
)
|
611 |
batch_infer_parser.add_argument(
|
612 |
+
"--index_path",
|
613 |
+
type=str,
|
614 |
+
help="Path to the .index file",
|
615 |
)
|
616 |
batch_infer_parser.add_argument(
|
617 |
+
"--split_audio",
|
618 |
+
type=str,
|
619 |
+
help="Enable split audio",
|
620 |
+
choices=["True", "False"],
|
621 |
+
default="False",
|
622 |
)
|
623 |
batch_infer_parser.add_argument(
|
624 |
+
"--f0autotune",
|
625 |
+
type=str,
|
626 |
+
help="Enable autotune",
|
627 |
+
choices=["True", "False"],
|
628 |
+
default="False",
|
629 |
)
|
630 |
batch_infer_parser.add_argument(
|
631 |
+
"--clean_audio",
|
632 |
+
type=str,
|
633 |
+
help="Enable clean audio",
|
634 |
+
choices=["True", "False"],
|
635 |
+
default="False",
|
636 |
)
|
637 |
batch_infer_parser.add_argument(
|
638 |
+
"--clean_strength",
|
639 |
type=str,
|
640 |
+
help="Value for clean_strength",
|
641 |
+
choices=[str(i / 10) for i in range(11)],
|
642 |
+
default="0.7",
|
643 |
+
)
|
644 |
+
batch_infer_parser.add_argument(
|
645 |
+
"--export_format",
|
646 |
+
type=str,
|
647 |
+
help="Export format",
|
648 |
+
choices=["WAV", "MP3", "FLAC", "OGG", "M4A"],
|
649 |
+
default="WAV",
|
650 |
)
|
651 |
|
652 |
# Parser for 'tts' mode
|
653 |
tts_parser = subparsers.add_parser("tts", help="Run TTS")
|
654 |
tts_parser.add_argument(
|
655 |
+
"--tts_text",
|
656 |
type=str,
|
657 |
+
help="Text to be synthesized",
|
658 |
)
|
659 |
tts_parser.add_argument(
|
660 |
+
"--tts_voice",
|
661 |
+
type=str,
|
662 |
+
help="Voice to be used",
|
663 |
+
choices=locales,
|
664 |
)
|
665 |
tts_parser.add_argument(
|
666 |
+
"--f0up_key",
|
667 |
+
type=str,
|
668 |
+
help="Value for f0up_key",
|
669 |
+
choices=[str(i) for i in range(-24, 25)],
|
670 |
+
default="0",
|
671 |
)
|
672 |
tts_parser.add_argument(
|
673 |
+
"--filter_radius",
|
674 |
type=str,
|
675 |
+
help="Value for filter_radius",
|
676 |
+
choices=[str(i) for i in range(11)],
|
677 |
+
default="3",
|
678 |
)
|
679 |
tts_parser.add_argument(
|
680 |
+
"--index_rate",
|
681 |
type=str,
|
682 |
+
help="Value for index_rate",
|
683 |
+
choices=[str(i / 10) for i in range(11)],
|
684 |
+
default="0.3",
|
685 |
)
|
686 |
tts_parser.add_argument(
|
687 |
+
"--rms_mix_rate",
|
688 |
type=str,
|
689 |
+
help="Value for rms_mix_rate",
|
690 |
+
choices=[str(i / 10) for i in range(11)],
|
691 |
+
default="1",
|
692 |
)
|
693 |
tts_parser.add_argument(
|
694 |
+
"--protect",
|
695 |
+
type=str,
|
696 |
+
help="Value for protect",
|
697 |
+
choices=[str(i / 10) for i in range(6)],
|
698 |
+
default="0.33",
|
699 |
)
|
700 |
tts_parser.add_argument(
|
701 |
+
"--hop_length",
|
702 |
+
type=str,
|
703 |
+
help="Value for hop_length",
|
704 |
+
choices=[str(i) for i in range(1, 513)],
|
705 |
+
default="128",
|
706 |
)
|
707 |
tts_parser.add_argument(
|
708 |
+
"--f0method",
|
709 |
+
type=str,
|
710 |
+
help="Value for f0method",
|
711 |
+
choices=[
|
712 |
+
"pm",
|
713 |
+
"harvest",
|
714 |
+
"dio",
|
715 |
+
"crepe",
|
716 |
+
"crepe-tiny",
|
717 |
+
"rmvpe",
|
718 |
+
"fcpe",
|
719 |
+
"hybrid[crepe+rmvpe]",
|
720 |
+
"hybrid[crepe+fcpe]",
|
721 |
+
"hybrid[rmvpe+fcpe]",
|
722 |
+
"hybrid[crepe+rmvpe+fcpe]",
|
723 |
+
],
|
724 |
+
default="rmvpe",
|
725 |
+
)
|
726 |
+
tts_parser.add_argument("--output_tts_path", type=str, help="Output tts path")
|
727 |
+
tts_parser.add_argument("--output_rvc_path", type=str, help="Output rvc path")
|
728 |
+
tts_parser.add_argument("--pth_path", type=str, help="Path to the .pth file")
|
729 |
+
tts_parser.add_argument(
|
730 |
+
"--index_path",
|
731 |
+
type=str,
|
732 |
+
help="Path to the .index file",
|
733 |
)
|
734 |
tts_parser.add_argument(
|
735 |
+
"--split_audio",
|
736 |
+
type=str,
|
737 |
+
help="Enable split audio",
|
738 |
+
choices=["True", "False"],
|
739 |
+
default="False",
|
740 |
)
|
741 |
tts_parser.add_argument(
|
742 |
+
"--f0autotune",
|
743 |
type=str,
|
744 |
+
help="Enable autotune",
|
745 |
+
choices=["True", "False"],
|
746 |
+
default="False",
|
747 |
+
)
|
748 |
+
tts_parser.add_argument(
|
749 |
+
"--clean_audio",
|
750 |
+
type=str,
|
751 |
+
help="Enable clean audio",
|
752 |
+
choices=["True", "False"],
|
753 |
+
default="False",
|
754 |
+
)
|
755 |
+
tts_parser.add_argument(
|
756 |
+
"--clean_strength",
|
757 |
+
type=str,
|
758 |
+
help="Value for clean_strength",
|
759 |
+
choices=[str(i / 10) for i in range(11)],
|
760 |
+
default="0.7",
|
761 |
+
)
|
762 |
+
tts_parser.add_argument(
|
763 |
+
"--export_format",
|
764 |
+
type=str,
|
765 |
+
help="Export format",
|
766 |
+
choices=["WAV", "MP3", "FLAC", "OGG", "M4A"],
|
767 |
+
default="WAV",
|
768 |
)
|
769 |
|
770 |
# Parser for 'preprocess' mode
|
771 |
preprocess_parser = subparsers.add_parser("preprocess", help="Run preprocessing")
|
772 |
+
preprocess_parser.add_argument("--model_name", type=str, help="Name of the model")
|
773 |
preprocess_parser.add_argument(
|
774 |
+
"--dataset_path",
|
|
|
|
|
|
|
775 |
type=str,
|
776 |
+
help="Path to the dataset",
|
777 |
)
|
778 |
preprocess_parser.add_argument(
|
779 |
+
"--sampling_rate",
|
780 |
+
type=str,
|
781 |
+
help="Sampling rate",
|
782 |
+
choices=["32000", "40000", "48000"],
|
783 |
)
|
784 |
|
785 |
# Parser for 'extract' mode
|
786 |
extract_parser = subparsers.add_parser("extract", help="Run extract")
|
787 |
extract_parser.add_argument(
|
788 |
+
"--model_name",
|
789 |
type=str,
|
790 |
+
help="Name of the model",
|
791 |
)
|
792 |
extract_parser.add_argument(
|
793 |
+
"--rvc_version",
|
794 |
type=str,
|
795 |
+
help="Version of the model",
|
796 |
+
choices=["v1", "v2"],
|
797 |
+
default="v2",
|
798 |
)
|
799 |
extract_parser.add_argument(
|
800 |
+
"--f0method",
|
801 |
+
type=str,
|
802 |
+
help="Value for f0method",
|
803 |
+
choices=[
|
804 |
+
"pm",
|
805 |
+
"harvest",
|
806 |
+
"dio",
|
807 |
+
"crepe",
|
808 |
+
"crepe-tiny",
|
809 |
+
"rmvpe",
|
810 |
+
],
|
811 |
+
default="rmvpe",
|
812 |
)
|
813 |
extract_parser.add_argument(
|
814 |
+
"--hop_length",
|
815 |
type=str,
|
816 |
+
help="Value for hop_length",
|
817 |
+
choices=[str(i) for i in range(1, 513)],
|
818 |
+
default="128",
|
819 |
)
|
820 |
extract_parser.add_argument(
|
821 |
+
"--sampling_rate",
|
822 |
+
type=str,
|
823 |
+
help="Sampling rate",
|
824 |
+
choices=["32000", "40000", "48000"],
|
825 |
)
|
826 |
|
827 |
# Parser for 'train' mode
|
828 |
train_parser = subparsers.add_parser("train", help="Run training")
|
829 |
train_parser.add_argument(
|
830 |
+
"--model_name",
|
831 |
type=str,
|
832 |
+
help="Name of the model",
|
833 |
)
|
834 |
train_parser.add_argument(
|
835 |
+
"--rvc_version",
|
836 |
type=str,
|
837 |
+
help="Version of the model",
|
838 |
+
choices=["v1", "v2"],
|
839 |
+
default="v2",
|
840 |
)
|
841 |
train_parser.add_argument(
|
842 |
+
"--save_every_epoch",
|
843 |
type=str,
|
844 |
help="Save every epoch",
|
845 |
+
choices=[str(i) for i in range(1, 101)],
|
846 |
)
|
847 |
train_parser.add_argument(
|
848 |
+
"--save_only_latest",
|
849 |
type=str,
|
850 |
help="Save weight only at last epoch",
|
851 |
+
choices=["True", "False"],
|
852 |
+
default="False",
|
853 |
)
|
854 |
train_parser.add_argument(
|
855 |
+
"--save_every_weights",
|
856 |
type=str,
|
857 |
help="Save weight every epoch",
|
858 |
+
choices=["True", "False"],
|
859 |
+
default="True",
|
860 |
)
|
861 |
train_parser.add_argument(
|
862 |
+
"--total_epoch",
|
863 |
type=str,
|
864 |
help="Total epoch",
|
865 |
+
choices=[str(i) for i in range(1, 10001)],
|
866 |
+
default="1000",
|
867 |
)
|
868 |
train_parser.add_argument(
|
869 |
+
"--sampling_rate",
|
870 |
+
type=str,
|
871 |
+
help="Sampling rate",
|
872 |
+
choices=["32000", "40000", "48000"],
|
873 |
)
|
874 |
train_parser.add_argument(
|
875 |
+
"--batch_size",
|
876 |
type=str,
|
877 |
help="Batch size",
|
878 |
+
choices=[str(i) for i in range(1, 51)],
|
879 |
+
default="8",
|
880 |
)
|
881 |
train_parser.add_argument(
|
882 |
+
"--gpu",
|
883 |
type=str,
|
884 |
+
help="GPU number",
|
885 |
+
choices=[str(i) for i in range(0, 11)],
|
886 |
+
default="0",
|
887 |
)
|
888 |
train_parser.add_argument(
|
889 |
+
"--pitch_guidance",
|
890 |
+
type=str,
|
891 |
+
help="Pitch guidance",
|
892 |
+
choices=["True", "False"],
|
893 |
+
default="True",
|
894 |
)
|
895 |
train_parser.add_argument(
|
896 |
+
"--pretrained",
|
897 |
+
type=str,
|
898 |
+
help="Pretrained",
|
899 |
+
choices=["True", "False"],
|
900 |
+
default="True",
|
901 |
)
|
902 |
train_parser.add_argument(
|
903 |
+
"--custom_pretrained",
|
904 |
+
type=str,
|
905 |
+
help="Custom pretrained",
|
906 |
+
choices=["True", "False"],
|
907 |
+
default="False",
|
908 |
)
|
909 |
train_parser.add_argument(
|
910 |
+
"--g_pretrained_path",
|
911 |
type=str,
|
912 |
nargs="?",
|
913 |
default=None,
|
914 |
+
help="Path to the pretrained G file",
|
915 |
)
|
916 |
train_parser.add_argument(
|
917 |
+
"--d_pretrained_path",
|
918 |
type=str,
|
919 |
nargs="?",
|
920 |
default=None,
|
921 |
+
help="Path to the pretrained D file",
|
922 |
+
)
|
923 |
+
train_parser.add_argument(
|
924 |
+
"--overtraining_detector",
|
925 |
+
type=str,
|
926 |
+
help="Overtraining detector",
|
927 |
+
choices=["True", "False"],
|
928 |
+
default="False",
|
929 |
+
)
|
930 |
+
train_parser.add_argument(
|
931 |
+
"--overtraining_threshold",
|
932 |
+
type=str,
|
933 |
+
help="Overtraining threshold",
|
934 |
+
choices=[str(i) for i in range(1, 101)],
|
935 |
+
default="50",
|
936 |
)
|
937 |
|
938 |
# Parser for 'index' mode
|
939 |
index_parser = subparsers.add_parser("index", help="Generate index file")
|
940 |
index_parser.add_argument(
|
941 |
+
"--model_name",
|
942 |
type=str,
|
943 |
+
help="Name of the model",
|
944 |
)
|
945 |
index_parser.add_argument(
|
946 |
+
"--rvc_version",
|
947 |
type=str,
|
948 |
+
help="Version of the model",
|
949 |
+
choices=["v1", "v2"],
|
950 |
+
default="v2",
|
951 |
+
)
|
952 |
+
|
953 |
+
# Parser for 'model_extract' mode
|
954 |
+
model_extract_parser = subparsers.add_parser("model_extract", help="Extract model")
|
955 |
+
model_extract_parser.add_argument(
|
956 |
+
"--pth_path",
|
957 |
+
type=str,
|
958 |
+
help="Path to the .pth file",
|
959 |
+
)
|
960 |
+
model_extract_parser.add_argument(
|
961 |
+
"--model_name",
|
962 |
+
type=str,
|
963 |
+
help="Name of the model",
|
964 |
+
)
|
965 |
+
model_extract_parser.add_argument(
|
966 |
+
"--sampling_rate",
|
967 |
+
type=str,
|
968 |
+
help="Sampling rate",
|
969 |
+
choices=["40000", "48000"],
|
970 |
+
)
|
971 |
+
model_extract_parser.add_argument(
|
972 |
+
"--pitch_guidance",
|
973 |
+
type=str,
|
974 |
+
help="Pitch guidance",
|
975 |
+
choices=["True", "False"],
|
976 |
+
)
|
977 |
+
model_extract_parser.add_argument(
|
978 |
+
"--rvc_version",
|
979 |
+
type=str,
|
980 |
+
help="Version of the model",
|
981 |
+
choices=["v1", "v2"],
|
982 |
+
default="v2",
|
983 |
+
)
|
984 |
+
model_extract_parser.add_argument(
|
985 |
+
"--epoch",
|
986 |
+
type=str,
|
987 |
+
help="Epochs of the model",
|
988 |
+
choices=[str(i) for i in range(1, 10001)],
|
989 |
+
)
|
990 |
+
model_extract_parser.add_argument(
|
991 |
+
"--step",
|
992 |
+
type=str,
|
993 |
+
help="Steps of the model",
|
994 |
)
|
995 |
|
996 |
# Parser for 'model_information' mode
|
|
|
998 |
"model_information", help="Print model information"
|
999 |
)
|
1000 |
model_information_parser.add_argument(
|
1001 |
+
"--pth_path",
|
1002 |
type=str,
|
1003 |
+
help="Path to the .pth file",
|
1004 |
)
|
1005 |
|
1006 |
+
# Parser for 'model_blender' mode
|
1007 |
+
model_blender_parser = subparsers.add_parser(
|
1008 |
+
"model_blender", help="Fuse two models"
|
1009 |
+
)
|
1010 |
+
model_blender_parser.add_argument(
|
1011 |
+
"--model_name",
|
1012 |
+
type=str,
|
1013 |
+
help="Name of the model",
|
1014 |
+
)
|
1015 |
+
model_blender_parser.add_argument(
|
1016 |
+
"--pth_path_1",
|
1017 |
type=str,
|
1018 |
+
help="Path to the first .pth file",
|
1019 |
)
|
1020 |
+
model_blender_parser.add_argument(
|
1021 |
+
"--pth_path_2",
|
1022 |
type=str,
|
1023 |
+
help="Path to the second .pth file",
|
1024 |
)
|
1025 |
+
model_blender_parser.add_argument(
|
1026 |
+
"--ratio",
|
1027 |
type=str,
|
1028 |
+
help="Value for blender ratio",
|
1029 |
+
choices=[str(i / 10) for i in range(11)],
|
1030 |
+
default="0.5",
|
1031 |
)
|
1032 |
|
1033 |
# Parser for 'tensorboard' mode
|
|
|
1036 |
# Parser for 'download' mode
|
1037 |
download_parser = subparsers.add_parser("download", help="Download models")
|
1038 |
download_parser.add_argument(
|
1039 |
+
"--model_link",
|
1040 |
+
type=str,
|
1041 |
+
help="Link of the model",
|
1042 |
+
)
|
1043 |
+
|
1044 |
+
# Parser for 'prerequisites' mode
|
1045 |
+
prerequisites_parser = subparsers.add_parser(
|
1046 |
+
"prerequisites", help="Install prerequisites"
|
1047 |
+
)
|
1048 |
+
prerequisites_parser.add_argument(
|
1049 |
+
"--pretraineds_v1",
|
1050 |
type=str,
|
1051 |
+
choices=["True", "False"],
|
1052 |
+
default="True",
|
1053 |
+
help="Download pretrained models for v1",
|
1054 |
+
)
|
1055 |
+
prerequisites_parser.add_argument(
|
1056 |
+
"--pretraineds_v2",
|
1057 |
+
type=str,
|
1058 |
+
choices=["True", "False"],
|
1059 |
+
default="True",
|
1060 |
+
help="Download pretrained models for v2",
|
1061 |
+
)
|
1062 |
+
prerequisites_parser.add_argument(
|
1063 |
+
"--models",
|
1064 |
+
type=str,
|
1065 |
+
choices=["True", "False"],
|
1066 |
+
default="True",
|
1067 |
+
help="Donwload models",
|
1068 |
+
)
|
1069 |
+
prerequisites_parser.add_argument(
|
1070 |
+
"--exe",
|
1071 |
+
type=str,
|
1072 |
+
choices=["True", "False"],
|
1073 |
+
default="True",
|
1074 |
+
help="Download executables",
|
1075 |
+
)
|
1076 |
+
|
1077 |
+
# Parser for 'audio_analyzer' mode
|
1078 |
+
audio_analyzer = subparsers.add_parser("audio_analyzer", help="Run audio analyzer")
|
1079 |
+
audio_analyzer.add_argument(
|
1080 |
+
"--input_path",
|
1081 |
+
type=str,
|
1082 |
+
help="Path to the input audio file",
|
1083 |
)
|
1084 |
|
1085 |
+
# Parser for 'api' mode
|
1086 |
+
api_parser = subparsers.add_parser("api", help="Run the API")
|
1087 |
+
api_parser.add_argument("--ip", type=str, help="IP address", default="127.0.0.1")
|
1088 |
+
api_parser.add_argument("--port", type=str, help="Port", default="8000")
|
1089 |
+
|
1090 |
return parser.parse_args()
|
1091 |
|
1092 |
|
|
|
1100 |
try:
|
1101 |
if args.mode == "infer":
|
1102 |
run_infer_script(
|
1103 |
+
str(args.f0up_key),
|
1104 |
+
str(args.filter_radius),
|
1105 |
+
str(args.index_rate),
|
1106 |
+
str(args.rms_mix_rate),
|
1107 |
+
str(args.protect),
|
1108 |
+
str(args.hop_length),
|
1109 |
+
str(args.f0method),
|
1110 |
+
str(args.input_path),
|
1111 |
+
str(args.output_path),
|
1112 |
+
str(args.pth_path),
|
1113 |
+
str(args.index_path),
|
1114 |
+
str(args.split_audio),
|
1115 |
+
str(args.f0autotune),
|
1116 |
+
str(args.clean_audio),
|
1117 |
+
str(args.clean_strength),
|
1118 |
+
str(args.export_format),
|
1119 |
)
|
1120 |
elif args.mode == "batch_infer":
|
1121 |
run_batch_infer_script(
|
1122 |
+
str(args.f0up_key),
|
1123 |
+
str(args.filter_radius),
|
1124 |
+
str(args.index_rate),
|
1125 |
+
str(args.rms_mix_rate),
|
1126 |
+
str(args.protect),
|
1127 |
+
str(args.hop_length),
|
1128 |
+
str(args.f0method),
|
1129 |
+
str(args.input_folder),
|
1130 |
+
str(args.output_folder),
|
1131 |
+
str(args.pth_path),
|
1132 |
+
str(args.index_path),
|
1133 |
+
str(args.split_audio),
|
1134 |
+
str(args.f0autotune),
|
1135 |
+
str(args.clean_audio),
|
1136 |
+
str(args.clean_strength),
|
1137 |
+
str(args.export_format),
|
1138 |
)
|
1139 |
elif args.mode == "tts":
|
1140 |
run_tts_script(
|
1141 |
+
str(args.tts_text),
|
1142 |
+
str(args.tts_voice),
|
1143 |
+
str(args.f0up_key),
|
1144 |
+
str(args.filter_radius),
|
1145 |
+
str(args.index_rate),
|
1146 |
+
str(args.rms_mix_rate),
|
1147 |
+
str(args.protect),
|
1148 |
+
str(args.hop_length),
|
1149 |
+
str(args.f0method),
|
1150 |
+
str(args.output_tts_path),
|
1151 |
+
str(args.output_rvc_path),
|
1152 |
+
str(args.pth_path),
|
1153 |
+
str(args.index_path),
|
1154 |
+
str(args.split_audio),
|
1155 |
+
str(args.f0autotune),
|
1156 |
+
str(args.clean_audio),
|
1157 |
+
str(args.clean_strength),
|
1158 |
+
str(args.export_format),
|
1159 |
)
|
1160 |
elif args.mode == "preprocess":
|
1161 |
run_preprocess_script(
|
1162 |
+
str(args.model_name),
|
1163 |
+
str(args.dataset_path),
|
1164 |
str(args.sampling_rate),
|
1165 |
)
|
|
|
1166 |
elif args.mode == "extract":
|
1167 |
run_extract_script(
|
1168 |
+
str(args.model_name),
|
1169 |
+
str(args.rvc_version),
|
1170 |
+
str(args.f0method),
|
1171 |
+
str(args.hop_length),
|
1172 |
+
str(args.sampling_rate),
|
1173 |
)
|
1174 |
elif args.mode == "train":
|
1175 |
run_train_script(
|
1176 |
+
str(args.model_name),
|
1177 |
+
str(args.rvc_version),
|
1178 |
+
str(args.save_every_epoch),
|
1179 |
+
str(args.save_only_latest),
|
1180 |
+
str(args.save_every_weights),
|
1181 |
+
str(args.total_epoch),
|
1182 |
+
str(args.sampling_rate),
|
1183 |
+
str(args.batch_size),
|
1184 |
+
str(args.gpu),
|
1185 |
+
str(args.pitch_guidance),
|
1186 |
+
str(args.pretrained),
|
1187 |
+
str(args.custom_pretrained),
|
1188 |
+
str(args.g_pretrained_path),
|
1189 |
+
str(args.d_pretrained_path),
|
1190 |
+
str(args.overtraining_detector),
|
1191 |
+
str(args.overtraining_threshold),
|
1192 |
)
|
1193 |
elif args.mode == "index":
|
1194 |
run_index_script(
|
1195 |
+
str(args.model_name),
|
1196 |
+
str(args.rvc_version),
|
1197 |
+
)
|
1198 |
+
elif args.mode == "model_extract":
|
1199 |
+
run_model_extract_script(
|
1200 |
+
str(args.pth_path),
|
1201 |
+
str(args.model_name),
|
1202 |
+
str(args.sampling_rate),
|
1203 |
+
str(args.pitch_guidance),
|
1204 |
+
str(args.rvc_version),
|
1205 |
+
str(args.epoch),
|
1206 |
+
str(args.step),
|
1207 |
)
|
1208 |
elif args.mode == "model_information":
|
1209 |
run_model_information_script(
|
1210 |
+
str(args.pth_path),
|
1211 |
)
|
1212 |
+
elif args.mode == "model_blender":
|
1213 |
+
run_model_blender_script(
|
1214 |
+
str(args.model_name),
|
1215 |
+
str(args.pth_path_1),
|
1216 |
+
str(args.pth_path_2),
|
1217 |
+
str(args.ratio),
|
1218 |
)
|
1219 |
elif args.mode == "tensorboard":
|
1220 |
run_tensorboard_script()
|
1221 |
elif args.mode == "download":
|
1222 |
run_download_script(
|
1223 |
+
str(args.model_link),
|
1224 |
+
)
|
1225 |
+
elif args.mode == "prerequisites":
|
1226 |
+
run_prerequisites_script(
|
1227 |
+
str(args.pretraineds_v1),
|
1228 |
+
str(args.pretraineds_v2),
|
1229 |
+
str(args.models),
|
1230 |
+
str(args.exe),
|
1231 |
+
)
|
1232 |
+
elif args.mode == "audio_analyzer":
|
1233 |
+
run_audio_analyzer_script(
|
1234 |
+
str(args.input_path),
|
1235 |
+
)
|
1236 |
+
elif args.mode == "api":
|
1237 |
+
run_api_script(
|
1238 |
+
str(args.ip),
|
1239 |
+
str(args.port),
|
1240 |
)
|
1241 |
except Exception as error:
|
1242 |
print(f"Error: {error}")
|
rvc/configs/config.py
CHANGED
@@ -1,10 +1,6 @@
|
|
1 |
-
import argparse
|
2 |
-
import os
|
3 |
-
import sys
|
4 |
-
import json
|
5 |
-
from multiprocessing import cpu_count
|
6 |
-
|
7 |
import torch
|
|
|
|
|
8 |
|
9 |
version_config_list = [
|
10 |
"v1/32000.json",
|
@@ -64,6 +60,9 @@ class Config:
|
|
64 |
return False
|
65 |
|
66 |
def use_fp32_config(self):
|
|
|
|
|
|
|
67 |
for config_file in version_config_list:
|
68 |
self.json_config[config_file]["train"]["fp16_run"] = False
|
69 |
with open(f"rvc/configs/{config_file}", "r") as f:
|
@@ -116,7 +115,7 @@ class Config:
|
|
116 |
self.use_fp32_config()
|
117 |
|
118 |
if self.n_cpu == 0:
|
119 |
-
self.n_cpu = cpu_count()
|
120 |
|
121 |
if self.is_half:
|
122 |
x_pad = 3
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import torch
|
2 |
+
import json
|
3 |
+
import os
|
4 |
|
5 |
version_config_list = [
|
6 |
"v1/32000.json",
|
|
|
60 |
return False
|
61 |
|
62 |
def use_fp32_config(self):
|
63 |
+
print(
|
64 |
+
f"Using FP32 config instead of FP16 due to GPU compatibility ({self.gpu_name})"
|
65 |
+
)
|
66 |
for config_file in version_config_list:
|
67 |
self.json_config[config_file]["train"]["fp16_run"] = False
|
68 |
with open(f"rvc/configs/{config_file}", "r") as f:
|
|
|
115 |
self.use_fp32_config()
|
116 |
|
117 |
if self.n_cpu == 0:
|
118 |
+
self.n_cpu = os.cpu_count()
|
119 |
|
120 |
if self.is_half:
|
121 |
x_pad = 3
|
rvc/infer/infer.py
CHANGED
@@ -1,9 +1,19 @@
|
|
1 |
import os
|
2 |
import sys
|
|
|
3 |
import torch
|
|
|
|
|
4 |
import numpy as np
|
5 |
import soundfile as sf
|
6 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
from rvc.lib.utils import load_audio
|
8 |
from rvc.lib.tools.split_audio import process_audio, merge_audio
|
9 |
from fairseq import checkpoint_utils
|
@@ -13,13 +23,19 @@ from rvc.lib.infer_pack.models import (
|
|
13 |
SynthesizerTrnMs768NSFsid,
|
14 |
SynthesizerTrnMs768NSFsid_nono,
|
15 |
)
|
16 |
-
|
17 |
from rvc.configs.config import Config
|
18 |
|
19 |
-
|
|
|
20 |
|
21 |
-
|
22 |
hubert_model = None
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
|
24 |
|
25 |
def load_hubert():
|
@@ -37,6 +53,44 @@ def load_hubert():
|
|
37 |
hubert_model.eval()
|
38 |
|
39 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
40 |
def vc_single(
|
41 |
sid=0,
|
42 |
input_audio_path=None,
|
@@ -46,17 +100,16 @@ def vc_single(
|
|
46 |
file_index=None,
|
47 |
index_rate=None,
|
48 |
resample_sr=0,
|
49 |
-
rms_mix_rate=
|
50 |
-
protect=
|
51 |
hop_length=None,
|
52 |
output_path=None,
|
53 |
split_audio=False,
|
|
|
|
|
54 |
):
|
55 |
global tgt_sr, net_g, vc, hubert_model, version
|
56 |
|
57 |
-
if input_audio_path is None:
|
58 |
-
return "Please, load an audio!", None
|
59 |
-
|
60 |
f0_up_key = int(f0_up_key)
|
61 |
try:
|
62 |
audio = load_audio(input_audio_path, 16000)
|
@@ -95,7 +148,7 @@ def vc_single(
|
|
95 |
]
|
96 |
try:
|
97 |
for path in paths:
|
98 |
-
|
99 |
sid,
|
100 |
path,
|
101 |
f0_up_key,
|
@@ -109,17 +162,18 @@ def vc_single(
|
|
109 |
hop_length,
|
110 |
path,
|
111 |
False,
|
|
|
112 |
)
|
113 |
-
# new_dir_path
|
114 |
except Exception as error:
|
115 |
print(error)
|
116 |
-
return "Error"
|
117 |
print("Finished processing segmented audio, now merging audio...")
|
118 |
merge_timestamps_file = os.path.join(
|
119 |
os.path.dirname(new_dir_path),
|
120 |
f"{os.path.basename(input_audio_path).split('.')[0]}_timestamps.txt",
|
121 |
)
|
122 |
tgt_sr, audio_opt = merge_audio(merge_timestamps_file)
|
|
|
123 |
|
124 |
else:
|
125 |
audio_opt = vc.pipeline(
|
@@ -140,9 +194,9 @@ def vc_single(
|
|
140 |
version,
|
141 |
protect,
|
142 |
hop_length,
|
|
|
143 |
f0_file=f0_file,
|
144 |
)
|
145 |
-
|
146 |
if output_path is not None:
|
147 |
sf.write(output_path, audio_opt, tgt_sr, format="WAV")
|
148 |
|
@@ -158,7 +212,7 @@ def get_vc(weight_root, sid):
|
|
158 |
global hubert_model
|
159 |
if hubert_model is not None:
|
160 |
print("clean_empty_cache")
|
161 |
-
del net_g, n_spk, vc, hubert_model, tgt_sr
|
162 |
hubert_model = net_g = n_spk = vc = hubert_model = tgt_sr = None
|
163 |
if torch.cuda.is_available():
|
164 |
torch.cuda.empty_cache()
|
@@ -211,55 +265,64 @@ def get_vc(weight_root, sid):
|
|
211 |
n_spk = cpt["config"][-3]
|
212 |
|
213 |
|
214 |
-
|
215 |
-
|
216 |
-
|
217 |
-
|
218 |
-
|
219 |
-
|
220 |
-
|
221 |
-
|
222 |
-
|
223 |
-
|
224 |
-
|
225 |
-
|
226 |
-
|
227 |
-
|
228 |
-
|
229 |
-
|
230 |
-
|
231 |
-
|
232 |
-
|
233 |
-
f0_pitch = f0up_key
|
234 |
-
f0_file = None
|
235 |
-
f0_method = f0method
|
236 |
-
file_index = index_path
|
237 |
-
index_rate = index_rate
|
238 |
-
output_file = audio_output_path
|
239 |
-
split_audio = split_audio
|
240 |
-
|
241 |
-
get_vc(model_path, 0)
|
242 |
-
|
243 |
-
try:
|
244 |
-
result, audio_opt = vc_single(
|
245 |
-
sid=0,
|
246 |
-
input_audio_path=input_audio,
|
247 |
-
f0_up_key=f0_pitch,
|
248 |
-
f0_file=None,
|
249 |
-
f0_method=f0_method,
|
250 |
-
file_index=file_index,
|
251 |
-
index_rate=index_rate,
|
252 |
-
hop_length=hop_length,
|
253 |
-
output_path=output_file,
|
254 |
-
split_audio=split_audio,
|
255 |
-
)
|
256 |
|
257 |
-
|
258 |
-
|
259 |
-
|
260 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
261 |
|
262 |
-
|
|
|
|
|
|
|
|
|
|
|
263 |
|
264 |
-
|
265 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import os
|
2 |
import sys
|
3 |
+
import time
|
4 |
import torch
|
5 |
+
import logging
|
6 |
+
|
7 |
import numpy as np
|
8 |
import soundfile as sf
|
9 |
+
import librosa
|
10 |
+
|
11 |
+
now_dir = os.getcwd()
|
12 |
+
sys.path.append(now_dir)
|
13 |
+
|
14 |
+
from rvc.infer.pipeline import VC
|
15 |
+
from scipy.io import wavfile
|
16 |
+
import noisereduce as nr
|
17 |
from rvc.lib.utils import load_audio
|
18 |
from rvc.lib.tools.split_audio import process_audio, merge_audio
|
19 |
from fairseq import checkpoint_utils
|
|
|
23 |
SynthesizerTrnMs768NSFsid,
|
24 |
SynthesizerTrnMs768NSFsid_nono,
|
25 |
)
|
|
|
26 |
from rvc.configs.config import Config
|
27 |
|
28 |
+
logging.getLogger("fairseq").setLevel(logging.WARNING)
|
29 |
+
logging.getLogger("httpx").setLevel(logging.WARNING)
|
30 |
|
31 |
+
config = Config()
|
32 |
hubert_model = None
|
33 |
+
tgt_sr = None
|
34 |
+
net_g = None
|
35 |
+
vc = None
|
36 |
+
cpt = None
|
37 |
+
version = None
|
38 |
+
n_spk = None
|
39 |
|
40 |
|
41 |
def load_hubert():
|
|
|
53 |
hubert_model.eval()
|
54 |
|
55 |
|
56 |
+
def remove_audio_noise(input_audio_path, reduction_strength=0.7):
|
57 |
+
try:
|
58 |
+
rate, data = wavfile.read(input_audio_path)
|
59 |
+
reduced_noise = nr.reduce_noise(
|
60 |
+
y=data,
|
61 |
+
sr=rate,
|
62 |
+
prop_decrease=reduction_strength,
|
63 |
+
)
|
64 |
+
return reduced_noise
|
65 |
+
except Exception as error:
|
66 |
+
print(f"Error cleaning audio: {error}")
|
67 |
+
return None
|
68 |
+
|
69 |
+
|
70 |
+
def convert_audio_format(input_path, output_path, output_format):
|
71 |
+
try:
|
72 |
+
if output_format != "WAV":
|
73 |
+
print(f"Converting audio to {output_format} format...")
|
74 |
+
audio, sample_rate = librosa.load(input_path, sr=None)
|
75 |
+
common_sample_rates = [
|
76 |
+
8000,
|
77 |
+
11025,
|
78 |
+
12000,
|
79 |
+
16000,
|
80 |
+
22050,
|
81 |
+
24000,
|
82 |
+
32000,
|
83 |
+
44100,
|
84 |
+
48000,
|
85 |
+
]
|
86 |
+
target_sr = min(common_sample_rates, key=lambda x: abs(x - sample_rate))
|
87 |
+
audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=target_sr)
|
88 |
+
sf.write(output_path, audio, target_sr, format=output_format.lower())
|
89 |
+
return output_path
|
90 |
+
except Exception as error:
|
91 |
+
print(f"Failed to convert audio to {output_format} format: {error}")
|
92 |
+
|
93 |
+
|
94 |
def vc_single(
|
95 |
sid=0,
|
96 |
input_audio_path=None,
|
|
|
100 |
file_index=None,
|
101 |
index_rate=None,
|
102 |
resample_sr=0,
|
103 |
+
rms_mix_rate=None,
|
104 |
+
protect=None,
|
105 |
hop_length=None,
|
106 |
output_path=None,
|
107 |
split_audio=False,
|
108 |
+
f0autotune=False,
|
109 |
+
filter_radius=None,
|
110 |
):
|
111 |
global tgt_sr, net_g, vc, hubert_model, version
|
112 |
|
|
|
|
|
|
|
113 |
f0_up_key = int(f0_up_key)
|
114 |
try:
|
115 |
audio = load_audio(input_audio_path, 16000)
|
|
|
148 |
]
|
149 |
try:
|
150 |
for path in paths:
|
151 |
+
vc_single(
|
152 |
sid,
|
153 |
path,
|
154 |
f0_up_key,
|
|
|
162 |
hop_length,
|
163 |
path,
|
164 |
False,
|
165 |
+
f0autotune,
|
166 |
)
|
|
|
167 |
except Exception as error:
|
168 |
print(error)
|
169 |
+
return f"Error {error}"
|
170 |
print("Finished processing segmented audio, now merging audio...")
|
171 |
merge_timestamps_file = os.path.join(
|
172 |
os.path.dirname(new_dir_path),
|
173 |
f"{os.path.basename(input_audio_path).split('.')[0]}_timestamps.txt",
|
174 |
)
|
175 |
tgt_sr, audio_opt = merge_audio(merge_timestamps_file)
|
176 |
+
os.remove(merge_timestamps_file)
|
177 |
|
178 |
else:
|
179 |
audio_opt = vc.pipeline(
|
|
|
194 |
version,
|
195 |
protect,
|
196 |
hop_length,
|
197 |
+
f0autotune,
|
198 |
f0_file=f0_file,
|
199 |
)
|
|
|
200 |
if output_path is not None:
|
201 |
sf.write(output_path, audio_opt, tgt_sr, format="WAV")
|
202 |
|
|
|
212 |
global hubert_model
|
213 |
if hubert_model is not None:
|
214 |
print("clean_empty_cache")
|
215 |
+
del net_g, n_spk, vc, hubert_model, tgt_sr
|
216 |
hubert_model = net_g = n_spk = vc = hubert_model = tgt_sr = None
|
217 |
if torch.cuda.is_available():
|
218 |
torch.cuda.empty_cache()
|
|
|
265 |
n_spk = cpt["config"][-3]
|
266 |
|
267 |
|
268 |
+
def infer_pipeline(
|
269 |
+
f0up_key,
|
270 |
+
filter_radius,
|
271 |
+
index_rate,
|
272 |
+
rms_mix_rate,
|
273 |
+
protect,
|
274 |
+
hop_length,
|
275 |
+
f0method,
|
276 |
+
audio_input_path,
|
277 |
+
audio_output_path,
|
278 |
+
model_path,
|
279 |
+
index_path,
|
280 |
+
split_audio,
|
281 |
+
f0autotune,
|
282 |
+
clean_audio,
|
283 |
+
clean_strength,
|
284 |
+
export_format,
|
285 |
+
):
|
286 |
+
global tgt_sr, net_g, vc, cpt
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
287 |
|
288 |
+
get_vc(model_path, 0)
|
289 |
+
|
290 |
+
try:
|
291 |
+
start_time = time.time()
|
292 |
+
vc_single(
|
293 |
+
sid=0,
|
294 |
+
input_audio_path=audio_input_path,
|
295 |
+
f0_up_key=f0up_key,
|
296 |
+
f0_file=None,
|
297 |
+
f0_method=f0method,
|
298 |
+
file_index=index_path,
|
299 |
+
index_rate=index_rate,
|
300 |
+
rms_mix_rate=rms_mix_rate,
|
301 |
+
protect=protect,
|
302 |
+
hop_length=hop_length,
|
303 |
+
output_path=audio_output_path,
|
304 |
+
split_audio=split_audio,
|
305 |
+
f0autotune=f0autotune,
|
306 |
+
filter_radius=filter_radius,
|
307 |
+
)
|
308 |
+
|
309 |
+
if clean_audio == "True":
|
310 |
+
cleaned_audio = remove_audio_noise(audio_output_path, clean_strength)
|
311 |
+
if cleaned_audio is not None:
|
312 |
+
sf.write(audio_output_path, cleaned_audio, tgt_sr, format="WAV")
|
313 |
|
314 |
+
output_path_format = audio_output_path.replace(
|
315 |
+
".wav", f".{export_format.lower()}"
|
316 |
+
)
|
317 |
+
audio_output_path = convert_audio_format(
|
318 |
+
audio_output_path, output_path_format, export_format
|
319 |
+
)
|
320 |
|
321 |
+
end_time = time.time()
|
322 |
+
elapsed_time = end_time - start_time
|
323 |
+
print(
|
324 |
+
f"Conversion completed. Output file: '{audio_output_path}' in {elapsed_time:.2f} seconds."
|
325 |
+
)
|
326 |
+
|
327 |
+
except Exception as error:
|
328 |
+
print(f"Voice conversion failed: {error}")
|
rvc/infer/pipeline.py
ADDED
@@ -0,0 +1,625 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np, parselmouth, torch, pdb, sys, os
|
2 |
+
from time import time as ttime
|
3 |
+
import torch.nn.functional as F
|
4 |
+
import torchcrepe
|
5 |
+
from torch import Tensor
|
6 |
+
import scipy.signal as signal
|
7 |
+
import pyworld, os, faiss, librosa, torchcrepe
|
8 |
+
from scipy import signal
|
9 |
+
from functools import lru_cache
|
10 |
+
import random
|
11 |
+
import gc
|
12 |
+
import re
|
13 |
+
|
14 |
+
now_dir = os.getcwd()
|
15 |
+
sys.path.append(now_dir)
|
16 |
+
|
17 |
+
from rvc.lib.FCPEF0Predictor import FCPEF0Predictor
|
18 |
+
|
19 |
+
bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000)
|
20 |
+
|
21 |
+
input_audio_path2wav = {}
|
22 |
+
|
23 |
+
|
24 |
+
@lru_cache
|
25 |
+
def cache_harvest_f0(input_audio_path, fs, f0max, f0min, frame_period):
|
26 |
+
audio = input_audio_path2wav[input_audio_path]
|
27 |
+
f0, t = pyworld.harvest(
|
28 |
+
audio,
|
29 |
+
fs=fs,
|
30 |
+
f0_ceil=f0max,
|
31 |
+
f0_floor=f0min,
|
32 |
+
frame_period=frame_period,
|
33 |
+
)
|
34 |
+
f0 = pyworld.stonemask(audio, f0, t, fs)
|
35 |
+
return f0
|
36 |
+
|
37 |
+
|
38 |
+
def change_rms(data1, sr1, data2, sr2, rate):
|
39 |
+
# print(data1.max(),data2.max())
|
40 |
+
rms1 = librosa.feature.rms(y=data1, frame_length=sr1 // 2 * 2, hop_length=sr1 // 2)
|
41 |
+
rms2 = librosa.feature.rms(y=data2, frame_length=sr2 // 2 * 2, hop_length=sr2 // 2)
|
42 |
+
|
43 |
+
rms1 = torch.from_numpy(rms1)
|
44 |
+
rms1 = F.interpolate(
|
45 |
+
rms1.unsqueeze(0), size=data2.shape[0], mode="linear"
|
46 |
+
).squeeze()
|
47 |
+
|
48 |
+
rms2 = torch.from_numpy(rms2)
|
49 |
+
rms2 = F.interpolate(
|
50 |
+
rms2.unsqueeze(0), size=data2.shape[0], mode="linear"
|
51 |
+
).squeeze()
|
52 |
+
rms2 = torch.max(rms2, torch.zeros_like(rms2) + 1e-6)
|
53 |
+
|
54 |
+
data2 *= (
|
55 |
+
torch.pow(rms1, torch.tensor(1 - rate))
|
56 |
+
* torch.pow(rms2, torch.tensor(rate - 1))
|
57 |
+
).numpy()
|
58 |
+
return data2
|
59 |
+
|
60 |
+
|
61 |
+
class VC(object):
|
62 |
+
def __init__(self, tgt_sr, config):
|
63 |
+
self.x_pad, self.x_query, self.x_center, self.x_max, self.is_half = (
|
64 |
+
config.x_pad,
|
65 |
+
config.x_query,
|
66 |
+
config.x_center,
|
67 |
+
config.x_max,
|
68 |
+
config.is_half,
|
69 |
+
)
|
70 |
+
self.sr = 16000
|
71 |
+
self.window = 160
|
72 |
+
self.t_pad = self.sr * self.x_pad
|
73 |
+
self.t_pad_tgt = tgt_sr * self.x_pad
|
74 |
+
self.t_pad2 = self.t_pad * 2
|
75 |
+
self.t_query = self.sr * self.x_query
|
76 |
+
self.t_center = self.sr * self.x_center
|
77 |
+
self.t_max = self.sr * self.x_max
|
78 |
+
self.device = config.device
|
79 |
+
self.ref_freqs = [
|
80 |
+
65.41,
|
81 |
+
82.41,
|
82 |
+
110.00,
|
83 |
+
146.83,
|
84 |
+
196.00,
|
85 |
+
246.94,
|
86 |
+
329.63,
|
87 |
+
440.00,
|
88 |
+
587.33,
|
89 |
+
783.99,
|
90 |
+
1046.50,
|
91 |
+
]
|
92 |
+
# Generate interpolated frequencies
|
93 |
+
self.note_dict = self.generate_interpolated_frequencies()
|
94 |
+
|
95 |
+
def generate_interpolated_frequencies(self):
|
96 |
+
# Generate interpolated frequencies based on the reference frequencies.
|
97 |
+
note_dict = []
|
98 |
+
for i in range(len(self.ref_freqs) - 1):
|
99 |
+
freq_low = self.ref_freqs[i]
|
100 |
+
freq_high = self.ref_freqs[i + 1]
|
101 |
+
# Interpolate between adjacent reference frequencies
|
102 |
+
interpolated_freqs = np.linspace(
|
103 |
+
freq_low, freq_high, num=10, endpoint=False
|
104 |
+
)
|
105 |
+
note_dict.extend(interpolated_freqs)
|
106 |
+
# Add the last reference frequency
|
107 |
+
note_dict.append(self.ref_freqs[-1])
|
108 |
+
return note_dict
|
109 |
+
|
110 |
+
def autotune_f0(self, f0):
|
111 |
+
# Autotunes the given fundamental frequency (f0) to the nearest musical note.
|
112 |
+
autotuned_f0 = np.zeros_like(f0)
|
113 |
+
for i, freq in enumerate(f0):
|
114 |
+
# Find the closest note
|
115 |
+
closest_note = min(self.note_dict, key=lambda x: abs(x - freq))
|
116 |
+
autotuned_f0[i] = closest_note
|
117 |
+
return autotuned_f0
|
118 |
+
|
119 |
+
def get_optimal_torch_device(self, index: int = 0) -> torch.device:
|
120 |
+
if torch.cuda.is_available():
|
121 |
+
return torch.device(f"cuda:{index % torch.cuda.device_count()}")
|
122 |
+
elif torch.backends.mps.is_available():
|
123 |
+
return torch.device("mps")
|
124 |
+
return torch.device("cpu")
|
125 |
+
|
126 |
+
def get_f0_crepe_computation(
|
127 |
+
self,
|
128 |
+
x,
|
129 |
+
f0_min,
|
130 |
+
f0_max,
|
131 |
+
p_len,
|
132 |
+
hop_length,
|
133 |
+
model="full",
|
134 |
+
):
|
135 |
+
x = x.astype(np.float32)
|
136 |
+
x /= np.quantile(np.abs(x), 0.999)
|
137 |
+
torch_device = self.get_optimal_torch_device()
|
138 |
+
audio = torch.from_numpy(x).to(torch_device, copy=True)
|
139 |
+
audio = torch.unsqueeze(audio, dim=0)
|
140 |
+
if audio.ndim == 2 and audio.shape[0] > 1:
|
141 |
+
audio = torch.mean(audio, dim=0, keepdim=True).detach()
|
142 |
+
audio = audio.detach()
|
143 |
+
pitch: Tensor = torchcrepe.predict(
|
144 |
+
audio,
|
145 |
+
self.sr,
|
146 |
+
hop_length,
|
147 |
+
f0_min,
|
148 |
+
f0_max,
|
149 |
+
model,
|
150 |
+
batch_size=hop_length * 2,
|
151 |
+
device=torch_device,
|
152 |
+
pad=True,
|
153 |
+
)
|
154 |
+
p_len = p_len or x.shape[0] // hop_length
|
155 |
+
source = np.array(pitch.squeeze(0).cpu().float().numpy())
|
156 |
+
source[source < 0.001] = np.nan
|
157 |
+
target = np.interp(
|
158 |
+
np.arange(0, len(source) * p_len, len(source)) / p_len,
|
159 |
+
np.arange(0, len(source)),
|
160 |
+
source,
|
161 |
+
)
|
162 |
+
f0 = np.nan_to_num(target)
|
163 |
+
return f0
|
164 |
+
|
165 |
+
def get_f0_official_crepe_computation(
|
166 |
+
self,
|
167 |
+
x,
|
168 |
+
f0_min,
|
169 |
+
f0_max,
|
170 |
+
model="full",
|
171 |
+
):
|
172 |
+
batch_size = 512
|
173 |
+
audio = torch.tensor(np.copy(x))[None].float()
|
174 |
+
f0, pd = torchcrepe.predict(
|
175 |
+
audio,
|
176 |
+
self.sr,
|
177 |
+
self.window,
|
178 |
+
f0_min,
|
179 |
+
f0_max,
|
180 |
+
model,
|
181 |
+
batch_size=batch_size,
|
182 |
+
device=self.device,
|
183 |
+
return_periodicity=True,
|
184 |
+
)
|
185 |
+
pd = torchcrepe.filter.median(pd, 3)
|
186 |
+
f0 = torchcrepe.filter.mean(f0, 3)
|
187 |
+
f0[pd < 0.1] = 0
|
188 |
+
f0 = f0[0].cpu().numpy()
|
189 |
+
return f0
|
190 |
+
|
191 |
+
def get_f0_hybrid_computation(
|
192 |
+
self,
|
193 |
+
methods_str,
|
194 |
+
x,
|
195 |
+
f0_min,
|
196 |
+
f0_max,
|
197 |
+
p_len,
|
198 |
+
hop_length,
|
199 |
+
):
|
200 |
+
methods_str = re.search("hybrid\[(.+)\]", methods_str)
|
201 |
+
if methods_str:
|
202 |
+
methods = [method.strip() for method in methods_str.group(1).split("+")]
|
203 |
+
f0_computation_stack = []
|
204 |
+
print(f"Calculating f0 pitch estimations for methods {str(methods)}")
|
205 |
+
x = x.astype(np.float32)
|
206 |
+
x /= np.quantile(np.abs(x), 0.999)
|
207 |
+
for method in methods:
|
208 |
+
f0 = None
|
209 |
+
if method == "crepe":
|
210 |
+
f0 = self.get_f0_crepe_computation(
|
211 |
+
x, f0_min, f0_max, p_len, int(hop_length)
|
212 |
+
)
|
213 |
+
elif method == "rmvpe":
|
214 |
+
if hasattr(self, "model_rmvpe") == False:
|
215 |
+
from rvc.lib.rmvpe import RMVPE
|
216 |
+
|
217 |
+
self.model_rmvpe = RMVPE(
|
218 |
+
"rmvpe.pt", is_half=self.is_half, device=self.device
|
219 |
+
)
|
220 |
+
f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)
|
221 |
+
f0 = f0[1:]
|
222 |
+
elif method == "fcpe":
|
223 |
+
self.model_fcpe = FCPEF0Predictor(
|
224 |
+
"fcpe.pt",
|
225 |
+
f0_min=int(f0_min),
|
226 |
+
f0_max=int(f0_max),
|
227 |
+
dtype=torch.float32,
|
228 |
+
device=self.device,
|
229 |
+
sampling_rate=self.sr,
|
230 |
+
threshold=0.03,
|
231 |
+
)
|
232 |
+
f0 = self.model_fcpe.compute_f0(x, p_len=p_len)
|
233 |
+
del self.model_fcpe
|
234 |
+
gc.collect()
|
235 |
+
f0_computation_stack.append(f0)
|
236 |
+
|
237 |
+
print(f"Calculating hybrid median f0 from the stack of {str(methods)}")
|
238 |
+
f0_computation_stack = [fc for fc in f0_computation_stack if fc is not None]
|
239 |
+
f0_median_hybrid = None
|
240 |
+
if len(f0_computation_stack) == 1:
|
241 |
+
f0_median_hybrid = f0_computation_stack[0]
|
242 |
+
else:
|
243 |
+
f0_median_hybrid = np.nanmedian(f0_computation_stack, axis=0)
|
244 |
+
return f0_median_hybrid
|
245 |
+
|
246 |
+
def get_f0(
|
247 |
+
self,
|
248 |
+
input_audio_path,
|
249 |
+
x,
|
250 |
+
p_len,
|
251 |
+
f0_up_key,
|
252 |
+
f0_method,
|
253 |
+
filter_radius,
|
254 |
+
hop_length,
|
255 |
+
f0autotune,
|
256 |
+
inp_f0=None,
|
257 |
+
):
|
258 |
+
global input_audio_path2wav
|
259 |
+
time_step = self.window / self.sr * 1000
|
260 |
+
f0_min = 50
|
261 |
+
f0_max = 1100
|
262 |
+
f0_mel_min = 1127 * np.log(1 + f0_min / 700)
|
263 |
+
f0_mel_max = 1127 * np.log(1 + f0_max / 700)
|
264 |
+
if f0_method == "pm":
|
265 |
+
f0 = (
|
266 |
+
parselmouth.Sound(x, self.sr)
|
267 |
+
.to_pitch_ac(
|
268 |
+
time_step=time_step / 1000,
|
269 |
+
voicing_threshold=0.6,
|
270 |
+
pitch_floor=f0_min,
|
271 |
+
pitch_ceiling=f0_max,
|
272 |
+
)
|
273 |
+
.selected_array["frequency"]
|
274 |
+
)
|
275 |
+
pad_size = (p_len - len(f0) + 1) // 2
|
276 |
+
if pad_size > 0 or p_len - len(f0) - pad_size > 0:
|
277 |
+
f0 = np.pad(
|
278 |
+
f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant"
|
279 |
+
)
|
280 |
+
elif f0_method == "harvest":
|
281 |
+
input_audio_path2wav[input_audio_path] = x.astype(np.double)
|
282 |
+
f0 = cache_harvest_f0(input_audio_path, self.sr, f0_max, f0_min, 10)
|
283 |
+
if int(filter_radius) > 2:
|
284 |
+
f0 = signal.medfilt(f0, 3)
|
285 |
+
elif f0_method == "dio":
|
286 |
+
f0, t = pyworld.dio(
|
287 |
+
x.astype(np.double),
|
288 |
+
fs=self.sr,
|
289 |
+
f0_ceil=f0_max,
|
290 |
+
f0_floor=f0_min,
|
291 |
+
frame_period=10,
|
292 |
+
)
|
293 |
+
f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.sr)
|
294 |
+
f0 = signal.medfilt(f0, 3)
|
295 |
+
elif f0_method == "crepe":
|
296 |
+
f0 = self.get_f0_crepe_computation(
|
297 |
+
x, f0_min, f0_max, p_len, int(hop_length)
|
298 |
+
)
|
299 |
+
elif f0_method == "crepe-tiny":
|
300 |
+
f0 = self.get_f0_crepe_computation(
|
301 |
+
x, f0_min, f0_max, p_len, int(hop_length), "tiny"
|
302 |
+
)
|
303 |
+
elif f0_method == "rmvpe":
|
304 |
+
if hasattr(self, "model_rmvpe") == False:
|
305 |
+
from rvc.lib.rmvpe import RMVPE
|
306 |
+
|
307 |
+
self.model_rmvpe = RMVPE(
|
308 |
+
"rmvpe.pt", is_half=self.is_half, device=self.device
|
309 |
+
)
|
310 |
+
f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)
|
311 |
+
elif f0_method == "fcpe":
|
312 |
+
self.model_fcpe = FCPEF0Predictor(
|
313 |
+
"fcpe.pt",
|
314 |
+
f0_min=int(f0_min),
|
315 |
+
f0_max=int(f0_max),
|
316 |
+
dtype=torch.float32,
|
317 |
+
device=self.device,
|
318 |
+
sampling_rate=self.sr,
|
319 |
+
threshold=0.03,
|
320 |
+
)
|
321 |
+
f0 = self.model_fcpe.compute_f0(x, p_len=p_len)
|
322 |
+
del self.model_fcpe
|
323 |
+
gc.collect()
|
324 |
+
elif "hybrid" in f0_method:
|
325 |
+
input_audio_path2wav[input_audio_path] = x.astype(np.double)
|
326 |
+
f0 = self.get_f0_hybrid_computation(
|
327 |
+
f0_method,
|
328 |
+
x,
|
329 |
+
f0_min,
|
330 |
+
f0_max,
|
331 |
+
p_len,
|
332 |
+
hop_length,
|
333 |
+
)
|
334 |
+
|
335 |
+
if f0autotune == "True":
|
336 |
+
f0 = self.autotune_f0(f0)
|
337 |
+
|
338 |
+
f0 *= pow(2, f0_up_key / 12)
|
339 |
+
tf0 = self.sr // self.window
|
340 |
+
if inp_f0 is not None:
|
341 |
+
delta_t = np.round(
|
342 |
+
(inp_f0[:, 0].max() - inp_f0[:, 0].min()) * tf0 + 1
|
343 |
+
).astype("int16")
|
344 |
+
replace_f0 = np.interp(
|
345 |
+
list(range(delta_t)), inp_f0[:, 0] * 100, inp_f0[:, 1]
|
346 |
+
)
|
347 |
+
shape = f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)].shape[0]
|
348 |
+
f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)] = replace_f0[
|
349 |
+
:shape
|
350 |
+
]
|
351 |
+
f0bak = f0.copy()
|
352 |
+
f0_mel = 1127 * np.log(1 + f0 / 700)
|
353 |
+
f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
|
354 |
+
f0_mel_max - f0_mel_min
|
355 |
+
) + 1
|
356 |
+
f0_mel[f0_mel <= 1] = 1
|
357 |
+
f0_mel[f0_mel > 255] = 255
|
358 |
+
f0_coarse = np.rint(f0_mel).astype(np.int)
|
359 |
+
|
360 |
+
return f0_coarse, f0bak
|
361 |
+
|
362 |
+
def vc(
|
363 |
+
self,
|
364 |
+
model,
|
365 |
+
net_g,
|
366 |
+
sid,
|
367 |
+
audio0,
|
368 |
+
pitch,
|
369 |
+
pitchf,
|
370 |
+
index,
|
371 |
+
big_npy,
|
372 |
+
index_rate,
|
373 |
+
version,
|
374 |
+
protect,
|
375 |
+
):
|
376 |
+
feats = torch.from_numpy(audio0)
|
377 |
+
if self.is_half:
|
378 |
+
feats = feats.half()
|
379 |
+
else:
|
380 |
+
feats = feats.float()
|
381 |
+
if feats.dim() == 2:
|
382 |
+
feats = feats.mean(-1)
|
383 |
+
assert feats.dim() == 1, feats.dim()
|
384 |
+
feats = feats.view(1, -1)
|
385 |
+
padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False)
|
386 |
+
|
387 |
+
inputs = {
|
388 |
+
"source": feats.to(self.device),
|
389 |
+
"padding_mask": padding_mask,
|
390 |
+
"output_layer": 9 if version == "v1" else 12,
|
391 |
+
}
|
392 |
+
t0 = ttime()
|
393 |
+
with torch.no_grad():
|
394 |
+
logits = model.extract_features(**inputs)
|
395 |
+
feats = model.final_proj(logits[0]) if version == "v1" else logits[0]
|
396 |
+
if protect < 0.5 and pitch != None and pitchf != None:
|
397 |
+
feats0 = feats.clone()
|
398 |
+
if (
|
399 |
+
isinstance(index, type(None)) == False
|
400 |
+
and isinstance(big_npy, type(None)) == False
|
401 |
+
and index_rate != 0
|
402 |
+
):
|
403 |
+
npy = feats[0].cpu().numpy()
|
404 |
+
if self.is_half:
|
405 |
+
npy = npy.astype("float32")
|
406 |
+
|
407 |
+
score, ix = index.search(npy, k=8)
|
408 |
+
weight = np.square(1 / score)
|
409 |
+
weight /= weight.sum(axis=1, keepdims=True)
|
410 |
+
npy = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)
|
411 |
+
|
412 |
+
if self.is_half:
|
413 |
+
npy = npy.astype("float16")
|
414 |
+
feats = (
|
415 |
+
torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate
|
416 |
+
+ (1 - index_rate) * feats
|
417 |
+
)
|
418 |
+
|
419 |
+
feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
|
420 |
+
if protect < 0.5 and pitch != None and pitchf != None:
|
421 |
+
feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute(
|
422 |
+
0, 2, 1
|
423 |
+
)
|
424 |
+
t1 = ttime()
|
425 |
+
p_len = audio0.shape[0] // self.window
|
426 |
+
if feats.shape[1] < p_len:
|
427 |
+
p_len = feats.shape[1]
|
428 |
+
if pitch != None and pitchf != None:
|
429 |
+
pitch = pitch[:, :p_len]
|
430 |
+
pitchf = pitchf[:, :p_len]
|
431 |
+
|
432 |
+
if protect < 0.5 and pitch != None and pitchf != None:
|
433 |
+
pitchff = pitchf.clone()
|
434 |
+
pitchff[pitchf > 0] = 1
|
435 |
+
pitchff[pitchf < 1] = protect
|
436 |
+
pitchff = pitchff.unsqueeze(-1)
|
437 |
+
feats = feats * pitchff + feats0 * (1 - pitchff)
|
438 |
+
feats = feats.to(feats0.dtype)
|
439 |
+
p_len = torch.tensor([p_len], device=self.device).long()
|
440 |
+
with torch.no_grad():
|
441 |
+
if pitch != None and pitchf != None:
|
442 |
+
audio1 = (
|
443 |
+
(net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0])
|
444 |
+
.data.cpu()
|
445 |
+
.float()
|
446 |
+
.numpy()
|
447 |
+
)
|
448 |
+
else:
|
449 |
+
audio1 = (
|
450 |
+
(net_g.infer(feats, p_len, sid)[0][0, 0]).data.cpu().float().numpy()
|
451 |
+
)
|
452 |
+
del feats, p_len, padding_mask
|
453 |
+
if torch.cuda.is_available():
|
454 |
+
torch.cuda.empty_cache()
|
455 |
+
t2 = ttime()
|
456 |
+
return audio1
|
457 |
+
|
458 |
+
def pipeline(
|
459 |
+
self,
|
460 |
+
model,
|
461 |
+
net_g,
|
462 |
+
sid,
|
463 |
+
audio,
|
464 |
+
input_audio_path,
|
465 |
+
f0_up_key,
|
466 |
+
f0_method,
|
467 |
+
file_index,
|
468 |
+
index_rate,
|
469 |
+
if_f0,
|
470 |
+
filter_radius,
|
471 |
+
tgt_sr,
|
472 |
+
resample_sr,
|
473 |
+
rms_mix_rate,
|
474 |
+
version,
|
475 |
+
protect,
|
476 |
+
hop_length,
|
477 |
+
f0autotune,
|
478 |
+
f0_file=None,
|
479 |
+
):
|
480 |
+
if file_index != "" and os.path.exists(file_index) == True and index_rate != 0:
|
481 |
+
try:
|
482 |
+
index = faiss.read_index(file_index)
|
483 |
+
big_npy = index.reconstruct_n(0, index.ntotal)
|
484 |
+
except Exception as error:
|
485 |
+
print(error)
|
486 |
+
index = big_npy = None
|
487 |
+
else:
|
488 |
+
index = big_npy = None
|
489 |
+
audio = signal.filtfilt(bh, ah, audio)
|
490 |
+
audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect")
|
491 |
+
opt_ts = []
|
492 |
+
if audio_pad.shape[0] > self.t_max:
|
493 |
+
audio_sum = np.zeros_like(audio)
|
494 |
+
for i in range(self.window):
|
495 |
+
audio_sum += audio_pad[i : i - self.window]
|
496 |
+
for t in range(self.t_center, audio.shape[0], self.t_center):
|
497 |
+
opt_ts.append(
|
498 |
+
t
|
499 |
+
- self.t_query
|
500 |
+
+ np.where(
|
501 |
+
np.abs(audio_sum[t - self.t_query : t + self.t_query])
|
502 |
+
== np.abs(audio_sum[t - self.t_query : t + self.t_query]).min()
|
503 |
+
)[0][0]
|
504 |
+
)
|
505 |
+
s = 0
|
506 |
+
audio_opt = []
|
507 |
+
t = None
|
508 |
+
t1 = ttime()
|
509 |
+
audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect")
|
510 |
+
p_len = audio_pad.shape[0] // self.window
|
511 |
+
inp_f0 = None
|
512 |
+
if hasattr(f0_file, "name") == True:
|
513 |
+
try:
|
514 |
+
with open(f0_file.name, "r") as f:
|
515 |
+
lines = f.read().strip("\n").split("\n")
|
516 |
+
inp_f0 = []
|
517 |
+
for line in lines:
|
518 |
+
inp_f0.append([float(i) for i in line.split(",")])
|
519 |
+
inp_f0 = np.array(inp_f0, dtype="float32")
|
520 |
+
except Exception as error:
|
521 |
+
print(error)
|
522 |
+
sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
|
523 |
+
pitch, pitchf = None, None
|
524 |
+
if if_f0 == 1:
|
525 |
+
pitch, pitchf = self.get_f0(
|
526 |
+
input_audio_path,
|
527 |
+
audio_pad,
|
528 |
+
p_len,
|
529 |
+
f0_up_key,
|
530 |
+
f0_method,
|
531 |
+
filter_radius,
|
532 |
+
hop_length,
|
533 |
+
f0autotune,
|
534 |
+
inp_f0,
|
535 |
+
)
|
536 |
+
pitch = pitch[:p_len]
|
537 |
+
pitchf = pitchf[:p_len]
|
538 |
+
if self.device == "mps":
|
539 |
+
pitchf = pitchf.astype(np.float32)
|
540 |
+
pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()
|
541 |
+
pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float()
|
542 |
+
t2 = ttime()
|
543 |
+
for t in opt_ts:
|
544 |
+
t = t // self.window * self.window
|
545 |
+
if if_f0 == 1:
|
546 |
+
audio_opt.append(
|
547 |
+
self.vc(
|
548 |
+
model,
|
549 |
+
net_g,
|
550 |
+
sid,
|
551 |
+
audio_pad[s : t + self.t_pad2 + self.window],
|
552 |
+
pitch[:, s // self.window : (t + self.t_pad2) // self.window],
|
553 |
+
pitchf[:, s // self.window : (t + self.t_pad2) // self.window],
|
554 |
+
index,
|
555 |
+
big_npy,
|
556 |
+
index_rate,
|
557 |
+
version,
|
558 |
+
protect,
|
559 |
+
)[self.t_pad_tgt : -self.t_pad_tgt]
|
560 |
+
)
|
561 |
+
else:
|
562 |
+
audio_opt.append(
|
563 |
+
self.vc(
|
564 |
+
model,
|
565 |
+
net_g,
|
566 |
+
sid,
|
567 |
+
audio_pad[s : t + self.t_pad2 + self.window],
|
568 |
+
None,
|
569 |
+
None,
|
570 |
+
index,
|
571 |
+
big_npy,
|
572 |
+
index_rate,
|
573 |
+
version,
|
574 |
+
protect,
|
575 |
+
)[self.t_pad_tgt : -self.t_pad_tgt]
|
576 |
+
)
|
577 |
+
s = t
|
578 |
+
if if_f0 == 1:
|
579 |
+
audio_opt.append(
|
580 |
+
self.vc(
|
581 |
+
model,
|
582 |
+
net_g,
|
583 |
+
sid,
|
584 |
+
audio_pad[t:],
|
585 |
+
pitch[:, t // self.window :] if t is not None else pitch,
|
586 |
+
pitchf[:, t // self.window :] if t is not None else pitchf,
|
587 |
+
index,
|
588 |
+
big_npy,
|
589 |
+
index_rate,
|
590 |
+
version,
|
591 |
+
protect,
|
592 |
+
)[self.t_pad_tgt : -self.t_pad_tgt]
|
593 |
+
)
|
594 |
+
else:
|
595 |
+
audio_opt.append(
|
596 |
+
self.vc(
|
597 |
+
model,
|
598 |
+
net_g,
|
599 |
+
sid,
|
600 |
+
audio_pad[t:],
|
601 |
+
None,
|
602 |
+
None,
|
603 |
+
index,
|
604 |
+
big_npy,
|
605 |
+
index_rate,
|
606 |
+
version,
|
607 |
+
protect,
|
608 |
+
)[self.t_pad_tgt : -self.t_pad_tgt]
|
609 |
+
)
|
610 |
+
audio_opt = np.concatenate(audio_opt)
|
611 |
+
if rms_mix_rate != 1:
|
612 |
+
audio_opt = change_rms(audio, 16000, audio_opt, tgt_sr, rms_mix_rate)
|
613 |
+
if resample_sr >= 16000 and tgt_sr != resample_sr:
|
614 |
+
audio_opt = librosa.resample(
|
615 |
+
audio_opt, orig_sr=tgt_sr, target_sr=resample_sr
|
616 |
+
)
|
617 |
+
audio_max = np.abs(audio_opt).max() / 0.99
|
618 |
+
max_int16 = 32768
|
619 |
+
if audio_max > 1:
|
620 |
+
max_int16 /= audio_max
|
621 |
+
audio_opt = (audio_opt * max_int16).astype(np.int16)
|
622 |
+
del pitch, pitchf, sid
|
623 |
+
if torch.cuda.is_available():
|
624 |
+
torch.cuda.empty_cache()
|
625 |
+
return audio_opt
|
rvc/lib/FCPEF0Predictor.py
ADDED
@@ -0,0 +1,1036 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Union
|
2 |
+
|
3 |
+
import torch.nn.functional as F
|
4 |
+
import numpy as np
|
5 |
+
import torch
|
6 |
+
import torch.nn as nn
|
7 |
+
from torch.nn.utils.parametrizations import weight_norm
|
8 |
+
from torchaudio.transforms import Resample
|
9 |
+
import os
|
10 |
+
import librosa
|
11 |
+
import soundfile as sf
|
12 |
+
import torch.utils.data
|
13 |
+
from librosa.filters import mel as librosa_mel_fn
|
14 |
+
import math
|
15 |
+
from functools import partial
|
16 |
+
|
17 |
+
from einops import rearrange, repeat
|
18 |
+
from local_attention import LocalAttention
|
19 |
+
from torch import nn
|
20 |
+
|
21 |
+
os.environ["LRU_CACHE_CAPACITY"] = "3"
|
22 |
+
|
23 |
+
|
24 |
+
def load_wav_to_torch(full_path, target_sr=None, return_empty_on_exception=False):
|
25 |
+
sampling_rate = None
|
26 |
+
try:
|
27 |
+
data, sampling_rate = sf.read(full_path, always_2d=True) # than soundfile.
|
28 |
+
except Exception as error:
|
29 |
+
print(f"'{full_path}' failed to load with {error}")
|
30 |
+
if return_empty_on_exception:
|
31 |
+
return [], sampling_rate or target_sr or 48000
|
32 |
+
else:
|
33 |
+
raise Exception(error)
|
34 |
+
|
35 |
+
if len(data.shape) > 1:
|
36 |
+
data = data[:, 0]
|
37 |
+
assert (
|
38 |
+
len(data) > 2
|
39 |
+
) # check duration of audio file is > 2 samples (because otherwise the slice operation was on the wrong dimension)
|
40 |
+
|
41 |
+
if np.issubdtype(data.dtype, np.integer): # if audio data is type int
|
42 |
+
max_mag = -np.iinfo(
|
43 |
+
data.dtype
|
44 |
+
).min # maximum magnitude = min possible value of intXX
|
45 |
+
else: # if audio data is type fp32
|
46 |
+
max_mag = max(np.amax(data), -np.amin(data))
|
47 |
+
max_mag = (
|
48 |
+
(2**31) + 1
|
49 |
+
if max_mag > (2**15)
|
50 |
+
else ((2**15) + 1 if max_mag > 1.01 else 1.0)
|
51 |
+
) # data should be either 16-bit INT, 32-bit INT or [-1 to 1] float32
|
52 |
+
|
53 |
+
data = torch.FloatTensor(data.astype(np.float32)) / max_mag
|
54 |
+
|
55 |
+
if (
|
56 |
+
torch.isinf(data) | torch.isnan(data)
|
57 |
+
).any() and return_empty_on_exception: # resample will crash with inf/NaN inputs. return_empty_on_exception will return empty arr instead of except
|
58 |
+
return [], sampling_rate or target_sr or 48000
|
59 |
+
if target_sr is not None and sampling_rate != target_sr:
|
60 |
+
data = torch.from_numpy(
|
61 |
+
librosa.core.resample(
|
62 |
+
data.numpy(), orig_sr=sampling_rate, target_sr=target_sr
|
63 |
+
)
|
64 |
+
)
|
65 |
+
sampling_rate = target_sr
|
66 |
+
|
67 |
+
return data, sampling_rate
|
68 |
+
|
69 |
+
|
70 |
+
def dynamic_range_compression(x, C=1, clip_val=1e-5):
|
71 |
+
return np.log(np.clip(x, a_min=clip_val, a_max=None) * C)
|
72 |
+
|
73 |
+
|
74 |
+
def dynamic_range_decompression(x, C=1):
|
75 |
+
return np.exp(x) / C
|
76 |
+
|
77 |
+
|
78 |
+
def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
|
79 |
+
return torch.log(torch.clamp(x, min=clip_val) * C)
|
80 |
+
|
81 |
+
|
82 |
+
def dynamic_range_decompression_torch(x, C=1):
|
83 |
+
return torch.exp(x) / C
|
84 |
+
|
85 |
+
|
86 |
+
class STFT:
|
87 |
+
def __init__(
|
88 |
+
self,
|
89 |
+
sr=22050,
|
90 |
+
n_mels=80,
|
91 |
+
n_fft=1024,
|
92 |
+
win_size=1024,
|
93 |
+
hop_length=256,
|
94 |
+
fmin=20,
|
95 |
+
fmax=11025,
|
96 |
+
clip_val=1e-5,
|
97 |
+
):
|
98 |
+
self.target_sr = sr
|
99 |
+
|
100 |
+
self.n_mels = n_mels
|
101 |
+
self.n_fft = n_fft
|
102 |
+
self.win_size = win_size
|
103 |
+
self.hop_length = hop_length
|
104 |
+
self.fmin = fmin
|
105 |
+
self.fmax = fmax
|
106 |
+
self.clip_val = clip_val
|
107 |
+
self.mel_basis = {}
|
108 |
+
self.hann_window = {}
|
109 |
+
|
110 |
+
def get_mel(self, y, keyshift=0, speed=1, center=False, train=False):
|
111 |
+
sampling_rate = self.target_sr
|
112 |
+
n_mels = self.n_mels
|
113 |
+
n_fft = self.n_fft
|
114 |
+
win_size = self.win_size
|
115 |
+
hop_length = self.hop_length
|
116 |
+
fmin = self.fmin
|
117 |
+
fmax = self.fmax
|
118 |
+
clip_val = self.clip_val
|
119 |
+
|
120 |
+
factor = 2 ** (keyshift / 12)
|
121 |
+
n_fft_new = int(np.round(n_fft * factor))
|
122 |
+
win_size_new = int(np.round(win_size * factor))
|
123 |
+
hop_length_new = int(np.round(hop_length * speed))
|
124 |
+
if not train:
|
125 |
+
mel_basis = self.mel_basis
|
126 |
+
hann_window = self.hann_window
|
127 |
+
else:
|
128 |
+
mel_basis = {}
|
129 |
+
hann_window = {}
|
130 |
+
|
131 |
+
mel_basis_key = str(fmax) + "_" + str(y.device)
|
132 |
+
if mel_basis_key not in mel_basis:
|
133 |
+
mel = librosa_mel_fn(
|
134 |
+
sr=sampling_rate, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax
|
135 |
+
)
|
136 |
+
mel_basis[mel_basis_key] = torch.from_numpy(mel).float().to(y.device)
|
137 |
+
|
138 |
+
keyshift_key = str(keyshift) + "_" + str(y.device)
|
139 |
+
if keyshift_key not in hann_window:
|
140 |
+
hann_window[keyshift_key] = torch.hann_window(win_size_new).to(y.device)
|
141 |
+
|
142 |
+
pad_left = (win_size_new - hop_length_new) // 2
|
143 |
+
pad_right = max(
|
144 |
+
(win_size_new - hop_length_new + 1) // 2,
|
145 |
+
win_size_new - y.size(-1) - pad_left,
|
146 |
+
)
|
147 |
+
if pad_right < y.size(-1):
|
148 |
+
mode = "reflect"
|
149 |
+
else:
|
150 |
+
mode = "constant"
|
151 |
+
y = torch.nn.functional.pad(y.unsqueeze(1), (pad_left, pad_right), mode=mode)
|
152 |
+
y = y.squeeze(1)
|
153 |
+
|
154 |
+
spec = torch.stft(
|
155 |
+
y,
|
156 |
+
n_fft_new,
|
157 |
+
hop_length=hop_length_new,
|
158 |
+
win_length=win_size_new,
|
159 |
+
window=hann_window[keyshift_key],
|
160 |
+
center=center,
|
161 |
+
pad_mode="reflect",
|
162 |
+
normalized=False,
|
163 |
+
onesided=True,
|
164 |
+
return_complex=True,
|
165 |
+
)
|
166 |
+
spec = torch.sqrt(spec.real.pow(2) + spec.imag.pow(2) + (1e-9))
|
167 |
+
if keyshift != 0:
|
168 |
+
size = n_fft // 2 + 1
|
169 |
+
resize = spec.size(1)
|
170 |
+
if resize < size:
|
171 |
+
spec = F.pad(spec, (0, 0, 0, size - resize))
|
172 |
+
spec = spec[:, :size, :] * win_size / win_size_new
|
173 |
+
spec = torch.matmul(mel_basis[mel_basis_key], spec)
|
174 |
+
spec = dynamic_range_compression_torch(spec, clip_val=clip_val)
|
175 |
+
return spec
|
176 |
+
|
177 |
+
def __call__(self, audiopath):
|
178 |
+
audio, sr = load_wav_to_torch(audiopath, target_sr=self.target_sr)
|
179 |
+
spect = self.get_mel(audio.unsqueeze(0)).squeeze(0)
|
180 |
+
return spect
|
181 |
+
|
182 |
+
|
183 |
+
stft = STFT()
|
184 |
+
|
185 |
+
# import fast_transformers.causal_product.causal_product_cuda
|
186 |
+
|
187 |
+
|
188 |
+
def softmax_kernel(
|
189 |
+
data, *, projection_matrix, is_query, normalize_data=True, eps=1e-4, device=None
|
190 |
+
):
|
191 |
+
b, h, *_ = data.shape
|
192 |
+
# (batch size, head, length, model_dim)
|
193 |
+
|
194 |
+
# normalize model dim
|
195 |
+
data_normalizer = (data.shape[-1] ** -0.25) if normalize_data else 1.0
|
196 |
+
|
197 |
+
# what is ration?, projection_matrix.shape[0] --> 266
|
198 |
+
|
199 |
+
ratio = projection_matrix.shape[0] ** -0.5
|
200 |
+
|
201 |
+
projection = repeat(projection_matrix, "j d -> b h j d", b=b, h=h)
|
202 |
+
projection = projection.type_as(data)
|
203 |
+
|
204 |
+
# data_dash = w^T x
|
205 |
+
data_dash = torch.einsum("...id,...jd->...ij", (data_normalizer * data), projection)
|
206 |
+
|
207 |
+
# diag_data = D**2
|
208 |
+
diag_data = data**2
|
209 |
+
diag_data = torch.sum(diag_data, dim=-1)
|
210 |
+
diag_data = (diag_data / 2.0) * (data_normalizer**2)
|
211 |
+
diag_data = diag_data.unsqueeze(dim=-1)
|
212 |
+
|
213 |
+
if is_query:
|
214 |
+
data_dash = ratio * (
|
215 |
+
torch.exp(
|
216 |
+
data_dash
|
217 |
+
- diag_data
|
218 |
+
- torch.max(data_dash, dim=-1, keepdim=True).values
|
219 |
+
)
|
220 |
+
+ eps
|
221 |
+
)
|
222 |
+
else:
|
223 |
+
data_dash = ratio * (
|
224 |
+
torch.exp(data_dash - diag_data + eps)
|
225 |
+
) # - torch.max(data_dash)) + eps)
|
226 |
+
|
227 |
+
return data_dash.type_as(data)
|
228 |
+
|
229 |
+
|
230 |
+
def orthogonal_matrix_chunk(cols, qr_uniform_q=False, device=None):
|
231 |
+
unstructured_block = torch.randn((cols, cols), device=device)
|
232 |
+
q, r = torch.linalg.qr(unstructured_block.cpu(), mode="reduced")
|
233 |
+
q, r = map(lambda t: t.to(device), (q, r))
|
234 |
+
|
235 |
+
# proposed by @Parskatt
|
236 |
+
# to make sure Q is uniform https://arxiv.org/pdf/math-ph/0609050.pdf
|
237 |
+
if qr_uniform_q:
|
238 |
+
d = torch.diag(r, 0)
|
239 |
+
q *= d.sign()
|
240 |
+
return q.t()
|
241 |
+
|
242 |
+
|
243 |
+
def exists(val):
|
244 |
+
return val is not None
|
245 |
+
|
246 |
+
|
247 |
+
def empty(tensor):
|
248 |
+
return tensor.numel() == 0
|
249 |
+
|
250 |
+
|
251 |
+
def default(val, d):
|
252 |
+
return val if exists(val) else d
|
253 |
+
|
254 |
+
|
255 |
+
def cast_tuple(val):
|
256 |
+
return (val,) if not isinstance(val, tuple) else val
|
257 |
+
|
258 |
+
|
259 |
+
class PCmer(nn.Module):
|
260 |
+
"""The encoder that is used in the Transformer model."""
|
261 |
+
|
262 |
+
def __init__(
|
263 |
+
self,
|
264 |
+
num_layers,
|
265 |
+
num_heads,
|
266 |
+
dim_model,
|
267 |
+
dim_keys,
|
268 |
+
dim_values,
|
269 |
+
residual_dropout,
|
270 |
+
attention_dropout,
|
271 |
+
):
|
272 |
+
super().__init__()
|
273 |
+
self.num_layers = num_layers
|
274 |
+
self.num_heads = num_heads
|
275 |
+
self.dim_model = dim_model
|
276 |
+
self.dim_values = dim_values
|
277 |
+
self.dim_keys = dim_keys
|
278 |
+
self.residual_dropout = residual_dropout
|
279 |
+
self.attention_dropout = attention_dropout
|
280 |
+
|
281 |
+
self._layers = nn.ModuleList([_EncoderLayer(self) for _ in range(num_layers)])
|
282 |
+
|
283 |
+
# METHODS ########################################################################################################
|
284 |
+
|
285 |
+
def forward(self, phone, mask=None):
|
286 |
+
|
287 |
+
# apply all layers to the input
|
288 |
+
for i, layer in enumerate(self._layers):
|
289 |
+
phone = layer(phone, mask)
|
290 |
+
# provide the final sequence
|
291 |
+
return phone
|
292 |
+
|
293 |
+
|
294 |
+
# ==================================================================================================================== #
|
295 |
+
# CLASS _ E N C O D E R L A Y E R #
|
296 |
+
# ==================================================================================================================== #
|
297 |
+
|
298 |
+
|
299 |
+
class _EncoderLayer(nn.Module):
|
300 |
+
"""One layer of the encoder.
|
301 |
+
|
302 |
+
Attributes:
|
303 |
+
attn: (:class:`mha.MultiHeadAttention`): The attention mechanism that is used to read the input sequence.
|
304 |
+
feed_forward (:class:`ffl.FeedForwardLayer`): The feed-forward layer on top of the attention mechanism.
|
305 |
+
"""
|
306 |
+
|
307 |
+
def __init__(self, parent: PCmer):
|
308 |
+
"""Creates a new instance of ``_EncoderLayer``.
|
309 |
+
|
310 |
+
Args:
|
311 |
+
parent (Encoder): The encoder that the layers is created for.
|
312 |
+
"""
|
313 |
+
super().__init__()
|
314 |
+
|
315 |
+
self.conformer = ConformerConvModule(parent.dim_model)
|
316 |
+
self.norm = nn.LayerNorm(parent.dim_model)
|
317 |
+
self.dropout = nn.Dropout(parent.residual_dropout)
|
318 |
+
|
319 |
+
# selfatt -> fastatt: performer!
|
320 |
+
self.attn = SelfAttention(
|
321 |
+
dim=parent.dim_model, heads=parent.num_heads, causal=False
|
322 |
+
)
|
323 |
+
|
324 |
+
# METHODS ########################################################################################################
|
325 |
+
|
326 |
+
def forward(self, phone, mask=None):
|
327 |
+
|
328 |
+
# compute attention sub-layer
|
329 |
+
phone = phone + (self.attn(self.norm(phone), mask=mask))
|
330 |
+
|
331 |
+
phone = phone + (self.conformer(phone))
|
332 |
+
|
333 |
+
return phone
|
334 |
+
|
335 |
+
|
336 |
+
def calc_same_padding(kernel_size):
|
337 |
+
pad = kernel_size // 2
|
338 |
+
return (pad, pad - (kernel_size + 1) % 2)
|
339 |
+
|
340 |
+
|
341 |
+
# helper classes
|
342 |
+
|
343 |
+
|
344 |
+
class Swish(nn.Module):
|
345 |
+
def forward(self, x):
|
346 |
+
return x * x.sigmoid()
|
347 |
+
|
348 |
+
|
349 |
+
class Transpose(nn.Module):
|
350 |
+
def __init__(self, dims):
|
351 |
+
super().__init__()
|
352 |
+
assert len(dims) == 2, "dims must be a tuple of two dimensions"
|
353 |
+
self.dims = dims
|
354 |
+
|
355 |
+
def forward(self, x):
|
356 |
+
return x.transpose(*self.dims)
|
357 |
+
|
358 |
+
|
359 |
+
class GLU(nn.Module):
|
360 |
+
def __init__(self, dim):
|
361 |
+
super().__init__()
|
362 |
+
self.dim = dim
|
363 |
+
|
364 |
+
def forward(self, x):
|
365 |
+
out, gate = x.chunk(2, dim=self.dim)
|
366 |
+
return out * gate.sigmoid()
|
367 |
+
|
368 |
+
|
369 |
+
class DepthWiseConv1d(nn.Module):
    def __init__(self, chan_in, chan_out, kernel_size, padding):
        super().__init__()
        self.padding = padding
        self.conv = nn.Conv1d(chan_in, chan_out, kernel_size, groups=chan_in)

    def forward(self, x):
        x = F.pad(x, self.padding)
        return self.conv(x)


class ConformerConvModule(nn.Module):
    def __init__(
        self, dim, causal=False, expansion_factor=2, kernel_size=31, dropout=0.0
    ):
        super().__init__()

        inner_dim = dim * expansion_factor
        padding = calc_same_padding(kernel_size) if not causal else (kernel_size - 1, 0)

        self.net = nn.Sequential(
            nn.LayerNorm(dim),
            Transpose((1, 2)),
            nn.Conv1d(dim, inner_dim * 2, 1),
            GLU(dim=1),
            DepthWiseConv1d(
                inner_dim, inner_dim, kernel_size=kernel_size, padding=padding
            ),
            # nn.BatchNorm1d(inner_dim) if not causal else nn.Identity(),
            Swish(),
            nn.Conv1d(inner_dim, dim, 1),
            Transpose((1, 2)),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)


def linear_attention(q, k, v):
    if v is None:
        out = torch.einsum("...ed,...nd->...ne", k, q)
        return out

    else:
        k_cumsum = k.sum(dim=-2)
        # k_cumsum = k.sum(dim = -2)
        D_inv = 1.0 / (torch.einsum("...nd,...d->...n", q, k_cumsum.type_as(q)) + 1e-8)

        context = torch.einsum("...nd,...ne->...de", k, v)
        out = torch.einsum("...de,...nd,...n->...ne", context, q, D_inv)
        return out


def gaussian_orthogonal_random_matrix(
    nb_rows, nb_columns, scaling=0, qr_uniform_q=False, device=None
):
    nb_full_blocks = int(nb_rows / nb_columns)
    block_list = []

    for _ in range(nb_full_blocks):
        q = orthogonal_matrix_chunk(
            nb_columns, qr_uniform_q=qr_uniform_q, device=device
        )
        block_list.append(q)

    remaining_rows = nb_rows - nb_full_blocks * nb_columns
    if remaining_rows > 0:
        q = orthogonal_matrix_chunk(
            nb_columns, qr_uniform_q=qr_uniform_q, device=device
        )

        block_list.append(q[:remaining_rows])

    final_matrix = torch.cat(block_list)

    if scaling == 0:
        multiplier = torch.randn((nb_rows, nb_columns), device=device).norm(dim=1)
    elif scaling == 1:
        multiplier = math.sqrt((float(nb_columns))) * torch.ones(
            (nb_rows,), device=device
        )
    else:
        raise ValueError(f"Invalid scaling {scaling}")

    return torch.diag(multiplier) @ final_matrix


class FastAttention(nn.Module):
    def __init__(
        self,
        dim_heads,
        nb_features=None,
        ortho_scaling=0,
        causal=False,
        generalized_attention=False,
        kernel_fn=nn.ReLU(),
        qr_uniform_q=False,
        no_projection=False,
    ):
        super().__init__()
        nb_features = default(nb_features, int(dim_heads * math.log(dim_heads)))

        self.dim_heads = dim_heads
        self.nb_features = nb_features
        self.ortho_scaling = ortho_scaling

        self.create_projection = partial(
            gaussian_orthogonal_random_matrix,
            nb_rows=self.nb_features,
            nb_columns=dim_heads,
            scaling=ortho_scaling,
            qr_uniform_q=qr_uniform_q,
        )
        projection_matrix = self.create_projection()
        self.register_buffer("projection_matrix", projection_matrix)

        self.generalized_attention = generalized_attention
        self.kernel_fn = kernel_fn

        # if this is turned on, no projection will be used
        # queries and keys will be softmax-ed as in the original efficient attention paper
        self.no_projection = no_projection

        self.causal = causal

    @torch.no_grad()
    def redraw_projection_matrix(self):
        projections = self.create_projection()
        self.projection_matrix.copy_(projections)
        del projections

    def forward(self, q, k, v):
        device = q.device

        if self.no_projection:
            q = q.softmax(dim=-1)
            k = torch.exp(k) if self.causal else k.softmax(dim=-2)
        else:
            create_kernel = partial(
                softmax_kernel, projection_matrix=self.projection_matrix, device=device
            )

            q = create_kernel(q, is_query=True)
            k = create_kernel(k, is_query=False)

        attn_fn = linear_attention if not self.causal else self.causal_linear_fn
        if v is None:
            out = attn_fn(q, k, None)
            return out
        else:
            out = attn_fn(q, k, v)
            return out


class SelfAttention(nn.Module):
    def __init__(
        self,
        dim,
        causal=False,
        heads=8,
        dim_head=64,
        local_heads=0,
        local_window_size=256,
        nb_features=None,
        feature_redraw_interval=1000,
        generalized_attention=False,
        kernel_fn=nn.ReLU(),
        qr_uniform_q=False,
        dropout=0.0,
        no_projection=False,
    ):
        super().__init__()
        assert dim % heads == 0, "dimension must be divisible by number of heads"
        dim_head = default(dim_head, dim // heads)
        inner_dim = dim_head * heads
        self.fast_attention = FastAttention(
            dim_head,
            nb_features,
            causal=causal,
            generalized_attention=generalized_attention,
            kernel_fn=kernel_fn,
            qr_uniform_q=qr_uniform_q,
            no_projection=no_projection,
        )

        self.heads = heads
        self.global_heads = heads - local_heads
        self.local_attn = (
            LocalAttention(
                window_size=local_window_size,
                causal=causal,
                autopad=True,
                dropout=dropout,
                look_forward=int(not causal),
                rel_pos_emb_config=(dim_head, local_heads),
            )
            if local_heads > 0
            else None
        )

        self.to_q = nn.Linear(dim, inner_dim)
        self.to_k = nn.Linear(dim, inner_dim)
        self.to_v = nn.Linear(dim, inner_dim)
        self.to_out = nn.Linear(inner_dim, dim)
        self.dropout = nn.Dropout(dropout)

    @torch.no_grad()
    def redraw_projection_matrix(self):
        self.fast_attention.redraw_projection_matrix()

    def forward(
        self,
        x,
        context=None,
        mask=None,
        context_mask=None,
        name=None,
        inference=False,
        **kwargs,
    ):
        _, _, _, h, gh = *x.shape, self.heads, self.global_heads

        cross_attend = exists(context)

        context = default(context, x)
        context_mask = default(context_mask, mask) if not cross_attend else context_mask
        q, k, v = self.to_q(x), self.to_k(context), self.to_v(context)

        q, k, v = map(lambda t: rearrange(t, "b n (h d) -> b h n d", h=h), (q, k, v))
        (q, lq), (k, lk), (v, lv) = map(lambda t: (t[:, :gh], t[:, gh:]), (q, k, v))

        attn_outs = []
        if not empty(q):
            if exists(context_mask):
                global_mask = context_mask[:, None, :, None]
                v.masked_fill_(~global_mask, 0.0)
            if cross_attend:
                pass
            else:
                out = self.fast_attention(q, k, v)
            attn_outs.append(out)

        if not empty(lq):
            assert (
                not cross_attend
            ), "local attention is not compatible with cross attention"
            out = self.local_attn(lq, lk, lv, input_mask=mask)
            attn_outs.append(out)

        out = torch.cat(attn_outs, dim=1)
        out = rearrange(out, "b h n d -> b n (h d)")
        out = self.to_out(out)
        return self.dropout(out)

def l2_regularization(model, l2_alpha):
    l2_loss = []
    for module in model.modules():
        if type(module) is nn.Conv2d:
            l2_loss.append((module.weight**2).sum() / 2.0)
    return l2_alpha * sum(l2_loss)


class FCPE(nn.Module):
    def __init__(
        self,
        input_channel=128,
        out_dims=360,
        n_layers=12,
        n_chans=512,
        use_siren=False,
        use_full=False,
        loss_mse_scale=10,
        loss_l2_regularization=False,
        loss_l2_regularization_scale=1,
        loss_grad1_mse=False,
        loss_grad1_mse_scale=1,
        f0_max=1975.5,
        f0_min=32.70,
        confidence=False,
        threshold=0.05,
        use_input_conv=True,
    ):
        super().__init__()
        if use_siren is True:
            raise ValueError("Siren is not supported yet.")
        if use_full is True:
            raise ValueError("Full model is not supported yet.")

        self.loss_mse_scale = loss_mse_scale if (loss_mse_scale is not None) else 10
        self.loss_l2_regularization = (
            loss_l2_regularization if (loss_l2_regularization is not None) else False
        )
        self.loss_l2_regularization_scale = (
            loss_l2_regularization_scale
            if (loss_l2_regularization_scale is not None)
            else 1
        )
        self.loss_grad1_mse = loss_grad1_mse if (loss_grad1_mse is not None) else False
        self.loss_grad1_mse_scale = (
            loss_grad1_mse_scale if (loss_grad1_mse_scale is not None) else 1
        )
        self.f0_max = f0_max if (f0_max is not None) else 1975.5
        self.f0_min = f0_min if (f0_min is not None) else 32.70
        self.confidence = confidence if (confidence is not None) else False
        self.threshold = threshold if (threshold is not None) else 0.05
        self.use_input_conv = use_input_conv if (use_input_conv is not None) else True

        self.cent_table_b = torch.Tensor(
            np.linspace(
                self.f0_to_cent(torch.Tensor([f0_min]))[0],
                self.f0_to_cent(torch.Tensor([f0_max]))[0],
                out_dims,
            )
        )
        self.register_buffer("cent_table", self.cent_table_b)

        # conv in stack
        _leaky = nn.LeakyReLU()
        self.stack = nn.Sequential(
            nn.Conv1d(input_channel, n_chans, 3, 1, 1),
            nn.GroupNorm(4, n_chans),
            _leaky,
            nn.Conv1d(n_chans, n_chans, 3, 1, 1),
        )

        # transformer
        self.decoder = PCmer(
            num_layers=n_layers,
            num_heads=8,
            dim_model=n_chans,
            dim_keys=n_chans,
            dim_values=n_chans,
            residual_dropout=0.1,
            attention_dropout=0.1,
        )
        self.norm = nn.LayerNorm(n_chans)

        # out
        self.n_out = out_dims
        self.dense_out = weight_norm(nn.Linear(n_chans, self.n_out))

    def forward(
        self, mel, infer=True, gt_f0=None, return_hz_f0=False, cdecoder="local_argmax"
    ):
        """
        input:
            B x n_frames x n_unit
        return:
            dict of B x n_frames x feat
        """
        if cdecoder == "argmax":
            self.cdecoder = self.cents_decoder
        elif cdecoder == "local_argmax":
            self.cdecoder = self.cents_local_decoder
        if self.use_input_conv:
            x = self.stack(mel.transpose(1, 2)).transpose(1, 2)
        else:
            x = mel
        x = self.decoder(x)
        x = self.norm(x)
        x = self.dense_out(x)  # [B,N,D]
        x = torch.sigmoid(x)
        if not infer:
            gt_cent_f0 = self.f0_to_cent(gt_f0)  # mel f0 #[B,N,1]
            gt_cent_f0 = self.gaussian_blurred_cent(gt_cent_f0)  # #[B,N,out_dim]
            loss_all = self.loss_mse_scale * F.binary_cross_entropy(
                x, gt_cent_f0
            )  # bce loss
            # l2 regularization
            if self.loss_l2_regularization:
                loss_all = loss_all + l2_regularization(
                    model=self, l2_alpha=self.loss_l2_regularization_scale
                )
            x = loss_all
        if infer:
            x = self.cdecoder(x)
            x = self.cent_to_f0(x)
            if not return_hz_f0:
                x = (1 + x / 700).log()
        return x

    def cents_decoder(self, y, mask=True):
        B, N, _ = y.size()
        ci = self.cent_table[None, None, :].expand(B, N, -1)
        rtn = torch.sum(ci * y, dim=-1, keepdim=True) / torch.sum(
            y, dim=-1, keepdim=True
        )  # cents: [B,N,1]
        if mask:
            confident = torch.max(y, dim=-1, keepdim=True)[0]
            confident_mask = torch.ones_like(confident)
            confident_mask[confident <= self.threshold] = float("-INF")
            rtn = rtn * confident_mask
        if self.confidence:
            return rtn, confident
        else:
            return rtn

    def cents_local_decoder(self, y, mask=True):
        B, N, _ = y.size()
        ci = self.cent_table[None, None, :].expand(B, N, -1)
        confident, max_index = torch.max(y, dim=-1, keepdim=True)
        local_argmax_index = torch.arange(0, 9).to(max_index.device) + (max_index - 4)
        local_argmax_index[local_argmax_index < 0] = 0
        local_argmax_index[local_argmax_index >= self.n_out] = self.n_out - 1
        ci_l = torch.gather(ci, -1, local_argmax_index)
        y_l = torch.gather(y, -1, local_argmax_index)
        rtn = torch.sum(ci_l * y_l, dim=-1, keepdim=True) / torch.sum(
            y_l, dim=-1, keepdim=True
        )  # cents: [B,N,1]
        if mask:
            confident_mask = torch.ones_like(confident)
            confident_mask[confident <= self.threshold] = float("-INF")
            rtn = rtn * confident_mask
        if self.confidence:
            return rtn, confident
        else:
            return rtn

    def cent_to_f0(self, cent):
        return 10.0 * 2 ** (cent / 1200.0)

    def f0_to_cent(self, f0):
        return 1200.0 * torch.log2(f0 / 10.0)

    def gaussian_blurred_cent(self, cents):  # cents: [B,N,1]
        mask = (cents > 0.1) & (cents < (1200.0 * np.log2(self.f0_max / 10.0)))
        B, N, _ = cents.size()
        ci = self.cent_table[None, None, :].expand(B, N, -1)
        return torch.exp(-torch.square(ci - cents) / 1250) * mask.float()


class FCPEInfer:
    def __init__(self, model_path, device=None, dtype=torch.float32):
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = device
        ckpt = torch.load(model_path, map_location=torch.device(self.device))
        self.args = DotDict(ckpt["config"])
        self.dtype = dtype
        model = FCPE(
            input_channel=self.args.model.input_channel,
            out_dims=self.args.model.out_dims,
            n_layers=self.args.model.n_layers,
            n_chans=self.args.model.n_chans,
            use_siren=self.args.model.use_siren,
            use_full=self.args.model.use_full,
            loss_mse_scale=self.args.loss.loss_mse_scale,
            loss_l2_regularization=self.args.loss.loss_l2_regularization,
            loss_l2_regularization_scale=self.args.loss.loss_l2_regularization_scale,
            loss_grad1_mse=self.args.loss.loss_grad1_mse,
            loss_grad1_mse_scale=self.args.loss.loss_grad1_mse_scale,
            f0_max=self.args.model.f0_max,
            f0_min=self.args.model.f0_min,
            confidence=self.args.model.confidence,
        )
        model.to(self.device).to(self.dtype)
        model.load_state_dict(ckpt["model"])
        model.eval()
        self.model = model
        self.wav2mel = Wav2Mel(self.args, dtype=self.dtype, device=self.device)

    @torch.no_grad()
    def __call__(self, audio, sr, threshold=0.05):
        self.model.threshold = threshold
        audio = audio[None, :]
        mel = self.wav2mel(audio=audio, sample_rate=sr).to(self.dtype)
        f0 = self.model(mel=mel, infer=True, return_hz_f0=True)
        return f0


class Wav2Mel:

    def __init__(self, args, device=None, dtype=torch.float32):
        # self.args = args
        self.sampling_rate = args.mel.sampling_rate
        self.hop_size = args.mel.hop_size
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = device
        self.dtype = dtype
        self.stft = STFT(
            args.mel.sampling_rate,
            args.mel.num_mels,
            args.mel.n_fft,
            args.mel.win_size,
            args.mel.hop_size,
            args.mel.fmin,
            args.mel.fmax,
        )
        self.resample_kernel = {}

    def extract_nvstft(self, audio, keyshift=0, train=False):
        mel = self.stft.get_mel(audio, keyshift=keyshift, train=train).transpose(
            1, 2
        )  # B, n_frames, bins
        return mel

    def extract_mel(self, audio, sample_rate, keyshift=0, train=False):
        audio = audio.to(self.dtype).to(self.device)
        # resample
        if sample_rate == self.sampling_rate:
            audio_res = audio
        else:
            key_str = str(sample_rate)
            if key_str not in self.resample_kernel:
                self.resample_kernel[key_str] = Resample(
                    sample_rate, self.sampling_rate, lowpass_filter_width=128
                )
            self.resample_kernel[key_str] = (
                self.resample_kernel[key_str].to(self.dtype).to(self.device)
            )
            audio_res = self.resample_kernel[key_str](audio)

        # extract
        mel = self.extract_nvstft(
            audio_res, keyshift=keyshift, train=train
        )  # B, n_frames, bins
        n_frames = int(audio.shape[1] // self.hop_size) + 1
        if n_frames > int(mel.shape[1]):
            mel = torch.cat((mel, mel[:, -1:, :]), 1)
        if n_frames < int(mel.shape[1]):
            mel = mel[:, :n_frames, :]
        return mel

    def __call__(self, audio, sample_rate, keyshift=0, train=False):
        return self.extract_mel(audio, sample_rate, keyshift=keyshift, train=train)


class DotDict(dict):
    def __getattr__(*args):
        val = dict.get(*args)
        return DotDict(val) if type(val) is dict else val

    __setattr__ = dict.__setitem__
    __delattr__ = dict.__delitem__


class F0Predictor(object):
    def compute_f0(self, wav, p_len):
        """
        input: wav:[signal_length]
               p_len:int
        output: f0:[signal_length//hop_length]
        """
        pass

    def compute_f0_uv(self, wav, p_len):
        """
        input: wav:[signal_length]
               p_len:int
        output: f0:[signal_length//hop_length],uv:[signal_length//hop_length]
        """
        pass


class FCPEF0Predictor(F0Predictor):
    def __init__(
        self,
        model_path,
        hop_length=512,
        f0_min=50,
        f0_max=1100,
        dtype=torch.float32,
        device=None,
        sampling_rate=44100,
        threshold=0.05,
    ):
        self.fcpe = FCPEInfer(model_path, device=device, dtype=dtype)
        self.hop_length = hop_length
        self.f0_min = f0_min
        self.f0_max = f0_max
        if device is None:
            self.device = "cuda" if torch.cuda.is_available() else "cpu"
        else:
            self.device = device
        self.threshold = threshold
        self.sampling_rate = sampling_rate
        self.dtype = dtype
        self.name = "fcpe"

    def repeat_expand(
        self,
        content: Union[torch.Tensor, np.ndarray],
        target_len: int,
        mode: str = "nearest",
    ):
        ndim = content.ndim

        if content.ndim == 1:
            content = content[None, None]
        elif content.ndim == 2:
            content = content[None]

        assert content.ndim == 3

        is_np = isinstance(content, np.ndarray)
        if is_np:
            content = torch.from_numpy(content)

        results = torch.nn.functional.interpolate(content, size=target_len, mode=mode)

        if is_np:
            results = results.numpy()

        if ndim == 1:
            return results[0, 0]
        elif ndim == 2:
            return results[0]

    def post_process(self, x, sampling_rate, f0, pad_to):
        if isinstance(f0, np.ndarray):
            f0 = torch.from_numpy(f0).float().to(x.device)

        if pad_to is None:
            return f0

        f0 = self.repeat_expand(f0, pad_to)

        vuv_vector = torch.zeros_like(f0)
        vuv_vector[f0 > 0.0] = 1.0
        vuv_vector[f0 <= 0.0] = 0.0

        # Remove zero frequencies and interpolate linearly over the voiced frames
        nzindex = torch.nonzero(f0).squeeze()
        f0 = torch.index_select(f0, dim=0, index=nzindex).cpu().numpy()
        time_org = self.hop_length / sampling_rate * nzindex.cpu().numpy()
        time_frame = np.arange(pad_to) * self.hop_length / sampling_rate

        vuv_vector = F.interpolate(vuv_vector[None, None, :], size=pad_to)[0][0]

        if f0.shape[0] <= 0:
            return (
                torch.zeros(pad_to, dtype=torch.float, device=x.device).cpu().numpy(),
                vuv_vector.cpu().numpy(),
            )
        if f0.shape[0] == 1:
            return (
                torch.ones(pad_to, dtype=torch.float, device=x.device) * f0[0]
            ).cpu().numpy(), vuv_vector.cpu().numpy()

        # This could probably be rewritten with torch
        f0 = np.interp(time_frame, time_org, f0, left=f0[0], right=f0[-1])
        # vuv_vector = np.ceil(scipy.ndimage.zoom(vuv_vector,pad_to/len(vuv_vector),order = 0))

        return f0, vuv_vector.cpu().numpy()

    def compute_f0(self, wav, p_len=None):
        x = torch.FloatTensor(wav).to(self.dtype).to(self.device)
        if p_len is None:
            print("fcpe p_len is None")
            p_len = x.shape[0] // self.hop_length
        f0 = self.fcpe(x, sr=self.sampling_rate, threshold=self.threshold)[0, :, 0]
        if torch.all(f0 == 0):
            rtn = f0.cpu().numpy() if p_len is None else np.zeros(p_len)
            return rtn, rtn
        return self.post_process(x, self.sampling_rate, f0, p_len)[0]

    def compute_f0_uv(self, wav, p_len=None):
        x = torch.FloatTensor(wav).to(self.dtype).to(self.device)
        if p_len is None:
            p_len = x.shape[0] // self.hop_length
        f0 = self.fcpe(x, sr=self.sampling_rate, threshold=self.threshold)[0, :, 0]
        if torch.all(f0 == 0):
            rtn = f0.cpu().numpy() if p_len is None else np.zeros(p_len)
            return rtn, rtn
        return self.post_process(x, self.sampling_rate, f0, p_len)
rvc/lib/infer_pack/models.py
CHANGED
@@ -178,7 +178,7 @@ class ResidualCouplingBlock(nn.Module):
         for i in range(self.n_flows):
             for hook in self.flows[i * 2]._forward_pre_hooks.values():
                 if (
-                    hook.__module__ == "torch.nn.utils.weight_norm"
+                    hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
                     and hook.__class__.__name__ == "WeightNorm"
                 ):
                     torch.nn.utils.remove_weight_norm(self.flows[i * 2])
@@ -235,7 +235,7 @@ class PosteriorEncoder(nn.Module):
     def __prepare_scriptable__(self):
         for hook in self.enc._forward_pre_hooks.values():
             if (
-                hook.__module__ == "torch.nn.utils.weight_norm"
+                hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
                 and hook.__class__.__name__ == "WeightNorm"
             ):
                 torch.nn.utils.remove_weight_norm(self.enc)
@@ -319,7 +319,7 @@ class Generator(torch.nn.Module):
             # because of shadowing, so we check the module name directly.
             # https://github.com/pytorch/pytorch/blob/be0ca00c5ce260eb5bcec3237357f7a30cc08983/torch/nn/utils/__init__.py#L3
             if (
-                hook.__module__ == "torch.nn.utils.weight_norm"
+                hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
                 and hook.__class__.__name__ == "WeightNorm"
            ):
                torch.nn.utils.remove_weight_norm(l)
@@ -327,7 +327,7 @@ class Generator(torch.nn.Module):
         for l in self.resblocks:
             for hook in l._forward_pre_hooks.values():
                 if (
-                    hook.__module__ == "torch.nn.utils.weight_norm"
+                    hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
                     and hook.__class__.__name__ == "WeightNorm"
                 ):
                     torch.nn.utils.remove_weight_norm(l)
@@ -610,14 +610,14 @@ class GeneratorNSF(torch.nn.Module):
             # because of shadowing, so we check the module name directly.
             # https://github.com/pytorch/pytorch/blob/be0ca00c5ce260eb5bcec3237357f7a30cc08983/torch/nn/utils/__init__.py#L3
             if (
-                hook.__module__ == "torch.nn.utils.weight_norm"
+                hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
                 and hook.__class__.__name__ == "WeightNorm"
             ):
                 torch.nn.utils.remove_weight_norm(l)
         for l in self.resblocks:
             for hook in self.resblocks._forward_pre_hooks.values():
                 if (
-                    hook.__module__ == "torch.nn.utils.weight_norm"
+                    hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
                     and hook.__class__.__name__ == "WeightNorm"
                 ):
                     torch.nn.utils.remove_weight_norm(l)
@@ -722,20 +722,20 @@ class SynthesizerTrnMs256NSFsid(nn.Module):
             # because of shadowing, so we check the module name directly.
             # https://github.com/pytorch/pytorch/blob/be0ca00c5ce260eb5bcec3237357f7a30cc08983/torch/nn/utils/__init__.py#L3
             if (
-                hook.__module__ == "torch.nn.utils.weight_norm"
+                hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
                 and hook.__class__.__name__ == "WeightNorm"
             ):
                 torch.nn.utils.remove_weight_norm(self.dec)
         for hook in self.flow._forward_pre_hooks.values():
             if (
-                hook.__module__ == "torch.nn.utils.weight_norm"
+                hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
                 and hook.__class__.__name__ == "WeightNorm"
             ):
                 torch.nn.utils.remove_weight_norm(self.flow)
         if hasattr(self, "enc_q"):
             for hook in self.enc_q._forward_pre_hooks.values():
                 if (
-                    hook.__module__ == "torch.nn.utils.weight_norm"
+                    hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
                     and hook.__class__.__name__ == "WeightNorm"
                 ):
                     torch.nn.utils.remove_weight_norm(self.enc_q)
@@ -881,20 +881,20 @@ class SynthesizerTrnMs768NSFsid(nn.Module):
             # because of shadowing, so we check the module name directly.
             # https://github.com/pytorch/pytorch/blob/be0ca00c5ce260eb5bcec3237357f7a30cc08983/torch/nn/utils/__init__.py#L3
             if (
-                hook.__module__ == "torch.nn.utils.weight_norm"
+                hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
                 and hook.__class__.__name__ == "WeightNorm"
             ):
                 torch.nn.utils.remove_weight_norm(self.dec)
         for hook in self.flow._forward_pre_hooks.values():
             if (
-                hook.__module__ == "torch.nn.utils.weight_norm"
+                hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
                 and hook.__class__.__name__ == "WeightNorm"
             ):
                 torch.nn.utils.remove_weight_norm(self.flow)
         if hasattr(self, "enc_q"):
             for hook in self.enc_q._forward_pre_hooks.values():
                 if (
-                    hook.__module__ == "torch.nn.utils.weight_norm"
+                    hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
                     and hook.__class__.__name__ == "WeightNorm"
                 ):
                     torch.nn.utils.remove_weight_norm(self.enc_q)
@@ -1029,20 +1029,20 @@ class SynthesizerTrnMs256NSFsid_nono(nn.Module):
             # because of shadowing, so we check the module name directly.
             # https://github.com/pytorch/pytorch/blob/be0ca00c5ce260eb5bcec3237357f7a30cc08983/torch/nn/utils/__init__.py#L3
             if (
-                hook.__module__ == "torch.nn.utils.weight_norm"
+                hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
                 and hook.__class__.__name__ == "WeightNorm"
             ):
                 torch.nn.utils.remove_weight_norm(self.dec)
         for hook in self.flow._forward_pre_hooks.values():
             if (
-                hook.__module__ == "torch.nn.utils.weight_norm"
+                hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
                 and hook.__class__.__name__ == "WeightNorm"
             ):
                 torch.nn.utils.remove_weight_norm(self.flow)
         if hasattr(self, "enc_q"):
             for hook in self.enc_q._forward_pre_hooks.values():
                 if (
-                    hook.__module__ == "torch.nn.utils.weight_norm"
+                    hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
                     and hook.__class__.__name__ == "WeightNorm"
                 ):
                     torch.nn.utils.remove_weight_norm(self.enc_q)
@@ -1168,20 +1168,20 @@ class SynthesizerTrnMs768NSFsid_nono(nn.Module):
             # because of shadowing, so we check the module name directly.
            # https://github.com/pytorch/pytorch/blob/be0ca00c5ce260eb5bcec3237357f7a30cc08983/torch/nn/utils/__init__.py#L3
             if (
-                hook.__module__ == "torch.nn.utils.weight_norm"
+                hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
                 and hook.__class__.__name__ == "WeightNorm"
             ):
                 torch.nn.utils.remove_weight_norm(self.dec)
         for hook in self.flow._forward_pre_hooks.values():
             if (
-                hook.__module__ == "torch.nn.utils.weight_norm"
+                hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
                 and hook.__class__.__name__ == "WeightNorm"
             ):
                 torch.nn.utils.remove_weight_norm(self.flow)
         if hasattr(self, "enc_q"):
             for hook in self.enc_q._forward_pre_hooks.values():
                 if (
-                    hook.__module__ == "torch.nn.utils.weight_norm"
+                    hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
                     and hook.__class__.__name__ == "WeightNorm"
                 ):
                     torch.nn.utils.remove_weight_norm(self.enc_q)
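The only change in these hunks is the module path used to recognise the WeightNorm pre-hook, presumably to match where newer PyTorch builds register it. A version-agnostic variant (an illustrative sketch, not part of this commit) could match on the class name plus a module suffix:

```python
def has_weight_norm_hook(module):
    # Accept the hook whether it was registered by the old or the new weight_norm module.
    return any(
        hook.__class__.__name__ == "WeightNorm"
        and hook.__module__.endswith("weight_norm")
        for hook in module._forward_pre_hooks.values()
    )
```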
rvc/lib/tools/analyzer.py
ADDED
@@ -0,0 +1,76 @@
import numpy as np
import matplotlib.pyplot as plt
import librosa.display
import librosa


def calculate_features(y, sr):
    stft = np.abs(librosa.stft(y))
    duration = librosa.get_duration(y=y, sr=sr)
    cent = librosa.feature.spectral_centroid(S=stft, sr=sr)[0]
    bw = librosa.feature.spectral_bandwidth(S=stft, sr=sr)[0]
    rolloff = librosa.feature.spectral_rolloff(S=stft, sr=sr)[0]
    return stft, duration, cent, bw, rolloff


def plot_title(title):
    plt.suptitle(title, fontsize=16, fontweight="bold")


def plot_spectrogram(y, sr, stft, duration, cmap="inferno"):
    plt.subplot(3, 1, 1)
    plt.imshow(
        librosa.amplitude_to_db(stft, ref=np.max),
        origin="lower",
        extent=[0, duration, 0, sr / 1000],
        aspect="auto",
        cmap=cmap,  # Change the colormap here
    )
    plt.colorbar(format="%+2.0f dB")
    plt.xlabel("Time (s)")
    plt.ylabel("Frequency (kHz)")
    plt.title("Spectrogram")


def plot_waveform(y, sr, duration):
    plt.subplot(3, 1, 2)
    librosa.display.waveshow(y, sr=sr)
    plt.xlabel("Time (s)")
    plt.ylabel("Amplitude")
    plt.title("Waveform")


def plot_features(times, cent, bw, rolloff, duration):
    plt.subplot(3, 1, 3)
    plt.plot(times, cent, label="Spectral Centroid (kHz)", color="b")
    plt.plot(times, bw, label="Spectral Bandwidth (kHz)", color="g")
    plt.plot(times, rolloff, label="Spectral Rolloff (kHz)", color="r")
    plt.xlabel("Time (s)")
    plt.title("Spectral Features")
    plt.legend()


def analyze_audio(audio_file, save_plot_path="logs/audio_analysis.png"):
    y, sr = librosa.load(audio_file)
    stft, duration, cent, bw, rolloff = calculate_features(y, sr)

    plt.figure(figsize=(12, 10))

    plot_title("Audio Analysis" + " - " + audio_file.split("/")[-1])
    plot_spectrogram(y, sr, stft, duration)
    plot_waveform(y, sr, duration)
    plot_features(librosa.times_like(cent), cent, bw, rolloff, duration)

    plt.tight_layout()

    if save_plot_path:
        plt.savefig(save_plot_path, bbox_inches="tight", dpi=300)
        plt.close()

    audio_info = f"""Sample Rate: {sr}\nDuration: {(
        str(round(duration, 2)) + " seconds"
        if duration < 60
        else str(round(duration / 60, 2)) + " minutes"
    )}\nNumber of Samples: {len(y)}\nBits per Sample: {librosa.get_samplerate(audio_file)}\nChannels: {"Mono (1)" if y.ndim == 1 else "Stereo (2)"}"""

    return audio_info, save_plot_path
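A quick way to exercise the new analyzer from Python (the audio path is a placeholder):

```python
from rvc.lib.tools.analyzer import analyze_audio

info, plot_path = analyze_audio("example.wav", save_plot_path="logs/audio_analysis.png")
print(info)       # sample rate, duration, sample count, channel layout
print(plot_path)  # PNG with spectrogram, waveform and spectral-feature panels
```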
rvc/lib/tools/gdown.py
CHANGED
@@ -16,6 +16,7 @@ import requests
 import six
 import tqdm
 
+
 def indent(text, prefix):
     def prefixed_lines():
         for line in text.splitlines(True):
@@ -23,6 +24,7 @@ def indent(text, prefix):
 
     return "".join(prefixed_lines())
 
+
 class FileURLRetrievalError(Exception):
     pass
 
@@ -30,6 +32,7 @@ class FileURLRetrievalError(Exception):
 class FolderContentsMaximumLimitError(Exception):
     pass
 
+
 def parse_url(url, warning=True):
     """Parse URLs especially for Google Drive links.
 
@@ -93,11 +96,17 @@ def get_url_from_gdrive_confirmation(contents):
     m = re.search(r'href="/open\?id=([^"]+)"', contents)
     if m:
         url = m.groups()[0]
-        uuid = re.search(
+        uuid = re.search(
+            r'<input\s+type="hidden"\s+name="uuid"\s+value="([^"]+)"', contents
+        )
         uuid = uuid.groups()[0]
-        url =
+        url = (
+            "https://drive.usercontent.google.com/download?id="
+            + url
+            + "&confirm=t&uuid="
+            + uuid
+        )
         return url
-
 
     m = re.search(r'"downloadUrl":"([^"]+)', contents)
     if m:
@@ -116,6 +125,8 @@ def get_url_from_gdrive_confirmation(contents):
         "You may need to change the permission to "
         "'Anyone with the link', or have had many accesses."
     )
+
+
 def _get_session(proxy, use_cookies, return_cookies_file=False):
     sess = requests.session()
 
@@ -211,16 +222,12 @@ def download(
     url_origin = url
     is_gdrive_download_link = True
 
-
-
     while True:
         res = sess.get(url, stream=True, verify=verify)
 
         if url == url_origin and res.status_code == 500:
             # The file could be Google Docs or Spreadsheets.
-            url = "https://drive.google.com/open?id={id}".format(
-                id=gdrive_file_id
-            )
+            url = "https://drive.google.com/open?id={id}".format(id=gdrive_file_id)
             continue
 
         if res.headers["Content-Type"].startswith("text/html"):
rvc/lib/tools/launch_tensorboard.py
CHANGED
@@ -3,7 +3,8 @@ from tensorboard import program
 
 log_path = "logs"
 
-
+
+def launch_tensorboard_pipeline():
     tb = program.TensorBoard()
     tb.configure(argv=[None, "--logdir", log_path])
     url = tb.launch()
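With the launch code wrapped in a function, other modules can start TensorBoard on demand; a minimal sketch:

```python
from rvc.lib.tools.launch_tensorboard import launch_tensorboard_pipeline

launch_tensorboard_pipeline()  # serves TensorBoard for the "logs" directory
```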
rvc/lib/tools/model_download.py
CHANGED
@@ -4,9 +4,11 @@ import wget
 import zipfile
 from bs4 import BeautifulSoup
 import requests
-from urllib.parse import unquote
+from urllib.parse import unquote, urlencode, parse_qs, urlparse
 import re
 import shutil
+import six
+
 
 def find_folder_parent(search_dir, folder_name):
     for dirpath, dirnames, _ in os.walk(search_dir):
@@ -14,12 +16,13 @@ def find_folder_parent(search_dir, folder_name):
         return os.path.abspath(dirpath)
     return None
 
+
 now_dir = os.getcwd()
 sys.path.append(now_dir)
 
 from rvc.lib.utils import format_title
 
-
+from rvc.lib.tools import gdown
 
 file_path = find_folder_parent(now_dir, "logs")
 
@@ -71,7 +74,7 @@ def download_from_url(url):
     try:
         gdown.download(
             f"https://drive.google.com/uc?id={file_id}",
-            quiet=
+            quiet=True,
             fuzzy=True,
         )
     except Exception as error:
@@ -91,7 +94,60 @@ def download_from_url(url):
         print(error_message)
         os.chdir(now_dir)
         return None
+    elif "disk.yandex.ru" in url:
+        base_url = "https://cloud-api.yandex.net/v1/disk/public/resources/download?"
+        public_key = url
+        final_url = base_url + urlencode(dict(public_key=public_key))
+        response = requests.get(final_url)
+        download_url = response.json()["href"]
+        download_response = requests.get(download_url)
+
+        if download_response.status_code == 200:
+            filename = parse_qs(urlparse(unquote(download_url)).query).get(
+                "filename", [""]
+            )[0]
+            if filename:
+                os.chdir(zips_path)
+                with open(filename, "wb") as f:
+                    f.write(download_response.content)
+        else:
+            print("Failed to get filename from URL.")
+            return None
 
+    elif "pixeldrain.com" in url:
+        try:
+            file_id = url.split("pixeldrain.com/u/")[1]
+            os.chdir(zips_path)
+            print(file_id)
+            response = requests.get(f"https://pixeldrain.com/api/file/{file_id}")
+            if response.status_code == 200:
+                file_name = (
+                    response.headers.get("Content-Disposition")
+                    .split("filename=")[-1]
+                    .strip('";')
+                )
+                os.makedirs(zips_path, exist_ok=True)
+                with open(os.path.join(zips_path, file_name), "wb") as newfile:
+                    newfile.write(response.content)
+                os.chdir(file_path)
+                return "downloaded"
+            else:
+                os.chdir(file_path)
+                return None
+        except Exception as e:
+            print(e)
+            os.chdir(file_path)
+            return None
+
+    elif "cdn.discordapp.com" in url:
+        file = requests.get(url)
+        os.chdir(zips_path)
+        if file.status_code == 200:
+            name = url.split("/")
+            with open(os.path.join(name[-1]), "wb") as newfile:
+                newfile.write(file.content)
+        else:
+            return None
     elif "/blob/" in url or "/resolve/" in url:
         os.chdir(zips_path)
         if "/blob/" in url:
@@ -99,11 +155,12 @@ def download_from_url(url):
 
         response = requests.get(url, stream=True)
         if response.status_code == 200:
-
-
-
-
-
+            content_disposition = six.moves.urllib_parse.unquote(
+                response.headers["Content-Disposition"]
+            )
+            m = re.search(r'filename="([^"]+)"', content_disposition)
+            file_name = m.groups()[0]
+            file_name = file_name.replace(os.path.sep, "_")
             total_size_in_bytes = int(response.headers.get("content-length", 0))
             block_size = 1024
             progress_bar_length = 50
@@ -152,6 +209,31 @@ def download_from_url(url):
     else:
         os.chdir(now_dir)
         return None
+    elif "applio.org" in url:
+        parts = url.split("/")
+        id_with_query = parts[-1]
+        id_parts = id_with_query.split("?")
+        id_number = id_parts[0]
+
+        url = "https://cjtfqzjfdimgpvpwhzlv.supabase.co/rest/v1/models"
+        headers = {
+            "apikey": "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZSIsInJlZiI6ImNqdGZxempmZGltZ3B2cHdoemx2Iiwicm9sZSI6ImFub24iLCJpYXQiOjE2OTUxNjczODgsImV4cCI6MjAxMDc0MzM4OH0.7z5WMIbjR99c2Ooc0ma7B_FyGq10G8X-alkCYTkKR10"
+        }
+
+        params = {"id": f"eq.{id_number}"}
+        response = requests.get(url, headers=headers, params=params)
+        if response.status_code == 200:
+            json_response = response.json()
+            print(json_response)
+            if json_response:
+                link = json_response[0]["link"]
+                verify = download_from_url(link)
+                if verify == "downloaded":
+                    return "downloaded"
+                else:
+                    return None
+        else:
+            return None
     else:
         try:
             os.chdir(zips_path)
@@ -197,73 +279,86 @@ def unzip_file(zip_path, zip_file_name):
     os.remove(zip_file_path)
 
 
+def model_download_pipeline(url):
+    verify = download_from_url(url)
+    if verify == "downloaded":
+        extract_folder_path = ""
+        for filename in os.listdir(zips_path):
+            if filename.endswith(".zip"):
+                zipfile_path = os.path.join(zips_path, filename)
+                print("Proceeding with the extraction...")
+
+                model_zip = os.path.basename(zipfile_path)
+                model_name = format_title(model_zip.split(".zip")[0])
+                extract_folder_path = os.path.join(
+                    "logs",
+                    os.path.normpath(model_name),
+                )
+
+                success = extract_and_show_progress(zipfile_path, extract_folder_path)
+
+                subfolders = [
+                    f
+                    for f in os.listdir(extract_folder_path)
+                    if os.path.isdir(os.path.join(extract_folder_path, f))
+                ]
+                if len(subfolders) == 1:
+                    subfolder_path = os.path.join(extract_folder_path, subfolders[0])
+                    for item in os.listdir(subfolder_path):
+                        s = os.path.join(subfolder_path, item)
+                        d = os.path.join(extract_folder_path, item)
+                        shutil.move(s, d)
+                    os.rmdir(subfolder_path)
+
+                for item in os.listdir(extract_folder_path):
+                    if ".pth" in item:
+                        file_name = item.split(".pth")[0]
                         if file_name != model_name:
                             os.rename(
                                 os.path.join(extract_folder_path, item),
+                                os.path.join(extract_folder_path, model_name + ".pth"),
                             )
                     else:
+                        if "v2" not in item:
+                            file_name = item.split("_nprobe_1_")[1].split("_v1")[0]
+                            if file_name != model_name:
+                                new_file_name = (
+                                    item.split("_nprobe_1_")[0]
+                                    + "_nprobe_1_"
+                                    + model_name
+                                    + "_v1"
+                                )
+                                os.rename(
+                                    os.path.join(extract_folder_path, item),
+                                    os.path.join(
+                                        extract_folder_path, new_file_name + ".index"
+                                    ),
+                                )
+                        else:
+                            file_name = item.split("_nprobe_1_")[1].split("_v2")[0]
+                            if file_name != model_name:
+                                new_file_name = (
+                                    item.split("_nprobe_1_")[0]
+                                    + "_nprobe_1_"
+                                    + model_name
+                                    + "_v2"
+                                )
+                                os.rename(
+                                    os.path.join(extract_folder_path, item),
+                                    os.path.join(
+                                        extract_folder_path, new_file_name + ".index"
+                                    ),
+                                )
+
+                if success:
+                    print(f"Model {model_name} downloaded!")
+                else:
+                    print(f"Error downloading {model_name}")
+                    sys.exit()
+        if extract_folder_path == "":
+            print("Zip file was not found.")
+            sys.exit()
+        result = search_pth_index(extract_folder_path)
+    else:
+        message = "Error"
         sys.exit()
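With the downloader wrapped into model_download_pipeline, other tabs can call it as a function instead of shelling out to the script; a minimal sketch (the model URL is a placeholder, and any of the hosts handled above should work: Google Drive, Hugging Face, Yandex Disk, pixeldrain, Discord CDN or applio.org links):

```python
from rvc.lib.tools.model_download import model_download_pipeline

model_download_pipeline(
    "https://huggingface.co/SomeUser/SomeVoice/resolve/main/SomeVoice.zip"  # placeholder
)
# The zip is fetched into the zips folder, extracted under logs/<model_name>,
# and the .pth / .index files are renamed to match the model name.
```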
rvc/lib/tools/prerequisites_download.py
CHANGED
@@ -1,11 +1,10 @@
 import os
 import wget
-import sys
 
-url_base = "https://huggingface.co/
-
+url_base = "https://huggingface.co/IAHispano/Applio/resolve/main/Resources"
+pretraineds_v1_list = [
     (
-        "
+        "pretrained_v1/",
         [
             "D32k.pth",
             "D40k.pth",
@@ -21,6 +20,8 @@ models_download = [
             "f0G48k.pth",
         ],
     ),
+]
+pretraineds_v2_list = [
     (
         "pretrained_v2/",
         [
@@ -40,45 +41,55 @@ models_download = [
     ),
 ]
 
-
+models_list = [
     "hubert_base.pt",
     "rmvpe.pt",
-
+    "fcpe.pt",
+    # "rmvpe.onnx"
 ]
 
-
-    "ffmpeg.exe",
-    "ffprobe.exe",
-]
+executables_list = ["ffmpeg.exe", "ffprobe.exe"]
 
-
-    "
+folder_mapping_list = {
+    "pretrained_v1/": "rvc/pretraineds/pretrained_v1/",
     "pretrained_v2/": "rvc/pretraineds/pretrained_v2/",
 }
 
-for file_name in models_file:
-    destination_path = os.path.join(file_name)
-    url = f"{url_base}/{file_name}"
-    if not os.path.exists(destination_path):
-        os.makedirs(os.path.dirname(destination_path) or ".", exist_ok=True)
-        print(f"\nDownloading {url} to {destination_path}...")
-        wget.download(url, out=destination_path)
 
+
+def prequisites_download_pipeline(pretraineds_v1, pretraineds_v2, models, exe):
+    def download_files(file_list):
+        for file_name in file_list:
+            destination_path = os.path.join(file_name)
+            url = f"{url_base}/{file_name}"
+            if not os.path.exists(destination_path):
+                os.makedirs(os.path.dirname(destination_path) or ".", exist_ok=True)
+                print(f"\nDownloading {url} to {destination_path}...")
+                wget.download(url, out=destination_path)
+
+    if models == "True":
+        download_files(models_list)
+
+    if exe == "True" and os.name == "nt":
+        download_files(executables_list)
+
+    if pretraineds_v1 == "True":
+        for remote_folder, file_list in pretraineds_v1_list:
+            local_folder = folder_mapping_list.get(remote_folder, "")
+            for file in file_list:
+                destination_path = os.path.join(local_folder, file)
+                url = f"{url_base}/{remote_folder}{file}"
+                if not os.path.exists(destination_path):
+                    os.makedirs(os.path.dirname(destination_path) or ".", exist_ok=True)
+                    print(f"\nDownloading {url} to {destination_path}...")
+                    wget.download(url, out=destination_path)
+
+    if pretraineds_v2 == "True":
+        for remote_folder, file_list in pretraineds_v2_list:
+            local_folder = folder_mapping_list.get(remote_folder, "")
+            for file in file_list:
+                destination_path = os.path.join(local_folder, file)
+                url = f"{url_base}/{remote_folder}{file}"
+                if not os.path.exists(destination_path):
+                    os.makedirs(os.path.dirname(destination_path) or ".", exist_ok=True)
+                    print(f"\nDownloading {url} to {destination_path}...")
+                    wget.download(url, out=destination_path)
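The new pipeline takes its flags as string booleans (it compares against "True" directly), so a direct call looks like this sketch:

```python
from rvc.lib.tools.prerequisites_download import prequisites_download_pipeline

prequisites_download_pipeline(
    pretraineds_v1="True",
    pretraineds_v2="True",
    models="True",   # hubert_base.pt, rmvpe.pt and fcpe.pt
    exe="False",     # ffmpeg/ffprobe binaries are only fetched on Windows anyway
)
```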
rvc/lib/tools/pretrained_selector.py
CHANGED
@@ -60,4 +60,4 @@ def pretrained_selector(pitch_guidance):
             "rvc/pretraineds/pretrained_v2/D48k.pth",
         ),
     },
-}
+}
rvc/lib/tools/split_audio.py
CHANGED
@@ -17,11 +17,13 @@ def process_audio(file_path):
     min_silence_len = 750  # ms, adjust as needed
 
     # detect nonsilent parts
-    nonsilent_parts = detect_nonsilent(
+    nonsilent_parts = detect_nonsilent(
+        song, min_silence_len=min_silence_len, silence_thresh=silence_thresh
+    )
 
     # Create a new directory to store chunks
     file_dir = os.path.dirname(file_path)
-    file_name = os.path.basename(file_path).split(
+    file_name = os.path.basename(file_path).split(".")[0]
     file_name = format_title(file_name)
     new_dir_path = os.path.join(file_dir, file_name)
     os.makedirs(new_dir_path, exist_ok=True)
@@ -58,7 +60,7 @@ def process_audio(file_path):
 def merge_audio(timestamps_file):
     try:
         # Extract prefix from the timestamps filename
-        prefix = os.path.basename(timestamps_file).replace(
+        prefix = os.path.basename(timestamps_file).replace("_timestamps.txt", "")
         timestamps_dir = os.path.dirname(timestamps_file)
 
         # Open the timestamps file
@@ -98,8 +100,8 @@ def merge_audio(timestamps_file):
         # Concatenate all audio_segments and export
         merged_audio = sum(audio_segments)
         merged_audio_np = np.array(merged_audio.get_array_of_samples())
-        #print(f"Exported merged file: {merged_filename}\n")
+        # print(f"Exported merged file: {merged_filename}\n")
         return merged_audio.frame_rate, merged_audio_np
 
     except Exception as e:
-        print(f"An error occurred: {e}")
+        print(f"An error occurred: {e}")
rvc/lib/utils.py
CHANGED
@@ -19,8 +19,10 @@ def load_audio(file, sampling_rate):
 
 
 def format_title(title):
-    formatted_title =
-    formatted_title = re.sub(r
+    formatted_title = (
+        unicodedata.normalize("NFKD", title).encode("ascii", "ignore").decode("utf-8")
+    )
+    formatted_title = re.sub(r"[\u2500-\u257F]+", "", formatted_title)
+    formatted_title = re.sub(r"[^\w\s.-]", "", formatted_title)
+    formatted_title = re.sub(r"\s+", "_", formatted_title)
+    return formatted_title

rvc/train/extract/extract_feature_print.py
CHANGED
@@ -7,6 +7,9 @@ import fairseq
 import soundfile as sf
 import numpy as np
 
+import logging
+
+logging.getLogger("fairseq").setLevel(logging.WARNING)
 
 device = sys.argv[1]
 n_parts = int(sys.argv[2])

rvc/train/process/extract_index.py
CHANGED
@@ -78,8 +78,11 @@ try:
     index_added.add(big_npy[i : i + batch_size_add])
 
     faiss.write_index(index_added, index_filepath_added)
+    print(f"Saved index file '{index_filepath_added}'")
 
 except Exception as error:
     print(f"Failed to train index: {error}")
-
-    print(
+    if "one array to concatenate" in str(error):
+        print(
+            "If you are running this code in a virtual environment, make sure you have enough GPU available to generate the Index file."
+        )

rvc/train/process/extract_model.py
CHANGED
@@ -1,28 +1,27 @@
 import os
 import torch
+import hashlib
+import datetime
 from collections import OrderedDict
 
 
 def replace_keys_in_dict(d, old_key_part, new_key_part):
-    # Use OrderedDict if the original is an OrderedDict
     if isinstance(d, OrderedDict):
         updated_dict = OrderedDict()
     else:
         updated_dict = {}
     for key, value in d.items():
-        # Replace the key part if found
         new_key = key.replace(old_key_part, new_key_part)
-        # If the value is a dictionary, apply the function recursively
         if isinstance(value, dict):
             value = replace_keys_in_dict(value, old_key_part, new_key_part)
         updated_dict[new_key] = value
     return updated_dict
 
 
-def extract_model(ckpt, sr, if_f0, name, model_dir, epoch, version, hps):
+def extract_model(ckpt, sr, if_f0, name, model_dir, epoch, step, version, hps):
     try:
-        print(f"Saved model '{model_dir}' (epoch {epoch})")
-        pth_file = f"{name}_{epoch}
+        print(f"Saved model '{model_dir}' (epoch {epoch} and step {step})")
+        pth_file = f"{name}_{epoch}e_{step}s.pth"
         pth_file_old_version_path = os.path.join(
             model_dir, f"{pth_file}_old_version.pth"
         )
@@ -51,7 +50,18 @@ def extract_model(ckpt, sr, if_f0, name, model_dir, epoch, step, version, hps):
             hps.model.gin_channels,
             hps.data.sampling_rate,
         ]
+
+        opt["epoch"] = epoch
+        opt["step"] = step
+        opt["sr"] = sr
+        opt["f0"] = if_f0
+        opt["version"] = version
+        opt["creation_date"] = datetime.datetime.now().isoformat()
+
+        hash_input = f"{str(ckpt)} {epoch} {step} {datetime.datetime.now().isoformat()}"
+        model_hash = hashlib.sha256(hash_input.encode()).hexdigest()
+        opt["model_hash"] = model_hash
+
         torch.save(opt, model_dir)
 
         model = torch.load(model_dir, map_location=torch.device("cpu"))

rvc/train/process/extract_small_model.py
ADDED
@@ -0,0 +1,175 @@
+import os
+import torch
+import hashlib
+import datetime
+from collections import OrderedDict
+
+
+def replace_keys_in_dict(d, old_key_part, new_key_part):
+    # Use OrderedDict if the original is an OrderedDict
+    if isinstance(d, OrderedDict):
+        updated_dict = OrderedDict()
+    else:
+        updated_dict = {}
+    for key, value in d.items():
+        # Replace the key part if found
+        new_key = key.replace(old_key_part, new_key_part)
+        # If the value is a dictionary, apply the function recursively
+        if isinstance(value, dict):
+            value = replace_keys_in_dict(value, old_key_part, new_key_part)
+        updated_dict[new_key] = value
+    return updated_dict
+
+
+def extract_small_model(path, name, sr, if_f0, version, epoch, step):
+    try:
+        ckpt = torch.load(path, map_location="cpu")
+        pth_file = f"{name}.pth"
+        pth_file_old_version_path = os.path.join("logs", f"{pth_file}_old_version.pth")
+        opt = OrderedDict(
+            weight={
+                key: value.half() for key, value in ckpt.items() if "enc_q" not in key
+            }
+        )
+        if "model" in ckpt:
+            ckpt = ckpt["model"]
+        opt = OrderedDict()
+        opt["weight"] = {}
+        for key in ckpt.keys():
+            if "enc_q" in key:
+                continue
+            opt["weight"][key] = ckpt[key].half()
+        if sr == "40k":
+            opt["config"] = [
+                1025, 32, 192, 192, 768, 2, 6, 3, 0, "1",
+                [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
+                [10, 10, 2, 2], 512, [16, 16, 4, 4], 109, 256, 40000,
+            ]
+        elif sr == "48k":
+            if version == "v1":
+                opt["config"] = [
+                    1025, 32, 192, 192, 768, 2, 6, 3, 0, "1",
+                    [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
+                    [10, 6, 2, 2, 2], 512, [16, 16, 4, 4, 4], 109, 256, 48000,
+                ]
+            else:
+                opt["config"] = [
+                    1025, 32, 192, 192, 768, 2, 6, 3, 0, "1",
+                    [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
+                    [12, 10, 2, 2], 512, [24, 20, 4, 4], 109, 256, 48000,
+                ]
+        elif sr == "32k":
+            if version == "v1":
+                opt["config"] = [
+                    513, 32, 192, 192, 768, 2, 6, 3, 0, "1",
+                    [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
+                    [10, 4, 2, 2, 2], 512, [16, 16, 4, 4, 4], 109, 256, 32000,
+                ]
+            else:
+                opt["config"] = [
+                    513, 32, 192, 192, 768, 2, 6, 3, 0, "1",
+                    [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
+                    [10, 8, 2, 2], 512, [20, 16, 4, 4], 109, 256, 32000,
+                ]
+
+        opt["epoch"] = epoch
+        opt["step"] = step
+        opt["sr"] = sr
+        opt["f0"] = int(if_f0)
+        opt["version"] = version
+        opt["creation_date"] = datetime.datetime.now().isoformat()
+
+        hash_input = f"{str(ckpt)} {epoch} {step} {datetime.datetime.now().isoformat()}"
+        model_hash = hashlib.sha256(hash_input.encode()).hexdigest()
+        opt["model_hash"] = model_hash
+
+        model = torch.load(pth_file_old_version_path, map_location=torch.device("cpu"))
+        torch.save(
+            replace_keys_in_dict(
+                replace_keys_in_dict(
+                    model, ".parametrizations.weight.original1", ".weight_v"
+                ),
+                ".parametrizations.weight.original0",
+                ".weight_g",
+            ),
+            pth_file_old_version_path,
+        )
+        os.remove(pth_file_old_version_path)
+        os.rename(pth_file_old_version_path, pth_file)
+    except Exception as error:
+        print(error)

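For orientation, the new extract_small_model entry point can also be called directly once a full training checkpoint exists; a minimal sketch (the checkpoint path and model name below are placeholders, and the repository root is assumed to be on sys.path):

    # Hypothetical usage sketch; path and name are placeholders.
    from rvc.train.process.extract_small_model import extract_small_model

    extract_small_model(
        path="logs/my_voice/G_23333.pth",  # full generator checkpoint (assumed location)
        name="my_voice",
        sr="40k",          # matched against the "40k" / "48k" / "32k" branches above
        if_f0=True,        # stored as opt["f0"] = int(if_f0)
        version="v2",
        epoch=500,
        step=100000,
    )
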
rvc/train/process/model_blender.py
ADDED
@@ -0,0 +1,63 @@
+import os
+import torch
+from collections import OrderedDict
+
+
+def extract(ckpt):
+    a = ckpt["model"]
+    opt = OrderedDict()
+    opt["weight"] = {}
+    for key in a.keys():
+        if "enc_q" in key:
+            continue
+        opt["weight"][key] = a[key]
+    return opt
+
+
+def model_blender(name, path1, path2, ratio):
+    try:
+        message = f"Model {path1} and {path2} are merged with alpha {ratio}."
+        ckpt1 = torch.load(path1, map_location="cpu")
+        ckpt2 = torch.load(path2, map_location="cpu")
+        cfg = ckpt1["config"]
+        cfg_f0 = ckpt1["f0"]
+        cfg_version = ckpt1["version"]
+
+        if "model" in ckpt1:
+            ckpt1 = extract(ckpt1)
+        else:
+            ckpt1 = ckpt1["weight"]
+        if "model" in ckpt2:
+            ckpt2 = extract(ckpt2)
+        else:
+            ckpt2 = ckpt2["weight"]
+
+        if sorted(list(ckpt1.keys())) != sorted(list(ckpt2.keys())):
+            return "Fail to merge the models. The model architectures are not the same."
+
+        opt = OrderedDict()
+        opt["weight"] = {}
+        for key in ckpt1.keys():
+            if key == "emb_g.weight" and ckpt1[key].shape != ckpt2[key].shape:
+                min_shape0 = min(ckpt1[key].shape[0], ckpt2[key].shape[0])
+                opt["weight"][key] = (
+                    ratio * (ckpt1[key][:min_shape0].float())
+                    + (1 - ratio) * (ckpt2[key][:min_shape0].float())
+                ).half()
+            else:
+                opt["weight"][key] = (
+                    ratio * (ckpt1[key].float()) + (1 - ratio) * (ckpt2[key].float())
+                ).half()
+
+        opt["config"] = cfg
+        opt["sr"] = message
+        opt["f0"] = cfg_f0
+        opt["version"] = cfg_version
+        opt["info"] = message
+
+        torch.save(opt, os.path.join("logs", "%s.pth" % name))
+        print(message)
+        return message, os.path.join("logs", "%s.pth" % name)
+    except Exception as error:
+        print(error)
+        return error

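The new model_blender module can be driven outside the UI as well; a minimal sketch assuming two compatible .pth voice models (the paths are placeholders) and a successful merge — on mismatched architectures the function returns a single error string rather than a tuple:

    # Hypothetical usage sketch; both model paths are placeholders.
    from rvc.train.process.model_blender import model_blender

    message, blended_path = model_blender(
        name="voice_a_b_mix",
        path1="logs/voice_a.pth",
        path2="logs/voice_b.pth",
        ratio=0.5,  # 0.5 weights both models equally; 1.0 keeps only path1
    )
    print(message, blended_path)  # the blend is written to logs/voice_a_b_mix.pth
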
rvc/train/process/model_information.py
ADDED
@@ -0,0 +1,33 @@
+import torch
+from datetime import datetime
+
+
+def prettify_date(date_str):
+    date_time_obj = datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%S.%f")
+    return date_time_obj.strftime("%Y-%m-%d %H:%M:%S")
+
+
+def model_information(path):
+    model_data = torch.load(path, map_location="cpu")
+
+    print(f"Loaded model from {path}")
+
+    epochs = model_data.get("epoch", "None")
+    steps = model_data.get("step", "None")
+    sr = model_data.get("sr", "None")
+    f0 = model_data.get("f0", "None")
+    version = model_data.get("version", "None")
+    creation_date = model_data.get("creation_date", "None")
+    model_hash = model_data.get("model_hash", "None")
+
+    pitch_guidance = "True" if f0 == 1 else "False"
+
+    return (
+        f"Epochs: {epochs}\n"
+        f"Steps: {steps}\n"
+        f"RVC Version: {version}\n"
+        f"Sampling Rate: {sr}\n"
+        f"Pitch Guidance: {pitch_guidance}\n"
+        f"Creation Date: {prettify_date(creation_date)}\n"
+        f"Hash (ID): {model_hash}"
+    )

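model_information reads back the metadata that extract_model and extract_small_model now embed in the checkpoint; a quick sketch (the checkpoint path is a placeholder):

    # Hypothetical usage sketch; the checkpoint path is a placeholder.
    from rvc.train.process.model_information import model_information

    print(model_information("logs/my_voice_500e_100000s.pth"))
    # Prints the epochs, steps, RVC version, sampling rate, pitch guidance,
    # creation date and model hash stored by the extractors above.
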
rvc/train/train.py
CHANGED
@@ -70,15 +70,9 @@ torch.backends.cudnn.deterministic = False
 torch.backends.cudnn.benchmark = False
 
 global_step = 0
-
+lowest_value = {"step": 0, "value": float("inf"), "epoch": 0}
 last_loss_gen_all = 0
-
-lowestValue = {"step": 0, "value": float("inf"), "epoch": 0}
-dirtyTb = []
-dirtyValues = []
-dirtySteps = []
-dirtyEpochs = []
-continued = False
+epochs_since_last_lowest = 0
 
 
 class EpochRecorder:
@@ -104,13 +98,16 @@ def main():
         print("GPU not detected, reverting to CPU (not recommended)")
         n_gpus = 1
     children = []
+    pid_file_path = os.path.join(now_dir, "rvc", "train", "train_pid.txt")
+    with open(pid_file_path, "w") as pid_file:
+        for i in range(n_gpus):
+            subproc = mp.Process(
+                target=run,
+                args=(i, n_gpus, hps),
+            )
+            children.append(subproc)
+            subproc.start()
+            pid_file.write(str(subproc.pid) + "\n")
 
     for i in range(n_gpus):
         children[i].join()
@@ -287,9 +284,13 @@ def run(
 
 
 def train_and_evaluate(rank, epoch, hps, nets, optims, scaler, loaders, writers, cache):
-    global global_step, last_loss_gen_all,
+    global global_step, last_loss_gen_all, lowest_value, epochs_since_last_lowest
+
     if epoch == 1:
-
+        lowest_value = {"step": 0, "value": float("inf"), "epoch": 0}
+        last_loss_gen_all = 0.0
+        epochs_since_last_lowest = 0
+
     net_g, net_d = nets
     optim_g, optim_d = optims
     train_loader = loaders[0] if loaders is not None else None
@@ -467,10 +468,15 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, scaler, loaders, writers, cache):
                 loss_gen, losses_gen = generator_loss(y_d_hat_g)
                 loss_gen_all = loss_gen + loss_fm + loss_mel + loss_kl
 
-                if loss_gen_all <
+                if loss_gen_all < lowest_value["value"]:
+                    lowest_value["value"] = loss_gen_all
+                    lowest_value["step"] = global_step
+                    lowest_value["epoch"] = epoch
+                    # print(f'Lowest generator loss updated: {lowest_value["value"]} at epoch {epoch}, step {global_step}')
+                    if epoch > lowest_value["epoch"]:
+                        print(
+                            "Alert: The lower generating loss has been exceeded by a lower loss in a subsequent epoch."
+                        )
 
             optim_g.zero_grad()
             scaler.scale(loss_gen_all).backward()
@@ -558,25 +564,43 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, scaler, loaders, writers, cache):
                 ckpt = net_g.module.state_dict()
             else:
                 ckpt = net_g.state_dict()
+            extract_model(
+                ckpt,
+                hps.sample_rate,
+                hps.if_f0,
+                hps.name,
+                os.path.join(
+                    hps.model_dir, "{}_{}e_{}s.pth".format(hps.name, epoch, global_step)
+                ),
+                epoch,
+                global_step,
+                hps.version,
+                hps,
+            )
+
+    if hps.overtraining_detector == 1:
+        if lowest_value["value"] < last_loss_gen_all:
+            epochs_since_last_lowest += 1
+        else:
+            epochs_since_last_lowest = 0
+
+        if epochs_since_last_lowest >= hps.overtraining_threshold:
+            print(
+                "Stopping training due to possible overtraining. Lowest generator loss: {} at epoch {}, step {}".format(
+                    lowest_value["value"], lowest_value["epoch"], lowest_value["step"]
+                )
+            )
+            os._exit(2333333)
 
     if rank == 0:
         if epoch > 1:
-            change_str = ""
-            if change != 0:
-                change_str = f"({'decreased' if change > 0 else 'increased'} {abs(change)})"  # decreased = good
+            print(hps.overtraining_threshold)
             print(
-                f"{hps.name} | epoch={epoch} | step={global_step} | {epoch_recorder.record()} |
+                f"{hps.name} | epoch={epoch} | step={global_step} | {epoch_recorder.record()} | lowest_value={lowest_value['value']} (epoch {lowest_value['epoch']} and step {lowest_value['step']})"
+            )
+        else:
+            print(
+                f"{hps.name} | epoch={epoch} | step={global_step} | {epoch_recorder.record()}"
             )
         last_loss_gen_all = loss_gen_all
 
@@ -585,9 +609,12 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, scaler, loaders, writers, cache):
             f"Training has been successfully completed with {epoch} epoch, {global_step} steps and {round(loss_gen_all.item(), 3)} loss gen."
         )
         print(
-            f"Lowest generator loss: {
+            f"Lowest generator loss: {lowest_value['value']} at epoch {lowest_value['epoch']}, step {lowest_value['step']}"
         )
 
+        pid_file_path = os.path.join(now_dir, "rvc", "train", "train_pid.txt")
+        os.remove(pid_file_path)
+
         if hasattr(net_g, "module"):
             ckpt = net_g.module.state_dict()
         else:
@@ -598,8 +625,11 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, scaler, loaders, writers, cache):
             hps.sample_rate,
             hps.if_f0,
             hps.name,
-            os.path.join(
+            os.path.join(
+                hps.model_dir, "{}_{}e_{}s.pth".format(hps.name, epoch, global_step)
+            ),
             epoch,
+            global_step,
             hps.version,
             hps,
         )

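Distilled from the additions above, the overtraining detector counts epochs whose generator loss did not improve on the best value seen so far and stops training once the count reaches hps.overtraining_threshold; as a standalone sketch (not part of the commit):

    # Standalone sketch of the stopping rule added above.
    def overtraining_check(lowest_value, last_loss_gen_all, stale_epochs, threshold):
        # Counts epochs whose loss stayed above the best value seen so far.
        if lowest_value["value"] < last_loss_gen_all:
            stale_epochs += 1
        else:
            stale_epochs = 0
        return stale_epochs >= threshold, stale_epochs
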
rvc/train/utils.py
CHANGED
@@ -7,49 +7,6 @@ import numpy as np
 from scipy.io.wavfile import read
 
 
-def load_checkpoint_d(checkpoint_path, combd, sbd, optimizer=None, load_opt=1):
-    assert os.path.isfile(checkpoint_path)
-    checkpoint_dict = torch.load(checkpoint_path, map_location="cpu")
-
-    def go(model, bkey):
-        saved_state_dict = checkpoint_dict[bkey]
-        if hasattr(model, "module"):
-            state_dict = model.module.state_dict()
-        else:
-            state_dict = model.state_dict()
-        new_state_dict = {}
-        for k, v in state_dict.items():
-            try:
-                new_state_dict[k] = saved_state_dict[k]
-                if saved_state_dict[k].shape != state_dict[k].shape:
-                    print(
-                        "shape-%s-mismatch. need: %s, get: %s",
-                        k,
-                        state_dict[k].shape,
-                        saved_state_dict[k].shape,
-                    )
-                    raise KeyError
-            except:
-                print("%s is not in the checkpoint", k)
-                new_state_dict[k] = v
-        if hasattr(model, "module"):
-            model.module.load_state_dict(new_state_dict, strict=False)
-        else:
-            model.load_state_dict(new_state_dict, strict=False)
-        return model
-
-    go(combd, "combd")
-    model = go(sbd, "sbd")
-
-    iteration = checkpoint_dict["iteration"]
-    learning_rate = checkpoint_dict["learning_rate"]
-    if optimizer is not None and load_opt == 1:
-        optimizer.load_state_dict(checkpoint_dict["optimizer"])
-
-    print("Loaded checkpoint '{}' (epoch {})".format(checkpoint_path, iteration))
-    return model, optimizer, learning_rate, iteration
-
-
 def load_checkpoint(checkpoint_path, model, optimizer=None, load_opt=1):
     assert os.path.isfile(checkpoint_path)
     checkpoint_dict = torch.load(checkpoint_path, map_location="cpu")
@@ -218,6 +175,22 @@ def get_hparams():
         required=True,
         help="if caching the dataset in GPU memory, 1 or 0",
     )
+
+    parser.add_argument(
+        "-od",
+        "--overtraining_detector",
+        type=int,
+        required=True,
+        help="Detect overtraining or not, 1 or 0",
+    )
+    parser.add_argument(
+        "-ot",
+        "--overtraining_threshold",
+        type=int,
+        default=50,
+        help="overtraining_threshold",
+    )
+
     args = parser.parse_args()
     name = args.experiment_dir
     experiment_dir = os.path.join("./logs", args.experiment_dir)
@@ -240,6 +213,8 @@ def get_hparams():
     hparams.save_every_weights = args.save_every_weights
     hparams.if_cache_data_in_gpu = args.if_cache_data_in_gpu
     hparams.data.training_files = f"{experiment_dir}/filelist.txt"
+    hparams.overtraining_detector = args.overtraining_detector
+    hparams.overtraining_threshold = args.overtraining_threshold
     return hparams

tabs/download/download.py
CHANGED
@@ -1,6 +1,8 @@
 import os, sys, shutil
 import tempfile
 import gradio as gr
+import pandas as pd
+import requests
 from core import run_download_script
 
 from assets.i18n.i18n import I18nAuto
@@ -41,12 +43,30 @@ def save_drop_model(dropbox):
         os.makedirs(model_path)
     if os.path.exists(os.path.join(model_path, file_name)):
         os.remove(os.path.join(model_path, file_name))
-
+    shutil.move(dropbox, os.path.join(model_path, file_name))
     print(f"{file_name} saved in {model_path}")
     gr.Info(f"{file_name} saved in {model_path}")
     return None
 
 
+def search_models(name):
+    url = f"https://cjtfqzjfdimgpvpwhzlv.supabase.co/rest/v1/models?name=ilike.%25{name}%25&order=created_at.desc&limit=15"
+    headers = {
+        "apikey": "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZSIsInJlZiI6ImNqdGZxempmZGltZ3B2cHdoemx2Iiwicm9sZSI6ImFub24iLCJpYXQiOjE2OTUxNjczODgsImV4cCI6MjAxMDc0MzM4OH0.7z5WMIbjR99c2Ooc0ma7B_FyGq10G8X-alkCYTkKR10"
+    }
+    response = requests.get(url, headers=headers)
+    data = response.json()
+    if len(data) == 0:
+        gr.Info(i18n("We couldn't find models by that name."))
+        return None
+    else:
+        df = pd.DataFrame(data)[["name", "link", "epochs", "type"]]
+        df["link"] = df["link"].apply(
+            lambda x: f'<a href="{x}" target="_blank">{x}</a>'
+        )
+        return df
+
+
 def download_tab():
     with gr.Column():
         gr.Markdown(value=i18n("## Download Model"))
@@ -57,6 +77,7 @@ def download_tab():
         )
         model_download_output_info = gr.Textbox(
             label=i18n("Output Information"),
+            info=i18n("The output information will be displayed here."),
             value="",
             max_lines=8,
             interactive=False,
@@ -82,3 +103,18 @@ def download_tab():
             inputs=[dropbox],
             outputs=[dropbox],
         )
+        gr.Markdown(value=i18n("## Search Model"))
+        search_name = gr.Textbox(
+            label=i18n("Model Name"),
+            placeholder=i18n("Introduce the model name to search."),
+            interactive=True,
+        )
+        search_table = gr.Dataframe(datatype="markdown")
+        search = gr.Button(i18n("Search"))
+        search.click(
+            search_models,
+            [search_name],
+            search_table,
+        )
+
+        search_name.submit(search_models, [search_name], search_table)

tabs/extra/analyzer/analyzer.py
CHANGED
@@ -1,85 +1,32 @@
+import os, sys
 import gradio as gr
-import matplotlib.pyplot as plt
-import soundfile as sf
-import numpy as np
-import os
 
+now_dir = os.getcwd()
+sys.path.append(now_dir)
+
+from core import run_audio_analyzer_script
 from assets.i18n.i18n import I18nAuto
 
 i18n = I18nAuto()
 
 
-def generate_spectrogram(audio_data, sample_rate, file_name):
-    plt.clf()
-
-    plt.specgram(
-        audio_data,
-        Fs=sample_rate / 1,
-        NFFT=4096,
-        sides="onesided",
-        cmap="Reds_r",
-        scale_by_freq=True,
-        scale="dB",
-        mode="magnitude",
-        window=np.hanning(4096),
-    )
-
-    plt.title(file_name)
-    plt.savefig("spectrogram.png")
-
-
-def get_audio_info(audio_file):
-    audio_data, sample_rate = sf.read(audio_file)
-
-    if len(audio_data.shape) > 1:
-        audio_data = np.mean(audio_data, axis=1)
-
-    generate_spectrogram(audio_data, sample_rate, os.path.basename(audio_file))
-
-    audio_info = sf.info(audio_file)
-    bit_depth = {"PCM_16": 16, "FLOAT": 32}.get(audio_info.subtype, 0)
-
-    minutes, seconds = divmod(audio_info.duration, 60)
-    seconds, milliseconds = divmod(seconds, 1)
-    milliseconds *= 1000
-
-    speed_in_kbps = audio_info.samplerate * bit_depth / 1000
-
-    info_table = f"""
-    - **File Name:** {os.path.basename(audio_file)}
-    - **Duration:** {int(minutes)} minutes, {int(seconds)} seconds, {int(milliseconds)} milliseconds
-    - **Bitrate:** {speed_in_kbps} kbp/s
-    - **Audio Channels:** {audio_info.channels}
-    - **Sampling rate:** {audio_info.samplerate} Hz
-    - **Bit per second:** {audio_info.samplerate * audio_info.channels * bit_depth} bit/s
-    """
-
-    return info_table, "spectrogram.png"
-
-
 def analyzer():
     with gr.Column():
-        gr.Markdown(
-            "Tool inspired in the original [Ilaria-Audio-Analyzer](https://github.com/TheStingerX/Ilaria-Audio-Analyzer) code."
-        )
         audio_input = gr.Audio(type="filepath")
+        output_info = gr.Textbox(
+            label=i18n("Output Information"),
+            info=i18n("The output information will be displayed here."),
+            value="",
+            max_lines=8,
+            interactive=False,
+        )
         get_info_button = gr.Button(
             value=i18n("Get information about the audio"), variant="primary"
         )
-
-        with gr.Row():
-            with gr.Column():
-                gr.Markdown(
-                    value=i18n("Information about the audio file"),
-                    visible=True,
-                )
-                output_markdown = gr.Markdown(
-                    value=i18n("Waiting for information..."), visible=True
-                )
-            image_output = gr.Image(type="filepath", interactive=False)
+        image_output = gr.Image(type="filepath", interactive=False)
 
         get_info_button.click(
-            fn=
+            fn=run_audio_analyzer_script,
             inputs=[audio_input],
-            outputs=[
+            outputs=[output_info, image_output],
         )

tabs/extra/extra.py
CHANGED
@@ -15,8 +15,8 @@ def extra_tab():
         )
     )
 
-
-
+    with gr.TabItem(i18n("Processing")):
+        processing.processing()
 
     with gr.TabItem(i18n("Audio Analyzer")):
         analyzer.analyzer()

tabs/extra/model_information.py
CHANGED
@@ -9,12 +9,14 @@ i18n = I18nAuto()
 def model_information_tab():
     with gr.Column():
         model_name = gr.Textbox(
-            label=i18n("Model
-
+            label=i18n("Path to Model"),
+            info=i18n("Introduce the model pth path"),
+            placeholder=i18n("Introduce the model pth path"),
             interactive=True,
         )
         model_information_output_info = gr.Textbox(
             label=i18n("Output Information"),
+            info=i18n("The output information will be displayed here."),
             value="",
             max_lines=8,
             interactive=False,

tabs/extra/processing/processing.py
CHANGED
@@ -1,18 +1,9 @@
-import sys
-
-sys.path.append("..")
-import os
+import os, sys
 
 now_dir = os.getcwd()
-
-    extract_small_model,
-)
-
-from rvc.lib.process.model_fusion import model_fusion
-from rvc.lib.process.model_information import (
-    model_information,
-)
+sys.path.append(now_dir)
 
+from core import run_model_information_script
 from assets.i18n.i18n import I18nAuto
 
 i18n = I18nAuto()
@@ -21,122 +12,27 @@ import gradio as gr
 
 
 def processing():
-    with gr.Accordion(label=i18n("Model fusion (On progress)"), open=False):
-        with gr.Column():
-            model_fusion_name = gr.Textbox(
-                label=i18n("Model Name"),
-                value="",
-                max_lines=1,
-                interactive=True,
-                placeholder=i18n("Enter model name"),
-            )
-            model_fusion_a = gr.Textbox(
-                label=i18n("Path to Model A"),
-                value="",
-                interactive=True,
-                placeholder=i18n("Path to model"),
-            )
-            model_fusion_b = gr.Textbox(
-                label=i18n("Path to Model B"),
-                value="",
-                interactive=True,
-                placeholder=i18n("Path to model"),
-            )
-            model_fusion_output_info = gr.Textbox(
-                label=i18n("Output Information"),
-                value="",
-            )
-
-            model_fusion_button = gr.Button(
-                i18n("Fusion"), variant="primary", interactive=False
-            )
-
-            model_fusion_button.click(
-                model_fusion,
-                [
-                    model_fusion_name,
-                    model_fusion_a,
-                    model_fusion_b,
-                ],
-                model_fusion_output_info,
-                api_name="model_fusion",
-            )
-
     with gr.Accordion(label=i18n("View model information")):
         with gr.Row():
             with gr.Column():
                 model_view_model_path = gr.Textbox(
                     label=i18n("Path to Model"),
+                    info=i18n("Introduce the model pth path"),
                     value="",
                     interactive=True,
-                    placeholder=i18n("
+                    placeholder=i18n("Enter path to model"),
                 )
 
             model_view_output_info = gr.Textbox(
-                label=i18n("Output Information"),
+                label=i18n("Output Information"),
+                info=i18n("The output information will be displayed here."),
+                value="",
+                max_lines=8,
             )
             model_view_button = gr.Button(i18n("View"), variant="primary")
             model_view_button.click(
-
+                run_model_information_script,
                 [model_view_model_path],
                 model_view_output_info,
                 api_name="model_info",
             )
-
-    with gr.Accordion(label=i18n("Model extraction")):
-        with gr.Row():
-            with gr.Column():
-                model_extract_name = gr.Textbox(
-                    label=i18n("Model Name"),
-                    value="",
-                    interactive=True,
-                    placeholder=i18n("Enter model name"),
-                )
-                model_extract_path = gr.Textbox(
-                    label=i18n("Path to Model"),
-                    placeholder=i18n("Path to model"),
-                    interactive=True,
-                )
-                model_extract_info = gr.Textbox(
-                    label=i18n("Model information to be placed"),
-                    value="",
-                    max_lines=8,
-                    interactive=True,
-                    placeholder=i18n("Model information to be placed"),
-                )
-            with gr.Column():
-                model_extract_pitch_guidance = gr.Checkbox(
-                    label=i18n("Pitch Guidance"),
-                    value=True,
-                    interactive=True,
-                )
-                model_extract_rvc_version = gr.Radio(
-                    label=i18n("RVC Version"),
-                    choices=["v1", "v2"],
-                    value="v2",
-                    interactive=True,
-                )
-                model_extract_sampling_rate = gr.Radio(
-                    label=i18n("Sampling Rate"),
-                    choices=["32000", "40000", "48000"],
-                    value="40000",
-                    interactive=True,
-                )
-                model_extract_output_info = gr.Textbox(
-                    label=i18n("Output Information"), value="", max_lines=8
-                )
-
-        model_extract_button = gr.Button(i18n("Extract"), variant="primary")
-        model_extract_button.click(
-            extract_small_model,
-            [
-                model_extract_path,
-                model_extract_name,
-                model_extract_sampling_rate,
-                model_extract_pitch_guidance,
-                model_extract_info,
-                model_extract_rvc_version,
-            ],
-            model_extract_output_info,
-            api_name="model_extract",
-        )

tabs/inference/inference.py
CHANGED
@@ -122,55 +122,6 @@ def get_indexes():
     return indexes_list if indexes_list else ""
 
 
-def match_index(model_file: str) -> tuple:
-    model_files_trip = re.sub(r"\.pth|\.onnx$", "", model_file)
-    model_file_name = os.path.split(model_files_trip)[
-        -1
-    ]  # Extract only the name, not the directory
-
-    # Check if the sid0strip has the specific ending format _eXXX_sXXX
-    if re.match(r".+_e\d+_s\d+$", model_file_name):
-        base_model_name = model_file_name.rsplit("_", 2)[0]
-    else:
-        base_model_name = model_file_name
-
-    sid_directory = os.path.join(model_root_relative, base_model_name)
-    directories_to_search = [sid_directory] if os.path.exists(sid_directory) else []
-    directories_to_search.append(model_root_relative)
-    matching_index_files = []
-
-    for directory in directories_to_search:
-        for filename in os.listdir(directory):
-            if filename.endswith(".index") and "trained" not in filename:
-                # Condition to match the name
-                name_match = any(
-                    name.lower() in filename.lower()
-                    for name in [model_file_name, base_model_name]
-                )
-
-                # If in the specific directory, it's automatically a match
-                folder_match = directory == sid_directory
-
-                if name_match or folder_match:
-                    index_path = os.path.join(directory, filename)
-                    updated_indexes_list = get_indexes()
-                    if index_path in updated_indexes_list:
-                        matching_index_files.append(
-                            (
-                                index_path,
-                                os.path.getsize(index_path),
-                                " " not in filename,
-                            )
-                        )
-    if matching_index_files:
-        # Sort by favoring files without spaces and by size (largest size first)
-        matching_index_files.sort(key=lambda x: (-x[2], -x[1]))
-        best_match_index_path = matching_index_files[0][0]
-        return best_match_index_path
-
-    return ""
-
-
 def save_to_wav(record_button):
     if record_button is None:
         pass
@@ -196,11 +147,21 @@ def save_to_wav2(upload_audio):
 
 
 def delete_outputs():
+    gr.Info(f"Outputs cleared!")
     for root, _, files in os.walk(audio_root_relative, topdown=False):
         for name in files:
             if name.endswith(tuple(sup_audioext)) and name.__contains__("_output"):
                 os.remove(os.path.join(root, name))
-
+
+
+def match_index(model_file_value):
+    if model_file_value:
+        model_folder = os.path.dirname(model_file_value)
+        index_files = get_indexes()
+        for index_file in index_files:
+            if os.path.dirname(index_file) == model_folder:
+                return index_file
+    return ""
 
 
 # Inference tab
@@ -210,6 +171,7 @@ def inference_tab():
         with gr.Row():
             model_file = gr.Dropdown(
                 label=i18n("Voice Model"),
+                info=i18n("Select the voice model to use for the conversion."),
                 choices=sorted(names, key=lambda path: os.path.getsize(path)),
                 interactive=True,
                 value=default_weight,
@@ -218,6 +180,7 @@ def inference_tab():
 
            index_file = gr.Dropdown(
                label=i18n("Index File"),
+                info=i18n("Select the index file to use for the conversion."),
                choices=get_indexes(),
                value=match_index(default_weight) if default_weight else "",
                interactive=True,
@@ -228,13 +191,16 @@ def inference_tab():
            unload_button = gr.Button(i18n("Unload Voice"))

            unload_button.click(
-                fn=lambda: (
+                fn=lambda: (
+                    {"value": "", "__type__": "update"},
+                    {"value": "", "__type__": "update"},
+                ),
                inputs=[],
-                outputs=[model_file],
+                outputs=[model_file, index_file],
            )

            model_file.select(
-                fn=match_index,
+                fn=lambda model_file_value: match_index(model_file_value),
                inputs=[model_file],
                outputs=[index_file],
            )
@@ -248,6 +214,7 @@ def inference_tab():
        with gr.Row():
            audio = gr.Dropdown(
                label=i18n("Select Audio"),
+                info=i18n("Select the audio to convert."),
                choices=sorted(audio_paths),
                value=audio_paths[0] if audio_paths else "",
                interactive=True,
@@ -256,12 +223,15 @@ def inference_tab():

        with gr.Accordion(i18n("Advanced Settings"), open=False):
            with gr.Column():
-
+                clear_outputs_infer = gr.Button(
                    i18n("Clear Outputs (Deletes all audios in assets/audios)")
                )
                output_path = gr.Textbox(
                    label=i18n("Output Path"),
                    placeholder=i18n("Enter output path"),
+                    info=i18n(
+                        "The path where the output audio will be saved, by default in assets/audios/output.wav"
+                    ),
                    value=(
                        output_path_fn(audio_paths[0])
                        if audio_paths
@@ -269,25 +239,68 @@ def inference_tab():
                    ),
                    interactive=True,
                )
+                export_format = gr.Radio(
+                    label=i18n("Export Format"),
+                    info=i18n("Select the format to export the audio."),
+                    choices=["WAV", "MP3", "FLAC", "OGG", "M4A"],
+                    value="WAV",
+                    interactive=True,
+                )
                split_audio = gr.Checkbox(
                    label=i18n("Split Audio"),
+                    info=i18n(
+                        "Split the audio into chunks for inference to obtain better results in some cases."
+                    ),
+                    visible=True,
+                    value=False,
+                    interactive=True,
+                )
+                autotune = gr.Checkbox(
+                    label=i18n("Autotune"),
+                    info=i18n(
+                        "Apply a soft autotune to your inferences, recommended for singing conversions."
+                    ),
+                    visible=True,
+                    value=False,
+                    interactive=True,
+                )
+                clean_audio = gr.Checkbox(
+                    label=i18n("Clean Audio"),
+                    info=i18n(
+                        "Clean your audio output using noise detection algorithms, recommended for speaking audios."
+                    ),
                    visible=True,
                    value=False,
                    interactive=True,
                )
+                clean_strength = gr.Slider(
+                    minimum=0,
+                    maximum=1,
+                    label=i18n("Clean Strength"),
+                    info=i18n(
+                        "Set the clean-up level to the audio you want, the more you increase it the more it will clean up, but it is possible that the audio will be more compressed."
+                    ),
+                    visible=False,
+                    value=0.5,
+                    interactive=True,
+                )
                pitch = gr.Slider(
                    minimum=-24,
                    maximum=24,
                    step=1,
                    label=i18n("Pitch"),
+                    info=i18n(
+                        "Set the pitch of the audio, the higher the value, the higher the pitch."
+                    ),
                    value=0,
                    interactive=True,
                )
                filter_radius = gr.Slider(
                    minimum=0,
                    maximum=7,
-                    label=i18n(
-
+                    label=i18n("Filter Radius"),
+                    info=i18n(
+                        "If the number is greater than or equal to three, employing median filtering on the collected tone results has the potential to decrease respiration."
                    ),
                    value=3,
                    step=1,
@@ -297,20 +310,50 @@ def inference_tab():
                    minimum=0,
                    maximum=1,
                    label=i18n("Search Feature Ratio"),
+                    info=i18n(
+                        "Influence exerted by the index file; a higher value corresponds to greater influence. However, opting for lower values can help mitigate artifacts present in the audio."
+                    ),
                    value=0.75,
                    interactive=True,
                )
+                rms_mix_rate = gr.Slider(
+                    minimum=0,
+                    maximum=1,
+                    label=i18n("Volume Envelope"),
+                    info=i18n(
+                        "Substitute or blend with the volume envelope of the output. The closer the ratio is to 1, the more the output envelope is employed."
+                    ),
+                    value=1,
+                    interactive=True,
+                )
+                protect = gr.Slider(
+                    minimum=0,
+                    maximum=0.5,
+                    label=i18n("Protect Voiceless Consonants"),
+                    info=i18n(
+                        "Safeguard distinct consonants and breathing sounds to prevent electro-acoustic tearing and other artifacts. Pulling the parameter to its maximum value of 0.5 offers comprehensive protection. However, reducing this value might decrease the extent of protection while potentially mitigating the indexing effect."
+                    ),
+                    value=0.5,
+                    interactive=True,
+                )
                hop_length = gr.Slider(
                    minimum=1,
                    maximum=512,
                    step=1,
                    label=i18n("Hop Length"),
+                    info=i18n(
+                        "Denotes the duration it takes for the system to transition to a significant pitch change. Smaller hop lengths require more time for inference but tend to yield higher pitch accuracy."
+                    ),
+                    visible=False,
                    value=128,
                    interactive=True,
                )
            with gr.Column():
                f0method = gr.Radio(
                    label=i18n("Pitch extraction algorithm"),
+                    info=i18n(
+                        "Pitch extraction algorithm to use for the audio conversion. The default algorithm is rmvpe, which is recommended for most cases."
+                    ),
                    choices=[
                        "pm",
                        "harvest",
@@ -318,6 +361,8 @@ def inference_tab():
                        "crepe",
                        "crepe-tiny",
                        "rmvpe",
+                        "fcpe",
+                        "hybrid[rmvpe+fcpe]",
                    ],
                    value="rmvpe",
                    interactive=True,
@@ -326,7 +371,10 @@ def inference_tab():
        convert_button1 = gr.Button(i18n("Convert"))

        with gr.Row():  # Defines output info + output audio download after conversion
-            vc_output1 = gr.Textbox(
+            vc_output1 = gr.Textbox(
+                label=i18n("Output Information"),
+                info=i18n("The output information will be displayed here."),
+            )
            vc_output2 = gr.Audio(label=i18n("Export Audio"))

    # Batch inference tab
@@ -335,40 +383,87 @@ def inference_tab():
        with gr.Column():
            input_folder_batch = gr.Textbox(
                label=i18n("Input Folder"),
+                info=i18n("Select the folder containing the audios to convert."),
                placeholder=i18n("Enter input path"),
                value=os.path.join(now_dir, "assets", "audios"),
                interactive=True,
            )
            output_folder_batch = gr.Textbox(
                label=i18n("Output Folder"),
+                info=i18n(
+                    "Select the folder where the output audios will be saved."
+                ),
                placeholder=i18n("Enter output path"),
                value=os.path.join(now_dir, "assets", "audios"),
                interactive=True,
            )
        with gr.Accordion(i18n("Advanced Settings"), open=False):
            with gr.Column():
-
+                clear_outputs_batch = gr.Button(
                    i18n("Clear Outputs (Deletes all audios in assets/audios)")
                )
+                export_format_batch = gr.Radio(
+                    label=i18n("Export Format"),
+                    info=i18n("Select the format to export the audio."),
+                    choices=["WAV", "MP3", "FLAC", "OGG", "M4A"],
+                    value="WAV",
+                    interactive=True,
+                )
                split_audio_batch = gr.Checkbox(
                    label=i18n("Split Audio"),
+                    info=i18n(
+                        "Split the audio into chunks for inference to obtain better results in some cases."
+                    ),
+                    visible=True,
+                    value=False,
+                    interactive=True,
+                )
+                autotune_batch = gr.Checkbox(
+                    label=i18n("Autotune"),
+                    info=i18n(
+                        "Apply a soft autotune to your inferences, recommended for singing conversions."
+                    ),
+                    visible=True,
+                    value=False,
+                    interactive=True,
+                )
+                clean_audio_batch = gr.Checkbox(
+                    label=i18n("Clean Audio"),
+                    info=i18n(
+                        "Clean your audio output using noise detection algorithms, recommended for speaking audios."
+                    ),
                    visible=True,
                    value=False,
                    interactive=True,
                )
+                clean_strength_batch = gr.Slider(
+                    minimum=0,
+                    maximum=1,
+                    label=i18n("Clean Strength"),
+                    info=i18n(
+                        "Set the clean-up level to the audio you want, the more you increase it the more it will clean up, but it is possible that the audio will be more compressed."
+                    ),
+                    visible=False,
+                    value=0.5,
+                    interactive=True,
+                )
                pitch_batch = gr.Slider(
                    minimum=-24,
                    maximum=24,
                    step=1,
                    label=i18n("Pitch"),
+                    info=i18n(
+                        "Set the pitch of the audio, the higher the value, the higher the pitch."
+                    ),
                    value=0,
                    interactive=True,
                )
                filter_radius_batch = gr.Slider(
                    minimum=0,
                    maximum=7,
-                    label=i18n(
-
+                    label=i18n("Filter Radius"),
+                    info=i18n(
+                        "If the number is greater than or equal to three, employing median filtering on the collected tone results has the potential to decrease respiration."
                    ),
                    value=3,
                    step=1,
@@ -378,20 +473,50 @@ def inference_tab():
                    minimum=0,
                    maximum=1,
                    label=i18n("Search Feature Ratio"),
                    value=0.75,
                    interactive=True,
                )
                hop_length_batch = gr.Slider(
                    minimum=1,
                    maximum=512,
                    step=1,
                    label=i18n("Hop Length"),
                    value=128,
                    interactive=True,
                )
            with gr.Column():
                f0method_batch = gr.Radio(
                    label=i18n("Pitch extraction algorithm"),
                    choices=[
                        "pm",
                        "harvest",
@@ -399,6 +524,8 @@ def inference_tab():
                        "crepe",
                        "crepe-tiny",
                        "rmvpe",
                    ],
                    value="rmvpe",
                    interactive=True,
@@ -407,11 +534,39 @@ def inference_tab():
        convert_button2 = gr.Button(i18n("Convert"))

        with gr.Row():  # Defines output info + output audio download after conversion
-            vc_output3 = gr.Textbox(

    def toggle_visible(checkbox):
        return {"visible": checkbox, "__type__": "update"}

    refresh_button.click(
        fn=change_choices,
        inputs=[],
@@ -432,7 +587,12 @@ def inference_tab():
        inputs=[upload_audio],
        outputs=[audio, output_path],
    )
-
        fn=delete_outputs,
        inputs=[],
        outputs=[],
@@ -443,6 +603,8 @@ def inference_tab():
            pitch,
            filter_radius,
            index_rate,
            hop_length,
            f0method,
            audio,
@@ -450,6 +612,10 @@ def inference_tab():
            model_file,
            index_file,
            split_audio,
        ],
        outputs=[vc_output1, vc_output2],
    )
@@ -459,6 +625,8 @@ def inference_tab():
            pitch_batch,
            filter_radius_batch,
            index_rate_batch,
            hop_length_batch,
            f0method_batch,
            input_folder_batch,
@@ -466,6 +634,10 @@ def inference_tab():
            model_file,
            index_file,
            split_audio_batch,
        ],
        outputs=[vc_output3],
    )
|
475 |
label=i18n("Search Feature Ratio"),
|
476 |
+
info=i18n(
|
477 |
+
"Influence exerted by the index file; a higher value corresponds to greater influence. However, opting for lower values can help mitigate artifacts present in the audio."
|
478 |
+
),
|
479 |
value=0.75,
|
480 |
interactive=True,
|
481 |
)
|
482 |
+
rms_mix_rate_batch = gr.Slider(
|
483 |
+
minimum=0,
|
484 |
+
maximum=1,
|
485 |
+
label=i18n("Volume Envelope"),
|
486 |
+
info=i18n(
|
487 |
+
"Substitute or blend with the volume envelope of the output. The closer the ratio is to 1, the more the output envelope is employed."
|
488 |
+
),
|
489 |
+
value=1,
|
490 |
+
interactive=True,
|
491 |
+
)
|
492 |
+
protect_batch = gr.Slider(
|
493 |
+
minimum=0,
|
494 |
+
maximum=0.5,
|
495 |
+
label=i18n("Protect Voiceless Consonants"),
|
496 |
+
info=i18n(
|
497 |
+
"Safeguard distinct consonants and breathing sounds to prevent electro-acoustic tearing and other artifacts. Pulling the parameter to its maximum value of 0.5 offers comprehensive protection. However, reducing this value might decrease the extent of protection while potentially mitigating the indexing effect."
|
498 |
+
),
|
499 |
+
value=0.5,
|
500 |
+
interactive=True,
|
501 |
+
)
|
502 |
hop_length_batch = gr.Slider(
|
503 |
minimum=1,
|
504 |
maximum=512,
|
505 |
step=1,
|
506 |
label=i18n("Hop Length"),
|
507 |
+
info=i18n(
|
508 |
+
"Denotes the duration it takes for the system to transition to a significant pitch change. Smaller hop lengths require more time for inference but tend to yield higher pitch accuracy."
|
509 |
+
),
|
510 |
+
visible=False,
|
511 |
value=128,
|
512 |
interactive=True,
|
513 |
)
|
514 |
with gr.Column():
|
515 |
f0method_batch = gr.Radio(
|
516 |
label=i18n("Pitch extraction algorithm"),
|
517 |
+
info=i18n(
|
518 |
+
"Pitch extraction algorithm to use for the audio conversion. The default algorithm is rmvpe, which is recommended for most cases."
|
519 |
+
),
|
520 |
choices=[
|
521 |
"pm",
|
522 |
"harvest",
|
|
|
524 |
"crepe",
|
525 |
"crepe-tiny",
|
526 |
"rmvpe",
|
527 |
+
"fcpe",
|
528 |
+
"hybrid[rmvpe+fcpe]",
|
529 |
],
|
530 |
value="rmvpe",
|
531 |
interactive=True,
|
|
|
534 |
convert_button2 = gr.Button(i18n("Convert"))
|
535 |
|
536 |
with gr.Row(): # Defines output info + output audio download after conversion
|
537 |
+
vc_output3 = gr.Textbox(
|
538 |
+
label=i18n("Output Information"),
|
539 |
+
info=i18n("The output information will be displayed here."),
|
540 |
+
)
|
541 |
|
542 |
def toggle_visible(checkbox):
|
543 |
return {"visible": checkbox, "__type__": "update"}
|
544 |
|
545 |
+
def toggle_visible_hop_length(f0method):
|
546 |
+
if f0method == "crepe" or f0method == "crepe-tiny":
|
547 |
+
return {"visible": True, "__type__": "update"}
|
548 |
+
return {"visible": False, "__type__": "update"}
|
549 |
+
|
550 |
+
clean_audio.change(
|
551 |
+
fn=toggle_visible,
|
552 |
+
inputs=[clean_audio],
|
553 |
+
outputs=[clean_strength],
|
554 |
+
)
|
555 |
+
clean_audio_batch.change(
|
556 |
+
fn=toggle_visible,
|
557 |
+
inputs=[clean_audio_batch],
|
558 |
+
outputs=[clean_strength_batch],
|
559 |
+
)
|
560 |
+
f0method.change(
|
561 |
+
fn=toggle_visible_hop_length,
|
562 |
+
inputs=[f0method],
|
563 |
+
outputs=[hop_length],
|
564 |
+
)
|
565 |
+
f0method_batch.change(
|
566 |
+
fn=toggle_visible_hop_length,
|
567 |
+
inputs=[f0method_batch],
|
568 |
+
outputs=[hop_length_batch],
|
569 |
+
)
|
570 |
refresh_button.click(
|
571 |
fn=change_choices,
|
572 |
inputs=[],
|
|
|
587 |
inputs=[upload_audio],
|
588 |
outputs=[audio, output_path],
|
589 |
)
|
590 |
+
clear_outputs_infer.click(
|
591 |
+
fn=delete_outputs,
|
592 |
+
inputs=[],
|
593 |
+
outputs=[],
|
594 |
+
)
|
595 |
+
clear_outputs_batch.click(
|
596 |
fn=delete_outputs,
|
597 |
inputs=[],
|
598 |
outputs=[],
|
|
|
603 |
pitch,
|
604 |
filter_radius,
|
605 |
index_rate,
|
606 |
+
rms_mix_rate,
|
607 |
+
protect,
|
608 |
hop_length,
|
609 |
f0method,
|
610 |
audio,
|
|
|
612 |
model_file,
|
613 |
index_file,
|
614 |
split_audio,
|
615 |
+
autotune,
|
616 |
+
clean_audio,
|
617 |
+
clean_strength,
|
618 |
+
export_format,
|
619 |
],
|
620 |
outputs=[vc_output1, vc_output2],
|
621 |
)
|
|
|
625 |
pitch_batch,
|
626 |
filter_radius_batch,
|
627 |
index_rate_batch,
|
628 |
+
rms_mix_rate_batch,
|
629 |
+
protect_batch,
|
630 |
hop_length_batch,
|
631 |
f0method_batch,
|
632 |
input_folder_batch,
|
|
|
634 |
model_file,
|
635 |
index_file,
|
636 |
split_audio_batch,
|
637 |
+
autotune_batch,
|
638 |
+
clean_audio_batch,
|
639 |
+
clean_strength_batch,
|
640 |
+
export_format_batch,
|
641 |
],
|
642 |
outputs=[vc_output3],
|
643 |
)
|
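The toggle_visible and toggle_visible_hop_length handlers above return Gradio's raw update dictionary instead of a component value. A minimal sketch of the same pattern, assuming the Gradio 3.x API this UI appears to target, where gr.update() builds exactly that dictionary:

import gradio as gr

# Sketch only: both callbacks are equivalent ways to toggle a component's
# visibility from an event handler (assumption: Gradio 3.x behaviour).
def toggle_visible_dict(checkbox):
    # raw form used throughout inference.py
    return {"visible": checkbox, "__type__": "update"}

def toggle_visible_helper(checkbox):
    # helper form; gr.update() produces the same dictionary
    return gr.update(visible=checkbox)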
tabs/plugins/plugins_core.py
CHANGED
@@ -11,24 +11,30 @@ i18n = I18nAuto()
|
|
11 |
now_dir = os.getcwd()
|
12 |
sys.path.append(now_dir)
|
13 |
|
|
|
|
|
14 |
plugins_path = os.path.join(now_dir, "tabs", "plugins", "installed")
|
15 |
if not os.path.exists(plugins_path):
|
16 |
os.makedirs(plugins_path)
|
17 |
-
json_file_path = os.path.join(now_dir, "
|
18 |
current_folders = os.listdir(plugins_path)
|
19 |
|
20 |
|
21 |
def get_existing_folders():
|
22 |
if os.path.exists(json_file_path):
|
23 |
with open(json_file_path, "r") as file:
|
24 |
-
|
|
|
25 |
else:
|
26 |
return []
|
27 |
|
28 |
|
29 |
def save_existing_folders(existing_folders):
|
|
|
|
|
|
|
30 |
with open(json_file_path, "w") as file:
|
31 |
-
json.dump(
|
32 |
|
33 |
|
34 |
def save_plugin_dropbox(dropbox):
|
@@ -53,33 +59,47 @@ def save_plugin_dropbox(dropbox):
|
|
53 |
os.remove(zip_file_path)
|
54 |
|
55 |
if os.path.exists(os.path.join(folder_path, "requirements.txt")):
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
66 |
else:
|
67 |
print("No requirements.txt file found in the plugin folder.")
|
68 |
|
69 |
save_existing_folders(get_existing_folders() + [folder_name])
|
70 |
|
71 |
print(
|
72 |
-
f"{folder_name} plugin installed in {plugins_path}!
|
73 |
)
|
74 |
gr.Info(
|
75 |
-
f"{folder_name} plugin installed in {plugins_path}!
|
76 |
)
|
|
|
77 |
return None
|
78 |
|
79 |
|
80 |
def check_new_folders():
|
81 |
existing_folders = get_existing_folders()
|
82 |
new_folders = set(current_folders) - set(existing_folders)
|
|
|
83 |
if new_folders:
|
84 |
for new_folder in new_folders:
|
85 |
complete_path = os.path.join(plugins_path, new_folder)
|
@@ -98,5 +118,5 @@ def check_new_folders():
|
|
98 |
)
|
99 |
else:
|
100 |
print("No requirements.txt file found in the plugin folder.")
|
101 |
-
print("Plugins checked and installed!
|
102 |
-
|
|
|
11 |
now_dir = os.getcwd()
|
12 |
sys.path.append(now_dir)
|
13 |
|
14 |
+
from tabs.settings.restart import restart_applio
|
15 |
+
|
16 |
plugins_path = os.path.join(now_dir, "tabs", "plugins", "installed")
|
17 |
if not os.path.exists(plugins_path):
|
18 |
os.makedirs(plugins_path)
|
19 |
+
json_file_path = os.path.join(now_dir, "assets", "config.json")
|
20 |
current_folders = os.listdir(plugins_path)
|
21 |
|
22 |
|
23 |
def get_existing_folders():
|
24 |
if os.path.exists(json_file_path):
|
25 |
with open(json_file_path, "r") as file:
|
26 |
+
config = json.load(file)
|
27 |
+
return config["plugins"]
|
28 |
else:
|
29 |
return []
|
30 |
|
31 |
|
32 |
def save_existing_folders(existing_folders):
|
33 |
+
with open(json_file_path, "r") as file:
|
34 |
+
config = json.load(file)
|
35 |
+
config["plugins"] = existing_folders
|
36 |
with open(json_file_path, "w") as file:
|
37 |
+
json.dump(config, file, indent=2)
|
38 |
|
39 |
|
40 |
def save_plugin_dropbox(dropbox):
|
|
|
59 |
os.remove(zip_file_path)
|
60 |
|
61 |
if os.path.exists(os.path.join(folder_path, "requirements.txt")):
|
62 |
+
if os.name == "nt":
|
63 |
+
subprocess.run(
|
64 |
+
[
|
65 |
+
os.path.join("env", "python.exe"),
|
66 |
+
"-m",
|
67 |
+
"pip",
|
68 |
+
"install",
|
69 |
+
"-r",
|
70 |
+
os.path.join(folder_path, "requirements.txt"),
|
71 |
+
]
|
72 |
+
)
|
73 |
+
else:
|
74 |
+
subprocess.run(
|
75 |
+
[
|
76 |
+
"python",
|
77 |
+
"-m",
|
78 |
+
"pip",
|
79 |
+
"install",
|
80 |
+
"-r",
|
81 |
+
os.path.join(folder_path, "requirements.txt"),
|
82 |
+
]
|
83 |
+
)
|
84 |
else:
|
85 |
print("No requirements.txt file found in the plugin folder.")
|
86 |
|
87 |
save_existing_folders(get_existing_folders() + [folder_name])
|
88 |
|
89 |
print(
|
90 |
+
f"{folder_name} plugin installed in {plugins_path}! Restarting applio to apply the changes."
|
91 |
)
|
92 |
gr.Info(
|
93 |
+
f"{folder_name} plugin installed in {plugins_path}! Restarting applio to apply the changes."
|
94 |
)
|
95 |
+
restart_applio()
|
96 |
return None
|
97 |
|
98 |
|
99 |
def check_new_folders():
|
100 |
existing_folders = get_existing_folders()
|
101 |
new_folders = set(current_folders) - set(existing_folders)
|
102 |
+
save_existing_folders(current_folders)
|
103 |
if new_folders:
|
104 |
for new_folder in new_folders:
|
105 |
complete_path = os.path.join(plugins_path, new_folder)
|
|
|
118 |
)
|
119 |
else:
|
120 |
print("No requirements.txt file found in the plugin folder.")
|
121 |
+
print("Plugins checked and installed! Restarting applio to apply the changes.")
|
122 |
+
restart_applio()
|
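get_existing_folders and save_existing_folders above persist the installed plugin list in assets/config.json, the same file the new settings tabs below read for their own flags. A minimal sketch of the keys this commit appears to rely on; the real file shipped with Applio may contain additional fields, and the default values here are assumptions:

import json
import os

assumed_default_config = {
    "discord_presence": True,   # read by tabs/settings/presence.py
    "fake_gpu": False,          # read by tabs/settings/fake_gpu.py
    "plugins": [],              # read by tabs/plugins/plugins_core.py
    "lang": {                   # read by tabs/settings/lang.py
        "override": False,
        "selected_lang": "en_US",  # assumed default language code
    },
}

config_path = os.path.join(os.getcwd(), "assets", "config.json")
if not os.path.exists(config_path):
    with open(config_path, "w", encoding="utf8") as file:
        json.dump(assumed_default_config, file, indent=2)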
tabs/report/report.py
CHANGED
@@ -8,7 +8,7 @@ import gradio as gr
 from assets.i18n.i18n import I18nAuto
 
 now_dir = os.getcwd()
-sys.path.append(
+sys.path.append(now_dir)
 
 i18n = I18nAuto()
 
tabs/settings/fake_gpu.py
ADDED
@@ -0,0 +1,55 @@
import os, sys
import torch
import json
import gradio as gr
from assets.i18n.i18n import I18nAuto
from tabs.settings.restart import restart_applio

now_dir = os.getcwd()
sys.path.append(now_dir)
i18n = I18nAuto()

ngpu = torch.cuda.device_count()
config_file = os.path.join(now_dir, "assets", "config.json")


def gpu_available():
    if torch.cuda.is_available() or ngpu != 0:
        return True


def load_fake_gpu():
    with open(config_file, "r", encoding="utf8") as file:
        config = json.load(file)
        return config["fake_gpu"]


def save_config(value):
    with open(config_file, "r", encoding="utf8") as file:
        config = json.load(file)
        config["fake_gpu"] = value
    with open(config_file, "w", encoding="utf8") as file:
        json.dump(config, file, indent=2)


def fake_gpu_tab():
    with gr.Row():
        with gr.Column():
            presence = gr.Checkbox(
                label=i18n("Enable fake GPU"),
                info=i18n(
                    "Activates the train tab. However, please note that this device lacks GPU capabilities, hence training is not supported. This option is only for testing purposes. (This option will restart Applio)"
                ),
                interactive=True,
                value=load_fake_gpu(),
            )
            presence.change(
                fn=toggle,
                inputs=[presence],
                outputs=[],
            )


def toggle(checkbox):
    save_config(bool(checkbox))
    restart_applio()
tabs/settings/flask_server.py
ADDED
@@ -0,0 +1,43 @@
import os
import sys
import gradio as gr
from assets.i18n.i18n import I18nAuto
import requests

now_dir = os.getcwd()
sys.path.append(now_dir)

from assets.flask.server import start_flask, load_config_flask, save_config

i18n = I18nAuto()


def flask_server_tab():
    with gr.Row():
        with gr.Column():
            flask_checkbox = gr.Checkbox(
                label=i18n(
                    "Enable Applio integration with applio.org/models using flask"
                ),
                info=i18n(
                    "It will activate the possibility of downloading models with a click from the website."
                ),
                interactive=True,
                value=load_config_flask(),
            )
            flask_checkbox.change(
                fn=toggle,
                inputs=[flask_checkbox],
                outputs=[],
            )


def toggle(checkbox):
    save_config(bool(checkbox))
    if load_config_flask() == True:
        start_flask()
    else:
        try:
            requests.post("http://localhost:8000/shutdown")
        except requests.exceptions.ConnectionError:
            pass
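The toggle above only posts to http://localhost:8000/shutdown; the server itself lives in assets/flask/server.py, which is not shown in this section. A hedged sketch of the minimum such a server would need to satisfy that call; the route name and port come from the code above, everything else is assumed:

from flask import Flask, request

app = Flask(__name__)

@app.route("/shutdown", methods=["POST"])
def shutdown():
    # Werkzeug's development-server shutdown hook; other WSGI servers
    # would need their own mechanism.
    stop = request.environ.get("werkzeug.server.shutdown")
    if stop is not None:
        stop()
    return "Shutting down"

if __name__ == "__main__":
    app.run(port=8000)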
tabs/settings/lang.py
ADDED
@@ -0,0 +1,57 @@
import os, sys
import json
import gradio as gr
from assets.i18n.i18n import I18nAuto

now_dir = os.getcwd()
sys.path.append(now_dir)

i18n = I18nAuto()

config_file = os.path.join(now_dir, "assets", "config.json")


def get_language_settings():
    with open(config_file, "r", encoding="utf8") as file:
        config = json.load(file)

    if config["lang"]["override"] == False:
        return "Language automatically detected in the system"
    else:
        return config["lang"]["selected_lang"]


def save_lang_settings(selected_language):
    with open(config_file, "r", encoding="utf8") as file:
        config = json.load(file)

    if selected_language == "Language automatically detected in the system":
        config["lang"]["override"] = False
    else:
        config["lang"]["override"] = True
        config["lang"]["selected_lang"] = selected_language

    gr.Info("Language has been saved. Restart Applio to apply the changes.")

    with open(config_file, "w", encoding="utf8") as file:
        json.dump(config, file, indent=2)


def lang_tab():
    with gr.Column():
        selected_language = gr.Dropdown(
            label=i18n("Language"),
            info=i18n(
                "Select the language you want to use. (Requires restarting Applio)"
            ),
            value=get_language_settings(),
            choices=["Language automatically detected in the system"]
            + i18n._get_available_languages(),
            interactive=True,
        )

        selected_language.change(
            fn=save_lang_settings,
            inputs=[selected_language],
            outputs=[],
        )
tabs/settings/presence.py
CHANGED
@@ -1,17 +1,29 @@
|
|
1 |
import os
|
2 |
import sys
|
3 |
-
import base64
|
4 |
-
import pathlib
|
5 |
-
import tempfile
|
6 |
import gradio as gr
|
7 |
-
import
|
8 |
from assets.i18n.i18n import I18nAuto
|
9 |
from assets.discord_presence import RPCManager
|
10 |
|
11 |
now_dir = os.getcwd()
|
12 |
-
sys.path.append(
|
13 |
|
14 |
i18n = I18nAuto()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
15 |
|
16 |
|
17 |
def presence_tab():
|
@@ -19,8 +31,11 @@ def presence_tab():
|
|
19 |
with gr.Column():
|
20 |
presence = gr.Checkbox(
|
21 |
label=i18n("Enable Applio integration with Discord presence"),
|
|
|
|
|
|
|
22 |
interactive=True,
|
23 |
-
value=
|
24 |
)
|
25 |
presence.change(
|
26 |
fn=toggle,
|
@@ -30,13 +45,11 @@ def presence_tab():
|
|
30 |
|
31 |
|
32 |
def toggle(checkbox):
|
33 |
-
|
34 |
-
if
|
35 |
-
# print("Start Presence")
|
36 |
try:
|
37 |
RPCManager.start_presence()
|
38 |
except KeyboardInterrupt:
|
39 |
RPCManager.stop_presence()
|
40 |
else:
|
41 |
-
# print("Stop presence")
|
42 |
RPCManager.stop_presence()
|
|
|
1 |
import os
|
2 |
import sys
|
|
|
|
|
|
|
3 |
import gradio as gr
|
4 |
+
import json
|
5 |
from assets.i18n.i18n import I18nAuto
|
6 |
from assets.discord_presence import RPCManager
|
7 |
|
8 |
now_dir = os.getcwd()
|
9 |
+
sys.path.append(now_dir)
|
10 |
|
11 |
i18n = I18nAuto()
|
12 |
+
config_file = os.path.join(now_dir, "assets", "config.json")
|
13 |
+
|
14 |
+
|
15 |
+
def load_config_presence():
|
16 |
+
with open(config_file, "r", encoding="utf8") as file:
|
17 |
+
config = json.load(file)
|
18 |
+
return config["discord_presence"]
|
19 |
+
|
20 |
+
|
21 |
+
def save_config(value):
|
22 |
+
with open(config_file, "r", encoding="utf8") as file:
|
23 |
+
config = json.load(file)
|
24 |
+
config["discord_presence"] = value
|
25 |
+
with open(config_file, "w", encoding="utf8") as file:
|
26 |
+
json.dump(config, file, indent=2)
|
27 |
|
28 |
|
29 |
def presence_tab():
|
|
|
31 |
with gr.Column():
|
32 |
presence = gr.Checkbox(
|
33 |
label=i18n("Enable Applio integration with Discord presence"),
|
34 |
+
info=i18n(
|
35 |
+
"It will activate the possibility of displaying the current Applio activity in Discord."
|
36 |
+
),
|
37 |
interactive=True,
|
38 |
+
value=load_config_presence(),
|
39 |
)
|
40 |
presence.change(
|
41 |
fn=toggle,
|
|
|
45 |
|
46 |
|
47 |
def toggle(checkbox):
|
48 |
+
save_config(bool(checkbox))
|
49 |
+
if load_config_presence() == True:
|
|
|
50 |
try:
|
51 |
RPCManager.start_presence()
|
52 |
except KeyboardInterrupt:
|
53 |
RPCManager.stop_presence()
|
54 |
else:
|
|
|
55 |
RPCManager.stop_presence()
|
tabs/settings/restart.py
ADDED
@@ -0,0 +1,39 @@
import gradio as gr
import os
import sys

now_dir = os.getcwd()
pid_file_path = os.path.join(now_dir, "rvc", "train", "train_pid.txt")


def restart_applio():
    if os.name != "nt":
        os.system("clear")
    else:
        os.system("cls")
    try:
        with open(pid_file_path, "r") as pid_file:
            pids = [int(pid) for pid in pid_file.readlines()]
        for pid in pids:
            os.kill(pid, 9)
        os.remove(pid_file_path)
    except:
        pass
    python = sys.executable
    os.execl(python, python, *sys.argv)


from assets.i18n.i18n import I18nAuto

i18n = I18nAuto()


def restart_tab():
    with gr.Row():
        with gr.Column():
            restart_button = gr.Button(i18n("Restart Applio"))
            restart_button.click(
                fn=restart_applio,
                inputs=[],
                outputs=[],
            )
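restart_applio kills every PID listed in rvc/train/train_pid.txt before re-executing the process, which assumes the training side records its PID in that file. A sketch of that counterpart; the actual bookkeeping in rvc/train/train.py is not shown in this section:

import os

pid_file_path = os.path.join(os.getcwd(), "rvc", "train", "train_pid.txt")

# Assumed writer side: each training process appends its own PID so that
# restart_applio() can terminate it before restarting the UI.
with open(pid_file_path, "a") as pid_file:
    pid_file.write(f"{os.getpid()}\n")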
tabs/settings/themes.py
CHANGED
@@ -9,7 +9,7 @@ from assets.i18n.i18n import I18nAuto
 import assets.themes.loadThemes as loadThemes
 
 now_dir = os.getcwd()
-sys.path.append(
+sys.path.append(now_dir)
 
 i18n = I18nAuto()
 
@@ -21,6 +21,9 @@ def theme_tab():
         loadThemes.get_list(),
         value=loadThemes.read_json(),
         label=i18n("Theme"),
+        info=i18n(
+            "Select the theme you want to use. (Requires restarting Applio)"
+        ),
         visible=True,
     )
     themes_select.change(
tabs/settings/version.py
ADDED
@@ -0,0 +1,24 @@
import gradio as gr

from assets.version_checker import compare_version
from assets.i18n.i18n import I18nAuto

i18n = I18nAuto()


def version_tab():
    with gr.Row():
        with gr.Column():
            version_check = gr.Textbox(
                label=i18n("Version Checker"),
                info=i18n(
                    "Check which version of Applio is the latest to see if you need to update."
                ),
                interactive=False,
            )
            version_button = gr.Button(i18n("Check for updates"))
            version_button.click(
                fn=compare_version,
                inputs=[],
                outputs=[version_check],
            )
tabs/train/train.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1 |
import os
|
2 |
import subprocess
|
3 |
import sys
|
|
|
4 |
import gradio as gr
|
5 |
from assets.i18n.i18n import I18nAuto
|
6 |
from core import (
|
@@ -8,14 +9,40 @@ from core import (
|
|
8 |
run_extract_script,
|
9 |
run_train_script,
|
10 |
run_index_script,
|
|
|
11 |
)
|
12 |
from rvc.configs.config import max_vram_gpu, get_gpu_info
|
13 |
from rvc.lib.utils import format_title
|
|
|
14 |
|
15 |
i18n = I18nAuto()
|
16 |
now_dir = os.getcwd()
|
17 |
sys.path.append(now_dir)
|
18 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
19 |
sup_audioext = {
|
20 |
"wav",
|
21 |
"mp3",
|
@@ -84,6 +111,31 @@ def refresh_datasets():
|
|
84 |
return {"choices": sorted(get_datasets_list()), "__type__": "update"}
|
85 |
|
86 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
87 |
# Drop Model
|
88 |
def save_drop_model(dropbox):
|
89 |
if ".pth" not in dropbox:
|
@@ -136,25 +188,92 @@ def save_drop_dataset_audio(dropbox, dataset_name):
|
|
136 |
return None, relative_dataset_path
|
137 |
|
138 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
139 |
# Train Tab
|
140 |
def train_tab():
|
141 |
with gr.Accordion(i18n("Preprocess")):
|
142 |
with gr.Row():
|
143 |
with gr.Column():
|
144 |
-
model_name = gr.
|
145 |
label=i18n("Model Name"),
|
146 |
-
|
|
|
147 |
value="my-project",
|
148 |
interactive=True,
|
|
|
149 |
)
|
150 |
dataset_path = gr.Dropdown(
|
151 |
label=i18n("Dataset Path"),
|
|
|
152 |
# placeholder=i18n("Enter dataset path"),
|
153 |
choices=get_datasets_list(),
|
154 |
allow_custom_value=True,
|
155 |
interactive=True,
|
156 |
)
|
157 |
-
|
158 |
dataset_creator = gr.Checkbox(
|
159 |
label=i18n("Dataset Creator"),
|
160 |
value=False,
|
@@ -163,9 +282,10 @@ def train_tab():
|
|
163 |
)
|
164 |
|
165 |
with gr.Column(visible=False) as dataset_creator_settings:
|
166 |
-
with gr.Accordion("Dataset Creator"):
|
167 |
dataset_name = gr.Textbox(
|
168 |
label=i18n("Dataset Name"),
|
|
|
169 |
placeholder=i18n("Enter dataset name"),
|
170 |
interactive=True,
|
171 |
)
|
@@ -178,6 +298,7 @@ def train_tab():
|
|
178 |
with gr.Column():
|
179 |
sampling_rate = gr.Radio(
|
180 |
label=i18n("Sampling Rate"),
|
|
|
181 |
choices=["32000", "40000", "48000"],
|
182 |
value="40000",
|
183 |
interactive=True,
|
@@ -185,6 +306,7 @@ def train_tab():
|
|
185 |
|
186 |
rvc_version = gr.Radio(
|
187 |
label=i18n("RVC Version"),
|
|
|
188 |
choices=["v1", "v2"],
|
189 |
value="v2",
|
190 |
interactive=True,
|
@@ -192,6 +314,7 @@ def train_tab():
|
|
192 |
|
193 |
preprocess_output_info = gr.Textbox(
|
194 |
label=i18n("Output Information"),
|
|
|
195 |
value="",
|
196 |
max_lines=8,
|
197 |
interactive=False,
|
@@ -209,12 +332,24 @@ def train_tab():
|
|
209 |
with gr.Accordion(i18n("Extract")):
|
210 |
with gr.Row():
|
211 |
hop_length = gr.Slider(
|
212 |
-
1,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
213 |
)
|
214 |
with gr.Row():
|
215 |
with gr.Column():
|
216 |
f0method = gr.Radio(
|
217 |
label=i18n("Pitch extraction algorithm"),
|
|
|
|
|
|
|
218 |
choices=["pm", "dio", "crepe", "crepe-tiny", "harvest", "rmvpe"],
|
219 |
value="rmvpe",
|
220 |
interactive=True,
|
@@ -222,6 +357,7 @@ def train_tab():
|
|
222 |
|
223 |
extract_output_info = gr.Textbox(
|
224 |
label=i18n("Output Information"),
|
|
|
225 |
value="",
|
226 |
max_lines=8,
|
227 |
interactive=False,
|
@@ -242,39 +378,94 @@ def train_tab():
|
|
242 |
max_vram_gpu(0),
|
243 |
step=1,
|
244 |
label=i18n("Batch Size"),
|
|
|
|
|
|
|
245 |
interactive=True,
|
246 |
)
|
247 |
save_every_epoch = gr.Slider(
|
248 |
-
1,
|
|
|
|
|
|
|
|
|
|
|
|
|
249 |
)
|
250 |
total_epoch = gr.Slider(
|
251 |
-
1,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
252 |
)
|
253 |
with gr.Row():
|
254 |
pitch_guidance = gr.Checkbox(
|
255 |
-
label=i18n("Pitch Guidance"),
|
|
|
|
|
|
|
|
|
|
|
256 |
)
|
257 |
pretrained = gr.Checkbox(
|
258 |
-
label=i18n("Pretrained"),
|
|
|
|
|
|
|
|
|
|
|
259 |
)
|
260 |
save_only_latest = gr.Checkbox(
|
261 |
-
label=i18n("Save Only Latest"),
|
|
|
|
|
|
|
|
|
|
|
262 |
)
|
263 |
save_every_weights = gr.Checkbox(
|
264 |
label=i18n("Save Every Weights"),
|
|
|
|
|
|
|
265 |
value=True,
|
266 |
interactive=True,
|
267 |
)
|
268 |
custom_pretrained = gr.Checkbox(
|
269 |
-
label=i18n("Custom Pretrained"),
|
|
|
|
|
|
|
|
|
|
|
270 |
)
|
271 |
multiple_gpu = gr.Checkbox(
|
272 |
-
label=i18n("GPU Settings"),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
273 |
)
|
274 |
|
275 |
with gr.Row():
|
276 |
with gr.Column(visible=False) as pretrained_custom_settings:
|
277 |
-
with gr.Accordion("Pretrained Custom Settings"):
|
278 |
upload_pretrained = gr.File(
|
279 |
label=i18n("Upload Pretrained Model"),
|
280 |
type="filepath",
|
@@ -285,33 +476,57 @@ def train_tab():
|
|
285 |
)
|
286 |
g_pretrained_path = gr.Dropdown(
|
287 |
label=i18n("Custom Pretrained G"),
|
|
|
|
|
|
|
288 |
choices=sorted(pretraineds_list_g),
|
289 |
interactive=True,
|
290 |
allow_custom_value=True,
|
291 |
)
|
292 |
d_pretrained_path = gr.Dropdown(
|
293 |
label=i18n("Custom Pretrained D"),
|
|
|
|
|
|
|
294 |
choices=sorted(pretraineds_list_d),
|
295 |
interactive=True,
|
296 |
allow_custom_value=True,
|
297 |
)
|
298 |
with gr.Column(visible=False) as gpu_custom_settings:
|
299 |
-
with gr.Accordion("GPU Settings"):
|
300 |
gpu = gr.Textbox(
|
301 |
label=i18n("GPU Number"),
|
|
|
|
|
|
|
302 |
placeholder=i18n("0 to ∞ separated by -"),
|
303 |
value="0",
|
304 |
interactive=True,
|
305 |
)
|
306 |
gr.Textbox(
|
307 |
label=i18n("GPU Information"),
|
|
|
308 |
value=get_gpu_info(),
|
309 |
interactive=False,
|
310 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
311 |
|
312 |
with gr.Row():
|
313 |
train_output_info = gr.Textbox(
|
314 |
label=i18n("Output Information"),
|
|
|
315 |
value="",
|
316 |
max_lines=8,
|
317 |
interactive=False,
|
@@ -332,6 +547,8 @@ def train_tab():
|
|
332 |
batch_size,
|
333 |
gpu,
|
334 |
pitch_guidance,
|
|
|
|
|
335 |
pretrained,
|
336 |
custom_pretrained,
|
337 |
g_pretrained_path,
|
@@ -341,6 +558,15 @@ def train_tab():
|
|
341 |
api_name="start_training",
|
342 |
)
|
343 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
344 |
index_button = gr.Button(i18n("Generate Index"))
|
345 |
index_button.click(
|
346 |
run_index_script,
|
@@ -349,13 +575,114 @@ def train_tab():
|
|
349 |
api_name="generate_index",
|
350 |
)
|
351 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
352 |
def toggle_visible(checkbox):
|
353 |
return {"visible": checkbox, "__type__": "update"}
|
354 |
|
355 |
-
|
356 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
357 |
inputs=[],
|
358 |
-
outputs=[dataset_path],
|
359 |
)
|
360 |
|
361 |
dataset_creator.change(
|
@@ -370,6 +697,18 @@ def train_tab():
|
|
370 |
outputs=[upload_audio_dataset, dataset_path],
|
371 |
)
|
372 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
373 |
custom_pretrained.change(
|
374 |
fn=toggle_visible,
|
375 |
inputs=[custom_pretrained],
|
@@ -388,8 +727,44 @@ def train_tab():
|
|
388 |
outputs=[upload_pretrained],
|
389 |
)
|
390 |
|
|
|
|
|
|
|
|
|
|
|
|
|
391 |
multiple_gpu.change(
|
392 |
fn=toggle_visible,
|
393 |
inputs=[multiple_gpu],
|
394 |
outputs=[gpu_custom_settings],
|
395 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import os
|
2 |
import subprocess
|
3 |
import sys
|
4 |
+
import shutil
|
5 |
import gradio as gr
|
6 |
from assets.i18n.i18n import I18nAuto
|
7 |
from core import (
|
|
|
9 |
run_extract_script,
|
10 |
run_train_script,
|
11 |
run_index_script,
|
12 |
+
run_prerequisites_script,
|
13 |
)
|
14 |
from rvc.configs.config import max_vram_gpu, get_gpu_info
|
15 |
from rvc.lib.utils import format_title
|
16 |
+
from tabs.settings.restart import restart_applio
|
17 |
|
18 |
i18n = I18nAuto()
|
19 |
now_dir = os.getcwd()
|
20 |
sys.path.append(now_dir)
|
21 |
|
22 |
+
pretraineds_v1 = [
|
23 |
+
(
|
24 |
+
"pretrained_v1/",
|
25 |
+
[
|
26 |
+
"D32k.pth",
|
27 |
+
"D40k.pth",
|
28 |
+
"D48k.pth",
|
29 |
+
"G32k.pth",
|
30 |
+
"G40k.pth",
|
31 |
+
"G48k.pth",
|
32 |
+
"f0D32k.pth",
|
33 |
+
"f0D40k.pth",
|
34 |
+
"f0D48k.pth",
|
35 |
+
"f0G32k.pth",
|
36 |
+
"f0G40k.pth",
|
37 |
+
"f0G48k.pth",
|
38 |
+
],
|
39 |
+
),
|
40 |
+
]
|
41 |
+
|
42 |
+
folder_mapping = {
|
43 |
+
"pretrained_v1/": "rvc/pretraineds/pretrained_v1/",
|
44 |
+
}
|
45 |
+
|
46 |
sup_audioext = {
|
47 |
"wav",
|
48 |
"mp3",
|
|
|
111 |
return {"choices": sorted(get_datasets_list()), "__type__": "update"}
|
112 |
|
113 |
|
114 |
+
# Model Names
|
115 |
+
models_path = os.path.join(now_dir, "logs")
|
116 |
+
|
117 |
+
|
118 |
+
def get_models_list():
|
119 |
+
return [
|
120 |
+
os.path.basename(dirpath)
|
121 |
+
for dirpath in os.listdir(models_path)
|
122 |
+
if os.path.isdir(os.path.join(models_path, dirpath))
|
123 |
+
and all(excluded not in dirpath for excluded in ["zips", "mute"])
|
124 |
+
]
|
125 |
+
|
126 |
+
|
127 |
+
def refresh_models():
|
128 |
+
return {"choices": sorted(get_models_list()), "__type__": "update"}
|
129 |
+
|
130 |
+
|
131 |
+
# Refresh Models and Datasets
|
132 |
+
def refresh_models_and_datasets():
|
133 |
+
return (
|
134 |
+
{"choices": sorted(get_models_list()), "__type__": "update"},
|
135 |
+
{"choices": sorted(get_datasets_list()), "__type__": "update"},
|
136 |
+
)
|
137 |
+
|
138 |
+
|
139 |
# Drop Model
|
140 |
def save_drop_model(dropbox):
|
141 |
if ".pth" not in dropbox:
|
|
|
188 |
return None, relative_dataset_path
|
189 |
|
190 |
|
191 |
+
# Export
|
192 |
+
## Get Pth and Index Files
|
193 |
+
def get_pth_list():
|
194 |
+
return [
|
195 |
+
os.path.relpath(os.path.join(dirpath, filename), now_dir)
|
196 |
+
for dirpath, _, filenames in os.walk(models_path)
|
197 |
+
for filename in filenames
|
198 |
+
if filename.endswith(".pth")
|
199 |
+
]
|
200 |
+
|
201 |
+
|
202 |
+
def get_index_list():
|
203 |
+
return [
|
204 |
+
os.path.relpath(os.path.join(dirpath, filename), now_dir)
|
205 |
+
for dirpath, _, filenames in os.walk(models_path)
|
206 |
+
for filename in filenames
|
207 |
+
if filename.endswith(".index") and "trained" not in filename
|
208 |
+
]
|
209 |
+
|
210 |
+
|
211 |
+
def refresh_pth_and_index_list():
|
212 |
+
return (
|
213 |
+
{"choices": sorted(get_pth_list()), "__type__": "update"},
|
214 |
+
{"choices": sorted(get_index_list()), "__type__": "update"},
|
215 |
+
)
|
216 |
+
|
217 |
+
|
218 |
+
## Export Pth and Index Files
|
219 |
+
def export_pth(pth_path):
|
220 |
+
if pth_path and os.path.exists(pth_path):
|
221 |
+
return pth_path
|
222 |
+
return None
|
223 |
+
|
224 |
+
|
225 |
+
def export_index(index_path):
|
226 |
+
if index_path and os.path.exists(index_path):
|
227 |
+
return index_path
|
228 |
+
return None
|
229 |
+
|
230 |
+
|
231 |
+
## Upload to Google Drive
|
232 |
+
def upload_to_google_drive(pth_path, index_path):
|
233 |
+
def upload_file(file_path):
|
234 |
+
if file_path:
|
235 |
+
try:
|
236 |
+
gr.Info(f"Uploading {pth_path} to Google Drive...")
|
237 |
+
google_drive_folder = "/content/drive/MyDrive/ApplioExported"
|
238 |
+
if not os.path.exists(google_drive_folder):
|
239 |
+
os.makedirs(google_drive_folder)
|
240 |
+
google_drive_file_path = os.path.join(
|
241 |
+
google_drive_folder, os.path.basename(file_path)
|
242 |
+
)
|
243 |
+
if os.path.exists(google_drive_file_path):
|
244 |
+
os.remove(google_drive_file_path)
|
245 |
+
shutil.copy2(file_path, google_drive_file_path)
|
246 |
+
gr.Info("File uploaded successfully.")
|
247 |
+
except Exception as error:
|
248 |
+
print(error)
|
249 |
+
gr.Info("Error uploading to Google Drive")
|
250 |
+
|
251 |
+
upload_file(pth_path)
|
252 |
+
upload_file(index_path)
|
253 |
+
|
254 |
+
|
255 |
# Train Tab
|
256 |
def train_tab():
|
257 |
with gr.Accordion(i18n("Preprocess")):
|
258 |
with gr.Row():
|
259 |
with gr.Column():
|
260 |
+
model_name = gr.Dropdown(
|
261 |
label=i18n("Model Name"),
|
262 |
+
info=i18n("Name of the new model."),
|
263 |
+
choices=get_models_list(),
|
264 |
value="my-project",
|
265 |
interactive=True,
|
266 |
+
allow_custom_value=True,
|
267 |
)
|
268 |
dataset_path = gr.Dropdown(
|
269 |
label=i18n("Dataset Path"),
|
270 |
+
info=i18n("Path to the dataset folder."),
|
271 |
# placeholder=i18n("Enter dataset path"),
|
272 |
choices=get_datasets_list(),
|
273 |
allow_custom_value=True,
|
274 |
interactive=True,
|
275 |
)
|
276 |
+
refresh = gr.Button(i18n("Refresh"))
|
277 |
dataset_creator = gr.Checkbox(
|
278 |
label=i18n("Dataset Creator"),
|
279 |
value=False,
|
|
|
282 |
)
|
283 |
|
284 |
with gr.Column(visible=False) as dataset_creator_settings:
|
285 |
+
with gr.Accordion(i18n("Dataset Creator")):
|
286 |
dataset_name = gr.Textbox(
|
287 |
label=i18n("Dataset Name"),
|
288 |
+
info=i18n("Name of the new dataset."),
|
289 |
placeholder=i18n("Enter dataset name"),
|
290 |
interactive=True,
|
291 |
)
|
|
|
298 |
with gr.Column():
|
299 |
sampling_rate = gr.Radio(
|
300 |
label=i18n("Sampling Rate"),
|
301 |
+
info=i18n("The sampling rate of the audio files."),
|
302 |
choices=["32000", "40000", "48000"],
|
303 |
value="40000",
|
304 |
interactive=True,
|
|
|
306 |
|
307 |
rvc_version = gr.Radio(
|
308 |
label=i18n("RVC Version"),
|
309 |
+
info=i18n("The RVC version of the model."),
|
310 |
choices=["v1", "v2"],
|
311 |
value="v2",
|
312 |
interactive=True,
|
|
|
314 |
|
315 |
preprocess_output_info = gr.Textbox(
|
316 |
label=i18n("Output Information"),
|
317 |
+
info=i18n("The output information will be displayed here."),
|
318 |
value="",
|
319 |
max_lines=8,
|
320 |
interactive=False,
|
|
|
332 |
with gr.Accordion(i18n("Extract")):
|
333 |
with gr.Row():
|
334 |
hop_length = gr.Slider(
|
335 |
+
1,
|
336 |
+
512,
|
337 |
+
128,
|
338 |
+
step=1,
|
339 |
+
label=i18n("Hop Length"),
|
340 |
+
info=i18n(
|
341 |
+
"Denotes the duration it takes for the system to transition to a significant pitch change. Smaller hop lengths require more time for inference but tend to yield higher pitch accuracy."
|
342 |
+
),
|
343 |
+
interactive=True,
|
344 |
+
visible=False,
|
345 |
)
|
346 |
with gr.Row():
|
347 |
with gr.Column():
|
348 |
f0method = gr.Radio(
|
349 |
label=i18n("Pitch extraction algorithm"),
|
350 |
+
info=i18n(
|
351 |
+
"Pitch extraction algorithm to use for the audio conversion. The default algorithm is rmvpe, which is recommended for most cases."
|
352 |
+
),
|
353 |
choices=["pm", "dio", "crepe", "crepe-tiny", "harvest", "rmvpe"],
|
354 |
value="rmvpe",
|
355 |
interactive=True,
|
|
|
357 |
|
358 |
extract_output_info = gr.Textbox(
|
359 |
label=i18n("Output Information"),
|
360 |
+
info=i18n("The output information will be displayed here."),
|
361 |
value="",
|
362 |
max_lines=8,
|
363 |
interactive=False,
|
|
|
378 |
max_vram_gpu(0),
|
379 |
step=1,
|
380 |
label=i18n("Batch Size"),
|
381 |
+
info=i18n(
|
382 |
+
"It's advisable to align it with the available VRAM of your GPU. A setting of 4 offers improved accuracy but slower processing, while 8 provides faster and standard results."
|
383 |
+
),
|
384 |
interactive=True,
|
385 |
)
|
386 |
save_every_epoch = gr.Slider(
|
387 |
+
1,
|
388 |
+
100,
|
389 |
+
10,
|
390 |
+
step=1,
|
391 |
+
label=i18n("Save Every Epoch"),
|
392 |
+
info=i18n("Determine at how many epochs the model will saved at."),
|
393 |
+
interactive=True,
|
394 |
)
|
395 |
total_epoch = gr.Slider(
|
396 |
+
1,
|
397 |
+
10000,
|
398 |
+
500,
|
399 |
+
step=1,
|
400 |
+
label=i18n("Total Epoch"),
|
401 |
+
info=i18n(
|
402 |
+
"Specifies the overall quantity of epochs for the model training process."
|
403 |
+
),
|
404 |
+
interactive=True,
|
405 |
)
|
406 |
with gr.Row():
|
407 |
pitch_guidance = gr.Checkbox(
|
408 |
+
label=i18n("Pitch Guidance"),
|
409 |
+
info=i18n(
|
410 |
+
"By employing pitch guidance, it becomes feasible to mirror the intonation of the original voice, including its pitch. This feature is particularly valuable for singing and other scenarios where preserving the original melody or pitch pattern is essential."
|
411 |
+
),
|
412 |
+
value=True,
|
413 |
+
interactive=True,
|
414 |
)
|
415 |
pretrained = gr.Checkbox(
|
416 |
+
label=i18n("Pretrained"),
|
417 |
+
info=i18n(
|
418 |
+
"Utilize pretrained models when training your own. This approach reduces training duration and enhances overall quality."
|
419 |
+
),
|
420 |
+
value=True,
|
421 |
+
interactive=True,
|
422 |
)
|
423 |
save_only_latest = gr.Checkbox(
|
424 |
+
label=i18n("Save Only Latest"),
|
425 |
+
info=i18n(
|
426 |
+
"Enabling this setting will result in the G and D files saving only their most recent versions, effectively conserving storage space."
|
427 |
+
),
|
428 |
+
value=False,
|
429 |
+
interactive=True,
|
430 |
)
|
431 |
save_every_weights = gr.Checkbox(
|
432 |
label=i18n("Save Every Weights"),
|
433 |
+
info=i18n(
|
434 |
+
"This setting enables you to save the weights of the model at the conclusion of each epoch."
|
435 |
+
),
|
436 |
value=True,
|
437 |
interactive=True,
|
438 |
)
|
439 |
custom_pretrained = gr.Checkbox(
|
440 |
+
label=i18n("Custom Pretrained"),
|
441 |
+
info=i18n(
|
442 |
+
"Utilizing custom pretrained models can lead to superior results, as selecting the most suitable pretrained models tailored to the specific use case can significantly enhance performance."
|
443 |
+
),
|
444 |
+
value=False,
|
445 |
+
interactive=True,
|
446 |
)
|
447 |
multiple_gpu = gr.Checkbox(
|
448 |
+
label=i18n("GPU Settings"),
|
449 |
+
info=(
|
450 |
+
i18n(
|
451 |
+
"Sets advanced GPU settings, recommended for users with better GPU architecture."
|
452 |
+
)
|
453 |
+
),
|
454 |
+
value=False,
|
455 |
+
interactive=True,
|
456 |
+
)
|
457 |
+
overtraining_detector = gr.Checkbox(
|
458 |
+
label=i18n("Overtraining Detector"),
|
459 |
+
info=i18n(
|
460 |
+
"Detect overtraining to prevent the model from learning the training data too well and losing the ability to generalize to new data."
|
461 |
+
),
|
462 |
+
value=False,
|
463 |
+
interactive=True,
|
464 |
)
|
465 |
|
466 |
with gr.Row():
|
467 |
with gr.Column(visible=False) as pretrained_custom_settings:
|
468 |
+
with gr.Accordion(i18n("Pretrained Custom Settings")):
|
469 |
upload_pretrained = gr.File(
|
470 |
label=i18n("Upload Pretrained Model"),
|
471 |
type="filepath",
|
|
|
476 |
)
|
477 |
g_pretrained_path = gr.Dropdown(
|
478 |
label=i18n("Custom Pretrained G"),
|
479 |
+
info=i18n(
|
480 |
+
"Select the custom pretrained model for the generator."
|
481 |
+
),
|
482 |
choices=sorted(pretraineds_list_g),
|
483 |
interactive=True,
|
484 |
allow_custom_value=True,
|
485 |
)
|
486 |
d_pretrained_path = gr.Dropdown(
|
487 |
label=i18n("Custom Pretrained D"),
|
488 |
+
info=i18n(
|
489 |
+
"Select the custom pretrained model for the discriminator."
|
490 |
+
),
|
491 |
choices=sorted(pretraineds_list_d),
|
492 |
interactive=True,
|
493 |
allow_custom_value=True,
|
494 |
)
|
495 |
with gr.Column(visible=False) as gpu_custom_settings:
|
496 |
+
with gr.Accordion(i18n("GPU Settings")):
|
497 |
gpu = gr.Textbox(
|
498 |
label=i18n("GPU Number"),
|
499 |
+
info=i18n(
|
500 |
+
"Specify the number of GPUs you wish to utilize for training by entering them separated by hyphens (-)."
|
501 |
+
),
|
502 |
placeholder=i18n("0 to ∞ separated by -"),
|
503 |
value="0",
|
504 |
interactive=True,
|
505 |
)
|
506 |
gr.Textbox(
|
507 |
label=i18n("GPU Information"),
|
508 |
+
info=i18n("The GPU information will be displayed here."),
|
509 |
value=get_gpu_info(),
|
510 |
interactive=False,
|
511 |
)
|
512 |
+
with gr.Column(visible=False) as overtraining_settings:
|
513 |
+
with gr.Accordion(i18n("Overtraining Detector Settings")):
|
514 |
+
overtraining_threshold = gr.Slider(
|
515 |
+
1,
|
516 |
+
100,
|
517 |
+
50,
|
518 |
+
step=1,
|
519 |
+
label=i18n("Overtraining Threshold"),
|
520 |
+
info=i18n(
|
521 |
+
"Set the maximum number of epochs you want your model to stop training if no improvement is detected."
|
522 |
+
),
|
523 |
+
interactive=True,
|
524 |
+
)
|
525 |
|
526 |
with gr.Row():
|
527 |
train_output_info = gr.Textbox(
|
528 |
label=i18n("Output Information"),
|
529 |
+
info=i18n("The output information will be displayed here."),
|
530 |
value="",
|
531 |
max_lines=8,
|
532 |
interactive=False,
|
|
|
547 |
batch_size,
|
548 |
gpu,
|
549 |
pitch_guidance,
|
550 |
+
overtraining_detector,
|
551 |
+
overtraining_threshold,
|
552 |
pretrained,
|
553 |
custom_pretrained,
|
554 |
g_pretrained_path,
|
|
|
558 |
api_name="start_training",
|
559 |
)
|
560 |
|
561 |
+
stop_train_button = gr.Button(
|
562 |
+
i18n("Stop Training & Restart Applio"), visible=False
|
563 |
+
)
|
564 |
+
stop_train_button.click(
|
565 |
+
fn=restart_applio,
|
566 |
+
inputs=[],
|
567 |
+
outputs=[],
|
568 |
+
)
|
569 |
+
|
570 |
index_button = gr.Button(i18n("Generate Index"))
|
571 |
index_button.click(
|
572 |
run_index_script,
|
|
|
575 |
api_name="generate_index",
|
576 |
)
|
577 |
|
578 |
+
with gr.Accordion(i18n("Export Model"), open=False):
|
579 |
+
if not os.name == "nt":
|
580 |
+
gr.Markdown(
|
581 |
+
i18n(
|
582 |
+
"The button 'Upload' is only for google colab: Uploads the exported files to the ApplioExported folder in your Google Drive."
|
583 |
+
)
|
584 |
+
)
|
585 |
+
with gr.Row():
|
586 |
+
with gr.Column():
|
587 |
+
pth_file_export = gr.File(
|
588 |
+
label=i18n("Exported Pth file"),
|
589 |
+
type="filepath",
|
590 |
+
value=None,
|
591 |
+
interactive=False,
|
592 |
+
)
|
593 |
+
pth_dropdown_export = gr.Dropdown(
|
594 |
+
label=i18n("Pth file"),
|
595 |
+
info=i18n("Select the pth file to be exported"),
|
596 |
+
choices=get_pth_list(),
|
597 |
+
value=None,
|
598 |
+
interactive=True,
|
599 |
+
allow_custom_value=True,
|
600 |
+
)
|
601 |
+
with gr.Column():
|
602 |
+
index_file_export = gr.File(
|
603 |
+
label=i18n("Exported Index File"),
|
604 |
+
type="filepath",
|
605 |
+
value=None,
|
606 |
+
interactive=False,
|
607 |
+
)
|
608 |
+
index_dropdown_export = gr.Dropdown(
|
609 |
+
label=i18n("Index File"),
|
610 |
+
info=i18n("Select the index file to be exported"),
|
611 |
+
choices=get_index_list(),
|
612 |
+
value=None,
|
613 |
+
interactive=True,
|
614 |
+
allow_custom_value=True,
|
615 |
+
)
|
616 |
+
with gr.Row():
|
617 |
+
with gr.Column():
|
618 |
+
refresh_export = gr.Button(i18n("Refresh"))
|
619 |
+
if not os.name == "nt":
|
620 |
+
upload_exported = gr.Button(i18n("Upload"), variant="primary")
|
621 |
+
upload_exported.click(
|
622 |
+
fn=upload_to_google_drive,
|
623 |
+
inputs=[pth_dropdown_export, index_dropdown_export],
|
624 |
+
outputs=[],
|
625 |
+
)
|
626 |
+
|
627 |
def toggle_visible(checkbox):
|
628 |
return {"visible": checkbox, "__type__": "update"}
|
629 |
|
630 |
+
def toggle_visible_hop_length(f0method):
|
631 |
+
if f0method == "crepe" or f0method == "crepe-tiny":
|
632 |
+
return {"visible": True, "__type__": "update"}
|
633 |
+
return {"visible": False, "__type__": "update"}
|
634 |
+
|
635 |
+
def toggle_pretrained(pretrained, custom_pretrained):
|
636 |
+
if custom_pretrained == False:
|
637 |
+
return {"visible": pretrained, "__type__": "update"}, {
|
638 |
+
"visible": False,
|
639 |
+
"__type__": "update",
|
640 |
+
}
|
641 |
+
else:
|
642 |
+
return {"visible": pretrained, "__type__": "update"}, {
|
643 |
+
"visible": pretrained,
|
644 |
+
"__type__": "update",
|
645 |
+
}
|
646 |
+
|
647 |
+
def enable_stop_train_button():
|
648 |
+
return {"visible": False, "__type__": "update"}, {
|
649 |
+
"visible": True,
|
650 |
+
"__type__": "update",
|
651 |
+
}
|
652 |
+
|
653 |
+
def disable_stop_train_button():
|
654 |
+
return {"visible": True, "__type__": "update"}, {
|
655 |
+
"visible": False,
|
656 |
+
"__type__": "update",
|
657 |
+
}
|
658 |
+
|
659 |
+
def download_prerequisites(version):
|
660 |
+
for remote_folder, file_list in pretraineds_v1:
|
661 |
+
local_folder = folder_mapping.get(remote_folder, "")
|
662 |
+
missing = False
|
663 |
+
for file in file_list:
|
664 |
+
destination_path = os.path.join(local_folder, file)
|
665 |
+
if not os.path.exists(destination_path):
|
666 |
+
missing = True
|
667 |
+
if version == "v1" and missing == True:
|
668 |
+
gr.Info(
|
669 |
+
"Downloading prerequisites... Please wait till it finishes to start preprocessing."
|
670 |
+
)
|
671 |
+
run_prerequisites_script("True", "False", "True", "True")
|
672 |
+
gr.Info(
|
673 |
+
"Prerequisites downloaded successfully, you may now start preprocessing."
|
674 |
+
)
|
675 |
+
|
676 |
+
rvc_version.change(
|
677 |
+
fn=download_prerequisites,
|
678 |
+
inputs=[rvc_version],
|
679 |
+
outputs=[],
|
680 |
+
)
|
681 |
+
|
682 |
+
refresh.click(
|
683 |
+
fn=refresh_models_and_datasets,
|
684 |
inputs=[],
|
685 |
+
outputs=[model_name, dataset_path],
|
686 |
)
|
687 |
|
688 |
dataset_creator.change(
|
|
|
697 |
outputs=[upload_audio_dataset, dataset_path],
|
698 |
)
|
699 |
|
700 |
+
f0method.change(
|
701 |
+
fn=toggle_visible_hop_length,
|
702 |
+
inputs=[f0method],
|
703 |
+
outputs=[hop_length],
|
704 |
+
)
|
705 |
+
|
706 |
+
pretrained.change(
|
707 |
+
fn=toggle_pretrained,
|
708 |
+
inputs=[pretrained, custom_pretrained],
|
709 |
+
outputs=[custom_pretrained, pretrained_custom_settings],
|
710 |
+
)
|
711 |
+
|
712 |
custom_pretrained.change(
|
713 |
fn=toggle_visible,
|
714 |
inputs=[custom_pretrained],
|
|
|
727 |
outputs=[upload_pretrained],
|
728 |
)
|
729 |
|
730 |
+
overtraining_detector.change(
|
731 |
+
fn=toggle_visible,
|
732 |
+
inputs=[overtraining_detector],
|
733 |
+
outputs=[overtraining_settings],
|
734 |
+
)
|
735 |
+
|
736 |
multiple_gpu.change(
|
737 |
fn=toggle_visible,
|
738 |
inputs=[multiple_gpu],
|
739 |
outputs=[gpu_custom_settings],
|
740 |
)
|
741 |
+
|
742 |
+
train_button.click(
|
743 |
+
fn=enable_stop_train_button,
|
744 |
+
inputs=[],
|
745 |
+
outputs=[train_button, stop_train_button],
|
746 |
+
)
|
747 |
+
|
748 |
+
train_output_info.change(
|
749 |
+
fn=disable_stop_train_button,
|
750 |
+
inputs=[],
|
751 |
+
outputs=[train_button, stop_train_button],
|
752 |
+
)
|
753 |
+
|
754 |
+
pth_dropdown_export.change(
|
755 |
+
fn=export_pth,
|
756 |
+
inputs=[pth_dropdown_export],
|
757 |
+
outputs=[pth_file_export],
|
758 |
+
)
|
759 |
+
|
760 |
+
index_dropdown_export.change(
|
761 |
+
fn=export_index,
|
762 |
+
inputs=[index_dropdown_export],
|
763 |
+
outputs=[index_file_export],
|
764 |
+
)
|
765 |
+
|
766 |
+
refresh_export.click(
|
767 |
+
fn=refresh_pth_and_index_list,
|
768 |
+
inputs=[],
|
769 |
+
outputs=[pth_dropdown_export, index_dropdown_export],
|
770 |
+
)
|
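The start/stop wiring above (enable_stop_train_button on train_button.click, disable_stop_train_button on train_output_info.change) can be hard to follow inside the full tab. A stripped-down sketch of the same visibility swap, with hypothetical component names:

import gradio as gr

def show_stop():
    # hide the start button, reveal the stop button
    return gr.update(visible=False), gr.update(visible=True)

def show_start():
    # the job wrote its final message, so flip the buttons back
    return gr.update(visible=True), gr.update(visible=False)

with gr.Blocks() as demo:
    output_box = gr.Textbox(label="Output Information")
    start_button = gr.Button("Start Training")
    stop_button = gr.Button("Stop Training", visible=False)
    start_button.click(fn=show_stop, inputs=[], outputs=[start_button, stop_button])
    output_box.change(fn=show_start, inputs=[], outputs=[start_button, stop_button])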
tabs/tts/tts.py
CHANGED
@@ -2,8 +2,6 @@ import os, sys
|
|
2 |
import gradio as gr
|
3 |
import regex as re
|
4 |
import json
|
5 |
-
import shutil
|
6 |
-
import datetime
|
7 |
import random
|
8 |
|
9 |
from core import (
|
@@ -18,26 +16,7 @@ now_dir = os.getcwd()
|
|
18 |
sys.path.append(now_dir)
|
19 |
|
20 |
model_root = os.path.join(now_dir, "logs")
|
21 |
-
audio_root = os.path.join(now_dir, "assets", "audios")
|
22 |
-
|
23 |
model_root_relative = os.path.relpath(model_root, now_dir)
|
24 |
-
audio_root_relative = os.path.relpath(audio_root, now_dir)
|
25 |
-
|
26 |
-
sup_audioext = {
|
27 |
-
"wav",
|
28 |
-
"mp3",
|
29 |
-
"flac",
|
30 |
-
"ogg",
|
31 |
-
"opus",
|
32 |
-
"m4a",
|
33 |
-
"mp4",
|
34 |
-
"aac",
|
35 |
-
"alac",
|
36 |
-
"wma",
|
37 |
-
"aiff",
|
38 |
-
"webm",
|
39 |
-
"ac3",
|
40 |
-
}
|
41 |
|
42 |
names = [
|
43 |
os.path.join(root, file)
|
@@ -56,15 +35,6 @@ indexes_list = [
|
|
56 |
if name.endswith(".index") and "trained" not in name
|
57 |
]
|
58 |
|
59 |
-
audio_paths = [
|
60 |
-
os.path.join(root, name)
|
61 |
-
for root, _, files in os.walk(audio_root_relative, topdown=False)
|
62 |
-
for name in files
|
63 |
-
if name.endswith(tuple(sup_audioext))
|
64 |
-
and root == audio_root_relative
|
65 |
-
and "_output" not in name
|
66 |
-
]
|
67 |
-
|
68 |
|
69 |
def change_choices():
|
70 |
names = [
|
@@ -83,19 +53,9 @@ def change_choices():
|
|
83 |
for name in files
|
84 |
if name.endswith(".index") and "trained" not in name
|
85 |
]
|
86 |
-
|
87 |
-
audio_paths = [
|
88 |
-
os.path.join(root, name)
|
89 |
-
for root, _, files in os.walk(audio_root_relative, topdown=False)
|
90 |
-
for name in files
|
91 |
-
if name.endswith(tuple(sup_audioext))
|
92 |
-
and root == audio_root_relative
|
93 |
-
and "_output" not in name
|
94 |
-
]
|
95 |
return (
|
96 |
{"choices": sorted(names), "__type__": "update"},
|
97 |
{"choices": sorted(indexes_list), "__type__": "update"},
|
98 |
-
{"choices": sorted(audio_paths), "__type__": "update"},
|
    )


@@ -110,93 +70,30 @@ def get_indexes():
    return indexes_list if indexes_list else ""


-def match_index(model_file_name):
-    # Check if the sid0strip has the specific ending format _eXXX_sXXX
-    if re.match(r".+_e\d+_s\d+$", model_file_name):
-        base_model_name = model_file_name.rsplit("_", 2)[0]
-    else:
-        base_model_name = model_file_name
-
-    sid_directory = os.path.join(model_root_relative, base_model_name)
-    directories_to_search = [sid_directory] if os.path.exists(sid_directory) else []
-    directories_to_search.append(model_root_relative)
-
-    matching_index_files = []
-
-    for directory in directories_to_search:
-        for filename in os.listdir(directory):
-            if filename.endswith(".index") and "trained" not in filename:
-                # Condition to match the name
-                name_match = any(
-                    name.lower() in filename.lower()
-                    for name in [model_file_name, base_model_name]
-                )
-
-                # If in the specific directory, it's automatically a match
-                folder_match = directory == sid_directory
-
-                if name_match or folder_match:
-                    index_path = os.path.join(directory, filename)
-                    if index_path in indexes_list:
-                        matching_index_files.append(
-                            (
-                                index_path,
-                                os.path.getsize(index_path),
-                                " " not in filename,
-                            )
-                        )

-    if matching_index_files:
-        # Sort by favoring files without spaces and by size (largest size first)
-        matching_index_files.sort(key=lambda x: (-x[2], -x[1]))
-        best_match_index_path = matching_index_files[0][0]
-        return best_match_index_path

    return ""


-def save_to_wav(record_button):
-    if record_button is None:
-        pass
-    else:
-        path_to_file = record_button
-        new_name = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + ".wav"
-        target_path = os.path.join(audio_root_relative, os.path.basename(new_name))
-
-        shutil.move(path_to_file, target_path)
-        return target_path
-
-
-def save_to_wav2(upload_audio):
-    file_path = upload_audio
-    target_path = os.path.join(audio_root_relative, os.path.basename(file_path))
-
-    if os.path.exists(target_path):
-        os.remove(target_path)
-
-    shutil.copy(file_path, target_path)
-    return target_path
-
-
-def delete_outputs():
-    for root, _, files in os.walk(audio_root_relative, topdown=False):
-        for name in files:
-            if name.endswith(tuple(sup_audioext)) and name.__contains__("_output"):
-                os.remove(os.path.join(root, name))
-    gr.Info(f"Outputs cleared!")
-
-
def tts_tab():
    default_weight = random.choice(names) if names else ""
    with gr.Row():
        with gr.Row():
            model_file = gr.Dropdown(
                label=i18n("Voice Model"),
                choices=sorted(names, key=lambda path: os.path.getsize(path)),
                interactive=True,
                value=default_weight,
@@ -205,6 +102,7 @@ def tts_tab():
            best_default_index_path = match_index(model_file.value)
            index_file = gr.Dropdown(
                label=i18n("Index File"),
                choices=get_indexes(),
                value=best_default_index_path,
                interactive=True,
@@ -215,13 +113,16 @@ def tts_tab():
            unload_button = gr.Button(i18n("Unload Voice"))

            unload_button.click(
-                fn=lambda: (
                inputs=[],
-                outputs=[model_file],
            )

            model_file.select(
-                fn=match_index,
                inputs=[model_file],
                outputs=[index_file],
            )
@@ -234,6 +135,7 @@ def tts_tab():

    tts_voice = gr.Dropdown(
        label=i18n("TTS Voices"),
        choices=short_names,
        interactive=True,
        value=None,
@@ -241,10 +143,16 @@ def tts_tab():

    tts_text = gr.Textbox(
        label=i18n("Text to Synthesize"),
        placeholder=i18n("Enter text to synthesize"),
        lines=3,
    )

@@ -253,27 +161,74 @@ def tts_tab():
                value=os.path.join(now_dir, "assets", "audios", "tts_output.wav"),
                interactive=True,
            )
-
            output_rvc_path = gr.Textbox(
                label=i18n("Output Path for RVC Audio"),
                placeholder=i18n("Enter output path"),
                value=os.path.join(now_dir, "assets", "audios", "tts_rvc_output.wav"),
                interactive=True,
            )
-
            pitch = gr.Slider(
                minimum=-24,
                maximum=24,
                step=1,
                label=i18n("Pitch"),
                value=0,
                interactive=True,
            )
            filter_radius = gr.Slider(
                minimum=0,
                maximum=7,
-                label=i18n(
-
                ),
                value=3,
                step=1,
@@ -283,43 +238,90 @@ def tts_tab():
                minimum=0,
                maximum=1,
                label=i18n("Search Feature Ratio"),
                value=0.75,
                interactive=True,
            )
            hop_length = gr.Slider(
                minimum=1,
                maximum=512,
                step=1,
                label=i18n("Hop Length"),
                value=128,
                interactive=True,
            )
-
-
-
-
-
-
-
-
-
-
-
-
-
-

    convert_button1 = gr.Button(i18n("Convert"))

    with gr.Row():  # Defines output info + output audio download after conversion
-        vc_output1 = gr.Textbox(
        vc_output2 = gr.Audio(label=i18n("Export Audio"))

    refresh_button.click(
        fn=change_choices,
        inputs=[],
        outputs=[model_file, index_file],
    )
    convert_button1.click(
        fn=run_tts_script,
        inputs=[
@@ -328,12 +330,19 @@ def tts_tab():
            pitch,
            filter_radius,
            index_rate,
            hop_length,
            f0method,
            output_tts_path,
            output_rvc_path,
            model_file,
            index_file,
        ],
        outputs=[vc_output1, vc_output2],
    )
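The unload_button rewiring removed above (and echoed in the updated file listing below) relies on Gradio's update-dict protocol: a callback returns one {"__type__": "update", ...} payload per output component instead of plain values. A minimal, standalone sketch of that pattern, assuming the same Gradio version the repository uses; the component choices and file paths here are illustrative, not part of the commit:

import gradio as gr

def unload_voice():
    # One update dict per output component: clear both dropdowns at once.
    return (
        {"value": "", "__type__": "update"},
        {"value": "", "__type__": "update"},
    )

with gr.Blocks() as demo:
    model_file = gr.Dropdown(choices=["logs/MyVoice/MyVoice.pth"], label="Voice Model")
    index_file = gr.Dropdown(choices=["logs/MyVoice/added.index"], label="Index File")
    unload_button = gr.Button("Unload Voice")
    unload_button.click(fn=unload_voice, inputs=[], outputs=[model_file, index_file])

if __name__ == "__main__":
    demo.launch()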
  2  import gradio as gr
  3  import regex as re
  4  import json
  5  import random
  6
  7  from core import (
 16  sys.path.append(now_dir)
 17
 18  model_root = os.path.join(now_dir, "logs")
 19  model_root_relative = os.path.relpath(model_root, now_dir)
 20
 21  names = [
 22      os.path.join(root, file)
 35      if name.endswith(".index") and "trained" not in name
 36  ]
 37
 38
 39  def change_choices():
 40      names = [
 53          for name in files
 54          if name.endswith(".index") and "trained" not in name
 55      ]
 56      return (
 57          {"choices": sorted(names), "__type__": "update"},
 58          {"choices": sorted(indexes_list), "__type__": "update"},
 59      )
 60
 61
 70      return indexes_list if indexes_list else ""
 71
 72
 73 +def process_input(file_path):
 74 +    with open(file_path, "r") as file:
 75 +        file_contents = file.read()
 76 +    gr.Info(f"The text from the txt file has been loaded!")
 77 +    return file_contents, None
 78
 79
 80 +def match_index(model_file_value):
 81 +    if model_file_value:
 82 +        model_folder = os.path.dirname(model_file_value)
 83 +        index_files = get_indexes()
 84 +        for index_file in index_files:
 85 +            if os.path.dirname(index_file) == model_folder:
 86 +                return index_file
 87      return ""
 88
 89
 90  def tts_tab():
 91      default_weight = random.choice(names) if names else ""
 92      with gr.Row():
 93          with gr.Row():
 94              model_file = gr.Dropdown(
 95                  label=i18n("Voice Model"),
 96 +                info=i18n("Select the voice model to use for the conversion."),
 97                  choices=sorted(names, key=lambda path: os.path.getsize(path)),
 98                  interactive=True,
 99                  value=default_weight,
102              best_default_index_path = match_index(model_file.value)
103              index_file = gr.Dropdown(
104                  label=i18n("Index File"),
105 +                info=i18n("Select the index file to use for the conversion."),
106                  choices=get_indexes(),
107                  value=best_default_index_path,
108                  interactive=True,
113              unload_button = gr.Button(i18n("Unload Voice"))
114
115              unload_button.click(
116 +                fn=lambda: (
117 +                    {"value": "", "__type__": "update"},
118 +                    {"value": "", "__type__": "update"},
119 +                ),
120                  inputs=[],
121 +                outputs=[model_file, index_file],
122              )
123
124              model_file.select(
125 +                fn=lambda model_file_value: match_index(model_file_value),
126                  inputs=[model_file],
127                  outputs=[index_file],
128              )
135
136      tts_voice = gr.Dropdown(
137          label=i18n("TTS Voices"),
138 +        info=i18n("Select the TTS voice to use for the conversion."),
139          choices=short_names,
140          interactive=True,
141          value=None,
143
144      tts_text = gr.Textbox(
145          label=i18n("Text to Synthesize"),
146 +        info=i18n("Enter the text to synthesize."),
147          placeholder=i18n("Enter text to synthesize"),
148          lines=3,
149      )
150
151 +    txt_file = gr.File(
152 +        label=i18n("Or you can upload a .txt file"),
153 +        type="filepath",
154 +    )
155 +
156      with gr.Accordion(i18n("Advanced Settings"), open=False):
157          with gr.Column():
158              output_tts_path = gr.Textbox(
161                  value=os.path.join(now_dir, "assets", "audios", "tts_output.wav"),
162                  interactive=True,
163              )
164              output_rvc_path = gr.Textbox(
165                  label=i18n("Output Path for RVC Audio"),
166                  placeholder=i18n("Enter output path"),
167                  value=os.path.join(now_dir, "assets", "audios", "tts_rvc_output.wav"),
168                  interactive=True,
169              )
170 +            export_format = gr.Radio(
171 +                label=i18n("Export Format"),
172 +                info=i18n("Select the format to export the audio."),
173 +                choices=["WAV", "MP3", "FLAC", "OGG", "M4A"],
174 +                value="WAV",
175 +                interactive=True,
176 +            )
177 +            split_audio = gr.Checkbox(
178 +                label=i18n("Split Audio"),
179 +                info=i18n(
180 +                    "Split the audio into chunks for inference to obtain better results in some cases."
181 +                ),
182 +                visible=True,
183 +                value=False,
184 +                interactive=True,
185 +            )
186 +            autotune = gr.Checkbox(
187 +                label=i18n("Autotune"),
188 +                info=i18n(
189 +                    "Apply a soft autotune to your inferences, recommended for singing conversions."
190 +                ),
191 +                visible=True,
192 +                value=False,
193 +                interactive=True,
194 +            )
195 +            clean_audio = gr.Checkbox(
196 +                label=i18n("Clean Audio"),
197 +                info=i18n(
198 +                    "Clean your audio output using noise detection algorithms, recommended for speaking audios."
199 +                ),
200 +                visible=True,
201 +                value=True,
202 +                interactive=True,
203 +            )
204 +            clean_strength = gr.Slider(
205 +                minimum=0,
206 +                maximum=1,
207 +                label=i18n("Clean Strength"),
208 +                info=i18n(
209 +                    "Set the clean-up level to the audio you want, the more you increase it the more it will clean up, but it is possible that the audio will be more compressed."
210 +                ),
211 +                visible=True,
212 +                value=0.5,
213 +                interactive=True,
214 +            )
215              pitch = gr.Slider(
216                  minimum=-24,
217                  maximum=24,
218                  step=1,
219                  label=i18n("Pitch"),
220 +                info=i18n(
221 +                    "Set the pitch of the audio, the higher the value, the higher the pitch."
222 +                ),
223                  value=0,
224                  interactive=True,
225              )
226              filter_radius = gr.Slider(
227                  minimum=0,
228                  maximum=7,
229 +                label=i18n("Filter Radius"),
230 +                info=i18n(
231 +                    "If the number is greater than or equal to three, employing median filtering on the collected tone results has the potential to decrease respiration."
232                  ),
233                  value=3,
234                  step=1,
238                  minimum=0,
239                  maximum=1,
240                  label=i18n("Search Feature Ratio"),
241 +                info=i18n(
242 +                    "Influence exerted by the index file; a higher value corresponds to greater influence. However, opting for lower values can help mitigate artifacts present in the audio."
243 +                ),
244                  value=0.75,
245                  interactive=True,
246              )
247 +            rms_mix_rate = gr.Slider(
248 +                minimum=0,
249 +                maximum=1,
250 +                label=i18n("Volume Envelope"),
251 +                info=i18n(
252 +                    "Substitute or blend with the volume envelope of the output. The closer the ratio is to 1, the more the output envelope is employed."
253 +                ),
254 +                value=1,
255 +                interactive=True,
256 +            )
257 +            protect = gr.Slider(
258 +                minimum=0,
259 +                maximum=0.5,
260 +                label=i18n("Protect Voiceless Consonants"),
261 +                info=i18n(
262 +                    "Safeguard distinct consonants and breathing sounds to prevent electro-acoustic tearing and other artifacts. Pulling the parameter to its maximum value of 0.5 offers comprehensive protection. However, reducing this value might decrease the extent of protection while potentially mitigating the indexing effect."
263 +                ),
264 +                value=0.5,
265 +                interactive=True,
266 +            )
267              hop_length = gr.Slider(
268                  minimum=1,
269                  maximum=512,
270                  step=1,
271                  label=i18n("Hop Length"),
272 +                info=i18n(
273 +                    "Denotes the duration it takes for the system to transition to a significant pitch change. Smaller hop lengths require more time for inference but tend to yield higher pitch accuracy."
274 +                ),
275                  value=128,
276                  interactive=True,
277              )
278 +        with gr.Column():
279 +            f0method = gr.Radio(
280 +                label=i18n("Pitch extraction algorithm"),
281 +                info=i18n(
282 +                    "Pitch extraction algorithm to use for the audio conversion. The default algorithm is rmvpe, which is recommended for most cases."
283 +                ),
284 +                choices=[
285 +                    "pm",
286 +                    "harvest",
287 +                    "dio",
288 +                    "crepe",
289 +                    "crepe-tiny",
290 +                    "rmvpe",
291 +                    "fcpe",
292 +                    "hybrid[rmvpe+fcpe]",
293 +                ],
294 +                value="rmvpe",
295 +                interactive=True,
296 +            )
297
298      convert_button1 = gr.Button(i18n("Convert"))
299
300      with gr.Row():  # Defines output info + output audio download after conversion
301 +        vc_output1 = gr.Textbox(
302 +            label=i18n("Output Information"),
303 +            info=i18n("The output information will be displayed here."),
304 +        )
305          vc_output2 = gr.Audio(label=i18n("Export Audio"))
306
307 +    def toggle_visible(checkbox):
308 +        return {"visible": checkbox, "__type__": "update"}
309 +
310 +    clean_audio.change(
311 +        fn=toggle_visible,
312 +        inputs=[clean_audio],
313 +        outputs=[clean_strength],
314 +    )
315      refresh_button.click(
316          fn=change_choices,
317          inputs=[],
318          outputs=[model_file, index_file],
319      )
320 +    txt_file.upload(
321 +        fn=process_input,
322 +        inputs=[txt_file],
323 +        outputs=[tts_text, txt_file],
324 +    )
325      convert_button1.click(
326          fn=run_tts_script,
327          inputs=[
330              pitch,
331              filter_radius,
332              index_rate,
333 +            rms_mix_rate,
334 +            protect,
335              hop_length,
336              f0method,
337              output_tts_path,
338              output_rvc_path,
339              model_file,
340              index_file,
341 +            split_audio,
342 +            autotune,
343 +            clean_audio,
344 +            clean_strength,
345 +            export_format,
346          ],
347          outputs=[vc_output1, vc_output2],
348      )
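The most consequential behavioral change in tts.py is the new match_index: instead of the old name-based heuristic (suffix matching, directory search, and size-based ranking), it simply returns the first .index file that lives in the same folder as the selected model. A minimal sketch of that rule, with get_indexes() replaced by an explicit list and hypothetical logs/ paths used purely for illustration:

import os

def match_index(model_file_value, index_files):
    # New rule: the matching index is whichever .index file shares the
    # model's folder; the first hit wins, otherwise return "".
    if model_file_value:
        model_folder = os.path.dirname(model_file_value)
        for index_file in index_files:
            if os.path.dirname(index_file) == model_folder:
                return index_file
    return ""

indexes = [
    os.path.join("logs", "OtherVoice", "added_OtherVoice.index"),
    os.path.join("logs", "MyVoice", "added_MyVoice.index"),
]
print(match_index(os.path.join("logs", "MyVoice", "MyVoice.pth"), indexes))
# -> logs/MyVoice/added_MyVoice.index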
tabs/voice_blender/voice_blender.py
ADDED
@@ -0,0 +1,99 @@
+import os, sys
+import gradio as gr
+import shutil
+
+now_dir = os.getcwd()
+sys.path.append(now_dir)
+
+from assets.i18n.i18n import I18nAuto
+from core import run_model_blender_script
+
+i18n = I18nAuto()
+
+
+def update_model_fusion(dropbox):
+    return dropbox, None
+
+
+def voice_blender_tab():
+    gr.Markdown(i18n("## Voice Blender"))
+    gr.Markdown(
+        i18n(
+            "Select two voice models, set your desired blend percentage, and blend them into an entirely new voice."
+        )
+    )
+    with gr.Column():
+        model_fusion_name = gr.Textbox(
+            label=i18n("Model Name"),
+            info=i18n("Name of the new model."),
+            value="",
+            max_lines=1,
+            interactive=True,
+            placeholder=i18n("Enter model name"),
+        )
+        with gr.Row():
+            with gr.Column():
+                model_fusion_a_dropbox = gr.File(
+                    label=i18n("Drag and drop your model here"), type="filepath"
+                )
+                model_fusion_a = gr.Textbox(
+                    label=i18n("Path to Model"),
+                    value="",
+                    interactive=True,
+                    placeholder=i18n("Enter path to model"),
+                    info=i18n("You can also use a custom path."),
+                )
+            with gr.Column():
+                model_fusion_b_dropbox = gr.File(
+                    label=i18n("Drag and drop your model here"), type="filepath"
+                )
+                model_fusion_b = gr.Textbox(
+                    label=i18n("Path to Model"),
+                    value="",
+                    interactive=True,
+                    placeholder=i18n("Enter path to model"),
+                    info=i18n("You can also use a custom path."),
+                )
+        alpha_a = gr.Slider(
+            minimum=0,
+            maximum=1,
+            label=i18n("Blend Ratio"),
+            value=0.5,
+            interactive=True,
+            info=i18n(
+                "Adjusting the position more towards one side or the other will make the model more similar to the first or second."
+            ),
+        )
+        model_fusion_button = gr.Button(i18n("Fusion"), variant="primary")
+    with gr.Row():
+        model_fusion_output_info = gr.Textbox(
+            label=i18n("Output Information"),
+            info=i18n("The output information will be displayed here."),
+            value="",
+        )
+        model_fusion_pth_output = gr.File(
+            label=i18n("Download Model"), type="filepath", interactive=False
+        )
+
+    model_fusion_button.click(
+        fn=run_model_blender_script,
+        inputs=[
+            model_fusion_name,
+            model_fusion_a,
+            model_fusion_b,
+            alpha_a,
+        ],
+        outputs=[model_fusion_output_info, model_fusion_pth_output],
+    )
+
+    model_fusion_a_dropbox.upload(
+        fn=update_model_fusion,
+        inputs=model_fusion_a_dropbox,
+        outputs=[model_fusion_a, model_fusion_a_dropbox],
+    )
+
+    model_fusion_b_dropbox.upload(
+        fn=update_model_fusion,
+        inputs=model_fusion_b_dropbox,
+        outputs=[model_fusion_b, model_fusion_b_dropbox],
+    )
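For reference, the Fusion button in this new tab maps straight onto run_model_blender_script from core.py (not shown in this diff), passing the new model name, both model paths, and the blend ratio in that order. A hedged usage sketch: the model paths are hypothetical, and the two return values (a status message and the path of the blended .pth) are assumed from the two Gradio outputs they feed, not confirmed by this diff:

import os
from core import run_model_blender_script

# Blend two hypothetical models 50/50; the returns are assumed to match the
# Gradio outputs (model_fusion_output_info, model_fusion_pth_output).
message, blended_pth = run_model_blender_script(
    "MyBlendedVoice",                               # model_fusion_name
    os.path.join("logs", "VoiceA", "VoiceA.pth"),   # model_fusion_a
    os.path.join("logs", "VoiceB", "VoiceB.pth"),   # model_fusion_b
    0.5,                                            # alpha_a (blend ratio)
)
print(message, blended_pth)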