Aitron Emper committed
Commit 1a7d583
1 parent: dc8f793

Upload 74 files
app.py CHANGED
@@ -13,24 +13,35 @@ from tabs.extra.extra import extra_tab
 from tabs.report.report import report_tab
 from tabs.download.download import download_tab
 from tabs.tts.tts import tts_tab
-from tabs.settings.presence import presence_tab
 from tabs.settings.themes import theme_tab
 from tabs.plugins.plugins import plugins_tab

 # Assets
 import assets.themes.loadThemes as loadThemes
 from assets.i18n.i18n import I18nAuto
 import assets.installation_checker as installation_checker
 from assets.discord_presence import RPCManager
-import assets.delete_models as delete_models

-delete_models.start_infinite_loop()
 i18n = I18nAuto()
-RPCManager.start_presence()
 installation_checker.check_installation()
 logging.getLogger("uvicorn").disabled = True
 logging.getLogger("fairseq").disabled = True
-logging.getLogger("h11").disabled = True

 my_applio = loadThemes.load_json()
 if my_applio:
@@ -53,17 +64,24 @@ with gr.Blocks(theme=my_applio, title="Applio") as Applio:
     with gr.Tab(i18n("Inference")):
         inference_tab()

-    # with gr.Tab(i18n("Train")):
-    #     train_tab()

     with gr.Tab(i18n("TTS")):
         tts_tab()

-    with gr.Tab(i18n("Extra")):
-        extra_tab()

-    # with gr.Tab(i18n("Plugins")):
-    #     plugins_tab()

     with gr.Tab(i18n("Download")):
         download_tab()
@@ -71,10 +89,30 @@ with gr.Blocks(theme=my_applio, title="Applio") as Applio:
     with gr.Tab(i18n("Report a Bug")):
         report_tab()

-    # with gr.Tab(i18n("Settings")):
-    #     presence_tab()
-    #     theme_tab()


 if __name__ == "__main__":
-    Applio.launch()

 from tabs.report.report import report_tab
 from tabs.download.download import download_tab
 from tabs.tts.tts import tts_tab
+from tabs.voice_blender.voice_blender import voice_blender_tab
+from tabs.settings.presence import presence_tab, load_config_presence
+from tabs.settings.flask_server import flask_server_tab
+from tabs.settings.fake_gpu import fake_gpu_tab, gpu_available, load_fake_gpu
 from tabs.settings.themes import theme_tab
 from tabs.plugins.plugins import plugins_tab
+from tabs.settings.version import version_tab
+from tabs.settings.lang import lang_tab
+from tabs.settings.restart import restart_tab

 # Assets
 import assets.themes.loadThemes as loadThemes
 from assets.i18n.i18n import I18nAuto
 import assets.installation_checker as installation_checker
 from assets.discord_presence import RPCManager
+from assets.flask.server import start_flask, load_config_flask
+from core import run_prerequisites_script
+
+run_prerequisites_script("False", "True", "True", "True")

 i18n = I18nAuto()
+if load_config_presence() == True:
+    RPCManager.start_presence()
 installation_checker.check_installation()
 logging.getLogger("uvicorn").disabled = True
 logging.getLogger("fairseq").disabled = True
+if load_config_flask() == True:
+    print("Starting Flask server")
+    start_flask()

 my_applio = loadThemes.load_json()
 if my_applio:

     with gr.Tab(i18n("Inference")):
         inference_tab()

+    with gr.Tab(i18n("Train")):
+        if gpu_available() or load_fake_gpu():
+            train_tab()
+        else:
+            gr.Markdown(
+                i18n(
+                    "Training is currently unsupported due to the absence of a GPU. To activate the training tab, navigate to the settings tab and enable the 'Fake GPU' option."
+                )
+            )

     with gr.Tab(i18n("TTS")):
         tts_tab()

+    with gr.Tab(i18n("Voice Blender")):
+        voice_blender_tab()

+    with gr.Tab(i18n("Plugins")):
+        plugins_tab()

     with gr.Tab(i18n("Download")):
         download_tab()

     with gr.Tab(i18n("Report a Bug")):
         report_tab()

+    with gr.Tab(i18n("Extra")):
+        extra_tab()
+
+    with gr.Tab(i18n("Settings")):
+        presence_tab()
+        flask_server_tab()
+        if not gpu_available():
+            fake_gpu_tab()
+        theme_tab()
+        version_tab()
+        lang_tab()
+        restart_tab()


 if __name__ == "__main__":
+    port = 6969
+    if "--port" in sys.argv:
+        port_index = sys.argv.index("--port") + 1
+        if port_index < len(sys.argv):
+            port = int(sys.argv[port_index])
+
+    Applio.launch(
+        favicon_path="assets/ICON.ico",
+        share="--share" in sys.argv,
+        inbrowser="--open" in sys.argv,
+        server_port=port,
+    )
core.py CHANGED
@@ -1,5 +1,6 @@
 import os
 import sys
 import argparse
 import subprocess

@@ -7,26 +8,32 @@ now_dir = os.getcwd()
 sys.path.append(now_dir)

 from rvc.configs.config import Config
-from rvc.lib.tools.validators import (
-    validate_sampling_rate,
-    validate_f0up_key,
-    validate_f0method,
-    validate_true_false,
-    validate_tts_voices,
-)

 from rvc.train.extract.preparing_files import generate_config, generate_filelist
 from rvc.lib.tools.pretrained_selector import pretrained_selector

-from rvc.lib.process.model_fusion import model_fusion
-from rvc.lib.process.model_information import model_information

 config = Config()
 current_script_directory = os.path.dirname(os.path.realpath(__file__))
 logs_path = os.path.join(current_script_directory, "logs")
-subprocess.run(
-    ["python", os.path.join("rvc", "lib", "tools", "prerequisites_download.py")]
-)

  # Infer
@@ -34,31 +41,41 @@ def run_infer_script(
34
  f0up_key,
35
  filter_radius,
36
  index_rate,
 
 
37
  hop_length,
38
  f0method,
39
  input_path,
40
  output_path,
41
- pth_file,
42
  index_path,
43
  split_audio,
 
 
 
 
44
  ):
45
- infer_script_path = os.path.join("rvc", "infer", "infer.py")
46
- command = [
47
- "python",
48
- infer_script_path,
49
- str(f0up_key),
50
- str(filter_radius),
51
- str(index_rate),
52
- str(hop_length),
53
  f0method,
54
  input_path,
55
  output_path,
56
- pth_file,
57
  index_path,
58
- str(split_audio),
59
- ]
60
- subprocess.run(command)
61
- return f"File {input_path} inferred successfully.", output_path
62
 
63
 
64
  # Batch infer
@@ -66,16 +83,20 @@ def run_batch_infer_script(
66
  f0up_key,
67
  filter_radius,
68
  index_rate,
 
 
69
  hop_length,
70
  f0method,
71
  input_folder,
72
  output_folder,
73
- pth_file,
74
  index_path,
75
  split_audio,
 
 
 
 
76
  ):
77
- infer_script_path = os.path.join("rvc", "infer", "infer.py")
78
-
79
  audio_files = [
80
  f for f in os.listdir(input_folder) if f.endswith((".mp3", ".wav", ".flac"))
81
  ]
@@ -93,21 +114,24 @@ def run_batch_infer_script(
93
  )
94
  print(f"Inferring {input_path}...")
95
 
96
- command = [
97
- "python",
98
- infer_script_path,
99
- str(f0up_key),
100
- str(filter_radius),
101
- str(index_rate),
102
- str(hop_length),
103
- f0method,
104
- input_path,
105
- output_path,
106
- pth_file,
107
- index_path,
108
- str(split_audio),
109
- ]
110
- subprocess.run(command)
 
 
 
111
 
112
  return f"Files from {input_folder} inferred successfully."
113
 
@@ -119,15 +143,21 @@ def run_tts_script(
119
  f0up_key,
120
  filter_radius,
121
  index_rate,
 
 
122
  hop_length,
123
  f0method,
124
  output_tts_path,
125
  output_rvc_path,
126
- pth_file,
127
  index_path,
  ):
129
  tts_script_path = os.path.join("rvc", "lib", "tools", "tts.py")
130
- infer_script_path = os.path.join("rvc", "infer", "infer.py")
131
 
132
  if os.path.exists(output_tts_path):
133
  os.remove(output_tts_path)
@@ -139,23 +169,30 @@ def run_tts_script(
139
  tts_voice,
140
  output_tts_path,
141
  ]
 
142
 
143
- command_infer = [
144
- "python",
145
- infer_script_path,
146
- str(f0up_key),
147
- str(filter_radius),
148
- str(index_rate),
149
- str(hop_length),
150
  f0method,
151
  output_tts_path,
152
  output_rvc_path,
153
- pth_file,
154
  index_path,
155
- ]
156
- subprocess.run(command_tts)
157
- subprocess.run(command_infer)
158
- return f"Text {tts_text} synthesized successfully.", output_rvc_path
159
 
160
 
161
  # Preprocess
@@ -165,20 +202,25 @@ def run_preprocess_script(model_name, dataset_path, sampling_rate):
165
  command = [
166
  "python",
167
  preprocess_script_path,
168
- os.path.join(logs_path, str(model_name)),
169
- dataset_path,
170
- str(sampling_rate),
171
- str(per),
 
  ]
173
 
174
- os.makedirs(os.path.join(logs_path, str(model_name)), exist_ok=True)
175
  subprocess.run(command)
176
  return f"Model {model_name} preprocessed successfully."
177
 
178
 
179
  # Extract
180
  def run_extract_script(model_name, rvc_version, f0method, hop_length, sampling_rate):
181
- model_path = os.path.join(logs_path, str(model_name))
182
  extract_f0_script_path = os.path.join(
183
  "rvc", "train", "extract", "extract_f0_print.py"
184
  )
@@ -189,20 +231,30 @@ def run_extract_script(model_name, rvc_version, f0method, hop_length, sampling_r
189
  command_1 = [
190
  "python",
191
  extract_f0_script_path,
192
- model_path,
193
- f0method,
194
- str(hop_length),
195
  ]
196
  command_2 = [
197
  "python",
198
  extract_feature_script_path,
199
- config.device,
200
- "1",
201
- "0",
202
- "0",
203
- model_path,
204
- rvc_version,
205
- "True",
206
  ]
207
  subprocess.run(command_1)
208
  subprocess.run(command_2)
@@ -224,6 +276,8 @@ def run_train_script(
224
  batch_size,
225
  gpu,
226
  pitch_guidance,
 
 
227
  pretrained,
228
  custom_pretrained,
229
  g_pretrained_path=None,
@@ -232,6 +286,7 @@ def run_train_script(
232
  f0 = 1 if str(pitch_guidance) == "True" else 0
233
  latest = 1 if str(save_only_latest) == "True" else 0
234
  save_every = 1 if str(save_every_weights) == "True" else 0
 
235
 
236
  if str(pretrained) == "True":
237
  if str(custom_pretrained) == "False":
@@ -248,33 +303,42 @@ def run_train_script(
248
  train_script_path = os.path.join("rvc", "train", "train.py")
249
  command = [
250
  "python",
251
- str(train_script_path),
252
- "-se",
253
- str(save_every_epoch),
254
- "-te",
255
- str(total_epoch),
256
- "-pg",
257
- str(pg),
258
- "-pd",
259
- str(pd),
260
- "-sr",
261
- str(sampling_rate),
262
- "-bs",
263
- str(batch_size),
264
- "-g",
265
- str(gpu),
266
- "-e",
267
- os.path.join(logs_path, str(model_name)),
268
- "-v",
269
- str(rvc_version),
270
- "-l",
271
- str(latest),
272
- "-c",
273
- "0",
274
- "-sw",
275
- str(save_every),
276
- "-f0",
277
- str(f0),
278
  ]
279
 
280
  subprocess.run(command)
@@ -284,11 +348,11 @@ def run_train_script(
284
 
285
  # Index
286
  def run_index_script(model_name, rvc_version):
287
- index_script_path = os.path.join("rvc", "train", "index_generator.py")
288
  command = [
289
  "python",
290
  index_script_path,
291
- os.path.join(logs_path, str(model_name)),
292
  rvc_version,
293
  ]
294
 
@@ -296,38 +360,66 @@ def run_index_script(model_name, rvc_version):
296
  return f"Index file for {model_name} generated successfully."
297
 
298
 
299
  # Model information
300
  def run_model_information_script(pth_path):
301
  print(model_information(pth_path))
302
 
303
 
304
- # Model fusion
305
- def run_model_fusion_script(model_name, pth_path_1, pth_path_2):
306
- model_fusion(model_name, pth_path_1, pth_path_2)
 
307
 
308
 
309
  # Tensorboard
310
  def run_tensorboard_script():
311
- tensorboard_script_path = os.path.join(
312
- "rvc", "lib", "tools", "launch_tensorboard.py"
313
- )
314
- command = [
315
- "python",
316
- tensorboard_script_path,
317
- ]
318
- subprocess.run(command)
319
 
320
 
321
  # Download
322
  def run_download_script(model_link):
323
- download_script_path = os.path.join("rvc", "lib", "tools", "model_download.py")
324
  command = [
325
- "python",
326
- download_script_path,
327
- model_link,
 
 
 
328
  ]
329
  subprocess.run(command)
330
- return f"Model downloaded successfully."
331
 
332
 
333
  # Parse arguments
@@ -342,48 +434,108 @@ def parse_arguments():
342
  # Parser for 'infer' mode
343
  infer_parser = subparsers.add_parser("infer", help="Run inference")
344
  infer_parser.add_argument(
345
- "f0up_key",
346
- type=validate_f0up_key,
347
- help="Value for f0up_key (-24 to +24)",
 
 
348
  )
349
  infer_parser.add_argument(
350
- "filter_radius",
351
  type=str,
352
- help="Value for filter_radius (0 to 10)",
 
 
353
  )
354
  infer_parser.add_argument(
355
- "index_rate",
356
  type=str,
357
- help="Value for index_rate (0.0 to 1)",
 
 
358
  )
359
  infer_parser.add_argument(
360
- "hop_length",
361
  type=str,
362
- help="Value for hop_length (1 to 512)",
 
 
363
  )
364
  infer_parser.add_argument(
365
- "f0method",
366
- type=validate_f0method,
367
- help="Value for f0method (pm, dio, crepe, crepe-tiny, harvest, rmvpe)",
 
 
368
  )
369
  infer_parser.add_argument(
370
- "input_path", type=str, help="Input path (enclose in double quotes)"
 
 
 
 
371
  )
372
  infer_parser.add_argument(
373
- "output_path", type=str, help="Output path (enclose in double quotes)"
374
  )
375
  infer_parser.add_argument(
376
- "pth_file", type=str, help="Path to the .pth file (enclose in double quotes)"
 
 
 
 
377
  )
378
  infer_parser.add_argument(
379
- "index_path",
380
  type=str,
381
- help="Path to the .index file (enclose in double quotes)",
 
 
382
  )
383
  infer_parser.add_argument(
384
- "split_audio",
385
  type=str,
386
- help="Enable split audio ( better results )",
387
  )
388
 
389
  # Parser for 'batch_infer' mode
@@ -391,229 +543,454 @@ def parse_arguments():
391
  "batch_infer", help="Run batch inference"
392
  )
393
  batch_infer_parser.add_argument(
394
- "f0up_key",
395
- type=validate_f0up_key,
396
- help="Value for f0up_key (-24 to +24)",
397
  )
398
  batch_infer_parser.add_argument(
399
- "filter_radius",
400
  type=str,
401
- help="Value for filter_radius (0 to 10)",
 
 
402
  )
403
  batch_infer_parser.add_argument(
404
- "index_rate",
405
  type=str,
406
- help="Value for index_rate (0.0 to 1)",
 
 
407
  )
408
  batch_infer_parser.add_argument(
409
- "hop_length",
410
  type=str,
411
- help="Value for hop_length (1 to 512)",
412
  )
413
  batch_infer_parser.add_argument(
414
- "f0method",
415
- type=validate_f0method,
416
- help="Value for f0method (pm, dio, crepe, crepe-tiny, harvest, rmvpe)",
417
  )
418
  batch_infer_parser.add_argument(
419
- "input_folder", type=str, help="Input folder (enclose in double quotes)"
 
 
 
 
420
  )
421
  batch_infer_parser.add_argument(
422
- "output_folder", type=str, help="Output folder (enclose in double quotes)"
 
 
 
 
423
  )
424
  batch_infer_parser.add_argument(
425
- "pth_file", type=str, help="Path to the .pth file (enclose in double quotes)"
 
 
 
 
426
  )
427
  batch_infer_parser.add_argument(
428
- "index_path",
429
  type=str,
430
- help="Path to the .index file (enclose in double quotes)",
431
  )
432
 
433
  # Parser for 'tts' mode
434
  tts_parser = subparsers.add_parser("tts", help="Run TTS")
435
  tts_parser.add_argument(
436
- "tts_text",
437
  type=str,
438
- help="Text to be synthesized (enclose in double quotes)",
439
  )
440
  tts_parser.add_argument(
441
- "tts_voice",
442
- type=validate_tts_voices,
443
- help="Voice to be used (enclose in double quotes)",
 
444
  )
445
  tts_parser.add_argument(
446
- "f0up_key",
447
- type=validate_f0up_key,
448
- help="Value for f0up_key (-24 to +24)",
 
 
449
  )
450
  tts_parser.add_argument(
451
- "filter_radius",
452
  type=str,
453
- help="Value for filter_radius (0 to 10)",
 
 
454
  )
455
  tts_parser.add_argument(
456
- "index_rate",
457
  type=str,
458
- help="Value for index_rate (0.0 to 1)",
 
 
459
  )
460
  tts_parser.add_argument(
461
- "hop_length",
462
  type=str,
463
- help="Value for hop_length (1 to 512)",
 
 
464
  )
465
  tts_parser.add_argument(
466
- "f0method",
467
- type=validate_f0method,
468
- help="Value for f0method (pm, dio, crepe, crepe-tiny, harvest, rmvpe)",
 
 
469
  )
470
  tts_parser.add_argument(
471
- "output_tts_path", type=str, help="Output tts path (enclose in double quotes)"
 
 
 
 
472
  )
473
  tts_parser.add_argument(
474
- "output_rvc_path", type=str, help="Output rvc path (enclose in double quotes)"
475
  )
476
  tts_parser.add_argument(
477
- "pth_file", type=str, help="Path to the .pth file (enclose in double quotes)"
 
 
 
 
478
  )
479
  tts_parser.add_argument(
480
- "index_path",
481
  type=str,
482
- help="Path to the .index file (enclose in double quotes)",
483
  )
484
 
485
  # Parser for 'preprocess' mode
486
  preprocess_parser = subparsers.add_parser("preprocess", help="Run preprocessing")
 
487
  preprocess_parser.add_argument(
488
- "model_name", type=str, help="Name of the model (enclose in double quotes)"
489
- )
490
- preprocess_parser.add_argument(
491
- "dataset_path",
492
  type=str,
493
- help="Path to the dataset (enclose in double quotes)",
494
  )
495
  preprocess_parser.add_argument(
496
- "sampling_rate",
497
- type=validate_sampling_rate,
498
- help="Sampling rate (32000, 40000 or 48000)",
 
499
  )
500
 
501
  # Parser for 'extract' mode
502
  extract_parser = subparsers.add_parser("extract", help="Run extract")
503
  extract_parser.add_argument(
504
- "model_name",
505
  type=str,
506
- help="Name of the model (enclose in double quotes)",
507
  )
508
  extract_parser.add_argument(
509
- "rvc_version",
510
  type=str,
511
- help="Version of the model (v1 or v2)",
 
 
512
  )
513
  extract_parser.add_argument(
514
- "f0method",
515
- type=validate_f0method,
516
- help="Value for f0method (pm, dio, crepe, crepe-tiny, mangio-crepe, mangio-crepe-tiny, harvest, rmvpe)",
517
  )
518
  extract_parser.add_argument(
519
- "hop_length",
520
  type=str,
521
- help="Value for hop_length (1 to 512)",
 
 
522
  )
523
  extract_parser.add_argument(
524
- "sampling_rate",
525
- type=validate_sampling_rate,
526
- help="Sampling rate (32000, 40000 or 48000)",
 
527
  )
528
 
529
  # Parser for 'train' mode
530
  train_parser = subparsers.add_parser("train", help="Run training")
531
  train_parser.add_argument(
532
- "model_name",
533
  type=str,
534
- help="Name of the model (enclose in double quotes)",
535
  )
536
  train_parser.add_argument(
537
- "rvc_version",
538
  type=str,
539
- help="Version of the model (v1 or v2)",
 
 
540
  )
541
  train_parser.add_argument(
542
- "save_every_epoch",
543
  type=str,
544
  help="Save every epoch",
 
545
  )
546
  train_parser.add_argument(
547
- "save_only_latest",
548
  type=str,
549
  help="Save weight only at last epoch",
 
 
550
  )
551
  train_parser.add_argument(
552
- "save_every_weights",
553
  type=str,
554
  help="Save weight every epoch",
 
 
555
  )
556
  train_parser.add_argument(
557
- "total_epoch",
558
  type=str,
559
  help="Total epoch",
 
 
560
  )
561
  train_parser.add_argument(
562
- "sampling_rate",
563
- type=validate_sampling_rate,
564
- help="Sampling rate (32000, 40000, or 48000)",
 
565
  )
566
  train_parser.add_argument(
567
- "batch_size",
568
  type=str,
569
  help="Batch size",
 
 
570
  )
571
  train_parser.add_argument(
572
- "gpu",
573
  type=str,
574
- help="GPU number (0 to 10 separated by -)",
 
 
575
  )
576
  train_parser.add_argument(
577
- "pitch_guidance",
578
- type=validate_true_false,
579
- help="Pitch guidance (True or False)",
 
 
580
  )
581
  train_parser.add_argument(
582
- "pretrained",
583
- type=validate_true_false,
584
- help="Pretrained (True or False)",
 
 
585
  )
586
  train_parser.add_argument(
587
- "custom_pretrained",
588
- type=validate_true_false,
589
- help="Custom pretrained (True or False)",
 
 
590
  )
591
  train_parser.add_argument(
592
- "g_pretrained_path",
593
  type=str,
594
  nargs="?",
595
  default=None,
596
- help="Path to the pretrained G file (enclose in double quotes)",
597
  )
598
  train_parser.add_argument(
599
- "d_pretrained_path",
600
  type=str,
601
  nargs="?",
602
  default=None,
603
- help="Path to the pretrained D file (enclose in double quotes)",
604
  )
605
 
606
  # Parser for 'index' mode
607
  index_parser = subparsers.add_parser("index", help="Generate index file")
608
  index_parser.add_argument(
609
- "model_name",
610
  type=str,
611
- help="Name of the model (enclose in double quotes)",
612
  )
613
  index_parser.add_argument(
614
- "rvc_version",
615
  type=str,
616
- help="Version of the model (v1 or v2)",
617
  )
618
 
619
  # Parser for 'model_information' mode
@@ -621,27 +998,36 @@ def parse_arguments():
621
  "model_information", help="Print model information"
622
  )
623
  model_information_parser.add_argument(
624
- "pth_path",
625
  type=str,
626
- help="Path to the .pth file (enclose in double quotes)",
627
  )
628
 
629
- # Parser for 'model_fusion' mode
630
- model_fusion_parser = subparsers.add_parser("model_fusion", help="Fuse two models")
631
- model_fusion_parser.add_argument(
632
- "model_name",
633
  type=str,
634
- help="Name of the model (enclose in double quotes)",
635
  )
636
- model_fusion_parser.add_argument(
637
- "pth_path_1",
638
  type=str,
639
- help="Path to the first .pth file (enclose in double quotes)",
640
  )
641
- model_fusion_parser.add_argument(
642
- "pth_path_2",
643
  type=str,
644
- help="Path to the second .pth file (enclose in double quotes)",
 
 
645
  )
646
 
647
  # Parser for 'tensorboard' mode
@@ -650,11 +1036,57 @@ def parse_arguments():
650
  # Parser for 'download' mode
651
  download_parser = subparsers.add_parser("download", help="Download models")
652
  download_parser.add_argument(
653
- "model_link",
654
  type=str,
655
- help="Link of the model (enclose in double quotes)",
656
  )
657
 
658
  return parser.parse_args()
659
 
660
 
@@ -668,95 +1100,143 @@ def main():
668
  try:
669
  if args.mode == "infer":
670
  run_infer_script(
671
- args.f0up_key,
672
- args.filter_radius,
673
- args.index_rate,
674
- args.hop_length,
675
- args.f0method,
676
- args.input_path,
677
- args.output_path,
678
- args.pth_file,
679
- args.index_path,
680
- args.split_audio,
681
  )
682
  elif args.mode == "batch_infer":
683
  run_batch_infer_script(
684
- args.f0up_key,
685
- args.filter_radius,
686
- args.index_rate,
687
- args.hop_length,
688
- args.f0method,
689
- args.input_folder,
690
- args.output_folder,
691
- args.pth_file,
692
- args.index_path,
  )
694
  elif args.mode == "tts":
695
  run_tts_script(
696
- args.tts_text,
697
- args.tts_voice,
698
- args.f0up_key,
699
- args.filter_radius,
700
- args.index_rate,
701
- args.hop_length,
702
- args.f0method,
703
- args.output_tts_path,
704
- args.output_rvc_path,
705
- args.pth_file,
706
- args.index_path,
  )
708
  elif args.mode == "preprocess":
709
  run_preprocess_script(
710
- args.model_name,
711
- args.dataset_path,
712
  str(args.sampling_rate),
713
  )
714
-
715
  elif args.mode == "extract":
716
  run_extract_script(
717
- args.model_name,
718
- args.rvc_version,
719
- args.f0method,
720
- args.hop_length,
721
- args.sampling_rate,
722
  )
723
  elif args.mode == "train":
724
  run_train_script(
725
- args.model_name,
726
- args.rvc_version,
727
- args.save_every_epoch,
728
- args.save_only_latest,
729
- args.save_every_weights,
730
- args.total_epoch,
731
- args.sampling_rate,
732
- args.batch_size,
733
- args.gpu,
734
- args.pitch_guidance,
735
- args.pretrained,
736
- args.custom_pretrained,
737
- args.g_pretrained_path,
738
- args.d_pretrained_path,
 
 
739
  )
740
  elif args.mode == "index":
741
  run_index_script(
742
- args.model_name,
743
- args.rvc_version,
  )
745
  elif args.mode == "model_information":
746
  run_model_information_script(
747
- args.pth_path,
748
  )
749
- elif args.mode == "model_fusion":
750
- run_model_fusion_script(
751
- args.model_name,
752
- args.pth_path_1,
753
- args.pth_path_2,
 
754
  )
755
  elif args.mode == "tensorboard":
756
  run_tensorboard_script()
757
  elif args.mode == "download":
758
  run_download_script(
759
- args.model_link,
760
  )
761
  except Exception as error:
762
  print(f"Error: {error}")
 
 import os
 import sys
+import json
 import argparse
 import subprocess

 sys.path.append(now_dir)

 from rvc.configs.config import Config

+from rvc.lib.tools.prerequisites_download import prequisites_download_pipeline
 from rvc.train.extract.preparing_files import generate_config, generate_filelist
 from rvc.lib.tools.pretrained_selector import pretrained_selector

+from rvc.train.process.model_blender import model_blender
+from rvc.train.process.model_information import model_information
+from rvc.train.process.extract_small_model import extract_small_model
+
+from rvc.infer.infer import infer_pipeline
+
+from rvc.lib.tools.analyzer import analyze_audio
+
+from rvc.lib.tools.launch_tensorboard import launch_tensorboard_pipeline
+
+from rvc.lib.tools.model_download import model_download_pipeline

 config = Config()
 current_script_directory = os.path.dirname(os.path.realpath(__file__))
 logs_path = os.path.join(current_script_directory, "logs")
+
+# Get TTS Voices
+with open(os.path.join("rvc", "lib", "tools", "tts_voices.json"), "r") as f:
+    voices_data = json.load(f)
+
+locales = list({voice["Locale"] for voice in voices_data})

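For reference, the locale list above is just the deduplicated "Locale" field of tts_voices.json; a small sketch with made-up entries (only the "Locale" key is taken from the code shown):

    voices_data = [
        {"ShortName": "en-US-AriaNeural", "Locale": "en-US"},
        {"ShortName": "en-US-GuyNeural", "Locale": "en-US"},
        {"ShortName": "es-ES-ElviraNeural", "Locale": "es-ES"},
    ]
    locales = list({voice["Locale"] for voice in voices_data})
    print(sorted(locales))  # ['en-US', 'es-ES']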
  # Infer
 
41
  f0up_key,
42
  filter_radius,
43
  index_rate,
44
+ rms_mix_rate,
45
+ protect,
46
  hop_length,
47
  f0method,
48
  input_path,
49
  output_path,
50
+ pth_path,
51
  index_path,
52
  split_audio,
53
+ f0autotune,
54
+ clean_audio,
55
+ clean_strength,
56
+ export_format,
57
  ):
58
+ infer_pipeline(
59
+ f0up_key,
60
+ filter_radius,
61
+ index_rate,
62
+ rms_mix_rate,
63
+ protect,
64
+ hop_length,
 
65
  f0method,
66
  input_path,
67
  output_path,
68
+ pth_path,
69
  index_path,
70
+ split_audio,
71
+ f0autotune,
72
+ clean_audio,
73
+ clean_strength,
74
+ export_format,
75
+ )
76
+ return f"File {input_path} inferred successfully.", output_path.replace(
77
+ ".wav", f".{export_format.lower()}"
78
+ )
79
 
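The returned path simply swaps the ".wav" suffix for the chosen export format; for example (values are illustrative):

    output_path = "assets/audios/output.wav"
    export_format = "FLAC"
    print(output_path.replace(".wav", f".{export_format.lower()}"))  # assets/audios/output.flac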
80
 
81
  # Batch infer
 
83
  f0up_key,
84
  filter_radius,
85
  index_rate,
86
+ rms_mix_rate,
87
+ protect,
88
  hop_length,
89
  f0method,
90
  input_folder,
91
  output_folder,
92
+ pth_path,
93
  index_path,
94
  split_audio,
95
+ f0autotune,
96
+ clean_audio,
97
+ clean_strength,
98
+ export_format,
99
  ):
 
 
100
  audio_files = [
101
  f for f in os.listdir(input_folder) if f.endswith((".mp3", ".wav", ".flac"))
102
  ]
 
114
  )
115
  print(f"Inferring {input_path}...")
116
 
117
+ infer_pipeline(
118
+ f0up_key,
119
+ filter_radius,
120
+ index_rate,
121
+ rms_mix_rate,
122
+ protect,
123
+ hop_length,
124
+ f0method,
125
+ input_path,
126
+ output_path,
127
+ pth_path,
128
+ index_path,
129
+ split_audio,
130
+ f0autotune,
131
+ clean_audio,
132
+ clean_strength,
133
+ export_format,
134
+ )
135
 
136
  return f"Files from {input_folder} inferred successfully."
137
 
 
143
  f0up_key,
144
  filter_radius,
145
  index_rate,
146
+ rms_mix_rate,
147
+ protect,
148
  hop_length,
149
  f0method,
150
  output_tts_path,
151
  output_rvc_path,
152
+ pth_path,
153
  index_path,
154
+ split_audio,
155
+ f0autotune,
156
+ clean_audio,
157
+ clean_strength,
158
+ export_format,
159
  ):
160
  tts_script_path = os.path.join("rvc", "lib", "tools", "tts.py")
 
161
 
162
  if os.path.exists(output_tts_path):
163
  os.remove(output_tts_path)
 
169
  tts_voice,
170
  output_tts_path,
171
  ]
172
+ subprocess.run(command_tts)
173
 
174
+ infer_pipeline(
175
+ f0up_key,
176
+ filter_radius,
177
+ index_rate,
178
+ rms_mix_rate,
179
+ protect,
180
+ hop_length,
181
  f0method,
182
  output_tts_path,
183
  output_rvc_path,
184
+ pth_path,
185
  index_path,
186
+ split_audio,
187
+ f0autotune,
188
+ clean_audio,
189
+ clean_strength,
190
+ export_format,
191
+ )
192
+
193
+ return f"Text {tts_text} synthesized successfully.", output_rvc_path.replace(
194
+ ".wav", f".{export_format.lower()}"
195
+ )
196
 
197
 
198
  # Preprocess
 
202
  command = [
203
  "python",
204
  preprocess_script_path,
205
+ *map(
206
+ str,
207
+ [
208
+ os.path.join(logs_path, model_name),
209
+ dataset_path,
210
+ sampling_rate,
211
+ per,
212
+ ],
213
+ ),
214
  ]
215
 
216
+ os.makedirs(os.path.join(logs_path, model_name), exist_ok=True)
217
  subprocess.run(command)
218
  return f"Model {model_name} preprocessed successfully."
219
 
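The *map(str, [...]) idiom used in these command lists just stringifies every argument in place; a small illustration with placeholder values and a placeholder script name:

    import os

    logs_path, model_name = "logs", "my-model"
    dataset_path, sampling_rate, per = "datasets/voice", 40000, 3.0
    command = [
        "python",
        "preprocess.py",  # placeholder script name
        *map(str, [os.path.join(logs_path, model_name), dataset_path, sampling_rate, per]),
    ]
    print(command)
    # on POSIX: ['python', 'preprocess.py', 'logs/my-model', 'datasets/voice', '40000', '3.0']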
220
 
221
  # Extract
222
  def run_extract_script(model_name, rvc_version, f0method, hop_length, sampling_rate):
223
+ model_path = os.path.join(logs_path, model_name)
224
  extract_f0_script_path = os.path.join(
225
  "rvc", "train", "extract", "extract_f0_print.py"
226
  )
 
231
  command_1 = [
232
  "python",
233
  extract_f0_script_path,
234
+ *map(
235
+ str,
236
+ [
237
+ model_path,
238
+ f0method,
239
+ hop_length,
240
+ ],
241
+ ),
242
  ]
243
  command_2 = [
244
  "python",
245
  extract_feature_script_path,
246
+ *map(
247
+ str,
248
+ [
249
+ config.device,
250
+ "1",
251
+ "0",
252
+ "0",
253
+ model_path,
254
+ rvc_version,
255
+ "True",
256
+ ],
257
+ ),
258
  ]
259
  subprocess.run(command_1)
260
  subprocess.run(command_2)
 
276
  batch_size,
277
  gpu,
278
  pitch_guidance,
279
+ overtraining_detector,
280
+ overtraining_threshold,
281
  pretrained,
282
  custom_pretrained,
283
  g_pretrained_path=None,
 
286
  f0 = 1 if str(pitch_guidance) == "True" else 0
287
  latest = 1 if str(save_only_latest) == "True" else 0
288
  save_every = 1 if str(save_every_weights) == "True" else 0
289
+ detector = 1 if str(overtraining_detector) == "True" else 0
290
 
291
  if str(pretrained) == "True":
292
  if str(custom_pretrained) == "False":
 
303
  train_script_path = os.path.join("rvc", "train", "train.py")
304
  command = [
305
  "python",
306
+ train_script_path,
307
+ *map(
308
+ str,
309
+ [
310
+ "-se",
311
+ save_every_epoch,
312
+ "-te",
313
+ total_epoch,
314
+ "-pg",
315
+ pg,
316
+ "-pd",
317
+ pd,
318
+ "-sr",
319
+ sampling_rate,
320
+ "-bs",
321
+ batch_size,
322
+ "-g",
323
+ gpu,
324
+ "-e",
325
+ os.path.join(logs_path, model_name),
326
+ "-v",
327
+ rvc_version,
328
+ "-l",
329
+ latest,
330
+ "-c",
331
+ "0",
332
+ "-sw",
333
+ save_every,
334
+ "-f0",
335
+ f0,
336
+ "-od",
337
+ detector,
338
+ "-ot",
339
+ overtraining_threshold,
340
+ ],
341
+ ),
342
  ]
343
 
344
  subprocess.run(command)
 
348
 
349
  # Index
350
  def run_index_script(model_name, rvc_version):
351
+ index_script_path = os.path.join("rvc", "train", "process", "extract_index.py")
352
  command = [
353
  "python",
354
  index_script_path,
355
+ os.path.join(logs_path, model_name),
356
  rvc_version,
357
  ]
358
 
 
360
  return f"Index file for {model_name} generated successfully."
361
 
362
 
363
+ # Model extract
364
+ def run_model_extract_script(
365
+ pth_path, model_name, sampling_rate, pitch_guidance, rvc_version, epoch, step
366
+ ):
367
+ f0 = 1 if str(pitch_guidance) == "True" else 0
368
+ extract_small_model(
369
+ pth_path, model_name, sampling_rate, f0, rvc_version, epoch, step
370
+ )
371
+ return f"Model {model_name} extracted successfully."
372
+
373
+
374
  # Model information
375
  def run_model_information_script(pth_path):
376
  print(model_information(pth_path))
377
 
378
 
379
+ # Model blender
380
+ def run_model_blender_script(model_name, pth_path_1, pth_path_2, ratio):
381
+ message, model_blended = model_blender(model_name, pth_path_1, pth_path_2, ratio)
382
+ return message, model_blended
383
 
384
 
385
  # Tensorboard
386
  def run_tensorboard_script():
387
+ launch_tensorboard_pipeline()
 
 
 
 
 
 
 
388
 
389
 
390
  # Download
391
  def run_download_script(model_link):
392
+ model_download_pipeline(model_link)
393
+ return f"Model downloaded successfully."
394
+
395
+
396
+ # Prerequisites
397
+ def run_prerequisites_script(pretraineds_v1, pretraineds_v2, models, exe):
398
+ prequisites_download_pipeline(pretraineds_v1, pretraineds_v2, models, exe)
399
+ return "Prerequisites installed successfully."
400
+
401
+
402
+ # Audio analyzer
403
+ def run_audio_analyzer_script(input_path, save_plot_path="logs/audio_analysis.png"):
404
+ audio_info, plot_path = analyze_audio(input_path, save_plot_path)
405
+ print(
406
+ f"Audio info of {input_path}: {audio_info}",
407
+ f"Audio file {input_path} analyzed successfully. Plot saved at: {plot_path}",
408
+ )
409
+ return audio_info, plot_path
410
+
411
+
412
+ # API
413
+ def run_api_script(ip, port):
414
  command = [
415
+ "env/Scripts/uvicorn.exe" if os.name == "nt" else "uvicorn",
416
+ "api:app",
417
+ "--host",
418
+ ip,
419
+ "--port",
420
+ port,
421
  ]
422
  subprocess.run(command)
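A hypothetical way to call the new helpers directly from Python (the model link and audio path are placeholders; the prerequisites call mirrors the one app.py now makes at startup):

    from core import run_prerequisites_script, run_download_script, run_audio_analyzer_script

    run_prerequisites_script("False", "True", "True", "True")
    run_download_script("https://example.com/some_model.zip")
    run_audio_analyzer_script("assets/audios/sample.wav")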
 
423
 
424
 
425
  # Parse arguments
 
434
  # Parser for 'infer' mode
435
  infer_parser = subparsers.add_parser("infer", help="Run inference")
436
  infer_parser.add_argument(
437
+ "--f0up_key",
438
+ type=str,
439
+ help="Value for f0up_key",
440
+ choices=[str(i) for i in range(-24, 25)],
441
+ default="0",
442
  )
443
  infer_parser.add_argument(
444
+ "--filter_radius",
445
  type=str,
446
+ help="Value for filter_radius",
447
+ choices=[str(i) for i in range(11)],
448
+ default="3",
449
  )
450
  infer_parser.add_argument(
451
+ "--index_rate",
452
  type=str,
453
+ help="Value for index_rate",
454
+ choices=[str(i / 10) for i in range(11)],
455
+ default="0.3",
456
  )
457
  infer_parser.add_argument(
458
+ "--rms_mix_rate",
459
  type=str,
460
+ help="Value for rms_mix_rate",
461
+ choices=[str(i / 10) for i in range(11)],
462
+ default="1",
463
  )
464
  infer_parser.add_argument(
465
+ "--protect",
466
+ type=str,
467
+ help="Value for protect",
468
+ choices=[str(i / 10) for i in range(6)],
469
+ default="0.33",
470
  )
471
  infer_parser.add_argument(
472
+ "--hop_length",
473
+ type=str,
474
+ help="Value for hop_length",
475
+ choices=[str(i) for i in range(1, 513)],
476
+ default="128",
477
  )
478
  infer_parser.add_argument(
479
+ "--f0method",
480
+ type=str,
481
+ help="Value for f0method",
482
+ choices=[
483
+ "pm",
484
+ "harvest",
485
+ "dio",
486
+ "crepe",
487
+ "crepe-tiny",
488
+ "rmvpe",
489
+ "fcpe",
490
+ "hybrid[crepe+rmvpe]",
491
+ "hybrid[crepe+fcpe]",
492
+ "hybrid[rmvpe+fcpe]",
493
+ "hybrid[crepe+rmvpe+fcpe]",
494
+ ],
495
+ default="rmvpe",
496
+ )
497
+ infer_parser.add_argument("--input_path", type=str, help="Input path")
498
+ infer_parser.add_argument("--output_path", type=str, help="Output path")
499
+ infer_parser.add_argument("--pth_path", type=str, help="Path to the .pth file")
500
+ infer_parser.add_argument(
501
+ "--index_path",
502
+ type=str,
503
+ help="Path to the .index file",
504
  )
505
  infer_parser.add_argument(
506
+ "--split_audio",
507
+ type=str,
508
+ help="Enable split audio",
509
+ choices=["True", "False"],
510
+ default="False",
511
  )
512
  infer_parser.add_argument(
513
+ "--f0autotune",
514
  type=str,
515
+ help="Enable autotune",
516
+ choices=["True", "False"],
517
+ default="False",
518
  )
519
  infer_parser.add_argument(
520
+ "--clean_audio",
521
  type=str,
522
+ help="Enable clean audio",
523
+ choices=["True", "False"],
524
+ default="False",
525
+ )
526
+ infer_parser.add_argument(
527
+ "--clean_strength",
528
+ type=str,
529
+ help="Value for clean_strength",
530
+ choices=[str(i / 10) for i in range(11)],
531
+ default="0.7",
532
+ )
533
+ infer_parser.add_argument(
534
+ "--export_format",
535
+ type=str,
536
+ help="Export format",
537
+ choices=["WAV", "MP3", "FLAC", "OGG", "M4A"],
538
+ default="WAV",
539
  )
540
 
541
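Since every flag is typed as a string, the choices lists above expand to lists of string literals; for instance:

    print([str(i / 10) for i in range(11)])      # ['0.0', '0.1', ..., '1.0'] (11 values)
    print([str(i) for i in range(-24, 25)][:3])  # ['-24', '-23', '-22']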
  # Parser for 'batch_infer' mode
 
543
  "batch_infer", help="Run batch inference"
544
  )
545
  batch_infer_parser.add_argument(
546
+ "--f0up_key",
547
+ type=str,
548
+ help="Value for f0up_key",
549
+ choices=[str(i) for i in range(-24, 25)],
550
+ default="0",
551
+ )
552
+ batch_infer_parser.add_argument(
553
+ "--filter_radius",
554
+ type=str,
555
+ help="Value for filter_radius",
556
+ choices=[str(i) for i in range(11)],
557
+ default="3",
558
+ )
559
+ batch_infer_parser.add_argument(
560
+ "--index_rate",
561
+ type=str,
562
+ help="Value for index_rate",
563
+ choices=[str(i / 10) for i in range(11)],
564
+ default="0.3",
565
+ )
566
+ batch_infer_parser.add_argument(
567
+ "--rms_mix_rate",
568
+ type=str,
569
+ help="Value for rms_mix_rate",
570
+ choices=[str(i / 10) for i in range(11)],
571
+ default="1",
572
  )
573
  batch_infer_parser.add_argument(
574
+ "--protect",
575
  type=str,
576
+ help="Value for protect",
577
+ choices=[str(i / 10) for i in range(6)],
578
+ default="0.33",
579
  )
580
  batch_infer_parser.add_argument(
581
+ "--hop_length",
582
  type=str,
583
+ help="Value for hop_length",
584
+ choices=[str(i) for i in range(1, 513)],
585
+ default="128",
586
  )
587
  batch_infer_parser.add_argument(
588
+ "--f0method",
589
  type=str,
590
+ help="Value for f0method",
591
+ choices=[
592
+ "pm",
593
+ "harvest",
594
+ "dio",
595
+ "crepe",
596
+ "crepe-tiny",
597
+ "rmvpe",
598
+ "fcpe",
599
+ "hybrid[crepe+rmvpe]",
600
+ "hybrid[crepe+fcpe]",
601
+ "hybrid[rmvpe+fcpe]",
602
+ "hybrid[crepe+rmvpe+fcpe]",
603
+ ],
604
+ default="rmvpe",
605
+ )
606
+ batch_infer_parser.add_argument("--input_folder", type=str, help="Input folder")
607
+ batch_infer_parser.add_argument("--output_folder", type=str, help="Output folder")
608
+ batch_infer_parser.add_argument(
609
+ "--pth_path", type=str, help="Path to the .pth file"
610
  )
611
  batch_infer_parser.add_argument(
612
+ "--index_path",
613
+ type=str,
614
+ help="Path to the .index file",
615
  )
616
  batch_infer_parser.add_argument(
617
+ "--split_audio",
618
+ type=str,
619
+ help="Enable split audio",
620
+ choices=["True", "False"],
621
+ default="False",
622
  )
623
  batch_infer_parser.add_argument(
624
+ "--f0autotune",
625
+ type=str,
626
+ help="Enable autotune",
627
+ choices=["True", "False"],
628
+ default="False",
629
  )
630
  batch_infer_parser.add_argument(
631
+ "--clean_audio",
632
+ type=str,
633
+ help="Enable clean audio",
634
+ choices=["True", "False"],
635
+ default="False",
636
  )
637
  batch_infer_parser.add_argument(
638
+ "--clean_strength",
639
  type=str,
640
+ help="Value for clean_strength",
641
+ choices=[str(i / 10) for i in range(11)],
642
+ default="0.7",
643
+ )
644
+ batch_infer_parser.add_argument(
645
+ "--export_format",
646
+ type=str,
647
+ help="Export format",
648
+ choices=["WAV", "MP3", "FLAC", "OGG", "M4A"],
649
+ default="WAV",
650
  )
651
 
652
  # Parser for 'tts' mode
653
  tts_parser = subparsers.add_parser("tts", help="Run TTS")
654
  tts_parser.add_argument(
655
+ "--tts_text",
656
  type=str,
657
+ help="Text to be synthesized",
658
  )
659
  tts_parser.add_argument(
660
+ "--tts_voice",
661
+ type=str,
662
+ help="Voice to be used",
663
+ choices=locales,
664
  )
665
  tts_parser.add_argument(
666
+ "--f0up_key",
667
+ type=str,
668
+ help="Value for f0up_key",
669
+ choices=[str(i) for i in range(-24, 25)],
670
+ default="0",
671
  )
672
  tts_parser.add_argument(
673
+ "--filter_radius",
674
  type=str,
675
+ help="Value for filter_radius",
676
+ choices=[str(i) for i in range(11)],
677
+ default="3",
678
  )
679
  tts_parser.add_argument(
680
+ "--index_rate",
681
  type=str,
682
+ help="Value for index_rate",
683
+ choices=[str(i / 10) for i in range(11)],
684
+ default="0.3",
685
  )
686
  tts_parser.add_argument(
687
+ "--rms_mix_rate",
688
  type=str,
689
+ help="Value for rms_mix_rate",
690
+ choices=[str(i / 10) for i in range(11)],
691
+ default="1",
692
  )
693
  tts_parser.add_argument(
694
+ "--protect",
695
+ type=str,
696
+ help="Value for protect",
697
+ choices=[str(i / 10) for i in range(6)],
698
+ default="0.33",
699
  )
700
  tts_parser.add_argument(
701
+ "--hop_length",
702
+ type=str,
703
+ help="Value for hop_length",
704
+ choices=[str(i) for i in range(1, 513)],
705
+ default="128",
706
  )
707
  tts_parser.add_argument(
708
+ "--f0method",
709
+ type=str,
710
+ help="Value for f0method",
711
+ choices=[
712
+ "pm",
713
+ "harvest",
714
+ "dio",
715
+ "crepe",
716
+ "crepe-tiny",
717
+ "rmvpe",
718
+ "fcpe",
719
+ "hybrid[crepe+rmvpe]",
720
+ "hybrid[crepe+fcpe]",
721
+ "hybrid[rmvpe+fcpe]",
722
+ "hybrid[crepe+rmvpe+fcpe]",
723
+ ],
724
+ default="rmvpe",
725
+ )
726
+ tts_parser.add_argument("--output_tts_path", type=str, help="Output tts path")
727
+ tts_parser.add_argument("--output_rvc_path", type=str, help="Output rvc path")
728
+ tts_parser.add_argument("--pth_path", type=str, help="Path to the .pth file")
729
+ tts_parser.add_argument(
730
+ "--index_path",
731
+ type=str,
732
+ help="Path to the .index file",
733
  )
734
  tts_parser.add_argument(
735
+ "--split_audio",
736
+ type=str,
737
+ help="Enable split audio",
738
+ choices=["True", "False"],
739
+ default="False",
740
  )
741
  tts_parser.add_argument(
742
+ "--f0autotune",
743
  type=str,
744
+ help="Enable autotune",
745
+ choices=["True", "False"],
746
+ default="False",
747
+ )
748
+ tts_parser.add_argument(
749
+ "--clean_audio",
750
+ type=str,
751
+ help="Enable clean audio",
752
+ choices=["True", "False"],
753
+ default="False",
754
+ )
755
+ tts_parser.add_argument(
756
+ "--clean_strength",
757
+ type=str,
758
+ help="Value for clean_strength",
759
+ choices=[str(i / 10) for i in range(11)],
760
+ default="0.7",
761
+ )
762
+ tts_parser.add_argument(
763
+ "--export_format",
764
+ type=str,
765
+ help="Export format",
766
+ choices=["WAV", "MP3", "FLAC", "OGG", "M4A"],
767
+ default="WAV",
768
  )
769
 
770
  # Parser for 'preprocess' mode
771
  preprocess_parser = subparsers.add_parser("preprocess", help="Run preprocessing")
772
+ preprocess_parser.add_argument("--model_name", type=str, help="Name of the model")
773
  preprocess_parser.add_argument(
774
+ "--dataset_path",
 
 
 
775
  type=str,
776
+ help="Path to the dataset",
777
  )
778
  preprocess_parser.add_argument(
779
+ "--sampling_rate",
780
+ type=str,
781
+ help="Sampling rate",
782
+ choices=["32000", "40000", "48000"],
783
  )
784
 
785
  # Parser for 'extract' mode
786
  extract_parser = subparsers.add_parser("extract", help="Run extract")
787
  extract_parser.add_argument(
788
+ "--model_name",
789
  type=str,
790
+ help="Name of the model",
791
  )
792
  extract_parser.add_argument(
793
+ "--rvc_version",
794
  type=str,
795
+ help="Version of the model",
796
+ choices=["v1", "v2"],
797
+ default="v2",
798
  )
799
  extract_parser.add_argument(
800
+ "--f0method",
801
+ type=str,
802
+ help="Value for f0method",
803
+ choices=[
804
+ "pm",
805
+ "harvest",
806
+ "dio",
807
+ "crepe",
808
+ "crepe-tiny",
809
+ "rmvpe",
810
+ ],
811
+ default="rmvpe",
812
  )
813
  extract_parser.add_argument(
814
+ "--hop_length",
815
  type=str,
816
+ help="Value for hop_length",
817
+ choices=[str(i) for i in range(1, 513)],
818
+ default="128",
819
  )
820
  extract_parser.add_argument(
821
+ "--sampling_rate",
822
+ type=str,
823
+ help="Sampling rate",
824
+ choices=["32000", "40000", "48000"],
825
  )
826
 
827
  # Parser for 'train' mode
828
  train_parser = subparsers.add_parser("train", help="Run training")
829
  train_parser.add_argument(
830
+ "--model_name",
831
  type=str,
832
+ help="Name of the model",
833
  )
834
  train_parser.add_argument(
835
+ "--rvc_version",
836
  type=str,
837
+ help="Version of the model",
838
+ choices=["v1", "v2"],
839
+ default="v2",
840
  )
841
  train_parser.add_argument(
842
+ "--save_every_epoch",
843
  type=str,
844
  help="Save every epoch",
845
+ choices=[str(i) for i in range(1, 101)],
846
  )
847
  train_parser.add_argument(
848
+ "--save_only_latest",
849
  type=str,
850
  help="Save weight only at last epoch",
851
+ choices=["True", "False"],
852
+ default="False",
853
  )
854
  train_parser.add_argument(
855
+ "--save_every_weights",
856
  type=str,
857
  help="Save weight every epoch",
858
+ choices=["True", "False"],
859
+ default="True",
860
  )
861
  train_parser.add_argument(
862
+ "--total_epoch",
863
  type=str,
864
  help="Total epoch",
865
+ choices=[str(i) for i in range(1, 10001)],
866
+ default="1000",
867
  )
868
  train_parser.add_argument(
869
+ "--sampling_rate",
870
+ type=str,
871
+ help="Sampling rate",
872
+ choices=["32000", "40000", "48000"],
873
  )
874
  train_parser.add_argument(
875
+ "--batch_size",
876
  type=str,
877
  help="Batch size",
878
+ choices=[str(i) for i in range(1, 51)],
879
+ default="8",
880
  )
881
  train_parser.add_argument(
882
+ "--gpu",
883
  type=str,
884
+ help="GPU number",
885
+ choices=[str(i) for i in range(0, 11)],
886
+ default="0",
887
  )
888
  train_parser.add_argument(
889
+ "--pitch_guidance",
890
+ type=str,
891
+ help="Pitch guidance",
892
+ choices=["True", "False"],
893
+ default="True",
894
  )
895
  train_parser.add_argument(
896
+ "--pretrained",
897
+ type=str,
898
+ help="Pretrained",
899
+ choices=["True", "False"],
900
+ default="True",
901
  )
902
  train_parser.add_argument(
903
+ "--custom_pretrained",
904
+ type=str,
905
+ help="Custom pretrained",
906
+ choices=["True", "False"],
907
+ default="False",
908
  )
909
  train_parser.add_argument(
910
+ "--g_pretrained_path",
911
  type=str,
912
  nargs="?",
913
  default=None,
914
+ help="Path to the pretrained G file",
915
  )
916
  train_parser.add_argument(
917
+ "--d_pretrained_path",
918
  type=str,
919
  nargs="?",
920
  default=None,
921
+ help="Path to the pretrained D file",
922
+ )
923
+ train_parser.add_argument(
924
+ "--overtraining_detector",
925
+ type=str,
926
+ help="Overtraining detector",
927
+ choices=["True", "False"],
928
+ default="False",
929
+ )
930
+ train_parser.add_argument(
931
+ "--overtraining_threshold",
932
+ type=str,
933
+ help="Overtraining threshold",
934
+ choices=[str(i) for i in range(1, 101)],
935
+ default="50",
936
  )
937
 
938
  # Parser for 'index' mode
939
  index_parser = subparsers.add_parser("index", help="Generate index file")
940
  index_parser.add_argument(
941
+ "--model_name",
942
  type=str,
943
+ help="Name of the model",
944
  )
945
  index_parser.add_argument(
946
+ "--rvc_version",
947
  type=str,
948
+ help="Version of the model",
949
+ choices=["v1", "v2"],
950
+ default="v2",
951
+ )
952
+
953
+ # Parser for 'model_extract' mode
954
+ model_extract_parser = subparsers.add_parser("model_extract", help="Extract model")
955
+ model_extract_parser.add_argument(
956
+ "--pth_path",
957
+ type=str,
958
+ help="Path to the .pth file",
959
+ )
960
+ model_extract_parser.add_argument(
961
+ "--model_name",
962
+ type=str,
963
+ help="Name of the model",
964
+ )
965
+ model_extract_parser.add_argument(
966
+ "--sampling_rate",
967
+ type=str,
968
+ help="Sampling rate",
969
+ choices=["40000", "48000"],
970
+ )
971
+ model_extract_parser.add_argument(
972
+ "--pitch_guidance",
973
+ type=str,
974
+ help="Pitch guidance",
975
+ choices=["True", "False"],
976
+ )
977
+ model_extract_parser.add_argument(
978
+ "--rvc_version",
979
+ type=str,
980
+ help="Version of the model",
981
+ choices=["v1", "v2"],
982
+ default="v2",
983
+ )
984
+ model_extract_parser.add_argument(
985
+ "--epoch",
986
+ type=str,
987
+ help="Epochs of the model",
988
+ choices=[str(i) for i in range(1, 10001)],
989
+ )
990
+ model_extract_parser.add_argument(
991
+ "--step",
992
+ type=str,
993
+ help="Steps of the model",
994
  )
995
 
996
  # Parser for 'model_information' mode
 
998
  "model_information", help="Print model information"
999
  )
1000
  model_information_parser.add_argument(
1001
+ "--pth_path",
1002
  type=str,
1003
+ help="Path to the .pth file",
1004
  )
1005
 
1006
+ # Parser for 'model_blender' mode
1007
+ model_blender_parser = subparsers.add_parser(
1008
+ "model_blender", help="Fuse two models"
1009
+ )
1010
+ model_blender_parser.add_argument(
1011
+ "--model_name",
1012
+ type=str,
1013
+ help="Name of the model",
1014
+ )
1015
+ model_blender_parser.add_argument(
1016
+ "--pth_path_1",
1017
  type=str,
1018
+ help="Path to the first .pth file",
1019
  )
1020
+ model_blender_parser.add_argument(
1021
+ "--pth_path_2",
1022
  type=str,
1023
+ help="Path to the second .pth file",
1024
  )
1025
+ model_blender_parser.add_argument(
1026
+ "--ratio",
1027
  type=str,
1028
+ help="Value for blender ratio",
1029
+ choices=[str(i / 10) for i in range(11)],
1030
+ default="0.5",
1031
  )
1032
 
1033
  # Parser for 'tensorboard' mode
 
1036
  # Parser for 'download' mode
1037
  download_parser = subparsers.add_parser("download", help="Download models")
1038
  download_parser.add_argument(
1039
+ "--model_link",
1040
+ type=str,
1041
+ help="Link of the model",
1042
+ )
1043
+
1044
+ # Parser for 'prerequisites' mode
1045
+ prerequisites_parser = subparsers.add_parser(
1046
+ "prerequisites", help="Install prerequisites"
1047
+ )
1048
+ prerequisites_parser.add_argument(
1049
+ "--pretraineds_v1",
1050
  type=str,
1051
+ choices=["True", "False"],
1052
+ default="True",
1053
+ help="Download pretrained models for v1",
1054
+ )
1055
+ prerequisites_parser.add_argument(
1056
+ "--pretraineds_v2",
1057
+ type=str,
1058
+ choices=["True", "False"],
1059
+ default="True",
1060
+ help="Download pretrained models for v2",
1061
+ )
1062
+ prerequisites_parser.add_argument(
1063
+ "--models",
1064
+ type=str,
1065
+ choices=["True", "False"],
1066
+ default="True",
1067
+ help="Donwload models",
1068
+ )
1069
+ prerequisites_parser.add_argument(
1070
+ "--exe",
1071
+ type=str,
1072
+ choices=["True", "False"],
1073
+ default="True",
1074
+ help="Download executables",
1075
+ )
1076
+
1077
+ # Parser for 'audio_analyzer' mode
1078
+ audio_analyzer = subparsers.add_parser("audio_analyzer", help="Run audio analyzer")
1079
+ audio_analyzer.add_argument(
1080
+ "--input_path",
1081
+ type=str,
1082
+ help="Path to the input audio file",
1083
  )
1084
 
1085
+ # Parser for 'api' mode
1086
+ api_parser = subparsers.add_parser("api", help="Run the API")
1087
+ api_parser.add_argument("--ip", type=str, help="IP address", default="127.0.0.1")
1088
+ api_parser.add_argument("--port", type=str, help="Port", default="8000")
1089
+
1090
  return parser.parse_args()
1091
 
1092
 
 
1100
  try:
1101
  if args.mode == "infer":
1102
  run_infer_script(
1103
+ str(args.f0up_key),
1104
+ str(args.filter_radius),
1105
+ str(args.index_rate),
1106
+ str(args.rms_mix_rate),
1107
+ str(args.protect),
1108
+ str(args.hop_length),
1109
+ str(args.f0method),
1110
+ str(args.input_path),
1111
+ str(args.output_path),
1112
+ str(args.pth_path),
1113
+ str(args.index_path),
1114
+ str(args.split_audio),
1115
+ str(args.f0autotune),
1116
+ str(args.clean_audio),
1117
+ str(args.clean_strength),
1118
+ str(args.export_format),
1119
  )
1120
  elif args.mode == "batch_infer":
1121
  run_batch_infer_script(
1122
+ str(args.f0up_key),
1123
+ str(args.filter_radius),
1124
+ str(args.index_rate),
1125
+ str(args.rms_mix_rate),
1126
+ str(args.protect),
1127
+ str(args.hop_length),
1128
+ str(args.f0method),
1129
+ str(args.input_folder),
1130
+ str(args.output_folder),
1131
+ str(args.pth_path),
1132
+ str(args.index_path),
1133
+ str(args.split_audio),
1134
+ str(args.f0autotune),
1135
+ str(args.clean_audio),
1136
+ str(args.clean_strength),
1137
+ str(args.export_format),
1138
  )
1139
  elif args.mode == "tts":
1140
  run_tts_script(
1141
+ str(args.tts_text),
1142
+ str(args.tts_voice),
1143
+ str(args.f0up_key),
1144
+ str(args.filter_radius),
1145
+ str(args.index_rate),
1146
+ str(args.rms_mix_rate),
1147
+ str(args.protect),
1148
+ str(args.hop_length),
1149
+ str(args.f0method),
1150
+ str(args.output_tts_path),
1151
+ str(args.output_rvc_path),
1152
+ str(args.pth_path),
1153
+ str(args.index_path),
1154
+ str(args.split_audio),
1155
+ str(args.f0autotune),
1156
+ str(args.clean_audio),
1157
+ str(args.clean_strength),
1158
+ str(args.export_format),
1159
  )
1160
  elif args.mode == "preprocess":
1161
  run_preprocess_script(
1162
+ str(args.model_name),
1163
+ str(args.dataset_path),
1164
  str(args.sampling_rate),
1165
  )
 
1166
  elif args.mode == "extract":
1167
  run_extract_script(
1168
+ str(args.model_name),
1169
+ str(args.rvc_version),
1170
+ str(args.f0method),
1171
+ str(args.hop_length),
1172
+ str(args.sampling_rate),
1173
  )
1174
  elif args.mode == "train":
1175
  run_train_script(
1176
+ str(args.model_name),
1177
+ str(args.rvc_version),
1178
+ str(args.save_every_epoch),
1179
+ str(args.save_only_latest),
1180
+ str(args.save_every_weights),
1181
+ str(args.total_epoch),
1182
+ str(args.sampling_rate),
1183
+ str(args.batch_size),
1184
+ str(args.gpu),
1185
+ str(args.pitch_guidance),
1186
+ str(args.pretrained),
1187
+ str(args.custom_pretrained),
1188
+ str(args.g_pretrained_path),
1189
+ str(args.d_pretrained_path),
1190
+ str(args.overtraining_detector),
1191
+ str(args.overtraining_threshold),
1192
  )
1193
  elif args.mode == "index":
1194
  run_index_script(
1195
+ str(args.model_name),
1196
+ str(args.rvc_version),
1197
+ )
1198
+ elif args.mode == "model_extract":
1199
+ run_model_extract_script(
1200
+ str(args.pth_path),
1201
+ str(args.model_name),
1202
+ str(args.sampling_rate),
1203
+ str(args.pitch_guidance),
1204
+ str(args.rvc_version),
1205
+ str(args.epoch),
1206
+ str(args.step),
1207
  )
1208
  elif args.mode == "model_information":
1209
  run_model_information_script(
1210
+ str(args.pth_path),
1211
  )
1212
+ elif args.mode == "model_blender":
1213
+ run_model_blender_script(
1214
+ str(args.model_name),
1215
+ str(args.pth_path_1),
1216
+ str(args.pth_path_2),
1217
+ str(args.ratio),
1218
  )
1219
  elif args.mode == "tensorboard":
1220
  run_tensorboard_script()
1221
  elif args.mode == "download":
1222
  run_download_script(
1223
+ str(args.model_link),
1224
+ )
1225
+ elif args.mode == "prerequisites":
1226
+ run_prerequisites_script(
1227
+ str(args.pretraineds_v1),
1228
+ str(args.pretraineds_v2),
1229
+ str(args.models),
1230
+ str(args.exe),
1231
+ )
1232
+ elif args.mode == "audio_analyzer":
1233
+ run_audio_analyzer_script(
1234
+ str(args.input_path),
1235
+ )
1236
+ elif args.mode == "api":
1237
+ run_api_script(
1238
+ str(args.ip),
1239
+ str(args.port),
1240
  )
1241
  except Exception as error:
1242
  print(f"Error: {error}")
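With the positional arguments replaced by flags, invocations of core.py now look roughly like this (file names and the model link are placeholders):

    import subprocess

    subprocess.run([
        "python", "core.py", "infer",
        "--f0up_key", "0", "--f0method", "rmvpe",
        "--input_path", "input.wav", "--output_path", "output.wav",
        "--pth_path", "model.pth", "--index_path", "model.index",
    ])
    subprocess.run(["python", "core.py", "download", "--model_link", "https://example.com/model.zip"])
    subprocess.run(["python", "core.py", "api", "--ip", "127.0.0.1", "--port", "8000"])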
rvc/configs/config.py CHANGED
@@ -1,10 +1,6 @@
-import argparse
-import os
-import sys
-import json
-from multiprocessing import cpu_count
-
 import torch

 version_config_list = [
     "v1/32000.json",
@@ -64,6 +60,9 @@ class Config:
         return False

     def use_fp32_config(self):
         for config_file in version_config_list:
             self.json_config[config_file]["train"]["fp16_run"] = False
             with open(f"rvc/configs/{config_file}", "r") as f:
@@ -116,7 +115,7 @@ class Config:
         self.use_fp32_config()

         if self.n_cpu == 0:
-            self.n_cpu = cpu_count()

         if self.is_half:
             x_pad = 3
 import torch
+import json
+import os

 version_config_list = [
     "v1/32000.json",

         return False

     def use_fp32_config(self):
+        print(
+            f"Using FP32 config instead of FP16 due to GPU compatibility ({self.gpu_name})"
+        )
         for config_file in version_config_list:
             self.json_config[config_file]["train"]["fp16_run"] = False
             with open(f"rvc/configs/{config_file}", "r") as f:

         self.use_fp32_config()

         if self.n_cpu == 0:
+            self.n_cpu = os.cpu_count()

         if self.is_half:
rvc/infer/infer.py CHANGED
@@ -1,9 +1,19 @@
1
  import os
2
  import sys
 
3
  import torch
 
 
4
  import numpy as np
5
  import soundfile as sf
6
- from vc_infer_pipeline import VC
 
 
7
  from rvc.lib.utils import load_audio
8
  from rvc.lib.tools.split_audio import process_audio, merge_audio
9
  from fairseq import checkpoint_utils
@@ -13,13 +23,19 @@ from rvc.lib.infer_pack.models import (
13
  SynthesizerTrnMs768NSFsid,
14
  SynthesizerTrnMs768NSFsid_nono,
15
  )
16
-
17
  from rvc.configs.config import Config
18
 
19
- config = Config()
 
20
 
21
- torch.manual_seed(114514)
22
  hubert_model = None
 
23
 
24
 
25
  def load_hubert():
@@ -37,6 +53,44 @@ def load_hubert():
37
  hubert_model.eval()
38
 
39
 
 
 
40
  def vc_single(
41
  sid=0,
42
  input_audio_path=None,
@@ -46,17 +100,16 @@ def vc_single(
46
  file_index=None,
47
  index_rate=None,
48
  resample_sr=0,
49
- rms_mix_rate=1,
50
- protect=0.33,
51
  hop_length=None,
52
  output_path=None,
53
  split_audio=False,
 
 
54
  ):
55
  global tgt_sr, net_g, vc, hubert_model, version
56
 
57
- if input_audio_path is None:
58
- return "Please, load an audio!", None
59
-
60
  f0_up_key = int(f0_up_key)
61
  try:
62
  audio = load_audio(input_audio_path, 16000)
@@ -95,7 +148,7 @@ def vc_single(
95
  ]
96
  try:
97
  for path in paths:
98
- info, opt = vc_single(
99
  sid,
100
  path,
101
  f0_up_key,
@@ -109,17 +162,18 @@ def vc_single(
109
  hop_length,
110
  path,
111
  False,
 
112
  )
113
- # new_dir_path
114
  except Exception as error:
115
  print(error)
116
- return "Error", None
117
  print("Finished processing segmented audio, now merging audio...")
118
  merge_timestamps_file = os.path.join(
119
  os.path.dirname(new_dir_path),
120
  f"{os.path.basename(input_audio_path).split('.')[0]}_timestamps.txt",
121
  )
122
  tgt_sr, audio_opt = merge_audio(merge_timestamps_file)
 
123
 
124
  else:
125
  audio_opt = vc.pipeline(
@@ -140,9 +194,9 @@ def vc_single(
140
  version,
141
  protect,
142
  hop_length,
 
143
  f0_file=f0_file,
144
  )
145
-
146
  if output_path is not None:
147
  sf.write(output_path, audio_opt, tgt_sr, format="WAV")
148
 
@@ -158,7 +212,7 @@ def get_vc(weight_root, sid):
158
  global hubert_model
159
  if hubert_model is not None:
160
  print("clean_empty_cache")
161
- del net_g, n_spk, vc, hubert_model, tgt_sr # ,cpt
162
  hubert_model = net_g = n_spk = vc = hubert_model = tgt_sr = None
163
  if torch.cuda.is_available():
164
  torch.cuda.empty_cache()
@@ -211,55 +265,64 @@ def get_vc(weight_root, sid):
211
  n_spk = cpt["config"][-3]
212
 
213
 
214
- f0up_key = sys.argv[1]
215
- filter_radius = sys.argv[2]
216
- index_rate = float(sys.argv[3])
217
- hop_length = sys.argv[4]
218
- f0method = sys.argv[5]
219
-
220
- audio_input_path = sys.argv[6]
221
- audio_output_path = sys.argv[7]
222
-
223
- model_path = sys.argv[8]
224
- index_path = sys.argv[9]
225
-
226
- try:
227
- split_audio = sys.argv[10]
228
- except IndexError:
229
- split_audio = None
230
-
231
- sid = f0up_key
232
- input_audio = audio_input_path
233
- f0_pitch = f0up_key
234
- f0_file = None
235
- f0_method = f0method
236
- file_index = index_path
237
- index_rate = index_rate
238
- output_file = audio_output_path
239
- split_audio = split_audio
240
-
241
- get_vc(model_path, 0)
242
-
243
- try:
244
- result, audio_opt = vc_single(
245
- sid=0,
246
- input_audio_path=input_audio,
247
- f0_up_key=f0_pitch,
248
- f0_file=None,
249
- f0_method=f0_method,
250
- file_index=file_index,
251
- index_rate=index_rate,
252
- hop_length=hop_length,
253
- output_path=output_file,
254
- split_audio=split_audio,
255
- )
256
 
257
- if os.path.exists(output_file) and os.path.getsize(output_file) > 0:
258
- message = result
259
- else:
260
- message = result
 
 
 
 
261
 
262
- print(f"Conversion completed. Output file: '{output_file}'")
 
 
 
 
 
263
 
264
- except Exception as error:
265
- print(f"Voice conversion failed: {error}")
 
 
 
 
 
 
 
1
  import os
2
  import sys
3
+ import time
4
  import torch
5
+ import logging
6
+
7
  import numpy as np
8
  import soundfile as sf
9
+ import librosa
10
+
11
+ now_dir = os.getcwd()
12
+ sys.path.append(now_dir)
13
+
14
+ from rvc.infer.pipeline import VC
15
+ from scipy.io import wavfile
16
+ import noisereduce as nr
17
  from rvc.lib.utils import load_audio
18
  from rvc.lib.tools.split_audio import process_audio, merge_audio
19
  from fairseq import checkpoint_utils
 
23
  SynthesizerTrnMs768NSFsid,
24
  SynthesizerTrnMs768NSFsid_nono,
25
  )
 
26
  from rvc.configs.config import Config
27
 
28
+ logging.getLogger("fairseq").setLevel(logging.WARNING)
29
+ logging.getLogger("httpx").setLevel(logging.WARNING)
30
 
31
+ config = Config()
32
  hubert_model = None
33
+ tgt_sr = None
34
+ net_g = None
35
+ vc = None
36
+ cpt = None
37
+ version = None
38
+ n_spk = None
39
 
40
 
41
  def load_hubert():
 
53
  hubert_model.eval()
54
 
55
 
56
+ def remove_audio_noise(input_audio_path, reduction_strength=0.7):
57
+ try:
58
+ rate, data = wavfile.read(input_audio_path)
59
+ reduced_noise = nr.reduce_noise(
60
+ y=data,
61
+ sr=rate,
62
+ prop_decrease=reduction_strength,
63
+ )
64
+ return reduced_noise
65
+ except Exception as error:
66
+ print(f"Error cleaning audio: {error}")
67
+ return None
68
+
69
+
70
+ def convert_audio_format(input_path, output_path, output_format):
71
+ try:
72
+ if output_format != "WAV":
73
+ print(f"Converting audio to {output_format} format...")
74
+ audio, sample_rate = librosa.load(input_path, sr=None)
75
+ common_sample_rates = [
76
+ 8000,
77
+ 11025,
78
+ 12000,
79
+ 16000,
80
+ 22050,
81
+ 24000,
82
+ 32000,
83
+ 44100,
84
+ 48000,
85
+ ]
86
+ target_sr = min(common_sample_rates, key=lambda x: abs(x - sample_rate))
87
+ audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=target_sr)
88
+ sf.write(output_path, audio, target_sr, format=output_format.lower())
89
+ return output_path
90
+ except Exception as error:
91
+ print(f"Failed to convert audio to {output_format} format: {error}")
92
+
93
+
94
  def vc_single(
95
  sid=0,
96
  input_audio_path=None,
 
100
  file_index=None,
101
  index_rate=None,
102
  resample_sr=0,
103
+ rms_mix_rate=None,
104
+ protect=None,
105
  hop_length=None,
106
  output_path=None,
107
  split_audio=False,
108
+ f0autotune=False,
109
+ filter_radius=None,
110
  ):
111
  global tgt_sr, net_g, vc, hubert_model, version
112
 
 
 
 
113
  f0_up_key = int(f0_up_key)
114
  try:
115
  audio = load_audio(input_audio_path, 16000)
 
148
  ]
149
  try:
150
  for path in paths:
151
+ vc_single(
152
  sid,
153
  path,
154
  f0_up_key,
 
162
  hop_length,
163
  path,
164
  False,
165
+ f0autotune,
166
  )
 
167
  except Exception as error:
168
  print(error)
169
+ return f"Error {error}"
170
  print("Finished processing segmented audio, now merging audio...")
171
  merge_timestamps_file = os.path.join(
172
  os.path.dirname(new_dir_path),
173
  f"{os.path.basename(input_audio_path).split('.')[0]}_timestamps.txt",
174
  )
175
  tgt_sr, audio_opt = merge_audio(merge_timestamps_file)
176
+ os.remove(merge_timestamps_file)
177
 
178
  else:
179
  audio_opt = vc.pipeline(
 
194
  version,
195
  protect,
196
  hop_length,
197
+ f0autotune,
198
  f0_file=f0_file,
199
  )
 
200
  if output_path is not None:
201
  sf.write(output_path, audio_opt, tgt_sr, format="WAV")
202
 
 
212
  global hubert_model
213
  if hubert_model is not None:
214
  print("clean_empty_cache")
215
+ del net_g, n_spk, vc, hubert_model, tgt_sr
216
  hubert_model = net_g = n_spk = vc = hubert_model = tgt_sr = None
217
  if torch.cuda.is_available():
218
  torch.cuda.empty_cache()
 
265
  n_spk = cpt["config"][-3]
266
 
267
 
268
+ def infer_pipeline(
269
+ f0up_key,
270
+ filter_radius,
271
+ index_rate,
272
+ rms_mix_rate,
273
+ protect,
274
+ hop_length,
275
+ f0method,
276
+ audio_input_path,
277
+ audio_output_path,
278
+ model_path,
279
+ index_path,
280
+ split_audio,
281
+ f0autotune,
282
+ clean_audio,
283
+ clean_strength,
284
+ export_format,
285
+ ):
286
+ global tgt_sr, net_g, vc, cpt
 
 
 
 
 
287
 
288
+ get_vc(model_path, 0)
289
+
290
+ try:
291
+ start_time = time.time()
292
+ vc_single(
293
+ sid=0,
294
+ input_audio_path=audio_input_path,
295
+ f0_up_key=f0up_key,
296
+ f0_file=None,
297
+ f0_method=f0method,
298
+ file_index=index_path,
299
+ index_rate=index_rate,
300
+ rms_mix_rate=rms_mix_rate,
301
+ protect=protect,
302
+ hop_length=hop_length,
303
+ output_path=audio_output_path,
304
+ split_audio=split_audio,
305
+ f0autotune=f0autotune,
306
+ filter_radius=filter_radius,
307
+ )
308
+
309
+ if clean_audio == "True":
310
+ cleaned_audio = remove_audio_noise(audio_output_path, clean_strength)
311
+ if cleaned_audio is not None:
312
+ sf.write(audio_output_path, cleaned_audio, tgt_sr, format="WAV")
313
 
314
+ output_path_format = audio_output_path.replace(
315
+ ".wav", f".{export_format.lower()}"
316
+ )
317
+ audio_output_path = convert_audio_format(
318
+ audio_output_path, output_path_format, export_format
319
+ )
320
 
321
+ end_time = time.time()
322
+ elapsed_time = end_time - start_time
323
+ print(
324
+ f"Conversion completed. Output file: '{audio_output_path}' in {elapsed_time:.2f} seconds."
325
+ )
326
+
327
+ except Exception as error:
328
+ print(f"Voice conversion failed: {error}")
rvc/infer/pipeline.py ADDED
@@ -0,0 +1,625 @@
 
 
 
 
1
+ import numpy as np, parselmouth, torch, pdb, sys, os
2
+ from time import time as ttime
3
+ import torch.nn.functional as F
4
+ import torchcrepe
5
+ from torch import Tensor
6
+ import scipy.signal as signal
7
+ import pyworld, os, faiss, librosa, torchcrepe
8
+ from scipy import signal
9
+ from functools import lru_cache
10
+ import random
11
+ import gc
12
+ import re
13
+
14
+ now_dir = os.getcwd()
15
+ sys.path.append(now_dir)
16
+
17
+ from rvc.lib.FCPEF0Predictor import FCPEF0Predictor
18
+
19
+ bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000)
20
+
21
+ input_audio_path2wav = {}
22
+
23
+
24
+ @lru_cache
25
+ def cache_harvest_f0(input_audio_path, fs, f0max, f0min, frame_period):
26
+ audio = input_audio_path2wav[input_audio_path]
27
+ f0, t = pyworld.harvest(
28
+ audio,
29
+ fs=fs,
30
+ f0_ceil=f0max,
31
+ f0_floor=f0min,
32
+ frame_period=frame_period,
33
+ )
34
+ f0 = pyworld.stonemask(audio, f0, t, fs)
35
+ return f0
36
+
37
+
38
+ def change_rms(data1, sr1, data2, sr2, rate):
39
+ # print(data1.max(),data2.max())
40
+ rms1 = librosa.feature.rms(y=data1, frame_length=sr1 // 2 * 2, hop_length=sr1 // 2)
41
+ rms2 = librosa.feature.rms(y=data2, frame_length=sr2 // 2 * 2, hop_length=sr2 // 2)
42
+
43
+ rms1 = torch.from_numpy(rms1)
44
+ rms1 = F.interpolate(
45
+ rms1.unsqueeze(0), size=data2.shape[0], mode="linear"
46
+ ).squeeze()
47
+
48
+ rms2 = torch.from_numpy(rms2)
49
+ rms2 = F.interpolate(
50
+ rms2.unsqueeze(0), size=data2.shape[0], mode="linear"
51
+ ).squeeze()
52
+ rms2 = torch.max(rms2, torch.zeros_like(rms2) + 1e-6)
53
+
54
+ data2 *= (
55
+ torch.pow(rms1, torch.tensor(1 - rate))
56
+ * torch.pow(rms2, torch.tensor(rate - 1))
57
+ ).numpy()
58
+ return data2
59
+
60
+
61
+ class VC(object):
62
+ def __init__(self, tgt_sr, config):
63
+ self.x_pad, self.x_query, self.x_center, self.x_max, self.is_half = (
64
+ config.x_pad,
65
+ config.x_query,
66
+ config.x_center,
67
+ config.x_max,
68
+ config.is_half,
69
+ )
70
+ self.sr = 16000
71
+ self.window = 160
72
+ self.t_pad = self.sr * self.x_pad
73
+ self.t_pad_tgt = tgt_sr * self.x_pad
74
+ self.t_pad2 = self.t_pad * 2
75
+ self.t_query = self.sr * self.x_query
76
+ self.t_center = self.sr * self.x_center
77
+ self.t_max = self.sr * self.x_max
78
+ self.device = config.device
79
+ self.ref_freqs = [
80
+ 65.41,
81
+ 82.41,
82
+ 110.00,
83
+ 146.83,
84
+ 196.00,
85
+ 246.94,
86
+ 329.63,
87
+ 440.00,
88
+ 587.33,
89
+ 783.99,
90
+ 1046.50,
91
+ ]
92
+ # Generate interpolated frequencies
93
+ self.note_dict = self.generate_interpolated_frequencies()
94
+
95
+ def generate_interpolated_frequencies(self):
96
+ # Generate interpolated frequencies based on the reference frequencies.
97
+ note_dict = []
98
+ for i in range(len(self.ref_freqs) - 1):
99
+ freq_low = self.ref_freqs[i]
100
+ freq_high = self.ref_freqs[i + 1]
101
+ # Interpolate between adjacent reference frequencies
102
+ interpolated_freqs = np.linspace(
103
+ freq_low, freq_high, num=10, endpoint=False
104
+ )
105
+ note_dict.extend(interpolated_freqs)
106
+ # Add the last reference frequency
107
+ note_dict.append(self.ref_freqs[-1])
108
+ return note_dict
109
+
110
+ def autotune_f0(self, f0):
111
+ # Autotunes the given fundamental frequency (f0) to the nearest musical note.
112
+ autotuned_f0 = np.zeros_like(f0)
113
+ for i, freq in enumerate(f0):
114
+ # Find the closest note
115
+ closest_note = min(self.note_dict, key=lambda x: abs(x - freq))
116
+ autotuned_f0[i] = closest_note
117
+ return autotuned_f0
118
+
119
+ def get_optimal_torch_device(self, index: int = 0) -> torch.device:
120
+ if torch.cuda.is_available():
121
+ return torch.device(f"cuda:{index % torch.cuda.device_count()}")
122
+ elif torch.backends.mps.is_available():
123
+ return torch.device("mps")
124
+ return torch.device("cpu")
125
+
126
+ def get_f0_crepe_computation(
127
+ self,
128
+ x,
129
+ f0_min,
130
+ f0_max,
131
+ p_len,
132
+ hop_length,
133
+ model="full",
134
+ ):
135
+ x = x.astype(np.float32)
136
+ x /= np.quantile(np.abs(x), 0.999)
137
+ torch_device = self.get_optimal_torch_device()
138
+ audio = torch.from_numpy(x).to(torch_device, copy=True)
139
+ audio = torch.unsqueeze(audio, dim=0)
140
+ if audio.ndim == 2 and audio.shape[0] > 1:
141
+ audio = torch.mean(audio, dim=0, keepdim=True).detach()
142
+ audio = audio.detach()
143
+ pitch: Tensor = torchcrepe.predict(
144
+ audio,
145
+ self.sr,
146
+ hop_length,
147
+ f0_min,
148
+ f0_max,
149
+ model,
150
+ batch_size=hop_length * 2,
151
+ device=torch_device,
152
+ pad=True,
153
+ )
154
+ p_len = p_len or x.shape[0] // hop_length
155
+ source = np.array(pitch.squeeze(0).cpu().float().numpy())
156
+ source[source < 0.001] = np.nan
157
+ target = np.interp(
158
+ np.arange(0, len(source) * p_len, len(source)) / p_len,
159
+ np.arange(0, len(source)),
160
+ source,
161
+ )
162
+ f0 = np.nan_to_num(target)
163
+ return f0
164
+
165
+ def get_f0_official_crepe_computation(
166
+ self,
167
+ x,
168
+ f0_min,
169
+ f0_max,
170
+ model="full",
171
+ ):
172
+ batch_size = 512
173
+ audio = torch.tensor(np.copy(x))[None].float()
174
+ f0, pd = torchcrepe.predict(
175
+ audio,
176
+ self.sr,
177
+ self.window,
178
+ f0_min,
179
+ f0_max,
180
+ model,
181
+ batch_size=batch_size,
182
+ device=self.device,
183
+ return_periodicity=True,
184
+ )
185
+ pd = torchcrepe.filter.median(pd, 3)
186
+ f0 = torchcrepe.filter.mean(f0, 3)
187
+ f0[pd < 0.1] = 0
188
+ f0 = f0[0].cpu().numpy()
189
+ return f0
190
+
191
+ def get_f0_hybrid_computation(
192
+ self,
193
+ methods_str,
194
+ x,
195
+ f0_min,
196
+ f0_max,
197
+ p_len,
198
+ hop_length,
199
+ ):
200
+ methods_str = re.search("hybrid\[(.+)\]", methods_str)
201
+ if methods_str:
202
+ methods = [method.strip() for method in methods_str.group(1).split("+")]
203
+ f0_computation_stack = []
204
+ print(f"Calculating f0 pitch estimations for methods {str(methods)}")
205
+ x = x.astype(np.float32)
206
+ x /= np.quantile(np.abs(x), 0.999)
207
+ for method in methods:
208
+ f0 = None
209
+ if method == "crepe":
210
+ f0 = self.get_f0_crepe_computation(
211
+ x, f0_min, f0_max, p_len, int(hop_length)
212
+ )
213
+ elif method == "rmvpe":
214
+ if hasattr(self, "model_rmvpe") == False:
215
+ from rvc.lib.rmvpe import RMVPE
216
+
217
+ self.model_rmvpe = RMVPE(
218
+ "rmvpe.pt", is_half=self.is_half, device=self.device
219
+ )
220
+ f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)
221
+ f0 = f0[1:]
222
+ elif method == "fcpe":
223
+ self.model_fcpe = FCPEF0Predictor(
224
+ "fcpe.pt",
225
+ f0_min=int(f0_min),
226
+ f0_max=int(f0_max),
227
+ dtype=torch.float32,
228
+ device=self.device,
229
+ sampling_rate=self.sr,
230
+ threshold=0.03,
231
+ )
232
+ f0 = self.model_fcpe.compute_f0(x, p_len=p_len)
233
+ del self.model_fcpe
234
+ gc.collect()
235
+ f0_computation_stack.append(f0)
236
+
237
+ print(f"Calculating hybrid median f0 from the stack of {str(methods)}")
238
+ f0_computation_stack = [fc for fc in f0_computation_stack if fc is not None]
239
+ f0_median_hybrid = None
240
+ if len(f0_computation_stack) == 1:
241
+ f0_median_hybrid = f0_computation_stack[0]
242
+ else:
243
+ f0_median_hybrid = np.nanmedian(f0_computation_stack, axis=0)
244
+ return f0_median_hybrid
245
+
246
+ def get_f0(
247
+ self,
248
+ input_audio_path,
249
+ x,
250
+ p_len,
251
+ f0_up_key,
252
+ f0_method,
253
+ filter_radius,
254
+ hop_length,
255
+ f0autotune,
256
+ inp_f0=None,
257
+ ):
258
+ global input_audio_path2wav
259
+ time_step = self.window / self.sr * 1000
260
+ f0_min = 50
261
+ f0_max = 1100
262
+ f0_mel_min = 1127 * np.log(1 + f0_min / 700)
263
+ f0_mel_max = 1127 * np.log(1 + f0_max / 700)
264
+ if f0_method == "pm":
265
+ f0 = (
266
+ parselmouth.Sound(x, self.sr)
267
+ .to_pitch_ac(
268
+ time_step=time_step / 1000,
269
+ voicing_threshold=0.6,
270
+ pitch_floor=f0_min,
271
+ pitch_ceiling=f0_max,
272
+ )
273
+ .selected_array["frequency"]
274
+ )
275
+ pad_size = (p_len - len(f0) + 1) // 2
276
+ if pad_size > 0 or p_len - len(f0) - pad_size > 0:
277
+ f0 = np.pad(
278
+ f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant"
279
+ )
280
+ elif f0_method == "harvest":
281
+ input_audio_path2wav[input_audio_path] = x.astype(np.double)
282
+ f0 = cache_harvest_f0(input_audio_path, self.sr, f0_max, f0_min, 10)
283
+ if int(filter_radius) > 2:
284
+ f0 = signal.medfilt(f0, 3)
285
+ elif f0_method == "dio":
286
+ f0, t = pyworld.dio(
287
+ x.astype(np.double),
288
+ fs=self.sr,
289
+ f0_ceil=f0_max,
290
+ f0_floor=f0_min,
291
+ frame_period=10,
292
+ )
293
+ f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.sr)
294
+ f0 = signal.medfilt(f0, 3)
295
+ elif f0_method == "crepe":
296
+ f0 = self.get_f0_crepe_computation(
297
+ x, f0_min, f0_max, p_len, int(hop_length)
298
+ )
299
+ elif f0_method == "crepe-tiny":
300
+ f0 = self.get_f0_crepe_computation(
301
+ x, f0_min, f0_max, p_len, int(hop_length), "tiny"
302
+ )
303
+ elif f0_method == "rmvpe":
304
+ if hasattr(self, "model_rmvpe") == False:
305
+ from rvc.lib.rmvpe import RMVPE
306
+
307
+ self.model_rmvpe = RMVPE(
308
+ "rmvpe.pt", is_half=self.is_half, device=self.device
309
+ )
310
+ f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)
311
+ elif f0_method == "fcpe":
312
+ self.model_fcpe = FCPEF0Predictor(
313
+ "fcpe.pt",
314
+ f0_min=int(f0_min),
315
+ f0_max=int(f0_max),
316
+ dtype=torch.float32,
317
+ device=self.device,
318
+ sampling_rate=self.sr,
319
+ threshold=0.03,
320
+ )
321
+ f0 = self.model_fcpe.compute_f0(x, p_len=p_len)
322
+ del self.model_fcpe
323
+ gc.collect()
324
+ elif "hybrid" in f0_method:
325
+ input_audio_path2wav[input_audio_path] = x.astype(np.double)
326
+ f0 = self.get_f0_hybrid_computation(
327
+ f0_method,
328
+ x,
329
+ f0_min,
330
+ f0_max,
331
+ p_len,
332
+ hop_length,
333
+ )
334
+
335
+ if f0autotune == "True":
336
+ f0 = self.autotune_f0(f0)
337
+
338
+ f0 *= pow(2, f0_up_key / 12)
339
+ tf0 = self.sr // self.window
340
+ if inp_f0 is not None:
341
+ delta_t = np.round(
342
+ (inp_f0[:, 0].max() - inp_f0[:, 0].min()) * tf0 + 1
343
+ ).astype("int16")
344
+ replace_f0 = np.interp(
345
+ list(range(delta_t)), inp_f0[:, 0] * 100, inp_f0[:, 1]
346
+ )
347
+ shape = f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)].shape[0]
348
+ f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)] = replace_f0[
349
+ :shape
350
+ ]
351
+ f0bak = f0.copy()
352
+ f0_mel = 1127 * np.log(1 + f0 / 700)
353
+ f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
354
+ f0_mel_max - f0_mel_min
355
+ ) + 1
356
+ f0_mel[f0_mel <= 1] = 1
357
+ f0_mel[f0_mel > 255] = 255
358
+ f0_coarse = np.rint(f0_mel).astype(np.int64)  # np.int alias was removed in NumPy >= 1.24
359
+
360
+ return f0_coarse, f0bak
361
+
362
+ def vc(
363
+ self,
364
+ model,
365
+ net_g,
366
+ sid,
367
+ audio0,
368
+ pitch,
369
+ pitchf,
370
+ index,
371
+ big_npy,
372
+ index_rate,
373
+ version,
374
+ protect,
375
+ ):
376
+ feats = torch.from_numpy(audio0)
377
+ if self.is_half:
378
+ feats = feats.half()
379
+ else:
380
+ feats = feats.float()
381
+ if feats.dim() == 2:
382
+ feats = feats.mean(-1)
383
+ assert feats.dim() == 1, feats.dim()
384
+ feats = feats.view(1, -1)
385
+ padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False)
386
+
387
+ inputs = {
388
+ "source": feats.to(self.device),
389
+ "padding_mask": padding_mask,
390
+ "output_layer": 9 if version == "v1" else 12,
391
+ }
392
+ t0 = ttime()
393
+ with torch.no_grad():
394
+ logits = model.extract_features(**inputs)
395
+ feats = model.final_proj(logits[0]) if version == "v1" else logits[0]
396
+ if protect < 0.5 and pitch != None and pitchf != None:
397
+ feats0 = feats.clone()
398
+ if (
399
+ isinstance(index, type(None)) == False
400
+ and isinstance(big_npy, type(None)) == False
401
+ and index_rate != 0
402
+ ):
403
+ npy = feats[0].cpu().numpy()
404
+ if self.is_half:
405
+ npy = npy.astype("float32")
406
+
407
+ score, ix = index.search(npy, k=8)
408
+ weight = np.square(1 / score)
409
+ weight /= weight.sum(axis=1, keepdims=True)
410
+ npy = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)
411
+
412
+ if self.is_half:
413
+ npy = npy.astype("float16")
414
+ feats = (
415
+ torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate
416
+ + (1 - index_rate) * feats
417
+ )
418
+
419
+ feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
420
+ if protect < 0.5 and pitch != None and pitchf != None:
421
+ feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute(
422
+ 0, 2, 1
423
+ )
424
+ t1 = ttime()
425
+ p_len = audio0.shape[0] // self.window
426
+ if feats.shape[1] < p_len:
427
+ p_len = feats.shape[1]
428
+ if pitch != None and pitchf != None:
429
+ pitch = pitch[:, :p_len]
430
+ pitchf = pitchf[:, :p_len]
431
+
432
+ if protect < 0.5 and pitch != None and pitchf != None:
433
+ pitchff = pitchf.clone()
434
+ pitchff[pitchf > 0] = 1
435
+ pitchff[pitchf < 1] = protect
436
+ pitchff = pitchff.unsqueeze(-1)
437
+ feats = feats * pitchff + feats0 * (1 - pitchff)
438
+ feats = feats.to(feats0.dtype)
439
+ p_len = torch.tensor([p_len], device=self.device).long()
440
+ with torch.no_grad():
441
+ if pitch != None and pitchf != None:
442
+ audio1 = (
443
+ (net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0])
444
+ .data.cpu()
445
+ .float()
446
+ .numpy()
447
+ )
448
+ else:
449
+ audio1 = (
450
+ (net_g.infer(feats, p_len, sid)[0][0, 0]).data.cpu().float().numpy()
451
+ )
452
+ del feats, p_len, padding_mask
453
+ if torch.cuda.is_available():
454
+ torch.cuda.empty_cache()
455
+ t2 = ttime()
456
+ return audio1
457
+
458
+ def pipeline(
459
+ self,
460
+ model,
461
+ net_g,
462
+ sid,
463
+ audio,
464
+ input_audio_path,
465
+ f0_up_key,
466
+ f0_method,
467
+ file_index,
468
+ index_rate,
469
+ if_f0,
470
+ filter_radius,
471
+ tgt_sr,
472
+ resample_sr,
473
+ rms_mix_rate,
474
+ version,
475
+ protect,
476
+ hop_length,
477
+ f0autotune,
478
+ f0_file=None,
479
+ ):
480
+ if file_index != "" and os.path.exists(file_index) == True and index_rate != 0:
481
+ try:
482
+ index = faiss.read_index(file_index)
483
+ big_npy = index.reconstruct_n(0, index.ntotal)
484
+ except Exception as error:
485
+ print(error)
486
+ index = big_npy = None
487
+ else:
488
+ index = big_npy = None
489
+ audio = signal.filtfilt(bh, ah, audio)
490
+ audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect")
491
+ opt_ts = []
492
+ if audio_pad.shape[0] > self.t_max:
493
+ audio_sum = np.zeros_like(audio)
494
+ for i in range(self.window):
495
+ audio_sum += audio_pad[i : i - self.window]
496
+ for t in range(self.t_center, audio.shape[0], self.t_center):
497
+ opt_ts.append(
498
+ t
499
+ - self.t_query
500
+ + np.where(
501
+ np.abs(audio_sum[t - self.t_query : t + self.t_query])
502
+ == np.abs(audio_sum[t - self.t_query : t + self.t_query]).min()
503
+ )[0][0]
504
+ )
505
+ s = 0
506
+ audio_opt = []
507
+ t = None
508
+ t1 = ttime()
509
+ audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect")
510
+ p_len = audio_pad.shape[0] // self.window
511
+ inp_f0 = None
512
+ if hasattr(f0_file, "name") == True:
513
+ try:
514
+ with open(f0_file.name, "r") as f:
515
+ lines = f.read().strip("\n").split("\n")
516
+ inp_f0 = []
517
+ for line in lines:
518
+ inp_f0.append([float(i) for i in line.split(",")])
519
+ inp_f0 = np.array(inp_f0, dtype="float32")
520
+ except Exception as error:
521
+ print(error)
522
+ sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
523
+ pitch, pitchf = None, None
524
+ if if_f0 == 1:
525
+ pitch, pitchf = self.get_f0(
526
+ input_audio_path,
527
+ audio_pad,
528
+ p_len,
529
+ f0_up_key,
530
+ f0_method,
531
+ filter_radius,
532
+ hop_length,
533
+ f0autotune,
534
+ inp_f0,
535
+ )
536
+ pitch = pitch[:p_len]
537
+ pitchf = pitchf[:p_len]
538
+ if self.device == "mps":
539
+ pitchf = pitchf.astype(np.float32)
540
+ pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()
541
+ pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float()
542
+ t2 = ttime()
543
+ for t in opt_ts:
544
+ t = t // self.window * self.window
545
+ if if_f0 == 1:
546
+ audio_opt.append(
547
+ self.vc(
548
+ model,
549
+ net_g,
550
+ sid,
551
+ audio_pad[s : t + self.t_pad2 + self.window],
552
+ pitch[:, s // self.window : (t + self.t_pad2) // self.window],
553
+ pitchf[:, s // self.window : (t + self.t_pad2) // self.window],
554
+ index,
555
+ big_npy,
556
+ index_rate,
557
+ version,
558
+ protect,
559
+ )[self.t_pad_tgt : -self.t_pad_tgt]
560
+ )
561
+ else:
562
+ audio_opt.append(
563
+ self.vc(
564
+ model,
565
+ net_g,
566
+ sid,
567
+ audio_pad[s : t + self.t_pad2 + self.window],
568
+ None,
569
+ None,
570
+ index,
571
+ big_npy,
572
+ index_rate,
573
+ version,
574
+ protect,
575
+ )[self.t_pad_tgt : -self.t_pad_tgt]
576
+ )
577
+ s = t
578
+ if if_f0 == 1:
579
+ audio_opt.append(
580
+ self.vc(
581
+ model,
582
+ net_g,
583
+ sid,
584
+ audio_pad[t:],
585
+ pitch[:, t // self.window :] if t is not None else pitch,
586
+ pitchf[:, t // self.window :] if t is not None else pitchf,
587
+ index,
588
+ big_npy,
589
+ index_rate,
590
+ version,
591
+ protect,
592
+ )[self.t_pad_tgt : -self.t_pad_tgt]
593
+ )
594
+ else:
595
+ audio_opt.append(
596
+ self.vc(
597
+ model,
598
+ net_g,
599
+ sid,
600
+ audio_pad[t:],
601
+ None,
602
+ None,
603
+ index,
604
+ big_npy,
605
+ index_rate,
606
+ version,
607
+ protect,
608
+ )[self.t_pad_tgt : -self.t_pad_tgt]
609
+ )
610
+ audio_opt = np.concatenate(audio_opt)
611
+ if rms_mix_rate != 1:
612
+ audio_opt = change_rms(audio, 16000, audio_opt, tgt_sr, rms_mix_rate)
613
+ if resample_sr >= 16000 and tgt_sr != resample_sr:
614
+ audio_opt = librosa.resample(
615
+ audio_opt, orig_sr=tgt_sr, target_sr=resample_sr
616
+ )
617
+ audio_max = np.abs(audio_opt).max() / 0.99
618
+ max_int16 = 32768
619
+ if audio_max > 1:
620
+ max_int16 /= audio_max
621
+ audio_opt = (audio_opt * max_int16).astype(np.int16)
622
+ del pitch, pitchf, sid
623
+ if torch.cuda.is_available():
624
+ torch.cuda.empty_cache()
625
+ return audio_opt
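As an aside, a small worked example of the coarse-pitch bucketing performed at the end of VC.get_f0 above: pitch in Hz is mapped to a mel-like scale, normalized between f0_min=50 and f0_max=1100, and rounded into buckets 1..255, with unvoiced frames collapsing into bucket 1. The printed values are approximate:

    import numpy as np

    f0_min, f0_max = 50, 1100
    f0_mel_min = 1127 * np.log(1 + f0_min / 700)
    f0_mel_max = 1127 * np.log(1 + f0_max / 700)

    f0 = np.array([0.0, 110.0, 440.0])  # silence, A2, A4
    f0_mel = 1127 * np.log(1 + f0 / 700)
    f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (f0_mel_max - f0_mel_min) + 1
    f0_mel[f0_mel <= 1] = 1
    f0_mel[f0_mel > 255] = 255
    f0_coarse = np.rint(f0_mel).astype(np.int64)  # int64 avoids the removed np.int alias
    print(f0_coarse)  # roughly [1, 23, 122]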
rvc/lib/FCPEF0Predictor.py ADDED
@@ -0,0 +1,1036 @@
 
 
 
 
1
+ from typing import Union
2
+
3
+ import torch.nn.functional as F
4
+ import numpy as np
5
+ import torch
6
+ import torch.nn as nn
7
+ from torch.nn.utils.parametrizations import weight_norm
8
+ from torchaudio.transforms import Resample
9
+ import os
10
+ import librosa
11
+ import soundfile as sf
12
+ import torch.utils.data
13
+ from librosa.filters import mel as librosa_mel_fn
14
+ import math
15
+ from functools import partial
16
+
17
+ from einops import rearrange, repeat
18
+ from local_attention import LocalAttention
19
+ from torch import nn
20
+
21
+ os.environ["LRU_CACHE_CAPACITY"] = "3"
22
+
23
+
24
+ def load_wav_to_torch(full_path, target_sr=None, return_empty_on_exception=False):
25
+ sampling_rate = None
26
+ try:
27
+ data, sampling_rate = sf.read(full_path, always_2d=True)  # read with soundfile
28
+ except Exception as error:
29
+ print(f"'{full_path}' failed to load with {error}")
30
+ if return_empty_on_exception:
31
+ return [], sampling_rate or target_sr or 48000
32
+ else:
33
+ raise Exception(error)
34
+
35
+ if len(data.shape) > 1:
36
+ data = data[:, 0]
37
+ assert (
38
+ len(data) > 2
39
+ ) # check duration of audio file is > 2 samples (because otherwise the slice operation was on the wrong dimension)
40
+
41
+ if np.issubdtype(data.dtype, np.integer): # if audio data is type int
42
+ max_mag = -np.iinfo(
43
+ data.dtype
44
+ ).min # maximum magnitude = min possible value of intXX
45
+ else: # if audio data is type fp32
46
+ max_mag = max(np.amax(data), -np.amin(data))
47
+ max_mag = (
48
+ (2**31) + 1
49
+ if max_mag > (2**15)
50
+ else ((2**15) + 1 if max_mag > 1.01 else 1.0)
51
+ ) # data should be either 16-bit INT, 32-bit INT or [-1 to 1] float32
52
+
53
+ data = torch.FloatTensor(data.astype(np.float32)) / max_mag
54
+
55
+ if (
56
+ torch.isinf(data) | torch.isnan(data)
57
+ ).any() and return_empty_on_exception: # resample will crash with inf/NaN inputs. return_empty_on_exception will return empty arr instead of except
58
+ return [], sampling_rate or target_sr or 48000
59
+ if target_sr is not None and sampling_rate != target_sr:
60
+ data = torch.from_numpy(
61
+ librosa.core.resample(
62
+ data.numpy(), orig_sr=sampling_rate, target_sr=target_sr
63
+ )
64
+ )
65
+ sampling_rate = target_sr
66
+
67
+ return data, sampling_rate
68
+
69
+
70
+ def dynamic_range_compression(x, C=1, clip_val=1e-5):
71
+ return np.log(np.clip(x, a_min=clip_val, a_max=None) * C)
72
+
73
+
74
+ def dynamic_range_decompression(x, C=1):
75
+ return np.exp(x) / C
76
+
77
+
78
+ def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
79
+ return torch.log(torch.clamp(x, min=clip_val) * C)
80
+
81
+
82
+ def dynamic_range_decompression_torch(x, C=1):
83
+ return torch.exp(x) / C
84
+
85
+
86
+ class STFT:
87
+ def __init__(
88
+ self,
89
+ sr=22050,
90
+ n_mels=80,
91
+ n_fft=1024,
92
+ win_size=1024,
93
+ hop_length=256,
94
+ fmin=20,
95
+ fmax=11025,
96
+ clip_val=1e-5,
97
+ ):
98
+ self.target_sr = sr
99
+
100
+ self.n_mels = n_mels
101
+ self.n_fft = n_fft
102
+ self.win_size = win_size
103
+ self.hop_length = hop_length
104
+ self.fmin = fmin
105
+ self.fmax = fmax
106
+ self.clip_val = clip_val
107
+ self.mel_basis = {}
108
+ self.hann_window = {}
109
+
110
+ def get_mel(self, y, keyshift=0, speed=1, center=False, train=False):
111
+ sampling_rate = self.target_sr
112
+ n_mels = self.n_mels
113
+ n_fft = self.n_fft
114
+ win_size = self.win_size
115
+ hop_length = self.hop_length
116
+ fmin = self.fmin
117
+ fmax = self.fmax
118
+ clip_val = self.clip_val
119
+
120
+ factor = 2 ** (keyshift / 12)
121
+ n_fft_new = int(np.round(n_fft * factor))
122
+ win_size_new = int(np.round(win_size * factor))
123
+ hop_length_new = int(np.round(hop_length * speed))
124
+ if not train:
125
+ mel_basis = self.mel_basis
126
+ hann_window = self.hann_window
127
+ else:
128
+ mel_basis = {}
129
+ hann_window = {}
130
+
131
+ mel_basis_key = str(fmax) + "_" + str(y.device)
132
+ if mel_basis_key not in mel_basis:
133
+ mel = librosa_mel_fn(
134
+ sr=sampling_rate, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax
135
+ )
136
+ mel_basis[mel_basis_key] = torch.from_numpy(mel).float().to(y.device)
137
+
138
+ keyshift_key = str(keyshift) + "_" + str(y.device)
139
+ if keyshift_key not in hann_window:
140
+ hann_window[keyshift_key] = torch.hann_window(win_size_new).to(y.device)
141
+
142
+ pad_left = (win_size_new - hop_length_new) // 2
143
+ pad_right = max(
144
+ (win_size_new - hop_length_new + 1) // 2,
145
+ win_size_new - y.size(-1) - pad_left,
146
+ )
147
+ if pad_right < y.size(-1):
148
+ mode = "reflect"
149
+ else:
150
+ mode = "constant"
151
+ y = torch.nn.functional.pad(y.unsqueeze(1), (pad_left, pad_right), mode=mode)
152
+ y = y.squeeze(1)
153
+
154
+ spec = torch.stft(
155
+ y,
156
+ n_fft_new,
157
+ hop_length=hop_length_new,
158
+ win_length=win_size_new,
159
+ window=hann_window[keyshift_key],
160
+ center=center,
161
+ pad_mode="reflect",
162
+ normalized=False,
163
+ onesided=True,
164
+ return_complex=True,
165
+ )
166
+ spec = torch.sqrt(spec.real.pow(2) + spec.imag.pow(2) + (1e-9))
167
+ if keyshift != 0:
168
+ size = n_fft // 2 + 1
169
+ resize = spec.size(1)
170
+ if resize < size:
171
+ spec = F.pad(spec, (0, 0, 0, size - resize))
172
+ spec = spec[:, :size, :] * win_size / win_size_new
173
+ spec = torch.matmul(mel_basis[mel_basis_key], spec)
174
+ spec = dynamic_range_compression_torch(spec, clip_val=clip_val)
175
+ return spec
176
+
177
+ def __call__(self, audiopath):
178
+ audio, sr = load_wav_to_torch(audiopath, target_sr=self.target_sr)
179
+ spect = self.get_mel(audio.unsqueeze(0)).squeeze(0)
180
+ return spect
181
+
182
+
183
+ stft = STFT()
184
+
185
+ # import fast_transformers.causal_product.causal_product_cuda
186
+
187
+
188
+ def softmax_kernel(
189
+ data, *, projection_matrix, is_query, normalize_data=True, eps=1e-4, device=None
190
+ ):
191
+ b, h, *_ = data.shape
192
+ # (batch size, head, length, model_dim)
193
+
194
+ # normalize model dim
195
+ data_normalizer = (data.shape[-1] ** -0.25) if normalize_data else 1.0
196
+
197
+ # ratio = 1 / sqrt(number of random features); projection_matrix.shape[0] --> 266
198
+
199
+ ratio = projection_matrix.shape[0] ** -0.5
200
+
201
+ projection = repeat(projection_matrix, "j d -> b h j d", b=b, h=h)
202
+ projection = projection.type_as(data)
203
+
204
+ # data_dash = w^T x
205
+ data_dash = torch.einsum("...id,...jd->...ij", (data_normalizer * data), projection)
206
+
207
+ # diag_data = D**2
208
+ diag_data = data**2
209
+ diag_data = torch.sum(diag_data, dim=-1)
210
+ diag_data = (diag_data / 2.0) * (data_normalizer**2)
211
+ diag_data = diag_data.unsqueeze(dim=-1)
212
+
213
+ if is_query:
214
+ data_dash = ratio * (
215
+ torch.exp(
216
+ data_dash
217
+ - diag_data
218
+ - torch.max(data_dash, dim=-1, keepdim=True).values
219
+ )
220
+ + eps
221
+ )
222
+ else:
223
+ data_dash = ratio * (
224
+ torch.exp(data_dash - diag_data + eps)
225
+ ) # - torch.max(data_dash)) + eps)
226
+
227
+ return data_dash.type_as(data)
228
+
229
+
230
+ def orthogonal_matrix_chunk(cols, qr_uniform_q=False, device=None):
231
+ unstructured_block = torch.randn((cols, cols), device=device)
232
+ q, r = torch.linalg.qr(unstructured_block.cpu(), mode="reduced")
233
+ q, r = map(lambda t: t.to(device), (q, r))
234
+
235
+ # proposed by @Parskatt
236
+ # to make sure Q is uniform https://arxiv.org/pdf/math-ph/0609050.pdf
237
+ if qr_uniform_q:
238
+ d = torch.diag(r, 0)
239
+ q *= d.sign()
240
+ return q.t()
241
+
242
+
243
+ def exists(val):
244
+ return val is not None
245
+
246
+
247
+ def empty(tensor):
248
+ return tensor.numel() == 0
249
+
250
+
251
+ def default(val, d):
252
+ return val if exists(val) else d
253
+
254
+
255
+ def cast_tuple(val):
256
+ return (val,) if not isinstance(val, tuple) else val
257
+
258
+
259
+ class PCmer(nn.Module):
260
+ """The encoder that is used in the Transformer model."""
261
+
262
+ def __init__(
263
+ self,
264
+ num_layers,
265
+ num_heads,
266
+ dim_model,
267
+ dim_keys,
268
+ dim_values,
269
+ residual_dropout,
270
+ attention_dropout,
271
+ ):
272
+ super().__init__()
273
+ self.num_layers = num_layers
274
+ self.num_heads = num_heads
275
+ self.dim_model = dim_model
276
+ self.dim_values = dim_values
277
+ self.dim_keys = dim_keys
278
+ self.residual_dropout = residual_dropout
279
+ self.attention_dropout = attention_dropout
280
+
281
+ self._layers = nn.ModuleList([_EncoderLayer(self) for _ in range(num_layers)])
282
+
283
+ # METHODS ########################################################################################################
284
+
285
+ def forward(self, phone, mask=None):
286
+
287
+ # apply all layers to the input
288
+ for i, layer in enumerate(self._layers):
289
+ phone = layer(phone, mask)
290
+ # provide the final sequence
291
+ return phone
292
+
293
+
294
+ # ==================================================================================================================== #
295
+ # CLASS _ E N C O D E R L A Y E R #
296
+ # ==================================================================================================================== #
297
+
298
+
299
+ class _EncoderLayer(nn.Module):
300
+ """One layer of the encoder.
301
+
302
+ Attributes:
303
+ attn: (:class:`mha.MultiHeadAttention`): The attention mechanism that is used to read the input sequence.
304
+ feed_forward (:class:`ffl.FeedForwardLayer`): The feed-forward layer on top of the attention mechanism.
305
+ """
306
+
307
+ def __init__(self, parent: PCmer):
308
+ """Creates a new instance of ``_EncoderLayer``.
309
+
310
+ Args:
311
+ parent (Encoder): The encoder that the layers is created for.
312
+ """
313
+ super().__init__()
314
+
315
+ self.conformer = ConformerConvModule(parent.dim_model)
316
+ self.norm = nn.LayerNorm(parent.dim_model)
317
+ self.dropout = nn.Dropout(parent.residual_dropout)
318
+
319
+ # selfatt -> fastatt: performer!
320
+ self.attn = SelfAttention(
321
+ dim=parent.dim_model, heads=parent.num_heads, causal=False
322
+ )
323
+
324
+ # METHODS ########################################################################################################
325
+
326
+ def forward(self, phone, mask=None):
327
+
328
+ # compute attention sub-layer
329
+ phone = phone + (self.attn(self.norm(phone), mask=mask))
330
+
331
+ phone = phone + (self.conformer(phone))
332
+
333
+ return phone
334
+
335
+
336
+ def calc_same_padding(kernel_size):
337
+ pad = kernel_size // 2
338
+ return (pad, pad - (kernel_size + 1) % 2)
339
+
340
+
341
+ # helper classes
342
+
343
+
344
+ class Swish(nn.Module):
345
+ def forward(self, x):
346
+ return x * x.sigmoid()
347
+
348
+
349
+ class Transpose(nn.Module):
350
+ def __init__(self, dims):
351
+ super().__init__()
352
+ assert len(dims) == 2, "dims must be a tuple of two dimensions"
353
+ self.dims = dims
354
+
355
+ def forward(self, x):
356
+ return x.transpose(*self.dims)
357
+
358
+
359
+ class GLU(nn.Module):
360
+ def __init__(self, dim):
361
+ super().__init__()
362
+ self.dim = dim
363
+
364
+ def forward(self, x):
365
+ out, gate = x.chunk(2, dim=self.dim)
366
+ return out * gate.sigmoid()
367
+
368
+
369
+ class DepthWiseConv1d(nn.Module):
370
+ def __init__(self, chan_in, chan_out, kernel_size, padding):
371
+ super().__init__()
372
+ self.padding = padding
373
+ self.conv = nn.Conv1d(chan_in, chan_out, kernel_size, groups=chan_in)
374
+
375
+ def forward(self, x):
376
+ x = F.pad(x, self.padding)
377
+ return self.conv(x)
378
+
379
+
380
+ class ConformerConvModule(nn.Module):
381
+ def __init__(
382
+ self, dim, causal=False, expansion_factor=2, kernel_size=31, dropout=0.0
383
+ ):
384
+ super().__init__()
385
+
386
+ inner_dim = dim * expansion_factor
387
+ padding = calc_same_padding(kernel_size) if not causal else (kernel_size - 1, 0)
388
+
389
+ self.net = nn.Sequential(
390
+ nn.LayerNorm(dim),
391
+ Transpose((1, 2)),
392
+ nn.Conv1d(dim, inner_dim * 2, 1),
393
+ GLU(dim=1),
394
+ DepthWiseConv1d(
395
+ inner_dim, inner_dim, kernel_size=kernel_size, padding=padding
396
+ ),
397
+ # nn.BatchNorm1d(inner_dim) if not causal else nn.Identity(),
398
+ Swish(),
399
+ nn.Conv1d(inner_dim, dim, 1),
400
+ Transpose((1, 2)),
401
+ nn.Dropout(dropout),
402
+ )
403
+
404
+ def forward(self, x):
405
+ return self.net(x)
406
+
407
+
408
+ def linear_attention(q, k, v):
409
+ if v is None:
410
+ out = torch.einsum("...ed,...nd->...ne", k, q)
411
+ return out
412
+
413
+ else:
414
+ k_cumsum = k.sum(dim=-2)
415
+ # k_cumsum = k.sum(dim = -2)
416
+ D_inv = 1.0 / (torch.einsum("...nd,...d->...n", q, k_cumsum.type_as(q)) + 1e-8)
417
+
418
+ context = torch.einsum("...nd,...ne->...de", k, v)
419
+ out = torch.einsum("...de,...nd,...n->...ne", context, q, D_inv)
420
+ return out
421
+
422
+
423
+ def gaussian_orthogonal_random_matrix(
424
+ nb_rows, nb_columns, scaling=0, qr_uniform_q=False, device=None
425
+ ):
426
+ nb_full_blocks = int(nb_rows / nb_columns)
427
+ block_list = []
428
+
429
+ for _ in range(nb_full_blocks):
430
+ q = orthogonal_matrix_chunk(
431
+ nb_columns, qr_uniform_q=qr_uniform_q, device=device
432
+ )
433
+ block_list.append(q)
434
+
435
+ remaining_rows = nb_rows - nb_full_blocks * nb_columns
436
+ if remaining_rows > 0:
437
+ q = orthogonal_matrix_chunk(
438
+ nb_columns, qr_uniform_q=qr_uniform_q, device=device
439
+ )
440
+
441
+ block_list.append(q[:remaining_rows])
442
+
443
+ final_matrix = torch.cat(block_list)
444
+
445
+ if scaling == 0:
446
+ multiplier = torch.randn((nb_rows, nb_columns), device=device).norm(dim=1)
447
+ elif scaling == 1:
448
+ multiplier = math.sqrt((float(nb_columns))) * torch.ones(
449
+ (nb_rows,), device=device
450
+ )
451
+ else:
452
+ raise ValueError(f"Invalid scaling {scaling}")
453
+
454
+ return torch.diag(multiplier) @ final_matrix
455
+
456
+
457
+ class FastAttention(nn.Module):
458
+ def __init__(
459
+ self,
460
+ dim_heads,
461
+ nb_features=None,
462
+ ortho_scaling=0,
463
+ causal=False,
464
+ generalized_attention=False,
465
+ kernel_fn=nn.ReLU(),
466
+ qr_uniform_q=False,
467
+ no_projection=False,
468
+ ):
469
+ super().__init__()
470
+ nb_features = default(nb_features, int(dim_heads * math.log(dim_heads)))
471
+
472
+ self.dim_heads = dim_heads
473
+ self.nb_features = nb_features
474
+ self.ortho_scaling = ortho_scaling
475
+
476
+ self.create_projection = partial(
477
+ gaussian_orthogonal_random_matrix,
478
+ nb_rows=self.nb_features,
479
+ nb_columns=dim_heads,
480
+ scaling=ortho_scaling,
481
+ qr_uniform_q=qr_uniform_q,
482
+ )
483
+ projection_matrix = self.create_projection()
484
+ self.register_buffer("projection_matrix", projection_matrix)
485
+
486
+ self.generalized_attention = generalized_attention
487
+ self.kernel_fn = kernel_fn
488
+
489
+ # if this is turned on, no projection will be used
490
+ # queries and keys will be softmax-ed as in the original efficient attention paper
491
+ self.no_projection = no_projection
492
+
493
+ self.causal = causal
494
+
495
+ @torch.no_grad()
496
+ def redraw_projection_matrix(self):
497
+ projections = self.create_projection()
498
+ self.projection_matrix.copy_(projections)
499
+ del projections
500
+
501
+ def forward(self, q, k, v):
502
+ device = q.device
503
+
504
+ if self.no_projection:
505
+ q = q.softmax(dim=-1)
506
+ k = torch.exp(k) if self.causal else k.softmax(dim=-2)
507
+ else:
508
+ create_kernel = partial(
509
+ softmax_kernel, projection_matrix=self.projection_matrix, device=device
510
+ )
511
+
512
+ q = create_kernel(q, is_query=True)
513
+ k = create_kernel(k, is_query=False)
514
+
515
+ attn_fn = linear_attention if not self.causal else self.causal_linear_fn
516
+ if v is None:
517
+ out = attn_fn(q, k, None)
518
+ return out
519
+ else:
520
+ out = attn_fn(q, k, v)
521
+ return out
522
+
523
+
524
+ class SelfAttention(nn.Module):
525
+ def __init__(
526
+ self,
527
+ dim,
528
+ causal=False,
529
+ heads=8,
530
+ dim_head=64,
531
+ local_heads=0,
532
+ local_window_size=256,
533
+ nb_features=None,
534
+ feature_redraw_interval=1000,
535
+ generalized_attention=False,
536
+ kernel_fn=nn.ReLU(),
537
+ qr_uniform_q=False,
538
+ dropout=0.0,
539
+ no_projection=False,
540
+ ):
541
+ super().__init__()
542
+ assert dim % heads == 0, "dimension must be divisible by number of heads"
543
+ dim_head = default(dim_head, dim // heads)
544
+ inner_dim = dim_head * heads
545
+ self.fast_attention = FastAttention(
546
+ dim_head,
547
+ nb_features,
548
+ causal=causal,
549
+ generalized_attention=generalized_attention,
550
+ kernel_fn=kernel_fn,
551
+ qr_uniform_q=qr_uniform_q,
552
+ no_projection=no_projection,
553
+ )
554
+
555
+ self.heads = heads
556
+ self.global_heads = heads - local_heads
557
+ self.local_attn = (
558
+ LocalAttention(
559
+ window_size=local_window_size,
560
+ causal=causal,
561
+ autopad=True,
562
+ dropout=dropout,
563
+ look_forward=int(not causal),
564
+ rel_pos_emb_config=(dim_head, local_heads),
565
+ )
566
+ if local_heads > 0
567
+ else None
568
+ )
569
+
570
+ self.to_q = nn.Linear(dim, inner_dim)
571
+ self.to_k = nn.Linear(dim, inner_dim)
572
+ self.to_v = nn.Linear(dim, inner_dim)
573
+ self.to_out = nn.Linear(inner_dim, dim)
574
+ self.dropout = nn.Dropout(dropout)
575
+
576
+ @torch.no_grad()
577
+ def redraw_projection_matrix(self):
578
+ self.fast_attention.redraw_projection_matrix()
579
+
580
+ def forward(
581
+ self,
582
+ x,
583
+ context=None,
584
+ mask=None,
585
+ context_mask=None,
586
+ name=None,
587
+ inference=False,
588
+ **kwargs,
589
+ ):
590
+ _, _, _, h, gh = *x.shape, self.heads, self.global_heads
591
+
592
+ cross_attend = exists(context)
593
+
594
+ context = default(context, x)
595
+ context_mask = default(context_mask, mask) if not cross_attend else context_mask
596
+ q, k, v = self.to_q(x), self.to_k(context), self.to_v(context)
597
+
598
+ q, k, v = map(lambda t: rearrange(t, "b n (h d) -> b h n d", h=h), (q, k, v))
599
+ (q, lq), (k, lk), (v, lv) = map(lambda t: (t[:, :gh], t[:, gh:]), (q, k, v))
600
+
601
+ attn_outs = []
602
+ if not empty(q):
603
+ if exists(context_mask):
604
+ global_mask = context_mask[:, None, :, None]
605
+ v.masked_fill_(~global_mask, 0.0)
606
+ if cross_attend:
607
+ pass
608
+ else:
609
+ out = self.fast_attention(q, k, v)
610
+ attn_outs.append(out)
611
+
612
+ if not empty(lq):
613
+ assert (
614
+ not cross_attend
615
+ ), "local attention is not compatible with cross attention"
616
+ out = self.local_attn(lq, lk, lv, input_mask=mask)
617
+ attn_outs.append(out)
618
+
619
+ out = torch.cat(attn_outs, dim=1)
620
+ out = rearrange(out, "b h n d -> b n (h d)")
621
+ out = self.to_out(out)
622
+ return self.dropout(out)
623
+
624
+
625
+ def l2_regularization(model, l2_alpha):
626
+ l2_loss = []
627
+ for module in model.modules():
628
+ if type(module) is nn.Conv2d:
629
+ l2_loss.append((module.weight**2).sum() / 2.0)
630
+ return l2_alpha * sum(l2_loss)
631
+
632
+
633
+ class FCPE(nn.Module):
634
+ def __init__(
635
+ self,
636
+ input_channel=128,
637
+ out_dims=360,
638
+ n_layers=12,
639
+ n_chans=512,
640
+ use_siren=False,
641
+ use_full=False,
642
+ loss_mse_scale=10,
643
+ loss_l2_regularization=False,
644
+ loss_l2_regularization_scale=1,
645
+ loss_grad1_mse=False,
646
+ loss_grad1_mse_scale=1,
647
+ f0_max=1975.5,
648
+ f0_min=32.70,
649
+ confidence=False,
650
+ threshold=0.05,
651
+ use_input_conv=True,
652
+ ):
653
+ super().__init__()
654
+ if use_siren is True:
655
+ raise ValueError("Siren is not supported yet.")
656
+ if use_full is True:
657
+ raise ValueError("Full model is not supported yet.")
658
+
659
+ self.loss_mse_scale = loss_mse_scale if (loss_mse_scale is not None) else 10
660
+ self.loss_l2_regularization = (
661
+ loss_l2_regularization if (loss_l2_regularization is not None) else False
662
+ )
663
+ self.loss_l2_regularization_scale = (
664
+ loss_l2_regularization_scale
665
+ if (loss_l2_regularization_scale is not None)
666
+ else 1
667
+ )
668
+ self.loss_grad1_mse = loss_grad1_mse if (loss_grad1_mse is not None) else False
669
+ self.loss_grad1_mse_scale = (
670
+ loss_grad1_mse_scale if (loss_grad1_mse_scale is not None) else 1
671
+ )
672
+ self.f0_max = f0_max if (f0_max is not None) else 1975.5
673
+ self.f0_min = f0_min if (f0_min is not None) else 32.70
674
+ self.confidence = confidence if (confidence is not None) else False
675
+ self.threshold = threshold if (threshold is not None) else 0.05
676
+ self.use_input_conv = use_input_conv if (use_input_conv is not None) else True
677
+
678
+ self.cent_table_b = torch.Tensor(
679
+ np.linspace(
680
+ self.f0_to_cent(torch.Tensor([f0_min]))[0],
681
+ self.f0_to_cent(torch.Tensor([f0_max]))[0],
682
+ out_dims,
683
+ )
684
+ )
685
+ self.register_buffer("cent_table", self.cent_table_b)
686
+
687
+ # conv in stack
688
+ _leaky = nn.LeakyReLU()
689
+ self.stack = nn.Sequential(
690
+ nn.Conv1d(input_channel, n_chans, 3, 1, 1),
691
+ nn.GroupNorm(4, n_chans),
692
+ _leaky,
693
+ nn.Conv1d(n_chans, n_chans, 3, 1, 1),
694
+ )
695
+
696
+ # transformer
697
+ self.decoder = PCmer(
698
+ num_layers=n_layers,
699
+ num_heads=8,
700
+ dim_model=n_chans,
701
+ dim_keys=n_chans,
702
+ dim_values=n_chans,
703
+ residual_dropout=0.1,
704
+ attention_dropout=0.1,
705
+ )
706
+ self.norm = nn.LayerNorm(n_chans)
707
+
708
+ # out
709
+ self.n_out = out_dims
710
+ self.dense_out = weight_norm(nn.Linear(n_chans, self.n_out))
711
+
712
+ def forward(
713
+ self, mel, infer=True, gt_f0=None, return_hz_f0=False, cdecoder="local_argmax"
714
+ ):
715
+ """
716
+ input:
717
+ B x n_frames x n_unit
718
+ return:
719
+ dict of B x n_frames x feat
720
+ """
721
+ if cdecoder == "argmax":
722
+ self.cdecoder = self.cents_decoder
723
+ elif cdecoder == "local_argmax":
724
+ self.cdecoder = self.cents_local_decoder
725
+ if self.use_input_conv:
726
+ x = self.stack(mel.transpose(1, 2)).transpose(1, 2)
727
+ else:
728
+ x = mel
729
+ x = self.decoder(x)
730
+ x = self.norm(x)
731
+ x = self.dense_out(x) # [B,N,D]
732
+ x = torch.sigmoid(x)
733
+ if not infer:
734
+ gt_cent_f0 = self.f0_to_cent(gt_f0) # mel f0 #[B,N,1]
735
+ gt_cent_f0 = self.gaussian_blurred_cent(gt_cent_f0) # #[B,N,out_dim]
736
+ loss_all = self.loss_mse_scale * F.binary_cross_entropy(
737
+ x, gt_cent_f0
738
+ ) # bce loss
739
+ # l2 regularization
740
+ if self.loss_l2_regularization:
741
+ loss_all = loss_all + l2_regularization(
742
+ model=self, l2_alpha=self.loss_l2_regularization_scale
743
+ )
744
+ x = loss_all
745
+ if infer:
746
+ x = self.cdecoder(x)
747
+ x = self.cent_to_f0(x)
748
+ if not return_hz_f0:
749
+ x = (1 + x / 700).log()
750
+ return x
751
+
752
+ def cents_decoder(self, y, mask=True):
753
+ B, N, _ = y.size()
754
+ ci = self.cent_table[None, None, :].expand(B, N, -1)
755
+ rtn = torch.sum(ci * y, dim=-1, keepdim=True) / torch.sum(
756
+ y, dim=-1, keepdim=True
757
+ ) # cents: [B,N,1]
758
+ if mask:
759
+ confident = torch.max(y, dim=-1, keepdim=True)[0]
760
+ confident_mask = torch.ones_like(confident)
761
+ confident_mask[confident <= self.threshold] = float("-INF")
762
+ rtn = rtn * confident_mask
763
+ if self.confidence:
764
+ return rtn, confident
765
+ else:
766
+ return rtn
767
+
768
+ def cents_local_decoder(self, y, mask=True):
769
+ B, N, _ = y.size()
770
+ ci = self.cent_table[None, None, :].expand(B, N, -1)
771
+ confident, max_index = torch.max(y, dim=-1, keepdim=True)
772
+ local_argmax_index = torch.arange(0, 9).to(max_index.device) + (max_index - 4)
773
+ local_argmax_index[local_argmax_index < 0] = 0
774
+ local_argmax_index[local_argmax_index >= self.n_out] = self.n_out - 1
775
+ ci_l = torch.gather(ci, -1, local_argmax_index)
776
+ y_l = torch.gather(y, -1, local_argmax_index)
777
+ rtn = torch.sum(ci_l * y_l, dim=-1, keepdim=True) / torch.sum(
778
+ y_l, dim=-1, keepdim=True
779
+ ) # cents: [B,N,1]
780
+ if mask:
781
+ confident_mask = torch.ones_like(confident)
782
+ confident_mask[confident <= self.threshold] = float("-INF")
783
+ rtn = rtn * confident_mask
784
+ if self.confidence:
785
+ return rtn, confident
786
+ else:
787
+ return rtn
788
+
789
+ def cent_to_f0(self, cent):
790
+ return 10.0 * 2 ** (cent / 1200.0)
791
+
792
+ def f0_to_cent(self, f0):
793
+ return 1200.0 * torch.log2(f0 / 10.0)
794
+
795
+ def gaussian_blurred_cent(self, cents): # cents: [B,N,1]
796
+ mask = (cents > 0.1) & (cents < (1200.0 * np.log2(self.f0_max / 10.0)))
797
+ B, N, _ = cents.size()
798
+ ci = self.cent_table[None, None, :].expand(B, N, -1)
799
+ return torch.exp(-torch.square(ci - cents) / 1250) * mask.float()
800
+
801
+
802
+ class FCPEInfer:
803
+ def __init__(self, model_path, device=None, dtype=torch.float32):
804
+ if device is None:
805
+ device = "cuda" if torch.cuda.is_available() else "cpu"
806
+ self.device = device
807
+ ckpt = torch.load(model_path, map_location=torch.device(self.device))
808
+ self.args = DotDict(ckpt["config"])
809
+ self.dtype = dtype
810
+ model = FCPE(
811
+ input_channel=self.args.model.input_channel,
812
+ out_dims=self.args.model.out_dims,
813
+ n_layers=self.args.model.n_layers,
814
+ n_chans=self.args.model.n_chans,
815
+ use_siren=self.args.model.use_siren,
816
+ use_full=self.args.model.use_full,
817
+ loss_mse_scale=self.args.loss.loss_mse_scale,
818
+ loss_l2_regularization=self.args.loss.loss_l2_regularization,
819
+ loss_l2_regularization_scale=self.args.loss.loss_l2_regularization_scale,
820
+ loss_grad1_mse=self.args.loss.loss_grad1_mse,
821
+ loss_grad1_mse_scale=self.args.loss.loss_grad1_mse_scale,
822
+ f0_max=self.args.model.f0_max,
823
+ f0_min=self.args.model.f0_min,
824
+ confidence=self.args.model.confidence,
825
+ )
826
+ model.to(self.device).to(self.dtype)
827
+ model.load_state_dict(ckpt["model"])
828
+ model.eval()
829
+ self.model = model
830
+ self.wav2mel = Wav2Mel(self.args, dtype=self.dtype, device=self.device)
831
+
832
+ @torch.no_grad()
833
+ def __call__(self, audio, sr, threshold=0.05):
834
+ self.model.threshold = threshold
835
+ audio = audio[None, :]
836
+ mel = self.wav2mel(audio=audio, sample_rate=sr).to(self.dtype)
837
+ f0 = self.model(mel=mel, infer=True, return_hz_f0=True)
838
+ return f0
839
+
840
+
841
+ class Wav2Mel:
842
+
843
+ def __init__(self, args, device=None, dtype=torch.float32):
844
+ # self.args = args
845
+ self.sampling_rate = args.mel.sampling_rate
846
+ self.hop_size = args.mel.hop_size
847
+ if device is None:
848
+ device = "cuda" if torch.cuda.is_available() else "cpu"
849
+ self.device = device
850
+ self.dtype = dtype
851
+ self.stft = STFT(
852
+ args.mel.sampling_rate,
853
+ args.mel.num_mels,
854
+ args.mel.n_fft,
855
+ args.mel.win_size,
856
+ args.mel.hop_size,
857
+ args.mel.fmin,
858
+ args.mel.fmax,
859
+ )
860
+ self.resample_kernel = {}
861
+
862
+ def extract_nvstft(self, audio, keyshift=0, train=False):
863
+ mel = self.stft.get_mel(audio, keyshift=keyshift, train=train).transpose(
864
+ 1, 2
865
+ ) # B, n_frames, bins
866
+ return mel
867
+
868
+ def extract_mel(self, audio, sample_rate, keyshift=0, train=False):
869
+ audio = audio.to(self.dtype).to(self.device)
870
+ # resample
871
+ if sample_rate == self.sampling_rate:
872
+ audio_res = audio
873
+ else:
874
+ key_str = str(sample_rate)
875
+ if key_str not in self.resample_kernel:
876
+ self.resample_kernel[key_str] = Resample(
877
+ sample_rate, self.sampling_rate, lowpass_filter_width=128
878
+ )
879
+ self.resample_kernel[key_str] = (
880
+ self.resample_kernel[key_str].to(self.dtype).to(self.device)
881
+ )
882
+ audio_res = self.resample_kernel[key_str](audio)
883
+
884
+ # extract
885
+ mel = self.extract_nvstft(
886
+ audio_res, keyshift=keyshift, train=train
887
+ ) # B, n_frames, bins
888
+ n_frames = int(audio.shape[1] // self.hop_size) + 1
889
+ if n_frames > int(mel.shape[1]):
890
+ mel = torch.cat((mel, mel[:, -1:, :]), 1)
891
+ if n_frames < int(mel.shape[1]):
892
+ mel = mel[:, :n_frames, :]
893
+ return mel
894
+
895
+ def __call__(self, audio, sample_rate, keyshift=0, train=False):
896
+ return self.extract_mel(audio, sample_rate, keyshift=keyshift, train=train)
897
+
898
+
899
+ class DotDict(dict):
900
+ def __getattr__(*args):
901
+ val = dict.get(*args)
902
+ return DotDict(val) if type(val) is dict else val
903
+
904
+ __setattr__ = dict.__setitem__
905
+ __delattr__ = dict.__delitem__
906
+
907
+
908
+ class F0Predictor(object):
909
+ def compute_f0(self, wav, p_len):
910
+ """
911
+ input: wav:[signal_length]
912
+ p_len:int
913
+ output: f0:[signal_length//hop_length]
914
+ """
915
+ pass
916
+
917
+ def compute_f0_uv(self, wav, p_len):
918
+ """
919
+ input: wav:[signal_length]
920
+ p_len:int
921
+ output: f0:[signal_length//hop_length],uv:[signal_length//hop_length]
922
+ """
923
+ pass
924
+
925
+
926
+ class FCPEF0Predictor(F0Predictor):
927
+ def __init__(
928
+ self,
929
+ model_path,
930
+ hop_length=512,
931
+ f0_min=50,
932
+ f0_max=1100,
933
+ dtype=torch.float32,
934
+ device=None,
935
+ sampling_rate=44100,
936
+ threshold=0.05,
937
+ ):
938
+ self.fcpe = FCPEInfer(model_path, device=device, dtype=dtype)
939
+ self.hop_length = hop_length
940
+ self.f0_min = f0_min
941
+ self.f0_max = f0_max
942
+ if device is None:
943
+ self.device = "cuda" if torch.cuda.is_available() else "cpu"
944
+ else:
945
+ self.device = device
946
+ self.threshold = threshold
947
+ self.sampling_rate = sampling_rate
948
+ self.dtype = dtype
949
+ self.name = "fcpe"
950
+
951
+ def repeat_expand(
952
+ self,
953
+ content: Union[torch.Tensor, np.ndarray],
954
+ target_len: int,
955
+ mode: str = "nearest",
956
+ ):
957
+ ndim = content.ndim
958
+
959
+ if content.ndim == 1:
960
+ content = content[None, None]
961
+ elif content.ndim == 2:
962
+ content = content[None]
963
+
964
+ assert content.ndim == 3
965
+
966
+ is_np = isinstance(content, np.ndarray)
967
+ if is_np:
968
+ content = torch.from_numpy(content)
969
+
970
+ results = torch.nn.functional.interpolate(content, size=target_len, mode=mode)
971
+
972
+ if is_np:
973
+ results = results.numpy()
974
+
975
+ if ndim == 1:
976
+ return results[0, 0]
977
+ elif ndim == 2:
978
+ return results[0]
979
+
980
+ def post_process(self, x, sampling_rate, f0, pad_to):
981
+ if isinstance(f0, np.ndarray):
982
+ f0 = torch.from_numpy(f0).float().to(x.device)
983
+
984
+ if pad_to is None:
985
+ return f0
986
+
987
+ f0 = self.repeat_expand(f0, pad_to)
988
+
989
+ vuv_vector = torch.zeros_like(f0)
990
+ vuv_vector[f0 > 0.0] = 1.0
991
+ vuv_vector[f0 <= 0.0] = 0.0
992
+
993
+ # Remove the zero-frequency frames, then interpolate linearly
994
+ nzindex = torch.nonzero(f0).squeeze()
995
+ f0 = torch.index_select(f0, dim=0, index=nzindex).cpu().numpy()
996
+ time_org = self.hop_length / sampling_rate * nzindex.cpu().numpy()
997
+ time_frame = np.arange(pad_to) * self.hop_length / sampling_rate
998
+
999
+ vuv_vector = F.interpolate(vuv_vector[None, None, :], size=pad_to)[0][0]
1000
+
1001
+ if f0.shape[0] <= 0:
1002
+ return (
1003
+ torch.zeros(pad_to, dtype=torch.float, device=x.device).cpu().numpy(),
1004
+ vuv_vector.cpu().numpy(),
1005
+ )
1006
+ if f0.shape[0] == 1:
1007
+ return (
1008
+ torch.ones(pad_to, dtype=torch.float, device=x.device) * f0[0]
1009
+ ).cpu().numpy(), vuv_vector.cpu().numpy()
1010
+
1011
+ # This could probably be rewritten with torch?
1012
+ f0 = np.interp(time_frame, time_org, f0, left=f0[0], right=f0[-1])
1013
+ # vuv_vector = np.ceil(scipy.ndimage.zoom(vuv_vector,pad_to/len(vuv_vector),order = 0))
1014
+
1015
+ return f0, vuv_vector.cpu().numpy()
1016
+
1017
+ def compute_f0(self, wav, p_len=None):
1018
+ x = torch.FloatTensor(wav).to(self.dtype).to(self.device)
1019
+ if p_len is None:
1020
+ print("fcpe p_len is None")
1021
+ p_len = x.shape[0] // self.hop_length
1022
+ f0 = self.fcpe(x, sr=self.sampling_rate, threshold=self.threshold)[0, :, 0]
1023
+ if torch.all(f0 == 0):
1024
+ rtn = f0.cpu().numpy() if p_len is None else np.zeros(p_len)
1025
+ return rtn, rtn
1026
+ return self.post_process(x, self.sampling_rate, f0, p_len)[0]
1027
+
1028
+ def compute_f0_uv(self, wav, p_len=None):
1029
+ x = torch.FloatTensor(wav).to(self.dtype).to(self.device)
1030
+ if p_len is None:
1031
+ p_len = x.shape[0] // self.hop_length
1032
+ f0 = self.fcpe(x, sr=self.sampling_rate, threshold=self.threshold)[0, :, 0]
1033
+ if torch.all(f0 == 0):
1034
+ rtn = f0.cpu().numpy() if p_len is None else np.zeros(p_len)
1035
+ return rtn, rtn
1036
+ return self.post_process(x, self.sampling_rate, f0, p_len)
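
Note on the FCPE predictor above: pitch is carried on a cent scale anchored at 10 Hz, so cent_to_f0 and f0_to_cent are exact inverses. A minimal, self-contained check (standard library only; the function names simply mirror the methods above):

import math

def f0_to_cent(f0_hz):
    return 1200.0 * math.log2(f0_hz / 10.0)   # 1200 cents per octave, 10 Hz reference

def cent_to_f0(cent):
    return 10.0 * 2 ** (cent / 1200.0)

assert abs(cent_to_f0(f0_to_cent(440.0)) - 440.0) < 1e-9   # round trip recovers A4
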
rvc/lib/infer_pack/models.py CHANGED
@@ -178,7 +178,7 @@ class ResidualCouplingBlock(nn.Module):
178
  for i in range(self.n_flows):
179
  for hook in self.flows[i * 2]._forward_pre_hooks.values():
180
  if (
181
- hook.__module__ == "torch.nn.utils.weight_norm"
182
  and hook.__class__.__name__ == "WeightNorm"
183
  ):
184
  torch.nn.utils.remove_weight_norm(self.flows[i * 2])
@@ -235,7 +235,7 @@ class PosteriorEncoder(nn.Module):
235
  def __prepare_scriptable__(self):
236
  for hook in self.enc._forward_pre_hooks.values():
237
  if (
238
- hook.__module__ == "torch.nn.utils.weight_norm"
239
  and hook.__class__.__name__ == "WeightNorm"
240
  ):
241
  torch.nn.utils.remove_weight_norm(self.enc)
@@ -319,7 +319,7 @@ class Generator(torch.nn.Module):
319
  # because of shadowing, so we check the module name directly.
320
  # https://github.com/pytorch/pytorch/blob/be0ca00c5ce260eb5bcec3237357f7a30cc08983/torch/nn/utils/__init__.py#L3
321
  if (
322
- hook.__module__ == "torch.nn.utils.weight_norm"
323
  and hook.__class__.__name__ == "WeightNorm"
324
  ):
325
  torch.nn.utils.remove_weight_norm(l)
@@ -327,7 +327,7 @@ class Generator(torch.nn.Module):
327
  for l in self.resblocks:
328
  for hook in l._forward_pre_hooks.values():
329
  if (
330
- hook.__module__ == "torch.nn.utils.weight_norm"
331
  and hook.__class__.__name__ == "WeightNorm"
332
  ):
333
  torch.nn.utils.remove_weight_norm(l)
@@ -610,14 +610,14 @@ class GeneratorNSF(torch.nn.Module):
610
  # because of shadowing, so we check the module name directly.
611
  # https://github.com/pytorch/pytorch/blob/be0ca00c5ce260eb5bcec3237357f7a30cc08983/torch/nn/utils/__init__.py#L3
612
  if (
613
- hook.__module__ == "torch.nn.utils.weight_norm"
614
  and hook.__class__.__name__ == "WeightNorm"
615
  ):
616
  torch.nn.utils.remove_weight_norm(l)
617
  for l in self.resblocks:
618
  for hook in self.resblocks._forward_pre_hooks.values():
619
  if (
620
- hook.__module__ == "torch.nn.utils.weight_norm"
621
  and hook.__class__.__name__ == "WeightNorm"
622
  ):
623
  torch.nn.utils.remove_weight_norm(l)
@@ -722,20 +722,20 @@ class SynthesizerTrnMs256NSFsid(nn.Module):
722
  # because of shadowing, so we check the module name directly.
723
  # https://github.com/pytorch/pytorch/blob/be0ca00c5ce260eb5bcec3237357f7a30cc08983/torch/nn/utils/__init__.py#L3
724
  if (
725
- hook.__module__ == "torch.nn.utils.weight_norm"
726
  and hook.__class__.__name__ == "WeightNorm"
727
  ):
728
  torch.nn.utils.remove_weight_norm(self.dec)
729
  for hook in self.flow._forward_pre_hooks.values():
730
  if (
731
- hook.__module__ == "torch.nn.utils.weight_norm"
732
  and hook.__class__.__name__ == "WeightNorm"
733
  ):
734
  torch.nn.utils.remove_weight_norm(self.flow)
735
  if hasattr(self, "enc_q"):
736
  for hook in self.enc_q._forward_pre_hooks.values():
737
  if (
738
- hook.__module__ == "torch.nn.utils.weight_norm"
739
  and hook.__class__.__name__ == "WeightNorm"
740
  ):
741
  torch.nn.utils.remove_weight_norm(self.enc_q)
@@ -881,20 +881,20 @@ class SynthesizerTrnMs768NSFsid(nn.Module):
881
  # because of shadowing, so we check the module name directly.
882
  # https://github.com/pytorch/pytorch/blob/be0ca00c5ce260eb5bcec3237357f7a30cc08983/torch/nn/utils/__init__.py#L3
883
  if (
884
- hook.__module__ == "torch.nn.utils.weight_norm"
885
  and hook.__class__.__name__ == "WeightNorm"
886
  ):
887
  torch.nn.utils.remove_weight_norm(self.dec)
888
  for hook in self.flow._forward_pre_hooks.values():
889
  if (
890
- hook.__module__ == "torch.nn.utils.weight_norm"
891
  and hook.__class__.__name__ == "WeightNorm"
892
  ):
893
  torch.nn.utils.remove_weight_norm(self.flow)
894
  if hasattr(self, "enc_q"):
895
  for hook in self.enc_q._forward_pre_hooks.values():
896
  if (
897
- hook.__module__ == "torch.nn.utils.weight_norm"
898
  and hook.__class__.__name__ == "WeightNorm"
899
  ):
900
  torch.nn.utils.remove_weight_norm(self.enc_q)
@@ -1029,20 +1029,20 @@ class SynthesizerTrnMs256NSFsid_nono(nn.Module):
1029
  # because of shadowing, so we check the module name directly.
1030
  # https://github.com/pytorch/pytorch/blob/be0ca00c5ce260eb5bcec3237357f7a30cc08983/torch/nn/utils/__init__.py#L3
1031
  if (
1032
- hook.__module__ == "torch.nn.utils.weight_norm"
1033
  and hook.__class__.__name__ == "WeightNorm"
1034
  ):
1035
  torch.nn.utils.remove_weight_norm(self.dec)
1036
  for hook in self.flow._forward_pre_hooks.values():
1037
  if (
1038
- hook.__module__ == "torch.nn.utils.weight_norm"
1039
  and hook.__class__.__name__ == "WeightNorm"
1040
  ):
1041
  torch.nn.utils.remove_weight_norm(self.flow)
1042
  if hasattr(self, "enc_q"):
1043
  for hook in self.enc_q._forward_pre_hooks.values():
1044
  if (
1045
- hook.__module__ == "torch.nn.utils.weight_norm"
1046
  and hook.__class__.__name__ == "WeightNorm"
1047
  ):
1048
  torch.nn.utils.remove_weight_norm(self.enc_q)
@@ -1168,20 +1168,20 @@ class SynthesizerTrnMs768NSFsid_nono(nn.Module):
1168
  # because of shadowing, so we check the module name directly.
1169
  # https://github.com/pytorch/pytorch/blob/be0ca00c5ce260eb5bcec3237357f7a30cc08983/torch/nn/utils/__init__.py#L3
1170
  if (
1171
- hook.__module__ == "torch.nn.utils.weight_norm"
1172
  and hook.__class__.__name__ == "WeightNorm"
1173
  ):
1174
  torch.nn.utils.remove_weight_norm(self.dec)
1175
  for hook in self.flow._forward_pre_hooks.values():
1176
  if (
1177
- hook.__module__ == "torch.nn.utils.weight_norm"
1178
  and hook.__class__.__name__ == "WeightNorm"
1179
  ):
1180
  torch.nn.utils.remove_weight_norm(self.flow)
1181
  if hasattr(self, "enc_q"):
1182
  for hook in self.enc_q._forward_pre_hooks.values():
1183
  if (
1184
- hook.__module__ == "torch.nn.utils.weight_norm"
1185
  and hook.__class__.__name__ == "WeightNorm"
1186
  ):
1187
  torch.nn.utils.remove_weight_norm(self.enc_q)
 
178
  for i in range(self.n_flows):
179
  for hook in self.flows[i * 2]._forward_pre_hooks.values():
180
  if (
181
+ hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
182
  and hook.__class__.__name__ == "WeightNorm"
183
  ):
184
  torch.nn.utils.remove_weight_norm(self.flows[i * 2])
 
235
  def __prepare_scriptable__(self):
236
  for hook in self.enc._forward_pre_hooks.values():
237
  if (
238
+ hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
239
  and hook.__class__.__name__ == "WeightNorm"
240
  ):
241
  torch.nn.utils.remove_weight_norm(self.enc)
 
319
  # because of shadowing, so we check the module name directly.
320
  # https://github.com/pytorch/pytorch/blob/be0ca00c5ce260eb5bcec3237357f7a30cc08983/torch/nn/utils/__init__.py#L3
321
  if (
322
+ hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
323
  and hook.__class__.__name__ == "WeightNorm"
324
  ):
325
  torch.nn.utils.remove_weight_norm(l)
 
327
  for l in self.resblocks:
328
  for hook in l._forward_pre_hooks.values():
329
  if (
330
+ hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
331
  and hook.__class__.__name__ == "WeightNorm"
332
  ):
333
  torch.nn.utils.remove_weight_norm(l)
 
610
  # because of shadowing, so we check the module name directly.
611
  # https://github.com/pytorch/pytorch/blob/be0ca00c5ce260eb5bcec3237357f7a30cc08983/torch/nn/utils/__init__.py#L3
612
  if (
613
+ hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
614
  and hook.__class__.__name__ == "WeightNorm"
615
  ):
616
  torch.nn.utils.remove_weight_norm(l)
617
  for l in self.resblocks:
618
  for hook in self.resblocks._forward_pre_hooks.values():
619
  if (
620
+ hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
621
  and hook.__class__.__name__ == "WeightNorm"
622
  ):
623
  torch.nn.utils.remove_weight_norm(l)
 
722
  # because of shadowing, so we check the module name directly.
723
  # https://github.com/pytorch/pytorch/blob/be0ca00c5ce260eb5bcec3237357f7a30cc08983/torch/nn/utils/__init__.py#L3
724
  if (
725
+ hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
726
  and hook.__class__.__name__ == "WeightNorm"
727
  ):
728
  torch.nn.utils.remove_weight_norm(self.dec)
729
  for hook in self.flow._forward_pre_hooks.values():
730
  if (
731
+ hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
732
  and hook.__class__.__name__ == "WeightNorm"
733
  ):
734
  torch.nn.utils.remove_weight_norm(self.flow)
735
  if hasattr(self, "enc_q"):
736
  for hook in self.enc_q._forward_pre_hooks.values():
737
  if (
738
+ hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
739
  and hook.__class__.__name__ == "WeightNorm"
740
  ):
741
  torch.nn.utils.remove_weight_norm(self.enc_q)
 
881
  # because of shadowing, so we check the module name directly.
882
  # https://github.com/pytorch/pytorch/blob/be0ca00c5ce260eb5bcec3237357f7a30cc08983/torch/nn/utils/__init__.py#L3
883
  if (
884
+ hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
885
  and hook.__class__.__name__ == "WeightNorm"
886
  ):
887
  torch.nn.utils.remove_weight_norm(self.dec)
888
  for hook in self.flow._forward_pre_hooks.values():
889
  if (
890
+ hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
891
  and hook.__class__.__name__ == "WeightNorm"
892
  ):
893
  torch.nn.utils.remove_weight_norm(self.flow)
894
  if hasattr(self, "enc_q"):
895
  for hook in self.enc_q._forward_pre_hooks.values():
896
  if (
897
+ hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
898
  and hook.__class__.__name__ == "WeightNorm"
899
  ):
900
  torch.nn.utils.remove_weight_norm(self.enc_q)
 
1029
  # because of shadowing, so we check the module name directly.
1030
  # https://github.com/pytorch/pytorch/blob/be0ca00c5ce260eb5bcec3237357f7a30cc08983/torch/nn/utils/__init__.py#L3
1031
  if (
1032
+ hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
1033
  and hook.__class__.__name__ == "WeightNorm"
1034
  ):
1035
  torch.nn.utils.remove_weight_norm(self.dec)
1036
  for hook in self.flow._forward_pre_hooks.values():
1037
  if (
1038
+ hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
1039
  and hook.__class__.__name__ == "WeightNorm"
1040
  ):
1041
  torch.nn.utils.remove_weight_norm(self.flow)
1042
  if hasattr(self, "enc_q"):
1043
  for hook in self.enc_q._forward_pre_hooks.values():
1044
  if (
1045
+ hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
1046
  and hook.__class__.__name__ == "WeightNorm"
1047
  ):
1048
  torch.nn.utils.remove_weight_norm(self.enc_q)
 
1168
  # because of shadowing, so we check the module name directly.
1169
  # https://github.com/pytorch/pytorch/blob/be0ca00c5ce260eb5bcec3237357f7a30cc08983/torch/nn/utils/__init__.py#L3
1170
  if (
1171
+ hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
1172
  and hook.__class__.__name__ == "WeightNorm"
1173
  ):
1174
  torch.nn.utils.remove_weight_norm(self.dec)
1175
  for hook in self.flow._forward_pre_hooks.values():
1176
  if (
1177
+ hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
1178
  and hook.__class__.__name__ == "WeightNorm"
1179
  ):
1180
  torch.nn.utils.remove_weight_norm(self.flow)
1181
  if hasattr(self, "enc_q"):
1182
  for hook in self.enc_q._forward_pre_hooks.values():
1183
  if (
1184
+ hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
1185
  and hook.__class__.__name__ == "WeightNorm"
1186
  ):
1187
  torch.nn.utils.remove_weight_norm(self.enc_q)
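
All of the models.py hunks above make the same one-line change: the scripting-time cleanup now matches hooks registered under torch.nn.utils.parametrizations.weight_norm instead of torch.nn.utils.weight_norm. A hedged sketch of the shared pattern (the module-path string is taken from the diff; which path applies depends on the installed PyTorch version):

import torch

def strip_weight_norm_hooks(module: torch.nn.Module) -> None:
    # Remove weight normalization only when the registered pre-hook is the WeightNorm hook.
    for hook in module._forward_pre_hooks.values():
        if (
            hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
            and hook.__class__.__name__ == "WeightNorm"
        ):
            torch.nn.utils.remove_weight_norm(module)
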
rvc/lib/tools/analyzer.py ADDED
@@ -0,0 +1,76 @@
1
+ import numpy as np
2
+ import matplotlib.pyplot as plt
3
+ import librosa.display
4
+ import librosa
5
+
6
+
7
+ def calculate_features(y, sr):
8
+ stft = np.abs(librosa.stft(y))
9
+ duration = librosa.get_duration(y=y, sr=sr)
10
+ cent = librosa.feature.spectral_centroid(S=stft, sr=sr)[0]
11
+ bw = librosa.feature.spectral_bandwidth(S=stft, sr=sr)[0]
12
+ rolloff = librosa.feature.spectral_rolloff(S=stft, sr=sr)[0]
13
+ return stft, duration, cent, bw, rolloff
14
+
15
+
16
+ def plot_title(title):
17
+ plt.suptitle(title, fontsize=16, fontweight="bold")
18
+
19
+
20
+ def plot_spectrogram(y, sr, stft, duration, cmap="inferno"):
21
+ plt.subplot(3, 1, 1)
22
+ plt.imshow(
23
+ librosa.amplitude_to_db(stft, ref=np.max),
24
+ origin="lower",
25
+ extent=[0, duration, 0, sr / 1000],
26
+ aspect="auto",
27
+ cmap=cmap, # Change the colormap here
28
+ )
29
+ plt.colorbar(format="%+2.0f dB")
30
+ plt.xlabel("Time (s)")
31
+ plt.ylabel("Frequency (kHz)")
32
+ plt.title("Spectrogram")
33
+
34
+
35
+ def plot_waveform(y, sr, duration):
36
+ plt.subplot(3, 1, 2)
37
+ librosa.display.waveshow(y, sr=sr)
38
+ plt.xlabel("Time (s)")
39
+ plt.ylabel("Amplitude")
40
+ plt.title("Waveform")
41
+
42
+
43
+ def plot_features(times, cent, bw, rolloff, duration):
44
+ plt.subplot(3, 1, 3)
45
+ plt.plot(times, cent, label="Spectral Centroid (kHz)", color="b")
46
+ plt.plot(times, bw, label="Spectral Bandwidth (kHz)", color="g")
47
+ plt.plot(times, rolloff, label="Spectral Rolloff (kHz)", color="r")
48
+ plt.xlabel("Time (s)")
49
+ plt.title("Spectral Features")
50
+ plt.legend()
51
+
52
+
53
+ def analyze_audio(audio_file, save_plot_path="logs/audio_analysis.png"):
54
+ y, sr = librosa.load(audio_file)
55
+ stft, duration, cent, bw, rolloff = calculate_features(y, sr)
56
+
57
+ plt.figure(figsize=(12, 10))
58
+
59
+ plot_title("Audio Analysis" + " - " + audio_file.split("/")[-1])
60
+ plot_spectrogram(y, sr, stft, duration)
61
+ plot_waveform(y, sr, duration)
62
+ plot_features(librosa.times_like(cent), cent, bw, rolloff, duration)
63
+
64
+ plt.tight_layout()
65
+
66
+ if save_plot_path:
67
+ plt.savefig(save_plot_path, bbox_inches="tight", dpi=300)
68
+ plt.close()
69
+
70
+ audio_info = f"""Sample Rate: {sr}\nDuration: {(
71
+ str(round(duration, 2)) + " seconds"
72
+ if duration < 60
73
+ else str(round(duration / 60, 2)) + " minutes"
74
+ )}\nNumber of Samples: {len(y)}\nBits per Sample: {librosa.get_samplerate(audio_file)}\nChannels: {"Mono (1)" if y.ndim == 1 else "Stereo (2)"}"""
75
+
76
+ return audio_info, save_plot_path
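
A hypothetical call into the new analyzer module; the audio path below is an illustrative value, not a file shipped with the repo:

from rvc.lib.tools.analyzer import analyze_audio

info, plot_path = analyze_audio("example_vocals.wav", save_plot_path="logs/audio_analysis.png")
print(info)       # sample rate, duration, sample count, channel layout
print(plot_path)  # PNG with the spectrogram, waveform and spectral-feature panels
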
rvc/lib/tools/gdown.py CHANGED
@@ -16,6 +16,7 @@ import requests
16
  import six
17
  import tqdm
18
 
 
19
  def indent(text, prefix):
20
  def prefixed_lines():
21
  for line in text.splitlines(True):
@@ -23,6 +24,7 @@ def indent(text, prefix):
23
 
24
  return "".join(prefixed_lines())
25
 
 
26
  class FileURLRetrievalError(Exception):
27
  pass
28
 
@@ -30,6 +32,7 @@ class FileURLRetrievalError(Exception):
30
  class FolderContentsMaximumLimitError(Exception):
31
  pass
32
 
 
33
  def parse_url(url, warning=True):
34
  """Parse URLs especially for Google Drive links.
35
 
@@ -93,11 +96,17 @@ def get_url_from_gdrive_confirmation(contents):
93
  m = re.search(r'href="/open\?id=([^"]+)"', contents)
94
  if m:
95
  url = m.groups()[0]
96
- uuid = re.search(r'<input\s+type="hidden"\s+name="uuid"\s+value="([^"]+)"', contents)
 
 
97
  uuid = uuid.groups()[0]
98
- url = "https://drive.usercontent.google.com/download?id=" + url + "&confirm=t&uuid=" + uuid
 
 
 
 
 
99
  return url
100
-
101
 
102
  m = re.search(r'"downloadUrl":"([^"]+)', contents)
103
  if m:
@@ -116,6 +125,8 @@ def get_url_from_gdrive_confirmation(contents):
116
  "You may need to change the permission to "
117
  "'Anyone with the link', or have had many accesses."
118
  )
 
 
119
  def _get_session(proxy, use_cookies, return_cookies_file=False):
120
  sess = requests.session()
121
 
@@ -211,16 +222,12 @@ def download(
211
  url_origin = url
212
  is_gdrive_download_link = True
213
 
214
-
215
-
216
  while True:
217
  res = sess.get(url, stream=True, verify=verify)
218
 
219
  if url == url_origin and res.status_code == 500:
220
  # The file could be Google Docs or Spreadsheets.
221
- url = "https://drive.google.com/open?id={id}".format(
222
- id=gdrive_file_id
223
- )
224
  continue
225
 
226
  if res.headers["Content-Type"].startswith("text/html"):
 
16
  import six
17
  import tqdm
18
 
19
+
20
  def indent(text, prefix):
21
  def prefixed_lines():
22
  for line in text.splitlines(True):
 
24
 
25
  return "".join(prefixed_lines())
26
 
27
+
28
  class FileURLRetrievalError(Exception):
29
  pass
30
 
 
32
  class FolderContentsMaximumLimitError(Exception):
33
  pass
34
 
35
+
36
  def parse_url(url, warning=True):
37
  """Parse URLs especially for Google Drive links.
38
 
 
96
  m = re.search(r'href="/open\?id=([^"]+)"', contents)
97
  if m:
98
  url = m.groups()[0]
99
+ uuid = re.search(
100
+ r'<input\s+type="hidden"\s+name="uuid"\s+value="([^"]+)"', contents
101
+ )
102
  uuid = uuid.groups()[0]
103
+ url = (
104
+ "https://drive.usercontent.google.com/download?id="
105
+ + url
106
+ + "&confirm=t&uuid="
107
+ + uuid
108
+ )
109
  return url
 
110
 
111
  m = re.search(r'"downloadUrl":"([^"]+)', contents)
112
  if m:
 
125
  "You may need to change the permission to "
126
  "'Anyone with the link', or have had many accesses."
127
  )
128
+
129
+
130
  def _get_session(proxy, use_cookies, return_cookies_file=False):
131
  sess = requests.session()
132
 
 
222
  url_origin = url
223
  is_gdrive_download_link = True
224
 
 
 
225
  while True:
226
  res = sess.get(url, stream=True, verify=verify)
227
 
228
  if url == url_origin and res.status_code == 500:
229
  # The file could be Google Docs or Spreadsheets.
230
+ url = "https://drive.google.com/open?id={id}".format(id=gdrive_file_id)
 
 
231
  continue
232
 
233
  if res.headers["Content-Type"].startswith("text/html"):
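
The gdown hunk above rebuilds the direct download URL from Google's confirmation page. A minimal sketch of the uuid extraction it relies on; `contents` stands in for the HTML of that confirmation page and is an assumption here:

import re

contents = '<input type="hidden" name="uuid" value="abc-123">'   # illustrative HTML snippet
m = re.search(r'<input\s+type="hidden"\s+name="uuid"\s+value="([^"]+)"', contents)
if m:
    uuid_token = m.group(1)   # -> "abc-123"
    url = f"https://drive.usercontent.google.com/download?id=FILE_ID&confirm=t&uuid={uuid_token}"
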
rvc/lib/tools/launch_tensorboard.py CHANGED
@@ -3,7 +3,8 @@ from tensorboard import program
3
 
4
  log_path = "logs"
5
 
6
- if __name__ == "__main__":
 
7
  tb = program.TensorBoard()
8
  tb.configure(argv=[None, "--logdir", log_path])
9
  url = tb.launch()
 
3
 
4
  log_path = "logs"
5
 
6
+
7
+ def launch_tensorboard_pipeline():
8
  tb = program.TensorBoard()
9
  tb.configure(argv=[None, "--logdir", log_path])
10
  url = tb.launch()
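
launch_tensorboard.py now exposes the launch as a callable instead of running at import time. A hypothetical caller (the sleep loop simply keeps the process alive so the TensorBoard server stays up):

import time
from rvc.lib.tools.launch_tensorboard import launch_tensorboard_pipeline

launch_tensorboard_pipeline()   # configures --logdir logs and starts the server
while True:
    time.sleep(600)
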
rvc/lib/tools/model_download.py CHANGED
@@ -4,9 +4,11 @@ import wget
4
  import zipfile
5
  from bs4 import BeautifulSoup
6
  import requests
7
- from urllib.parse import unquote
8
  import re
9
  import shutil
 
 
10
 
11
  def find_folder_parent(search_dir, folder_name):
12
  for dirpath, dirnames, _ in os.walk(search_dir):
@@ -14,12 +16,13 @@ def find_folder_parent(search_dir, folder_name):
14
  return os.path.abspath(dirpath)
15
  return None
16
 
 
17
  now_dir = os.getcwd()
18
  sys.path.append(now_dir)
19
 
20
  from rvc.lib.utils import format_title
21
 
22
- import rvc.lib.tools.gdown as gdown
23
 
24
  file_path = find_folder_parent(now_dir, "logs")
25
 
@@ -71,7 +74,7 @@ def download_from_url(url):
71
  try:
72
  gdown.download(
73
  f"https://drive.google.com/uc?id={file_id}",
74
- quiet=False,
75
  fuzzy=True,
76
  )
77
  except Exception as error:
@@ -91,7 +94,60 @@ def download_from_url(url):
91
  print(error_message)
92
  os.chdir(now_dir)
93
  return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95
  elif "/blob/" in url or "/resolve/" in url:
96
  os.chdir(zips_path)
97
  if "/blob/" in url:
@@ -99,11 +155,12 @@ def download_from_url(url):
99
 
100
  response = requests.get(url, stream=True)
101
  if response.status_code == 200:
102
- file_name = url.split("/")[-1]
103
- file_name = unquote(file_name)
104
-
105
- file_name = re.sub(r"[^a-zA-Z0-9_.-]", "_", file_name)
106
-
 
107
  total_size_in_bytes = int(response.headers.get("content-length", 0))
108
  block_size = 1024
109
  progress_bar_length = 50
@@ -152,6 +209,31 @@ def download_from_url(url):
152
  else:
153
  os.chdir(now_dir)
154
  return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
  else:
156
  try:
157
  os.chdir(zips_path)
@@ -197,73 +279,86 @@ def unzip_file(zip_path, zip_file_name):
197
  os.remove(zip_file_path)
198
 
199
 
200
- url = sys.argv[1]
201
-
202
- if "?download=true" in url:
203
- url = url.replace("?download=true", "")
204
-
205
- verify = download_from_url(url)
206
-
207
- if verify == "downloaded":
208
- extract_folder_path = ""
209
- for filename in os.listdir(zips_path):
210
- if filename.endswith(".zip"):
211
- zipfile_path = os.path.join(zips_path, filename)
212
- print("Proceeding with the extraction...")
213
-
214
- model_zip = os.path.basename(zipfile_path)
215
- model_name = format_title(model_zip.split(".zip")[0])
216
- extract_folder_path = os.path.join(
217
- "logs",
218
- os.path.normpath(model_name),
219
- )
220
-
221
- success = extract_and_show_progress(zipfile_path, extract_folder_path)
222
-
223
- subfolders = [f for f in os.listdir(extract_folder_path) if os.path.isdir(os.path.join(extract_folder_path, f))]
224
- if len(subfolders) == 1:
225
- subfolder_path = os.path.join(extract_folder_path, subfolders[0])
226
- for item in os.listdir(subfolder_path):
227
- s = os.path.join(subfolder_path, item)
228
- d = os.path.join(extract_folder_path, item)
229
- shutil.move(s, d)
230
- os.rmdir(subfolder_path)
231
-
232
- for item in os.listdir(extract_folder_path):
233
- if ".pth" in item:
234
- file_name = item.split(".pth")[0]
235
- if file_name != model_name:
236
- os.rename(
237
- os.path.join(extract_folder_path, item),
238
- os.path.join(extract_folder_path, model_name + ".pth"),
239
- )
240
- else:
241
- if "v2" not in item:
242
- file_name = item.split("_nprobe_1_")[1].split("_v1")[0]
243
  if file_name != model_name:
244
- new_file_name = item.split("_nprobe_1_")[0] + "_nprobe_1_" + model_name + "_v1"
245
  os.rename(
246
  os.path.join(extract_folder_path, item),
247
- os.path.join(extract_folder_path, new_file_name + ".index"),
248
  )
249
  else:
250
- file_name = item.split("_nprobe_1_")[1].split("_v2")[0]
251
- if file_name != model_name:
252
- new_file_name = item.split("_nprobe_1_")[0] + "_nprobe_1_" + model_name + "_v2"
253
- os.rename(
254
- os.path.join(extract_folder_path, item),
255
- os.path.join(extract_folder_path, new_file_name + ".index"),
256
- )
257
-
258
- if success:
259
- print(f"Model {model_name} downloaded!")
260
- else:
261
- print(f"Error downloading {model_name}")
262
- sys.exit()
263
- if extract_folder_path == "":
264
- print("Zip file was not found.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
265
  sys.exit()
266
- result = search_pth_index(extract_folder_path)
267
- else:
268
- message = "Error"
269
- sys.exit()
 
4
  import zipfile
5
  from bs4 import BeautifulSoup
6
  import requests
7
+ from urllib.parse import unquote, urlencode, parse_qs, urlparse
8
  import re
9
  import shutil
10
+ import six
11
+
12
 
13
  def find_folder_parent(search_dir, folder_name):
14
  for dirpath, dirnames, _ in os.walk(search_dir):
 
16
  return os.path.abspath(dirpath)
17
  return None
18
 
19
+
20
  now_dir = os.getcwd()
21
  sys.path.append(now_dir)
22
 
23
  from rvc.lib.utils import format_title
24
 
25
+ from rvc.lib.tools import gdown
26
 
27
  file_path = find_folder_parent(now_dir, "logs")
28
 
 
74
  try:
75
  gdown.download(
76
  f"https://drive.google.com/uc?id={file_id}",
77
+ quiet=True,
78
  fuzzy=True,
79
  )
80
  except Exception as error:
 
94
  print(error_message)
95
  os.chdir(now_dir)
96
  return None
97
+ elif "disk.yandex.ru" in url:
98
+ base_url = "https://cloud-api.yandex.net/v1/disk/public/resources/download?"
99
+ public_key = url
100
+ final_url = base_url + urlencode(dict(public_key=public_key))
101
+ response = requests.get(final_url)
102
+ download_url = response.json()["href"]
103
+ download_response = requests.get(download_url)
104
+
105
+ if download_response.status_code == 200:
106
+ filename = parse_qs(urlparse(unquote(download_url)).query).get(
107
+ "filename", [""]
108
+ )[0]
109
+ if filename:
110
+ os.chdir(zips_path)
111
+ with open(filename, "wb") as f:
112
+ f.write(download_response.content)
113
+ else:
114
+ print("Failed to get filename from URL.")
115
+ return None
116
 
117
+ elif "pixeldrain.com" in url:
118
+ try:
119
+ file_id = url.split("pixeldrain.com/u/")[1]
120
+ os.chdir(zips_path)
121
+ print(file_id)
122
+ response = requests.get(f"https://pixeldrain.com/api/file/{file_id}")
123
+ if response.status_code == 200:
124
+ file_name = (
125
+ response.headers.get("Content-Disposition")
126
+ .split("filename=")[-1]
127
+ .strip('";')
128
+ )
129
+ os.makedirs(zips_path, exist_ok=True)
130
+ with open(os.path.join(zips_path, file_name), "wb") as newfile:
131
+ newfile.write(response.content)
132
+ os.chdir(file_path)
133
+ return "downloaded"
134
+ else:
135
+ os.chdir(file_path)
136
+ return None
137
+ except Exception as e:
138
+ print(e)
139
+ os.chdir(file_path)
140
+ return None
141
+
142
+ elif "cdn.discordapp.com" in url:
143
+ file = requests.get(url)
144
+ os.chdir(zips_path)
145
+ if file.status_code == 200:
146
+ name = url.split("/")
147
+ with open(os.path.join(name[-1]), "wb") as newfile:
148
+ newfile.write(file.content)
149
+ else:
150
+ return None
151
  elif "/blob/" in url or "/resolve/" in url:
152
  os.chdir(zips_path)
153
  if "/blob/" in url:
 
155
 
156
  response = requests.get(url, stream=True)
157
  if response.status_code == 200:
158
+ content_disposition = six.moves.urllib_parse.unquote(
159
+ response.headers["Content-Disposition"]
160
+ )
161
+ m = re.search(r'filename="([^"]+)"', content_disposition)
162
+ file_name = m.groups()[0]
163
+ file_name = file_name.replace(os.path.sep, "_")
164
  total_size_in_bytes = int(response.headers.get("content-length", 0))
165
  block_size = 1024
166
  progress_bar_length = 50
 
209
  else:
210
  os.chdir(now_dir)
211
  return None
212
+ elif "applio.org" in url:
213
+ parts = url.split("/")
214
+ id_with_query = parts[-1]
215
+ id_parts = id_with_query.split("?")
216
+ id_number = id_parts[0]
217
+
218
+ url = "https://cjtfqzjfdimgpvpwhzlv.supabase.co/rest/v1/models"
219
+ headers = {
220
+ "apikey": "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZSIsInJlZiI6ImNqdGZxempmZGltZ3B2cHdoemx2Iiwicm9sZSI6ImFub24iLCJpYXQiOjE2OTUxNjczODgsImV4cCI6MjAxMDc0MzM4OH0.7z5WMIbjR99c2Ooc0ma7B_FyGq10G8X-alkCYTkKR10"
221
+ }
222
+
223
+ params = {"id": f"eq.{id_number}"}
224
+ response = requests.get(url, headers=headers, params=params)
225
+ if response.status_code == 200:
226
+ json_response = response.json()
227
+ print(json_response)
228
+ if json_response:
229
+ link = json_response[0]["link"]
230
+ verify = download_from_url(link)
231
+ if verify == "downloaded":
232
+ return "downloaded"
233
+ else:
234
+ return None
235
+ else:
236
+ return None
237
  else:
238
  try:
239
  os.chdir(zips_path)
 
279
  os.remove(zip_file_path)
280
 
281
 
282
+ def model_download_pipeline(url):
283
+ verify = download_from_url(url)
284
+ if verify == "downloaded":
285
+ extract_folder_path = ""
286
+ for filename in os.listdir(zips_path):
287
+ if filename.endswith(".zip"):
288
+ zipfile_path = os.path.join(zips_path, filename)
289
+ print("Proceeding with the extraction...")
290
+
291
+ model_zip = os.path.basename(zipfile_path)
292
+ model_name = format_title(model_zip.split(".zip")[0])
293
+ extract_folder_path = os.path.join(
294
+ "logs",
295
+ os.path.normpath(model_name),
296
+ )
297
+
298
+ success = extract_and_show_progress(zipfile_path, extract_folder_path)
299
+
300
+ subfolders = [
301
+ f
302
+ for f in os.listdir(extract_folder_path)
303
+ if os.path.isdir(os.path.join(extract_folder_path, f))
304
+ ]
305
+ if len(subfolders) == 1:
306
+ subfolder_path = os.path.join(extract_folder_path, subfolders[0])
307
+ for item in os.listdir(subfolder_path):
308
+ s = os.path.join(subfolder_path, item)
309
+ d = os.path.join(extract_folder_path, item)
310
+ shutil.move(s, d)
311
+ os.rmdir(subfolder_path)
312
+
313
+ for item in os.listdir(extract_folder_path):
314
+ if ".pth" in item:
315
+ file_name = item.split(".pth")[0]
 
 
 
 
 
 
 
 
 
316
  if file_name != model_name:
 
317
  os.rename(
318
  os.path.join(extract_folder_path, item),
319
+ os.path.join(extract_folder_path, model_name + ".pth"),
320
  )
321
  else:
322
+ if "v2" not in item:
323
+ file_name = item.split("_nprobe_1_")[1].split("_v1")[0]
324
+ if file_name != model_name:
325
+ new_file_name = (
326
+ item.split("_nprobe_1_")[0]
327
+ + "_nprobe_1_"
328
+ + model_name
329
+ + "_v1"
330
+ )
331
+ os.rename(
332
+ os.path.join(extract_folder_path, item),
333
+ os.path.join(
334
+ extract_folder_path, new_file_name + ".index"
335
+ ),
336
+ )
337
+ else:
338
+ file_name = item.split("_nprobe_1_")[1].split("_v2")[0]
339
+ if file_name != model_name:
340
+ new_file_name = (
341
+ item.split("_nprobe_1_")[0]
342
+ + "_nprobe_1_"
343
+ + model_name
344
+ + "_v2"
345
+ )
346
+ os.rename(
347
+ os.path.join(extract_folder_path, item),
348
+ os.path.join(
349
+ extract_folder_path, new_file_name + ".index"
350
+ ),
351
+ )
352
+
353
+ if success:
354
+ print(f"Model {model_name} downloaded!")
355
+ else:
356
+ print(f"Error downloading {model_name}")
357
+ sys.exit()
358
+ if extract_folder_path == "":
359
+ print("Zip file was not found.")
360
+ sys.exit()
361
+ result = search_pth_index(extract_folder_path)
362
+ else:
363
+ message = "Error"
364
  sys.exit()
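
With the script body folded into model_download_pipeline(url), the downloader can now be driven from other modules rather than only via sys.argv. A hypothetical call; the URL is an example value:

from rvc.lib.tools.model_download import model_download_pipeline

# Handles the link types covered above: Drive, Hugging Face, pixeldrain, Yandex Disk, Discord CDN, applio.org, ...
model_download_pipeline("https://huggingface.co/someuser/somemodel/resolve/main/somemodel.zip")
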
 
 
 
 
rvc/lib/tools/prerequisites_download.py CHANGED
@@ -1,11 +1,10 @@
1
  import os
2
  import wget
3
- import sys
4
 
5
- url_base = "https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main"
6
- models_download = [
7
  (
8
- "pretrained/",
9
  [
10
  "D32k.pth",
11
  "D40k.pth",
@@ -21,6 +20,8 @@ models_download = [
21
  "f0G48k.pth",
22
  ],
23
  ),
 
 
24
  (
25
  "pretrained_v2/",
26
  [
@@ -40,45 +41,55 @@ models_download = [
40
  ),
41
  ]
42
 
43
- models_file = [
44
  "hubert_base.pt",
45
  "rmvpe.pt",
46
- # "rmvpe.onnx",
 
47
  ]
48
 
49
- executables_file = [
50
- "ffmpeg.exe",
51
- "ffprobe.exe",
52
- ]
53
 
54
- folder_mapping = {
55
- "pretrained/": "rvc/pretraineds/pretrained_v1/",
56
  "pretrained_v2/": "rvc/pretraineds/pretrained_v2/",
57
  }
58
 
59
- for file_name in models_file:
60
- destination_path = os.path.join(file_name)
61
- url = f"{url_base}/{file_name}"
62
- if not os.path.exists(destination_path):
63
- os.makedirs(os.path.dirname(destination_path) or ".", exist_ok=True)
64
- print(f"\nDownloading {url} to {destination_path}...")
65
- wget.download(url, out=destination_path)
66
 
67
- for file_name in executables_file:
68
- if sys.platform == "win32":
69
- destination_path = os.path.join(file_name)
70
- url = f"{url_base}/{file_name}"
71
- if not os.path.exists(destination_path):
72
- os.makedirs(os.path.dirname(destination_path) or ".", exist_ok=True)
73
- print(f"\nDownloading {url} to {destination_path}...")
74
- wget.download(url, out=destination_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
 
76
- for remote_folder, file_list in models_download:
77
- local_folder = folder_mapping.get(remote_folder, "")
78
- for file in file_list:
79
- destination_path = os.path.join(local_folder, file)
80
- url = f"{url_base}/{remote_folder}{file}"
81
- if not os.path.exists(destination_path):
82
- os.makedirs(os.path.dirname(destination_path) or ".", exist_ok=True)
83
- print(f"\nDownloading {url} to {destination_path}...")
84
- wget.download(url, out=destination_path)
 
 
1
  import os
2
  import wget
 
3
 
4
+ url_base = "https://huggingface.co/IAHispano/Applio/resolve/main/Resources"
5
+ pretraineds_v1_list = [
6
  (
7
+ "pretrained_v1/",
8
  [
9
  "D32k.pth",
10
  "D40k.pth",
 
20
  "f0G48k.pth",
21
  ],
22
  ),
23
+ ]
24
+ pretraineds_v2_list = [
25
  (
26
  "pretrained_v2/",
27
  [
 
41
  ),
42
  ]
43
 
44
+ models_list = [
45
  "hubert_base.pt",
46
  "rmvpe.pt",
47
+ "fcpe.pt",
48
+ # "rmvpe.onnx"
49
  ]
50
 
51
+ executables_list = ["ffmpeg.exe", "ffprobe.exe"]
 
 
 
52
 
53
+ folder_mapping_list = {
54
+ "pretrained_v1/": "rvc/pretraineds/pretrained_v1/",
55
  "pretrained_v2/": "rvc/pretraineds/pretrained_v2/",
56
  }
57
 
 
 
 
 
 
 
 
58
 
59
+ def prequisites_download_pipeline(pretraineds_v1, pretraineds_v2, models, exe):
60
+ def download_files(file_list):
61
+ for file_name in file_list:
62
+ destination_path = os.path.join(file_name)
63
+ url = f"{url_base}/{file_name}"
64
+ if not os.path.exists(destination_path):
65
+ os.makedirs(os.path.dirname(destination_path) or ".", exist_ok=True)
66
+ print(f"\nDownloading {url} to {destination_path}...")
67
+ wget.download(url, out=destination_path)
68
+
69
+ if models == "True":
70
+ download_files(models_list)
71
+
72
+ if exe == "True" and os.name == "nt":
73
+ download_files(executables_list)
74
+
75
+ if pretraineds_v1 == "True":
76
+ for remote_folder, file_list in pretraineds_v1_list:
77
+ local_folder = folder_mapping_list.get(remote_folder, "")
78
+ for file in file_list:
79
+ destination_path = os.path.join(local_folder, file)
80
+ url = f"{url_base}/{remote_folder}{file}"
81
+ if not os.path.exists(destination_path):
82
+ os.makedirs(os.path.dirname(destination_path) or ".", exist_ok=True)
83
+ print(f"\nDownloading {url} to {destination_path}...")
84
+ wget.download(url, out=destination_path)
85
 
86
+ if pretraineds_v2 == "True":
87
+ for remote_folder, file_list in pretraineds_v2_list:
88
+ local_folder = folder_mapping_list.get(remote_folder, "")
89
+ for file in file_list:
90
+ destination_path = os.path.join(local_folder, file)
91
+ url = f"{url_base}/{remote_folder}{file}"
92
+ if not os.path.exists(destination_path):
93
+ os.makedirs(os.path.dirname(destination_path) or ".", exist_ok=True)
94
+ print(f"\nDownloading {url} to {destination_path}...")
95
+ wget.download(url, out=destination_path)
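
prerequisites_download.py likewise becomes callable. A hypothetical invocation that mirrors the flag convention the function itself uses (string "True"/"False" values):

from rvc.lib.tools.prerequisites_download import prequisites_download_pipeline

# Skip the v1 pretraineds, fetch the v2 pretraineds, the base models and (on Windows) the ffmpeg executables.
prequisites_download_pipeline("False", "True", "True", "True")
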
rvc/lib/tools/pretrained_selector.py CHANGED
@@ -60,4 +60,4 @@ def pretrained_selector(pitch_guidance):
60
  "rvc/pretraineds/pretrained_v2/D48k.pth",
61
  ),
62
  },
63
- }
 
60
  "rvc/pretraineds/pretrained_v2/D48k.pth",
61
  ),
62
  },
63
+ }
rvc/lib/tools/split_audio.py CHANGED
@@ -17,11 +17,13 @@ def process_audio(file_path):
17
  min_silence_len = 750 # ms, adjust as needed
18
 
19
  # detect nonsilent parts
20
- nonsilent_parts = detect_nonsilent(song, min_silence_len=min_silence_len, silence_thresh=silence_thresh)
 
 
21
 
22
  # Create a new directory to store chunks
23
  file_dir = os.path.dirname(file_path)
24
- file_name = os.path.basename(file_path).split('.')[0]
25
  file_name = format_title(file_name)
26
  new_dir_path = os.path.join(file_dir, file_name)
27
  os.makedirs(new_dir_path, exist_ok=True)
@@ -58,7 +60,7 @@ def process_audio(file_path):
58
  def merge_audio(timestamps_file):
59
  try:
60
  # Extract prefix from the timestamps filename
61
- prefix = os.path.basename(timestamps_file).replace('_timestamps.txt', '')
62
  timestamps_dir = os.path.dirname(timestamps_file)
63
 
64
  # Open the timestamps file
@@ -98,8 +100,8 @@ def merge_audio(timestamps_file):
98
  # Concatenate all audio_segments and export
99
  merged_audio = sum(audio_segments)
100
  merged_audio_np = np.array(merged_audio.get_array_of_samples())
101
- #print(f"Exported merged file: {merged_filename}\n")
102
  return merged_audio.frame_rate, merged_audio_np
103
 
104
  except Exception as e:
105
- print(f"An error occurred: {e}")
 
17
  min_silence_len = 750 # ms, adjust as needed
18
 
19
  # detect nonsilent parts
20
+ nonsilent_parts = detect_nonsilent(
21
+ song, min_silence_len=min_silence_len, silence_thresh=silence_thresh
22
+ )
23
 
24
  # Create a new directory to store chunks
25
  file_dir = os.path.dirname(file_path)
26
+ file_name = os.path.basename(file_path).split(".")[0]
27
  file_name = format_title(file_name)
28
  new_dir_path = os.path.join(file_dir, file_name)
29
  os.makedirs(new_dir_path, exist_ok=True)
 
60
  def merge_audio(timestamps_file):
61
  try:
62
  # Extract prefix from the timestamps filename
63
+ prefix = os.path.basename(timestamps_file).replace("_timestamps.txt", "")
64
  timestamps_dir = os.path.dirname(timestamps_file)
65
 
66
  # Open the timestamps file
 
100
  # Concatenate all audio_segments and export
101
  merged_audio = sum(audio_segments)
102
  merged_audio_np = np.array(merged_audio.get_array_of_samples())
103
+ # print(f"Exported merged file: {merged_filename}\n")
104
  return merged_audio.frame_rate, merged_audio_np
105
 
106
  except Exception as e:
107
+ print(f"An error occurred: {e}")
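
For readers unfamiliar with pydub, the reformatted detect_nonsilent call above does the actual chunking work. A small hedged sketch; the input path and silence threshold are example values (the repo computes its own threshold elsewhere):

from pydub import AudioSegment
from pydub.silence import detect_nonsilent

song = AudioSegment.from_file("input.wav")                  # example path
nonsilent_parts = detect_nonsilent(
    song, min_silence_len=750, silence_thresh=-40           # ms of silence / dBFS threshold
)
# -> list of [start_ms, end_ms] ranges containing audio, which process_audio turns into chunks
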
rvc/lib/utils.py CHANGED
@@ -19,8 +19,10 @@ def load_audio(file, sampling_rate):
19
 
20
 
21
  def format_title(title):
22
- formatted_title = unicodedata.normalize('NFKD', title).encode('ascii', 'ignore').decode('utf-8')
23
- formatted_title = re.sub(r'[\u2500-\u257F]+', '', formatted_title)
24
- formatted_title = re.sub(r'[^\w\s.-]', '', formatted_title)
25
- formatted_title = re.sub(r'\s+', '_', formatted_title)
26
- return formatted_title
 
 
 
19
 
20
 
21
  def format_title(title):
22
+ formatted_title = (
23
+ unicodedata.normalize("NFKD", title).encode("ascii", "ignore").decode("utf-8")
24
+ )
25
+ formatted_title = re.sub(r"[\u2500-\u257F]+", "", formatted_title)
26
+ formatted_title = re.sub(r"[^\w\s.-]", "", formatted_title)
27
+ formatted_title = re.sub(r"\s+", "_", formatted_title)
28
+ return formatted_title
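
The reflowed format_title keeps the same behaviour: ASCII-fold, strip box-drawing characters and punctuation, collapse whitespace to underscores. Roughly, for an illustrative input:

from rvc.lib.utils import format_title

format_title("Café Vocals (v2) – final mix.wav")
# -> "Cafe_Vocals_v2_final_mix.wav"
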
rvc/train/extract/extract_feature_print.py CHANGED
@@ -7,6 +7,9 @@ import fairseq
7
  import soundfile as sf
8
  import numpy as np
9
 
 
 
 
10
 
11
  device = sys.argv[1]
12
  n_parts = int(sys.argv[2])
 
7
  import soundfile as sf
8
  import numpy as np
9
 
10
+ import logging
11
+
12
+ logging.getLogger("fairseq").setLevel(logging.WARNING)
13
 
14
  device = sys.argv[1]
15
  n_parts = int(sys.argv[2])
rvc/train/process/extract_index.py CHANGED
@@ -78,8 +78,11 @@ try:
78
  index_added.add(big_npy[i : i + batch_size_add])
79
 
80
  faiss.write_index(index_added, index_filepath_added)
 
81
 
82
  except Exception as error:
83
  print(f"Failed to train index: {error}")
84
-
85
- print(f"Saved index file '{index_filepath_added}'")
 
 
 
78
  index_added.add(big_npy[i : i + batch_size_add])
79
 
80
  faiss.write_index(index_added, index_filepath_added)
81
+ print(f"Saved index file '{index_filepath_added}'")
82
 
83
  except Exception as error:
84
  print(f"Failed to train index: {error}")
85
+ if "one array to concatenate" in str(error):
86
+ print(
87
+ "If you are running this code in a virtual environment, make sure you have enough GPU available to generate the Index file."
88
+ )
rvc/train/process/extract_model.py CHANGED
@@ -1,28 +1,27 @@
1
  import os
2
  import torch
 
 
3
  from collections import OrderedDict
4
 
5
 
6
  def replace_keys_in_dict(d, old_key_part, new_key_part):
7
- # Use OrderedDict if the original is an OrderedDict
8
  if isinstance(d, OrderedDict):
9
  updated_dict = OrderedDict()
10
  else:
11
  updated_dict = {}
12
  for key, value in d.items():
13
- # Replace the key part if found
14
  new_key = key.replace(old_key_part, new_key_part)
15
- # If the value is a dictionary, apply the function recursively
16
  if isinstance(value, dict):
17
  value = replace_keys_in_dict(value, old_key_part, new_key_part)
18
  updated_dict[new_key] = value
19
  return updated_dict
20
 
21
 
22
- def extract_model(ckpt, sr, if_f0, name, model_dir, epoch, version, hps):
23
  try:
24
- print(f"Saved model '{model_dir}' (epoch {epoch})")
25
- pth_file = f"{name}_{epoch}e.pth"
26
  pth_file_old_version_path = os.path.join(
27
  model_dir, f"{pth_file}_old_version.pth"
28
  )
@@ -51,7 +50,18 @@ def extract_model(ckpt, sr, if_f0, name, model_dir, epoch, version, hps):
51
  hps.model.gin_channels,
52
  hps.data.sampling_rate,
53
  ]
54
- opt["info"], opt["sr"], opt["f0"], opt["version"] = epoch, sr, if_f0, version
 
 
 
 
 
 
 
 
 
 
 
55
  torch.save(opt, model_dir)
56
 
57
  model = torch.load(model_dir, map_location=torch.device("cpu"))
 
1
  import os
2
  import torch
3
+ import hashlib
4
+ import datetime
5
  from collections import OrderedDict
6
 
7
 
8
  def replace_keys_in_dict(d, old_key_part, new_key_part):
 
9
  if isinstance(d, OrderedDict):
10
  updated_dict = OrderedDict()
11
  else:
12
  updated_dict = {}
13
  for key, value in d.items():
 
14
  new_key = key.replace(old_key_part, new_key_part)
 
15
  if isinstance(value, dict):
16
  value = replace_keys_in_dict(value, old_key_part, new_key_part)
17
  updated_dict[new_key] = value
18
  return updated_dict
19
 
20
 
21
+ def extract_model(ckpt, sr, if_f0, name, model_dir, epoch, step, version, hps):
22
  try:
23
+ print(f"Saved model '{model_dir}' (epoch {epoch} and step {step})")
24
+ pth_file = f"{name}_{epoch}e_{step}s.pth"
25
  pth_file_old_version_path = os.path.join(
26
  model_dir, f"{pth_file}_old_version.pth"
27
  )
 
50
  hps.model.gin_channels,
51
  hps.data.sampling_rate,
52
  ]
53
+
54
+ opt["epoch"] = epoch
55
+ opt["step"] = step
56
+ opt["sr"] = sr
57
+ opt["f0"] = if_f0
58
+ opt["version"] = version
59
+ opt["creation_date"] = datetime.datetime.now().isoformat()
60
+
61
+ hash_input = f"{str(ckpt)} {epoch} {step} {datetime.datetime.now().isoformat()}"
62
+ model_hash = hashlib.sha256(hash_input.encode()).hexdigest()
63
+ opt["model_hash"] = model_hash
64
+
65
  torch.save(opt, model_dir)
66
 
67
  model = torch.load(model_dir, map_location=torch.device("cpu"))
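
extract_model now stamps each checkpoint with the epoch/step count, a creation date and a SHA-256 fingerprint. A hedged recomputation of that fingerprint with example values (ckpt_repr stands for str(ckpt) in the real code):

import hashlib
import datetime

epoch, step = 100, 25000                                    # example values
ckpt_repr = "OrderedDict(...)"                              # str(ckpt) in extract_model
hash_input = f"{ckpt_repr} {epoch} {step} {datetime.datetime.now().isoformat()}"
model_hash = hashlib.sha256(hash_input.encode()).hexdigest()
print(model_hash)                                           # 64-character hex digest stored as opt["model_hash"]
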
rvc/train/process/extract_small_model.py ADDED
@@ -0,0 +1,175 @@
1
+ import os
2
+ import torch
3
+ import hashlib
4
+ import datetime
5
+ from collections import OrderedDict
6
+
7
+
8
+ def replace_keys_in_dict(d, old_key_part, new_key_part):
9
+ # Use OrderedDict if the original is an OrderedDict
10
+ if isinstance(d, OrderedDict):
11
+ updated_dict = OrderedDict()
12
+ else:
13
+ updated_dict = {}
14
+ for key, value in d.items():
15
+ # Replace the key part if found
16
+ new_key = key.replace(old_key_part, new_key_part)
17
+ # If the value is a dictionary, apply the function recursively
18
+ if isinstance(value, dict):
19
+ value = replace_keys_in_dict(value, old_key_part, new_key_part)
20
+ updated_dict[new_key] = value
21
+ return updated_dict
22
+
23
+
24
+ def extract_small_model(path, name, sr, if_f0, version, epoch, step):
25
+ try:
26
+ ckpt = torch.load(path, map_location="cpu")
27
+ pth_file = f"{name}.pth"
28
+ pth_file_old_version_path = os.path.join("logs", f"{pth_file}_old_version.pth")
29
+ opt = OrderedDict(
30
+ weight={
31
+ key: value.half() for key, value in ckpt.items() if "enc_q" not in key
32
+ }
33
+ )
34
+ if "model" in ckpt:
35
+ ckpt = ckpt["model"]
36
+ opt = OrderedDict()
37
+ opt["weight"] = {}
38
+ for key in ckpt.keys():
39
+ if "enc_q" in key:
40
+ continue
41
+ opt["weight"][key] = ckpt[key].half()
42
+ if sr == "40k":
43
+ opt["config"] = [
44
+ 1025,
45
+ 32,
46
+ 192,
47
+ 192,
48
+ 768,
49
+ 2,
50
+ 6,
51
+ 3,
52
+ 0,
53
+ "1",
54
+ [3, 7, 11],
55
+ [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
56
+ [10, 10, 2, 2],
57
+ 512,
58
+ [16, 16, 4, 4],
59
+ 109,
60
+ 256,
61
+ 40000,
62
+ ]
63
+ elif sr == "48k":
64
+ if version == "v1":
65
+ opt["config"] = [
66
+ 1025,
67
+ 32,
68
+ 192,
69
+ 192,
70
+ 768,
71
+ 2,
72
+ 6,
73
+ 3,
74
+ 0,
75
+ "1",
76
+ [3, 7, 11],
77
+ [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
78
+ [10, 6, 2, 2, 2],
79
+ 512,
80
+ [16, 16, 4, 4, 4],
81
+ 109,
82
+ 256,
83
+ 48000,
84
+ ]
85
+ else:
86
+ opt["config"] = [
87
+ 1025,
88
+ 32,
89
+ 192,
90
+ 192,
91
+ 768,
92
+ 2,
93
+ 6,
94
+ 3,
95
+ 0,
96
+ "1",
97
+ [3, 7, 11],
98
+ [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
99
+ [12, 10, 2, 2],
100
+ 512,
101
+ [24, 20, 4, 4],
102
+ 109,
103
+ 256,
104
+ 48000,
105
+ ]
106
+ elif sr == "32k":
107
+ if version == "v1":
108
+ opt["config"] = [
109
+ 513,
110
+ 32,
111
+ 192,
112
+ 192,
113
+ 768,
114
+ 2,
115
+ 6,
116
+ 3,
117
+ 0,
118
+ "1",
119
+ [3, 7, 11],
120
+ [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
121
+ [10, 4, 2, 2, 2],
122
+ 512,
123
+ [16, 16, 4, 4, 4],
124
+ 109,
125
+ 256,
126
+ 32000,
127
+ ]
128
+ else:
129
+ opt["config"] = [
130
+ 513,
131
+ 32,
132
+ 192,
133
+ 192,
134
+ 768,
135
+ 2,
136
+ 6,
137
+ 3,
138
+ 0,
139
+ "1",
140
+ [3, 7, 11],
141
+ [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
142
+ [10, 8, 2, 2],
143
+ 512,
144
+ [20, 16, 4, 4],
145
+ 109,
146
+ 256,
147
+ 32000,
148
+ ]
149
+
150
+ opt["epoch"] = epoch
151
+ opt["step"] = step
152
+ opt["sr"] = sr
153
+ opt["f0"] = int(if_f0)
154
+ opt["version"] = version
155
+ opt["creation_date"] = datetime.datetime.now().isoformat()
156
+
157
+ hash_input = f"{str(ckpt)} {epoch} {step} {datetime.datetime.now().isoformat()}"
158
+ model_hash = hashlib.sha256(hash_input.encode()).hexdigest()
159
+ opt["model_hash"] = model_hash
160
+
161
+ model = torch.load(pth_file_old_version_path, map_location=torch.device("cpu"))
162
+ torch.save(
163
+ replace_keys_in_dict(
164
+ replace_keys_in_dict(
165
+ model, ".parametrizations.weight.original1", ".weight_v"
166
+ ),
167
+ ".parametrizations.weight.original0",
168
+ ".weight_g",
169
+ ),
170
+ pth_file_old_version_path,
171
+ )
172
+ os.remove(pth_file_old_version_path)
173
+ os.rename(pth_file_old_version_path, pth_file)
174
+ except Exception as error:
175
+ print(error)
rvc/train/process/model_blender.py ADDED
@@ -0,0 +1,63 @@
1
+ import os
2
+ import torch
3
+ from collections import OrderedDict
4
+
5
+
6
+ def extract(ckpt):
7
+ a = ckpt["model"]
8
+ opt = OrderedDict()
9
+ opt["weight"] = {}
10
+ for key in a.keys():
11
+ if "enc_q" in key:
12
+ continue
13
+ opt["weight"][key] = a[key]
14
+ return opt
15
+
16
+
17
+ def model_blender(name, path1, path2, ratio):
18
+ try:
19
+ message = f"Model {path1} and {path2} are merged with alpha {ratio}."
20
+ ckpt1 = torch.load(path1, map_location="cpu")
21
+ ckpt2 = torch.load(path2, map_location="cpu")
22
+ cfg = ckpt1["config"]
23
+ cfg_f0 = ckpt1["f0"]
24
+ cfg_version = ckpt1["version"]
25
+
26
+ if "model" in ckpt1:
27
+ ckpt1 = extract(ckpt1)
28
+ else:
29
+ ckpt1 = ckpt1["weight"]
30
+ if "model" in ckpt2:
31
+ ckpt2 = extract(ckpt2)
32
+ else:
33
+ ckpt2 = ckpt2["weight"]
34
+
35
+ if sorted(list(ckpt1.keys())) != sorted(list(ckpt2.keys())):
36
+ return "Fail to merge the models. The model architectures are not the same."
37
+
38
+ opt = OrderedDict()
39
+ opt["weight"] = {}
40
+ for key in ckpt1.keys():
41
+ if key == "emb_g.weight" and ckpt1[key].shape != ckpt2[key].shape:
42
+ min_shape0 = min(ckpt1[key].shape[0], ckpt2[key].shape[0])
43
+ opt["weight"][key] = (
44
+ ratio * (ckpt1[key][:min_shape0].float())
45
+ + (1 - ratio) * (ckpt2[key][:min_shape0].float())
46
+ ).half()
47
+ else:
48
+ opt["weight"][key] = (
49
+ ratio * (ckpt1[key].float()) + (1 - ratio) * (ckpt2[key].float())
50
+ ).half()
51
+
52
+ opt["config"] = cfg
53
+ opt["sr"] = message
54
+ opt["f0"] = cfg_f0
55
+ opt["version"] = cfg_version
56
+ opt["info"] = message
57
+
58
+ torch.save(opt, os.path.join("logs", "%s.pth" % name))
59
+ print(message)
60
+ return message, os.path.join("logs", "%s.pth" % name)
61
+ except Exception as error:
62
+ print(error)
63
+ return error
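
The new model_blender merges two voice models by linearly interpolating every shared tensor. A minimal sketch of that blend on toy tensors:

import torch

ratio = 0.5
w1 = torch.tensor([1.0, 2.0])
w2 = torch.tensor([3.0, 6.0])
blended = (ratio * w1.float() + (1 - ratio) * w2.float()).half()
print(blended)   # tensor([2., 4.], dtype=torch.float16)

In the module above the same formula runs over every key of the two state dicts, with emb_g.weight truncated to the smaller speaker-embedding table when the shapes differ.
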
rvc/train/process/model_information.py ADDED
@@ -0,0 +1,33 @@
1
+ import torch
2
+ from datetime import datetime
3
+
4
+
5
+ def prettify_date(date_str):
6
+ date_time_obj = datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%S.%f")
7
+ return date_time_obj.strftime("%Y-%m-%d %H:%M:%S")
8
+
9
+
10
+ def model_information(path):
11
+ model_data = torch.load(path, map_location="cpu")
12
+
13
+ print(f"Loaded model from {path}")
14
+
15
+ epochs = model_data.get("epoch", "None")
16
+ steps = model_data.get("step", "None")
17
+ sr = model_data.get("sr", "None")
18
+ f0 = model_data.get("f0", "None")
19
+ version = model_data.get("version", "None")
20
+ creation_date = model_data.get("creation_date", "None")
21
+ model_hash = model_data.get("model_hash", "None")
22
+
23
+ pitch_guidance = "True" if f0 == 1 else "False"
24
+
25
+ return (
26
+ f"Epochs: {epochs}\n"
27
+ f"Steps: {steps}\n"
28
+ f"RVC Version: {version}\n"
29
+ f"Sampling Rate: {sr}\n"
30
+ f"Pitch Guidance: {pitch_guidance}\n"
31
+ f"Creation Date: {prettify_date(creation_date)}\n"
32
+ f"Hash (ID): {model_hash}"
33
+ )
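
A hypothetical call to the new metadata reader; the checkpoint path is an example:

from rvc.train.process.model_information import model_information

print(model_information("logs/my_model/my_model_100e_25000s.pth"))
# Epochs, Steps, RVC Version, Sampling Rate, Pitch Guidance, Creation Date, Hash (ID)
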
rvc/train/train.py CHANGED
@@ -70,15 +70,9 @@ torch.backends.cudnn.deterministic = False
70
  torch.backends.cudnn.benchmark = False
71
 
72
  global_step = 0
73
- bestEpochStep = 0
74
  last_loss_gen_all = 0
75
- lastValue = 1
76
- lowestValue = {"step": 0, "value": float("inf"), "epoch": 0}
77
- dirtyTb = []
78
- dirtyValues = []
79
- dirtySteps = []
80
- dirtyEpochs = []
81
- continued = False
82
 
83
 
84
  class EpochRecorder:
@@ -104,13 +98,16 @@ def main():
104
  print("GPU not detected, reverting to CPU (not recommended)")
105
  n_gpus = 1
106
  children = []
107
- for i in range(n_gpus):
108
- subproc = mp.Process(
109
- target=run,
110
- args=(i, n_gpus, hps),
111
- )
112
- children.append(subproc)
113
- subproc.start()
 
 
 
114
 
115
  for i in range(n_gpus):
116
  children[i].join()
@@ -287,9 +284,13 @@ def run(
287
 
288
 
289
  def train_and_evaluate(rank, epoch, hps, nets, optims, scaler, loaders, writers, cache):
290
- global global_step, last_loss_gen_all, lowestValue
 
291
  if epoch == 1:
292
- last_loss_gen_all = {}
 
 
 
293
  net_g, net_d = nets
294
  optim_g, optim_d = optims
295
  train_loader = loaders[0] if loaders is not None else None
@@ -467,10 +468,15 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, scaler, loaders, writers,
467
  loss_gen, losses_gen = generator_loss(y_d_hat_g)
468
  loss_gen_all = loss_gen + loss_fm + loss_mel + loss_kl
469
 
470
- if loss_gen_all < lowestValue["value"]:
471
- lowestValue["value"] = loss_gen_all
472
- lowestValue["step"] = global_step
473
- lowestValue["epoch"] = epoch
 
 
 
 
 
474
 
475
  optim_g.zero_grad()
476
  scaler.scale(loss_gen_all).backward()
@@ -558,25 +564,43 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, scaler, loaders, writers,
558
  ckpt = net_g.module.state_dict()
559
  else:
560
  ckpt = net_g.state_dict()
561
- extract_model(
562
- ckpt,
563
- hps.sample_rate,
564
- hps.if_f0,
565
- hps.name,
566
- os.path.join(hps.model_dir, "{}_{}e.pth".format(hps.name, epoch)),
567
- epoch,
568
- hps.version,
569
- hps,
570
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
571
 
572
  if rank == 0:
573
  if epoch > 1:
574
- change = last_loss_gen_all - loss_gen_all
575
- change_str = ""
576
- if change != 0:
577
- change_str = f"({'decreased' if change > 0 else 'increased'} {abs(change)})" # decreased = good
578
  print(
579
- f"{hps.name} | epoch={epoch} | step={global_step} | {epoch_recorder.record()} | loss_gen_all={round(loss_gen_all.item(), 3)} {change_str}"
 
 
 
 
580
  )
581
  last_loss_gen_all = loss_gen_all
582
 
@@ -585,9 +609,12 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, scaler, loaders, writers,
585
  f"Training has been successfully completed with {epoch} epoch, {global_step} steps and {round(loss_gen_all.item(), 3)} loss gen."
586
  )
587
  print(
588
- f"Lowest generator loss: {lowestValue['value']} at epoch {lowestValue['epoch']}, step {lowestValue['step']}"
589
  )
590
 
 
 
 
591
  if hasattr(net_g, "module"):
592
  ckpt = net_g.module.state_dict()
593
  else:
@@ -598,8 +625,11 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, scaler, loaders, writers,
598
  hps.sample_rate,
599
  hps.if_f0,
600
  hps.name,
601
- os.path.join(hps.model_dir, "{}_{}e.pth".format(hps.name, epoch)),
 
 
602
  epoch,
 
603
  hps.version,
604
  hps,
605
  )
 
70
  torch.backends.cudnn.benchmark = False
71
 
72
  global_step = 0
73
+ lowest_value = {"step": 0, "value": float("inf"), "epoch": 0}
74
  last_loss_gen_all = 0
75
+ epochs_since_last_lowest = 0
 
 
 
 
 
 
76
 
77
 
78
  class EpochRecorder:
 
98
  print("GPU not detected, reverting to CPU (not recommended)")
99
  n_gpus = 1
100
  children = []
101
+ pid_file_path = os.path.join(now_dir, "rvc", "train", "train_pid.txt")
102
+ with open(pid_file_path, "w") as pid_file:
103
+ for i in range(n_gpus):
104
+ subproc = mp.Process(
105
+ target=run,
106
+ args=(i, n_gpus, hps),
107
+ )
108
+ children.append(subproc)
109
+ subproc.start()
110
+ pid_file.write(str(subproc.pid) + "\n")
111
 
112
  for i in range(n_gpus):
113
  children[i].join()
 
284
 
285
 
286
  def train_and_evaluate(rank, epoch, hps, nets, optims, scaler, loaders, writers, cache):
287
+ global global_step, last_loss_gen_all, lowest_value, epochs_since_last_lowest
288
+
289
  if epoch == 1:
290
+ lowest_value = {"step": 0, "value": float("inf"), "epoch": 0}
291
+ last_loss_gen_all = 0.0
292
+ epochs_since_last_lowest = 0
293
+
294
  net_g, net_d = nets
295
  optim_g, optim_d = optims
296
  train_loader = loaders[0] if loaders is not None else None
 
468
  loss_gen, losses_gen = generator_loss(y_d_hat_g)
469
  loss_gen_all = loss_gen + loss_fm + loss_mel + loss_kl
470
 
471
+ if loss_gen_all < lowest_value["value"]:
472
+ lowest_value["value"] = loss_gen_all
473
+ lowest_value["step"] = global_step
474
+ lowest_value["epoch"] = epoch
475
+ # print(f'Lowest generator loss updated: {lowest_value["value"]} at epoch {epoch}, step {global_step}')
476
+ if epoch > lowest_value["epoch"]:
477
+ print(
478
+ "Alert: The lower generating loss has been exceeded by a lower loss in a subsequent epoch."
479
+ )
480
 
481
  optim_g.zero_grad()
482
  scaler.scale(loss_gen_all).backward()
 
564
  ckpt = net_g.module.state_dict()
565
  else:
566
  ckpt = net_g.state_dict()
567
+ extract_model(
568
+ ckpt,
569
+ hps.sample_rate,
570
+ hps.if_f0,
571
+ hps.name,
572
+ os.path.join(
573
+ hps.model_dir, "{}_{}e_{}s.pth".format(hps.name, epoch, global_step)
574
+ ),
575
+ epoch,
576
+ global_step,
577
+ hps.version,
578
+ hps,
579
+ )
580
+
581
+ if hps.overtraining_detector == 1:
582
+ if lowest_value["value"] < last_loss_gen_all:
583
+ epochs_since_last_lowest += 1
584
+ else:
585
+ epochs_since_last_lowest = 0
586
+
587
+ if epochs_since_last_lowest >= hps.overtraining_threshold:
588
+ print(
589
+ "Stopping training due to possible overtraining. Lowest generator loss: {} at epoch {}, step {}".format(
590
+ lowest_value["value"], lowest_value["epoch"], lowest_value["step"]
591
+ )
592
+ )
593
+ os._exit(2333333)
594
 
595
  if rank == 0:
596
  if epoch > 1:
597
+ print(hps.overtraining_threshold)
 
 
 
598
  print(
599
+ f"{hps.name} | epoch={epoch} | step={global_step} | {epoch_recorder.record()} | lowest_value={lowest_value['value']} (epoch {lowest_value['epoch']} and step {lowest_value['step']})"
600
+ )
601
+ else:
602
+ print(
603
+ f"{hps.name} | epoch={epoch} | step={global_step} | {epoch_recorder.record()}"
604
  )
605
  last_loss_gen_all = loss_gen_all
606
 
 
609
  f"Training has been successfully completed with {epoch} epoch, {global_step} steps and {round(loss_gen_all.item(), 3)} loss gen."
610
  )
611
  print(
612
+ f"Lowest generator loss: {lowest_value['value']} at epoch {lowest_value['epoch']}, step {lowest_value['step']}"
613
  )
614
 
615
+ pid_file_path = os.path.join(now_dir, "rvc", "train", "train_pid.txt")
616
+ os.remove(pid_file_path)
617
+
618
  if hasattr(net_g, "module"):
619
  ckpt = net_g.module.state_dict()
620
  else:
 
625
  hps.sample_rate,
626
  hps.if_f0,
627
  hps.name,
628
+ os.path.join(
629
+ hps.model_dir, "{}_{}e_{}s.pth".format(hps.name, epoch, global_step)
630
+ ),
631
  epoch,
632
+ global_step,
633
  hps.version,
634
  hps,
635
  )
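A minimal standalone sketch of the early-stopping idea these hunks introduce. This is illustrative Python, not the project's trainer: the loss curve and threshold below are made up, and the diff's exact bookkeeping around last_loss_gen_all differs slightly.

loss_per_epoch = [0.90, 0.80, 0.82, 0.81, 0.80, 0.83, 0.84]  # invented values
lowest = float("inf")
epochs_since_lowest = 0
threshold = 3  # stands in for hps.overtraining_threshold

for epoch, loss in enumerate(loss_per_epoch, start=1):
    if loss < lowest:
        lowest, epochs_since_lowest = loss, 0   # new best, reset the counter
    else:
        epochs_since_lowest += 1                # no improvement this epoch
    if epochs_since_lowest >= threshold:
        print(f"Stopping at epoch {epoch}: no improvement for {threshold} epochs (best={lowest})")
        break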
rvc/train/utils.py CHANGED
@@ -7,49 +7,6 @@ import numpy as np
7
  from scipy.io.wavfile import read
8
 
9
 
10
- def load_checkpoint_d(checkpoint_path, combd, sbd, optimizer=None, load_opt=1):
11
- assert os.path.isfile(checkpoint_path)
12
- checkpoint_dict = torch.load(checkpoint_path, map_location="cpu")
13
-
14
- def go(model, bkey):
15
- saved_state_dict = checkpoint_dict[bkey]
16
- if hasattr(model, "module"):
17
- state_dict = model.module.state_dict()
18
- else:
19
- state_dict = model.state_dict()
20
- new_state_dict = {}
21
- for k, v in state_dict.items():
22
- try:
23
- new_state_dict[k] = saved_state_dict[k]
24
- if saved_state_dict[k].shape != state_dict[k].shape:
25
- print(
26
- "shape-%s-mismatch. need: %s, get: %s",
27
- k,
28
- state_dict[k].shape,
29
- saved_state_dict[k].shape,
30
- )
31
- raise KeyError
32
- except:
33
- print("%s is not in the checkpoint", k)
34
- new_state_dict[k] = v
35
- if hasattr(model, "module"):
36
- model.module.load_state_dict(new_state_dict, strict=False)
37
- else:
38
- model.load_state_dict(new_state_dict, strict=False)
39
- return model
40
-
41
- go(combd, "combd")
42
- model = go(sbd, "sbd")
43
-
44
- iteration = checkpoint_dict["iteration"]
45
- learning_rate = checkpoint_dict["learning_rate"]
46
- if optimizer is not None and load_opt == 1:
47
- optimizer.load_state_dict(checkpoint_dict["optimizer"])
48
-
49
- print("Loaded checkpoint '{}' (epoch {})".format(checkpoint_path, iteration))
50
- return model, optimizer, learning_rate, iteration
51
-
52
-
53
  def load_checkpoint(checkpoint_path, model, optimizer=None, load_opt=1):
54
  assert os.path.isfile(checkpoint_path)
55
  checkpoint_dict = torch.load(checkpoint_path, map_location="cpu")
@@ -218,6 +175,22 @@ def get_hparams():
218
  required=True,
219
  help="if caching the dataset in GPU memory, 1 or 0",
220
  )
 
 
 
221
  args = parser.parse_args()
222
  name = args.experiment_dir
223
  experiment_dir = os.path.join("./logs", args.experiment_dir)
@@ -240,6 +213,8 @@ def get_hparams():
240
  hparams.save_every_weights = args.save_every_weights
241
  hparams.if_cache_data_in_gpu = args.if_cache_data_in_gpu
242
  hparams.data.training_files = f"{experiment_dir}/filelist.txt"
 
 
243
  return hparams
244
 
245
 
 
7
  from scipy.io.wavfile import read
8
 
9
 
 
 
 
 
10
  def load_checkpoint(checkpoint_path, model, optimizer=None, load_opt=1):
11
  assert os.path.isfile(checkpoint_path)
12
  checkpoint_dict = torch.load(checkpoint_path, map_location="cpu")
 
175
  required=True,
176
  help="if caching the dataset in GPU memory, 1 or 0",
177
  )
178
+
179
+ parser.add_argument(
180
+ "-od",
181
+ "--overtraining_detector",
182
+ type=int,
183
+ required=True,
184
+ help="Detect overtraining or not, 1 or 0",
185
+ )
186
+ parser.add_argument(
187
+ "-ot",
188
+ "--overtraining_threshold",
189
+ type=int,
190
+ default=50,
191
+ help="overtraining_threshold",
192
+ )
193
+
194
  args = parser.parse_args()
195
  name = args.experiment_dir
196
  experiment_dir = os.path.join("./logs", args.experiment_dir)
 
213
  hparams.save_every_weights = args.save_every_weights
214
  hparams.if_cache_data_in_gpu = args.if_cache_data_in_gpu
215
  hparams.data.training_files = f"{experiment_dir}/filelist.txt"
216
+ hparams.overtraining_detector = args.overtraining_detector
217
+ hparams.overtraining_threshold = args.overtraining_threshold
218
  return hparams
219
 
220
 
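A hedged usage example of the two new arguments, assuming the training script is launched directly; every other required flag of this parser is elided with ... because it is defined outside this hunk:

python rvc/train/train.py ... --overtraining_detector 1 --overtraining_threshold 50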
tabs/download/download.py CHANGED
@@ -1,6 +1,8 @@
1
  import os, sys, shutil
2
  import tempfile
3
  import gradio as gr
 
 
4
  from core import run_download_script
5
 
6
  from assets.i18n.i18n import I18nAuto
@@ -41,12 +43,30 @@ def save_drop_model(dropbox):
41
  os.makedirs(model_path)
42
  if os.path.exists(os.path.join(model_path, file_name)):
43
  os.remove(os.path.join(model_path, file_name))
44
- os.rename(dropbox, os.path.join(model_path, file_name))
45
  print(f"{file_name} saved in {model_path}")
46
  gr.Info(f"{file_name} saved in {model_path}")
47
  return None
48
 
49
 
 
 
 
 
50
  def download_tab():
51
  with gr.Column():
52
  gr.Markdown(value=i18n("## Download Model"))
@@ -57,6 +77,7 @@ def download_tab():
57
  )
58
  model_download_output_info = gr.Textbox(
59
  label=i18n("Output Information"),
 
60
  value="",
61
  max_lines=8,
62
  interactive=False,
@@ -82,3 +103,18 @@ def download_tab():
82
  inputs=[dropbox],
83
  outputs=[dropbox],
84
  )
 
 
 
 
1
  import os, sys, shutil
2
  import tempfile
3
  import gradio as gr
4
+ import pandas as pd
5
+ import requests
6
  from core import run_download_script
7
 
8
  from assets.i18n.i18n import I18nAuto
 
43
  os.makedirs(model_path)
44
  if os.path.exists(os.path.join(model_path, file_name)):
45
  os.remove(os.path.join(model_path, file_name))
46
+ shutil.move(dropbox, os.path.join(model_path, file_name))
47
  print(f"{file_name} saved in {model_path}")
48
  gr.Info(f"{file_name} saved in {model_path}")
49
  return None
50
 
51
 
52
+ def search_models(name):
53
+ url = f"https://cjtfqzjfdimgpvpwhzlv.supabase.co/rest/v1/models?name=ilike.%25{name}%25&order=created_at.desc&limit=15"
54
+ headers = {
55
+ "apikey": "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZSIsInJlZiI6ImNqdGZxempmZGltZ3B2cHdoemx2Iiwicm9sZSI6ImFub24iLCJpYXQiOjE2OTUxNjczODgsImV4cCI6MjAxMDc0MzM4OH0.7z5WMIbjR99c2Ooc0ma7B_FyGq10G8X-alkCYTkKR10"
56
+ }
57
+ response = requests.get(url, headers=headers)
58
+ data = response.json()
59
+ if len(data) == 0:
60
+ gr.Info(i18n("We couldn't find models by that name."))
61
+ return None
62
+ else:
63
+ df = pd.DataFrame(data)[["name", "link", "epochs", "type"]]
64
+ df["link"] = df["link"].apply(
65
+ lambda x: f'<a href="{x}" target="_blank">{x}</a>'
66
+ )
67
+ return df
68
+
69
+
70
  def download_tab():
71
  with gr.Column():
72
  gr.Markdown(value=i18n("## Download Model"))
 
77
  )
78
  model_download_output_info = gr.Textbox(
79
  label=i18n("Output Information"),
80
+ info=i18n("The output information will be displayed here."),
81
  value="",
82
  max_lines=8,
83
  interactive=False,
 
103
  inputs=[dropbox],
104
  outputs=[dropbox],
105
  )
106
+ gr.Markdown(value=i18n("## Search Model"))
107
+ search_name = gr.Textbox(
108
+ label=i18n("Model Name"),
109
+ placeholder=i18n("Introduce the model name to search."),
110
+ interactive=True,
111
+ )
112
+ search_table = gr.Dataframe(datatype="markdown")
113
+ search = gr.Button(i18n("Search"))
114
+ search.click(
115
+ search_models,
116
+ [search_name],
117
+ search_table,
118
+ )
119
+
120
+ search_name.submit(search_models, [search_name], search_table)
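A rough sketch of the table that search_models hands to the Dataframe component. The column names come from the hunk above; the row values here are invented placeholders:

import pandas as pd

rows = [{"name": "ExampleVoice", "link": "https://example.org/model.zip", "epochs": 300, "type": "RVC"}]
df = pd.DataFrame(rows)[["name", "link", "epochs", "type"]]
# Wrapping the link in an <a> tag lets gr.Dataframe(datatype="markdown") render it as a clickable link.
df["link"] = df["link"].apply(lambda x: f'<a href="{x}" target="_blank">{x}</a>')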
tabs/extra/analyzer/analyzer.py CHANGED
@@ -1,85 +1,32 @@
 
1
  import gradio as gr
2
- import matplotlib.pyplot as plt
3
- import soundfile as sf
4
- import numpy as np
5
- import os
6
 
 
 
 
 
7
  from assets.i18n.i18n import I18nAuto
8
 
9
  i18n = I18nAuto()
10
 
11
 
12
- def generate_spectrogram(audio_data, sample_rate, file_name):
13
- plt.clf()
14
-
15
- plt.specgram(
16
- audio_data,
17
- Fs=sample_rate / 1,
18
- NFFT=4096,
19
- sides="onesided",
20
- cmap="Reds_r",
21
- scale_by_freq=True,
22
- scale="dB",
23
- mode="magnitude",
24
- window=np.hanning(4096),
25
- )
26
-
27
- plt.title(file_name)
28
- plt.savefig("spectrogram.png")
29
-
30
-
31
- def get_audio_info(audio_file):
32
- audio_data, sample_rate = sf.read(audio_file)
33
-
34
- if len(audio_data.shape) > 1:
35
- audio_data = np.mean(audio_data, axis=1)
36
-
37
- generate_spectrogram(audio_data, sample_rate, os.path.basename(audio_file))
38
-
39
- audio_info = sf.info(audio_file)
40
- bit_depth = {"PCM_16": 16, "FLOAT": 32}.get(audio_info.subtype, 0)
41
-
42
- minutes, seconds = divmod(audio_info.duration, 60)
43
- seconds, milliseconds = divmod(seconds, 1)
44
- milliseconds *= 1000
45
-
46
- speed_in_kbps = audio_info.samplerate * bit_depth / 1000
47
-
48
- info_table = f"""
49
- - **File Name:** {os.path.basename(audio_file)}
50
- - **Duration:** {int(minutes)} minutes, {int(seconds)} seconds, {int(milliseconds)} milliseconds
51
- - **Bitrate:** {speed_in_kbps} kbp/s
52
- - **Audio Channels:** {audio_info.channels}
53
- - **Sampling rate:** {audio_info.samplerate} Hz
54
- - **Bit per second:** {audio_info.samplerate * audio_info.channels * bit_depth} bit/s
55
- """
56
-
57
- return info_table, "spectrogram.png"
58
-
59
-
60
  def analyzer():
61
  with gr.Column():
62
- gr.Markdown(
63
- "Tool inspired in the original [Ilaria-Audio-Analyzer](https://github.com/TheStingerX/Ilaria-Audio-Analyzer) code."
64
- )
65
  audio_input = gr.Audio(type="filepath")
 
 
 
 
 
 
 
66
  get_info_button = gr.Button(
67
  value=i18n("Get information about the audio"), variant="primary"
68
  )
69
- with gr.Column():
70
- with gr.Row():
71
- with gr.Column():
72
- gr.Markdown(
73
- value=i18n("Information about the audio file"),
74
- visible=True,
75
- )
76
- output_markdown = gr.Markdown(
77
- value=i18n("Waiting for information..."), visible=True
78
- )
79
- image_output = gr.Image(type="filepath", interactive=False)
80
 
81
  get_info_button.click(
82
- fn=get_audio_info,
83
  inputs=[audio_input],
84
- outputs=[output_markdown, image_output],
85
  )
 
1
+ import os, sys
2
  import gradio as gr
 
 
 
 
3
 
4
+ now_dir = os.getcwd()
5
+ sys.path.append(now_dir)
6
+
7
+ from core import run_audio_analyzer_script
8
  from assets.i18n.i18n import I18nAuto
9
 
10
  i18n = I18nAuto()
11
 
12
 
 
 
 
 
 
13
  def analyzer():
14
  with gr.Column():
 
 
 
15
  audio_input = gr.Audio(type="filepath")
16
+ output_info = gr.Textbox(
17
+ label=i18n("Output Information"),
18
+ info=i18n("The output information will be displayed here."),
19
+ value="",
20
+ max_lines=8,
21
+ interactive=False,
22
+ )
23
  get_info_button = gr.Button(
24
  value=i18n("Get information about the audio"), variant="primary"
25
  )
26
+ image_output = gr.Image(type="filepath", interactive=False)
 
 
 
27
 
28
  get_info_button.click(
29
+ fn=run_audio_analyzer_script,
30
  inputs=[audio_input],
31
+ outputs=[output_info, image_output],
32
  )
tabs/extra/extra.py CHANGED
@@ -15,8 +15,8 @@ def extra_tab():
          )
      )
 
-     # with gr.TabItem(i18n("Processing")):
-     #     processing.processing()
+     with gr.TabItem(i18n("Processing")):
+         processing.processing()
 
      with gr.TabItem(i18n("Audio Analyzer")):
          analyzer.analyzer()
tabs/extra/model_information.py CHANGED
@@ -9,12 +9,14 @@ i18n = I18nAuto()
  def model_information_tab():
      with gr.Column():
          model_name = gr.Textbox(
-             label=i18n("Model Path"),
-             placeholder=i18n("Introduce the model .pth path"),
+             label=i18n("Path to Model"),
+             info=i18n("Introduce the model pth path"),
+             placeholder=i18n("Introduce the model pth path"),
              interactive=True,
          )
          model_information_output_info = gr.Textbox(
              label=i18n("Output Information"),
+             info=i18n("The output information will be displayed here."),
              value="",
              max_lines=8,
              interactive=False,
tabs/extra/processing/processing.py CHANGED
@@ -1,18 +1,9 @@
1
- import sys
2
-
3
- sys.path.append("..")
4
- import os
5
 
6
  now_dir = os.getcwd()
7
- from rvc.train.process_ckpt import (
8
- extract_small_model,
9
- )
10
-
11
- from rvc.lib.process.model_fusion import model_fusion
12
- from rvc.lib.process.model_information import (
13
- model_information,
14
- )
15
 
 
16
  from assets.i18n.i18n import I18nAuto
17
 
18
  i18n = I18nAuto()
@@ -21,122 +12,27 @@ import gradio as gr
21
 
22
 
23
  def processing():
24
- with gr.Accordion(label=i18n("Model fusion (On progress)"), open=False):
25
- with gr.Column():
26
- model_fusion_name = gr.Textbox(
27
- label=i18n("Model Name"),
28
- value="",
29
- max_lines=1,
30
- interactive=True,
31
- placeholder=i18n("Enter model name"),
32
- )
33
- model_fusion_a = gr.Textbox(
34
- label=i18n("Path to Model A"),
35
- value="",
36
- interactive=True,
37
- placeholder=i18n("Path to model"),
38
- )
39
- model_fusion_b = gr.Textbox(
40
- label=i18n("Path to Model B"),
41
- value="",
42
- interactive=True,
43
- placeholder=i18n("Path to model"),
44
- )
45
- model_fusion_output_info = gr.Textbox(
46
- label=i18n("Output Information"),
47
- value="",
48
- )
49
-
50
- model_fusion_button = gr.Button(
51
- i18n("Fusion"), variant="primary", interactive=False
52
- )
53
-
54
- model_fusion_button.click(
55
- model_fusion,
56
- [
57
- model_fusion_name,
58
- model_fusion_a,
59
- model_fusion_b,
60
- ],
61
- model_fusion_output_info,
62
- api_name="model_fusion",
63
- )
64
-
65
  with gr.Accordion(label=i18n("View model information")):
66
  with gr.Row():
67
  with gr.Column():
68
  model_view_model_path = gr.Textbox(
69
  label=i18n("Path to Model"),
 
70
  value="",
71
  interactive=True,
72
- placeholder=i18n("Path to model"),
73
  )
74
 
75
  model_view_output_info = gr.Textbox(
76
- label=i18n("Output Information"), value="", max_lines=8
 
 
 
77
  )
78
  model_view_button = gr.Button(i18n("View"), variant="primary")
79
  model_view_button.click(
80
- model_information,
81
  [model_view_model_path],
82
  model_view_output_info,
83
  api_name="model_info",
84
  )
85
-
86
- with gr.Accordion(label=i18n("Model extraction")):
87
- with gr.Row():
88
- with gr.Column():
89
- model_extract_name = gr.Textbox(
90
- label=i18n("Model Name"),
91
- value="",
92
- interactive=True,
93
- placeholder=i18n("Enter model name"),
94
- )
95
- model_extract_path = gr.Textbox(
96
- label=i18n("Path to Model"),
97
- placeholder=i18n("Path to model"),
98
- interactive=True,
99
- )
100
- model_extract_info = gr.Textbox(
101
- label=i18n("Model information to be placed"),
102
- value="",
103
- max_lines=8,
104
- interactive=True,
105
- placeholder=i18n("Model information to be placed"),
106
- )
107
- with gr.Column():
108
- model_extract_pitch_guidance = gr.Checkbox(
109
- label=i18n("Pitch Guidance"),
110
- value=True,
111
- interactive=True,
112
- )
113
- model_extract_rvc_version = gr.Radio(
114
- label=i18n("RVC Version"),
115
- choices=["v1", "v2"],
116
- value="v2",
117
- interactive=True,
118
- )
119
- model_extract_sampling_rate = gr.Radio(
120
- label=i18n("Sampling Rate"),
121
- choices=["32000", "40000", "48000"],
122
- value="40000",
123
- interactive=True,
124
- )
125
- model_extract_output_info = gr.Textbox(
126
- label=i18n("Output Information"), value="", max_lines=8
127
- )
128
-
129
- model_extract_button = gr.Button(i18n("Extract"), variant="primary")
130
- model_extract_button.click(
131
- extract_small_model,
132
- [
133
- model_extract_path,
134
- model_extract_name,
135
- model_extract_sampling_rate,
136
- model_extract_pitch_guidance,
137
- model_extract_info,
138
- model_extract_rvc_version,
139
- ],
140
- model_extract_output_info,
141
- api_name="model_extract",
142
- )
 
1
+ import os, sys
 
 
 
2
 
3
  now_dir = os.getcwd()
4
+ sys.path.append(now_dir)
 
 
 
 
 
 
 
5
 
6
+ from core import run_model_information_script
7
  from assets.i18n.i18n import I18nAuto
8
 
9
  i18n = I18nAuto()
 
12
 
13
 
14
  def processing():
 
 
 
 
 
15
  with gr.Accordion(label=i18n("View model information")):
16
  with gr.Row():
17
  with gr.Column():
18
  model_view_model_path = gr.Textbox(
19
  label=i18n("Path to Model"),
20
+ info=i18n("Introduce the model pth path"),
21
  value="",
22
  interactive=True,
23
+ placeholder=i18n("Enter path to model"),
24
  )
25
 
26
  model_view_output_info = gr.Textbox(
27
+ label=i18n("Output Information"),
28
+ info=i18n("The output information will be displayed here."),
29
+ value="",
30
+ max_lines=8,
31
  )
32
  model_view_button = gr.Button(i18n("View"), variant="primary")
33
  model_view_button.click(
34
+ run_model_information_script,
35
  [model_view_model_path],
36
  model_view_output_info,
37
  api_name="model_info",
38
  )
 
 
 
 
tabs/inference/inference.py CHANGED
@@ -122,55 +122,6 @@ def get_indexes():
122
  return indexes_list if indexes_list else ""
123
 
124
 
125
- def match_index(model_file: str) -> tuple:
126
- model_files_trip = re.sub(r"\.pth|\.onnx$", "", model_file)
127
- model_file_name = os.path.split(model_files_trip)[
128
- -1
129
- ] # Extract only the name, not the directory
130
-
131
- # Check if the sid0strip has the specific ending format _eXXX_sXXX
132
- if re.match(r".+_e\d+_s\d+$", model_file_name):
133
- base_model_name = model_file_name.rsplit("_", 2)[0]
134
- else:
135
- base_model_name = model_file_name
136
-
137
- sid_directory = os.path.join(model_root_relative, base_model_name)
138
- directories_to_search = [sid_directory] if os.path.exists(sid_directory) else []
139
- directories_to_search.append(model_root_relative)
140
- matching_index_files = []
141
-
142
- for directory in directories_to_search:
143
- for filename in os.listdir(directory):
144
- if filename.endswith(".index") and "trained" not in filename:
145
- # Condition to match the name
146
- name_match = any(
147
- name.lower() in filename.lower()
148
- for name in [model_file_name, base_model_name]
149
- )
150
-
151
- # If in the specific directory, it's automatically a match
152
- folder_match = directory == sid_directory
153
-
154
- if name_match or folder_match:
155
- index_path = os.path.join(directory, filename)
156
- updated_indexes_list = get_indexes()
157
- if index_path in updated_indexes_list:
158
- matching_index_files.append(
159
- (
160
- index_path,
161
- os.path.getsize(index_path),
162
- " " not in filename,
163
- )
164
- )
165
- if matching_index_files:
166
- # Sort by favoring files without spaces and by size (largest size first)
167
- matching_index_files.sort(key=lambda x: (-x[2], -x[1]))
168
- best_match_index_path = matching_index_files[0][0]
169
- return best_match_index_path
170
-
171
- return ""
172
-
173
-
174
  def save_to_wav(record_button):
175
  if record_button is None:
176
  pass
@@ -196,11 +147,21 @@ def save_to_wav2(upload_audio):
196
 
197
 
198
  def delete_outputs():
 
199
  for root, _, files in os.walk(audio_root_relative, topdown=False):
200
  for name in files:
201
  if name.endswith(tuple(sup_audioext)) and name.__contains__("_output"):
202
  os.remove(os.path.join(root, name))
203
- gr.Info(f"Outputs cleared!")
 
 
204
 
205
 
206
  # Inference tab
@@ -210,6 +171,7 @@ def inference_tab():
210
  with gr.Row():
211
  model_file = gr.Dropdown(
212
  label=i18n("Voice Model"),
 
213
  choices=sorted(names, key=lambda path: os.path.getsize(path)),
214
  interactive=True,
215
  value=default_weight,
@@ -218,6 +180,7 @@ def inference_tab():
218
 
219
  index_file = gr.Dropdown(
220
  label=i18n("Index File"),
 
221
  choices=get_indexes(),
222
  value=match_index(default_weight) if default_weight else "",
223
  interactive=True,
@@ -228,13 +191,16 @@ def inference_tab():
228
  unload_button = gr.Button(i18n("Unload Voice"))
229
 
230
  unload_button.click(
231
- fn=lambda: ({"value": "", "__type__": "update"}),
 
 
 
232
  inputs=[],
233
- outputs=[model_file],
234
  )
235
 
236
  model_file.select(
237
- fn=match_index,
238
  inputs=[model_file],
239
  outputs=[index_file],
240
  )
@@ -248,6 +214,7 @@ def inference_tab():
248
  with gr.Row():
249
  audio = gr.Dropdown(
250
  label=i18n("Select Audio"),
 
251
  choices=sorted(audio_paths),
252
  value=audio_paths[0] if audio_paths else "",
253
  interactive=True,
@@ -256,12 +223,15 @@ def inference_tab():
256
 
257
  with gr.Accordion(i18n("Advanced Settings"), open=False):
258
  with gr.Column():
259
- clear_outputs = gr.Button(
260
  i18n("Clear Outputs (Deletes all audios in assets/audios)")
261
  )
262
  output_path = gr.Textbox(
263
  label=i18n("Output Path"),
264
  placeholder=i18n("Enter output path"),
 
 
 
265
  value=(
266
  output_path_fn(audio_paths[0])
267
  if audio_paths
@@ -269,25 +239,68 @@ def inference_tab():
269
  ),
270
  interactive=True,
271
  )
 
 
 
 
 
 
 
272
  split_audio = gr.Checkbox(
273
  label=i18n("Split Audio"),
 
 
 
 
 
274
  visible=True,
275
  value=False,
276
  interactive=True,
277
  )
 
 
 
 
278
  pitch = gr.Slider(
279
  minimum=-24,
280
  maximum=24,
281
  step=1,
282
  label=i18n("Pitch"),
 
 
 
283
  value=0,
284
  interactive=True,
285
  )
286
  filter_radius = gr.Slider(
287
  minimum=0,
288
  maximum=7,
289
- label=i18n(
290
- "If >=3: apply median filtering to the harvested pitch results. The value represents the filter radius and can reduce breathiness"
 
291
  ),
292
  value=3,
293
  step=1,
@@ -297,20 +310,50 @@ def inference_tab():
297
  minimum=0,
298
  maximum=1,
299
  label=i18n("Search Feature Ratio"),
 
 
 
300
  value=0.75,
301
  interactive=True,
302
  )
 
 
 
 
 
 
303
  hop_length = gr.Slider(
304
  minimum=1,
305
  maximum=512,
306
  step=1,
307
  label=i18n("Hop Length"),
 
 
 
 
308
  value=128,
309
  interactive=True,
310
  )
311
  with gr.Column():
312
  f0method = gr.Radio(
313
  label=i18n("Pitch extraction algorithm"),
 
 
 
314
  choices=[
315
  "pm",
316
  "harvest",
@@ -318,6 +361,8 @@ def inference_tab():
318
  "crepe",
319
  "crepe-tiny",
320
  "rmvpe",
 
 
321
  ],
322
  value="rmvpe",
323
  interactive=True,
@@ -326,7 +371,10 @@ def inference_tab():
326
  convert_button1 = gr.Button(i18n("Convert"))
327
 
328
  with gr.Row(): # Defines output info + output audio download after conversion
329
- vc_output1 = gr.Textbox(label=i18n("Output Information"))
 
 
 
330
  vc_output2 = gr.Audio(label=i18n("Export Audio"))
331
 
332
  # Batch inference tab
@@ -335,40 +383,87 @@ def inference_tab():
335
  with gr.Column():
336
  input_folder_batch = gr.Textbox(
337
  label=i18n("Input Folder"),
 
338
  placeholder=i18n("Enter input path"),
339
  value=os.path.join(now_dir, "assets", "audios"),
340
  interactive=True,
341
  )
342
  output_folder_batch = gr.Textbox(
343
  label=i18n("Output Folder"),
 
 
 
344
  placeholder=i18n("Enter output path"),
345
  value=os.path.join(now_dir, "assets", "audios"),
346
  interactive=True,
347
  )
348
  with gr.Accordion(i18n("Advanced Settings"), open=False):
349
  with gr.Column():
350
- clear_outputs = gr.Button(
351
  i18n("Clear Outputs (Deletes all audios in assets/audios)")
352
  )
 
 
 
 
 
 
 
353
  split_audio_batch = gr.Checkbox(
354
  label=i18n("Split Audio"),
 
 
 
 
 
 
 
355
  visible=True,
356
  value=False,
357
  interactive=True,
358
  )
 
 
 
 
 
 
 
 
 
 
 
359
  pitch_batch = gr.Slider(
360
  minimum=-24,
361
  maximum=24,
362
  step=1,
363
  label=i18n("Pitch"),
 
 
 
364
  value=0,
365
  interactive=True,
366
  )
367
  filter_radius_batch = gr.Slider(
368
  minimum=0,
369
  maximum=7,
370
- label=i18n(
371
- "If >=3: apply median filtering to the harvested pitch results. The value represents the filter radius and can reduce breathiness"
 
372
  ),
373
  value=3,
374
  step=1,
@@ -378,20 +473,50 @@ def inference_tab():
378
  minimum=0,
379
  maximum=1,
380
  label=i18n("Search Feature Ratio"),
 
 
 
381
  value=0.75,
382
  interactive=True,
383
  )
 
 
 
 
 
 
 
384
  hop_length_batch = gr.Slider(
385
  minimum=1,
386
  maximum=512,
387
  step=1,
388
  label=i18n("Hop Length"),
 
 
 
 
389
  value=128,
390
  interactive=True,
391
  )
392
  with gr.Column():
393
  f0method_batch = gr.Radio(
394
  label=i18n("Pitch extraction algorithm"),
 
 
 
395
  choices=[
396
  "pm",
397
  "harvest",
@@ -399,6 +524,8 @@ def inference_tab():
399
  "crepe",
400
  "crepe-tiny",
401
  "rmvpe",
 
 
402
  ],
403
  value="rmvpe",
404
  interactive=True,
@@ -407,11 +534,39 @@ def inference_tab():
407
  convert_button2 = gr.Button(i18n("Convert"))
408
 
409
  with gr.Row(): # Defines output info + output audio download after conversion
410
- vc_output3 = gr.Textbox(label=i18n("Output Information"))
 
 
 
411
 
412
  def toggle_visible(checkbox):
413
  return {"visible": checkbox, "__type__": "update"}
414
 
 
 
 
 
 
 
 
415
  refresh_button.click(
416
  fn=change_choices,
417
  inputs=[],
@@ -432,7 +587,12 @@ def inference_tab():
432
  inputs=[upload_audio],
433
  outputs=[audio, output_path],
434
  )
435
- clear_outputs.click(
 
 
 
 
 
436
  fn=delete_outputs,
437
  inputs=[],
438
  outputs=[],
@@ -443,6 +603,8 @@ def inference_tab():
443
  pitch,
444
  filter_radius,
445
  index_rate,
 
 
446
  hop_length,
447
  f0method,
448
  audio,
@@ -450,6 +612,10 @@ def inference_tab():
450
  model_file,
451
  index_file,
452
  split_audio,
 
 
 
 
453
  ],
454
  outputs=[vc_output1, vc_output2],
455
  )
@@ -459,6 +625,8 @@ def inference_tab():
459
  pitch_batch,
460
  filter_radius_batch,
461
  index_rate_batch,
 
 
462
  hop_length_batch,
463
  f0method_batch,
464
  input_folder_batch,
@@ -466,6 +634,10 @@ def inference_tab():
466
  model_file,
467
  index_file,
468
  split_audio_batch,
 
 
 
 
469
  ],
470
  outputs=[vc_output3],
471
  )
 
122
  return indexes_list if indexes_list else ""
123
 
124
 
 
 
 
 
 
 
 
125
  def save_to_wav(record_button):
126
  if record_button is None:
127
  pass
 
147
 
148
 
149
  def delete_outputs():
150
+ gr.Info(f"Outputs cleared!")
151
  for root, _, files in os.walk(audio_root_relative, topdown=False):
152
  for name in files:
153
  if name.endswith(tuple(sup_audioext)) and name.__contains__("_output"):
154
  os.remove(os.path.join(root, name))
155
+
156
+
157
+ def match_index(model_file_value):
158
+ if model_file_value:
159
+ model_folder = os.path.dirname(model_file_value)
160
+ index_files = get_indexes()
161
+ for index_file in index_files:
162
+ if os.path.dirname(index_file) == model_folder:
163
+ return index_file
164
+ return ""
165
 
166
 
167
  # Inference tab
 
171
  with gr.Row():
172
  model_file = gr.Dropdown(
173
  label=i18n("Voice Model"),
174
+ info=i18n("Select the voice model to use for the conversion."),
175
  choices=sorted(names, key=lambda path: os.path.getsize(path)),
176
  interactive=True,
177
  value=default_weight,
 
180
 
181
  index_file = gr.Dropdown(
182
  label=i18n("Index File"),
183
+ info=i18n("Select the index file to use for the conversion."),
184
  choices=get_indexes(),
185
  value=match_index(default_weight) if default_weight else "",
186
  interactive=True,
 
191
  unload_button = gr.Button(i18n("Unload Voice"))
192
 
193
  unload_button.click(
194
+ fn=lambda: (
195
+ {"value": "", "__type__": "update"},
196
+ {"value": "", "__type__": "update"},
197
+ ),
198
  inputs=[],
199
+ outputs=[model_file, index_file],
200
  )
201
 
202
  model_file.select(
203
+ fn=lambda model_file_value: match_index(model_file_value),
204
  inputs=[model_file],
205
  outputs=[index_file],
206
  )
 
214
  with gr.Row():
215
  audio = gr.Dropdown(
216
  label=i18n("Select Audio"),
217
+ info=i18n("Select the audio to convert."),
218
  choices=sorted(audio_paths),
219
  value=audio_paths[0] if audio_paths else "",
220
  interactive=True,
 
223
 
224
  with gr.Accordion(i18n("Advanced Settings"), open=False):
225
  with gr.Column():
226
+ clear_outputs_infer = gr.Button(
227
  i18n("Clear Outputs (Deletes all audios in assets/audios)")
228
  )
229
  output_path = gr.Textbox(
230
  label=i18n("Output Path"),
231
  placeholder=i18n("Enter output path"),
232
+ info=i18n(
233
+ "The path where the output audio will be saved, by default in assets/audios/output.wav"
234
+ ),
235
  value=(
236
  output_path_fn(audio_paths[0])
237
  if audio_paths
 
239
  ),
240
  interactive=True,
241
  )
242
+ export_format = gr.Radio(
243
+ label=i18n("Export Format"),
244
+ info=i18n("Select the format to export the audio."),
245
+ choices=["WAV", "MP3", "FLAC", "OGG", "M4A"],
246
+ value="WAV",
247
+ interactive=True,
248
+ )
249
  split_audio = gr.Checkbox(
250
  label=i18n("Split Audio"),
251
+ info=i18n(
252
+ "Split the audio into chunks for inference to obtain better results in some cases."
253
+ ),
254
+ visible=True,
255
+ value=False,
256
+ interactive=True,
257
+ )
258
+ autotune = gr.Checkbox(
259
+ label=i18n("Autotune"),
260
+ info=i18n(
261
+ "Apply a soft autotune to your inferences, recommended for singing conversions."
262
+ ),
263
+ visible=True,
264
+ value=False,
265
+ interactive=True,
266
+ )
267
+ clean_audio = gr.Checkbox(
268
+ label=i18n("Clean Audio"),
269
+ info=i18n(
270
+ "Clean your audio output using noise detection algorithms, recommended for speaking audios."
271
+ ),
272
  visible=True,
273
  value=False,
274
  interactive=True,
275
  )
276
+ clean_strength = gr.Slider(
277
+ minimum=0,
278
+ maximum=1,
279
+ label=i18n("Clean Strength"),
280
+ info=i18n(
281
+ "Set the clean-up level to the audio you want, the more you increase it the more it will clean up, but it is possible that the audio will be more compressed."
282
+ ),
283
+ visible=False,
284
+ value=0.5,
285
+ interactive=True,
286
+ )
287
  pitch = gr.Slider(
288
  minimum=-24,
289
  maximum=24,
290
  step=1,
291
  label=i18n("Pitch"),
292
+ info=i18n(
293
+ "Set the pitch of the audio, the higher the value, the higher the pitch."
294
+ ),
295
  value=0,
296
  interactive=True,
297
  )
298
  filter_radius = gr.Slider(
299
  minimum=0,
300
  maximum=7,
301
+ label=i18n("Filter Radius"),
302
+ info=i18n(
303
+ "If the number is greater than or equal to three, employing median filtering on the collected tone results has the potential to decrease respiration."
304
  ),
305
  value=3,
306
  step=1,
 
310
  minimum=0,
311
  maximum=1,
312
  label=i18n("Search Feature Ratio"),
313
+ info=i18n(
314
+ "Influence exerted by the index file; a higher value corresponds to greater influence. However, opting for lower values can help mitigate artifacts present in the audio."
315
+ ),
316
  value=0.75,
317
  interactive=True,
318
  )
319
+ rms_mix_rate = gr.Slider(
320
+ minimum=0,
321
+ maximum=1,
322
+ label=i18n("Volume Envelope"),
323
+ info=i18n(
324
+ "Substitute or blend with the volume envelope of the output. The closer the ratio is to 1, the more the output envelope is employed."
325
+ ),
326
+ value=1,
327
+ interactive=True,
328
+ )
329
+ protect = gr.Slider(
330
+ minimum=0,
331
+ maximum=0.5,
332
+ label=i18n("Protect Voiceless Consonants"),
333
+ info=i18n(
334
+ "Safeguard distinct consonants and breathing sounds to prevent electro-acoustic tearing and other artifacts. Pulling the parameter to its maximum value of 0.5 offers comprehensive protection. However, reducing this value might decrease the extent of protection while potentially mitigating the indexing effect."
335
+ ),
336
+ value=0.5,
337
+ interactive=True,
338
+ )
339
  hop_length = gr.Slider(
340
  minimum=1,
341
  maximum=512,
342
  step=1,
343
  label=i18n("Hop Length"),
344
+ info=i18n(
345
+ "Denotes the duration it takes for the system to transition to a significant pitch change. Smaller hop lengths require more time for inference but tend to yield higher pitch accuracy."
346
+ ),
347
+ visible=False,
348
  value=128,
349
  interactive=True,
350
  )
351
  with gr.Column():
352
  f0method = gr.Radio(
353
  label=i18n("Pitch extraction algorithm"),
354
+ info=i18n(
355
+ "Pitch extraction algorithm to use for the audio conversion. The default algorithm is rmvpe, which is recommended for most cases."
356
+ ),
357
  choices=[
358
  "pm",
359
  "harvest",
 
361
  "crepe",
362
  "crepe-tiny",
363
  "rmvpe",
364
+ "fcpe",
365
+ "hybrid[rmvpe+fcpe]",
366
  ],
367
  value="rmvpe",
368
  interactive=True,
 
371
  convert_button1 = gr.Button(i18n("Convert"))
372
 
373
  with gr.Row(): # Defines output info + output audio download after conversion
374
+ vc_output1 = gr.Textbox(
375
+ label=i18n("Output Information"),
376
+ info=i18n("The output information will be displayed here."),
377
+ )
378
  vc_output2 = gr.Audio(label=i18n("Export Audio"))
379
 
380
  # Batch inference tab
 
383
  with gr.Column():
384
  input_folder_batch = gr.Textbox(
385
  label=i18n("Input Folder"),
386
+ info=i18n("Select the folder containing the audios to convert."),
387
  placeholder=i18n("Enter input path"),
388
  value=os.path.join(now_dir, "assets", "audios"),
389
  interactive=True,
390
  )
391
  output_folder_batch = gr.Textbox(
392
  label=i18n("Output Folder"),
393
+ info=i18n(
394
+ "Select the folder where the output audios will be saved."
395
+ ),
396
  placeholder=i18n("Enter output path"),
397
  value=os.path.join(now_dir, "assets", "audios"),
398
  interactive=True,
399
  )
400
  with gr.Accordion(i18n("Advanced Settings"), open=False):
401
  with gr.Column():
402
+ clear_outputs_batch = gr.Button(
403
  i18n("Clear Outputs (Deletes all audios in assets/audios)")
404
  )
405
+ export_format_batch = gr.Radio(
406
+ label=i18n("Export Format"),
407
+ info=i18n("Select the format to export the audio."),
408
+ choices=["WAV", "MP3", "FLAC", "OGG", "M4A"],
409
+ value="WAV",
410
+ interactive=True,
411
+ )
412
  split_audio_batch = gr.Checkbox(
413
  label=i18n("Split Audio"),
414
+ info=i18n(
415
+ "Split the audio into chunks for inference to obtain better results in some cases."
416
+ ),
417
+ visible=True,
418
+ value=False,
419
+ interactive=True,
420
+ )
421
+ autotune_batch = gr.Checkbox(
422
+ label=i18n("Autotune"),
423
+ info=i18n(
424
+ "Apply a soft autotune to your inferences, recommended for singing conversions."
425
+ ),
426
+ visible=True,
427
+ value=False,
428
+ interactive=True,
429
+ )
430
+ clean_audio_batch = gr.Checkbox(
431
+ label=i18n("Clean Audio"),
432
+ info=i18n(
433
+ "Clean your audio output using noise detection algorithms, recommended for speaking audios."
434
+ ),
435
  visible=True,
436
  value=False,
437
  interactive=True,
438
  )
439
+ clean_strength_batch = gr.Slider(
440
+ minimum=0,
441
+ maximum=1,
442
+ label=i18n("Clean Strength"),
443
+ info=i18n(
444
+ "Set the clean-up level to the audio you want, the more you increase it the more it will clean up, but it is possible that the audio will be more compressed."
445
+ ),
446
+ visible=False,
447
+ value=0.5,
448
+ interactive=True,
449
+ )
450
  pitch_batch = gr.Slider(
451
  minimum=-24,
452
  maximum=24,
453
  step=1,
454
  label=i18n("Pitch"),
455
+ info=i18n(
456
+ "Set the pitch of the audio, the higher the value, the higher the pitch."
457
+ ),
458
  value=0,
459
  interactive=True,
460
  )
461
  filter_radius_batch = gr.Slider(
462
  minimum=0,
463
  maximum=7,
464
+ label=i18n("Filter Radius"),
465
+ info=i18n(
466
+ "If the number is greater than or equal to three, employing median filtering on the collected tone results has the potential to decrease respiration."
467
  ),
468
  value=3,
469
  step=1,
 
473
  minimum=0,
474
  maximum=1,
475
  label=i18n("Search Feature Ratio"),
476
+ info=i18n(
477
+ "Influence exerted by the index file; a higher value corresponds to greater influence. However, opting for lower values can help mitigate artifacts present in the audio."
478
+ ),
479
  value=0.75,
480
  interactive=True,
481
  )
482
+ rms_mix_rate_batch = gr.Slider(
483
+ minimum=0,
484
+ maximum=1,
485
+ label=i18n("Volume Envelope"),
486
+ info=i18n(
487
+ "Substitute or blend with the volume envelope of the output. The closer the ratio is to 1, the more the output envelope is employed."
488
+ ),
489
+ value=1,
490
+ interactive=True,
491
+ )
492
+ protect_batch = gr.Slider(
493
+ minimum=0,
494
+ maximum=0.5,
495
+ label=i18n("Protect Voiceless Consonants"),
496
+ info=i18n(
497
+ "Safeguard distinct consonants and breathing sounds to prevent electro-acoustic tearing and other artifacts. Pulling the parameter to its maximum value of 0.5 offers comprehensive protection. However, reducing this value might decrease the extent of protection while potentially mitigating the indexing effect."
498
+ ),
499
+ value=0.5,
500
+ interactive=True,
501
+ )
502
  hop_length_batch = gr.Slider(
503
  minimum=1,
504
  maximum=512,
505
  step=1,
506
  label=i18n("Hop Length"),
507
+ info=i18n(
508
+ "Denotes the duration it takes for the system to transition to a significant pitch change. Smaller hop lengths require more time for inference but tend to yield higher pitch accuracy."
509
+ ),
510
+ visible=False,
511
  value=128,
512
  interactive=True,
513
  )
514
  with gr.Column():
515
  f0method_batch = gr.Radio(
516
  label=i18n("Pitch extraction algorithm"),
517
+ info=i18n(
518
+ "Pitch extraction algorithm to use for the audio conversion. The default algorithm is rmvpe, which is recommended for most cases."
519
+ ),
520
  choices=[
521
  "pm",
522
  "harvest",
 
524
  "crepe",
525
  "crepe-tiny",
526
  "rmvpe",
527
+ "fcpe",
528
+ "hybrid[rmvpe+fcpe]",
529
  ],
530
  value="rmvpe",
531
  interactive=True,
 
534
  convert_button2 = gr.Button(i18n("Convert"))
535
 
536
  with gr.Row(): # Defines output info + output audio download after conversion
537
+ vc_output3 = gr.Textbox(
538
+ label=i18n("Output Information"),
539
+ info=i18n("The output information will be displayed here."),
540
+ )
541
 
542
  def toggle_visible(checkbox):
543
  return {"visible": checkbox, "__type__": "update"}
544
 
545
+ def toggle_visible_hop_length(f0method):
546
+ if f0method == "crepe" or f0method == "crepe-tiny":
547
+ return {"visible": True, "__type__": "update"}
548
+ return {"visible": False, "__type__": "update"}
549
+
550
+ clean_audio.change(
551
+ fn=toggle_visible,
552
+ inputs=[clean_audio],
553
+ outputs=[clean_strength],
554
+ )
555
+ clean_audio_batch.change(
556
+ fn=toggle_visible,
557
+ inputs=[clean_audio_batch],
558
+ outputs=[clean_strength_batch],
559
+ )
560
+ f0method.change(
561
+ fn=toggle_visible_hop_length,
562
+ inputs=[f0method],
563
+ outputs=[hop_length],
564
+ )
565
+ f0method_batch.change(
566
+ fn=toggle_visible_hop_length,
567
+ inputs=[f0method_batch],
568
+ outputs=[hop_length_batch],
569
+ )
570
  refresh_button.click(
571
  fn=change_choices,
572
  inputs=[],
 
587
  inputs=[upload_audio],
588
  outputs=[audio, output_path],
589
  )
590
+ clear_outputs_infer.click(
591
+ fn=delete_outputs,
592
+ inputs=[],
593
+ outputs=[],
594
+ )
595
+ clear_outputs_batch.click(
596
  fn=delete_outputs,
597
  inputs=[],
598
  outputs=[],
 
603
  pitch,
604
  filter_radius,
605
  index_rate,
606
+ rms_mix_rate,
607
+ protect,
608
  hop_length,
609
  f0method,
610
  audio,
 
612
  model_file,
613
  index_file,
614
  split_audio,
615
+ autotune,
616
+ clean_audio,
617
+ clean_strength,
618
+ export_format,
619
  ],
620
  outputs=[vc_output1, vc_output2],
621
  )
 
625
  pitch_batch,
626
  filter_radius_batch,
627
  index_rate_batch,
628
+ rms_mix_rate_batch,
629
+ protect_batch,
630
  hop_length_batch,
631
  f0method_batch,
632
  input_folder_batch,
 
634
  model_file,
635
  index_file,
636
  split_audio_batch,
637
+ autotune_batch,
638
+ clean_audio_batch,
639
+ clean_strength_batch,
640
+ export_format_batch,
641
  ],
642
  outputs=[vc_output3],
643
  )
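A self-contained sketch of the show/hide pattern the .change() hooks above rely on. This is generic Gradio usage with placeholder components, not Applio's actual layout:

import gradio as gr

def toggle_visible(checked):
    # Gradio treats this dict as an update applied to the output component.
    return {"visible": checked, "__type__": "update"}

with gr.Blocks() as demo:
    enable = gr.Checkbox(label="Clean Audio", value=False)
    strength = gr.Slider(0, 1, value=0.5, label="Clean Strength", visible=False)
    enable.change(fn=toggle_visible, inputs=[enable], outputs=[strength])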
tabs/plugins/plugins_core.py CHANGED
@@ -11,24 +11,30 @@ i18n = I18nAuto()
11
  now_dir = os.getcwd()
12
  sys.path.append(now_dir)
13
 
 
 
14
  plugins_path = os.path.join(now_dir, "tabs", "plugins", "installed")
15
  if not os.path.exists(plugins_path):
16
  os.makedirs(plugins_path)
17
- json_file_path = os.path.join(now_dir, "tabs", "plugins", "installed_list.json")
18
  current_folders = os.listdir(plugins_path)
19
 
20
 
21
  def get_existing_folders():
22
  if os.path.exists(json_file_path):
23
  with open(json_file_path, "r") as file:
24
- return json.load(file)
 
25
  else:
26
  return []
27
 
28
 
29
  def save_existing_folders(existing_folders):
 
 
 
30
  with open(json_file_path, "w") as file:
31
- json.dump(existing_folders, file)
32
 
33
 
34
  def save_plugin_dropbox(dropbox):
@@ -53,33 +59,47 @@ def save_plugin_dropbox(dropbox):
53
  os.remove(zip_file_path)
54
 
55
  if os.path.exists(os.path.join(folder_path, "requirements.txt")):
56
- subprocess.run(
57
- [
58
- os.path.join("env", "python.exe"),
59
- "-m",
60
- "pip",
61
- "install",
62
- "-r",
63
- os.path.join(folder_path, "requirements.txt"),
64
- ]
65
- )
 
 
 
 
 
66
  else:
67
  print("No requirements.txt file found in the plugin folder.")
68
 
69
  save_existing_folders(get_existing_folders() + [folder_name])
70
 
71
  print(
72
- f"{folder_name} plugin installed in {plugins_path}! Restart applio to see the changes."
73
  )
74
  gr.Info(
75
- f"{folder_name} plugin installed in {plugins_path}! Restart applio to see the changes."
76
  )
 
77
  return None
78
 
79
 
80
  def check_new_folders():
81
  existing_folders = get_existing_folders()
82
  new_folders = set(current_folders) - set(existing_folders)
 
83
  if new_folders:
84
  for new_folder in new_folders:
85
  complete_path = os.path.join(plugins_path, new_folder)
@@ -98,5 +118,5 @@ def check_new_folders():
98
  )
99
  else:
100
  print("No requirements.txt file found in the plugin folder.")
101
- print("Plugins checked and installed! Restart applio to see the changes.")
102
- save_existing_folders(current_folders)
 
11
  now_dir = os.getcwd()
12
  sys.path.append(now_dir)
13
 
14
+ from tabs.settings.restart import restart_applio
15
+
16
  plugins_path = os.path.join(now_dir, "tabs", "plugins", "installed")
17
  if not os.path.exists(plugins_path):
18
  os.makedirs(plugins_path)
19
+ json_file_path = os.path.join(now_dir, "assets", "config.json")
20
  current_folders = os.listdir(plugins_path)
21
 
22
 
23
  def get_existing_folders():
24
  if os.path.exists(json_file_path):
25
  with open(json_file_path, "r") as file:
26
+ config = json.load(file)
27
+ return config["plugins"]
28
  else:
29
  return []
30
 
31
 
32
  def save_existing_folders(existing_folders):
33
+ with open(json_file_path, "r") as file:
34
+ config = json.load(file)
35
+ config["plugins"] = existing_folders
36
  with open(json_file_path, "w") as file:
37
+ json.dump(config, file, indent=2)
38
 
39
 
40
  def save_plugin_dropbox(dropbox):
 
59
  os.remove(zip_file_path)
60
 
61
  if os.path.exists(os.path.join(folder_path, "requirements.txt")):
62
+ if os.name == "nt":
63
+ subprocess.run(
64
+ [
65
+ os.path.join("env", "python.exe"),
66
+ "-m",
67
+ "pip",
68
+ "install",
69
+ "-r",
70
+ os.path.join(folder_path, "requirements.txt"),
71
+ ]
72
+ )
73
+ else:
74
+ subprocess.run(
75
+ [
76
+ "python",
77
+ "-m",
78
+ "pip",
79
+ "install",
80
+ "-r",
81
+ os.path.join(folder_path, "requirements.txt"),
82
+ ]
83
+ )
84
  else:
85
  print("No requirements.txt file found in the plugin folder.")
86
 
87
  save_existing_folders(get_existing_folders() + [folder_name])
88
 
89
  print(
90
+ f"{folder_name} plugin installed in {plugins_path}! Restarting applio to apply the changes."
91
  )
92
  gr.Info(
93
+ f"{folder_name} plugin installed in {plugins_path}! Restarting applio to apply the changes."
94
  )
95
+ restart_applio()
96
  return None
97
 
98
 
99
  def check_new_folders():
100
  existing_folders = get_existing_folders()
101
  new_folders = set(current_folders) - set(existing_folders)
102
+ save_existing_folders(current_folders)
103
  if new_folders:
104
  for new_folder in new_folders:
105
  complete_path = os.path.join(plugins_path, new_folder)
 
118
  )
119
  else:
120
  print("No requirements.txt file found in the plugin folder.")
121
+ print("Plugins checked and installed! Restarting applio to apply the changes.")
122
+ restart_applio()
tabs/report/report.py CHANGED
@@ -8,7 +8,7 @@ import gradio as gr
  from assets.i18n.i18n import I18nAuto
 
  now_dir = os.getcwd()
- sys.path.append("..")
+ sys.path.append(now_dir)
 
  i18n = I18nAuto()
 
tabs/settings/fake_gpu.py ADDED
@@ -0,0 +1,55 @@
+ import os, sys
+ import torch
+ import json
+ import gradio as gr
+ from assets.i18n.i18n import I18nAuto
+ from tabs.settings.restart import restart_applio
+
+ now_dir = os.getcwd()
+ sys.path.append(now_dir)
+ i18n = I18nAuto()
+
+ ngpu = torch.cuda.device_count()
+ config_file = os.path.join(now_dir, "assets", "config.json")
+
+
+ def gpu_available():
+     if torch.cuda.is_available() or ngpu != 0:
+         return True
+
+
+ def load_fake_gpu():
+     with open(config_file, "r", encoding="utf8") as file:
+         config = json.load(file)
+         return config["fake_gpu"]
+
+
+ def save_config(value):
+     with open(config_file, "r", encoding="utf8") as file:
+         config = json.load(file)
+         config["fake_gpu"] = value
+     with open(config_file, "w", encoding="utf8") as file:
+         json.dump(config, file, indent=2)
+
+
+ def fake_gpu_tab():
+     with gr.Row():
+         with gr.Column():
+             presence = gr.Checkbox(
+                 label=i18n("Enable fake GPU"),
+                 info=i18n(
+                     "Activates the train tab. However, please note that this device lacks GPU capabilities, hence training is not supported. This option is only for testing purposes. (This option will restart Applio)"
+                 ),
+                 interactive=True,
+                 value=load_fake_gpu(),
+             )
+             presence.change(
+                 fn=toggle,
+                 inputs=[presence],
+                 outputs=[],
+             )
+
+
+ def toggle(checkbox):
+     save_config(bool(checkbox))
+     restart_applio()
tabs/settings/flask_server.py ADDED
@@ -0,0 +1,43 @@
+ import os
+ import sys
+ import gradio as gr
+ from assets.i18n.i18n import I18nAuto
+ import requests
+
+ now_dir = os.getcwd()
+ sys.path.append(now_dir)
+
+ from assets.flask.server import start_flask, load_config_flask, save_config
+
+ i18n = I18nAuto()
+
+
+ def flask_server_tab():
+     with gr.Row():
+         with gr.Column():
+             flask_checkbox = gr.Checkbox(
+                 label=i18n(
+                     "Enable Applio integration with applio.org/models using flask"
+                 ),
+                 info=i18n(
+                     "It will activate the possibility of downloading models with a click from the website."
+                 ),
+                 interactive=True,
+                 value=load_config_flask(),
+             )
+             flask_checkbox.change(
+                 fn=toggle,
+                 inputs=[flask_checkbox],
+                 outputs=[],
+             )
+
+
+ def toggle(checkbox):
+     save_config(bool(checkbox))
+     if load_config_flask() == True:
+         start_flask()
+     else:
+         try:
+             requests.post("http://localhost:8000/shutdown")
+         except requests.exceptions.ConnectionError:
+             pass
tabs/settings/lang.py ADDED
@@ -0,0 +1,57 @@
+ import os, sys
+ import json
+ import gradio as gr
+ from assets.i18n.i18n import I18nAuto
+
+ now_dir = os.getcwd()
+ sys.path.append(now_dir)
+
+ i18n = I18nAuto()
+
+ config_file = os.path.join(now_dir, "assets", "config.json")
+
+
+ def get_language_settings():
+     with open(config_file, "r", encoding="utf8") as file:
+         config = json.load(file)
+
+     if config["lang"]["override"] == False:
+         return "Language automatically detected in the system"
+     else:
+         return config["lang"]["selected_lang"]
+
+
+ def save_lang_settings(selected_language):
+     with open(config_file, "r", encoding="utf8") as file:
+         config = json.load(file)
+
+     if selected_language == "Language automatically detected in the system":
+         config["lang"]["override"] = False
+     else:
+         config["lang"]["override"] = True
+         config["lang"]["selected_lang"] = selected_language
+
+     gr.Info("Language has been saved. Restart Applio to apply the changes.")
+
+     with open(config_file, "w", encoding="utf8") as file:
+         json.dump(config, file, indent=2)
+
+
+ def lang_tab():
+     with gr.Column():
+         selected_language = gr.Dropdown(
+             label=i18n("Language"),
+             info=i18n(
+                 "Select the language you want to use. (Requires restarting Applio)"
+             ),
+             value=get_language_settings(),
+             choices=["Language automatically detected in the system"]
+             + i18n._get_available_languages(),
+             interactive=True,
+         )
+
+         selected_language.change(
+             fn=save_lang_settings,
+             inputs=[selected_language],
+             outputs=[],
+         )
tabs/settings/presence.py CHANGED
@@ -1,17 +1,29 @@
1
  import os
2
  import sys
3
- import base64
4
- import pathlib
5
- import tempfile
6
  import gradio as gr
7
- import threading
8
  from assets.i18n.i18n import I18nAuto
9
  from assets.discord_presence import RPCManager
10
 
11
  now_dir = os.getcwd()
12
- sys.path.append("..")
13
 
14
  i18n = I18nAuto()
 
 
 
 
15
 
16
 
17
  def presence_tab():
@@ -19,8 +31,11 @@ def presence_tab():
19
  with gr.Column():
20
  presence = gr.Checkbox(
21
  label=i18n("Enable Applio integration with Discord presence"),
 
 
 
22
  interactive=True,
23
- value=True,
24
  )
25
  presence.change(
26
  fn=toggle,
@@ -30,13 +45,11 @@ def presence_tab():
30
 
31
 
32
  def toggle(checkbox):
33
-
34
- if bool(checkbox):
35
- # print("Start Presence")
36
  try:
37
  RPCManager.start_presence()
38
  except KeyboardInterrupt:
39
  RPCManager.stop_presence()
40
  else:
41
- # print("Stop presence")
42
  RPCManager.stop_presence()
 
1
  import os
2
  import sys
 
 
 
3
  import gradio as gr
4
+ import json
5
  from assets.i18n.i18n import I18nAuto
6
  from assets.discord_presence import RPCManager
7
 
8
  now_dir = os.getcwd()
9
+ sys.path.append(now_dir)
10
 
11
  i18n = I18nAuto()
12
+ config_file = os.path.join(now_dir, "assets", "config.json")
13
+
14
+
15
+ def load_config_presence():
16
+ with open(config_file, "r", encoding="utf8") as file:
17
+ config = json.load(file)
18
+ return config["discord_presence"]
19
+
20
+
21
+ def save_config(value):
22
+ with open(config_file, "r", encoding="utf8") as file:
23
+ config = json.load(file)
24
+ config["discord_presence"] = value
25
+ with open(config_file, "w", encoding="utf8") as file:
26
+ json.dump(config, file, indent=2)
27
 
28
 
29
  def presence_tab():
 
31
  with gr.Column():
32
  presence = gr.Checkbox(
33
  label=i18n("Enable Applio integration with Discord presence"),
34
+ info=i18n(
35
+ "It will activate the possibility of displaying the current Applio activity in Discord."
36
+ ),
37
  interactive=True,
38
+ value=load_config_presence(),
39
  )
40
  presence.change(
41
  fn=toggle,
 
45
 
46
 
47
  def toggle(checkbox):
48
+ save_config(bool(checkbox))
49
+ if load_config_presence() == True:
 
50
  try:
51
  RPCManager.start_presence()
52
  except KeyboardInterrupt:
53
  RPCManager.stop_presence()
54
  else:
 
55
  RPCManager.stop_presence()
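Several of these settings tabs read and write the same assets/config.json. The commit does not show that file, but from the keys referenced above it presumably looks roughly like this (illustrative values only; keys for the Flask and theme settings live in code not shown here):

{
  "discord_presence": true,
  "fake_gpu": false,
  "plugins": [],
  "lang": {
    "override": false,
    "selected_lang": "en_US"
  }
}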
tabs/settings/restart.py ADDED
@@ -0,0 +1,39 @@
+ import gradio as gr
+ import os
+ import sys
+
+ now_dir = os.getcwd()
+ pid_file_path = os.path.join(now_dir, "rvc", "train", "train_pid.txt")
+
+
+ def restart_applio():
+     if os.name != "nt":
+         os.system("clear")
+     else:
+         os.system("cls")
+     try:
+         with open(pid_file_path, "r") as pid_file:
+             pids = [int(pid) for pid in pid_file.readlines()]
+         for pid in pids:
+             os.kill(pid, 9)
+         os.remove(pid_file_path)
+     except:
+         pass
+     python = sys.executable
+     os.execl(python, python, *sys.argv)
+
+
+ from assets.i18n.i18n import I18nAuto
+
+ i18n = I18nAuto()
+
+
+ def restart_tab():
+     with gr.Row():
+         with gr.Column():
+             restart_button = gr.Button(i18n("Restart Applio"))
+             restart_button.click(
+                 fn=restart_applio,
+                 inputs=[],
+                 outputs=[],
+             )
tabs/settings/themes.py CHANGED
@@ -9,7 +9,7 @@ from assets.i18n.i18n import I18nAuto
  import assets.themes.loadThemes as loadThemes
 
  now_dir = os.getcwd()
- sys.path.append("..")
+ sys.path.append(now_dir)
 
  i18n = I18nAuto()
 
@@ -21,6 +21,9 @@ def theme_tab():
          loadThemes.get_list(),
          value=loadThemes.read_json(),
          label=i18n("Theme"),
+         info=i18n(
+             "Select the theme you want to use. (Requires restarting Applio)"
+         ),
          visible=True,
      )
      themes_select.change(
tabs/settings/version.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
1
+ import gradio as gr
2
+
3
+ from assets.version_checker import compare_version
4
+ from assets.i18n.i18n import I18nAuto
5
+
6
+ i18n = I18nAuto()
7
+
8
+
9
+ def version_tab():
10
+ with gr.Row():
11
+ with gr.Column():
12
+ version_check = gr.Textbox(
13
+ label=i18n("Version Checker"),
14
+ info=i18n(
15
+ "Check which version of Applio is the latest to see if you need to update."
16
+ ),
17
+ interactive=False,
18
+ )
19
+ version_button = gr.Button(i18n("Check for updates"))
20
+ version_button.click(
21
+ fn=compare_version,
22
+ inputs=[],
23
+ outputs=[version_check],
24
+ )
tabs/train/train.py CHANGED
@@ -1,6 +1,7 @@
1
  import os
2
  import subprocess
3
  import sys
 
4
  import gradio as gr
5
  from assets.i18n.i18n import I18nAuto
6
  from core import (
@@ -8,14 +9,40 @@ from core import (
8
  run_extract_script,
9
  run_train_script,
10
  run_index_script,
 
11
  )
12
  from rvc.configs.config import max_vram_gpu, get_gpu_info
13
  from rvc.lib.utils import format_title
 
14
 
15
  i18n = I18nAuto()
16
  now_dir = os.getcwd()
17
  sys.path.append(now_dir)
18
 
 
 
 
 
19
  sup_audioext = {
20
  "wav",
21
  "mp3",
@@ -84,6 +111,31 @@ def refresh_datasets():
84
  return {"choices": sorted(get_datasets_list()), "__type__": "update"}
85
 
86
 
 
 
 
 
87
  # Drop Model
88
  def save_drop_model(dropbox):
89
  if ".pth" not in dropbox:
@@ -136,25 +188,92 @@ def save_drop_dataset_audio(dropbox, dataset_name):
136
  return None, relative_dataset_path
137
 
138
 
 
 
 
 
 
 
139
  # Train Tab
140
  def train_tab():
141
  with gr.Accordion(i18n("Preprocess")):
142
  with gr.Row():
143
  with gr.Column():
144
- model_name = gr.Textbox(
145
  label=i18n("Model Name"),
146
- placeholder=i18n("Enter model name"),
 
147
  value="my-project",
148
  interactive=True,
 
149
  )
150
  dataset_path = gr.Dropdown(
151
  label=i18n("Dataset Path"),
 
152
  # placeholder=i18n("Enter dataset path"),
153
  choices=get_datasets_list(),
154
  allow_custom_value=True,
155
  interactive=True,
156
  )
157
- refresh_datasets_button = gr.Button(i18n("Refresh Datasets"))
158
  dataset_creator = gr.Checkbox(
159
  label=i18n("Dataset Creator"),
160
  value=False,
@@ -163,9 +282,10 @@ def train_tab():
163
  )
164
 
165
  with gr.Column(visible=False) as dataset_creator_settings:
166
- with gr.Accordion("Dataset Creator"):
167
  dataset_name = gr.Textbox(
168
  label=i18n("Dataset Name"),
 
169
  placeholder=i18n("Enter dataset name"),
170
  interactive=True,
171
  )
@@ -178,6 +298,7 @@ def train_tab():
178
  with gr.Column():
179
  sampling_rate = gr.Radio(
180
  label=i18n("Sampling Rate"),
 
181
  choices=["32000", "40000", "48000"],
182
  value="40000",
183
  interactive=True,
@@ -185,6 +306,7 @@ def train_tab():
185
 
186
  rvc_version = gr.Radio(
187
  label=i18n("RVC Version"),
 
188
  choices=["v1", "v2"],
189
  value="v2",
190
  interactive=True,
@@ -192,6 +314,7 @@ def train_tab():
192
 
193
  preprocess_output_info = gr.Textbox(
194
  label=i18n("Output Information"),
 
195
  value="",
196
  max_lines=8,
197
  interactive=False,
@@ -209,12 +332,24 @@ def train_tab():
209
  with gr.Accordion(i18n("Extract")):
210
  with gr.Row():
211
  hop_length = gr.Slider(
212
- 1, 512, 128, step=1, label=i18n("Hop Length"), interactive=True
 
 
 
 
 
 
 
 
 
213
  )
214
  with gr.Row():
215
  with gr.Column():
216
  f0method = gr.Radio(
217
  label=i18n("Pitch extraction algorithm"),
 
 
 
218
  choices=["pm", "dio", "crepe", "crepe-tiny", "harvest", "rmvpe"],
219
  value="rmvpe",
220
  interactive=True,
@@ -222,6 +357,7 @@ def train_tab():
222
 
223
  extract_output_info = gr.Textbox(
224
  label=i18n("Output Information"),
 
225
  value="",
226
  max_lines=8,
227
  interactive=False,
@@ -242,39 +378,94 @@ def train_tab():
242
  max_vram_gpu(0),
243
  step=1,
244
  label=i18n("Batch Size"),
 
 
 
245
  interactive=True,
246
  )
247
  save_every_epoch = gr.Slider(
248
- 1, 100, 10, step=1, label=i18n("Save Every Epoch"), interactive=True
 
 
 
 
 
 
249
  )
250
  total_epoch = gr.Slider(
251
- 1, 1000, 500, step=1, label=i18n("Total Epoch"), interactive=True
 
 
 
 
 
 
 
 
252
  )
253
  with gr.Row():
254
  pitch_guidance = gr.Checkbox(
255
- label=i18n("Pitch Guidance"), value=True, interactive=True
 
 
 
 
 
256
  )
257
  pretrained = gr.Checkbox(
258
- label=i18n("Pretrained"), value=True, interactive=True
 
 
 
 
 
259
  )
260
  save_only_latest = gr.Checkbox(
261
- label=i18n("Save Only Latest"), value=False, interactive=True
 
 
 
 
 
262
  )
263
  save_every_weights = gr.Checkbox(
264
  label=i18n("Save Every Weights"),
 
 
 
265
  value=True,
266
  interactive=True,
267
  )
268
  custom_pretrained = gr.Checkbox(
269
- label=i18n("Custom Pretrained"), value=False, interactive=True
 
 
 
 
 
270
  )
271
  multiple_gpu = gr.Checkbox(
272
- label=i18n("GPU Settings"), value=False, interactive=True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
273
  )
274
 
275
  with gr.Row():
276
  with gr.Column(visible=False) as pretrained_custom_settings:
277
- with gr.Accordion("Pretrained Custom Settings"):
278
  upload_pretrained = gr.File(
279
  label=i18n("Upload Pretrained Model"),
280
  type="filepath",
@@ -285,33 +476,57 @@ def train_tab():
285
  )
286
  g_pretrained_path = gr.Dropdown(
287
  label=i18n("Custom Pretrained G"),
 
 
 
288
  choices=sorted(pretraineds_list_g),
289
  interactive=True,
290
  allow_custom_value=True,
291
  )
292
  d_pretrained_path = gr.Dropdown(
293
  label=i18n("Custom Pretrained D"),
 
 
 
294
  choices=sorted(pretraineds_list_d),
295
  interactive=True,
296
  allow_custom_value=True,
297
  )
298
  with gr.Column(visible=False) as gpu_custom_settings:
299
- with gr.Accordion("GPU Settings"):
300
  gpu = gr.Textbox(
301
  label=i18n("GPU Number"),
 
 
 
302
  placeholder=i18n("0 to ∞ separated by -"),
303
  value="0",
304
  interactive=True,
305
  )
306
  gr.Textbox(
307
  label=i18n("GPU Information"),
 
308
  value=get_gpu_info(),
309
  interactive=False,
310
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
311
 
312
  with gr.Row():
313
  train_output_info = gr.Textbox(
314
  label=i18n("Output Information"),
 
315
  value="",
316
  max_lines=8,
317
  interactive=False,
@@ -332,6 +547,8 @@ def train_tab():
332
  batch_size,
333
  gpu,
334
  pitch_guidance,
 
 
335
  pretrained,
336
  custom_pretrained,
337
  g_pretrained_path,
@@ -341,6 +558,15 @@ def train_tab():
341
  api_name="start_training",
342
  )
343
 
 
 
 
 
 
 
 
 
 
344
  index_button = gr.Button(i18n("Generate Index"))
345
  index_button.click(
346
  run_index_script,
@@ -349,13 +575,114 @@ def train_tab():
349
  api_name="generate_index",
350
  )
351
 
 
 
 
 
 
 
 
 
 
352
  def toggle_visible(checkbox):
353
  return {"visible": checkbox, "__type__": "update"}
354
 
355
- refresh_datasets_button.click(
356
- fn=refresh_datasets,
 
 
 
 
 
 
 
 
357
  inputs=[],
358
- outputs=[dataset_path],
359
  )
360
 
361
  dataset_creator.change(
@@ -370,6 +697,18 @@ def train_tab():
370
  outputs=[upload_audio_dataset, dataset_path],
371
  )
372
 
 
 
 
 
 
 
 
 
 
 
 
 
373
  custom_pretrained.change(
374
  fn=toggle_visible,
375
  inputs=[custom_pretrained],
@@ -388,8 +727,44 @@ def train_tab():
388
  outputs=[upload_pretrained],
389
  )
390
 
 
 
 
 
 
 
391
  multiple_gpu.change(
392
  fn=toggle_visible,
393
  inputs=[multiple_gpu],
394
  outputs=[gpu_custom_settings],
395
  )
 
 
 
 
 
 
 
1
  import os
2
  import subprocess
3
  import sys
4
+ import shutil
5
  import gradio as gr
6
  from assets.i18n.i18n import I18nAuto
7
  from core import (
 
9
  run_extract_script,
10
  run_train_script,
11
  run_index_script,
12
+ run_prerequisites_script,
13
  )
14
  from rvc.configs.config import max_vram_gpu, get_gpu_info
15
  from rvc.lib.utils import format_title
16
+ from tabs.settings.restart import restart_applio
17
 
18
  i18n = I18nAuto()
19
  now_dir = os.getcwd()
20
  sys.path.append(now_dir)
21
 
22
+ pretraineds_v1 = [
23
+ (
24
+ "pretrained_v1/",
25
+ [
26
+ "D32k.pth",
27
+ "D40k.pth",
28
+ "D48k.pth",
29
+ "G32k.pth",
30
+ "G40k.pth",
31
+ "G48k.pth",
32
+ "f0D32k.pth",
33
+ "f0D40k.pth",
34
+ "f0D48k.pth",
35
+ "f0G32k.pth",
36
+ "f0G40k.pth",
37
+ "f0G48k.pth",
38
+ ],
39
+ ),
40
+ ]
41
+
42
+ folder_mapping = {
43
+ "pretrained_v1/": "rvc/pretraineds/pretrained_v1/",
44
+ }
45
+
46
  sup_audioext = {
47
  "wav",
48
  "mp3",
 
111
  return {"choices": sorted(get_datasets_list()), "__type__": "update"}
112
 
113
 
114
+ # Model Names
115
+ models_path = os.path.join(now_dir, "logs")
116
+
117
+
118
+ def get_models_list():
119
+ return [
120
+ os.path.basename(dirpath)
121
+ for dirpath in os.listdir(models_path)
122
+ if os.path.isdir(os.path.join(models_path, dirpath))
123
+ and all(excluded not in dirpath for excluded in ["zips", "mute"])
124
+ ]
125
+
126
+
127
+ def refresh_models():
128
+ return {"choices": sorted(get_models_list()), "__type__": "update"}
129
+
130
+
131
+ # Refresh Models and Datasets
132
+ def refresh_models_and_datasets():
133
+ return (
134
+ {"choices": sorted(get_models_list()), "__type__": "update"},
135
+ {"choices": sorted(get_datasets_list()), "__type__": "update"},
136
+ )
137
+
138
+
139
  # Drop Model
140
  def save_drop_model(dropbox):
141
  if ".pth" not in dropbox:
 
188
  return None, relative_dataset_path
189
 
190
 
191
+ # Export
192
+ ## Get Pth and Index Files
193
+ def get_pth_list():
194
+ return [
195
+ os.path.relpath(os.path.join(dirpath, filename), now_dir)
196
+ for dirpath, _, filenames in os.walk(models_path)
197
+ for filename in filenames
198
+ if filename.endswith(".pth")
199
+ ]
200
+
201
+
202
+ def get_index_list():
203
+ return [
204
+ os.path.relpath(os.path.join(dirpath, filename), now_dir)
205
+ for dirpath, _, filenames in os.walk(models_path)
206
+ for filename in filenames
207
+ if filename.endswith(".index") and "trained" not in filename
208
+ ]
209
+
210
+
211
+ def refresh_pth_and_index_list():
212
+ return (
213
+ {"choices": sorted(get_pth_list()), "__type__": "update"},
214
+ {"choices": sorted(get_index_list()), "__type__": "update"},
215
+ )
216
+
217
+
218
+ ## Export Pth and Index Files
219
+ def export_pth(pth_path):
220
+ if pth_path and os.path.exists(pth_path):
221
+ return pth_path
222
+ return None
223
+
224
+
225
+ def export_index(index_path):
226
+ if index_path and os.path.exists(index_path):
227
+ return index_path
228
+ return None
229
+
230
+
231
+ ## Upload to Google Drive
232
+ def upload_to_google_drive(pth_path, index_path):
233
+ def upload_file(file_path):
234
+ if file_path:
235
+ try:
236
+ gr.Info(f"Uploading {pth_path} to Google Drive...")
237
+ google_drive_folder = "/content/drive/MyDrive/ApplioExported"
238
+ if not os.path.exists(google_drive_folder):
239
+ os.makedirs(google_drive_folder)
240
+ google_drive_file_path = os.path.join(
241
+ google_drive_folder, os.path.basename(file_path)
242
+ )
243
+ if os.path.exists(google_drive_file_path):
244
+ os.remove(google_drive_file_path)
245
+ shutil.copy2(file_path, google_drive_file_path)
246
+ gr.Info("File uploaded successfully.")
247
+ except Exception as error:
248
+ print(error)
249
+ gr.Info("Error uploading to Google Drive")
250
+
251
+ upload_file(pth_path)
252
+ upload_file(index_path)
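export_pth and export_index simply hand an existing path back to a gr.File component, and upload_to_google_drive copies the pair into Drive when running on Colab. Outside Colab a natural manual follow-up is to bundle both files into one archive; this is not part of the commit, just a hedged sketch of that step:

    # Illustrative only: zip an exported model and its index for sharing.
    import os
    import zipfile

    def bundle_model(pth_path, index_path, archive_name="exported_model.zip"):
        with zipfile.ZipFile(archive_name, "w", zipfile.ZIP_DEFLATED) as archive:
            for path in (pth_path, index_path):
                if path and os.path.exists(path):
                    archive.write(path, arcname=os.path.basename(path))
        return archive_name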
253
+
254
+
255
  # Train Tab
256
  def train_tab():
257
  with gr.Accordion(i18n("Preprocess")):
258
  with gr.Row():
259
  with gr.Column():
260
+ model_name = gr.Dropdown(
261
  label=i18n("Model Name"),
262
+ info=i18n("Name of the new model."),
263
+ choices=get_models_list(),
264
  value="my-project",
265
  interactive=True,
266
+ allow_custom_value=True,
267
  )
268
  dataset_path = gr.Dropdown(
269
  label=i18n("Dataset Path"),
270
+ info=i18n("Path to the dataset folder."),
271
  # placeholder=i18n("Enter dataset path"),
272
  choices=get_datasets_list(),
273
  allow_custom_value=True,
274
  interactive=True,
275
  )
276
+ refresh = gr.Button(i18n("Refresh"))
277
  dataset_creator = gr.Checkbox(
278
  label=i18n("Dataset Creator"),
279
  value=False,
 
282
  )
283
 
284
  with gr.Column(visible=False) as dataset_creator_settings:
285
+ with gr.Accordion(i18n("Dataset Creator")):
286
  dataset_name = gr.Textbox(
287
  label=i18n("Dataset Name"),
288
+ info=i18n("Name of the new dataset."),
289
  placeholder=i18n("Enter dataset name"),
290
  interactive=True,
291
  )
 
298
  with gr.Column():
299
  sampling_rate = gr.Radio(
300
  label=i18n("Sampling Rate"),
301
+ info=i18n("The sampling rate of the audio files."),
302
  choices=["32000", "40000", "48000"],
303
  value="40000",
304
  interactive=True,
 
306
 
307
  rvc_version = gr.Radio(
308
  label=i18n("RVC Version"),
309
+ info=i18n("The RVC version of the model."),
310
  choices=["v1", "v2"],
311
  value="v2",
312
  interactive=True,
 
314
 
315
  preprocess_output_info = gr.Textbox(
316
  label=i18n("Output Information"),
317
+ info=i18n("The output information will be displayed here."),
318
  value="",
319
  max_lines=8,
320
  interactive=False,
 
332
  with gr.Accordion(i18n("Extract")):
333
  with gr.Row():
334
  hop_length = gr.Slider(
335
+ 1,
336
+ 512,
337
+ 128,
338
+ step=1,
339
+ label=i18n("Hop Length"),
340
+ info=i18n(
341
+ "Denotes the duration it takes for the system to transition to a significant pitch change. Smaller hop lengths require more time for inference but tend to yield higher pitch accuracy."
342
+ ),
343
+ interactive=True,
344
+ visible=False,
345
  )
346
  with gr.Row():
347
  with gr.Column():
348
  f0method = gr.Radio(
349
  label=i18n("Pitch extraction algorithm"),
350
+ info=i18n(
351
+ "Pitch extraction algorithm to use for the audio conversion. The default algorithm is rmvpe, which is recommended for most cases."
352
+ ),
353
  choices=["pm", "dio", "crepe", "crepe-tiny", "harvest", "rmvpe"],
354
  value="rmvpe",
355
  interactive=True,
 
357
 
358
  extract_output_info = gr.Textbox(
359
  label=i18n("Output Information"),
360
+ info=i18n("The output information will be displayed here."),
361
  value="",
362
  max_lines=8,
363
  interactive=False,
 
378
  max_vram_gpu(0),
379
  step=1,
380
  label=i18n("Batch Size"),
381
+ info=i18n(
382
+ "It's advisable to align it with the available VRAM of your GPU. A setting of 4 offers improved accuracy but slower processing, while 8 provides faster and standard results."
383
+ ),
384
  interactive=True,
385
  )
386
  save_every_epoch = gr.Slider(
387
+ 1,
388
+ 100,
389
+ 10,
390
+ step=1,
391
+ label=i18n("Save Every Epoch"),
392
+ info=i18n("Determine at how many epochs the model will saved at."),
393
+ interactive=True,
394
  )
395
  total_epoch = gr.Slider(
396
+ 1,
397
+ 10000,
398
+ 500,
399
+ step=1,
400
+ label=i18n("Total Epoch"),
401
+ info=i18n(
402
+ "Specifies the overall quantity of epochs for the model training process."
403
+ ),
404
+ interactive=True,
405
  )
406
  with gr.Row():
407
  pitch_guidance = gr.Checkbox(
408
+ label=i18n("Pitch Guidance"),
409
+ info=i18n(
410
+ "By employing pitch guidance, it becomes feasible to mirror the intonation of the original voice, including its pitch. This feature is particularly valuable for singing and other scenarios where preserving the original melody or pitch pattern is essential."
411
+ ),
412
+ value=True,
413
+ interactive=True,
414
  )
415
  pretrained = gr.Checkbox(
416
+ label=i18n("Pretrained"),
417
+ info=i18n(
418
+ "Utilize pretrained models when training your own. This approach reduces training duration and enhances overall quality."
419
+ ),
420
+ value=True,
421
+ interactive=True,
422
  )
423
  save_only_latest = gr.Checkbox(
424
+ label=i18n("Save Only Latest"),
425
+ info=i18n(
426
+ "Enabling this setting will result in the G and D files saving only their most recent versions, effectively conserving storage space."
427
+ ),
428
+ value=False,
429
+ interactive=True,
430
  )
431
  save_every_weights = gr.Checkbox(
432
  label=i18n("Save Every Weights"),
433
+ info=i18n(
434
+ "This setting enables you to save the weights of the model at the conclusion of each epoch."
435
+ ),
436
  value=True,
437
  interactive=True,
438
  )
439
  custom_pretrained = gr.Checkbox(
440
+ label=i18n("Custom Pretrained"),
441
+ info=i18n(
442
+ "Utilizing custom pretrained models can lead to superior results, as selecting the most suitable pretrained models tailored to the specific use case can significantly enhance performance."
443
+ ),
444
+ value=False,
445
+ interactive=True,
446
  )
447
  multiple_gpu = gr.Checkbox(
448
+ label=i18n("GPU Settings"),
449
+ info=(
450
+ i18n(
451
+ "Sets advanced GPU settings, recommended for users with better GPU architecture."
452
+ )
453
+ ),
454
+ value=False,
455
+ interactive=True,
456
+ )
457
+ overtraining_detector = gr.Checkbox(
458
+ label=i18n("Overtraining Detector"),
459
+ info=i18n(
460
+ "Detect overtraining to prevent the model from learning the training data too well and losing the ability to generalize to new data."
461
+ ),
462
+ value=False,
463
+ interactive=True,
464
  )
465
 
466
  with gr.Row():
467
  with gr.Column(visible=False) as pretrained_custom_settings:
468
+ with gr.Accordion(i18n("Pretrained Custom Settings")):
469
  upload_pretrained = gr.File(
470
  label=i18n("Upload Pretrained Model"),
471
  type="filepath",
 
476
  )
477
  g_pretrained_path = gr.Dropdown(
478
  label=i18n("Custom Pretrained G"),
479
+ info=i18n(
480
+ "Select the custom pretrained model for the generator."
481
+ ),
482
  choices=sorted(pretraineds_list_g),
483
  interactive=True,
484
  allow_custom_value=True,
485
  )
486
  d_pretrained_path = gr.Dropdown(
487
  label=i18n("Custom Pretrained D"),
488
+ info=i18n(
489
+ "Select the custom pretrained model for the discriminator."
490
+ ),
491
  choices=sorted(pretraineds_list_d),
492
  interactive=True,
493
  allow_custom_value=True,
494
  )
495
  with gr.Column(visible=False) as gpu_custom_settings:
496
+ with gr.Accordion(i18n("GPU Settings")):
497
  gpu = gr.Textbox(
498
  label=i18n("GPU Number"),
499
+ info=i18n(
500
+ "Specify the number of GPUs you wish to utilize for training by entering them separated by hyphens (-)."
501
+ ),
502
  placeholder=i18n("0 to ∞ separated by -"),
503
  value="0",
504
  interactive=True,
505
  )
506
  gr.Textbox(
507
  label=i18n("GPU Information"),
508
+ info=i18n("The GPU information will be displayed here."),
509
  value=get_gpu_info(),
510
  interactive=False,
511
  )
512
+ with gr.Column(visible=False) as overtraining_settings:
513
+ with gr.Accordion(i18n("Overtraining Detector Settings")):
514
+ overtraining_threshold = gr.Slider(
515
+ 1,
516
+ 100,
517
+ 50,
518
+ step=1,
519
+ label=i18n("Overtraining Threshold"),
520
+ info=i18n(
521
+ "Set the maximum number of epochs you want your model to stop training if no improvement is detected."
522
+ ),
523
+ interactive=True,
524
+ )
525
 
526
  with gr.Row():
527
  train_output_info = gr.Textbox(
528
  label=i18n("Output Information"),
529
+ info=i18n("The output information will be displayed here."),
530
  value="",
531
  max_lines=8,
532
  interactive=False,
 
547
  batch_size,
548
  gpu,
549
  pitch_guidance,
550
+ overtraining_detector,
551
+ overtraining_threshold,
552
  pretrained,
553
  custom_pretrained,
554
  g_pretrained_path,
 
558
  api_name="start_training",
559
  )
560
 
561
+ stop_train_button = gr.Button(
562
+ i18n("Stop Training & Restart Applio"), visible=False
563
+ )
564
+ stop_train_button.click(
565
+ fn=restart_applio,
566
+ inputs=[],
567
+ outputs=[],
568
+ )
569
+
570
  index_button = gr.Button(i18n("Generate Index"))
571
  index_button.click(
572
  run_index_script,
 
575
  api_name="generate_index",
576
  )
577
 
578
+ with gr.Accordion(i18n("Export Model"), open=False):
579
+ if not os.name == "nt":
580
+ gr.Markdown(
581
+ i18n(
582
+ "The button 'Upload' is only for google colab: Uploads the exported files to the ApplioExported folder in your Google Drive."
583
+ )
584
+ )
585
+ with gr.Row():
586
+ with gr.Column():
587
+ pth_file_export = gr.File(
588
+ label=i18n("Exported Pth file"),
589
+ type="filepath",
590
+ value=None,
591
+ interactive=False,
592
+ )
593
+ pth_dropdown_export = gr.Dropdown(
594
+ label=i18n("Pth file"),
595
+ info=i18n("Select the pth file to be exported"),
596
+ choices=get_pth_list(),
597
+ value=None,
598
+ interactive=True,
599
+ allow_custom_value=True,
600
+ )
601
+ with gr.Column():
602
+ index_file_export = gr.File(
603
+ label=i18n("Exported Index File"),
604
+ type="filepath",
605
+ value=None,
606
+ interactive=False,
607
+ )
608
+ index_dropdown_export = gr.Dropdown(
609
+ label=i18n("Index File"),
610
+ info=i18n("Select the index file to be exported"),
611
+ choices=get_index_list(),
612
+ value=None,
613
+ interactive=True,
614
+ allow_custom_value=True,
615
+ )
616
+ with gr.Row():
617
+ with gr.Column():
618
+ refresh_export = gr.Button(i18n("Refresh"))
619
+ if not os.name == "nt":
620
+ upload_exported = gr.Button(i18n("Upload"), variant="primary")
621
+ upload_exported.click(
622
+ fn=upload_to_google_drive,
623
+ inputs=[pth_dropdown_export, index_dropdown_export],
624
+ outputs=[],
625
+ )
626
+
627
  def toggle_visible(checkbox):
628
  return {"visible": checkbox, "__type__": "update"}
629
 
630
+ def toggle_visible_hop_length(f0method):
631
+ if f0method == "crepe" or f0method == "crepe-tiny":
632
+ return {"visible": True, "__type__": "update"}
633
+ return {"visible": False, "__type__": "update"}
634
+
635
+ def toggle_pretrained(pretrained, custom_pretrained):
636
+ if custom_pretrained == False:
637
+ return {"visible": pretrained, "__type__": "update"}, {
638
+ "visible": False,
639
+ "__type__": "update",
640
+ }
641
+ else:
642
+ return {"visible": pretrained, "__type__": "update"}, {
643
+ "visible": pretrained,
644
+ "__type__": "update",
645
+ }
646
+
647
+ def enable_stop_train_button():
648
+ return {"visible": False, "__type__": "update"}, {
649
+ "visible": True,
650
+ "__type__": "update",
651
+ }
652
+
653
+ def disable_stop_train_button():
654
+ return {"visible": True, "__type__": "update"}, {
655
+ "visible": False,
656
+ "__type__": "update",
657
+ }
658
+
659
+ def download_prerequisites(version):
660
+ for remote_folder, file_list in pretraineds_v1:
661
+ local_folder = folder_mapping.get(remote_folder, "")
662
+ missing = False
663
+ for file in file_list:
664
+ destination_path = os.path.join(local_folder, file)
665
+ if not os.path.exists(destination_path):
666
+ missing = True
667
+ if version == "v1" and missing == True:
668
+ gr.Info(
669
+ "Downloading prerequisites... Please wait till it finishes to start preprocessing."
670
+ )
671
+ run_prerequisites_script("True", "False", "True", "True")
672
+ gr.Info(
673
+ "Prerequisites downloaded successfully, you may now start preprocessing."
674
+ )
675
+
676
+ rvc_version.change(
677
+ fn=download_prerequisites,
678
+ inputs=[rvc_version],
679
+ outputs=[],
680
+ )
681
+
682
+ refresh.click(
683
+ fn=refresh_models_and_datasets,
684
  inputs=[],
685
+ outputs=[model_name, dataset_path],
686
  )
687
 
688
  dataset_creator.change(
 
697
  outputs=[upload_audio_dataset, dataset_path],
698
  )
699
 
700
+ f0method.change(
701
+ fn=toggle_visible_hop_length,
702
+ inputs=[f0method],
703
+ outputs=[hop_length],
704
+ )
705
+
706
+ pretrained.change(
707
+ fn=toggle_pretrained,
708
+ inputs=[pretrained, custom_pretrained],
709
+ outputs=[custom_pretrained, pretrained_custom_settings],
710
+ )
711
+
712
  custom_pretrained.change(
713
  fn=toggle_visible,
714
  inputs=[custom_pretrained],
 
727
  outputs=[upload_pretrained],
728
  )
729
 
730
+ overtraining_detector.change(
731
+ fn=toggle_visible,
732
+ inputs=[overtraining_detector],
733
+ outputs=[overtraining_settings],
734
+ )
735
+
736
  multiple_gpu.change(
737
  fn=toggle_visible,
738
  inputs=[multiple_gpu],
739
  outputs=[gpu_custom_settings],
740
  )
741
+
742
+ train_button.click(
743
+ fn=enable_stop_train_button,
744
+ inputs=[],
745
+ outputs=[train_button, stop_train_button],
746
+ )
747
+
748
+ train_output_info.change(
749
+ fn=disable_stop_train_button,
750
+ inputs=[],
751
+ outputs=[train_button, stop_train_button],
752
+ )
753
+
754
+ pth_dropdown_export.change(
755
+ fn=export_pth,
756
+ inputs=[pth_dropdown_export],
757
+ outputs=[pth_file_export],
758
+ )
759
+
760
+ index_dropdown_export.change(
761
+ fn=export_index,
762
+ inputs=[index_dropdown_export],
763
+ outputs=[index_file_export],
764
+ )
765
+
766
+ refresh_export.click(
767
+ fn=refresh_pth_and_index_list,
768
+ inputs=[],
769
+ outputs=[pth_dropdown_export, index_dropdown_export],
770
+ )
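The new overtraining_detector checkbox and overtraining_threshold slider are passed straight into run_train_script; the actual detection logic lives in the training code, which is not part of this hunk. Purely as an illustration of the idea (early stopping once the best loss has not improved for N epochs), not Applio's implementation:

    # Hypothetical early-stopping check; Applio's real logic is in run_train_script.
    def should_stop(loss_history, threshold_epochs):
        """Stop once the best loss has not improved for `threshold_epochs` epochs."""
        if len(loss_history) <= threshold_epochs:
            return False
        best_before_window = min(loss_history[:-threshold_epochs])
        return min(loss_history[-threshold_epochs:]) >= best_before_window

    losses = [1.0, 0.8, 0.7, 0.69, 0.69, 0.69, 0.69]
    print(should_stop(losses, threshold_epochs=3))  # True: no improvement in the last 3 epochs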
tabs/tts/tts.py CHANGED
@@ -2,8 +2,6 @@ import os, sys
2
  import gradio as gr
3
  import regex as re
4
  import json
5
- import shutil
6
- import datetime
7
  import random
8
 
9
  from core import (
@@ -18,26 +16,7 @@ now_dir = os.getcwd()
18
  sys.path.append(now_dir)
19
 
20
  model_root = os.path.join(now_dir, "logs")
21
- audio_root = os.path.join(now_dir, "assets", "audios")
22
-
23
  model_root_relative = os.path.relpath(model_root, now_dir)
24
- audio_root_relative = os.path.relpath(audio_root, now_dir)
25
-
26
- sup_audioext = {
27
- "wav",
28
- "mp3",
29
- "flac",
30
- "ogg",
31
- "opus",
32
- "m4a",
33
- "mp4",
34
- "aac",
35
- "alac",
36
- "wma",
37
- "aiff",
38
- "webm",
39
- "ac3",
40
- }
41
 
42
  names = [
43
  os.path.join(root, file)
@@ -56,15 +35,6 @@ indexes_list = [
56
  if name.endswith(".index") and "trained" not in name
57
  ]
58
 
59
- audio_paths = [
60
- os.path.join(root, name)
61
- for root, _, files in os.walk(audio_root_relative, topdown=False)
62
- for name in files
63
- if name.endswith(tuple(sup_audioext))
64
- and root == audio_root_relative
65
- and "_output" not in name
66
- ]
67
-
68
 
69
  def change_choices():
70
  names = [
@@ -83,19 +53,9 @@ def change_choices():
83
  for name in files
84
  if name.endswith(".index") and "trained" not in name
85
  ]
86
-
87
- audio_paths = [
88
- os.path.join(root, name)
89
- for root, _, files in os.walk(audio_root_relative, topdown=False)
90
- for name in files
91
- if name.endswith(tuple(sup_audioext))
92
- and root == audio_root_relative
93
- and "_output" not in name
94
- ]
95
  return (
96
  {"choices": sorted(names), "__type__": "update"},
97
  {"choices": sorted(indexes_list), "__type__": "update"},
98
- {"choices": sorted(audio_paths), "__type__": "update"},
99
  )
100
 
101
 
@@ -110,93 +70,30 @@ def get_indexes():
110
  return indexes_list if indexes_list else ""
111
 
112
 
113
- def match_index(model_file: str) -> tuple:
114
- model_files_trip = re.sub(r"\.pth|\.onnx$", "", model_file)
115
- model_file_name = os.path.split(model_files_trip)[
116
- -1
117
- ] # Extract only the name, not the directory
118
-
119
- # Check if the sid0strip has the specific ending format _eXXX_sXXX
120
- if re.match(r".+_e\d+_s\d+$", model_file_name):
121
- base_model_name = model_file_name.rsplit("_", 2)[0]
122
- else:
123
- base_model_name = model_file_name
124
-
125
- sid_directory = os.path.join(model_root_relative, base_model_name)
126
- directories_to_search = [sid_directory] if os.path.exists(sid_directory) else []
127
- directories_to_search.append(model_root_relative)
128
-
129
- matching_index_files = []
130
-
131
- for directory in directories_to_search:
132
- for filename in os.listdir(directory):
133
- if filename.endswith(".index") and "trained" not in filename:
134
- # Condition to match the name
135
- name_match = any(
136
- name.lower() in filename.lower()
137
- for name in [model_file_name, base_model_name]
138
- )
139
-
140
- # If in the specific directory, it's automatically a match
141
- folder_match = directory == sid_directory
142
-
143
- if name_match or folder_match:
144
- index_path = os.path.join(directory, filename)
145
- if index_path in indexes_list:
146
- matching_index_files.append(
147
- (
148
- index_path,
149
- os.path.getsize(index_path),
150
- " " not in filename,
151
- )
152
- )
153
 
154
- if matching_index_files:
155
- # Sort by favoring files without spaces and by size (largest size first)
156
- matching_index_files.sort(key=lambda x: (-x[2], -x[1]))
157
- best_match_index_path = matching_index_files[0][0]
158
- return best_match_index_path
159
 
 
 
 
 
 
 
 
160
  return ""
161
 
162
 
163
- def save_to_wav(record_button):
164
- if record_button is None:
165
- pass
166
- else:
167
- path_to_file = record_button
168
- new_name = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + ".wav"
169
- target_path = os.path.join(audio_root_relative, os.path.basename(new_name))
170
-
171
- shutil.move(path_to_file, target_path)
172
- return target_path
173
-
174
-
175
- def save_to_wav2(upload_audio):
176
- file_path = upload_audio
177
- target_path = os.path.join(audio_root_relative, os.path.basename(file_path))
178
-
179
- if os.path.exists(target_path):
180
- os.remove(target_path)
181
-
182
- shutil.copy(file_path, target_path)
183
- return target_path
184
-
185
-
186
- def delete_outputs():
187
- for root, _, files in os.walk(audio_root_relative, topdown=False):
188
- for name in files:
189
- if name.endswith(tuple(sup_audioext)) and name.__contains__("_output"):
190
- os.remove(os.path.join(root, name))
191
- gr.Info(f"Outputs cleared!")
192
-
193
-
194
  def tts_tab():
195
  default_weight = random.choice(names) if names else ""
196
  with gr.Row():
197
  with gr.Row():
198
  model_file = gr.Dropdown(
199
  label=i18n("Voice Model"),
 
200
  choices=sorted(names, key=lambda path: os.path.getsize(path)),
201
  interactive=True,
202
  value=default_weight,
@@ -205,6 +102,7 @@ def tts_tab():
205
  best_default_index_path = match_index(model_file.value)
206
  index_file = gr.Dropdown(
207
  label=i18n("Index File"),
 
208
  choices=get_indexes(),
209
  value=best_default_index_path,
210
  interactive=True,
@@ -215,13 +113,16 @@ def tts_tab():
215
  unload_button = gr.Button(i18n("Unload Voice"))
216
 
217
  unload_button.click(
218
- fn=lambda: ({"value": "", "__type__": "update"}),
 
 
 
219
  inputs=[],
220
- outputs=[model_file],
221
  )
222
 
223
  model_file.select(
224
- fn=match_index,
225
  inputs=[model_file],
226
  outputs=[index_file],
227
  )
@@ -234,6 +135,7 @@ def tts_tab():
234
 
235
  tts_voice = gr.Dropdown(
236
  label=i18n("TTS Voices"),
 
237
  choices=short_names,
238
  interactive=True,
239
  value=None,
@@ -241,10 +143,16 @@ def tts_tab():
241
 
242
  tts_text = gr.Textbox(
243
  label=i18n("Text to Synthesize"),
 
244
  placeholder=i18n("Enter text to synthesize"),
245
  lines=3,
246
  )
247
 
 
 
 
 
 
248
  with gr.Accordion(i18n("Advanced Settings"), open=False):
249
  with gr.Column():
250
  output_tts_path = gr.Textbox(
@@ -253,27 +161,74 @@ def tts_tab():
253
  value=os.path.join(now_dir, "assets", "audios", "tts_output.wav"),
254
  interactive=True,
255
  )
256
-
257
  output_rvc_path = gr.Textbox(
258
  label=i18n("Output Path for RVC Audio"),
259
  placeholder=i18n("Enter output path"),
260
  value=os.path.join(now_dir, "assets", "audios", "tts_rvc_output.wav"),
261
  interactive=True,
262
  )
263
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
264
  pitch = gr.Slider(
265
  minimum=-24,
266
  maximum=24,
267
  step=1,
268
  label=i18n("Pitch"),
 
 
 
269
  value=0,
270
  interactive=True,
271
  )
272
  filter_radius = gr.Slider(
273
  minimum=0,
274
  maximum=7,
275
- label=i18n(
276
- "If >=3: apply median filtering to the harvested pitch results. The value represents the filter radius and can reduce breathiness"
 
277
  ),
278
  value=3,
279
  step=1,
@@ -283,43 +238,90 @@ def tts_tab():
283
  minimum=0,
284
  maximum=1,
285
  label=i18n("Search Feature Ratio"),
 
 
 
286
  value=0.75,
287
  interactive=True,
288
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
289
  hop_length = gr.Slider(
290
  minimum=1,
291
  maximum=512,
292
  step=1,
293
  label=i18n("Hop Length"),
 
 
 
294
  value=128,
295
  interactive=True,
296
  )
297
- with gr.Column():
298
- f0method = gr.Radio(
299
- label=i18n("Pitch extraction algorithm"),
300
- choices=[
301
- "pm",
302
- "harvest",
303
- "dio",
304
- "crepe",
305
- "crepe-tiny",
306
- "rmvpe",
307
- ],
308
- value="rmvpe",
309
- interactive=True,
310
- )
 
 
 
 
 
311
 
312
  convert_button1 = gr.Button(i18n("Convert"))
313
 
314
  with gr.Row(): # Defines output info + output audio download after conversion
315
- vc_output1 = gr.Textbox(label=i18n("Output Information"))
 
 
 
316
  vc_output2 = gr.Audio(label=i18n("Export Audio"))
317
 
 
 
 
 
 
 
 
 
318
  refresh_button.click(
319
  fn=change_choices,
320
  inputs=[],
321
  outputs=[model_file, index_file],
322
  )
 
 
 
 
 
323
  convert_button1.click(
324
  fn=run_tts_script,
325
  inputs=[
@@ -328,12 +330,19 @@ def tts_tab():
328
  pitch,
329
  filter_radius,
330
  index_rate,
 
 
331
  hop_length,
332
  f0method,
333
  output_tts_path,
334
  output_rvc_path,
335
  model_file,
336
  index_file,
 
 
 
 
 
337
  ],
338
  outputs=[vc_output1, vc_output2],
339
  )
 
2
  import gradio as gr
3
  import regex as re
4
  import json
 
 
5
  import random
6
 
7
  from core import (
 
16
  sys.path.append(now_dir)
17
 
18
  model_root = os.path.join(now_dir, "logs")
 
 
19
  model_root_relative = os.path.relpath(model_root, now_dir)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
 
21
  names = [
22
  os.path.join(root, file)
 
35
  if name.endswith(".index") and "trained" not in name
36
  ]
37
 
 
 
 
 
 
 
 
 
 
38
 
39
  def change_choices():
40
  names = [
 
53
  for name in files
54
  if name.endswith(".index") and "trained" not in name
55
  ]
 
 
 
 
 
 
 
 
 
56
  return (
57
  {"choices": sorted(names), "__type__": "update"},
58
  {"choices": sorted(indexes_list), "__type__": "update"},
 
59
  )
60
 
61
 
 
70
  return indexes_list if indexes_list else ""
71
 
72
 
73
+ def process_input(file_path):
74
+ with open(file_path, "r") as file:
75
+ file_contents = file.read()
76
+ gr.Info(f"The text from the txt file has been loaded!")
77
+ return file_contents, None
 
 
 
 
 
 
78
 
 
 
 
 
 
79
 
80
+ def match_index(model_file_value):
81
+ if model_file_value:
82
+ model_folder = os.path.dirname(model_file_value)
83
+ index_files = get_indexes()
84
+ for index_file in index_files:
85
+ if os.path.dirname(index_file) == model_folder:
86
+ return index_file
87
  return ""
88
 
89
 
 
 
 
 
 
 
 
 
90
  def tts_tab():
91
  default_weight = random.choice(names) if names else ""
92
  with gr.Row():
93
  with gr.Row():
94
  model_file = gr.Dropdown(
95
  label=i18n("Voice Model"),
96
+ info=i18n("Select the voice model to use for the conversion."),
97
  choices=sorted(names, key=lambda path: os.path.getsize(path)),
98
  interactive=True,
99
  value=default_weight,
 
102
  best_default_index_path = match_index(model_file.value)
103
  index_file = gr.Dropdown(
104
  label=i18n("Index File"),
105
+ info=i18n("Select the index file to use for the conversion."),
106
  choices=get_indexes(),
107
  value=best_default_index_path,
108
  interactive=True,
 
113
  unload_button = gr.Button(i18n("Unload Voice"))
114
 
115
  unload_button.click(
116
+ fn=lambda: (
117
+ {"value": "", "__type__": "update"},
118
+ {"value": "", "__type__": "update"},
119
+ ),
120
  inputs=[],
121
+ outputs=[model_file, index_file],
122
  )
123
 
124
  model_file.select(
125
+ fn=lambda model_file_value: match_index(model_file_value),
126
  inputs=[model_file],
127
  outputs=[index_file],
128
  )
 
135
 
136
  tts_voice = gr.Dropdown(
137
  label=i18n("TTS Voices"),
138
+ info=i18n("Select the TTS voice to use for the conversion."),
139
  choices=short_names,
140
  interactive=True,
141
  value=None,
 
143
 
144
  tts_text = gr.Textbox(
145
  label=i18n("Text to Synthesize"),
146
+ info=i18n("Enter the text to synthesize."),
147
  placeholder=i18n("Enter text to synthesize"),
148
  lines=3,
149
  )
150
 
151
+ txt_file = gr.File(
152
+ label=i18n("Or you can upload a .txt file"),
153
+ type="filepath",
154
+ )
155
+
156
  with gr.Accordion(i18n("Advanced Settings"), open=False):
157
  with gr.Column():
158
  output_tts_path = gr.Textbox(
 
161
  value=os.path.join(now_dir, "assets", "audios", "tts_output.wav"),
162
  interactive=True,
163
  )
 
164
  output_rvc_path = gr.Textbox(
165
  label=i18n("Output Path for RVC Audio"),
166
  placeholder=i18n("Enter output path"),
167
  value=os.path.join(now_dir, "assets", "audios", "tts_rvc_output.wav"),
168
  interactive=True,
169
  )
170
+ export_format = gr.Radio(
171
+ label=i18n("Export Format"),
172
+ info=i18n("Select the format to export the audio."),
173
+ choices=["WAV", "MP3", "FLAC", "OGG", "M4A"],
174
+ value="WAV",
175
+ interactive=True,
176
+ )
177
+ split_audio = gr.Checkbox(
178
+ label=i18n("Split Audio"),
179
+ info=i18n(
180
+ "Split the audio into chunks for inference to obtain better results in some cases."
181
+ ),
182
+ visible=True,
183
+ value=False,
184
+ interactive=True,
185
+ )
186
+ autotune = gr.Checkbox(
187
+ label=i18n("Autotune"),
188
+ info=i18n(
189
+ "Apply a soft autotune to your inferences, recommended for singing conversions."
190
+ ),
191
+ visible=True,
192
+ value=False,
193
+ interactive=True,
194
+ )
195
+ clean_audio = gr.Checkbox(
196
+ label=i18n("Clean Audio"),
197
+ info=i18n(
198
+ "Clean your audio output using noise detection algorithms, recommended for speaking audios."
199
+ ),
200
+ visible=True,
201
+ value=True,
202
+ interactive=True,
203
+ )
204
+ clean_strength = gr.Slider(
205
+ minimum=0,
206
+ maximum=1,
207
+ label=i18n("Clean Strength"),
208
+ info=i18n(
209
+ "Set the clean-up level to the audio you want, the more you increase it the more it will clean up, but it is possible that the audio will be more compressed."
210
+ ),
211
+ visible=True,
212
+ value=0.5,
213
+ interactive=True,
214
+ )
215
  pitch = gr.Slider(
216
  minimum=-24,
217
  maximum=24,
218
  step=1,
219
  label=i18n("Pitch"),
220
+ info=i18n(
221
+ "Set the pitch of the audio, the higher the value, the higher the pitch."
222
+ ),
223
  value=0,
224
  interactive=True,
225
  )
226
  filter_radius = gr.Slider(
227
  minimum=0,
228
  maximum=7,
229
+ label=i18n("Filter Radius"),
230
+ info=i18n(
231
+ "If the number is greater than or equal to three, employing median filtering on the collected tone results has the potential to decrease respiration."
232
  ),
233
  value=3,
234
  step=1,
 
238
  minimum=0,
239
  maximum=1,
240
  label=i18n("Search Feature Ratio"),
241
+ info=i18n(
242
+ "Influence exerted by the index file; a higher value corresponds to greater influence. However, opting for lower values can help mitigate artifacts present in the audio."
243
+ ),
244
  value=0.75,
245
  interactive=True,
246
  )
247
+ rms_mix_rate = gr.Slider(
248
+ minimum=0,
249
+ maximum=1,
250
+ label=i18n("Volume Envelope"),
251
+ info=i18n(
252
+ "Substitute or blend with the volume envelope of the output. The closer the ratio is to 1, the more the output envelope is employed."
253
+ ),
254
+ value=1,
255
+ interactive=True,
256
+ )
257
+ protect = gr.Slider(
258
+ minimum=0,
259
+ maximum=0.5,
260
+ label=i18n("Protect Voiceless Consonants"),
261
+ info=i18n(
262
+ "Safeguard distinct consonants and breathing sounds to prevent electro-acoustic tearing and other artifacts. Pulling the parameter to its maximum value of 0.5 offers comprehensive protection. However, reducing this value might decrease the extent of protection while potentially mitigating the indexing effect."
263
+ ),
264
+ value=0.5,
265
+ interactive=True,
266
+ )
267
  hop_length = gr.Slider(
268
  minimum=1,
269
  maximum=512,
270
  step=1,
271
  label=i18n("Hop Length"),
272
+ info=i18n(
273
+ "Denotes the duration it takes for the system to transition to a significant pitch change. Smaller hop lengths require more time for inference but tend to yield higher pitch accuracy."
274
+ ),
275
  value=128,
276
  interactive=True,
277
  )
278
+ with gr.Column():
279
+ f0method = gr.Radio(
280
+ label=i18n("Pitch extraction algorithm"),
281
+ info=i18n(
282
+ "Pitch extraction algorithm to use for the audio conversion. The default algorithm is rmvpe, which is recommended for most cases."
283
+ ),
284
+ choices=[
285
+ "pm",
286
+ "harvest",
287
+ "dio",
288
+ "crepe",
289
+ "crepe-tiny",
290
+ "rmvpe",
291
+ "fcpe",
292
+ "hybrid[rmvpe+fcpe]",
293
+ ],
294
+ value="rmvpe",
295
+ interactive=True,
296
+ )
297
 
298
  convert_button1 = gr.Button(i18n("Convert"))
299
 
300
  with gr.Row(): # Defines output info + output audio download after conversion
301
+ vc_output1 = gr.Textbox(
302
+ label=i18n("Output Information"),
303
+ info=i18n("The output information will be displayed here."),
304
+ )
305
  vc_output2 = gr.Audio(label=i18n("Export Audio"))
306
 
307
+ def toggle_visible(checkbox):
308
+ return {"visible": checkbox, "__type__": "update"}
309
+
310
+ clean_audio.change(
311
+ fn=toggle_visible,
312
+ inputs=[clean_audio],
313
+ outputs=[clean_strength],
314
+ )
315
  refresh_button.click(
316
  fn=change_choices,
317
  inputs=[],
318
  outputs=[model_file, index_file],
319
  )
320
+ txt_file.upload(
321
+ fn=process_input,
322
+ inputs=[txt_file],
323
+ outputs=[tts_text, txt_file],
324
+ )
325
  convert_button1.click(
326
  fn=run_tts_script,
327
  inputs=[
 
330
  pitch,
331
  filter_radius,
332
  index_rate,
333
+ rms_mix_rate,
334
+ protect,
335
  hop_length,
336
  f0method,
337
  output_tts_path,
338
  output_rvc_path,
339
  model_file,
340
  index_file,
341
+ split_audio,
342
+ autotune,
343
+ clean_audio,
344
+ clean_strength,
345
+ export_format,
346
  ],
347
  outputs=[vc_output1, vc_output2],
348
  )
tabs/voice_blender/voice_blender.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
1
+ import os, sys
2
+ import gradio as gr
3
+ import shutil
4
+
5
+ now_dir = os.getcwd()
6
+ sys.path.append(now_dir)
7
+
8
+ from assets.i18n.i18n import I18nAuto
9
+ from core import run_model_blender_script
10
+
11
+ i18n = I18nAuto()
12
+
13
+
14
+ def update_model_fusion(dropbox):
15
+ return dropbox, None
16
+
17
+
18
+ def voice_blender_tab():
19
+ gr.Markdown(i18n("## Voice Blender"))
20
+ gr.Markdown(
21
+ i18n(
22
+ "Select two voice models, set your desired blend percentage, and blend them into an entirely new voice."
23
+ )
24
+ )
25
+ with gr.Column():
26
+ model_fusion_name = gr.Textbox(
27
+ label=i18n("Model Name"),
28
+ info=i18n("Name of the new model."),
29
+ value="",
30
+ max_lines=1,
31
+ interactive=True,
32
+ placeholder=i18n("Enter model name"),
33
+ )
34
+ with gr.Row():
35
+ with gr.Column():
36
+ model_fusion_a_dropbox = gr.File(
37
+ label=i18n("Drag and drop your model here"), type="filepath"
38
+ )
39
+ model_fusion_a = gr.Textbox(
40
+ label=i18n("Path to Model"),
41
+ value="",
42
+ interactive=True,
43
+ placeholder=i18n("Enter path to model"),
44
+ info=i18n("You can also use a custom path."),
45
+ )
46
+ with gr.Column():
47
+ model_fusion_b_dropbox = gr.File(
48
+ label=i18n("Drag and drop your model here"), type="filepath"
49
+ )
50
+ model_fusion_b = gr.Textbox(
51
+ label=i18n("Path to Model"),
52
+ value="",
53
+ interactive=True,
54
+ placeholder=i18n("Enter path to model"),
55
+ info=i18n("You can also use a custom path."),
56
+ )
57
+ alpha_a = gr.Slider(
58
+ minimum=0,
59
+ maximum=1,
60
+ label=i18n("Blend Ratio"),
61
+ value=0.5,
62
+ interactive=True,
63
+ info=i18n(
64
+ "Adjusting the position more towards one side or the other will make the model more similar to the first or second."
65
+ ),
66
+ )
67
+ model_fusion_button = gr.Button(i18n("Fusion"), variant="primary")
68
+ with gr.Row():
69
+ model_fusion_output_info = gr.Textbox(
70
+ label=i18n("Output Information"),
71
+ info=i18n("The output information will be displayed here."),
72
+ value="",
73
+ )
74
+ model_fusion_pth_output = gr.File(
75
+ label=i18n("Download Model"), type="filepath", interactive=False
76
+ )
77
+
78
+ model_fusion_button.click(
79
+ fn=run_model_blender_script,
80
+ inputs=[
81
+ model_fusion_name,
82
+ model_fusion_a,
83
+ model_fusion_b,
84
+ alpha_a,
85
+ ],
86
+ outputs=[model_fusion_output_info, model_fusion_pth_output],
87
+ )
88
+
89
+ model_fusion_a_dropbox.upload(
90
+ fn=update_model_fusion,
91
+ inputs=model_fusion_a_dropbox,
92
+ outputs=[model_fusion_a, model_fusion_a_dropbox],
93
+ )
94
+
95
+ model_fusion_b_dropbox.upload(
96
+ fn=update_model_fusion,
97
+ inputs=model_fusion_b_dropbox,
98
+ outputs=[model_fusion_b, model_fusion_b_dropbox],
99
+ )
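For completeness, the Fusion button feeds four values into run_model_blender_script from core and binds two outputs, so the blender can presumably also be driven from a script. A hedged sketch with the argument order taken from the inputs list above (paths are placeholders):

    # Illustrative only: calling the voice blender outside the Gradio UI.
    from core import run_model_blender_script

    message, blended_pth = run_model_blender_script(
        "my-blended-voice",           # model_fusion_name
        "logs/voice_a/voice_a.pth",   # path to model A (placeholder)
        "logs/voice_b/voice_b.pth",   # path to model B (placeholder)
        0.5,                          # alpha_a, the blend ratio
    )
    print(message, blended_pth)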