Aitron Emper committed on
Commit 7e361e2 (1 parent: c99d8c1)

Update core.py

Files changed (1)
  1. core.py +1392 -1390
core.py CHANGED
@@ -1,1390 +1,1392 @@
1
- import os
2
- import sys
3
- import json
4
- import argparse
5
- import subprocess
6
-
7
- now_dir = os.getcwd()
8
- sys.path.append(now_dir)
9
-
10
- from rvc.configs.config import Config
11
-
12
- from rvc.lib.tools.prerequisites_download import prequisites_download_pipeline
13
- from rvc.train.extract.preparing_files import generate_config, generate_filelist
14
- from rvc.lib.tools.pretrained_selector import pretrained_selector
15
-
16
- from rvc.train.process.model_blender import model_blender
17
- from rvc.train.process.model_information import model_information
18
- from rvc.train.process.extract_small_model import extract_small_model
19
-
20
- from rvc.infer.infer import infer_pipeline
21
-
22
- from rvc.lib.tools.analyzer import analyze_audio
23
-
24
- from rvc.lib.tools.launch_tensorboard import launch_tensorboard_pipeline
25
-
26
- from rvc.lib.tools.model_download import model_download_pipeline
27
-
28
- config = Config()
29
- current_script_directory = os.path.dirname(os.path.realpath(__file__))
30
- logs_path = os.path.join(current_script_directory, "logs")
31
-
32
- # Get TTS Voices
33
- with open(os.path.join("rvc", "lib", "tools", "tts_voices.json"), "r") as f:
34
- voices_data = json.load(f)
35
-
36
- locales = list({voice["Locale"] for voice in voices_data})
37
-
38
-
39
- # Infer
40
- def run_infer_script(
41
- f0up_key,
42
- filter_radius,
43
- index_rate,
44
- rms_mix_rate,
45
- protect,
46
- hop_length,
47
- f0method,
48
- input_path,
49
- output_path,
50
- pth_path,
51
- index_path,
52
- split_audio,
53
- f0autotune,
54
- clean_audio,
55
- clean_strength,
56
- export_format,
57
- embedder_model,
58
- embedder_model_custom,
59
- upscale_audio,
60
- ):
61
- f0autotune = "True" if str(f0autotune) == "True" else "False"
62
- clean_audio = "True" if str(clean_audio) == "True" else "False"
63
- upscale_audio = "True" if str(upscale_audio) == "True" else "False"
64
- infer_pipeline(
65
- f0up_key,
66
- filter_radius,
67
- index_rate,
68
- rms_mix_rate,
69
- protect,
70
- hop_length,
71
- f0method,
72
- input_path,
73
- output_path,
74
- pth_path,
75
- index_path,
76
- split_audio,
77
- f0autotune,
78
- clean_audio,
79
- clean_strength,
80
- export_format,
81
- embedder_model,
82
- embedder_model_custom,
83
- upscale_audio,
84
- )
85
- return f"File {input_path} inferred successfully.", output_path.replace(
86
- ".wav", f".{export_format.lower()}"
87
- )
88
-
89
-
90
- # Batch infer
91
- def run_batch_infer_script(
92
- f0up_key,
93
- filter_radius,
94
- index_rate,
95
- rms_mix_rate,
96
- protect,
97
- hop_length,
98
- f0method,
99
- input_folder,
100
- output_folder,
101
- pth_path,
102
- index_path,
103
- split_audio,
104
- f0autotune,
105
- clean_audio,
106
- clean_strength,
107
- export_format,
108
- embedder_model,
109
- embedder_model_custom,
110
- upscale_audio,
111
- ):
112
- f0autotune = "True" if str(f0autotune) == "True" else "False"
113
- clean_audio = "True" if str(clean_audio) == "True" else "False"
114
- upscale_audio = "True" if str(upscale_audio) == "True" else "False"
115
- audio_files = [
116
- f for f in os.listdir(input_folder) if f.endswith((".mp3", ".wav", ".flac"))
117
- ]
118
- print(f"Detected {len(audio_files)} audio files for inference.")
119
-
120
- for audio_file in audio_files:
121
- if "_output" in audio_file:
122
- pass
123
- else:
124
- input_path = os.path.join(input_folder, audio_file)
125
- output_file_name = os.path.splitext(os.path.basename(audio_file))[0]
126
- output_path = os.path.join(
127
- output_folder,
128
- f"{output_file_name}_output{os.path.splitext(audio_file)[1]}",
129
- )
130
- print(f"Inferring {input_path}...")
131
-
132
- infer_pipeline(
133
- f0up_key,
134
- filter_radius,
135
- index_rate,
136
- rms_mix_rate,
137
- protect,
138
- hop_length,
139
- f0method,
140
- input_path,
141
- output_path,
142
- pth_path,
143
- index_path,
144
- split_audio,
145
- f0autotune,
146
- clean_audio,
147
- clean_strength,
148
- export_format,
149
- embedder_model,
150
- embedder_model_custom,
151
- upscale_audio,
152
- )
153
-
154
- return f"Files from {input_folder} inferred successfully."
155
-
156
-
157
- # TTS
158
- def run_tts_script(
159
- tts_text,
160
- tts_voice,
161
- tts_rate,
162
- f0up_key,
163
- filter_radius,
164
- index_rate,
165
- rms_mix_rate,
166
- protect,
167
- hop_length,
168
- f0method,
169
- output_tts_path,
170
- output_rvc_path,
171
- pth_path,
172
- index_path,
173
- split_audio,
174
- f0autotune,
175
- clean_audio,
176
- clean_strength,
177
- export_format,
178
- embedder_model,
179
- embedder_model_custom,
180
- upscale_audio,
181
- ):
182
- f0autotune = "True" if str(f0autotune) == "True" else "False"
183
- clean_audio = "True" if str(clean_audio) == "True" else "False"
184
- upscale_audio = "True" if str(upscale_audio) == "True" else "False"
185
- tts_script_path = os.path.join("rvc", "lib", "tools", "tts.py")
186
-
187
- if os.path.exists(output_tts_path):
188
- os.remove(output_tts_path)
189
-
190
- command_tts = [
191
- "python",
192
- tts_script_path,
193
- tts_text,
194
- tts_voice,
195
- str(tts_rate),
196
- output_tts_path,
197
- ]
198
- subprocess.run(command_tts)
199
-
200
- infer_pipeline(
201
- f0up_key,
202
- filter_radius,
203
- index_rate,
204
- rms_mix_rate,
205
- protect,
206
- hop_length,
207
- f0method,
208
- output_tts_path,
209
- output_rvc_path,
210
- pth_path,
211
- index_path,
212
- split_audio,
213
- f0autotune,
214
- clean_audio,
215
- clean_strength,
216
- export_format,
217
- embedder_model,
218
- embedder_model_custom,
219
- upscale_audio,
220
- )
221
-
222
- return f"Text {tts_text} synthesized successfully.", output_rvc_path.replace(
223
- ".wav", f".{export_format.lower()}"
224
- )
225
-
226
-
227
- # Preprocess
228
- def run_preprocess_script(model_name, dataset_path, sampling_rate):
229
- per = 3.0 if config.is_half else 3.7
230
- preprocess_script_path = os.path.join("rvc", "train", "preprocess", "preprocess.py")
231
- command = [
232
- "python",
233
- preprocess_script_path,
234
- *map(
235
- str,
236
- [
237
- os.path.join(logs_path, model_name),
238
- dataset_path,
239
- sampling_rate,
240
- per,
241
- ],
242
- ),
243
- ]
244
-
245
- os.makedirs(os.path.join(logs_path, model_name), exist_ok=True)
246
- subprocess.run(command)
247
- return f"Model {model_name} preprocessed successfully."
248
-
249
-
250
- # Extract
251
- def run_extract_script(
252
- model_name,
253
- rvc_version,
254
- f0method,
255
- hop_length,
256
- sampling_rate,
257
- embedder_model,
258
- embedder_model_custom,
259
- ):
260
- model_path = os.path.join(logs_path, model_name)
261
- extract_f0_script_path = os.path.join(
262
- "rvc", "train", "extract", "extract_f0_print.py"
263
- )
264
- extract_feature_script_path = os.path.join(
265
- "rvc", "train", "extract", "extract_feature_print.py"
266
- )
267
-
268
- command_1 = [
269
- "python",
270
- extract_f0_script_path,
271
- *map(
272
- str,
273
- [
274
- model_path,
275
- f0method,
276
- hop_length,
277
- ],
278
- ),
279
- ]
280
- command_2 = [
281
- "python",
282
- extract_feature_script_path,
283
- *map(
284
- str,
285
- [
286
- config.device,
287
- "1",
288
- "0",
289
- "0",
290
- model_path,
291
- rvc_version,
292
- "True",
293
- embedder_model,
294
- embedder_model_custom,
295
- ],
296
- ),
297
- ]
298
- subprocess.run(command_1)
299
- subprocess.run(command_2)
300
-
301
- generate_config(rvc_version, sampling_rate, model_path)
302
- generate_filelist(f0method, model_path, rvc_version, sampling_rate)
303
- return f"Model {model_name} extracted successfully."
304
-
305
-
306
- # Train
307
- def run_train_script(
308
- model_name,
309
- rvc_version,
310
- save_every_epoch,
311
- save_only_latest,
312
- save_every_weights,
313
- total_epoch,
314
- sampling_rate,
315
- batch_size,
316
- gpu,
317
- pitch_guidance,
318
- overtraining_detector,
319
- overtraining_threshold,
320
- pretrained,
321
- custom_pretrained,
322
- sync_graph,
323
- g_pretrained_path=None,
324
- d_pretrained_path=None,
325
- ):
326
- f0 = 1 if str(pitch_guidance) == "True" else 0
327
- latest = 1 if str(save_only_latest) == "True" else 0
328
- save_every = 1 if str(save_every_weights) == "True" else 0
329
- detector = 1 if str(overtraining_detector) == "True" else 0
330
- sync = 1 if str(sync_graph) == "True" else 0
331
-
332
- if str(pretrained) == "True":
333
- if str(custom_pretrained) == "False":
334
- pg, pd = pretrained_selector(f0)[rvc_version][sampling_rate]
335
- else:
336
- if g_pretrained_path is None or d_pretrained_path is None:
337
- raise ValueError(
338
- "Please provide the path to the pretrained G and D models."
339
- )
340
- pg, pd = g_pretrained_path, d_pretrained_path
341
- else:
342
- pg, pd = "", ""
343
-
344
- train_script_path = os.path.join("rvc", "train", "train.py")
345
- command = [
346
- "python",
347
- train_script_path,
348
- *map(
349
- str,
350
- [
351
- "-se",
352
- save_every_epoch,
353
- "-te",
354
- total_epoch,
355
- "-pg",
356
- pg,
357
- "-pd",
358
- pd,
359
- "-sr",
360
- sampling_rate,
361
- "-bs",
362
- batch_size,
363
- "-g",
364
- gpu,
365
- "-e",
366
- os.path.join(logs_path, model_name),
367
- "-v",
368
- rvc_version,
369
- "-l",
370
- latest,
371
- "-c",
372
- "0",
373
- "-sw",
374
- save_every,
375
- "-f0",
376
- f0,
377
- "-od",
378
- detector,
379
- "-ot",
380
- overtraining_threshold,
381
- "-sg",
382
- sync,
383
- ],
384
- ),
385
- ]
386
-
387
- subprocess.run(command)
388
- run_index_script(model_name, rvc_version)
389
- return f"Model {model_name} trained successfully."
390
-
391
-
392
- # Index
393
- def run_index_script(model_name, rvc_version):
394
- index_script_path = os.path.join("rvc", "train", "process", "extract_index.py")
395
- command = [
396
- "python",
397
- index_script_path,
398
- os.path.join(logs_path, model_name),
399
- rvc_version,
400
- ]
401
-
402
- subprocess.run(command)
403
- return f"Index file for {model_name} generated successfully."
404
-
405
-
406
- # Model extract
407
- def run_model_extract_script(
408
- pth_path, model_name, sampling_rate, pitch_guidance, rvc_version, epoch, step
409
- ):
410
- f0 = 1 if str(pitch_guidance) == "True" else 0
411
- extract_small_model(
412
- pth_path, model_name, sampling_rate, f0, rvc_version, epoch, step
413
- )
414
- return f"Model {model_name} extracted successfully."
415
-
416
-
417
- # Model information
418
- def run_model_information_script(pth_path):
419
- print(model_information(pth_path))
420
-
421
-
422
- # Model blender
423
- def run_model_blender_script(model_name, pth_path_1, pth_path_2, ratio):
424
- message, model_blended = model_blender(model_name, pth_path_1, pth_path_2, ratio)
425
- return message, model_blended
426
-
427
-
428
- # Tensorboard
429
- def run_tensorboard_script():
430
- launch_tensorboard_pipeline()
431
-
432
-
433
- # Download
434
- def run_download_script(model_link):
435
- model_download_pipeline(model_link)
436
- return f"Model downloaded successfully."
437
-
438
-
439
- # Prerequisites
440
- def run_prerequisites_script(pretraineds_v1, pretraineds_v2, models, exe):
441
- prequisites_download_pipeline(pretraineds_v1, pretraineds_v2, models, exe)
442
- return "Prerequisites installed successfully."
443
-
444
-
445
- # Audio analyzer
446
- def run_audio_analyzer_script(input_path, save_plot_path="logs/audio_analysis.png"):
447
- audio_info, plot_path = analyze_audio(input_path, save_plot_path)
448
- print(
449
- f"Audio info of {input_path}: {audio_info}",
450
- f"Audio file {input_path} analyzed successfully. Plot saved at: {plot_path}",
451
- )
452
- return audio_info, plot_path
453
-
454
-
455
- # API
456
- def run_api_script(ip, port):
457
- command = [
458
- "env/Scripts/uvicorn.exe" if os.name == "nt" else "uvicorn",
459
- "api:app",
460
- "--host",
461
- ip,
462
- "--port",
463
- port,
464
- ]
465
- subprocess.run(command)
466
-
467
-
468
- # Parse arguments
469
- def parse_arguments():
470
- parser = argparse.ArgumentParser(
471
- description="Run the main.py script with specific parameters."
472
- )
473
- subparsers = parser.add_subparsers(
474
- title="subcommands", dest="mode", help="Choose a mode"
475
- )
476
-
477
- # Parser for 'infer' mode
478
- infer_parser = subparsers.add_parser("infer", help="Run inference")
479
- infer_parser.add_argument(
480
- "--f0up_key",
481
- type=str,
482
- help="Value for f0up_key",
483
- choices=[str(i) for i in range(-24, 25)],
484
- default="0",
485
- )
486
- infer_parser.add_argument(
487
- "--filter_radius",
488
- type=str,
489
- help="Value for filter_radius",
490
- choices=[str(i) for i in range(11)],
491
- default="3",
492
- )
493
- infer_parser.add_argument(
494
- "--index_rate",
495
- type=str,
496
- help="Value for index_rate",
497
- choices=[str(i / 10) for i in range(11)],
498
- default="0.3",
499
- )
500
- infer_parser.add_argument(
501
- "--rms_mix_rate",
502
- type=str,
503
- help="Value for rms_mix_rate",
504
- choices=[str(i / 10) for i in range(11)],
505
- default="1",
506
- )
507
- infer_parser.add_argument(
508
- "--protect",
509
- type=str,
510
- help="Value for protect",
511
- choices=[str(i / 10) for i in range(6)],
512
- default="0.33",
513
- )
514
- infer_parser.add_argument(
515
- "--hop_length",
516
- type=str,
517
- help="Value for hop_length",
518
- choices=[str(i) for i in range(1, 513)],
519
- default="128",
520
- )
521
- infer_parser.add_argument(
522
- "--f0method",
523
- type=str,
524
- help="Value for f0method",
525
- choices=[
526
- "pm",
527
- "harvest",
528
- "dio",
529
- "crepe",
530
- "crepe-tiny",
531
- "rmvpe",
532
- "fcpe",
533
- "hybrid[crepe+rmvpe]",
534
- "hybrid[crepe+fcpe]",
535
- "hybrid[rmvpe+fcpe]",
536
- "hybrid[crepe+rmvpe+fcpe]",
537
- ],
538
- default="rmvpe",
539
- )
540
- infer_parser.add_argument("--input_path", type=str, help="Input path")
541
- infer_parser.add_argument("--output_path", type=str, help="Output path")
542
- infer_parser.add_argument("--pth_path", type=str, help="Path to the .pth file")
543
- infer_parser.add_argument(
544
- "--index_path",
545
- type=str,
546
- help="Path to the .index file",
547
- )
548
- infer_parser.add_argument(
549
- "--split_audio",
550
- type=str,
551
- help="Enable split audio",
552
- choices=["True", "False"],
553
- default="False",
554
- )
555
- infer_parser.add_argument(
556
- "--f0autotune",
557
- type=str,
558
- help="Enable autotune",
559
- choices=["True", "False"],
560
- default="False",
561
- )
562
- infer_parser.add_argument(
563
- "--clean_audio",
564
- type=str,
565
- help="Enable clean audio",
566
- choices=["True", "False"],
567
- default="False",
568
- )
569
- infer_parser.add_argument(
570
- "--clean_strength",
571
- type=str,
572
- help="Value for clean_strength",
573
- choices=[str(i / 10) for i in range(11)],
574
- default="0.7",
575
- )
576
- infer_parser.add_argument(
577
- "--export_format",
578
- type=str,
579
- help="Export format",
580
- choices=["WAV", "MP3", "FLAC", "OGG", "M4A"],
581
- default="WAV",
582
- )
583
- infer_parser.add_argument(
584
- "--embedder_model",
585
- type=str,
586
- help="Embedder model",
587
- choices=["contentvec", "hubert", "custom"],
588
- default="hubert",
589
- )
590
- infer_parser.add_argument(
591
- "--embedder_model_custom",
592
- type=str,
593
- help="Custom Embedder model",
594
- default=None,
595
- )
596
- infer_parser.add_argument(
597
- "--upscale_audio",
598
- type=str,
599
- help="Enable audio upscaling",
600
- choices=["True", "False"],
601
- default="False",
602
- )
603
-
604
- # Parser for 'batch_infer' mode
605
- batch_infer_parser = subparsers.add_parser(
606
- "batch_infer", help="Run batch inference"
607
- )
608
- batch_infer_parser.add_argument(
609
- "--f0up_key",
610
- type=str,
611
- help="Value for f0up_key",
612
- choices=[str(i) for i in range(-24, 25)],
613
- default="0",
614
- )
615
- batch_infer_parser.add_argument(
616
- "--filter_radius",
617
- type=str,
618
- help="Value for filter_radius",
619
- choices=[str(i) for i in range(11)],
620
- default="3",
621
- )
622
- batch_infer_parser.add_argument(
623
- "--index_rate",
624
- type=str,
625
- help="Value for index_rate",
626
- choices=[str(i / 10) for i in range(11)],
627
- default="0.3",
628
- )
629
- batch_infer_parser.add_argument(
630
- "--rms_mix_rate",
631
- type=str,
632
- help="Value for rms_mix_rate",
633
- choices=[str(i / 10) for i in range(11)],
634
- default="1",
635
- )
636
- batch_infer_parser.add_argument(
637
- "--protect",
638
- type=str,
639
- help="Value for protect",
640
- choices=[str(i / 10) for i in range(6)],
641
- default="0.33",
642
- )
643
- batch_infer_parser.add_argument(
644
- "--hop_length",
645
- type=str,
646
- help="Value for hop_length",
647
- choices=[str(i) for i in range(1, 513)],
648
- default="128",
649
- )
650
- batch_infer_parser.add_argument(
651
- "--f0method",
652
- type=str,
653
- help="Value for f0method",
654
- choices=[
655
- "pm",
656
- "harvest",
657
- "dio",
658
- "crepe",
659
- "crepe-tiny",
660
- "rmvpe",
661
- "fcpe",
662
- "hybrid[crepe+rmvpe]",
663
- "hybrid[crepe+fcpe]",
664
- "hybrid[rmvpe+fcpe]",
665
- "hybrid[crepe+rmvpe+fcpe]",
666
- ],
667
- default="rmvpe",
668
- )
669
- batch_infer_parser.add_argument("--input_folder", type=str, help="Input folder")
670
- batch_infer_parser.add_argument("--output_folder", type=str, help="Output folder")
671
- batch_infer_parser.add_argument(
672
- "--pth_path", type=str, help="Path to the .pth file"
673
- )
674
- batch_infer_parser.add_argument(
675
- "--index_path",
676
- type=str,
677
- help="Path to the .index file",
678
- )
679
- batch_infer_parser.add_argument(
680
- "--split_audio",
681
- type=str,
682
- help="Enable split audio",
683
- choices=["True", "False"],
684
- default="False",
685
- )
686
- batch_infer_parser.add_argument(
687
- "--f0autotune",
688
- type=str,
689
- help="Enable autotune",
690
- choices=["True", "False"],
691
- default="False",
692
- )
693
- batch_infer_parser.add_argument(
694
- "--clean_audio",
695
- type=str,
696
- help="Enable clean audio",
697
- choices=["True", "False"],
698
- default="False",
699
- )
700
- batch_infer_parser.add_argument(
701
- "--clean_strength",
702
- type=str,
703
- help="Value for clean_strength",
704
- choices=[str(i / 10) for i in range(11)],
705
- default="0.7",
706
- )
707
- batch_infer_parser.add_argument(
708
- "--export_format",
709
- type=str,
710
- help="Export format",
711
- choices=["WAV", "MP3", "FLAC", "OGG", "M4A"],
712
- default="WAV",
713
- )
714
- batch_infer_parser.add_argument(
715
- "--embedder_model",
716
- type=str,
717
- help="Embedder model",
718
- choices=["contentvec", "hubert", "custom"],
719
- default="hubert",
720
- )
721
- batch_infer_parser.add_argument(
722
- "--embedder_model_custom",
723
- type=str,
724
- help="Custom Embedder model",
725
- default=None,
726
- )
727
- batch_infer_parser.add_argument(
728
- "--upscale_audio",
729
- type=str,
730
- help="Enable audio upscaling",
731
- choices=["True", "False"],
732
- default="False",
733
- )
734
-
735
- # Parser for 'tts' mode
736
- tts_parser = subparsers.add_parser("tts", help="Run TTS")
737
- tts_parser.add_argument(
738
- "--tts_text",
739
- type=str,
740
- help="Text to be synthesized",
741
- )
742
- tts_parser.add_argument(
743
- "--tts_voice",
744
- type=str,
745
- help="Voice to be used",
746
- choices=locales,
747
- )
748
- tts_parser.add_argument(
749
- "--tts_rate",
750
- type=str,
751
- help="Increase or decrease TTS speed",
752
- choices=[str(i) for i in range(-100, 100)],
753
- default="0",
754
- )
755
- tts_parser.add_argument(
756
- "--f0up_key",
757
- type=str,
758
- help="Value for f0up_key",
759
- choices=[str(i) for i in range(-24, 25)],
760
- default="0",
761
- )
762
- tts_parser.add_argument(
763
- "--filter_radius",
764
- type=str,
765
- help="Value for filter_radius",
766
- choices=[str(i) for i in range(11)],
767
- default="3",
768
- )
769
- tts_parser.add_argument(
770
- "--index_rate",
771
- type=str,
772
- help="Value for index_rate",
773
- choices=[str(i / 10) for i in range(11)],
774
- default="0.3",
775
- )
776
- tts_parser.add_argument(
777
- "--rms_mix_rate",
778
- type=str,
779
- help="Value for rms_mix_rate",
780
- choices=[str(i / 10) for i in range(11)],
781
- default="1",
782
- )
783
- tts_parser.add_argument(
784
- "--protect",
785
- type=str,
786
- help="Value for protect",
787
- choices=[str(i / 10) for i in range(6)],
788
- default="0.33",
789
- )
790
- tts_parser.add_argument(
791
- "--hop_length",
792
- type=str,
793
- help="Value for hop_length",
794
- choices=[str(i) for i in range(1, 513)],
795
- default="128",
796
- )
797
- tts_parser.add_argument(
798
- "--f0method",
799
- type=str,
800
- help="Value for f0method",
801
- choices=[
802
- "pm",
803
- "harvest",
804
- "dio",
805
- "crepe",
806
- "crepe-tiny",
807
- "rmvpe",
808
- "fcpe",
809
- "hybrid[crepe+rmvpe]",
810
- "hybrid[crepe+fcpe]",
811
- "hybrid[rmvpe+fcpe]",
812
- "hybrid[crepe+rmvpe+fcpe]",
813
- ],
814
- default="rmvpe",
815
- )
816
- tts_parser.add_argument("--output_tts_path", type=str, help="Output tts path")
817
- tts_parser.add_argument("--output_rvc_path", type=str, help="Output rvc path")
818
- tts_parser.add_argument("--pth_path", type=str, help="Path to the .pth file")
819
- tts_parser.add_argument(
820
- "--index_path",
821
- type=str,
822
- help="Path to the .index file",
823
- )
824
- tts_parser.add_argument(
825
- "--split_audio",
826
- type=str,
827
- help="Enable split audio",
828
- choices=["True", "False"],
829
- default="False",
830
- )
831
- tts_parser.add_argument(
832
- "--f0autotune",
833
- type=str,
834
- help="Enable autotune",
835
- choices=["True", "False"],
836
- default="False",
837
- )
838
- tts_parser.add_argument(
839
- "--clean_audio",
840
- type=str,
841
- help="Enable clean audio",
842
- choices=["True", "False"],
843
- default="False",
844
- )
845
- tts_parser.add_argument(
846
- "--clean_strength",
847
- type=str,
848
- help="Value for clean_strength",
849
- choices=[str(i / 10) for i in range(11)],
850
- default="0.7",
851
- )
852
- tts_parser.add_argument(
853
- "--export_format",
854
- type=str,
855
- help="Export format",
856
- choices=["WAV", "MP3", "FLAC", "OGG", "M4A"],
857
- default="WAV",
858
- )
859
- tts_parser.add_argument(
860
- "--embedder_model",
861
- type=str,
862
- help="Embedder model",
863
- choices=["contentvec", "hubert", "custom"],
864
- default="hubert",
865
- )
866
- tts_parser.add_argument(
867
- "--embedder_model_custom",
868
- type=str,
869
- help="Custom Embedder model",
870
- default=None,
871
- )
872
- tts_parser.add_argument(
873
- "--upscale_audio",
874
- type=str,
875
- help="Enable audio upscaling",
876
- choices=["True", "False"],
877
- default="False",
878
- )
879
-
880
- # Parser for 'preprocess' mode
881
- preprocess_parser = subparsers.add_parser("preprocess", help="Run preprocessing")
882
- preprocess_parser.add_argument("--model_name", type=str, help="Name of the model")
883
- preprocess_parser.add_argument(
884
- "--dataset_path",
885
- type=str,
886
- help="Path to the dataset",
887
- )
888
- preprocess_parser.add_argument(
889
- "--sampling_rate",
890
- type=str,
891
- help="Sampling rate",
892
- choices=["32000", "40000", "48000"],
893
- )
894
-
895
- # Parser for 'extract' mode
896
- extract_parser = subparsers.add_parser("extract", help="Run extract")
897
- extract_parser.add_argument(
898
- "--model_name",
899
- type=str,
900
- help="Name of the model",
901
- )
902
- extract_parser.add_argument(
903
- "--rvc_version",
904
- type=str,
905
- help="Version of the model",
906
- choices=["v1", "v2"],
907
- default="v2",
908
- )
909
- extract_parser.add_argument(
910
- "--f0method",
911
- type=str,
912
- help="Value for f0method",
913
- choices=[
914
- "pm",
915
- "harvest",
916
- "dio",
917
- "crepe",
918
- "crepe-tiny",
919
- "rmvpe",
920
- ],
921
- default="rmvpe",
922
- )
923
- extract_parser.add_argument(
924
- "--hop_length",
925
- type=str,
926
- help="Value for hop_length",
927
- choices=[str(i) for i in range(1, 513)],
928
- default="128",
929
- )
930
- extract_parser.add_argument(
931
- "--sampling_rate",
932
- type=str,
933
- help="Sampling rate",
934
- choices=["32000", "40000", "48000"],
935
- )
936
- extract_parser.add_argument(
937
- "--embedder_model",
938
- type=str,
939
- help="Embedder model",
940
- choices=["contentvec", "hubert", "custom"],
941
- default="hubert",
942
- )
943
- extract_parser.add_argument(
944
- "--embedder_model_custom",
945
- type=str,
946
- help="Custom Embedder model",
947
- default=None,
948
- )
949
-
950
- # Parser for 'train' mode
951
- train_parser = subparsers.add_parser("train", help="Run training")
952
- train_parser.add_argument(
953
- "--model_name",
954
- type=str,
955
- help="Name of the model",
956
- )
957
- train_parser.add_argument(
958
- "--rvc_version",
959
- type=str,
960
- help="Version of the model",
961
- choices=["v1", "v2"],
962
- default="v2",
963
- )
964
- train_parser.add_argument(
965
- "--save_every_epoch",
966
- type=str,
967
- help="Save every epoch",
968
- choices=[str(i) for i in range(1, 101)],
969
- )
970
- train_parser.add_argument(
971
- "--save_only_latest",
972
- type=str,
973
- help="Save weight only at last epoch",
974
- choices=["True", "False"],
975
- default="False",
976
- )
977
- train_parser.add_argument(
978
- "--save_every_weights",
979
- type=str,
980
- help="Save weight every epoch",
981
- choices=["True", "False"],
982
- default="True",
983
- )
984
- train_parser.add_argument(
985
- "--total_epoch",
986
- type=str,
987
- help="Total epoch",
988
- choices=[str(i) for i in range(1, 10001)],
989
- default="1000",
990
- )
991
- train_parser.add_argument(
992
- "--sampling_rate",
993
- type=str,
994
- help="Sampling rate",
995
- choices=["32000", "40000", "48000"],
996
- )
997
- train_parser.add_argument(
998
- "--batch_size",
999
- type=str,
1000
- help="Batch size",
1001
- choices=[str(i) for i in range(1, 51)],
1002
- default="8",
1003
- )
1004
- train_parser.add_argument(
1005
- "--gpu",
1006
- type=str,
1007
- help="GPU number",
1008
- default="0",
1009
- )
1010
- train_parser.add_argument(
1011
- "--pitch_guidance",
1012
- type=str,
1013
- help="Pitch guidance",
1014
- choices=["True", "False"],
1015
- default="True",
1016
- )
1017
- train_parser.add_argument(
1018
- "--pretrained",
1019
- type=str,
1020
- help="Pretrained",
1021
- choices=["True", "False"],
1022
- default="True",
1023
- )
1024
- train_parser.add_argument(
1025
- "--custom_pretrained",
1026
- type=str,
1027
- help="Custom pretrained",
1028
- choices=["True", "False"],
1029
- default="False",
1030
- )
1031
- train_parser.add_argument(
1032
- "--g_pretrained_path",
1033
- type=str,
1034
- nargs="?",
1035
- default=None,
1036
- help="Path to the pretrained G file",
1037
- )
1038
- train_parser.add_argument(
1039
- "--d_pretrained_path",
1040
- type=str,
1041
- nargs="?",
1042
- default=None,
1043
- help="Path to the pretrained D file",
1044
- )
1045
- train_parser.add_argument(
1046
- "--overtraining_detector",
1047
- type=str,
1048
- help="Overtraining detector",
1049
- choices=["True", "False"],
1050
- default="False",
1051
- )
1052
- train_parser.add_argument(
1053
- "--overtraining_threshold",
1054
- type=str,
1055
- help="Overtraining threshold",
1056
- choices=[str(i) for i in range(1, 101)],
1057
- default="50",
1058
- )
1059
- train_parser.add_argument(
1060
- "--sync_graph",
1061
- type=str,
1062
- help="Sync graph",
1063
- choices=["True", "False"],
1064
- default="False",
1065
- )
1066
-
1067
- # Parser for 'index' mode
1068
- index_parser = subparsers.add_parser("index", help="Generate index file")
1069
- index_parser.add_argument(
1070
- "--model_name",
1071
- type=str,
1072
- help="Name of the model",
1073
- )
1074
- index_parser.add_argument(
1075
- "--rvc_version",
1076
- type=str,
1077
- help="Version of the model",
1078
- choices=["v1", "v2"],
1079
- default="v2",
1080
- )
1081
-
1082
- # Parser for 'model_extract' mode
1083
- model_extract_parser = subparsers.add_parser("model_extract", help="Extract model")
1084
- model_extract_parser.add_argument(
1085
- "--pth_path",
1086
- type=str,
1087
- help="Path to the .pth file",
1088
- )
1089
- model_extract_parser.add_argument(
1090
- "--model_name",
1091
- type=str,
1092
- help="Name of the model",
1093
- )
1094
- model_extract_parser.add_argument(
1095
- "--sampling_rate",
1096
- type=str,
1097
- help="Sampling rate",
1098
- choices=["40000", "48000"],
1099
- )
1100
- model_extract_parser.add_argument(
1101
- "--pitch_guidance",
1102
- type=str,
1103
- help="Pitch guidance",
1104
- choices=["True", "False"],
1105
- )
1106
- model_extract_parser.add_argument(
1107
- "--rvc_version",
1108
- type=str,
1109
- help="Version of the model",
1110
- choices=["v1", "v2"],
1111
- default="v2",
1112
- )
1113
- model_extract_parser.add_argument(
1114
- "--epoch",
1115
- type=str,
1116
- help="Epochs of the model",
1117
- choices=[str(i) for i in range(1, 10001)],
1118
- )
1119
- model_extract_parser.add_argument(
1120
- "--step",
1121
- type=str,
1122
- help="Steps of the model",
1123
- )
1124
-
1125
- # Parser for 'model_information' mode
1126
- model_information_parser = subparsers.add_parser(
1127
- "model_information", help="Print model information"
1128
- )
1129
- model_information_parser.add_argument(
1130
- "--pth_path",
1131
- type=str,
1132
- help="Path to the .pth file",
1133
- )
1134
-
1135
- # Parser for 'model_blender' mode
1136
- model_blender_parser = subparsers.add_parser(
1137
- "model_blender", help="Fuse two models"
1138
- )
1139
- model_blender_parser.add_argument(
1140
- "--model_name",
1141
- type=str,
1142
- help="Name of the model",
1143
- )
1144
- model_blender_parser.add_argument(
1145
- "--pth_path_1",
1146
- type=str,
1147
- help="Path to the first .pth file",
1148
- )
1149
- model_blender_parser.add_argument(
1150
- "--pth_path_2",
1151
- type=str,
1152
- help="Path to the second .pth file",
1153
- )
1154
- model_blender_parser.add_argument(
1155
- "--ratio",
1156
- type=str,
1157
- help="Value for blender ratio",
1158
- choices=[str(i / 10) for i in range(11)],
1159
- default="0.5",
1160
- )
1161
-
1162
- # Parser for 'tensorboard' mode
1163
- subparsers.add_parser("tensorboard", help="Run tensorboard")
1164
-
1165
- # Parser for 'download' mode
1166
- download_parser = subparsers.add_parser("download", help="Download models")
1167
- download_parser.add_argument(
1168
- "--model_link",
1169
- type=str,
1170
- help="Link of the model",
1171
- )
1172
-
1173
- # Parser for 'prerequisites' mode
1174
- prerequisites_parser = subparsers.add_parser(
1175
- "prerequisites", help="Install prerequisites"
1176
- )
1177
- prerequisites_parser.add_argument(
1178
- "--pretraineds_v1",
1179
- type=str,
1180
- choices=["True", "False"],
1181
- default="True",
1182
- help="Download pretrained models for v1",
1183
- )
1184
- prerequisites_parser.add_argument(
1185
- "--pretraineds_v2",
1186
- type=str,
1187
- choices=["True", "False"],
1188
- default="True",
1189
- help="Download pretrained models for v2",
1190
- )
1191
- prerequisites_parser.add_argument(
1192
- "--models",
1193
- type=str,
1194
- choices=["True", "False"],
1195
- default="True",
1196
- help="Donwload models",
1197
- )
1198
- prerequisites_parser.add_argument(
1199
- "--exe",
1200
- type=str,
1201
- choices=["True", "False"],
1202
- default="True",
1203
- help="Download executables",
1204
- )
1205
-
1206
- # Parser for 'audio_analyzer' mode
1207
- audio_analyzer = subparsers.add_parser("audio_analyzer", help="Run audio analyzer")
1208
- audio_analyzer.add_argument(
1209
- "--input_path",
1210
- type=str,
1211
- help="Path to the input audio file",
1212
- )
1213
-
1214
- # Parser for 'api' mode
1215
- api_parser = subparsers.add_parser("api", help="Run the API")
1216
- api_parser.add_argument(
1217
- "--host", type=str, help="Host address", default="127.0.0.1"
1218
- )
1219
- api_parser.add_argument("--port", type=str, help="Port", default="8000")
1220
-
1221
- return parser.parse_args()
1222
-
1223
-
1224
- def main():
1225
- if len(sys.argv) == 1:
1226
- print("Please run the script with '-h' for more information.")
1227
- sys.exit(1)
1228
-
1229
- args = parse_arguments()
1230
-
1231
- try:
1232
- if args.mode == "infer":
1233
- run_infer_script(
1234
- str(args.f0up_key),
1235
- str(args.filter_radius),
1236
- str(args.index_rate),
1237
- str(args.rms_mix_rate),
1238
- str(args.protect),
1239
- str(args.hop_length),
1240
- str(args.f0method),
1241
- str(args.input_path),
1242
- str(args.output_path),
1243
- str(args.pth_path),
1244
- str(args.index_path),
1245
- str(args.split_audio),
1246
- str(args.f0autotune),
1247
- str(args.clean_audio),
1248
- str(args.clean_strength),
1249
- str(args.export_format),
1250
- str(args.embedder_model),
1251
- str(args.embedder_model_custom),
1252
- str(args.upscale_audio),
1253
- )
1254
- elif args.mode == "batch_infer":
1255
- run_batch_infer_script(
1256
- str(args.f0up_key),
1257
- str(args.filter_radius),
1258
- str(args.index_rate),
1259
- str(args.rms_mix_rate),
1260
- str(args.protect),
1261
- str(args.hop_length),
1262
- str(args.f0method),
1263
- str(args.input_folder),
1264
- str(args.output_folder),
1265
- str(args.pth_path),
1266
- str(args.index_path),
1267
- str(args.split_audio),
1268
- str(args.f0autotune),
1269
- str(args.clean_audio),
1270
- str(args.clean_strength),
1271
- str(args.export_format),
1272
- str(args.embedder_model),
1273
- str(args.embedder_model_custom),
1274
- str(args.upscale_audio),
1275
- )
1276
- elif args.mode == "tts":
1277
- run_tts_script(
1278
- str(args.tts_text),
1279
- str(args.tts_voice),
1280
- str(args.tts_rate),
1281
- str(args.f0up_key),
1282
- str(args.filter_radius),
1283
- str(args.index_rate),
1284
- str(args.rms_mix_rate),
1285
- str(args.protect),
1286
- str(args.hop_length),
1287
- str(args.f0method),
1288
- str(args.output_tts_path),
1289
- str(args.output_rvc_path),
1290
- str(args.pth_path),
1291
- str(args.index_path),
1292
- str(args.split_audio),
1293
- str(args.f0autotune),
1294
- str(args.clean_audio),
1295
- str(args.clean_strength),
1296
- str(args.export_format),
1297
- str(args.embedder_model),
1298
- str(args.embedder_model_custom),
1299
- str(args.upscale_audio),
1300
- )
1301
- elif args.mode == "preprocess":
1302
- run_preprocess_script(
1303
- str(args.model_name),
1304
- str(args.dataset_path),
1305
- str(args.sampling_rate),
1306
- )
1307
- elif args.mode == "extract":
1308
- run_extract_script(
1309
- str(args.model_name),
1310
- str(args.rvc_version),
1311
- str(args.f0method),
1312
- str(args.hop_length),
1313
- str(args.sampling_rate),
1314
- str(args.embedder_model),
1315
- str(args.embedder_model_custom),
1316
- )
1317
- elif args.mode == "train":
1318
- run_train_script(
1319
- str(args.model_name),
1320
- str(args.rvc_version),
1321
- str(args.save_every_epoch),
1322
- str(args.save_only_latest),
1323
- str(args.save_every_weights),
1324
- str(args.total_epoch),
1325
- str(args.sampling_rate),
1326
- str(args.batch_size),
1327
- str(args.gpu),
1328
- str(args.pitch_guidance),
1329
- str(args.overtraining_detector),
1330
- str(args.overtraining_threshold),
1331
- str(args.pretrained),
1332
- str(args.custom_pretrained),
1333
- str(args.sync_graph),
1334
- str(args.g_pretrained_path),
1335
- str(args.d_pretrained_path),
1336
- )
1337
- elif args.mode == "index":
1338
- run_index_script(
1339
- str(args.model_name),
1340
- str(args.rvc_version),
1341
- )
1342
- elif args.mode == "model_extract":
1343
- run_model_extract_script(
1344
- str(args.pth_path),
1345
- str(args.model_name),
1346
- str(args.sampling_rate),
1347
- str(args.pitch_guidance),
1348
- str(args.rvc_version),
1349
- str(args.epoch),
1350
- str(args.step),
1351
- )
1352
- elif args.mode == "model_information":
1353
- run_model_information_script(
1354
- str(args.pth_path),
1355
- )
1356
- elif args.mode == "model_blender":
1357
- run_model_blender_script(
1358
- str(args.model_name),
1359
- str(args.pth_path_1),
1360
- str(args.pth_path_2),
1361
- str(args.ratio),
1362
- )
1363
- elif args.mode == "tensorboard":
1364
- run_tensorboard_script()
1365
- elif args.mode == "download":
1366
- run_download_script(
1367
- str(args.model_link),
1368
- )
1369
- elif args.mode == "prerequisites":
1370
- run_prerequisites_script(
1371
- str(args.pretraineds_v1),
1372
- str(args.pretraineds_v2),
1373
- str(args.models),
1374
- str(args.exe),
1375
- )
1376
- elif args.mode == "audio_analyzer":
1377
- run_audio_analyzer_script(
1378
- str(args.input_path),
1379
- )
1380
- elif args.mode == "api":
1381
- run_api_script(
1382
- str(args.host),
1383
- str(args.port),
1384
- )
1385
- except Exception as error:
1386
- print(f"Error: {error}")
1387
-
1388
-
1389
- if __name__ == "__main__":
1390
- main()
 
 
 
1
+ import os
2
+ import sys
3
+ import json
4
+ import argparse
5
+ import subprocess
6
+ import spaces
7
+
8
+ now_dir = os.getcwd()
9
+ sys.path.append(now_dir)
10
+
11
+ from rvc.configs.config import Config
12
+
13
+ from rvc.lib.tools.prerequisites_download import prequisites_download_pipeline
14
+ from rvc.train.extract.preparing_files import generate_config, generate_filelist
15
+ from rvc.lib.tools.pretrained_selector import pretrained_selector
16
+
17
+ from rvc.train.process.model_blender import model_blender
18
+ from rvc.train.process.model_information import model_information
19
+ from rvc.train.process.extract_small_model import extract_small_model
20
+
21
+ from rvc.infer.infer import infer_pipeline
22
+
23
+ from rvc.lib.tools.analyzer import analyze_audio
24
+
25
+ from rvc.lib.tools.launch_tensorboard import launch_tensorboard_pipeline
26
+
27
+ from rvc.lib.tools.model_download import model_download_pipeline
28
+
29
+ config = Config()
30
+ current_script_directory = os.path.dirname(os.path.realpath(__file__))
31
+ logs_path = os.path.join(current_script_directory, "logs")
32
+
33
+ # Get TTS Voices
34
+ with open(os.path.join("rvc", "lib", "tools", "tts_voices.json"), "r") as f:
35
+ voices_data = json.load(f)
36
+
37
+ locales = list({voice["Locale"] for voice in voices_data})
38
+
39
+
40
+ # Infer
41
+ @spaces.GPU
42
+ def run_infer_script(
43
+ f0up_key,
44
+ filter_radius,
45
+ index_rate,
46
+ rms_mix_rate,
47
+ protect,
48
+ hop_length,
49
+ f0method,
50
+ input_path,
51
+ output_path,
52
+ pth_path,
53
+ index_path,
54
+ split_audio,
55
+ f0autotune,
56
+ clean_audio,
57
+ clean_strength,
58
+ export_format,
59
+ embedder_model,
60
+ embedder_model_custom,
61
+ upscale_audio,
62
+ ):
63
+ f0autotune = "True" if str(f0autotune) == "True" else "False"
64
+ clean_audio = "True" if str(clean_audio) == "True" else "False"
65
+ upscale_audio = "True" if str(upscale_audio) == "True" else "False"
66
+ infer_pipeline(
67
+ f0up_key,
68
+ filter_radius,
69
+ index_rate,
70
+ rms_mix_rate,
71
+ protect,
72
+ hop_length,
73
+ f0method,
74
+ input_path,
75
+ output_path,
76
+ pth_path,
77
+ index_path,
78
+ split_audio,
79
+ f0autotune,
80
+ clean_audio,
81
+ clean_strength,
82
+ export_format,
83
+ embedder_model,
84
+ embedder_model_custom,
85
+ upscale_audio,
86
+ )
87
+ return f"File {input_path} inferred successfully.", output_path.replace(
88
+ ".wav", f".{export_format.lower()}"
89
+ )
90
+
91
+
92
+ # Batch infer
93
+ def run_batch_infer_script(
94
+ f0up_key,
95
+ filter_radius,
96
+ index_rate,
97
+ rms_mix_rate,
98
+ protect,
99
+ hop_length,
100
+ f0method,
101
+ input_folder,
102
+ output_folder,
103
+ pth_path,
104
+ index_path,
105
+ split_audio,
106
+ f0autotune,
107
+ clean_audio,
108
+ clean_strength,
109
+ export_format,
110
+ embedder_model,
111
+ embedder_model_custom,
112
+ upscale_audio,
113
+ ):
114
+ f0autotune = "True" if str(f0autotune) == "True" else "False"
115
+ clean_audio = "True" if str(clean_audio) == "True" else "False"
116
+ upscale_audio = "True" if str(upscale_audio) == "True" else "False"
117
+ audio_files = [
118
+ f for f in os.listdir(input_folder) if f.endswith((".mp3", ".wav", ".flac"))
119
+ ]
120
+ print(f"Detected {len(audio_files)} audio files for inference.")
121
+
122
+ for audio_file in audio_files:
123
+ if "_output" in audio_file:
124
+ pass
125
+ else:
126
+ input_path = os.path.join(input_folder, audio_file)
127
+ output_file_name = os.path.splitext(os.path.basename(audio_file))[0]
128
+ output_path = os.path.join(
129
+ output_folder,
130
+ f"{output_file_name}_output{os.path.splitext(audio_file)[1]}",
131
+ )
132
+ print(f"Inferring {input_path}...")
133
+
134
+ infer_pipeline(
135
+ f0up_key,
136
+ filter_radius,
137
+ index_rate,
138
+ rms_mix_rate,
139
+ protect,
140
+ hop_length,
141
+ f0method,
142
+ input_path,
143
+ output_path,
144
+ pth_path,
145
+ index_path,
146
+ split_audio,
147
+ f0autotune,
148
+ clean_audio,
149
+ clean_strength,
150
+ export_format,
151
+ embedder_model,
152
+ embedder_model_custom,
153
+ upscale_audio,
154
+ )
155
+
156
+ return f"Files from {input_folder} inferred successfully."
157
+
158
+
159
+ # TTS
160
+ def run_tts_script(
161
+ tts_text,
162
+ tts_voice,
163
+ tts_rate,
164
+ f0up_key,
165
+ filter_radius,
166
+ index_rate,
167
+ rms_mix_rate,
168
+ protect,
169
+ hop_length,
170
+ f0method,
171
+ output_tts_path,
172
+ output_rvc_path,
173
+ pth_path,
174
+ index_path,
175
+ split_audio,
176
+ f0autotune,
177
+ clean_audio,
178
+ clean_strength,
179
+ export_format,
180
+ embedder_model,
181
+ embedder_model_custom,
182
+ upscale_audio,
183
+ ):
184
+ f0autotune = "True" if str(f0autotune) == "True" else "False"
185
+ clean_audio = "True" if str(clean_audio) == "True" else "False"
186
+ upscale_audio = "True" if str(upscale_audio) == "True" else "False"
187
+ tts_script_path = os.path.join("rvc", "lib", "tools", "tts.py")
188
+
189
+ if os.path.exists(output_tts_path):
190
+ os.remove(output_tts_path)
191
+
192
+ command_tts = [
193
+ "python",
194
+ tts_script_path,
195
+ tts_text,
196
+ tts_voice,
197
+ str(tts_rate),
198
+ output_tts_path,
199
+ ]
200
+ subprocess.run(command_tts)
201
+
202
+ infer_pipeline(
203
+ f0up_key,
204
+ filter_radius,
205
+ index_rate,
206
+ rms_mix_rate,
207
+ protect,
208
+ hop_length,
209
+ f0method,
210
+ output_tts_path,
211
+ output_rvc_path,
212
+ pth_path,
213
+ index_path,
214
+ split_audio,
215
+ f0autotune,
216
+ clean_audio,
217
+ clean_strength,
218
+ export_format,
219
+ embedder_model,
220
+ embedder_model_custom,
221
+ upscale_audio,
222
+ )
223
+
224
+ return f"Text {tts_text} synthesized successfully.", output_rvc_path.replace(
225
+ ".wav", f".{export_format.lower()}"
226
+ )
227
+
228
+
229
+ # Preprocess
230
+ def run_preprocess_script(model_name, dataset_path, sampling_rate):
231
+ per = 3.0 if config.is_half else 3.7
232
+ preprocess_script_path = os.path.join("rvc", "train", "preprocess", "preprocess.py")
233
+ command = [
234
+ "python",
235
+ preprocess_script_path,
236
+ *map(
237
+ str,
238
+ [
239
+ os.path.join(logs_path, model_name),
240
+ dataset_path,
241
+ sampling_rate,
242
+ per,
243
+ ],
244
+ ),
245
+ ]
246
+
247
+ os.makedirs(os.path.join(logs_path, model_name), exist_ok=True)
248
+ subprocess.run(command)
249
+ return f"Model {model_name} preprocessed successfully."
250
+
251
+
252
+ # Extract
253
+ def run_extract_script(
254
+ model_name,
255
+ rvc_version,
256
+ f0method,
257
+ hop_length,
258
+ sampling_rate,
259
+ embedder_model,
260
+ embedder_model_custom,
261
+ ):
262
+ model_path = os.path.join(logs_path, model_name)
263
+ extract_f0_script_path = os.path.join(
264
+ "rvc", "train", "extract", "extract_f0_print.py"
265
+ )
266
+ extract_feature_script_path = os.path.join(
267
+ "rvc", "train", "extract", "extract_feature_print.py"
268
+ )
269
+
270
+ command_1 = [
271
+ "python",
272
+ extract_f0_script_path,
273
+ *map(
274
+ str,
275
+ [
276
+ model_path,
277
+ f0method,
278
+ hop_length,
279
+ ],
280
+ ),
281
+ ]
282
+ command_2 = [
283
+ "python",
284
+ extract_feature_script_path,
285
+ *map(
286
+ str,
287
+ [
288
+ config.device,
289
+ "1",
290
+ "0",
291
+ "0",
292
+ model_path,
293
+ rvc_version,
294
+ "True",
295
+ embedder_model,
296
+ embedder_model_custom,
297
+ ],
298
+ ),
299
+ ]
300
+ subprocess.run(command_1)
301
+ subprocess.run(command_2)
302
+
303
+ generate_config(rvc_version, sampling_rate, model_path)
304
+ generate_filelist(f0method, model_path, rvc_version, sampling_rate)
305
+ return f"Model {model_name} extracted successfully."
306
+
307
+
308
+ # Train
309
+ def run_train_script(
310
+ model_name,
311
+ rvc_version,
312
+ save_every_epoch,
313
+ save_only_latest,
314
+ save_every_weights,
315
+ total_epoch,
316
+ sampling_rate,
317
+ batch_size,
318
+ gpu,
319
+ pitch_guidance,
320
+ overtraining_detector,
321
+ overtraining_threshold,
322
+ pretrained,
323
+ custom_pretrained,
324
+ sync_graph,
325
+ g_pretrained_path=None,
326
+ d_pretrained_path=None,
327
+ ):
328
+ f0 = 1 if str(pitch_guidance) == "True" else 0
329
+ latest = 1 if str(save_only_latest) == "True" else 0
330
+ save_every = 1 if str(save_every_weights) == "True" else 0
331
+ detector = 1 if str(overtraining_detector) == "True" else 0
332
+ sync = 1 if str(sync_graph) == "True" else 0
333
+
334
+ if str(pretrained) == "True":
335
+ if str(custom_pretrained) == "False":
336
+ pg, pd = pretrained_selector(f0)[rvc_version][sampling_rate]
337
+ else:
338
+ if g_pretrained_path is None or d_pretrained_path is None:
339
+ raise ValueError(
340
+ "Please provide the path to the pretrained G and D models."
341
+ )
342
+ pg, pd = g_pretrained_path, d_pretrained_path
343
+ else:
344
+ pg, pd = "", ""
345
+
346
+ train_script_path = os.path.join("rvc", "train", "train.py")
347
+ command = [
348
+ "python",
349
+ train_script_path,
350
+ *map(
351
+ str,
352
+ [
353
+ "-se",
354
+ save_every_epoch,
355
+ "-te",
356
+ total_epoch,
357
+ "-pg",
358
+ pg,
359
+ "-pd",
360
+ pd,
361
+ "-sr",
362
+ sampling_rate,
363
+ "-bs",
364
+ batch_size,
365
+ "-g",
366
+ gpu,
367
+ "-e",
368
+ os.path.join(logs_path, model_name),
369
+ "-v",
370
+ rvc_version,
371
+ "-l",
372
+ latest,
373
+ "-c",
374
+ "0",
375
+ "-sw",
376
+ save_every,
377
+ "-f0",
378
+ f0,
379
+ "-od",
380
+ detector,
381
+ "-ot",
382
+ overtraining_threshold,
383
+ "-sg",
384
+ sync,
385
+ ],
386
+ ),
387
+ ]
388
+
389
+ subprocess.run(command)
390
+ run_index_script(model_name, rvc_version)
391
+ return f"Model {model_name} trained successfully."
392
+
393
+
394
+ # Index
395
+ def run_index_script(model_name, rvc_version):
396
+ index_script_path = os.path.join("rvc", "train", "process", "extract_index.py")
397
+ command = [
398
+ "python",
399
+ index_script_path,
400
+ os.path.join(logs_path, model_name),
401
+ rvc_version,
402
+ ]
403
+
404
+ subprocess.run(command)
405
+ return f"Index file for {model_name} generated successfully."
406
+
407
+
408
+ # Model extract
409
+ def run_model_extract_script(
410
+ pth_path, model_name, sampling_rate, pitch_guidance, rvc_version, epoch, step
411
+ ):
412
+ f0 = 1 if str(pitch_guidance) == "True" else 0
413
+ extract_small_model(
414
+ pth_path, model_name, sampling_rate, f0, rvc_version, epoch, step
415
+ )
416
+ return f"Model {model_name} extracted successfully."
417
+
418
+
419
+ # Model information
420
+ def run_model_information_script(pth_path):
421
+ print(model_information(pth_path))
422
+
423
+
424
+ # Model blender
425
+ def run_model_blender_script(model_name, pth_path_1, pth_path_2, ratio):
426
+ message, model_blended = model_blender(model_name, pth_path_1, pth_path_2, ratio)
427
+ return message, model_blended
428
+
429
+
430
+ # Tensorboard
431
+ def run_tensorboard_script():
432
+ launch_tensorboard_pipeline()
433
+
434
+
435
+ # Download
436
+ def run_download_script(model_link):
437
+ model_download_pipeline(model_link)
438
+ return f"Model downloaded successfully."
439
+
440
+
441
+ # Prerequisites
442
+ def run_prerequisites_script(pretraineds_v1, pretraineds_v2, models, exe):
443
+ prequisites_download_pipeline(pretraineds_v1, pretraineds_v2, models, exe)
444
+ return "Prerequisites installed successfully."
445
+
446
+
447
+ # Audio analyzer
448
+ def run_audio_analyzer_script(input_path, save_plot_path="logs/audio_analysis.png"):
449
+ audio_info, plot_path = analyze_audio(input_path, save_plot_path)
450
+ print(
451
+ f"Audio info of {input_path}: {audio_info}",
452
+ f"Audio file {input_path} analyzed successfully. Plot saved at: {plot_path}",
453
+ )
454
+ return audio_info, plot_path
455
+
456
+
457
+ # API
458
+ def run_api_script(ip, port):
459
+ command = [
460
+ "env/Scripts/uvicorn.exe" if os.name == "nt" else "uvicorn",
461
+ "api:app",
462
+ "--host",
463
+ ip,
464
+ "--port",
465
+ port,
466
+ ]
467
+ subprocess.run(command)
468
+
469
+
470
+ # Parse arguments
471
+ def parse_arguments():
472
+ parser = argparse.ArgumentParser(
473
+ description="Run the main.py script with specific parameters."
474
+ )
475
+ subparsers = parser.add_subparsers(
476
+ title="subcommands", dest="mode", help="Choose a mode"
477
+ )
478
+
479
+ # Parser for 'infer' mode
480
+ infer_parser = subparsers.add_parser("infer", help="Run inference")
481
+ infer_parser.add_argument(
482
+ "--f0up_key",
483
+ type=str,
484
+ help="Value for f0up_key",
485
+ choices=[str(i) for i in range(-24, 25)],
486
+ default="0",
487
+ )
488
+ infer_parser.add_argument(
489
+ "--filter_radius",
490
+ type=str,
491
+ help="Value for filter_radius",
492
+ choices=[str(i) for i in range(11)],
493
+ default="3",
494
+ )
495
+ infer_parser.add_argument(
496
+ "--index_rate",
497
+ type=str,
498
+ help="Value for index_rate",
499
+ choices=[str(i / 10) for i in range(11)],
500
+ default="0.3",
501
+ )
502
+ infer_parser.add_argument(
503
+ "--rms_mix_rate",
504
+ type=str,
505
+ help="Value for rms_mix_rate",
506
+ choices=[str(i / 10) for i in range(11)],
507
+ default="1",
508
+ )
509
+ infer_parser.add_argument(
510
+ "--protect",
511
+ type=str,
512
+ help="Value for protect",
513
+ choices=[str(i / 10) for i in range(6)],
514
+ default="0.33",
515
+ )
516
+ infer_parser.add_argument(
517
+ "--hop_length",
518
+ type=str,
519
+ help="Value for hop_length",
520
+ choices=[str(i) for i in range(1, 513)],
521
+ default="128",
522
+ )
523
+ infer_parser.add_argument(
524
+ "--f0method",
525
+ type=str,
526
+ help="Value for f0method",
527
+ choices=[
528
+ "pm",
529
+ "harvest",
530
+ "dio",
531
+ "crepe",
532
+ "crepe-tiny",
533
+ "rmvpe",
534
+ "fcpe",
535
+ "hybrid[crepe+rmvpe]",
536
+ "hybrid[crepe+fcpe]",
537
+ "hybrid[rmvpe+fcpe]",
538
+ "hybrid[crepe+rmvpe+fcpe]",
539
+ ],
540
+ default="rmvpe",
541
+ )
542
+ infer_parser.add_argument("--input_path", type=str, help="Input path")
543
+ infer_parser.add_argument("--output_path", type=str, help="Output path")
544
+ infer_parser.add_argument("--pth_path", type=str, help="Path to the .pth file")
545
+ infer_parser.add_argument(
546
+ "--index_path",
547
+ type=str,
548
+ help="Path to the .index file",
549
+ )
550
+ infer_parser.add_argument(
551
+ "--split_audio",
552
+ type=str,
553
+ help="Enable split audio",
554
+ choices=["True", "False"],
555
+ default="False",
556
+ )
557
+ infer_parser.add_argument(
558
+ "--f0autotune",
559
+ type=str,
560
+ help="Enable autotune",
561
+ choices=["True", "False"],
562
+ default="False",
563
+ )
564
+ infer_parser.add_argument(
565
+ "--clean_audio",
566
+ type=str,
567
+ help="Enable clean audio",
568
+ choices=["True", "False"],
569
+ default="False",
570
+ )
571
+ infer_parser.add_argument(
572
+ "--clean_strength",
573
+ type=str,
574
+ help="Value for clean_strength",
575
+ choices=[str(i / 10) for i in range(11)],
576
+ default="0.7",
577
+ )
578
+ infer_parser.add_argument(
579
+ "--export_format",
580
+ type=str,
581
+ help="Export format",
582
+ choices=["WAV", "MP3", "FLAC", "OGG", "M4A"],
583
+ default="WAV",
584
+ )
585
+ infer_parser.add_argument(
586
+ "--embedder_model",
587
+ type=str,
588
+ help="Embedder model",
589
+ choices=["contentvec", "hubert", "custom"],
590
+ default="hubert",
591
+ )
592
+ infer_parser.add_argument(
593
+ "--embedder_model_custom",
594
+ type=str,
595
+ help="Custom Embedder model",
596
+ default=None,
597
+ )
598
+ infer_parser.add_argument(
599
+ "--upscale_audio",
600
+ type=str,
601
+ help="Enable audio upscaling",
602
+ choices=["True", "False"],
603
+ default="False",
604
+ )
605
+
606
+ # Parser for 'batch_infer' mode
607
+ batch_infer_parser = subparsers.add_parser(
608
+ "batch_infer", help="Run batch inference"
609
+ )
610
+ batch_infer_parser.add_argument(
611
+ "--f0up_key",
612
+ type=str,
613
+ help="Value for f0up_key",
614
+ choices=[str(i) for i in range(-24, 25)],
615
+ default="0",
616
+ )
617
+ batch_infer_parser.add_argument(
618
+ "--filter_radius",
619
+ type=str,
620
+ help="Value for filter_radius",
621
+ choices=[str(i) for i in range(11)],
622
+ default="3",
623
+ )
624
+ batch_infer_parser.add_argument(
625
+ "--index_rate",
626
+ type=str,
627
+ help="Value for index_rate",
628
+ choices=[str(i / 10) for i in range(11)],
629
+ default="0.3",
630
+ )
631
+ batch_infer_parser.add_argument(
632
+ "--rms_mix_rate",
633
+ type=str,
634
+ help="Value for rms_mix_rate",
635
+ choices=[str(i / 10) for i in range(11)],
636
+ default="1",
637
+ )
638
+ batch_infer_parser.add_argument(
639
+ "--protect",
640
+ type=str,
641
+ help="Value for protect",
642
+ choices=[str(i / 10) for i in range(6)],
643
+ default="0.33",
644
+ )
645
+ batch_infer_parser.add_argument(
646
+ "--hop_length",
647
+ type=str,
648
+ help="Value for hop_length",
649
+ choices=[str(i) for i in range(1, 513)],
650
+ default="128",
651
+ )
652
+ batch_infer_parser.add_argument(
653
+ "--f0method",
654
+ type=str,
655
+ help="Value for f0method",
656
+ choices=[
657
+ "pm",
658
+ "harvest",
659
+ "dio",
660
+ "crepe",
661
+ "crepe-tiny",
662
+ "rmvpe",
663
+ "fcpe",
664
+ "hybrid[crepe+rmvpe]",
665
+ "hybrid[crepe+fcpe]",
666
+ "hybrid[rmvpe+fcpe]",
667
+ "hybrid[crepe+rmvpe+fcpe]",
668
+ ],
669
+ default="rmvpe",
670
+ )
671
+ batch_infer_parser.add_argument("--input_folder", type=str, help="Input folder")
672
+ batch_infer_parser.add_argument("--output_folder", type=str, help="Output folder")
673
+ batch_infer_parser.add_argument(
674
+ "--pth_path", type=str, help="Path to the .pth file"
675
+ )
676
+ batch_infer_parser.add_argument(
677
+ "--index_path",
678
+ type=str,
679
+ help="Path to the .index file",
680
+ )
681
+ batch_infer_parser.add_argument(
682
+ "--split_audio",
683
+ type=str,
684
+ help="Enable split audio",
685
+ choices=["True", "False"],
686
+ default="False",
687
+ )
688
+ batch_infer_parser.add_argument(
689
+ "--f0autotune",
690
+ type=str,
691
+ help="Enable autotune",
692
+ choices=["True", "False"],
693
+ default="False",
694
+ )
695
+ batch_infer_parser.add_argument(
696
+ "--clean_audio",
697
+ type=str,
698
+ help="Enable clean audio",
699
+ choices=["True", "False"],
700
+ default="False",
701
+ )
702
+ batch_infer_parser.add_argument(
703
+ "--clean_strength",
704
+ type=str,
705
+ help="Value for clean_strength",
706
+ choices=[str(i / 10) for i in range(11)],
707
+ default="0.7",
708
+ )
709
+ batch_infer_parser.add_argument(
710
+ "--export_format",
711
+ type=str,
712
+ help="Export format",
713
+ choices=["WAV", "MP3", "FLAC", "OGG", "M4A"],
714
+ default="WAV",
715
+ )
716
+ batch_infer_parser.add_argument(
717
+ "--embedder_model",
718
+ type=str,
719
+ help="Embedder model",
720
+ choices=["contentvec", "hubert", "custom"],
721
+ default="hubert",
722
+ )
723
+ batch_infer_parser.add_argument(
724
+ "--embedder_model_custom",
725
+ type=str,
726
+ help="Custom Embedder model",
727
+ default=None,
728
+ )
729
+ batch_infer_parser.add_argument(
730
+ "--upscale_audio",
731
+ type=str,
732
+ help="Enable audio upscaling",
733
+ choices=["True", "False"],
734
+ default="False",
735
+ )
736
+
737
+    # Parser for 'tts' mode
+    tts_parser = subparsers.add_parser("tts", help="Run TTS")
+    tts_parser.add_argument(
+        "--tts_text",
+        type=str,
+        help="Text to be synthesized",
+    )
+    tts_parser.add_argument(
+        "--tts_voice",
+        type=str,
+        help="Voice to be used",
+        choices=locales,
+    )
+    tts_parser.add_argument(
+        "--tts_rate",
+        type=str,
+        help="Increase or decrease TTS speed",
+        choices=[str(i) for i in range(-100, 100)],
+        default="0",
+    )
+    tts_parser.add_argument(
+        "--f0up_key",
+        type=str,
+        help="Value for f0up_key",
+        choices=[str(i) for i in range(-24, 25)],
+        default="0",
+    )
+    tts_parser.add_argument(
+        "--filter_radius",
+        type=str,
+        help="Value for filter_radius",
+        choices=[str(i) for i in range(11)],
+        default="3",
+    )
+    tts_parser.add_argument(
+        "--index_rate",
+        type=str,
+        help="Value for index_rate",
+        choices=[str(i / 10) for i in range(11)],
+        default="0.3",
+    )
+    tts_parser.add_argument(
+        "--rms_mix_rate",
+        type=str,
+        help="Value for rms_mix_rate",
+        choices=[str(i / 10) for i in range(11)],
+        default="1",
+    )
+    tts_parser.add_argument(
+        "--protect",
+        type=str,
+        help="Value for protect",
+        choices=[str(i / 10) for i in range(6)],
+        default="0.33",
+    )
+    tts_parser.add_argument(
+        "--hop_length",
+        type=str,
+        help="Value for hop_length",
+        choices=[str(i) for i in range(1, 513)],
+        default="128",
+    )
+    tts_parser.add_argument(
+        "--f0method",
+        type=str,
+        help="Value for f0method",
+        choices=[
+            "pm",
+            "harvest",
+            "dio",
+            "crepe",
+            "crepe-tiny",
+            "rmvpe",
+            "fcpe",
+            "hybrid[crepe+rmvpe]",
+            "hybrid[crepe+fcpe]",
+            "hybrid[rmvpe+fcpe]",
+            "hybrid[crepe+rmvpe+fcpe]",
+        ],
+        default="rmvpe",
+    )
+    tts_parser.add_argument("--output_tts_path", type=str, help="Output tts path")
+    tts_parser.add_argument("--output_rvc_path", type=str, help="Output rvc path")
+    tts_parser.add_argument("--pth_path", type=str, help="Path to the .pth file")
+    tts_parser.add_argument(
+        "--index_path",
+        type=str,
+        help="Path to the .index file",
+    )
+    tts_parser.add_argument(
+        "--split_audio",
+        type=str,
+        help="Enable split audio",
+        choices=["True", "False"],
+        default="False",
+    )
+    tts_parser.add_argument(
+        "--f0autotune",
+        type=str,
+        help="Enable autotune",
+        choices=["True", "False"],
+        default="False",
+    )
+    tts_parser.add_argument(
+        "--clean_audio",
+        type=str,
+        help="Enable clean audio",
+        choices=["True", "False"],
+        default="False",
+    )
+    tts_parser.add_argument(
+        "--clean_strength",
+        type=str,
+        help="Value for clean_strength",
+        choices=[str(i / 10) for i in range(11)],
+        default="0.7",
+    )
+    tts_parser.add_argument(
+        "--export_format",
+        type=str,
+        help="Export format",
+        choices=["WAV", "MP3", "FLAC", "OGG", "M4A"],
+        default="WAV",
+    )
+    tts_parser.add_argument(
+        "--embedder_model",
+        type=str,
+        help="Embedder model",
+        choices=["contentvec", "hubert", "custom"],
+        default="hubert",
+    )
+    tts_parser.add_argument(
+        "--embedder_model_custom",
+        type=str,
+        help="Custom Embedder model",
+        default=None,
+    )
+    tts_parser.add_argument(
+        "--upscale_audio",
+        type=str,
+        help="Enable audio upscaling",
+        choices=["True", "False"],
+        default="False",
+    )
+
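# For reference, a hypothetical 'tts' invocation; --tts_voice must be one of the
# Locale values read from rvc/lib/tools/tts_voices.json (here assumed to include
# "en-US"), and all paths are placeholders:
#   python core.py tts --tts_text "Hello there" --tts_voice en-US \
#       --output_tts_path tts.wav --output_rvc_path tts_rvc.wav \
#       --pth_path logs/my_model/my_model.pth --index_path logs/my_model/my_model.index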
+    # Parser for 'preprocess' mode
+    preprocess_parser = subparsers.add_parser("preprocess", help="Run preprocessing")
+    preprocess_parser.add_argument("--model_name", type=str, help="Name of the model")
+    preprocess_parser.add_argument(
+        "--dataset_path",
+        type=str,
+        help="Path to the dataset",
+    )
+    preprocess_parser.add_argument(
+        "--sampling_rate",
+        type=str,
+        help="Sampling rate",
+        choices=["32000", "40000", "48000"],
+    )
+
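# For reference, a hypothetical 'preprocess' invocation (dataset path and model
# name are placeholders):
#   python core.py preprocess --model_name my_model --dataset_path ./dataset/my_model --sampling_rate 40000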
+    # Parser for 'extract' mode
+    extract_parser = subparsers.add_parser("extract", help="Run extract")
+    extract_parser.add_argument(
+        "--model_name",
+        type=str,
+        help="Name of the model",
+    )
+    extract_parser.add_argument(
+        "--rvc_version",
+        type=str,
+        help="Version of the model",
+        choices=["v1", "v2"],
+        default="v2",
+    )
+    extract_parser.add_argument(
+        "--f0method",
+        type=str,
+        help="Value for f0method",
+        choices=[
+            "pm",
+            "harvest",
+            "dio",
+            "crepe",
+            "crepe-tiny",
+            "rmvpe",
+        ],
+        default="rmvpe",
+    )
+    extract_parser.add_argument(
+        "--hop_length",
+        type=str,
+        help="Value for hop_length",
+        choices=[str(i) for i in range(1, 513)],
+        default="128",
+    )
+    extract_parser.add_argument(
+        "--sampling_rate",
+        type=str,
+        help="Sampling rate",
+        choices=["32000", "40000", "48000"],
+    )
+    extract_parser.add_argument(
+        "--embedder_model",
+        type=str,
+        help="Embedder model",
+        choices=["contentvec", "hubert", "custom"],
+        default="hubert",
+    )
+    extract_parser.add_argument(
+        "--embedder_model_custom",
+        type=str,
+        help="Custom Embedder model",
+        default=None,
+    )
+
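# For reference, a hypothetical 'extract' invocation using the flags defined above
# (the model name is a placeholder):
#   python core.py extract --model_name my_model --rvc_version v2 --f0method rmvpe \
#       --hop_length 128 --sampling_rate 40000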
+    # Parser for 'train' mode
+    train_parser = subparsers.add_parser("train", help="Run training")
+    train_parser.add_argument(
+        "--model_name",
+        type=str,
+        help="Name of the model",
+    )
+    train_parser.add_argument(
+        "--rvc_version",
+        type=str,
+        help="Version of the model",
+        choices=["v1", "v2"],
+        default="v2",
+    )
+    train_parser.add_argument(
+        "--save_every_epoch",
+        type=str,
+        help="Save every epoch",
+        choices=[str(i) for i in range(1, 101)],
+    )
+    train_parser.add_argument(
+        "--save_only_latest",
+        type=str,
+        help="Save weight only at last epoch",
+        choices=["True", "False"],
+        default="False",
+    )
+    train_parser.add_argument(
+        "--save_every_weights",
+        type=str,
+        help="Save weight every epoch",
+        choices=["True", "False"],
+        default="True",
+    )
+    train_parser.add_argument(
+        "--total_epoch",
+        type=str,
+        help="Total epoch",
+        choices=[str(i) for i in range(1, 10001)],
+        default="1000",
+    )
+    train_parser.add_argument(
+        "--sampling_rate",
+        type=str,
+        help="Sampling rate",
+        choices=["32000", "40000", "48000"],
+    )
+    train_parser.add_argument(
+        "--batch_size",
+        type=str,
+        help="Batch size",
+        choices=[str(i) for i in range(1, 51)],
+        default="8",
+    )
+    train_parser.add_argument(
+        "--gpu",
+        type=str,
+        help="GPU number",
+        default="0",
+    )
+    train_parser.add_argument(
+        "--pitch_guidance",
+        type=str,
+        help="Pitch guidance",
+        choices=["True", "False"],
+        default="True",
+    )
+    train_parser.add_argument(
+        "--pretrained",
+        type=str,
+        help="Pretrained",
+        choices=["True", "False"],
+        default="True",
+    )
+    train_parser.add_argument(
+        "--custom_pretrained",
+        type=str,
+        help="Custom pretrained",
+        choices=["True", "False"],
+        default="False",
+    )
+    train_parser.add_argument(
+        "--g_pretrained_path",
+        type=str,
+        nargs="?",
+        default=None,
+        help="Path to the pretrained G file",
+    )
+    train_parser.add_argument(
+        "--d_pretrained_path",
+        type=str,
+        nargs="?",
+        default=None,
+        help="Path to the pretrained D file",
+    )
+    train_parser.add_argument(
+        "--overtraining_detector",
+        type=str,
+        help="Overtraining detector",
+        choices=["True", "False"],
+        default="False",
+    )
+    train_parser.add_argument(
+        "--overtraining_threshold",
+        type=str,
+        help="Overtraining threshold",
+        choices=[str(i) for i in range(1, 101)],
+        default="50",
+    )
+    train_parser.add_argument(
+        "--sync_graph",
+        type=str,
+        help="Sync graph",
+        choices=["True", "False"],
+        default="False",
+    )
+
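# For reference, a hypothetical 'train' invocation; --save_every_epoch and
# --sampling_rate are declared without defaults above, so in practice they need
# to be supplied (model name and values are placeholders):
#   python core.py train --model_name my_model --rvc_version v2 --save_every_epoch 10 \
#       --total_epoch 500 --sampling_rate 40000 --batch_size 8 --gpu 0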
+    # Parser for 'index' mode
+    index_parser = subparsers.add_parser("index", help="Generate index file")
+    index_parser.add_argument(
+        "--model_name",
+        type=str,
+        help="Name of the model",
+    )
+    index_parser.add_argument(
+        "--rvc_version",
+        type=str,
+        help="Version of the model",
+        choices=["v1", "v2"],
+        default="v2",
+    )
+
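# For reference, a hypothetical 'index' invocation (model name is a placeholder):
#   python core.py index --model_name my_model --rvc_version v2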
+    # Parser for 'model_extract' mode
+    model_extract_parser = subparsers.add_parser("model_extract", help="Extract model")
+    model_extract_parser.add_argument(
+        "--pth_path",
+        type=str,
+        help="Path to the .pth file",
+    )
+    model_extract_parser.add_argument(
+        "--model_name",
+        type=str,
+        help="Name of the model",
+    )
+    model_extract_parser.add_argument(
+        "--sampling_rate",
+        type=str,
+        help="Sampling rate",
+        choices=["40000", "48000"],
+    )
+    model_extract_parser.add_argument(
+        "--pitch_guidance",
+        type=str,
+        help="Pitch guidance",
+        choices=["True", "False"],
+    )
+    model_extract_parser.add_argument(
+        "--rvc_version",
+        type=str,
+        help="Version of the model",
+        choices=["v1", "v2"],
+        default="v2",
+    )
+    model_extract_parser.add_argument(
+        "--epoch",
+        type=str,
+        help="Epochs of the model",
+        choices=[str(i) for i in range(1, 10001)],
+    )
+    model_extract_parser.add_argument(
+        "--step",
+        type=str,
+        help="Steps of the model",
+    )
+
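# For reference, a hypothetical 'model_extract' invocation; the checkpoint path,
# epoch and step values below are placeholders:
#   python core.py model_extract --pth_path logs/my_model/G_500.pth --model_name my_model \
#       --sampling_rate 40000 --pitch_guidance True --rvc_version v2 --epoch 500 --step 125000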
+    # Parser for 'model_information' mode
+    model_information_parser = subparsers.add_parser(
+        "model_information", help="Print model information"
+    )
+    model_information_parser.add_argument(
+        "--pth_path",
+        type=str,
+        help="Path to the .pth file",
+    )
+
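# For reference, a hypothetical 'model_information' invocation (path is a placeholder):
#   python core.py model_information --pth_path logs/my_model/my_model.pth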
+    # Parser for 'model_blender' mode
+    model_blender_parser = subparsers.add_parser(
+        "model_blender", help="Fuse two models"
+    )
+    model_blender_parser.add_argument(
+        "--model_name",
+        type=str,
+        help="Name of the model",
+    )
+    model_blender_parser.add_argument(
+        "--pth_path_1",
+        type=str,
+        help="Path to the first .pth file",
+    )
+    model_blender_parser.add_argument(
+        "--pth_path_2",
+        type=str,
+        help="Path to the second .pth file",
+    )
+    model_blender_parser.add_argument(
+        "--ratio",
+        type=str,
+        help="Value for blender ratio",
+        choices=[str(i / 10) for i in range(11)],
+        default="0.5",
+    )
+
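# For reference, a hypothetical 'model_blender' invocation blending two voices
# equally (both .pth paths are placeholders):
#   python core.py model_blender --model_name blended_model \
#       --pth_path_1 logs/voice_a/voice_a.pth --pth_path_2 logs/voice_b/voice_b.pth --ratio 0.5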
+    # Parser for 'tensorboard' mode
+    subparsers.add_parser("tensorboard", help="Run tensorboard")
+
+    # Parser for 'download' mode
+    download_parser = subparsers.add_parser("download", help="Download models")
+    download_parser.add_argument(
+        "--model_link",
+        type=str,
+        help="Link of the model",
+    )
+
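# For reference, hypothetical 'tensorboard' and 'download' invocations; the model
# link below is only a placeholder:
#   python core.py tensorboard
#   python core.py download --model_link "<URL of the model archive>"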
+    # Parser for 'prerequisites' mode
+    prerequisites_parser = subparsers.add_parser(
+        "prerequisites", help="Install prerequisites"
+    )
+    prerequisites_parser.add_argument(
+        "--pretraineds_v1",
+        type=str,
+        choices=["True", "False"],
+        default="True",
+        help="Download pretrained models for v1",
+    )
+    prerequisites_parser.add_argument(
+        "--pretraineds_v2",
+        type=str,
+        choices=["True", "False"],
+        default="True",
+        help="Download pretrained models for v2",
+    )
+    prerequisites_parser.add_argument(
+        "--models",
+        type=str,
+        choices=["True", "False"],
+        default="True",
+        help="Download models",
+    )
+    prerequisites_parser.add_argument(
+        "--exe",
+        type=str,
+        choices=["True", "False"],
+        default="True",
+        help="Download executables",
+    )
+
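# For reference, a hypothetical 'prerequisites' invocation that skips the v1
# pretrained files but fetches everything else:
#   python core.py prerequisites --pretraineds_v1 False --pretraineds_v2 True --models True --exe True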
+    # Parser for 'audio_analyzer' mode
+    audio_analyzer = subparsers.add_parser("audio_analyzer", help="Run audio analyzer")
+    audio_analyzer.add_argument(
+        "--input_path",
+        type=str,
+        help="Path to the input audio file",
+    )
+
+    # Parser for 'api' mode
+    api_parser = subparsers.add_parser("api", help="Run the API")
+    api_parser.add_argument(
+        "--host", type=str, help="Host address", default="127.0.0.1"
+    )
+    api_parser.add_argument("--port", type=str, help="Port", default="8000")
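# For reference, a hypothetical 'api' invocation using the defaults declared above:
#   python core.py api --host 127.0.0.1 --port 8000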
+
+    return parser.parse_args()
+
+
+def main():
+    if len(sys.argv) == 1:
+        print("Please run the script with '-h' for more information.")
+        sys.exit(1)
+
+    args = parse_arguments()
+
+    try:
+        if args.mode == "infer":
+            run_infer_script(
+                str(args.f0up_key),
+                str(args.filter_radius),
+                str(args.index_rate),
+                str(args.rms_mix_rate),
+                str(args.protect),
+                str(args.hop_length),
+                str(args.f0method),
+                str(args.input_path),
+                str(args.output_path),
+                str(args.pth_path),
+                str(args.index_path),
+                str(args.split_audio),
+                str(args.f0autotune),
+                str(args.clean_audio),
+                str(args.clean_strength),
+                str(args.export_format),
+                str(args.embedder_model),
+                str(args.embedder_model_custom),
+                str(args.upscale_audio),
+            )
+        elif args.mode == "batch_infer":
+            run_batch_infer_script(
+                str(args.f0up_key),
+                str(args.filter_radius),
+                str(args.index_rate),
+                str(args.rms_mix_rate),
+                str(args.protect),
+                str(args.hop_length),
+                str(args.f0method),
+                str(args.input_folder),
+                str(args.output_folder),
+                str(args.pth_path),
+                str(args.index_path),
+                str(args.split_audio),
+                str(args.f0autotune),
+                str(args.clean_audio),
+                str(args.clean_strength),
+                str(args.export_format),
+                str(args.embedder_model),
+                str(args.embedder_model_custom),
+                str(args.upscale_audio),
+            )
+        elif args.mode == "tts":
+            run_tts_script(
+                str(args.tts_text),
+                str(args.tts_voice),
+                str(args.tts_rate),
+                str(args.f0up_key),
+                str(args.filter_radius),
+                str(args.index_rate),
+                str(args.rms_mix_rate),
+                str(args.protect),
+                str(args.hop_length),
+                str(args.f0method),
+                str(args.output_tts_path),
+                str(args.output_rvc_path),
+                str(args.pth_path),
+                str(args.index_path),
+                str(args.split_audio),
+                str(args.f0autotune),
+                str(args.clean_audio),
+                str(args.clean_strength),
+                str(args.export_format),
+                str(args.embedder_model),
+                str(args.embedder_model_custom),
+                str(args.upscale_audio),
+            )
+        elif args.mode == "preprocess":
+            run_preprocess_script(
+                str(args.model_name),
+                str(args.dataset_path),
+                str(args.sampling_rate),
+            )
+        elif args.mode == "extract":
+            run_extract_script(
+                str(args.model_name),
+                str(args.rvc_version),
+                str(args.f0method),
+                str(args.hop_length),
+                str(args.sampling_rate),
+                str(args.embedder_model),
+                str(args.embedder_model_custom),
+            )
+        elif args.mode == "train":
+            run_train_script(
+                str(args.model_name),
+                str(args.rvc_version),
+                str(args.save_every_epoch),
+                str(args.save_only_latest),
+                str(args.save_every_weights),
+                str(args.total_epoch),
+                str(args.sampling_rate),
+                str(args.batch_size),
+                str(args.gpu),
+                str(args.pitch_guidance),
+                str(args.overtraining_detector),
+                str(args.overtraining_threshold),
+                str(args.pretrained),
+                str(args.custom_pretrained),
+                str(args.sync_graph),
+                str(args.g_pretrained_path),
+                str(args.d_pretrained_path),
+            )
+        elif args.mode == "index":
+            run_index_script(
+                str(args.model_name),
+                str(args.rvc_version),
+            )
+        elif args.mode == "model_extract":
+            run_model_extract_script(
+                str(args.pth_path),
+                str(args.model_name),
+                str(args.sampling_rate),
+                str(args.pitch_guidance),
+                str(args.rvc_version),
+                str(args.epoch),
+                str(args.step),
+            )
+        elif args.mode == "model_information":
+            run_model_information_script(
+                str(args.pth_path),
+            )
+        elif args.mode == "model_blender":
+            run_model_blender_script(
+                str(args.model_name),
+                str(args.pth_path_1),
+                str(args.pth_path_2),
+                str(args.ratio),
+            )
+        elif args.mode == "tensorboard":
+            run_tensorboard_script()
+        elif args.mode == "download":
+            run_download_script(
+                str(args.model_link),
+            )
+        elif args.mode == "prerequisites":
+            run_prerequisites_script(
+                str(args.pretraineds_v1),
+                str(args.pretraineds_v2),
+                str(args.models),
+                str(args.exe),
+            )
+        elif args.mode == "audio_analyzer":
+            run_audio_analyzer_script(
+                str(args.input_path),
+            )
+        elif args.mode == "api":
+            run_api_script(
+                str(args.host),
+                str(args.port),
+            )
+    except Exception as error:
+        print(f"Error: {error}")
+
+
+if __name__ == "__main__":
+    main()
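# For reference, the run_*_script helpers can also be imported and called directly
# instead of going through the CLI; the positional order below mirrors the
# 'preprocess' dispatch in main(), and values are passed as strings just as main()
# forwards them. The model name and dataset path are hypothetical placeholders.
#   from core import run_preprocess_script
#   run_preprocess_script("my_model", "./dataset/my_model", "40000")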