Aitronssesin committed on
Commit
1a7d583
1 Parent(s): dc8f793

Upload 74 files

app.py CHANGED
@@ -13,24 +13,35 @@ from tabs.extra.extra import extra_tab
 from tabs.report.report import report_tab
 from tabs.download.download import download_tab
 from tabs.tts.tts import tts_tab
-from tabs.settings.presence import presence_tab
+from tabs.voice_blender.voice_blender import voice_blender_tab
+from tabs.settings.presence import presence_tab, load_config_presence
+from tabs.settings.flask_server import flask_server_tab
+from tabs.settings.fake_gpu import fake_gpu_tab, gpu_available, load_fake_gpu
 from tabs.settings.themes import theme_tab
 from tabs.plugins.plugins import plugins_tab
+from tabs.settings.version import version_tab
+from tabs.settings.lang import lang_tab
+from tabs.settings.restart import restart_tab

 # Assets
 import assets.themes.loadThemes as loadThemes
 from assets.i18n.i18n import I18nAuto
 import assets.installation_checker as installation_checker
 from assets.discord_presence import RPCManager
-import assets.delete_models as delete_models
+from assets.flask.server import start_flask, load_config_flask
+from core import run_prerequisites_script
+
+run_prerequisites_script("False", "True", "True", "True")

-delete_models.start_infinite_loop()
 i18n = I18nAuto()
-RPCManager.start_presence()
+if load_config_presence() == True:
+    RPCManager.start_presence()
 installation_checker.check_installation()
 logging.getLogger("uvicorn").disabled = True
 logging.getLogger("fairseq").disabled = True
-logging.getLogger("h11").disabled = True
+if load_config_flask() == True:
+    print("Starting Flask server")
+    start_flask()

 my_applio = loadThemes.load_json()
 if my_applio:
@@ -53,17 +64,24 @@ with gr.Blocks(theme=my_applio, title="Applio") as Applio:
     with gr.Tab(i18n("Inference")):
         inference_tab()

-    # with gr.Tab(i18n("Train")):
-    #     train_tab()
+    with gr.Tab(i18n("Train")):
+        if gpu_available() or load_fake_gpu():
+            train_tab()
+        else:
+            gr.Markdown(
+                i18n(
+                    "Training is currently unsupported due to the absence of a GPU. To activate the training tab, navigate to the settings tab and enable the 'Fake GPU' option."
+                )
+            )

     with gr.Tab(i18n("TTS")):
         tts_tab()

-    with gr.Tab(i18n("Extra")):
-        extra_tab()
+    with gr.Tab(i18n("Voice Blender")):
+        voice_blender_tab()

-    # with gr.Tab(i18n("Plugins")):
-    #     plugins_tab()
+    with gr.Tab(i18n("Plugins")):
+        plugins_tab()

     with gr.Tab(i18n("Download")):
         download_tab()
@@ -71,10 +89,30 @@ with gr.Blocks(theme=my_applio, title="Applio") as Applio:
     with gr.Tab(i18n("Report a Bug")):
         report_tab()

-    # with gr.Tab(i18n("Settings")):
-    #     presence_tab()
-    #     theme_tab()
+    with gr.Tab(i18n("Extra")):
+        extra_tab()
+
+    with gr.Tab(i18n("Settings")):
+        presence_tab()
+        flask_server_tab()
+        if not gpu_available():
+            fake_gpu_tab()
+        theme_tab()
+        version_tab()
+        lang_tab()
+        restart_tab()


 if __name__ == "__main__":
-    Applio.launch()
+    port = 6969
+    if "--port" in sys.argv:
+        port_index = sys.argv.index("--port") + 1
+        if port_index < len(sys.argv):
+            port = int(sys.argv[port_index])
+
+    Applio.launch(
+        favicon_path="assets/ICON.ico",
+        share="--share" in sys.argv,
+        inbrowser="--open" in sys.argv,
+        server_port=port,
+    )
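Usage sketch (illustrative, not part of the commit): the new launch block above reads optional flags from sys.argv, so under that assumption the app could be started from the repository root with, for example:

    python app.py --port 6969 --open --share

Here --port overrides the default 6969, --open opens the browser automatically, and --share enables Gradio sharing; the exact port value and working directory are assumptions.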
core.py CHANGED
@@ -1,5 +1,6 @@
1
  import os
2
  import sys
 
3
  import argparse
4
  import subprocess
5
 
@@ -7,26 +8,32 @@ now_dir = os.getcwd()
7
  sys.path.append(now_dir)
8
 
9
  from rvc.configs.config import Config
10
- from rvc.lib.tools.validators import (
11
- validate_sampling_rate,
12
- validate_f0up_key,
13
- validate_f0method,
14
- validate_true_false,
15
- validate_tts_voices,
16
- )
17
 
 
18
  from rvc.train.extract.preparing_files import generate_config, generate_filelist
19
  from rvc.lib.tools.pretrained_selector import pretrained_selector
20
 
21
- from rvc.lib.process.model_fusion import model_fusion
22
- from rvc.lib.process.model_information import model_information
 
23
 
24
  config = Config()
25
  current_script_directory = os.path.dirname(os.path.realpath(__file__))
26
  logs_path = os.path.join(current_script_directory, "logs")
27
- subprocess.run(
28
- ["python", os.path.join("rvc", "lib", "tools", "prerequisites_download.py")]
29
- )
 
 
 
30
 
31
 
32
  # Infer
@@ -34,31 +41,41 @@ def run_infer_script(
34
  f0up_key,
35
  filter_radius,
36
  index_rate,
 
 
37
  hop_length,
38
  f0method,
39
  input_path,
40
  output_path,
41
- pth_file,
42
  index_path,
43
  split_audio,
 
 
 
 
44
  ):
45
- infer_script_path = os.path.join("rvc", "infer", "infer.py")
46
- command = [
47
- "python",
48
- infer_script_path,
49
- str(f0up_key),
50
- str(filter_radius),
51
- str(index_rate),
52
- str(hop_length),
53
  f0method,
54
  input_path,
55
  output_path,
56
- pth_file,
57
  index_path,
58
- str(split_audio),
59
- ]
60
- subprocess.run(command)
61
- return f"File {input_path} inferred successfully.", output_path
 
 
 
 
 
62
 
63
 
64
  # Batch infer
@@ -66,16 +83,20 @@ def run_batch_infer_script(
66
  f0up_key,
67
  filter_radius,
68
  index_rate,
 
 
69
  hop_length,
70
  f0method,
71
  input_folder,
72
  output_folder,
73
- pth_file,
74
  index_path,
75
  split_audio,
 
 
 
 
76
  ):
77
- infer_script_path = os.path.join("rvc", "infer", "infer.py")
78
-
79
  audio_files = [
80
  f for f in os.listdir(input_folder) if f.endswith((".mp3", ".wav", ".flac"))
81
  ]
@@ -93,21 +114,24 @@ def run_batch_infer_script(
93
  )
94
  print(f"Inferring {input_path}...")
95
 
96
- command = [
97
- "python",
98
- infer_script_path,
99
- str(f0up_key),
100
- str(filter_radius),
101
- str(index_rate),
102
- str(hop_length),
103
- f0method,
104
- input_path,
105
- output_path,
106
- pth_file,
107
- index_path,
108
- str(split_audio),
109
- ]
110
- subprocess.run(command)
 
 
 
111
 
112
  return f"Files from {input_folder} inferred successfully."
113
 
@@ -119,15 +143,21 @@ def run_tts_script(
119
  f0up_key,
120
  filter_radius,
121
  index_rate,
 
 
122
  hop_length,
123
  f0method,
124
  output_tts_path,
125
  output_rvc_path,
126
- pth_file,
127
  index_path,
 
 
 
 
 
128
  ):
129
  tts_script_path = os.path.join("rvc", "lib", "tools", "tts.py")
130
- infer_script_path = os.path.join("rvc", "infer", "infer.py")
131
 
132
  if os.path.exists(output_tts_path):
133
  os.remove(output_tts_path)
@@ -139,23 +169,30 @@ def run_tts_script(
139
  tts_voice,
140
  output_tts_path,
141
  ]
 
142
 
143
- command_infer = [
144
- "python",
145
- infer_script_path,
146
- str(f0up_key),
147
- str(filter_radius),
148
- str(index_rate),
149
- str(hop_length),
150
  f0method,
151
  output_tts_path,
152
  output_rvc_path,
153
- pth_file,
154
  index_path,
155
- ]
156
- subprocess.run(command_tts)
157
- subprocess.run(command_infer)
158
- return f"Text {tts_text} synthesized successfully.", output_rvc_path
 
 
 
 
 
 
159
 
160
 
161
  # Preprocess
@@ -165,20 +202,25 @@ def run_preprocess_script(model_name, dataset_path, sampling_rate):
165
  command = [
166
  "python",
167
  preprocess_script_path,
168
- os.path.join(logs_path, str(model_name)),
169
- dataset_path,
170
- str(sampling_rate),
171
- str(per),
 
172
  ]
173
 
174
- os.makedirs(os.path.join(logs_path, str(model_name)), exist_ok=True)
175
  subprocess.run(command)
176
  return f"Model {model_name} preprocessed successfully."
177
 
178
 
179
  # Extract
180
  def run_extract_script(model_name, rvc_version, f0method, hop_length, sampling_rate):
181
- model_path = os.path.join(logs_path, str(model_name))
182
  extract_f0_script_path = os.path.join(
183
  "rvc", "train", "extract", "extract_f0_print.py"
184
  )
@@ -189,20 +231,30 @@ def run_extract_script(model_name, rvc_version, f0method, hop_length, sampling_r
189
  command_1 = [
190
  "python",
191
  extract_f0_script_path,
192
- model_path,
193
- f0method,
194
- str(hop_length),
 
195
  ]
196
  command_2 = [
197
  "python",
198
  extract_feature_script_path,
199
- config.device,
200
- "1",
201
- "0",
202
- "0",
203
- model_path,
204
- rvc_version,
205
- "True",
 
 
 
 
 
206
  ]
207
  subprocess.run(command_1)
208
  subprocess.run(command_2)
@@ -224,6 +276,8 @@ def run_train_script(
224
  batch_size,
225
  gpu,
226
  pitch_guidance,
 
 
227
  pretrained,
228
  custom_pretrained,
229
  g_pretrained_path=None,
@@ -232,6 +286,7 @@ def run_train_script(
232
  f0 = 1 if str(pitch_guidance) == "True" else 0
233
  latest = 1 if str(save_only_latest) == "True" else 0
234
  save_every = 1 if str(save_every_weights) == "True" else 0
 
235
 
236
  if str(pretrained) == "True":
237
  if str(custom_pretrained) == "False":
@@ -248,33 +303,42 @@ def run_train_script(
248
  train_script_path = os.path.join("rvc", "train", "train.py")
249
  command = [
250
  "python",
251
- str(train_script_path),
252
- "-se",
253
- str(save_every_epoch),
254
- "-te",
255
- str(total_epoch),
256
- "-pg",
257
- str(pg),
258
- "-pd",
259
- str(pd),
260
- "-sr",
261
- str(sampling_rate),
262
- "-bs",
263
- str(batch_size),
264
- "-g",
265
- str(gpu),
266
- "-e",
267
- os.path.join(logs_path, str(model_name)),
268
- "-v",
269
- str(rvc_version),
270
- "-l",
271
- str(latest),
272
- "-c",
273
- "0",
274
- "-sw",
275
- str(save_every),
276
- "-f0",
277
- str(f0),
 
278
  ]
279
 
280
  subprocess.run(command)
@@ -284,11 +348,11 @@ def run_train_script(
284
 
285
  # Index
286
  def run_index_script(model_name, rvc_version):
287
- index_script_path = os.path.join("rvc", "train", "index_generator.py")
288
  command = [
289
  "python",
290
  index_script_path,
291
- os.path.join(logs_path, str(model_name)),
292
  rvc_version,
293
  ]
294
 
@@ -296,38 +360,66 @@ def run_index_script(model_name, rvc_version):
296
  return f"Index file for {model_name} generated successfully."
297
 
298
 
 
 
 
 
 
 
 
 
 
 
 
299
  # Model information
300
  def run_model_information_script(pth_path):
301
  print(model_information(pth_path))
302
 
303
 
304
- # Model fusion
305
- def run_model_fusion_script(model_name, pth_path_1, pth_path_2):
306
- model_fusion(model_name, pth_path_1, pth_path_2)
 
307
 
308
 
309
  # Tensorboard
310
  def run_tensorboard_script():
311
- tensorboard_script_path = os.path.join(
312
- "rvc", "lib", "tools", "launch_tensorboard.py"
313
- )
314
- command = [
315
- "python",
316
- tensorboard_script_path,
317
- ]
318
- subprocess.run(command)
319
 
320
 
321
  # Download
322
  def run_download_script(model_link):
323
- download_script_path = os.path.join("rvc", "lib", "tools", "model_download.py")
 
324
  command = [
325
- "python",
326
- download_script_path,
327
- model_link,
 
 
 
328
  ]
329
  subprocess.run(command)
330
- return f"Model downloaded successfully."
331
 
332
 
333
  # Parse arguments
@@ -342,48 +434,108 @@ def parse_arguments():
342
  # Parser for 'infer' mode
343
  infer_parser = subparsers.add_parser("infer", help="Run inference")
344
  infer_parser.add_argument(
345
- "f0up_key",
346
- type=validate_f0up_key,
347
- help="Value for f0up_key (-24 to +24)",
 
 
348
  )
349
  infer_parser.add_argument(
350
- "filter_radius",
351
  type=str,
352
- help="Value for filter_radius (0 to 10)",
 
 
353
  )
354
  infer_parser.add_argument(
355
- "index_rate",
356
  type=str,
357
- help="Value for index_rate (0.0 to 1)",
 
 
358
  )
359
  infer_parser.add_argument(
360
- "hop_length",
361
  type=str,
362
- help="Value for hop_length (1 to 512)",
 
 
363
  )
364
  infer_parser.add_argument(
365
- "f0method",
366
- type=validate_f0method,
367
- help="Value for f0method (pm, dio, crepe, crepe-tiny, harvest, rmvpe)",
 
 
368
  )
369
  infer_parser.add_argument(
370
- "input_path", type=str, help="Input path (enclose in double quotes)"
 
 
 
 
371
  )
372
  infer_parser.add_argument(
373
- "output_path", type=str, help="Output path (enclose in double quotes)"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
374
  )
375
  infer_parser.add_argument(
376
- "pth_file", type=str, help="Path to the .pth file (enclose in double quotes)"
 
 
 
 
377
  )
378
  infer_parser.add_argument(
379
- "index_path",
380
  type=str,
381
- help="Path to the .index file (enclose in double quotes)",
 
 
382
  )
383
  infer_parser.add_argument(
384
- "split_audio",
385
  type=str,
386
- help="Enable split audio ( better results )",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
387
  )
388
 
389
  # Parser for 'batch_infer' mode
@@ -391,229 +543,454 @@ def parse_arguments():
391
  "batch_infer", help="Run batch inference"
392
  )
393
  batch_infer_parser.add_argument(
394
- "f0up_key",
395
- type=validate_f0up_key,
396
- help="Value for f0up_key (-24 to +24)",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
397
  )
398
  batch_infer_parser.add_argument(
399
- "filter_radius",
400
  type=str,
401
- help="Value for filter_radius (0 to 10)",
 
 
402
  )
403
  batch_infer_parser.add_argument(
404
- "index_rate",
405
  type=str,
406
- help="Value for index_rate (0.0 to 1)",
 
 
407
  )
408
  batch_infer_parser.add_argument(
409
- "hop_length",
410
  type=str,
411
- help="Value for hop_length (1 to 512)",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
412
  )
413
  batch_infer_parser.add_argument(
414
- "f0method",
415
- type=validate_f0method,
416
- help="Value for f0method (pm, dio, crepe, crepe-tiny, harvest, rmvpe)",
417
  )
418
  batch_infer_parser.add_argument(
419
- "input_folder", type=str, help="Input folder (enclose in double quotes)"
 
 
 
 
420
  )
421
  batch_infer_parser.add_argument(
422
- "output_folder", type=str, help="Output folder (enclose in double quotes)"
 
 
 
 
423
  )
424
  batch_infer_parser.add_argument(
425
- "pth_file", type=str, help="Path to the .pth file (enclose in double quotes)"
 
 
 
 
426
  )
427
  batch_infer_parser.add_argument(
428
- "index_path",
429
  type=str,
430
- help="Path to the .index file (enclose in double quotes)",
 
 
 
 
 
 
 
 
 
431
  )
432
 
433
  # Parser for 'tts' mode
434
  tts_parser = subparsers.add_parser("tts", help="Run TTS")
435
  tts_parser.add_argument(
436
- "tts_text",
437
  type=str,
438
- help="Text to be synthesized (enclose in double quotes)",
439
  )
440
  tts_parser.add_argument(
441
- "tts_voice",
442
- type=validate_tts_voices,
443
- help="Voice to be used (enclose in double quotes)",
 
444
  )
445
  tts_parser.add_argument(
446
- "f0up_key",
447
- type=validate_f0up_key,
448
- help="Value for f0up_key (-24 to +24)",
 
 
449
  )
450
  tts_parser.add_argument(
451
- "filter_radius",
452
  type=str,
453
- help="Value for filter_radius (0 to 10)",
 
 
454
  )
455
  tts_parser.add_argument(
456
- "index_rate",
457
  type=str,
458
- help="Value for index_rate (0.0 to 1)",
 
 
459
  )
460
  tts_parser.add_argument(
461
- "hop_length",
462
  type=str,
463
- help="Value for hop_length (1 to 512)",
 
 
464
  )
465
  tts_parser.add_argument(
466
- "f0method",
467
- type=validate_f0method,
468
- help="Value for f0method (pm, dio, crepe, crepe-tiny, harvest, rmvpe)",
 
 
469
  )
470
  tts_parser.add_argument(
471
- "output_tts_path", type=str, help="Output tts path (enclose in double quotes)"
 
 
 
 
472
  )
473
  tts_parser.add_argument(
474
- "output_rvc_path", type=str, help="Output rvc path (enclose in double quotes)"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
475
  )
476
  tts_parser.add_argument(
477
- "pth_file", type=str, help="Path to the .pth file (enclose in double quotes)"
 
 
 
 
478
  )
479
  tts_parser.add_argument(
480
- "index_path",
481
  type=str,
482
- help="Path to the .index file (enclose in double quotes)",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
483
  )
484
 
485
  # Parser for 'preprocess' mode
486
  preprocess_parser = subparsers.add_parser("preprocess", help="Run preprocessing")
 
487
  preprocess_parser.add_argument(
488
- "model_name", type=str, help="Name of the model (enclose in double quotes)"
489
- )
490
- preprocess_parser.add_argument(
491
- "dataset_path",
492
  type=str,
493
- help="Path to the dataset (enclose in double quotes)",
494
  )
495
  preprocess_parser.add_argument(
496
- "sampling_rate",
497
- type=validate_sampling_rate,
498
- help="Sampling rate (32000, 40000 or 48000)",
 
499
  )
500
 
501
  # Parser for 'extract' mode
502
  extract_parser = subparsers.add_parser("extract", help="Run extract")
503
  extract_parser.add_argument(
504
- "model_name",
505
  type=str,
506
- help="Name of the model (enclose in double quotes)",
507
  )
508
  extract_parser.add_argument(
509
- "rvc_version",
510
  type=str,
511
- help="Version of the model (v1 or v2)",
 
 
512
  )
513
  extract_parser.add_argument(
514
- "f0method",
515
- type=validate_f0method,
516
- help="Value for f0method (pm, dio, crepe, crepe-tiny, mangio-crepe, mangio-crepe-tiny, harvest, rmvpe)",
 
 
 
 
 
 
 
 
 
517
  )
518
  extract_parser.add_argument(
519
- "hop_length",
520
  type=str,
521
- help="Value for hop_length (1 to 512)",
 
 
522
  )
523
  extract_parser.add_argument(
524
- "sampling_rate",
525
- type=validate_sampling_rate,
526
- help="Sampling rate (32000, 40000 or 48000)",
 
527
  )
528
 
529
  # Parser for 'train' mode
530
  train_parser = subparsers.add_parser("train", help="Run training")
531
  train_parser.add_argument(
532
- "model_name",
533
  type=str,
534
- help="Name of the model (enclose in double quotes)",
535
  )
536
  train_parser.add_argument(
537
- "rvc_version",
538
  type=str,
539
- help="Version of the model (v1 or v2)",
 
 
540
  )
541
  train_parser.add_argument(
542
- "save_every_epoch",
543
  type=str,
544
  help="Save every epoch",
 
545
  )
546
  train_parser.add_argument(
547
- "save_only_latest",
548
  type=str,
549
  help="Save weight only at last epoch",
 
 
550
  )
551
  train_parser.add_argument(
552
- "save_every_weights",
553
  type=str,
554
  help="Save weight every epoch",
 
 
555
  )
556
  train_parser.add_argument(
557
- "total_epoch",
558
  type=str,
559
  help="Total epoch",
 
 
560
  )
561
  train_parser.add_argument(
562
- "sampling_rate",
563
- type=validate_sampling_rate,
564
- help="Sampling rate (32000, 40000, or 48000)",
 
565
  )
566
  train_parser.add_argument(
567
- "batch_size",
568
  type=str,
569
  help="Batch size",
 
 
570
  )
571
  train_parser.add_argument(
572
- "gpu",
573
  type=str,
574
- help="GPU number (0 to 10 separated by -)",
 
 
575
  )
576
  train_parser.add_argument(
577
- "pitch_guidance",
578
- type=validate_true_false,
579
- help="Pitch guidance (True or False)",
 
 
580
  )
581
  train_parser.add_argument(
582
- "pretrained",
583
- type=validate_true_false,
584
- help="Pretrained (True or False)",
 
 
585
  )
586
  train_parser.add_argument(
587
- "custom_pretrained",
588
- type=validate_true_false,
589
- help="Custom pretrained (True or False)",
 
 
590
  )
591
  train_parser.add_argument(
592
- "g_pretrained_path",
593
  type=str,
594
  nargs="?",
595
  default=None,
596
- help="Path to the pretrained G file (enclose in double quotes)",
597
  )
598
  train_parser.add_argument(
599
- "d_pretrained_path",
600
  type=str,
601
  nargs="?",
602
  default=None,
603
- help="Path to the pretrained D file (enclose in double quotes)",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
604
  )
605
 
606
  # Parser for 'index' mode
607
  index_parser = subparsers.add_parser("index", help="Generate index file")
608
  index_parser.add_argument(
609
- "model_name",
610
  type=str,
611
- help="Name of the model (enclose in double quotes)",
612
  )
613
  index_parser.add_argument(
614
- "rvc_version",
615
  type=str,
616
- help="Version of the model (v1 or v2)",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
617
  )
618
 
619
  # Parser for 'model_information' mode
@@ -621,27 +998,36 @@ def parse_arguments():
621
  "model_information", help="Print model information"
622
  )
623
  model_information_parser.add_argument(
624
- "pth_path",
625
  type=str,
626
- help="Path to the .pth file (enclose in double quotes)",
627
  )
628
 
629
- # Parser for 'model_fusion' mode
630
- model_fusion_parser = subparsers.add_parser("model_fusion", help="Fuse two models")
631
- model_fusion_parser.add_argument(
632
- "model_name",
 
 
 
 
 
 
 
633
  type=str,
634
- help="Name of the model (enclose in double quotes)",
635
  )
636
- model_fusion_parser.add_argument(
637
- "pth_path_1",
638
  type=str,
639
- help="Path to the first .pth file (enclose in double quotes)",
640
  )
641
- model_fusion_parser.add_argument(
642
- "pth_path_2",
643
  type=str,
644
- help="Path to the second .pth file (enclose in double quotes)",
 
 
645
  )
646
 
647
  # Parser for 'tensorboard' mode
@@ -650,11 +1036,57 @@ def parse_arguments():
650
  # Parser for 'download' mode
651
  download_parser = subparsers.add_parser("download", help="Download models")
652
  download_parser.add_argument(
653
- "model_link",
 
 
 
 
 
 
 
 
 
 
654
  type=str,
655
- help="Link of the model (enclose in double quotes)",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
656
  )
657
 
 
 
 
 
 
658
  return parser.parse_args()
659
 
660
 
@@ -668,95 +1100,143 @@ def main():
668
  try:
669
  if args.mode == "infer":
670
  run_infer_script(
671
- args.f0up_key,
672
- args.filter_radius,
673
- args.index_rate,
674
- args.hop_length,
675
- args.f0method,
676
- args.input_path,
677
- args.output_path,
678
- args.pth_file,
679
- args.index_path,
680
- args.split_audio,
 
681
  )
682
  elif args.mode == "batch_infer":
683
  run_batch_infer_script(
684
- args.f0up_key,
685
- args.filter_radius,
686
- args.index_rate,
687
- args.hop_length,
688
- args.f0method,
689
- args.input_folder,
690
- args.output_folder,
691
- args.pth_file,
692
- args.index_path,
693
  )
694
  elif args.mode == "tts":
695
  run_tts_script(
696
- args.tts_text,
697
- args.tts_voice,
698
- args.f0up_key,
699
- args.filter_radius,
700
- args.index_rate,
701
- args.hop_length,
702
- args.f0method,
703
- args.output_tts_path,
704
- args.output_rvc_path,
705
- args.pth_file,
706
- args.index_path,
 
 
 
 
 
 
 
707
  )
708
  elif args.mode == "preprocess":
709
  run_preprocess_script(
710
- args.model_name,
711
- args.dataset_path,
712
  str(args.sampling_rate),
713
  )
714
-
715
  elif args.mode == "extract":
716
  run_extract_script(
717
- args.model_name,
718
- args.rvc_version,
719
- args.f0method,
720
- args.hop_length,
721
- args.sampling_rate,
722
  )
723
  elif args.mode == "train":
724
  run_train_script(
725
- args.model_name,
726
- args.rvc_version,
727
- args.save_every_epoch,
728
- args.save_only_latest,
729
- args.save_every_weights,
730
- args.total_epoch,
731
- args.sampling_rate,
732
- args.batch_size,
733
- args.gpu,
734
- args.pitch_guidance,
735
- args.pretrained,
736
- args.custom_pretrained,
737
- args.g_pretrained_path,
738
- args.d_pretrained_path,
 
 
739
  )
740
  elif args.mode == "index":
741
  run_index_script(
742
- args.model_name,
743
- args.rvc_version,
 
744
  )
745
  elif args.mode == "model_information":
746
  run_model_information_script(
747
- args.pth_path,
748
  )
749
- elif args.mode == "model_fusion":
750
- run_model_fusion_script(
751
- args.model_name,
752
- args.pth_path_1,
753
- args.pth_path_2,
 
754
  )
755
  elif args.mode == "tensorboard":
756
  run_tensorboard_script()
757
  elif args.mode == "download":
758
  run_download_script(
759
- args.model_link,
 
760
  )
761
  except Exception as error:
762
  print(f"Error: {error}")
 
1
  import os
2
  import sys
3
+ import json
4
  import argparse
5
  import subprocess
6
 
 
8
  sys.path.append(now_dir)
9
 
10
  from rvc.configs.config import Config
 
 
 
 
 
 
 
11
 
12
+ from rvc.lib.tools.prerequisites_download import prequisites_download_pipeline
13
  from rvc.train.extract.preparing_files import generate_config, generate_filelist
14
  from rvc.lib.tools.pretrained_selector import pretrained_selector
15
 
16
+ from rvc.train.process.model_blender import model_blender
17
+ from rvc.train.process.model_information import model_information
18
+ from rvc.train.process.extract_small_model import extract_small_model
19
+
20
+ from rvc.infer.infer import infer_pipeline
21
+
22
+ from rvc.lib.tools.analyzer import analyze_audio
23
+
24
+ from rvc.lib.tools.launch_tensorboard import launch_tensorboard_pipeline
25
+
26
+ from rvc.lib.tools.model_download import model_download_pipeline
27
 
28
  config = Config()
29
  current_script_directory = os.path.dirname(os.path.realpath(__file__))
30
  logs_path = os.path.join(current_script_directory, "logs")
31
+
32
+ # Get TTS Voices
33
+ with open(os.path.join("rvc", "lib", "tools", "tts_voices.json"), "r") as f:
34
+ voices_data = json.load(f)
35
+
36
+ locales = list({voice["Locale"] for voice in voices_data})
37
 
38
 
39
  # Infer
 
41
  f0up_key,
42
  filter_radius,
43
  index_rate,
44
+ rms_mix_rate,
45
+ protect,
46
  hop_length,
47
  f0method,
48
  input_path,
49
  output_path,
50
+ pth_path,
51
  index_path,
52
  split_audio,
53
+ f0autotune,
54
+ clean_audio,
55
+ clean_strength,
56
+ export_format,
57
  ):
58
+ infer_pipeline(
59
+ f0up_key,
60
+ filter_radius,
61
+ index_rate,
62
+ rms_mix_rate,
63
+ protect,
64
+ hop_length,
 
65
  f0method,
66
  input_path,
67
  output_path,
68
+ pth_path,
69
  index_path,
70
+ split_audio,
71
+ f0autotune,
72
+ clean_audio,
73
+ clean_strength,
74
+ export_format,
75
+ )
76
+ return f"File {input_path} inferred successfully.", output_path.replace(
77
+ ".wav", f".{export_format.lower()}"
78
+ )
79
 
80
 
81
  # Batch infer
 
83
  f0up_key,
84
  filter_radius,
85
  index_rate,
86
+ rms_mix_rate,
87
+ protect,
88
  hop_length,
89
  f0method,
90
  input_folder,
91
  output_folder,
92
+ pth_path,
93
  index_path,
94
  split_audio,
95
+ f0autotune,
96
+ clean_audio,
97
+ clean_strength,
98
+ export_format,
99
  ):
 
 
100
  audio_files = [
101
  f for f in os.listdir(input_folder) if f.endswith((".mp3", ".wav", ".flac"))
102
  ]
 
114
  )
115
  print(f"Inferring {input_path}...")
116
 
117
+ infer_pipeline(
118
+ f0up_key,
119
+ filter_radius,
120
+ index_rate,
121
+ rms_mix_rate,
122
+ protect,
123
+ hop_length,
124
+ f0method,
125
+ input_path,
126
+ output_path,
127
+ pth_path,
128
+ index_path,
129
+ split_audio,
130
+ f0autotune,
131
+ clean_audio,
132
+ clean_strength,
133
+ export_format,
134
+ )
135
 
136
  return f"Files from {input_folder} inferred successfully."
137
 
 
143
  f0up_key,
144
  filter_radius,
145
  index_rate,
146
+ rms_mix_rate,
147
+ protect,
148
  hop_length,
149
  f0method,
150
  output_tts_path,
151
  output_rvc_path,
152
+ pth_path,
153
  index_path,
154
+ split_audio,
155
+ f0autotune,
156
+ clean_audio,
157
+ clean_strength,
158
+ export_format,
159
  ):
160
  tts_script_path = os.path.join("rvc", "lib", "tools", "tts.py")
 
161
 
162
  if os.path.exists(output_tts_path):
163
  os.remove(output_tts_path)
 
169
  tts_voice,
170
  output_tts_path,
171
  ]
172
+ subprocess.run(command_tts)
173
 
174
+ infer_pipeline(
175
+ f0up_key,
176
+ filter_radius,
177
+ index_rate,
178
+ rms_mix_rate,
179
+ protect,
180
+ hop_length,
181
  f0method,
182
  output_tts_path,
183
  output_rvc_path,
184
+ pth_path,
185
  index_path,
186
+ split_audio,
187
+ f0autotune,
188
+ clean_audio,
189
+ clean_strength,
190
+ export_format,
191
+ )
192
+
193
+ return f"Text {tts_text} synthesized successfully.", output_rvc_path.replace(
194
+ ".wav", f".{export_format.lower()}"
195
+ )
196
 
197
 
198
  # Preprocess
 
202
  command = [
203
  "python",
204
  preprocess_script_path,
205
+ *map(
206
+ str,
207
+ [
208
+ os.path.join(logs_path, model_name),
209
+ dataset_path,
210
+ sampling_rate,
211
+ per,
212
+ ],
213
+ ),
214
  ]
215
 
216
+ os.makedirs(os.path.join(logs_path, model_name), exist_ok=True)
217
  subprocess.run(command)
218
  return f"Model {model_name} preprocessed successfully."
219
 
220
 
221
  # Extract
222
  def run_extract_script(model_name, rvc_version, f0method, hop_length, sampling_rate):
223
+ model_path = os.path.join(logs_path, model_name)
224
  extract_f0_script_path = os.path.join(
225
  "rvc", "train", "extract", "extract_f0_print.py"
226
  )
 
231
  command_1 = [
232
  "python",
233
  extract_f0_script_path,
234
+ *map(
235
+ str,
236
+ [
237
+ model_path,
238
+ f0method,
239
+ hop_length,
240
+ ],
241
+ ),
242
  ]
243
  command_2 = [
244
  "python",
245
  extract_feature_script_path,
246
+ *map(
247
+ str,
248
+ [
249
+ config.device,
250
+ "1",
251
+ "0",
252
+ "0",
253
+ model_path,
254
+ rvc_version,
255
+ "True",
256
+ ],
257
+ ),
258
  ]
259
  subprocess.run(command_1)
260
  subprocess.run(command_2)
 
276
  batch_size,
277
  gpu,
278
  pitch_guidance,
279
+ overtraining_detector,
280
+ overtraining_threshold,
281
  pretrained,
282
  custom_pretrained,
283
  g_pretrained_path=None,
 
286
  f0 = 1 if str(pitch_guidance) == "True" else 0
287
  latest = 1 if str(save_only_latest) == "True" else 0
288
  save_every = 1 if str(save_every_weights) == "True" else 0
289
+ detector = 1 if str(overtraining_detector) == "True" else 0
290
 
291
  if str(pretrained) == "True":
292
  if str(custom_pretrained) == "False":
 
303
  train_script_path = os.path.join("rvc", "train", "train.py")
304
  command = [
305
  "python",
306
+ train_script_path,
307
+ *map(
308
+ str,
309
+ [
310
+ "-se",
311
+ save_every_epoch,
312
+ "-te",
313
+ total_epoch,
314
+ "-pg",
315
+ pg,
316
+ "-pd",
317
+ pd,
318
+ "-sr",
319
+ sampling_rate,
320
+ "-bs",
321
+ batch_size,
322
+ "-g",
323
+ gpu,
324
+ "-e",
325
+ os.path.join(logs_path, model_name),
326
+ "-v",
327
+ rvc_version,
328
+ "-l",
329
+ latest,
330
+ "-c",
331
+ "0",
332
+ "-sw",
333
+ save_every,
334
+ "-f0",
335
+ f0,
336
+ "-od",
337
+ detector,
338
+ "-ot",
339
+ overtraining_threshold,
340
+ ],
341
+ ),
342
  ]
343
 
344
  subprocess.run(command)
 
348
 
349
  # Index
350
  def run_index_script(model_name, rvc_version):
351
+ index_script_path = os.path.join("rvc", "train", "process", "extract_index.py")
352
  command = [
353
  "python",
354
  index_script_path,
355
+ os.path.join(logs_path, model_name),
356
  rvc_version,
357
  ]
358
 
 
360
  return f"Index file for {model_name} generated successfully."
361
 
362
 
363
+ # Model extract
364
+ def run_model_extract_script(
365
+ pth_path, model_name, sampling_rate, pitch_guidance, rvc_version, epoch, step
366
+ ):
367
+ f0 = 1 if str(pitch_guidance) == "True" else 0
368
+ extract_small_model(
369
+ pth_path, model_name, sampling_rate, f0, rvc_version, epoch, step
370
+ )
371
+ return f"Model {model_name} extracted successfully."
372
+
373
+
374
  # Model information
375
  def run_model_information_script(pth_path):
376
  print(model_information(pth_path))
377
 
378
 
379
+ # Model blender
380
+ def run_model_blender_script(model_name, pth_path_1, pth_path_2, ratio):
381
+ message, model_blended = model_blender(model_name, pth_path_1, pth_path_2, ratio)
382
+ return message, model_blended
383
 
384
 
385
  # Tensorboard
386
  def run_tensorboard_script():
387
+ launch_tensorboard_pipeline()
 
 
 
 
 
 
 
388
 
389
 
390
  # Download
391
  def run_download_script(model_link):
392
+ model_download_pipeline(model_link)
393
+ return f"Model downloaded successfully."
394
+
395
+
396
+ # Prerequisites
397
+ def run_prerequisites_script(pretraineds_v1, pretraineds_v2, models, exe):
398
+ prequisites_download_pipeline(pretraineds_v1, pretraineds_v2, models, exe)
399
+ return "Prerequisites installed successfully."
400
+
401
+
402
+ # Audio analyzer
403
+ def run_audio_analyzer_script(input_path, save_plot_path="logs/audio_analysis.png"):
404
+ audio_info, plot_path = analyze_audio(input_path, save_plot_path)
405
+ print(
406
+ f"Audio info of {input_path}: {audio_info}",
407
+ f"Audio file {input_path} analyzed successfully. Plot saved at: {plot_path}",
408
+ )
409
+ return audio_info, plot_path
410
+
411
+
412
+ # API
413
+ def run_api_script(ip, port):
414
  command = [
415
+ "env/Scripts/uvicorn.exe" if os.name == "nt" else "uvicorn",
416
+ "api:app",
417
+ "--host",
418
+ ip,
419
+ "--port",
420
+ port,
421
  ]
422
  subprocess.run(command)
 
423
 
424
 
425
  # Parse arguments
 
434
  # Parser for 'infer' mode
435
  infer_parser = subparsers.add_parser("infer", help="Run inference")
436
  infer_parser.add_argument(
437
+ "--f0up_key",
438
+ type=str,
439
+ help="Value for f0up_key",
440
+ choices=[str(i) for i in range(-24, 25)],
441
+ default="0",
442
  )
443
  infer_parser.add_argument(
444
+ "--filter_radius",
445
  type=str,
446
+ help="Value for filter_radius",
447
+ choices=[str(i) for i in range(11)],
448
+ default="3",
449
  )
450
  infer_parser.add_argument(
451
+ "--index_rate",
452
  type=str,
453
+ help="Value for index_rate",
454
+ choices=[str(i / 10) for i in range(11)],
455
+ default="0.3",
456
  )
457
  infer_parser.add_argument(
458
+ "--rms_mix_rate",
459
  type=str,
460
+ help="Value for rms_mix_rate",
461
+ choices=[str(i / 10) for i in range(11)],
462
+ default="1",
463
  )
464
  infer_parser.add_argument(
465
+ "--protect",
466
+ type=str,
467
+ help="Value for protect",
468
+ choices=[str(i / 10) for i in range(6)],
469
+ default="0.33",
470
  )
471
  infer_parser.add_argument(
472
+ "--hop_length",
473
+ type=str,
474
+ help="Value for hop_length",
475
+ choices=[str(i) for i in range(1, 513)],
476
+ default="128",
477
  )
478
  infer_parser.add_argument(
479
+ "--f0method",
480
+ type=str,
481
+ help="Value for f0method",
482
+ choices=[
483
+ "pm",
484
+ "harvest",
485
+ "dio",
486
+ "crepe",
487
+ "crepe-tiny",
488
+ "rmvpe",
489
+ "fcpe",
490
+ "hybrid[crepe+rmvpe]",
491
+ "hybrid[crepe+fcpe]",
492
+ "hybrid[rmvpe+fcpe]",
493
+ "hybrid[crepe+rmvpe+fcpe]",
494
+ ],
495
+ default="rmvpe",
496
+ )
497
+ infer_parser.add_argument("--input_path", type=str, help="Input path")
498
+ infer_parser.add_argument("--output_path", type=str, help="Output path")
499
+ infer_parser.add_argument("--pth_path", type=str, help="Path to the .pth file")
500
+ infer_parser.add_argument(
501
+ "--index_path",
502
+ type=str,
503
+ help="Path to the .index file",
504
  )
505
  infer_parser.add_argument(
506
+ "--split_audio",
507
+ type=str,
508
+ help="Enable split audio",
509
+ choices=["True", "False"],
510
+ default="False",
511
  )
512
  infer_parser.add_argument(
513
+ "--f0autotune",
514
  type=str,
515
+ help="Enable autotune",
516
+ choices=["True", "False"],
517
+ default="False",
518
  )
519
  infer_parser.add_argument(
520
+ "--clean_audio",
521
  type=str,
522
+ help="Enable clean audio",
523
+ choices=["True", "False"],
524
+ default="False",
525
+ )
526
+ infer_parser.add_argument(
527
+ "--clean_strength",
528
+ type=str,
529
+ help="Value for clean_strength",
530
+ choices=[str(i / 10) for i in range(11)],
531
+ default="0.7",
532
+ )
533
+ infer_parser.add_argument(
534
+ "--export_format",
535
+ type=str,
536
+ help="Export format",
537
+ choices=["WAV", "MP3", "FLAC", "OGG", "M4A"],
538
+ default="WAV",
539
  )
540
 
541
  # Parser for 'batch_infer' mode
 
543
  "batch_infer", help="Run batch inference"
544
  )
545
  batch_infer_parser.add_argument(
546
+ "--f0up_key",
547
+ type=str,
548
+ help="Value for f0up_key",
549
+ choices=[str(i) for i in range(-24, 25)],
550
+ default="0",
551
+ )
552
+ batch_infer_parser.add_argument(
553
+ "--filter_radius",
554
+ type=str,
555
+ help="Value for filter_radius",
556
+ choices=[str(i) for i in range(11)],
557
+ default="3",
558
+ )
559
+ batch_infer_parser.add_argument(
560
+ "--index_rate",
561
+ type=str,
562
+ help="Value for index_rate",
563
+ choices=[str(i / 10) for i in range(11)],
564
+ default="0.3",
565
+ )
566
+ batch_infer_parser.add_argument(
567
+ "--rms_mix_rate",
568
+ type=str,
569
+ help="Value for rms_mix_rate",
570
+ choices=[str(i / 10) for i in range(11)],
571
+ default="1",
572
  )
573
  batch_infer_parser.add_argument(
574
+ "--protect",
575
  type=str,
576
+ help="Value for protect",
577
+ choices=[str(i / 10) for i in range(6)],
578
+ default="0.33",
579
  )
580
  batch_infer_parser.add_argument(
581
+ "--hop_length",
582
  type=str,
583
+ help="Value for hop_length",
584
+ choices=[str(i) for i in range(1, 513)],
585
+ default="128",
586
  )
587
  batch_infer_parser.add_argument(
588
+ "--f0method",
589
  type=str,
590
+ help="Value for f0method",
591
+ choices=[
592
+ "pm",
593
+ "harvest",
594
+ "dio",
595
+ "crepe",
596
+ "crepe-tiny",
597
+ "rmvpe",
598
+ "fcpe",
599
+ "hybrid[crepe+rmvpe]",
600
+ "hybrid[crepe+fcpe]",
601
+ "hybrid[rmvpe+fcpe]",
602
+ "hybrid[crepe+rmvpe+fcpe]",
603
+ ],
604
+ default="rmvpe",
605
+ )
606
+ batch_infer_parser.add_argument("--input_folder", type=str, help="Input folder")
607
+ batch_infer_parser.add_argument("--output_folder", type=str, help="Output folder")
608
+ batch_infer_parser.add_argument(
609
+ "--pth_path", type=str, help="Path to the .pth file"
610
  )
611
  batch_infer_parser.add_argument(
612
+ "--index_path",
613
+ type=str,
614
+ help="Path to the .index file",
615
  )
616
  batch_infer_parser.add_argument(
617
+ "--split_audio",
618
+ type=str,
619
+ help="Enable split audio",
620
+ choices=["True", "False"],
621
+ default="False",
622
  )
623
  batch_infer_parser.add_argument(
624
+ "--f0autotune",
625
+ type=str,
626
+ help="Enable autotune",
627
+ choices=["True", "False"],
628
+ default="False",
629
  )
630
  batch_infer_parser.add_argument(
631
+ "--clean_audio",
632
+ type=str,
633
+ help="Enable clean audio",
634
+ choices=["True", "False"],
635
+ default="False",
636
  )
637
  batch_infer_parser.add_argument(
638
+ "--clean_strength",
639
  type=str,
640
+ help="Value for clean_strength",
641
+ choices=[str(i / 10) for i in range(11)],
642
+ default="0.7",
643
+ )
644
+ batch_infer_parser.add_argument(
645
+ "--export_format",
646
+ type=str,
647
+ help="Export format",
648
+ choices=["WAV", "MP3", "FLAC", "OGG", "M4A"],
649
+ default="WAV",
650
  )
651
 
652
  # Parser for 'tts' mode
653
  tts_parser = subparsers.add_parser("tts", help="Run TTS")
654
  tts_parser.add_argument(
655
+ "--tts_text",
656
  type=str,
657
+ help="Text to be synthesized",
658
  )
659
  tts_parser.add_argument(
660
+ "--tts_voice",
661
+ type=str,
662
+ help="Voice to be used",
663
+ choices=locales,
664
  )
665
  tts_parser.add_argument(
666
+ "--f0up_key",
667
+ type=str,
668
+ help="Value for f0up_key",
669
+ choices=[str(i) for i in range(-24, 25)],
670
+ default="0",
671
  )
672
  tts_parser.add_argument(
673
+ "--filter_radius",
674
  type=str,
675
+ help="Value for filter_radius",
676
+ choices=[str(i) for i in range(11)],
677
+ default="3",
678
  )
679
  tts_parser.add_argument(
680
+ "--index_rate",
681
  type=str,
682
+ help="Value for index_rate",
683
+ choices=[str(i / 10) for i in range(11)],
684
+ default="0.3",
685
  )
686
  tts_parser.add_argument(
687
+ "--rms_mix_rate",
688
  type=str,
689
+ help="Value for rms_mix_rate",
690
+ choices=[str(i / 10) for i in range(11)],
691
+ default="1",
692
  )
693
  tts_parser.add_argument(
694
+ "--protect",
695
+ type=str,
696
+ help="Value for protect",
697
+ choices=[str(i / 10) for i in range(6)],
698
+ default="0.33",
699
  )
700
  tts_parser.add_argument(
701
+ "--hop_length",
702
+ type=str,
703
+ help="Value for hop_length",
704
+ choices=[str(i) for i in range(1, 513)],
705
+ default="128",
706
  )
707
  tts_parser.add_argument(
708
+ "--f0method",
709
+ type=str,
710
+ help="Value for f0method",
711
+ choices=[
712
+ "pm",
713
+ "harvest",
714
+ "dio",
715
+ "crepe",
716
+ "crepe-tiny",
717
+ "rmvpe",
718
+ "fcpe",
719
+ "hybrid[crepe+rmvpe]",
720
+ "hybrid[crepe+fcpe]",
721
+ "hybrid[rmvpe+fcpe]",
722
+ "hybrid[crepe+rmvpe+fcpe]",
723
+ ],
724
+ default="rmvpe",
725
+ )
726
+ tts_parser.add_argument("--output_tts_path", type=str, help="Output tts path")
727
+ tts_parser.add_argument("--output_rvc_path", type=str, help="Output rvc path")
728
+ tts_parser.add_argument("--pth_path", type=str, help="Path to the .pth file")
729
+ tts_parser.add_argument(
730
+ "--index_path",
731
+ type=str,
732
+ help="Path to the .index file",
733
  )
734
  tts_parser.add_argument(
735
+ "--split_audio",
736
+ type=str,
737
+ help="Enable split audio",
738
+ choices=["True", "False"],
739
+ default="False",
740
  )
741
  tts_parser.add_argument(
742
+ "--f0autotune",
743
  type=str,
744
+ help="Enable autotune",
745
+ choices=["True", "False"],
746
+ default="False",
747
+ )
748
+ tts_parser.add_argument(
749
+ "--clean_audio",
750
+ type=str,
751
+ help="Enable clean audio",
752
+ choices=["True", "False"],
753
+ default="False",
754
+ )
755
+ tts_parser.add_argument(
756
+ "--clean_strength",
757
+ type=str,
758
+ help="Value for clean_strength",
759
+ choices=[str(i / 10) for i in range(11)],
760
+ default="0.7",
761
+ )
762
+ tts_parser.add_argument(
763
+ "--export_format",
764
+ type=str,
765
+ help="Export format",
766
+ choices=["WAV", "MP3", "FLAC", "OGG", "M4A"],
767
+ default="WAV",
768
  )
769
 
770
  # Parser for 'preprocess' mode
771
  preprocess_parser = subparsers.add_parser("preprocess", help="Run preprocessing")
772
+ preprocess_parser.add_argument("--model_name", type=str, help="Name of the model")
773
  preprocess_parser.add_argument(
774
+ "--dataset_path",
 
 
 
775
  type=str,
776
+ help="Path to the dataset",
777
  )
778
  preprocess_parser.add_argument(
779
+ "--sampling_rate",
780
+ type=str,
781
+ help="Sampling rate",
782
+ choices=["32000", "40000", "48000"],
783
  )
784
 
785
  # Parser for 'extract' mode
786
  extract_parser = subparsers.add_parser("extract", help="Run extract")
787
  extract_parser.add_argument(
788
+ "--model_name",
789
  type=str,
790
+ help="Name of the model",
791
  )
792
  extract_parser.add_argument(
793
+ "--rvc_version",
794
  type=str,
795
+ help="Version of the model",
796
+ choices=["v1", "v2"],
797
+ default="v2",
798
  )
799
  extract_parser.add_argument(
800
+ "--f0method",
801
+ type=str,
802
+ help="Value for f0method",
803
+ choices=[
804
+ "pm",
805
+ "harvest",
806
+ "dio",
807
+ "crepe",
808
+ "crepe-tiny",
809
+ "rmvpe",
810
+ ],
811
+ default="rmvpe",
812
  )
813
  extract_parser.add_argument(
814
+ "--hop_length",
815
  type=str,
816
+ help="Value for hop_length",
817
+ choices=[str(i) for i in range(1, 513)],
818
+ default="128",
819
  )
820
  extract_parser.add_argument(
821
+ "--sampling_rate",
822
+ type=str,
823
+ help="Sampling rate",
824
+ choices=["32000", "40000", "48000"],
825
  )
826
 
827
  # Parser for 'train' mode
828
  train_parser = subparsers.add_parser("train", help="Run training")
829
  train_parser.add_argument(
830
+ "--model_name",
831
  type=str,
832
+ help="Name of the model",
833
  )
834
  train_parser.add_argument(
835
+ "--rvc_version",
836
  type=str,
837
+ help="Version of the model",
838
+ choices=["v1", "v2"],
839
+ default="v2",
840
  )
841
  train_parser.add_argument(
842
+ "--save_every_epoch",
843
  type=str,
844
  help="Save every epoch",
845
+ choices=[str(i) for i in range(1, 101)],
846
  )
847
  train_parser.add_argument(
848
+ "--save_only_latest",
849
  type=str,
850
  help="Save weight only at last epoch",
851
+ choices=["True", "False"],
852
+ default="False",
853
  )
854
  train_parser.add_argument(
855
+ "--save_every_weights",
856
  type=str,
857
  help="Save weight every epoch",
858
+ choices=["True", "False"],
859
+ default="True",
860
  )
861
  train_parser.add_argument(
862
+ "--total_epoch",
863
  type=str,
864
  help="Total epoch",
865
+ choices=[str(i) for i in range(1, 10001)],
866
+ default="1000",
867
  )
868
  train_parser.add_argument(
869
+ "--sampling_rate",
870
+ type=str,
871
+ help="Sampling rate",
872
+ choices=["32000", "40000", "48000"],
873
  )
874
  train_parser.add_argument(
875
+ "--batch_size",
876
  type=str,
877
  help="Batch size",
878
+ choices=[str(i) for i in range(1, 51)],
879
+ default="8",
880
  )
881
  train_parser.add_argument(
882
+ "--gpu",
883
  type=str,
884
+ help="GPU number",
885
+ choices=[str(i) for i in range(0, 11)],
886
+ default="0",
887
  )
888
  train_parser.add_argument(
889
+ "--pitch_guidance",
890
+ type=str,
891
+ help="Pitch guidance",
892
+ choices=["True", "False"],
893
+ default="True",
894
  )
895
  train_parser.add_argument(
896
+ "--pretrained",
897
+ type=str,
898
+ help="Pretrained",
899
+ choices=["True", "False"],
900
+ default="True",
901
  )
902
  train_parser.add_argument(
903
+ "--custom_pretrained",
904
+ type=str,
905
+ help="Custom pretrained",
906
+ choices=["True", "False"],
907
+ default="False",
908
  )
909
  train_parser.add_argument(
910
+ "--g_pretrained_path",
911
  type=str,
912
  nargs="?",
913
  default=None,
914
+ help="Path to the pretrained G file",
915
  )
916
  train_parser.add_argument(
917
+ "--d_pretrained_path",
918
  type=str,
919
  nargs="?",
920
  default=None,
921
+ help="Path to the pretrained D file",
922
+ )
923
+ train_parser.add_argument(
924
+ "--overtraining_detector",
925
+ type=str,
926
+ help="Overtraining detector",
927
+ choices=["True", "False"],
928
+ default="False",
929
+ )
930
+ train_parser.add_argument(
931
+ "--overtraining_threshold",
932
+ type=str,
933
+ help="Overtraining threshold",
934
+ choices=[str(i) for i in range(1, 101)],
935
+ default="50",
936
  )
937
 
938
  # Parser for 'index' mode
939
  index_parser = subparsers.add_parser("index", help="Generate index file")
940
  index_parser.add_argument(
941
+ "--model_name",
942
  type=str,
943
+ help="Name of the model",
944
  )
945
  index_parser.add_argument(
946
+ "--rvc_version",
947
  type=str,
948
+ help="Version of the model",
949
+ choices=["v1", "v2"],
950
+ default="v2",
951
+ )
952
+
953
+ # Parser for 'model_extract' mode
954
+ model_extract_parser = subparsers.add_parser("model_extract", help="Extract model")
955
+ model_extract_parser.add_argument(
956
+ "--pth_path",
957
+ type=str,
958
+ help="Path to the .pth file",
959
+ )
960
+ model_extract_parser.add_argument(
961
+ "--model_name",
962
+ type=str,
963
+ help="Name of the model",
964
+ )
965
+ model_extract_parser.add_argument(
966
+ "--sampling_rate",
967
+ type=str,
968
+ help="Sampling rate",
969
+ choices=["40000", "48000"],
970
+ )
971
+ model_extract_parser.add_argument(
972
+ "--pitch_guidance",
973
+ type=str,
974
+ help="Pitch guidance",
975
+ choices=["True", "False"],
976
+ )
977
+ model_extract_parser.add_argument(
978
+ "--rvc_version",
979
+ type=str,
980
+ help="Version of the model",
981
+ choices=["v1", "v2"],
982
+ default="v2",
983
+ )
984
+ model_extract_parser.add_argument(
985
+ "--epoch",
986
+ type=str,
987
+ help="Epochs of the model",
988
+ choices=[str(i) for i in range(1, 10001)],
989
+ )
990
+ model_extract_parser.add_argument(
991
+ "--step",
992
+ type=str,
993
+ help="Steps of the model",
994
  )
995
 
996
  # Parser for 'model_information' mode
 
998
  "model_information", help="Print model information"
999
  )
1000
  model_information_parser.add_argument(
1001
+ "--pth_path",
1002
  type=str,
1003
+ help="Path to the .pth file",
1004
  )
1005
 
1006
+ # Parser for 'model_blender' mode
1007
+ model_blender_parser = subparsers.add_parser(
1008
+ "model_blender", help="Fuse two models"
1009
+ )
1010
+ model_blender_parser.add_argument(
1011
+ "--model_name",
1012
+ type=str,
1013
+ help="Name of the model",
1014
+ )
1015
+ model_blender_parser.add_argument(
1016
+ "--pth_path_1",
1017
  type=str,
1018
+ help="Path to the first .pth file",
1019
  )
1020
+ model_blender_parser.add_argument(
1021
+ "--pth_path_2",
1022
  type=str,
1023
+ help="Path to the second .pth file",
1024
  )
1025
+ model_blender_parser.add_argument(
1026
+ "--ratio",
1027
  type=str,
1028
+ help="Value for blender ratio",
1029
+ choices=[str(i / 10) for i in range(11)],
1030
+ default="0.5",
1031
  )
1032
 
1033
  # Parser for 'tensorboard' mode
 
1036
  # Parser for 'download' mode
1037
  download_parser = subparsers.add_parser("download", help="Download models")
1038
  download_parser.add_argument(
1039
+ "--model_link",
1040
+ type=str,
1041
+ help="Link of the model",
1042
+ )
1043
+
1044
+ # Parser for 'prerequisites' mode
1045
+ prerequisites_parser = subparsers.add_parser(
1046
+ "prerequisites", help="Install prerequisites"
1047
+ )
1048
+ prerequisites_parser.add_argument(
1049
+ "--pretraineds_v1",
1050
  type=str,
1051
+ choices=["True", "False"],
1052
+ default="True",
1053
+ help="Download pretrained models for v1",
1054
+ )
1055
+ prerequisites_parser.add_argument(
1056
+ "--pretraineds_v2",
1057
+ type=str,
1058
+ choices=["True", "False"],
1059
+ default="True",
1060
+ help="Download pretrained models for v2",
1061
+ )
1062
+ prerequisites_parser.add_argument(
1063
+ "--models",
1064
+ type=str,
1065
+ choices=["True", "False"],
1066
+ default="True",
1067
+ help="Donwload models",
1068
+ )
1069
+ prerequisites_parser.add_argument(
1070
+ "--exe",
1071
+ type=str,
1072
+ choices=["True", "False"],
1073
+ default="True",
1074
+ help="Download executables",
1075
+ )
1076
+
1077
+ # Parser for 'audio_analyzer' mode
1078
+ audio_analyzer = subparsers.add_parser("audio_analyzer", help="Run audio analyzer")
1079
+ audio_analyzer.add_argument(
1080
+ "--input_path",
1081
+ type=str,
1082
+ help="Path to the input audio file",
1083
  )
1084
 
1085
+ # Parser for 'api' mode
1086
+ api_parser = subparsers.add_parser("api", help="Run the API")
1087
+ api_parser.add_argument("--ip", type=str, help="IP address", default="127.0.0.1")
1088
+ api_parser.add_argument("--port", type=str, help="Port", default="8000")
1089
+
1090
  return parser.parse_args()
1091
 
1092
 
 
1100
  try:
1101
  if args.mode == "infer":
1102
  run_infer_script(
1103
+ str(args.f0up_key),
1104
+ str(args.filter_radius),
1105
+ str(args.index_rate),
1106
+ str(args.rms_mix_rate),
1107
+ str(args.protect),
1108
+ str(args.hop_length),
1109
+ str(args.f0method),
1110
+ str(args.input_path),
1111
+ str(args.output_path),
1112
+ str(args.pth_path),
1113
+ str(args.index_path),
1114
+ str(args.split_audio),
1115
+ str(args.f0autotune),
1116
+ str(args.clean_audio),
1117
+ str(args.clean_strength),
1118
+ str(args.export_format),
1119
  )
1120
  elif args.mode == "batch_infer":
1121
  run_batch_infer_script(
1122
+ str(args.f0up_key),
1123
+ str(args.filter_radius),
1124
+ str(args.index_rate),
1125
+ str(args.rms_mix_rate),
1126
+ str(args.protect),
1127
+ str(args.hop_length),
1128
+ str(args.f0method),
1129
+ str(args.input_folder),
1130
+ str(args.output_folder),
1131
+ str(args.pth_path),
1132
+ str(args.index_path),
1133
+ str(args.split_audio),
1134
+ str(args.f0autotune),
1135
+ str(args.clean_audio),
1136
+ str(args.clean_strength),
1137
+ str(args.export_format),
1138
  )
1139
  elif args.mode == "tts":
1140
  run_tts_script(
1141
+ str(args.tts_text),
1142
+ str(args.tts_voice),
1143
+ str(args.f0up_key),
1144
+ str(args.filter_radius),
1145
+ str(args.index_rate),
1146
+ str(args.rms_mix_rate),
1147
+ str(args.protect),
1148
+ str(args.hop_length),
1149
+ str(args.f0method),
1150
+ str(args.output_tts_path),
1151
+ str(args.output_rvc_path),
1152
+ str(args.pth_path),
1153
+ str(args.index_path),
1154
+ str(args.split_audio),
1155
+ str(args.f0autotune),
1156
+ str(args.clean_audio),
1157
+ str(args.clean_strength),
1158
+ str(args.export_format),
1159
  )
1160
  elif args.mode == "preprocess":
1161
  run_preprocess_script(
1162
+ str(args.model_name),
1163
+ str(args.dataset_path),
1164
  str(args.sampling_rate),
1165
  )
 
1166
  elif args.mode == "extract":
1167
  run_extract_script(
1168
+ str(args.model_name),
1169
+ str(args.rvc_version),
1170
+ str(args.f0method),
1171
+ str(args.hop_length),
1172
+ str(args.sampling_rate),
1173
  )
1174
  elif args.mode == "train":
1175
  run_train_script(
1176
+ str(args.model_name),
1177
+ str(args.rvc_version),
1178
+ str(args.save_every_epoch),
1179
+ str(args.save_only_latest),
1180
+ str(args.save_every_weights),
1181
+ str(args.total_epoch),
1182
+ str(args.sampling_rate),
1183
+ str(args.batch_size),
1184
+ str(args.gpu),
1185
+ str(args.pitch_guidance),
1186
+ str(args.pretrained),
1187
+ str(args.custom_pretrained),
1188
+ str(args.g_pretrained_path),
1189
+ str(args.d_pretrained_path),
1190
+ str(args.overtraining_detector),
1191
+ str(args.overtraining_threshold),
1192
  )
1193
  elif args.mode == "index":
1194
  run_index_script(
1195
+ str(args.model_name),
1196
+ str(args.rvc_version),
1197
+ )
1198
+ elif args.mode == "model_extract":
1199
+ run_model_extract_script(
1200
+ str(args.pth_path),
1201
+ str(args.model_name),
1202
+ str(args.sampling_rate),
1203
+ str(args.pitch_guidance),
1204
+ str(args.rvc_version),
1205
+ str(args.epoch),
1206
+ str(args.step),
1207
  )
1208
  elif args.mode == "model_information":
1209
  run_model_information_script(
1210
+ str(args.pth_path),
1211
  )
1212
+ elif args.mode == "model_blender":
1213
+ run_model_blender_script(
1214
+ str(args.model_name),
1215
+ str(args.pth_path_1),
1216
+ str(args.pth_path_2),
1217
+ str(args.ratio),
1218
  )
1219
  elif args.mode == "tensorboard":
1220
  run_tensorboard_script()
1221
  elif args.mode == "download":
1222
  run_download_script(
1223
+ str(args.model_link),
1224
+ )
1225
+ elif args.mode == "prerequisites":
1226
+ run_prerequisites_script(
1227
+ str(args.pretraineds_v1),
1228
+ str(args.pretraineds_v2),
1229
+ str(args.models),
1230
+ str(args.exe),
1231
+ )
1232
+ elif args.mode == "audio_analyzer":
1233
+ run_audio_analyzer_script(
1234
+ str(args.input_path),
1235
+ )
1236
+ elif args.mode == "api":
1237
+ run_api_script(
1238
+ str(args.ip),
1239
+ str(args.port),
1240
  )
1241
  except Exception as error:
1242
  print(f"Error: {error}")
rvc/configs/config.py CHANGED
@@ -1,10 +1,6 @@
-import argparse
-import os
-import sys
-import json
-from multiprocessing import cpu_count
-
 import torch
+import json
+import os

 version_config_list = [
     "v1/32000.json",
@@ -64,6 +60,9 @@ class Config:
         return False

     def use_fp32_config(self):
+        print(
+            f"Using FP32 config instead of FP16 due to GPU compatibility ({self.gpu_name})"
+        )
         for config_file in version_config_list:
             self.json_config[config_file]["train"]["fp16_run"] = False
             with open(f"rvc/configs/{config_file}", "r") as f:
@@ -116,7 +115,7 @@ class Config:
             self.use_fp32_config()

         if self.n_cpu == 0:
-            self.n_cpu = cpu_count()
+            self.n_cpu = os.cpu_count()

         if self.is_half:
             x_pad = 3
rvc/infer/infer.py CHANGED
@@ -1,9 +1,19 @@
1
  import os
2
  import sys
 
3
  import torch
 
 
4
  import numpy as np
5
  import soundfile as sf
6
- from vc_infer_pipeline import VC
 
7
  from rvc.lib.utils import load_audio
8
  from rvc.lib.tools.split_audio import process_audio, merge_audio
9
  from fairseq import checkpoint_utils
@@ -13,13 +23,19 @@ from rvc.lib.infer_pack.models import (
13
  SynthesizerTrnMs768NSFsid,
14
  SynthesizerTrnMs768NSFsid_nono,
15
  )
16
-
17
  from rvc.configs.config import Config
18
 
19
- config = Config()
 
20
 
21
- torch.manual_seed(114514)
22
hubert_model = None
 
23
 
24
 
25
  def load_hubert():
@@ -37,6 +53,44 @@ def load_hubert():
37
  hubert_model.eval()
38
 
39
 
40
  def vc_single(
41
  sid=0,
42
  input_audio_path=None,
@@ -46,17 +100,16 @@ def vc_single(
46
  file_index=None,
47
  index_rate=None,
48
  resample_sr=0,
49
- rms_mix_rate=1,
50
- protect=0.33,
51
  hop_length=None,
52
  output_path=None,
53
  split_audio=False,
 
 
54
  ):
55
  global tgt_sr, net_g, vc, hubert_model, version
56
 
57
- if input_audio_path is None:
58
- return "Please, load an audio!", None
59
-
60
  f0_up_key = int(f0_up_key)
61
  try:
62
  audio = load_audio(input_audio_path, 16000)
@@ -95,7 +148,7 @@ def vc_single(
95
  ]
96
  try:
97
  for path in paths:
98
- info, opt = vc_single(
99
  sid,
100
  path,
101
  f0_up_key,
@@ -109,17 +162,18 @@ def vc_single(
109
  hop_length,
110
  path,
111
  False,
 
112
  )
113
- # new_dir_path
114
  except Exception as error:
115
  print(error)
116
- return "Error", None
117
  print("Finished processing segmented audio, now merging audio...")
118
  merge_timestamps_file = os.path.join(
119
  os.path.dirname(new_dir_path),
120
  f"{os.path.basename(input_audio_path).split('.')[0]}_timestamps.txt",
121
  )
122
  tgt_sr, audio_opt = merge_audio(merge_timestamps_file)
 
123
 
124
  else:
125
  audio_opt = vc.pipeline(
@@ -140,9 +194,9 @@ def vc_single(
140
  version,
141
  protect,
142
  hop_length,
 
143
  f0_file=f0_file,
144
  )
145
-
146
  if output_path is not None:
147
  sf.write(output_path, audio_opt, tgt_sr, format="WAV")
148
 
@@ -158,7 +212,7 @@ def get_vc(weight_root, sid):
158
  global hubert_model
159
  if hubert_model is not None:
160
  print("clean_empty_cache")
161
- del net_g, n_spk, vc, hubert_model, tgt_sr # ,cpt
162
  hubert_model = net_g = n_spk = vc = hubert_model = tgt_sr = None
163
  if torch.cuda.is_available():
164
  torch.cuda.empty_cache()
@@ -211,55 +265,64 @@ def get_vc(weight_root, sid):
211
  n_spk = cpt["config"][-3]
212
 
213
 
214
- f0up_key = sys.argv[1]
215
- filter_radius = sys.argv[2]
216
- index_rate = float(sys.argv[3])
217
- hop_length = sys.argv[4]
218
- f0method = sys.argv[5]
219
-
220
- audio_input_path = sys.argv[6]
221
- audio_output_path = sys.argv[7]
222
-
223
- model_path = sys.argv[8]
224
- index_path = sys.argv[9]
225
-
226
- try:
227
- split_audio = sys.argv[10]
228
- except IndexError:
229
- split_audio = None
230
-
231
- sid = f0up_key
232
- input_audio = audio_input_path
233
- f0_pitch = f0up_key
234
- f0_file = None
235
- f0_method = f0method
236
- file_index = index_path
237
- index_rate = index_rate
238
- output_file = audio_output_path
239
- split_audio = split_audio
240
-
241
- get_vc(model_path, 0)
242
-
243
- try:
244
- result, audio_opt = vc_single(
245
- sid=0,
246
- input_audio_path=input_audio,
247
- f0_up_key=f0_pitch,
248
- f0_file=None,
249
- f0_method=f0_method,
250
- file_index=file_index,
251
- index_rate=index_rate,
252
- hop_length=hop_length,
253
- output_path=output_file,
254
- split_audio=split_audio,
255
- )
256
 
257
- if os.path.exists(output_file) and os.path.getsize(output_file) > 0:
258
- message = result
259
- else:
260
- message = result
261
 
262
- print(f"Conversion completed. Output file: '{output_file}'")
263
 
264
- except Exception as error:
265
- print(f"Voice conversion failed: {error}")
1
  import os
2
  import sys
3
+ import time
4
  import torch
5
+ import logging
6
+
7
  import numpy as np
8
  import soundfile as sf
9
+ import librosa
10
+
11
+ now_dir = os.getcwd()
12
+ sys.path.append(now_dir)
13
+
14
+ from rvc.infer.pipeline import VC
15
+ from scipy.io import wavfile
16
+ import noisereduce as nr
17
  from rvc.lib.utils import load_audio
18
  from rvc.lib.tools.split_audio import process_audio, merge_audio
19
  from fairseq import checkpoint_utils
 
23
  SynthesizerTrnMs768NSFsid,
24
  SynthesizerTrnMs768NSFsid_nono,
25
  )
 
26
  from rvc.configs.config import Config
27
 
28
+ logging.getLogger("fairseq").setLevel(logging.WARNING)
29
+ logging.getLogger("httpx").setLevel(logging.WARNING)
30
 
31
+ config = Config()
32
  hubert_model = None
33
+ tgt_sr = None
34
+ net_g = None
35
+ vc = None
36
+ cpt = None
37
+ version = None
38
+ n_spk = None
39
 
40
 
41
  def load_hubert():
 
53
  hubert_model.eval()
54
 
55
 
56
+ def remove_audio_noise(input_audio_path, reduction_strength=0.7):
57
+ try:
58
+ rate, data = wavfile.read(input_audio_path)
59
+ reduced_noise = nr.reduce_noise(
60
+ y=data,
61
+ sr=rate,
62
+ prop_decrease=reduction_strength,
63
+ )
64
+ return reduced_noise
65
+ except Exception as error:
66
+ print(f"Error cleaning audio: {error}")
67
+ return None
68
+
69
+
70
+ def convert_audio_format(input_path, output_path, output_format):
71
+ try:
72
+ if output_format != "WAV":
73
+ print(f"Converting audio to {output_format} format...")
74
+ audio, sample_rate = librosa.load(input_path, sr=None)
75
+ common_sample_rates = [
76
+ 8000,
77
+ 11025,
78
+ 12000,
79
+ 16000,
80
+ 22050,
81
+ 24000,
82
+ 32000,
83
+ 44100,
84
+ 48000,
85
+ ]
86
+ target_sr = min(common_sample_rates, key=lambda x: abs(x - sample_rate))
87
+ audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=target_sr)
88
+ sf.write(output_path, audio, target_sr, format=output_format.lower())
89
+ return output_path
90
+ except Exception as error:
91
+ print(f"Failed to convert audio to {output_format} format: {error}")
92
+
93
+
94
  def vc_single(
95
  sid=0,
96
  input_audio_path=None,
 
100
  file_index=None,
101
  index_rate=None,
102
  resample_sr=0,
103
+ rms_mix_rate=None,
104
+ protect=None,
105
  hop_length=None,
106
  output_path=None,
107
  split_audio=False,
108
+ f0autotune=False,
109
+ filter_radius=None,
110
  ):
111
  global tgt_sr, net_g, vc, hubert_model, version
112
 
 
 
 
113
  f0_up_key = int(f0_up_key)
114
  try:
115
  audio = load_audio(input_audio_path, 16000)
 
148
  ]
149
  try:
150
  for path in paths:
151
+ vc_single(
152
  sid,
153
  path,
154
  f0_up_key,
 
162
  hop_length,
163
  path,
164
  False,
165
+ f0autotune,
166
  )
 
167
  except Exception as error:
168
  print(error)
169
+ return f"Error {error}"
170
  print("Finished processing segmented audio, now merging audio...")
171
  merge_timestamps_file = os.path.join(
172
  os.path.dirname(new_dir_path),
173
  f"{os.path.basename(input_audio_path).split('.')[0]}_timestamps.txt",
174
  )
175
  tgt_sr, audio_opt = merge_audio(merge_timestamps_file)
176
+ os.remove(merge_timestamps_file)
177
 
178
  else:
179
  audio_opt = vc.pipeline(
 
194
  version,
195
  protect,
196
  hop_length,
197
+ f0autotune,
198
  f0_file=f0_file,
199
  )
 
200
  if output_path is not None:
201
  sf.write(output_path, audio_opt, tgt_sr, format="WAV")
202
 
 
212
  global hubert_model
213
  if hubert_model is not None:
214
  print("clean_empty_cache")
215
+ del net_g, n_spk, vc, hubert_model, tgt_sr
216
  hubert_model = net_g = n_spk = vc = hubert_model = tgt_sr = None
217
  if torch.cuda.is_available():
218
  torch.cuda.empty_cache()
 
265
  n_spk = cpt["config"][-3]
266
 
267
 
268
+ def infer_pipeline(
269
+ f0up_key,
270
+ filter_radius,
271
+ index_rate,
272
+ rms_mix_rate,
273
+ protect,
274
+ hop_length,
275
+ f0method,
276
+ audio_input_path,
277
+ audio_output_path,
278
+ model_path,
279
+ index_path,
280
+ split_audio,
281
+ f0autotune,
282
+ clean_audio,
283
+ clean_strength,
284
+ export_format,
285
+ ):
286
+ global tgt_sr, net_g, vc, cpt
287
 
288
+ get_vc(model_path, 0)
289
+
290
+ try:
291
+ start_time = time.time()
292
+ vc_single(
293
+ sid=0,
294
+ input_audio_path=audio_input_path,
295
+ f0_up_key=f0up_key,
296
+ f0_file=None,
297
+ f0_method=f0method,
298
+ file_index=index_path,
299
+ index_rate=index_rate,
300
+ rms_mix_rate=rms_mix_rate,
301
+ protect=protect,
302
+ hop_length=hop_length,
303
+ output_path=audio_output_path,
304
+ split_audio=split_audio,
305
+ f0autotune=f0autotune,
306
+ filter_radius=filter_radius,
307
+ )
308
+
309
+ if clean_audio == "True":
310
+ cleaned_audio = remove_audio_noise(audio_output_path, clean_strength)
311
+ if cleaned_audio is not None:
312
+ sf.write(audio_output_path, cleaned_audio, tgt_sr, format="WAV")
313
 
314
+ output_path_format = audio_output_path.replace(
315
+ ".wav", f".{export_format.lower()}"
316
+ )
317
+ audio_output_path = convert_audio_format(
318
+ audio_output_path, output_path_format, export_format
319
+ )
320
 
321
+ end_time = time.time()
322
+ elapsed_time = end_time - start_time
323
+ print(
324
+ f"Conversion completed. Output file: '{audio_output_path}' in {elapsed_time:.2f} seconds."
325
+ )
326
+
327
+ except Exception as error:
328
+ print(f"Voice conversion failed: {error}")
rvc/infer/pipeline.py ADDED
@@ -0,0 +1,625 @@
1
+ import numpy as np, parselmouth, torch, pdb, sys, os
2
+ from time import time as ttime
3
+ import torch.nn.functional as F
4
+ import torchcrepe
5
+ from torch import Tensor
6
+ import scipy.signal as signal
7
+ import pyworld, os, faiss, librosa, torchcrepe
8
+ from scipy import signal
9
+ from functools import lru_cache
10
+ import random
11
+ import gc
12
+ import re
13
+
14
+ now_dir = os.getcwd()
15
+ sys.path.append(now_dir)
16
+
17
+ from rvc.lib.FCPEF0Predictor import FCPEF0Predictor
18
+
19
+ bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000)
20
+
21
+ input_audio_path2wav = {}
22
+
23
+
24
+ @lru_cache
25
+ def cache_harvest_f0(input_audio_path, fs, f0max, f0min, frame_period):
26
+ audio = input_audio_path2wav[input_audio_path]
27
+ f0, t = pyworld.harvest(
28
+ audio,
29
+ fs=fs,
30
+ f0_ceil=f0max,
31
+ f0_floor=f0min,
32
+ frame_period=frame_period,
33
+ )
34
+ f0 = pyworld.stonemask(audio, f0, t, fs)
35
+ return f0
36
+
37
+
38
+ def change_rms(data1, sr1, data2, sr2, rate):
39
+ # print(data1.max(),data2.max())
40
+ rms1 = librosa.feature.rms(y=data1, frame_length=sr1 // 2 * 2, hop_length=sr1 // 2)
41
+ rms2 = librosa.feature.rms(y=data2, frame_length=sr2 // 2 * 2, hop_length=sr2 // 2)
42
+
43
+ rms1 = torch.from_numpy(rms1)
44
+ rms1 = F.interpolate(
45
+ rms1.unsqueeze(0), size=data2.shape[0], mode="linear"
46
+ ).squeeze()
47
+
48
+ rms2 = torch.from_numpy(rms2)
49
+ rms2 = F.interpolate(
50
+ rms2.unsqueeze(0), size=data2.shape[0], mode="linear"
51
+ ).squeeze()
52
+ rms2 = torch.max(rms2, torch.zeros_like(rms2) + 1e-6)
53
+
54
+ data2 *= (
55
+ torch.pow(rms1, torch.tensor(1 - rate))
56
+ * torch.pow(rms2, torch.tensor(rate - 1))
57
+ ).numpy()
58
+ return data2
59
+
60
+
61
+ class VC(object):
62
+ def __init__(self, tgt_sr, config):
63
+ self.x_pad, self.x_query, self.x_center, self.x_max, self.is_half = (
64
+ config.x_pad,
65
+ config.x_query,
66
+ config.x_center,
67
+ config.x_max,
68
+ config.is_half,
69
+ )
70
+ self.sr = 16000
71
+ self.window = 160
72
+ self.t_pad = self.sr * self.x_pad
73
+ self.t_pad_tgt = tgt_sr * self.x_pad
74
+ self.t_pad2 = self.t_pad * 2
75
+ self.t_query = self.sr * self.x_query
76
+ self.t_center = self.sr * self.x_center
77
+ self.t_max = self.sr * self.x_max
78
+ self.device = config.device
79
+ self.ref_freqs = [
80
+ 65.41,
81
+ 82.41,
82
+ 110.00,
83
+ 146.83,
84
+ 196.00,
85
+ 246.94,
86
+ 329.63,
87
+ 440.00,
88
+ 587.33,
89
+ 783.99,
90
+ 1046.50,
91
+ ]
92
+ # Generate interpolated frequencies
93
+ self.note_dict = self.generate_interpolated_frequencies()
94
+
95
+ def generate_interpolated_frequencies(self):
96
+ # Generate interpolated frequencies based on the reference frequencies.
97
+ note_dict = []
98
+ for i in range(len(self.ref_freqs) - 1):
99
+ freq_low = self.ref_freqs[i]
100
+ freq_high = self.ref_freqs[i + 1]
101
+ # Interpolate between adjacent reference frequencies
102
+ interpolated_freqs = np.linspace(
103
+ freq_low, freq_high, num=10, endpoint=False
104
+ )
105
+ note_dict.extend(interpolated_freqs)
106
+ # Add the last reference frequency
107
+ note_dict.append(self.ref_freqs[-1])
108
+ return note_dict
109
+
110
+ def autotune_f0(self, f0):
111
+ # Autotunes the given fundamental frequency (f0) to the nearest musical note.
112
+ autotuned_f0 = np.zeros_like(f0)
113
+ for i, freq in enumerate(f0):
114
+ # Find the closest note
115
+ closest_note = min(self.note_dict, key=lambda x: abs(x - freq))
116
+ autotuned_f0[i] = closest_note
117
+ return autotuned_f0
118
+
119
+ def get_optimal_torch_device(self, index: int = 0) -> torch.device:
120
+ if torch.cuda.is_available():
121
+ return torch.device(f"cuda:{index % torch.cuda.device_count()}")
122
+ elif torch.backends.mps.is_available():
123
+ return torch.device("mps")
124
+ return torch.device("cpu")
125
+
126
+ def get_f0_crepe_computation(
127
+ self,
128
+ x,
129
+ f0_min,
130
+ f0_max,
131
+ p_len,
132
+ hop_length,
133
+ model="full",
134
+ ):
135
+ x = x.astype(np.float32)
136
+ x /= np.quantile(np.abs(x), 0.999)
137
+ torch_device = self.get_optimal_torch_device()
138
+ audio = torch.from_numpy(x).to(torch_device, copy=True)
139
+ audio = torch.unsqueeze(audio, dim=0)
140
+ if audio.ndim == 2 and audio.shape[0] > 1:
141
+ audio = torch.mean(audio, dim=0, keepdim=True).detach()
142
+ audio = audio.detach()
143
+ pitch: Tensor = torchcrepe.predict(
144
+ audio,
145
+ self.sr,
146
+ hop_length,
147
+ f0_min,
148
+ f0_max,
149
+ model,
150
+ batch_size=hop_length * 2,
151
+ device=torch_device,
152
+ pad=True,
153
+ )
154
+ p_len = p_len or x.shape[0] // hop_length
155
+ source = np.array(pitch.squeeze(0).cpu().float().numpy())
156
+ source[source < 0.001] = np.nan
157
+ target = np.interp(
158
+ np.arange(0, len(source) * p_len, len(source)) / p_len,
159
+ np.arange(0, len(source)),
160
+ source,
161
+ )
162
+ f0 = np.nan_to_num(target)
163
+ return f0
164
+
165
+ def get_f0_official_crepe_computation(
166
+ self,
167
+ x,
168
+ f0_min,
169
+ f0_max,
170
+ model="full",
171
+ ):
172
+ batch_size = 512
173
+ audio = torch.tensor(np.copy(x))[None].float()
174
+ f0, pd = torchcrepe.predict(
175
+ audio,
176
+ self.sr,
177
+ self.window,
178
+ f0_min,
179
+ f0_max,
180
+ model,
181
+ batch_size=batch_size,
182
+ device=self.device,
183
+ return_periodicity=True,
184
+ )
185
+ pd = torchcrepe.filter.median(pd, 3)
186
+ f0 = torchcrepe.filter.mean(f0, 3)
187
+ f0[pd < 0.1] = 0
188
+ f0 = f0[0].cpu().numpy()
189
+ return f0
190
+
191
+ def get_f0_hybrid_computation(
192
+ self,
193
+ methods_str,
194
+ x,
195
+ f0_min,
196
+ f0_max,
197
+ p_len,
198
+ hop_length,
199
+ ):
200
+ methods_str = re.search("hybrid\[(.+)\]", methods_str)
201
+ if methods_str:
202
+ methods = [method.strip() for method in methods_str.group(1).split("+")]
203
+ f0_computation_stack = []
204
+ print(f"Calculating f0 pitch estimations for methods {str(methods)}")
205
+ x = x.astype(np.float32)
206
+ x /= np.quantile(np.abs(x), 0.999)
207
+ for method in methods:
208
+ f0 = None
209
+ if method == "crepe":
210
+ f0 = self.get_f0_crepe_computation(
211
+ x, f0_min, f0_max, p_len, int(hop_length)
212
+ )
213
+ elif method == "rmvpe":
214
+ if hasattr(self, "model_rmvpe") == False:
215
+ from rvc.lib.rmvpe import RMVPE
216
+
217
+ self.model_rmvpe = RMVPE(
218
+ "rmvpe.pt", is_half=self.is_half, device=self.device
219
+ )
220
+ f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)
221
+ f0 = f0[1:]
222
+ elif method == "fcpe":
223
+ self.model_fcpe = FCPEF0Predictor(
224
+ "fcpe.pt",
225
+ f0_min=int(f0_min),
226
+ f0_max=int(f0_max),
227
+ dtype=torch.float32,
228
+ device=self.device,
229
+ sampling_rate=self.sr,
230
+ threshold=0.03,
231
+ )
232
+ f0 = self.model_fcpe.compute_f0(x, p_len=p_len)
233
+ del self.model_fcpe
234
+ gc.collect()
235
+ f0_computation_stack.append(f0)
236
+
237
+ print(f"Calculating hybrid median f0 from the stack of {str(methods)}")
238
+ f0_computation_stack = [fc for fc in f0_computation_stack if fc is not None]
239
+ f0_median_hybrid = None
240
+ if len(f0_computation_stack) == 1:
241
+ f0_median_hybrid = f0_computation_stack[0]
242
+ else:
243
+ f0_median_hybrid = np.nanmedian(f0_computation_stack, axis=0)
244
+ return f0_median_hybrid
245
+
246
+ def get_f0(
247
+ self,
248
+ input_audio_path,
249
+ x,
250
+ p_len,
251
+ f0_up_key,
252
+ f0_method,
253
+ filter_radius,
254
+ hop_length,
255
+ f0autotune,
256
+ inp_f0=None,
257
+ ):
258
+ global input_audio_path2wav
259
+ time_step = self.window / self.sr * 1000
260
+ f0_min = 50
261
+ f0_max = 1100
262
+ f0_mel_min = 1127 * np.log(1 + f0_min / 700)
263
+ f0_mel_max = 1127 * np.log(1 + f0_max / 700)
264
+ if f0_method == "pm":
265
+ f0 = (
266
+ parselmouth.Sound(x, self.sr)
267
+ .to_pitch_ac(
268
+ time_step=time_step / 1000,
269
+ voicing_threshold=0.6,
270
+ pitch_floor=f0_min,
271
+ pitch_ceiling=f0_max,
272
+ )
273
+ .selected_array["frequency"]
274
+ )
275
+ pad_size = (p_len - len(f0) + 1) // 2
276
+ if pad_size > 0 or p_len - len(f0) - pad_size > 0:
277
+ f0 = np.pad(
278
+ f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant"
279
+ )
280
+ elif f0_method == "harvest":
281
+ input_audio_path2wav[input_audio_path] = x.astype(np.double)
282
+ f0 = cache_harvest_f0(input_audio_path, self.sr, f0_max, f0_min, 10)
283
+ if int(filter_radius) > 2:
284
+ f0 = signal.medfilt(f0, 3)
285
+ elif f0_method == "dio":
286
+ f0, t = pyworld.dio(
287
+ x.astype(np.double),
288
+ fs=self.sr,
289
+ f0_ceil=f0_max,
290
+ f0_floor=f0_min,
291
+ frame_period=10,
292
+ )
293
+ f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.sr)
294
+ f0 = signal.medfilt(f0, 3)
295
+ elif f0_method == "crepe":
296
+ f0 = self.get_f0_crepe_computation(
297
+ x, f0_min, f0_max, p_len, int(hop_length)
298
+ )
299
+ elif f0_method == "crepe-tiny":
300
+ f0 = self.get_f0_crepe_computation(
301
+ x, f0_min, f0_max, p_len, int(hop_length), "tiny"
302
+ )
303
+ elif f0_method == "rmvpe":
304
+ if hasattr(self, "model_rmvpe") == False:
305
+ from rvc.lib.rmvpe import RMVPE
306
+
307
+ self.model_rmvpe = RMVPE(
308
+ "rmvpe.pt", is_half=self.is_half, device=self.device
309
+ )
310
+ f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)
311
+ elif f0_method == "fcpe":
312
+ self.model_fcpe = FCPEF0Predictor(
313
+ "fcpe.pt",
314
+ f0_min=int(f0_min),
315
+ f0_max=int(f0_max),
316
+ dtype=torch.float32,
317
+ device=self.device,
318
+ sampling_rate=self.sr,
319
+ threshold=0.03,
320
+ )
321
+ f0 = self.model_fcpe.compute_f0(x, p_len=p_len)
322
+ del self.model_fcpe
323
+ gc.collect()
324
+ elif "hybrid" in f0_method:
325
+ input_audio_path2wav[input_audio_path] = x.astype(np.double)
326
+ f0 = self.get_f0_hybrid_computation(
327
+ f0_method,
328
+ x,
329
+ f0_min,
330
+ f0_max,
331
+ p_len,
332
+ hop_length,
333
+ )
334
+
335
+ if f0autotune == "True":
336
+ f0 = self.autotune_f0(f0)
337
+
338
+ f0 *= pow(2, f0_up_key / 12)
339
+ tf0 = self.sr // self.window
340
+ if inp_f0 is not None:
341
+ delta_t = np.round(
342
+ (inp_f0[:, 0].max() - inp_f0[:, 0].min()) * tf0 + 1
343
+ ).astype("int16")
344
+ replace_f0 = np.interp(
345
+ list(range(delta_t)), inp_f0[:, 0] * 100, inp_f0[:, 1]
346
+ )
347
+ shape = f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)].shape[0]
348
+ f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)] = replace_f0[
349
+ :shape
350
+ ]
351
+ f0bak = f0.copy()
352
+ f0_mel = 1127 * np.log(1 + f0 / 700)
353
+ f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
354
+ f0_mel_max - f0_mel_min
355
+ ) + 1
356
+ f0_mel[f0_mel <= 1] = 1
357
+ f0_mel[f0_mel > 255] = 255
358
+ f0_coarse = np.rint(f0_mel).astype(np.int)
359
+
360
+ return f0_coarse, f0bak
361
+
362
+ def vc(
363
+ self,
364
+ model,
365
+ net_g,
366
+ sid,
367
+ audio0,
368
+ pitch,
369
+ pitchf,
370
+ index,
371
+ big_npy,
372
+ index_rate,
373
+ version,
374
+ protect,
375
+ ):
376
+ feats = torch.from_numpy(audio0)
377
+ if self.is_half:
378
+ feats = feats.half()
379
+ else:
380
+ feats = feats.float()
381
+ if feats.dim() == 2:
382
+ feats = feats.mean(-1)
383
+ assert feats.dim() == 1, feats.dim()
384
+ feats = feats.view(1, -1)
385
+ padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False)
386
+
387
+ inputs = {
388
+ "source": feats.to(self.device),
389
+ "padding_mask": padding_mask,
390
+ "output_layer": 9 if version == "v1" else 12,
391
+ }
392
+ t0 = ttime()
393
+ with torch.no_grad():
394
+ logits = model.extract_features(**inputs)
395
+ feats = model.final_proj(logits[0]) if version == "v1" else logits[0]
396
+ if protect < 0.5 and pitch != None and pitchf != None:
397
+ feats0 = feats.clone()
398
+ if (
399
+ isinstance(index, type(None)) == False
400
+ and isinstance(big_npy, type(None)) == False
401
+ and index_rate != 0
402
+ ):
403
+ npy = feats[0].cpu().numpy()
404
+ if self.is_half:
405
+ npy = npy.astype("float32")
406
+
407
+ score, ix = index.search(npy, k=8)
408
+ weight = np.square(1 / score)
409
+ weight /= weight.sum(axis=1, keepdims=True)
410
+ npy = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)
411
+
412
+ if self.is_half:
413
+ npy = npy.astype("float16")
414
+ feats = (
415
+ torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate
416
+ + (1 - index_rate) * feats
417
+ )
418
+
419
+ feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
420
+ if protect < 0.5 and pitch != None and pitchf != None:
421
+ feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute(
422
+ 0, 2, 1
423
+ )
424
+ t1 = ttime()
425
+ p_len = audio0.shape[0] // self.window
426
+ if feats.shape[1] < p_len:
427
+ p_len = feats.shape[1]
428
+ if pitch != None and pitchf != None:
429
+ pitch = pitch[:, :p_len]
430
+ pitchf = pitchf[:, :p_len]
431
+
432
+ if protect < 0.5 and pitch != None and pitchf != None:
433
+ pitchff = pitchf.clone()
434
+ pitchff[pitchf > 0] = 1
435
+ pitchff[pitchf < 1] = protect
436
+ pitchff = pitchff.unsqueeze(-1)
437
+ feats = feats * pitchff + feats0 * (1 - pitchff)
438
+ feats = feats.to(feats0.dtype)
439
+ p_len = torch.tensor([p_len], device=self.device).long()
440
+ with torch.no_grad():
441
+ if pitch != None and pitchf != None:
442
+ audio1 = (
443
+ (net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0])
444
+ .data.cpu()
445
+ .float()
446
+ .numpy()
447
+ )
448
+ else:
449
+ audio1 = (
450
+ (net_g.infer(feats, p_len, sid)[0][0, 0]).data.cpu().float().numpy()
451
+ )
452
+ del feats, p_len, padding_mask
453
+ if torch.cuda.is_available():
454
+ torch.cuda.empty_cache()
455
+ t2 = ttime()
456
+ return audio1
457
+
458
+ def pipeline(
459
+ self,
460
+ model,
461
+ net_g,
462
+ sid,
463
+ audio,
464
+ input_audio_path,
465
+ f0_up_key,
466
+ f0_method,
467
+ file_index,
468
+ index_rate,
469
+ if_f0,
470
+ filter_radius,
471
+ tgt_sr,
472
+ resample_sr,
473
+ rms_mix_rate,
474
+ version,
475
+ protect,
476
+ hop_length,
477
+ f0autotune,
478
+ f0_file=None,
479
+ ):
480
+ if file_index != "" and os.path.exists(file_index) == True and index_rate != 0:
481
+ try:
482
+ index = faiss.read_index(file_index)
483
+ big_npy = index.reconstruct_n(0, index.ntotal)
484
+ except Exception as error:
485
+ print(error)
486
+ index = big_npy = None
487
+ else:
488
+ index = big_npy = None
489
+ audio = signal.filtfilt(bh, ah, audio)
490
+ audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect")
491
+ opt_ts = []
492
+ if audio_pad.shape[0] > self.t_max:
493
+ audio_sum = np.zeros_like(audio)
494
+ for i in range(self.window):
495
+ audio_sum += audio_pad[i : i - self.window]
496
+ for t in range(self.t_center, audio.shape[0], self.t_center):
497
+ opt_ts.append(
498
+ t
499
+ - self.t_query
500
+ + np.where(
501
+ np.abs(audio_sum[t - self.t_query : t + self.t_query])
502
+ == np.abs(audio_sum[t - self.t_query : t + self.t_query]).min()
503
+ )[0][0]
504
+ )
505
+ s = 0
506
+ audio_opt = []
507
+ t = None
508
+ t1 = ttime()
509
+ audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect")
510
+ p_len = audio_pad.shape[0] // self.window
511
+ inp_f0 = None
512
+ if hasattr(f0_file, "name") == True:
513
+ try:
514
+ with open(f0_file.name, "r") as f:
515
+ lines = f.read().strip("\n").split("\n")
516
+ inp_f0 = []
517
+ for line in lines:
518
+ inp_f0.append([float(i) for i in line.split(",")])
519
+ inp_f0 = np.array(inp_f0, dtype="float32")
520
+ except Exception as error:
521
+ print(error)
522
+ sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
523
+ pitch, pitchf = None, None
524
+ if if_f0 == 1:
525
+ pitch, pitchf = self.get_f0(
526
+ input_audio_path,
527
+ audio_pad,
528
+ p_len,
529
+ f0_up_key,
530
+ f0_method,
531
+ filter_radius,
532
+ hop_length,
533
+ f0autotune,
534
+ inp_f0,
535
+ )
536
+ pitch = pitch[:p_len]
537
+ pitchf = pitchf[:p_len]
538
+ if self.device == "mps":
539
+ pitchf = pitchf.astype(np.float32)
540
+ pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()
541
+ pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float()
542
+ t2 = ttime()
543
+ for t in opt_ts:
544
+ t = t // self.window * self.window
545
+ if if_f0 == 1:
546
+ audio_opt.append(
547
+ self.vc(
548
+ model,
549
+ net_g,
550
+ sid,
551
+ audio_pad[s : t + self.t_pad2 + self.window],
552
+ pitch[:, s // self.window : (t + self.t_pad2) // self.window],
553
+ pitchf[:, s // self.window : (t + self.t_pad2) // self.window],
554
+ index,
555
+ big_npy,
556
+ index_rate,
557
+ version,
558
+ protect,
559
+ )[self.t_pad_tgt : -self.t_pad_tgt]
560
+ )
561
+ else:
562
+ audio_opt.append(
563
+ self.vc(
564
+ model,
565
+ net_g,
566
+ sid,
567
+ audio_pad[s : t + self.t_pad2 + self.window],
568
+ None,
569
+ None,
570
+ index,
571
+ big_npy,
572
+ index_rate,
573
+ version,
574
+ protect,
575
+ )[self.t_pad_tgt : -self.t_pad_tgt]
576
+ )
577
+ s = t
578
+ if if_f0 == 1:
579
+ audio_opt.append(
580
+ self.vc(
581
+ model,
582
+ net_g,
583
+ sid,
584
+ audio_pad[t:],
585
+ pitch[:, t // self.window :] if t is not None else pitch,
586
+ pitchf[:, t // self.window :] if t is not None else pitchf,
587
+ index,
588
+ big_npy,
589
+ index_rate,
590
+ version,
591
+ protect,
592
+ )[self.t_pad_tgt : -self.t_pad_tgt]
593
+ )
594
+ else:
595
+ audio_opt.append(
596
+ self.vc(
597
+ model,
598
+ net_g,
599
+ sid,
600
+ audio_pad[t:],
601
+ None,
602
+ None,
603
+ index,
604
+ big_npy,
605
+ index_rate,
606
+ version,
607
+ protect,
608
+ )[self.t_pad_tgt : -self.t_pad_tgt]
609
+ )
610
+ audio_opt = np.concatenate(audio_opt)
611
+ if rms_mix_rate != 1:
612
+ audio_opt = change_rms(audio, 16000, audio_opt, tgt_sr, rms_mix_rate)
613
+ if resample_sr >= 16000 and tgt_sr != resample_sr:
614
+ audio_opt = librosa.resample(
615
+ audio_opt, orig_sr=tgt_sr, target_sr=resample_sr
616
+ )
617
+ audio_max = np.abs(audio_opt).max() / 0.99
618
+ max_int16 = 32768
619
+ if audio_max > 1:
620
+ max_int16 /= audio_max
621
+ audio_opt = (audio_opt * max_int16).astype(np.int16)
622
+ del pitch, pitchf, sid
623
+ if torch.cuda.is_available():
624
+ torch.cuda.empty_cache()
625
+ return audio_opt
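
pipeline.py carries the full VC front end: f0 extraction (pm, harvest, dio, crepe, crepe-tiny, rmvpe, fcpe, plus hybrid combinations), the optional note-snapping autotune, faiss index retrieval, and RMS envelope mixing via change_rms. The coarse pitch handed to the synthesizer is a mel-scaled quantization of f0 into bins 1..255, as in get_f0 above; a standalone sketch of that mapping is below, with illustrative values only. One portability note: np.rint(f0_mel).astype(np.int) relies on the np.int alias that NumPy 1.24+ no longer provides, so the sketch uses np.int64 instead.

    import numpy as np

    # Standalone sketch of the coarse-pitch quantization used in VC.get_f0.
    f0_min, f0_max = 50, 1100
    f0_mel_min = 1127 * np.log(1 + f0_min / 700)
    f0_mel_max = 1127 * np.log(1 + f0_max / 700)

    f0 = np.array([0.0, 110.0, 220.0, 440.0])   # example pitch track in Hz
    f0_mel = 1127 * np.log(1 + f0 / 700)
    f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
        f0_mel_max - f0_mel_min
    ) + 1
    f0_mel[f0_mel <= 1] = 1
    f0_mel[f0_mel > 255] = 255
    f0_coarse = np.rint(f0_mel).astype(np.int64)  # np.int64 instead of the removed np.int
    print(f0_coarse)  # unvoiced frames (f0 == 0) land in bin 1
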
rvc/lib/FCPEF0Predictor.py ADDED
@@ -0,0 +1,1036 @@
1
+ from typing import Union
2
+
3
+ import torch.nn.functional as F
4
+ import numpy as np
5
+ import torch
6
+ import torch.nn as nn
7
+ from torch.nn.utils.parametrizations import weight_norm
8
+ from torchaudio.transforms import Resample
9
+ import os
10
+ import librosa
11
+ import soundfile as sf
12
+ import torch.utils.data
13
+ from librosa.filters import mel as librosa_mel_fn
14
+ import math
15
+ from functools import partial
16
+
17
+ from einops import rearrange, repeat
18
+ from local_attention import LocalAttention
19
+ from torch import nn
20
+
21
+ os.environ["LRU_CACHE_CAPACITY"] = "3"
22
+
23
+
24
+ def load_wav_to_torch(full_path, target_sr=None, return_empty_on_exception=False):
25
+ sampling_rate = None
26
+ try:
27
+ data, sampling_rate = sf.read(full_path, always_2d=True) # than soundfile.
28
+ except Exception as error:
29
+ print(f"'{full_path}' failed to load with {error}")
30
+ if return_empty_on_exception:
31
+ return [], sampling_rate or target_sr or 48000
32
+ else:
33
+ raise Exception(error)
34
+
35
+ if len(data.shape) > 1:
36
+ data = data[:, 0]
37
+ assert (
38
+ len(data) > 2
39
+ ) # check duration of audio file is > 2 samples (because otherwise the slice operation was on the wrong dimension)
40
+
41
+ if np.issubdtype(data.dtype, np.integer): # if audio data is type int
42
+ max_mag = -np.iinfo(
43
+ data.dtype
44
+ ).min # maximum magnitude = min possible value of intXX
45
+ else: # if audio data is type fp32
46
+ max_mag = max(np.amax(data), -np.amin(data))
47
+ max_mag = (
48
+ (2**31) + 1
49
+ if max_mag > (2**15)
50
+ else ((2**15) + 1 if max_mag > 1.01 else 1.0)
51
+ ) # data should be either 16-bit INT, 32-bit INT or [-1 to 1] float32
52
+
53
+ data = torch.FloatTensor(data.astype(np.float32)) / max_mag
54
+
55
+ if (
56
+ torch.isinf(data) | torch.isnan(data)
57
+ ).any() and return_empty_on_exception: # resample will crash with inf/NaN inputs. return_empty_on_exception will return empty arr instead of except
58
+ return [], sampling_rate or target_sr or 48000
59
+ if target_sr is not None and sampling_rate != target_sr:
60
+ data = torch.from_numpy(
61
+ librosa.core.resample(
62
+ data.numpy(), orig_sr=sampling_rate, target_sr=target_sr
63
+ )
64
+ )
65
+ sampling_rate = target_sr
66
+
67
+ return data, sampling_rate
68
+
69
+
70
+ def dynamic_range_compression(x, C=1, clip_val=1e-5):
71
+ return np.log(np.clip(x, a_min=clip_val, a_max=None) * C)
72
+
73
+
74
+ def dynamic_range_decompression(x, C=1):
75
+ return np.exp(x) / C
76
+
77
+
78
+ def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
79
+ return torch.log(torch.clamp(x, min=clip_val) * C)
80
+
81
+
82
+ def dynamic_range_decompression_torch(x, C=1):
83
+ return torch.exp(x) / C
84
+
85
+
86
+ class STFT:
87
+ def __init__(
88
+ self,
89
+ sr=22050,
90
+ n_mels=80,
91
+ n_fft=1024,
92
+ win_size=1024,
93
+ hop_length=256,
94
+ fmin=20,
95
+ fmax=11025,
96
+ clip_val=1e-5,
97
+ ):
98
+ self.target_sr = sr
99
+
100
+ self.n_mels = n_mels
101
+ self.n_fft = n_fft
102
+ self.win_size = win_size
103
+ self.hop_length = hop_length
104
+ self.fmin = fmin
105
+ self.fmax = fmax
106
+ self.clip_val = clip_val
107
+ self.mel_basis = {}
108
+ self.hann_window = {}
109
+
110
+ def get_mel(self, y, keyshift=0, speed=1, center=False, train=False):
111
+ sampling_rate = self.target_sr
112
+ n_mels = self.n_mels
113
+ n_fft = self.n_fft
114
+ win_size = self.win_size
115
+ hop_length = self.hop_length
116
+ fmin = self.fmin
117
+ fmax = self.fmax
118
+ clip_val = self.clip_val
119
+
120
+ factor = 2 ** (keyshift / 12)
121
+ n_fft_new = int(np.round(n_fft * factor))
122
+ win_size_new = int(np.round(win_size * factor))
123
+ hop_length_new = int(np.round(hop_length * speed))
124
+ if not train:
125
+ mel_basis = self.mel_basis
126
+ hann_window = self.hann_window
127
+ else:
128
+ mel_basis = {}
129
+ hann_window = {}
130
+
131
+ mel_basis_key = str(fmax) + "_" + str(y.device)
132
+ if mel_basis_key not in mel_basis:
133
+ mel = librosa_mel_fn(
134
+ sr=sampling_rate, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax
135
+ )
136
+ mel_basis[mel_basis_key] = torch.from_numpy(mel).float().to(y.device)
137
+
138
+ keyshift_key = str(keyshift) + "_" + str(y.device)
139
+ if keyshift_key not in hann_window:
140
+ hann_window[keyshift_key] = torch.hann_window(win_size_new).to(y.device)
141
+
142
+ pad_left = (win_size_new - hop_length_new) // 2
143
+ pad_right = max(
144
+ (win_size_new - hop_length_new + 1) // 2,
145
+ win_size_new - y.size(-1) - pad_left,
146
+ )
147
+ if pad_right < y.size(-1):
148
+ mode = "reflect"
149
+ else:
150
+ mode = "constant"
151
+ y = torch.nn.functional.pad(y.unsqueeze(1), (pad_left, pad_right), mode=mode)
152
+ y = y.squeeze(1)
153
+
154
+ spec = torch.stft(
155
+ y,
156
+ n_fft_new,
157
+ hop_length=hop_length_new,
158
+ win_length=win_size_new,
159
+ window=hann_window[keyshift_key],
160
+ center=center,
161
+ pad_mode="reflect",
162
+ normalized=False,
163
+ onesided=True,
164
+ return_complex=True,
165
+ )
166
+ spec = torch.sqrt(spec.real.pow(2) + spec.imag.pow(2) + (1e-9))
167
+ if keyshift != 0:
168
+ size = n_fft // 2 + 1
169
+ resize = spec.size(1)
170
+ if resize < size:
171
+ spec = F.pad(spec, (0, 0, 0, size - resize))
172
+ spec = spec[:, :size, :] * win_size / win_size_new
173
+ spec = torch.matmul(mel_basis[mel_basis_key], spec)
174
+ spec = dynamic_range_compression_torch(spec, clip_val=clip_val)
175
+ return spec
176
+
177
+ def __call__(self, audiopath):
178
+ audio, sr = load_wav_to_torch(audiopath, target_sr=self.target_sr)
179
+ spect = self.get_mel(audio.unsqueeze(0)).squeeze(0)
180
+ return spect
181
+
182
+
183
+ stft = STFT()
184
+
185
+ # import fast_transformers.causal_product.causal_product_cuda
186
+
187
+
188
+ def softmax_kernel(
189
+ data, *, projection_matrix, is_query, normalize_data=True, eps=1e-4, device=None
190
+ ):
191
+ b, h, *_ = data.shape
192
+ # (batch size, head, length, model_dim)
193
+
194
+ # normalize model dim
195
+ data_normalizer = (data.shape[-1] ** -0.25) if normalize_data else 1.0
196
+
197
+ # what is ration?, projection_matrix.shape[0] --> 266
198
+
199
+ ratio = projection_matrix.shape[0] ** -0.5
200
+
201
+ projection = repeat(projection_matrix, "j d -> b h j d", b=b, h=h)
202
+ projection = projection.type_as(data)
203
+
204
+ # data_dash = w^T x
205
+ data_dash = torch.einsum("...id,...jd->...ij", (data_normalizer * data), projection)
206
+
207
+ # diag_data = D**2
208
+ diag_data = data**2
209
+ diag_data = torch.sum(diag_data, dim=-1)
210
+ diag_data = (diag_data / 2.0) * (data_normalizer**2)
211
+ diag_data = diag_data.unsqueeze(dim=-1)
212
+
213
+ if is_query:
214
+ data_dash = ratio * (
215
+ torch.exp(
216
+ data_dash
217
+ - diag_data
218
+ - torch.max(data_dash, dim=-1, keepdim=True).values
219
+ )
220
+ + eps
221
+ )
222
+ else:
223
+ data_dash = ratio * (
224
+ torch.exp(data_dash - diag_data + eps)
225
+ ) # - torch.max(data_dash)) + eps)
226
+
227
+ return data_dash.type_as(data)
228
+
229
+
230
+ def orthogonal_matrix_chunk(cols, qr_uniform_q=False, device=None):
231
+ unstructured_block = torch.randn((cols, cols), device=device)
232
+ q, r = torch.linalg.qr(unstructured_block.cpu(), mode="reduced")
233
+ q, r = map(lambda t: t.to(device), (q, r))
234
+
235
+ # proposed by @Parskatt
236
+ # to make sure Q is uniform https://arxiv.org/pdf/math-ph/0609050.pdf
237
+ if qr_uniform_q:
238
+ d = torch.diag(r, 0)
239
+ q *= d.sign()
240
+ return q.t()
241
+
242
+
243
+ def exists(val):
244
+ return val is not None
245
+
246
+
247
+ def empty(tensor):
248
+ return tensor.numel() == 0
249
+
250
+
251
+ def default(val, d):
252
+ return val if exists(val) else d
253
+
254
+
255
+ def cast_tuple(val):
256
+ return (val,) if not isinstance(val, tuple) else val
257
+
258
+
259
+ class PCmer(nn.Module):
260
+ """The encoder that is used in the Transformer model."""
261
+
262
+ def __init__(
263
+ self,
264
+ num_layers,
265
+ num_heads,
266
+ dim_model,
267
+ dim_keys,
268
+ dim_values,
269
+ residual_dropout,
270
+ attention_dropout,
271
+ ):
272
+ super().__init__()
273
+ self.num_layers = num_layers
274
+ self.num_heads = num_heads
275
+ self.dim_model = dim_model
276
+ self.dim_values = dim_values
277
+ self.dim_keys = dim_keys
278
+ self.residual_dropout = residual_dropout
279
+ self.attention_dropout = attention_dropout
280
+
281
+ self._layers = nn.ModuleList([_EncoderLayer(self) for _ in range(num_layers)])
282
+
283
+ # METHODS ########################################################################################################
284
+
285
+ def forward(self, phone, mask=None):
286
+
287
+ # apply all layers to the input
288
+ for i, layer in enumerate(self._layers):
289
+ phone = layer(phone, mask)
290
+ # provide the final sequence
291
+ return phone
292
+
293
+
294
+ # ==================================================================================================================== #
295
+ # CLASS _ E N C O D E R L A Y E R #
296
+ # ==================================================================================================================== #
297
+
298
+
299
+ class _EncoderLayer(nn.Module):
300
+ """One layer of the encoder.
301
+
302
+ Attributes:
303
+ attn: (:class:`mha.MultiHeadAttention`): The attention mechanism that is used to read the input sequence.
304
+ feed_forward (:class:`ffl.FeedForwardLayer`): The feed-forward layer on top of the attention mechanism.
305
+ """
306
+
307
+ def __init__(self, parent: PCmer):
308
+ """Creates a new instance of ``_EncoderLayer``.
309
+
310
+ Args:
311
+ parent (Encoder): The encoder that the layers is created for.
312
+ """
313
+ super().__init__()
314
+
315
+ self.conformer = ConformerConvModule(parent.dim_model)
316
+ self.norm = nn.LayerNorm(parent.dim_model)
317
+ self.dropout = nn.Dropout(parent.residual_dropout)
318
+
319
+ # selfatt -> fastatt: performer!
320
+ self.attn = SelfAttention(
321
+ dim=parent.dim_model, heads=parent.num_heads, causal=False
322
+ )
323
+
324
+ # METHODS ########################################################################################################
325
+
326
+ def forward(self, phone, mask=None):
327
+
328
+ # compute attention sub-layer
329
+ phone = phone + (self.attn(self.norm(phone), mask=mask))
330
+
331
+ phone = phone + (self.conformer(phone))
332
+
333
+ return phone
334
+
335
+
336
+ def calc_same_padding(kernel_size):
337
+ pad = kernel_size // 2
338
+ return (pad, pad - (kernel_size + 1) % 2)
339
+
340
+
341
+ # helper classes
342
+
343
+
344
+ class Swish(nn.Module):
345
+ def forward(self, x):
346
+ return x * x.sigmoid()
347
+
348
+
349
+ class Transpose(nn.Module):
350
+ def __init__(self, dims):
351
+ super().__init__()
352
+ assert len(dims) == 2, "dims must be a tuple of two dimensions"
353
+ self.dims = dims
354
+
355
+ def forward(self, x):
356
+ return x.transpose(*self.dims)
357
+
358
+
359
+ class GLU(nn.Module):
360
+ def __init__(self, dim):
361
+ super().__init__()
362
+ self.dim = dim
363
+
364
+ def forward(self, x):
365
+ out, gate = x.chunk(2, dim=self.dim)
366
+ return out * gate.sigmoid()
367
+
368
+
369
+ class DepthWiseConv1d(nn.Module):
370
+ def __init__(self, chan_in, chan_out, kernel_size, padding):
371
+ super().__init__()
372
+ self.padding = padding
373
+ self.conv = nn.Conv1d(chan_in, chan_out, kernel_size, groups=chan_in)
374
+
375
+ def forward(self, x):
376
+ x = F.pad(x, self.padding)
377
+ return self.conv(x)
378
+
379
+
380
+ class ConformerConvModule(nn.Module):
381
+ def __init__(
382
+ self, dim, causal=False, expansion_factor=2, kernel_size=31, dropout=0.0
383
+ ):
384
+ super().__init__()
385
+
386
+ inner_dim = dim * expansion_factor
387
+ padding = calc_same_padding(kernel_size) if not causal else (kernel_size - 1, 0)
388
+
389
+ self.net = nn.Sequential(
390
+ nn.LayerNorm(dim),
391
+ Transpose((1, 2)),
392
+ nn.Conv1d(dim, inner_dim * 2, 1),
393
+ GLU(dim=1),
394
+ DepthWiseConv1d(
395
+ inner_dim, inner_dim, kernel_size=kernel_size, padding=padding
396
+ ),
397
+ # nn.BatchNorm1d(inner_dim) if not causal else nn.Identity(),
398
+ Swish(),
399
+ nn.Conv1d(inner_dim, dim, 1),
400
+ Transpose((1, 2)),
401
+ nn.Dropout(dropout),
402
+ )
403
+
404
+ def forward(self, x):
405
+ return self.net(x)
406
+
407
+
408
+ def linear_attention(q, k, v):
409
+ if v is None:
410
+ out = torch.einsum("...ed,...nd->...ne", k, q)
411
+ return out
412
+
413
+ else:
414
+ k_cumsum = k.sum(dim=-2)
415
+ # k_cumsum = k.sum(dim = -2)
416
+ D_inv = 1.0 / (torch.einsum("...nd,...d->...n", q, k_cumsum.type_as(q)) + 1e-8)
417
+
418
+ context = torch.einsum("...nd,...ne->...de", k, v)
419
+ out = torch.einsum("...de,...nd,...n->...ne", context, q, D_inv)
420
+ return out
421
+
422
+
423
+ def gaussian_orthogonal_random_matrix(
424
+ nb_rows, nb_columns, scaling=0, qr_uniform_q=False, device=None
425
+ ):
426
+ nb_full_blocks = int(nb_rows / nb_columns)
427
+ block_list = []
428
+
429
+ for _ in range(nb_full_blocks):
430
+ q = orthogonal_matrix_chunk(
431
+ nb_columns, qr_uniform_q=qr_uniform_q, device=device
432
+ )
433
+ block_list.append(q)
434
+
435
+ remaining_rows = nb_rows - nb_full_blocks * nb_columns
436
+ if remaining_rows > 0:
437
+ q = orthogonal_matrix_chunk(
438
+ nb_columns, qr_uniform_q=qr_uniform_q, device=device
439
+ )
440
+
441
+ block_list.append(q[:remaining_rows])
442
+
443
+ final_matrix = torch.cat(block_list)
444
+
445
+ if scaling == 0:
446
+ multiplier = torch.randn((nb_rows, nb_columns), device=device).norm(dim=1)
447
+ elif scaling == 1:
448
+ multiplier = math.sqrt((float(nb_columns))) * torch.ones(
449
+ (nb_rows,), device=device
450
+ )
451
+ else:
452
+ raise ValueError(f"Invalid scaling {scaling}")
453
+
454
+ return torch.diag(multiplier) @ final_matrix
455
+
456
+
457
+ class FastAttention(nn.Module):
458
+ def __init__(
459
+ self,
460
+ dim_heads,
461
+ nb_features=None,
462
+ ortho_scaling=0,
463
+ causal=False,
464
+ generalized_attention=False,
465
+ kernel_fn=nn.ReLU(),
466
+ qr_uniform_q=False,
467
+ no_projection=False,
468
+ ):
469
+ super().__init__()
470
+ nb_features = default(nb_features, int(dim_heads * math.log(dim_heads)))
471
+
472
+ self.dim_heads = dim_heads
473
+ self.nb_features = nb_features
474
+ self.ortho_scaling = ortho_scaling
475
+
476
+ self.create_projection = partial(
477
+ gaussian_orthogonal_random_matrix,
478
+ nb_rows=self.nb_features,
479
+ nb_columns=dim_heads,
480
+ scaling=ortho_scaling,
481
+ qr_uniform_q=qr_uniform_q,
482
+ )
483
+ projection_matrix = self.create_projection()
484
+ self.register_buffer("projection_matrix", projection_matrix)
485
+
486
+ self.generalized_attention = generalized_attention
487
+ self.kernel_fn = kernel_fn
488
+
489
+ # if this is turned on, no projection will be used
490
+ # queries and keys will be softmax-ed as in the original efficient attention paper
491
+ self.no_projection = no_projection
492
+
493
+ self.causal = causal
494
+
495
+ @torch.no_grad()
496
+ def redraw_projection_matrix(self):
497
+ projections = self.create_projection()
498
+ self.projection_matrix.copy_(projections)
499
+ del projections
500
+
501
+ def forward(self, q, k, v):
502
+ device = q.device
503
+
504
+ if self.no_projection:
505
+ q = q.softmax(dim=-1)
506
+ k = torch.exp(k) if self.causal else k.softmax(dim=-2)
507
+ else:
508
+ create_kernel = partial(
509
+ softmax_kernel, projection_matrix=self.projection_matrix, device=device
510
+ )
511
+
512
+ q = create_kernel(q, is_query=True)
513
+ k = create_kernel(k, is_query=False)
514
+
515
+ attn_fn = linear_attention if not self.causal else self.causal_linear_fn
516
+ if v is None:
517
+ out = attn_fn(q, k, None)
518
+ return out
519
+ else:
520
+ out = attn_fn(q, k, v)
521
+ return out
522
+
523
+
524
+ class SelfAttention(nn.Module):
525
+ def __init__(
526
+ self,
527
+ dim,
528
+ causal=False,
529
+ heads=8,
530
+ dim_head=64,
531
+ local_heads=0,
532
+ local_window_size=256,
533
+ nb_features=None,
534
+ feature_redraw_interval=1000,
535
+ generalized_attention=False,
536
+ kernel_fn=nn.ReLU(),
537
+ qr_uniform_q=False,
538
+ dropout=0.0,
539
+ no_projection=False,
540
+ ):
541
+ super().__init__()
542
+ assert dim % heads == 0, "dimension must be divisible by number of heads"
543
+ dim_head = default(dim_head, dim // heads)
544
+ inner_dim = dim_head * heads
545
+ self.fast_attention = FastAttention(
546
+ dim_head,
547
+ nb_features,
548
+ causal=causal,
549
+ generalized_attention=generalized_attention,
550
+ kernel_fn=kernel_fn,
551
+ qr_uniform_q=qr_uniform_q,
552
+ no_projection=no_projection,
553
+ )
554
+
555
+ self.heads = heads
556
+ self.global_heads = heads - local_heads
557
+ self.local_attn = (
558
+ LocalAttention(
559
+ window_size=local_window_size,
560
+ causal=causal,
561
+ autopad=True,
562
+ dropout=dropout,
563
+ look_forward=int(not causal),
564
+ rel_pos_emb_config=(dim_head, local_heads),
565
+ )
566
+ if local_heads > 0
567
+ else None
568
+ )
569
+
570
+ self.to_q = nn.Linear(dim, inner_dim)
571
+ self.to_k = nn.Linear(dim, inner_dim)
572
+ self.to_v = nn.Linear(dim, inner_dim)
573
+ self.to_out = nn.Linear(inner_dim, dim)
574
+ self.dropout = nn.Dropout(dropout)
575
+
576
+ @torch.no_grad()
577
+ def redraw_projection_matrix(self):
578
+ self.fast_attention.redraw_projection_matrix()
579
+
580
+ def forward(
581
+ self,
582
+ x,
583
+ context=None,
584
+ mask=None,
585
+ context_mask=None,
586
+ name=None,
587
+ inference=False,
588
+ **kwargs,
589
+ ):
590
+ _, _, _, h, gh = *x.shape, self.heads, self.global_heads
591
+
592
+ cross_attend = exists(context)
593
+
594
+ context = default(context, x)
595
+ context_mask = default(context_mask, mask) if not cross_attend else context_mask
596
+ q, k, v = self.to_q(x), self.to_k(context), self.to_v(context)
597
+
598
+ q, k, v = map(lambda t: rearrange(t, "b n (h d) -> b h n d", h=h), (q, k, v))
599
+ (q, lq), (k, lk), (v, lv) = map(lambda t: (t[:, :gh], t[:, gh:]), (q, k, v))
600
+
601
+ attn_outs = []
602
+ if not empty(q):
603
+ if exists(context_mask):
604
+ global_mask = context_mask[:, None, :, None]
605
+ v.masked_fill_(~global_mask, 0.0)
606
+ if cross_attend:
607
+ pass
608
+ else:
609
+ out = self.fast_attention(q, k, v)
610
+ attn_outs.append(out)
611
+
612
+ if not empty(lq):
613
+ assert (
614
+ not cross_attend
615
+ ), "local attention is not compatible with cross attention"
616
+ out = self.local_attn(lq, lk, lv, input_mask=mask)
617
+ attn_outs.append(out)
618
+
619
+ out = torch.cat(attn_outs, dim=1)
620
+ out = rearrange(out, "b h n d -> b n (h d)")
621
+ out = self.to_out(out)
622
+ return self.dropout(out)
623
+
624
+
625
+ def l2_regularization(model, l2_alpha):
626
+ l2_loss = []
627
+ for module in model.modules():
628
+ if type(module) is nn.Conv2d:
629
+ l2_loss.append((module.weight**2).sum() / 2.0)
630
+ return l2_alpha * sum(l2_loss)
631
+
632
+
633
+ class FCPE(nn.Module):
634
+ def __init__(
635
+ self,
636
+ input_channel=128,
637
+ out_dims=360,
638
+ n_layers=12,
639
+ n_chans=512,
640
+ use_siren=False,
641
+ use_full=False,
642
+ loss_mse_scale=10,
643
+ loss_l2_regularization=False,
644
+ loss_l2_regularization_scale=1,
645
+ loss_grad1_mse=False,
646
+ loss_grad1_mse_scale=1,
647
+ f0_max=1975.5,
648
+ f0_min=32.70,
649
+ confidence=False,
650
+ threshold=0.05,
651
+ use_input_conv=True,
652
+ ):
653
+ super().__init__()
654
+ if use_siren is True:
655
+ raise ValueError("Siren is not supported yet.")
656
+ if use_full is True:
657
+ raise ValueError("Full model is not supported yet.")
658
+
659
+ self.loss_mse_scale = loss_mse_scale if (loss_mse_scale is not None) else 10
660
+ self.loss_l2_regularization = (
661
+ loss_l2_regularization if (loss_l2_regularization is not None) else False
662
+ )
663
+ self.loss_l2_regularization_scale = (
664
+ loss_l2_regularization_scale
665
+ if (loss_l2_regularization_scale is not None)
666
+ else 1
667
+ )
668
+ self.loss_grad1_mse = loss_grad1_mse if (loss_grad1_mse is not None) else False
669
+ self.loss_grad1_mse_scale = (
670
+ loss_grad1_mse_scale if (loss_grad1_mse_scale is not None) else 1
671
+ )
672
+ self.f0_max = f0_max if (f0_max is not None) else 1975.5
673
+ self.f0_min = f0_min if (f0_min is not None) else 32.70
674
+ self.confidence = confidence if (confidence is not None) else False
675
+ self.threshold = threshold if (threshold is not None) else 0.05
676
+ self.use_input_conv = use_input_conv if (use_input_conv is not None) else True
677
+
678
+ self.cent_table_b = torch.Tensor(
679
+ np.linspace(
680
+ self.f0_to_cent(torch.Tensor([f0_min]))[0],
681
+ self.f0_to_cent(torch.Tensor([f0_max]))[0],
682
+ out_dims,
683
+ )
684
+ )
685
+ self.register_buffer("cent_table", self.cent_table_b)
686
+
687
+ # conv in stack
688
+ _leaky = nn.LeakyReLU()
689
+ self.stack = nn.Sequential(
690
+ nn.Conv1d(input_channel, n_chans, 3, 1, 1),
691
+ nn.GroupNorm(4, n_chans),
692
+ _leaky,
693
+ nn.Conv1d(n_chans, n_chans, 3, 1, 1),
694
+ )
695
+
696
+ # transformer
697
+ self.decoder = PCmer(
698
+ num_layers=n_layers,
699
+ num_heads=8,
700
+ dim_model=n_chans,
701
+ dim_keys=n_chans,
702
+ dim_values=n_chans,
703
+ residual_dropout=0.1,
704
+ attention_dropout=0.1,
705
+ )
706
+ self.norm = nn.LayerNorm(n_chans)
707
+
708
+ # out
709
+ self.n_out = out_dims
710
+ self.dense_out = weight_norm(nn.Linear(n_chans, self.n_out))
711
+
712
+ def forward(
713
+ self, mel, infer=True, gt_f0=None, return_hz_f0=False, cdecoder="local_argmax"
714
+ ):
715
+ """
716
+ input:
717
+ B x n_frames x n_unit
718
+ return:
719
+ dict of B x n_frames x feat
720
+ """
721
+ if cdecoder == "argmax":
722
+ self.cdecoder = self.cents_decoder
723
+ elif cdecoder == "local_argmax":
724
+ self.cdecoder = self.cents_local_decoder
725
+ if self.use_input_conv:
726
+ x = self.stack(mel.transpose(1, 2)).transpose(1, 2)
727
+ else:
728
+ x = mel
729
+ x = self.decoder(x)
730
+ x = self.norm(x)
731
+ x = self.dense_out(x) # [B,N,D]
732
+ x = torch.sigmoid(x)
733
+ if not infer:
734
+ gt_cent_f0 = self.f0_to_cent(gt_f0) # mel f0 #[B,N,1]
735
+ gt_cent_f0 = self.gaussian_blurred_cent(gt_cent_f0) # #[B,N,out_dim]
736
+ loss_all = self.loss_mse_scale * F.binary_cross_entropy(
737
+ x, gt_cent_f0
738
+ ) # bce loss
739
+ # l2 regularization
740
+ if self.loss_l2_regularization:
741
+ loss_all = loss_all + l2_regularization(
742
+ model=self, l2_alpha=self.loss_l2_regularization_scale
743
+ )
744
+ x = loss_all
745
+ if infer:
746
+ x = self.cdecoder(x)
747
+ x = self.cent_to_f0(x)
748
+ if not return_hz_f0:
749
+ x = (1 + x / 700).log()
750
+ return x
751
+
752
+ def cents_decoder(self, y, mask=True):
753
+ B, N, _ = y.size()
754
+ ci = self.cent_table[None, None, :].expand(B, N, -1)
755
+ rtn = torch.sum(ci * y, dim=-1, keepdim=True) / torch.sum(
756
+ y, dim=-1, keepdim=True
757
+ ) # cents: [B,N,1]
758
+ if mask:
759
+ confident = torch.max(y, dim=-1, keepdim=True)[0]
760
+ confident_mask = torch.ones_like(confident)
761
+ confident_mask[confident <= self.threshold] = float("-INF")
762
+ rtn = rtn * confident_mask
763
+ if self.confidence:
764
+ return rtn, confident
765
+ else:
766
+ return rtn
767
+
768
+ def cents_local_decoder(self, y, mask=True):
769
+ B, N, _ = y.size()
770
+ ci = self.cent_table[None, None, :].expand(B, N, -1)
771
+ confident, max_index = torch.max(y, dim=-1, keepdim=True)
772
+ local_argmax_index = torch.arange(0, 9).to(max_index.device) + (max_index - 4)
773
+ local_argmax_index[local_argmax_index < 0] = 0
774
+ local_argmax_index[local_argmax_index >= self.n_out] = self.n_out - 1
775
+ ci_l = torch.gather(ci, -1, local_argmax_index)
776
+ y_l = torch.gather(y, -1, local_argmax_index)
777
+ rtn = torch.sum(ci_l * y_l, dim=-1, keepdim=True) / torch.sum(
778
+ y_l, dim=-1, keepdim=True
779
+ ) # cents: [B,N,1]
780
+ if mask:
781
+ confident_mask = torch.ones_like(confident)
782
+ confident_mask[confident <= self.threshold] = float("-INF")
783
+ rtn = rtn * confident_mask
784
+ if self.confidence:
785
+ return rtn, confident
786
+ else:
787
+ return rtn
788
+
789
+ def cent_to_f0(self, cent):
790
+ return 10.0 * 2 ** (cent / 1200.0)
791
+
792
+ def f0_to_cent(self, f0):
793
+ return 1200.0 * torch.log2(f0 / 10.0)
794
+
795
+ def gaussian_blurred_cent(self, cents): # cents: [B,N,1]
796
+ mask = (cents > 0.1) & (cents < (1200.0 * np.log2(self.f0_max / 10.0)))
797
+ B, N, _ = cents.size()
798
+ ci = self.cent_table[None, None, :].expand(B, N, -1)
799
+ return torch.exp(-torch.square(ci - cents) / 1250) * mask.float()
800
+
801
+
802
+ class FCPEInfer:
803
+ def __init__(self, model_path, device=None, dtype=torch.float32):
804
+ if device is None:
805
+ device = "cuda" if torch.cuda.is_available() else "cpu"
806
+ self.device = device
807
+ ckpt = torch.load(model_path, map_location=torch.device(self.device))
808
+ self.args = DotDict(ckpt["config"])
809
+ self.dtype = dtype
810
+ model = FCPE(
811
+ input_channel=self.args.model.input_channel,
812
+ out_dims=self.args.model.out_dims,
813
+ n_layers=self.args.model.n_layers,
814
+ n_chans=self.args.model.n_chans,
815
+ use_siren=self.args.model.use_siren,
816
+ use_full=self.args.model.use_full,
817
+ loss_mse_scale=self.args.loss.loss_mse_scale,
818
+ loss_l2_regularization=self.args.loss.loss_l2_regularization,
819
+ loss_l2_regularization_scale=self.args.loss.loss_l2_regularization_scale,
820
+ loss_grad1_mse=self.args.loss.loss_grad1_mse,
821
+ loss_grad1_mse_scale=self.args.loss.loss_grad1_mse_scale,
822
+ f0_max=self.args.model.f0_max,
823
+ f0_min=self.args.model.f0_min,
824
+ confidence=self.args.model.confidence,
825
+ )
826
+ model.to(self.device).to(self.dtype)
827
+ model.load_state_dict(ckpt["model"])
828
+ model.eval()
829
+ self.model = model
830
+ self.wav2mel = Wav2Mel(self.args, dtype=self.dtype, device=self.device)
831
+
832
+ @torch.no_grad()
833
+ def __call__(self, audio, sr, threshold=0.05):
834
+ self.model.threshold = threshold
835
+ audio = audio[None, :]
836
+ mel = self.wav2mel(audio=audio, sample_rate=sr).to(self.dtype)
837
+ f0 = self.model(mel=mel, infer=True, return_hz_f0=True)
838
+ return f0
839
+
840
+
841
+ class Wav2Mel:
842
+
843
+ def __init__(self, args, device=None, dtype=torch.float32):
844
+ # self.args = args
845
+ self.sampling_rate = args.mel.sampling_rate
846
+ self.hop_size = args.mel.hop_size
847
+ if device is None:
848
+ device = "cuda" if torch.cuda.is_available() else "cpu"
849
+ self.device = device
850
+ self.dtype = dtype
851
+ self.stft = STFT(
852
+ args.mel.sampling_rate,
853
+ args.mel.num_mels,
854
+ args.mel.n_fft,
855
+ args.mel.win_size,
856
+ args.mel.hop_size,
857
+ args.mel.fmin,
858
+ args.mel.fmax,
859
+ )
860
+ self.resample_kernel = {}
861
+
862
+ def extract_nvstft(self, audio, keyshift=0, train=False):
863
+ mel = self.stft.get_mel(audio, keyshift=keyshift, train=train).transpose(
864
+ 1, 2
865
+ ) # B, n_frames, bins
866
+ return mel
867
+
868
+ def extract_mel(self, audio, sample_rate, keyshift=0, train=False):
869
+ audio = audio.to(self.dtype).to(self.device)
870
+ # resample
871
+ if sample_rate == self.sampling_rate:
872
+ audio_res = audio
873
+ else:
874
+ key_str = str(sample_rate)
875
+ if key_str not in self.resample_kernel:
876
+ self.resample_kernel[key_str] = Resample(
877
+ sample_rate, self.sampling_rate, lowpass_filter_width=128
878
+ )
879
+ self.resample_kernel[key_str] = (
880
+ self.resample_kernel[key_str].to(self.dtype).to(self.device)
881
+ )
882
+ audio_res = self.resample_kernel[key_str](audio)
883
+
884
+ # extract
885
+ mel = self.extract_nvstft(
886
+ audio_res, keyshift=keyshift, train=train
887
+ ) # B, n_frames, bins
888
+ n_frames = int(audio.shape[1] // self.hop_size) + 1
889
+ if n_frames > int(mel.shape[1]):
890
+ mel = torch.cat((mel, mel[:, -1:, :]), 1)
891
+ if n_frames < int(mel.shape[1]):
892
+ mel = mel[:, :n_frames, :]
893
+ return mel
894
+
895
+ def __call__(self, audio, sample_rate, keyshift=0, train=False):
896
+ return self.extract_mel(audio, sample_rate, keyshift=keyshift, train=train)
897
+
898
+
899
+ class DotDict(dict):
900
+ def __getattr__(*args):
901
+ val = dict.get(*args)
902
+ return DotDict(val) if type(val) is dict else val
903
+
904
+ __setattr__ = dict.__setitem__
905
+ __delattr__ = dict.__delitem__
906
+
907
+
908
+ class F0Predictor(object):
909
+ def compute_f0(self, wav, p_len):
910
+ """
911
+ input: wav:[signal_length]
912
+ p_len:int
913
+ output: f0:[signal_length//hop_length]
914
+ """
915
+ pass
916
+
917
+ def compute_f0_uv(self, wav, p_len):
918
+ """
919
+ input: wav:[signal_length]
920
+ p_len:int
921
+ output: f0:[signal_length//hop_length],uv:[signal_length//hop_length]
922
+ """
923
+ pass
924
+
925
+
926
+ class FCPEF0Predictor(F0Predictor):
927
+ def __init__(
928
+ self,
929
+ model_path,
930
+ hop_length=512,
931
+ f0_min=50,
932
+ f0_max=1100,
933
+ dtype=torch.float32,
934
+ device=None,
935
+ sampling_rate=44100,
936
+ threshold=0.05,
937
+ ):
938
+ self.fcpe = FCPEInfer(model_path, device=device, dtype=dtype)
939
+ self.hop_length = hop_length
940
+ self.f0_min = f0_min
941
+ self.f0_max = f0_max
942
+ if device is None:
943
+ self.device = "cuda" if torch.cuda.is_available() else "cpu"
944
+ else:
945
+ self.device = device
946
+ self.threshold = threshold
947
+ self.sampling_rate = sampling_rate
948
+ self.dtype = dtype
949
+ self.name = "fcpe"
950
+
951
+ def repeat_expand(
952
+ self,
953
+ content: Union[torch.Tensor, np.ndarray],
954
+ target_len: int,
955
+ mode: str = "nearest",
956
+ ):
957
+ ndim = content.ndim
958
+
959
+ if content.ndim == 1:
960
+ content = content[None, None]
961
+ elif content.ndim == 2:
962
+ content = content[None]
963
+
964
+ assert content.ndim == 3
965
+
966
+ is_np = isinstance(content, np.ndarray)
967
+ if is_np:
968
+ content = torch.from_numpy(content)
969
+
970
+ results = torch.nn.functional.interpolate(content, size=target_len, mode=mode)
971
+
972
+ if is_np:
973
+ results = results.numpy()
974
+
975
+ if ndim == 1:
976
+ return results[0, 0]
977
+ elif ndim == 2:
978
+ return results[0]
979
+
980
+ def post_process(self, x, sampling_rate, f0, pad_to):
981
+ if isinstance(f0, np.ndarray):
982
+ f0 = torch.from_numpy(f0).float().to(x.device)
983
+
984
+ if pad_to is None:
985
+ return f0
986
+
987
+ f0 = self.repeat_expand(f0, pad_to)
988
+
989
+ vuv_vector = torch.zeros_like(f0)
990
+ vuv_vector[f0 > 0.0] = 1.0
991
+ vuv_vector[f0 <= 0.0] = 0.0
992
+
993
+ # Remove zero-frequency entries and interpolate linearly
994
+ nzindex = torch.nonzero(f0).squeeze()
995
+ f0 = torch.index_select(f0, dim=0, index=nzindex).cpu().numpy()
996
+ time_org = self.hop_length / sampling_rate * nzindex.cpu().numpy()
997
+ time_frame = np.arange(pad_to) * self.hop_length / sampling_rate
998
+
999
+ vuv_vector = F.interpolate(vuv_vector[None, None, :], size=pad_to)[0][0]
1000
+
1001
+ if f0.shape[0] <= 0:
1002
+ return (
1003
+ torch.zeros(pad_to, dtype=torch.float, device=x.device).cpu().numpy(),
1004
+ vuv_vector.cpu().numpy(),
1005
+ )
1006
+ if f0.shape[0] == 1:
1007
+ return (
1008
+ torch.ones(pad_to, dtype=torch.float, device=x.device) * f0[0]
1009
+ ).cpu().numpy(), vuv_vector.cpu().numpy()
1010
+
1011
+ # This could probably be rewritten with torch?
1012
+ f0 = np.interp(time_frame, time_org, f0, left=f0[0], right=f0[-1])
1013
+ # vuv_vector = np.ceil(scipy.ndimage.zoom(vuv_vector,pad_to/len(vuv_vector),order = 0))
1014
+
1015
+ return f0, vuv_vector.cpu().numpy()
1016
+
1017
+ def compute_f0(self, wav, p_len=None):
1018
+ x = torch.FloatTensor(wav).to(self.dtype).to(self.device)
1019
+ if p_len is None:
1020
+ print("fcpe p_len is None")
1021
+ p_len = x.shape[0] // self.hop_length
1022
+ f0 = self.fcpe(x, sr=self.sampling_rate, threshold=self.threshold)[0, :, 0]
1023
+ if torch.all(f0 == 0):
1024
+ rtn = f0.cpu().numpy() if p_len is None else np.zeros(p_len)
1025
+ return rtn, rtn
1026
+ return self.post_process(x, self.sampling_rate, f0, p_len)[0]
1027
+
1028
+ def compute_f0_uv(self, wav, p_len=None):
1029
+ x = torch.FloatTensor(wav).to(self.dtype).to(self.device)
1030
+ if p_len is None:
1031
+ p_len = x.shape[0] // self.hop_length
1032
+ f0 = self.fcpe(x, sr=self.sampling_rate, threshold=self.threshold)[0, :, 0]
1033
+ if torch.all(f0 == 0):
1034
+ rtn = f0.cpu().numpy() if p_len is None else np.zeros(p_len)
1035
+ return rtn, rtn
1036
+ return self.post_process(x, self.sampling_rate, f0, p_len)
rvc/lib/infer_pack/models.py CHANGED
@@ -178,7 +178,7 @@ class ResidualCouplingBlock(nn.Module):
178
  for i in range(self.n_flows):
179
  for hook in self.flows[i * 2]._forward_pre_hooks.values():
180
  if (
181
- hook.__module__ == "torch.nn.utils.weight_norm"
182
  and hook.__class__.__name__ == "WeightNorm"
183
  ):
184
  torch.nn.utils.remove_weight_norm(self.flows[i * 2])
@@ -235,7 +235,7 @@ class PosteriorEncoder(nn.Module):
235
  def __prepare_scriptable__(self):
236
  for hook in self.enc._forward_pre_hooks.values():
237
  if (
238
- hook.__module__ == "torch.nn.utils.weight_norm"
239
  and hook.__class__.__name__ == "WeightNorm"
240
  ):
241
  torch.nn.utils.remove_weight_norm(self.enc)
@@ -319,7 +319,7 @@ class Generator(torch.nn.Module):
319
  # because of shadowing, so we check the module name directly.
320
  # https://github.com/pytorch/pytorch/blob/be0ca00c5ce260eb5bcec3237357f7a30cc08983/torch/nn/utils/__init__.py#L3
321
  if (
322
- hook.__module__ == "torch.nn.utils.weight_norm"
323
  and hook.__class__.__name__ == "WeightNorm"
324
  ):
325
  torch.nn.utils.remove_weight_norm(l)
@@ -327,7 +327,7 @@ class Generator(torch.nn.Module):
327
  for l in self.resblocks:
328
  for hook in l._forward_pre_hooks.values():
329
  if (
330
- hook.__module__ == "torch.nn.utils.weight_norm"
331
  and hook.__class__.__name__ == "WeightNorm"
332
  ):
333
  torch.nn.utils.remove_weight_norm(l)
@@ -610,14 +610,14 @@ class GeneratorNSF(torch.nn.Module):
610
  # because of shadowing, so we check the module name directly.
611
  # https://github.com/pytorch/pytorch/blob/be0ca00c5ce260eb5bcec3237357f7a30cc08983/torch/nn/utils/__init__.py#L3
612
  if (
613
- hook.__module__ == "torch.nn.utils.weight_norm"
614
  and hook.__class__.__name__ == "WeightNorm"
615
  ):
616
  torch.nn.utils.remove_weight_norm(l)
617
  for l in self.resblocks:
618
  for hook in self.resblocks._forward_pre_hooks.values():
619
  if (
620
- hook.__module__ == "torch.nn.utils.weight_norm"
621
  and hook.__class__.__name__ == "WeightNorm"
622
  ):
623
  torch.nn.utils.remove_weight_norm(l)
@@ -722,20 +722,20 @@ class SynthesizerTrnMs256NSFsid(nn.Module):
722
  # because of shadowing, so we check the module name directly.
723
  # https://github.com/pytorch/pytorch/blob/be0ca00c5ce260eb5bcec3237357f7a30cc08983/torch/nn/utils/__init__.py#L3
724
  if (
725
- hook.__module__ == "torch.nn.utils.weight_norm"
726
  and hook.__class__.__name__ == "WeightNorm"
727
  ):
728
  torch.nn.utils.remove_weight_norm(self.dec)
729
  for hook in self.flow._forward_pre_hooks.values():
730
  if (
731
- hook.__module__ == "torch.nn.utils.weight_norm"
732
  and hook.__class__.__name__ == "WeightNorm"
733
  ):
734
  torch.nn.utils.remove_weight_norm(self.flow)
735
  if hasattr(self, "enc_q"):
736
  for hook in self.enc_q._forward_pre_hooks.values():
737
  if (
738
- hook.__module__ == "torch.nn.utils.weight_norm"
739
  and hook.__class__.__name__ == "WeightNorm"
740
  ):
741
  torch.nn.utils.remove_weight_norm(self.enc_q)
@@ -881,20 +881,20 @@ class SynthesizerTrnMs768NSFsid(nn.Module):
881
  # because of shadowing, so we check the module name directly.
882
  # https://github.com/pytorch/pytorch/blob/be0ca00c5ce260eb5bcec3237357f7a30cc08983/torch/nn/utils/__init__.py#L3
883
  if (
884
- hook.__module__ == "torch.nn.utils.weight_norm"
885
  and hook.__class__.__name__ == "WeightNorm"
886
  ):
887
  torch.nn.utils.remove_weight_norm(self.dec)
888
  for hook in self.flow._forward_pre_hooks.values():
889
  if (
890
- hook.__module__ == "torch.nn.utils.weight_norm"
891
  and hook.__class__.__name__ == "WeightNorm"
892
  ):
893
  torch.nn.utils.remove_weight_norm(self.flow)
894
  if hasattr(self, "enc_q"):
895
  for hook in self.enc_q._forward_pre_hooks.values():
896
  if (
897
- hook.__module__ == "torch.nn.utils.weight_norm"
898
  and hook.__class__.__name__ == "WeightNorm"
899
  ):
900
  torch.nn.utils.remove_weight_norm(self.enc_q)
@@ -1029,20 +1029,20 @@ class SynthesizerTrnMs256NSFsid_nono(nn.Module):
1029
  # because of shadowing, so we check the module name directly.
1030
  # https://github.com/pytorch/pytorch/blob/be0ca00c5ce260eb5bcec3237357f7a30cc08983/torch/nn/utils/__init__.py#L3
1031
  if (
1032
- hook.__module__ == "torch.nn.utils.weight_norm"
1033
  and hook.__class__.__name__ == "WeightNorm"
1034
  ):
1035
  torch.nn.utils.remove_weight_norm(self.dec)
1036
  for hook in self.flow._forward_pre_hooks.values():
1037
  if (
1038
- hook.__module__ == "torch.nn.utils.weight_norm"
1039
  and hook.__class__.__name__ == "WeightNorm"
1040
  ):
1041
  torch.nn.utils.remove_weight_norm(self.flow)
1042
  if hasattr(self, "enc_q"):
1043
  for hook in self.enc_q._forward_pre_hooks.values():
1044
  if (
1045
- hook.__module__ == "torch.nn.utils.weight_norm"
1046
  and hook.__class__.__name__ == "WeightNorm"
1047
  ):
1048
  torch.nn.utils.remove_weight_norm(self.enc_q)
@@ -1168,20 +1168,20 @@ class SynthesizerTrnMs768NSFsid_nono(nn.Module):
1168
  # because of shadowing, so we check the module name directly.
1169
  # https://github.com/pytorch/pytorch/blob/be0ca00c5ce260eb5bcec3237357f7a30cc08983/torch/nn/utils/__init__.py#L3
1170
  if (
1171
- hook.__module__ == "torch.nn.utils.weight_norm"
1172
  and hook.__class__.__name__ == "WeightNorm"
1173
  ):
1174
  torch.nn.utils.remove_weight_norm(self.dec)
1175
  for hook in self.flow._forward_pre_hooks.values():
1176
  if (
1177
- hook.__module__ == "torch.nn.utils.weight_norm"
1178
  and hook.__class__.__name__ == "WeightNorm"
1179
  ):
1180
  torch.nn.utils.remove_weight_norm(self.flow)
1181
  if hasattr(self, "enc_q"):
1182
  for hook in self.enc_q._forward_pre_hooks.values():
1183
  if (
1184
- hook.__module__ == "torch.nn.utils.weight_norm"
1185
  and hook.__class__.__name__ == "WeightNorm"
1186
  ):
1187
  torch.nn.utils.remove_weight_norm(self.enc_q)
 
178
  for i in range(self.n_flows):
179
  for hook in self.flows[i * 2]._forward_pre_hooks.values():
180
  if (
181
+ hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
182
  and hook.__class__.__name__ == "WeightNorm"
183
  ):
184
  torch.nn.utils.remove_weight_norm(self.flows[i * 2])
 
235
  def __prepare_scriptable__(self):
236
  for hook in self.enc._forward_pre_hooks.values():
237
  if (
238
+ hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
239
  and hook.__class__.__name__ == "WeightNorm"
240
  ):
241
  torch.nn.utils.remove_weight_norm(self.enc)
 
319
  # because of shadowing, so we check the module name directly.
320
  # https://github.com/pytorch/pytorch/blob/be0ca00c5ce260eb5bcec3237357f7a30cc08983/torch/nn/utils/__init__.py#L3
321
  if (
322
+ hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
323
  and hook.__class__.__name__ == "WeightNorm"
324
  ):
325
  torch.nn.utils.remove_weight_norm(l)
 
327
  for l in self.resblocks:
328
  for hook in l._forward_pre_hooks.values():
329
  if (
330
+ hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
331
  and hook.__class__.__name__ == "WeightNorm"
332
  ):
333
  torch.nn.utils.remove_weight_norm(l)
 
610
  # because of shadowing, so we check the module name directly.
611
  # https://github.com/pytorch/pytorch/blob/be0ca00c5ce260eb5bcec3237357f7a30cc08983/torch/nn/utils/__init__.py#L3
612
  if (
613
+ hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
614
  and hook.__class__.__name__ == "WeightNorm"
615
  ):
616
  torch.nn.utils.remove_weight_norm(l)
617
  for l in self.resblocks:
618
  for hook in self.resblocks._forward_pre_hooks.values():
619
  if (
620
+ hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
621
  and hook.__class__.__name__ == "WeightNorm"
622
  ):
623
  torch.nn.utils.remove_weight_norm(l)
 
722
  # because of shadowing, so we check the module name directly.
723
  # https://github.com/pytorch/pytorch/blob/be0ca00c5ce260eb5bcec3237357f7a30cc08983/torch/nn/utils/__init__.py#L3
724
  if (
725
+ hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
726
  and hook.__class__.__name__ == "WeightNorm"
727
  ):
728
  torch.nn.utils.remove_weight_norm(self.dec)
729
  for hook in self.flow._forward_pre_hooks.values():
730
  if (
731
+ hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
732
  and hook.__class__.__name__ == "WeightNorm"
733
  ):
734
  torch.nn.utils.remove_weight_norm(self.flow)
735
  if hasattr(self, "enc_q"):
736
  for hook in self.enc_q._forward_pre_hooks.values():
737
  if (
738
+ hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
739
  and hook.__class__.__name__ == "WeightNorm"
740
  ):
741
  torch.nn.utils.remove_weight_norm(self.enc_q)
 
881
  # because of shadowing, so we check the module name directly.
882
  # https://github.com/pytorch/pytorch/blob/be0ca00c5ce260eb5bcec3237357f7a30cc08983/torch/nn/utils/__init__.py#L3
883
  if (
884
+ hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
885
  and hook.__class__.__name__ == "WeightNorm"
886
  ):
887
  torch.nn.utils.remove_weight_norm(self.dec)
888
  for hook in self.flow._forward_pre_hooks.values():
889
  if (
890
+ hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
891
  and hook.__class__.__name__ == "WeightNorm"
892
  ):
893
  torch.nn.utils.remove_weight_norm(self.flow)
894
  if hasattr(self, "enc_q"):
895
  for hook in self.enc_q._forward_pre_hooks.values():
896
  if (
897
+ hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
898
  and hook.__class__.__name__ == "WeightNorm"
899
  ):
900
  torch.nn.utils.remove_weight_norm(self.enc_q)
 
1029
  # because of shadowing, so we check the module name directly.
1030
  # https://github.com/pytorch/pytorch/blob/be0ca00c5ce260eb5bcec3237357f7a30cc08983/torch/nn/utils/__init__.py#L3
1031
  if (
1032
+ hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
1033
  and hook.__class__.__name__ == "WeightNorm"
1034
  ):
1035
  torch.nn.utils.remove_weight_norm(self.dec)
1036
  for hook in self.flow._forward_pre_hooks.values():
1037
  if (
1038
+ hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
1039
  and hook.__class__.__name__ == "WeightNorm"
1040
  ):
1041
  torch.nn.utils.remove_weight_norm(self.flow)
1042
  if hasattr(self, "enc_q"):
1043
  for hook in self.enc_q._forward_pre_hooks.values():
1044
  if (
1045
+ hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
1046
  and hook.__class__.__name__ == "WeightNorm"
1047
  ):
1048
  torch.nn.utils.remove_weight_norm(self.enc_q)
 
1168
  # because of shadowing, so we check the module name directly.
1169
  # https://github.com/pytorch/pytorch/blob/be0ca00c5ce260eb5bcec3237357f7a30cc08983/torch/nn/utils/__init__.py#L3
1170
  if (
1171
+ hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
1172
  and hook.__class__.__name__ == "WeightNorm"
1173
  ):
1174
  torch.nn.utils.remove_weight_norm(self.dec)
1175
  for hook in self.flow._forward_pre_hooks.values():
1176
  if (
1177
+ hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
1178
  and hook.__class__.__name__ == "WeightNorm"
1179
  ):
1180
  torch.nn.utils.remove_weight_norm(self.flow)
1181
  if hasattr(self, "enc_q"):
1182
  for hook in self.enc_q._forward_pre_hooks.values():
1183
  if (
1184
+ hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
1185
  and hook.__class__.__name__ == "WeightNorm"
1186
  ):
1187
  torch.nn.utils.remove_weight_norm(self.enc_q)
rvc/lib/tools/analyzer.py ADDED
@@ -0,0 +1,76 @@
1
+ import numpy as np
2
+ import matplotlib.pyplot as plt
3
+ import librosa.display
4
+ import librosa
5
+
6
+
7
+ def calculate_features(y, sr):
8
+ stft = np.abs(librosa.stft(y))
9
+ duration = librosa.get_duration(y=y, sr=sr)
10
+ cent = librosa.feature.spectral_centroid(S=stft, sr=sr)[0]
11
+ bw = librosa.feature.spectral_bandwidth(S=stft, sr=sr)[0]
12
+ rolloff = librosa.feature.spectral_rolloff(S=stft, sr=sr)[0]
13
+ return stft, duration, cent, bw, rolloff
14
+
15
+
16
+ def plot_title(title):
17
+ plt.suptitle(title, fontsize=16, fontweight="bold")
18
+
19
+
20
+ def plot_spectrogram(y, sr, stft, duration, cmap="inferno"):
21
+ plt.subplot(3, 1, 1)
22
+ plt.imshow(
23
+ librosa.amplitude_to_db(stft, ref=np.max),
24
+ origin="lower",
25
+ extent=[0, duration, 0, sr / 1000],
26
+ aspect="auto",
27
+ cmap=cmap, # Change the colormap here
28
+ )
29
+ plt.colorbar(format="%+2.0f dB")
30
+ plt.xlabel("Time (s)")
31
+ plt.ylabel("Frequency (kHz)")
32
+ plt.title("Spectrogram")
33
+
34
+
35
+ def plot_waveform(y, sr, duration):
36
+ plt.subplot(3, 1, 2)
37
+ librosa.display.waveshow(y, sr=sr)
38
+ plt.xlabel("Time (s)")
39
+ plt.ylabel("Amplitude")
40
+ plt.title("Waveform")
41
+
42
+
43
+ def plot_features(times, cent, bw, rolloff, duration):
44
+ plt.subplot(3, 1, 3)
45
+ plt.plot(times, cent, label="Spectral Centroid (kHz)", color="b")
46
+ plt.plot(times, bw, label="Spectral Bandwidth (kHz)", color="g")
47
+ plt.plot(times, rolloff, label="Spectral Rolloff (kHz)", color="r")
48
+ plt.xlabel("Time (s)")
49
+ plt.title("Spectral Features")
50
+ plt.legend()
51
+
52
+
53
+ def analyze_audio(audio_file, save_plot_path="logs/audio_analysis.png"):
54
+ y, sr = librosa.load(audio_file)
55
+ stft, duration, cent, bw, rolloff = calculate_features(y, sr)
56
+
57
+ plt.figure(figsize=(12, 10))
58
+
59
+ plot_title("Audio Analysis" + " - " + audio_file.split("/")[-1])
60
+ plot_spectrogram(y, sr, stft, duration)
61
+ plot_waveform(y, sr, duration)
62
+ plot_features(librosa.times_like(cent), cent, bw, rolloff, duration)
63
+
64
+ plt.tight_layout()
65
+
66
+ if save_plot_path:
67
+ plt.savefig(save_plot_path, bbox_inches="tight", dpi=300)
68
+ plt.close()
69
+
70
+ audio_info = f"""Sample Rate: {sr}\nDuration: {(
71
+ str(round(duration, 2)) + " seconds"
72
+ if duration < 60
73
+ else str(round(duration / 60, 2)) + " minutes"
74
+ )}\nNumber of Samples: {len(y)}\nBits per Sample: {librosa.get_samplerate(audio_file)}\nChannels: {"Mono (1)" if y.ndim == 1 else "Stereo (2)"}"""
75
+
76
+ return audio_info, save_plot_path
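For reference, a minimal usage sketch of the new analyzer module; the import path follows the file location above, and the audio file name is a placeholder.
# Hypothetical usage of the new audio analyzer; "my_audio.wav" is a placeholder path.
from rvc.lib.tools.analyzer import analyze_audio

info, plot_path = analyze_audio("my_audio.wav", save_plot_path="logs/audio_analysis.png")
print(info)       # sample rate, duration, channel layout, etc.
print(plot_path)  # PNG with spectrogram, waveform and spectral features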
rvc/lib/tools/gdown.py CHANGED
@@ -16,6 +16,7 @@ import requests
16
  import six
17
  import tqdm
18
 
 
19
  def indent(text, prefix):
20
  def prefixed_lines():
21
  for line in text.splitlines(True):
@@ -23,6 +24,7 @@ def indent(text, prefix):
23
 
24
  return "".join(prefixed_lines())
25
 
 
26
  class FileURLRetrievalError(Exception):
27
  pass
28
 
@@ -30,6 +32,7 @@ class FileURLRetrievalError(Exception):
30
  class FolderContentsMaximumLimitError(Exception):
31
  pass
32
 
 
33
  def parse_url(url, warning=True):
34
  """Parse URLs especially for Google Drive links.
35
 
@@ -93,11 +96,17 @@ def get_url_from_gdrive_confirmation(contents):
93
  m = re.search(r'href="/open\?id=([^"]+)"', contents)
94
  if m:
95
  url = m.groups()[0]
96
- uuid = re.search(r'<input\s+type="hidden"\s+name="uuid"\s+value="([^"]+)"', contents)
 
 
97
  uuid = uuid.groups()[0]
98
- url = "https://drive.usercontent.google.com/download?id=" + url + "&confirm=t&uuid=" + uuid
 
 
 
 
 
99
  return url
100
-
101
 
102
  m = re.search(r'"downloadUrl":"([^"]+)', contents)
103
  if m:
@@ -116,6 +125,8 @@ def get_url_from_gdrive_confirmation(contents):
116
  "You may need to change the permission to "
117
  "'Anyone with the link', or have had many accesses."
118
  )
 
 
119
  def _get_session(proxy, use_cookies, return_cookies_file=False):
120
  sess = requests.session()
121
 
@@ -211,16 +222,12 @@ def download(
211
  url_origin = url
212
  is_gdrive_download_link = True
213
 
214
-
215
-
216
  while True:
217
  res = sess.get(url, stream=True, verify=verify)
218
 
219
  if url == url_origin and res.status_code == 500:
220
  # The file could be Google Docs or Spreadsheets.
221
- url = "https://drive.google.com/open?id={id}".format(
222
- id=gdrive_file_id
223
- )
224
  continue
225
 
226
  if res.headers["Content-Type"].startswith("text/html"):
 
16
  import six
17
  import tqdm
18
 
19
+
20
  def indent(text, prefix):
21
  def prefixed_lines():
22
  for line in text.splitlines(True):
 
24
 
25
  return "".join(prefixed_lines())
26
 
27
+
28
  class FileURLRetrievalError(Exception):
29
  pass
30
 
 
32
  class FolderContentsMaximumLimitError(Exception):
33
  pass
34
 
35
+
36
  def parse_url(url, warning=True):
37
  """Parse URLs especially for Google Drive links.
38
 
 
96
  m = re.search(r'href="/open\?id=([^"]+)"', contents)
97
  if m:
98
  url = m.groups()[0]
99
+ uuid = re.search(
100
+ r'<input\s+type="hidden"\s+name="uuid"\s+value="([^"]+)"', contents
101
+ )
102
  uuid = uuid.groups()[0]
103
+ url = (
104
+ "https://drive.usercontent.google.com/download?id="
105
+ + url
106
+ + "&confirm=t&uuid="
107
+ + uuid
108
+ )
109
  return url
 
110
 
111
  m = re.search(r'"downloadUrl":"([^"]+)', contents)
112
  if m:
 
125
  "You may need to change the permission to "
126
  "'Anyone with the link', or have had many accesses."
127
  )
128
+
129
+
130
  def _get_session(proxy, use_cookies, return_cookies_file=False):
131
  sess = requests.session()
132
 
 
222
  url_origin = url
223
  is_gdrive_download_link = True
224
 
 
 
225
  while True:
226
  res = sess.get(url, stream=True, verify=verify)
227
 
228
  if url == url_origin and res.status_code == 500:
229
  # The file could be Google Docs or Spreadsheets.
230
+ url = "https://drive.google.com/open?id={id}".format(id=gdrive_file_id)
 
 
231
  continue
232
 
233
  if res.headers["Content-Type"].startswith("text/html"):
rvc/lib/tools/launch_tensorboard.py CHANGED
@@ -3,7 +3,8 @@ from tensorboard import program
3
 
4
  log_path = "logs"
5
 
6
- if __name__ == "__main__":
 
7
  tb = program.TensorBoard()
8
  tb.configure(argv=[None, "--logdir", log_path])
9
  url = tb.launch()
 
3
 
4
  log_path = "logs"
5
 
6
+
7
+ def launch_tensorboard_pipeline():
8
  tb = program.TensorBoard()
9
  tb.configure(argv=[None, "--logdir", log_path])
10
  url = tb.launch()
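The TensorBoard launcher is now exposed as a callable instead of a __main__ guard; a minimal sketch of how it might be invoked (function name taken from the hunk above).
# Hypothetical caller; launch_tensorboard_pipeline() configures and starts TensorBoard on the "logs" directory.
from rvc.lib.tools.launch_tensorboard import launch_tensorboard_pipeline

launch_tensorboard_pipeline()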
rvc/lib/tools/model_download.py CHANGED
@@ -4,9 +4,11 @@ import wget
4
  import zipfile
5
  from bs4 import BeautifulSoup
6
  import requests
7
- from urllib.parse import unquote
8
  import re
9
  import shutil
 
 
10
 
11
  def find_folder_parent(search_dir, folder_name):
12
  for dirpath, dirnames, _ in os.walk(search_dir):
@@ -14,12 +16,13 @@ def find_folder_parent(search_dir, folder_name):
14
  return os.path.abspath(dirpath)
15
  return None
16
 
 
17
  now_dir = os.getcwd()
18
  sys.path.append(now_dir)
19
 
20
  from rvc.lib.utils import format_title
21
 
22
- import rvc.lib.tools.gdown as gdown
23
 
24
  file_path = find_folder_parent(now_dir, "logs")
25
 
@@ -71,7 +74,7 @@ def download_from_url(url):
71
  try:
72
  gdown.download(
73
  f"https://drive.google.com/uc?id={file_id}",
74
- quiet=False,
75
  fuzzy=True,
76
  )
77
  except Exception as error:
@@ -91,7 +94,60 @@ def download_from_url(url):
91
  print(error_message)
92
  os.chdir(now_dir)
93
  return None
94
 
95
  elif "/blob/" in url or "/resolve/" in url:
96
  os.chdir(zips_path)
97
  if "/blob/" in url:
@@ -99,11 +155,12 @@ def download_from_url(url):
99
 
100
  response = requests.get(url, stream=True)
101
  if response.status_code == 200:
102
- file_name = url.split("/")[-1]
103
- file_name = unquote(file_name)
104
-
105
- file_name = re.sub(r"[^a-zA-Z0-9_.-]", "_", file_name)
106
-
 
107
  total_size_in_bytes = int(response.headers.get("content-length", 0))
108
  block_size = 1024
109
  progress_bar_length = 50
@@ -152,6 +209,31 @@ def download_from_url(url):
152
  else:
153
  os.chdir(now_dir)
154
  return None
155
  else:
156
  try:
157
  os.chdir(zips_path)
@@ -197,73 +279,86 @@ def unzip_file(zip_path, zip_file_name):
197
  os.remove(zip_file_path)
198
 
199
 
200
- url = sys.argv[1]
201
-
202
- if "?download=true" in url:
203
- url = url.replace("?download=true", "")
204
-
205
- verify = download_from_url(url)
206
-
207
- if verify == "downloaded":
208
- extract_folder_path = ""
209
- for filename in os.listdir(zips_path):
210
- if filename.endswith(".zip"):
211
- zipfile_path = os.path.join(zips_path, filename)
212
- print("Proceeding with the extraction...")
213
-
214
- model_zip = os.path.basename(zipfile_path)
215
- model_name = format_title(model_zip.split(".zip")[0])
216
- extract_folder_path = os.path.join(
217
- "logs",
218
- os.path.normpath(model_name),
219
- )
220
-
221
- success = extract_and_show_progress(zipfile_path, extract_folder_path)
222
-
223
- subfolders = [f for f in os.listdir(extract_folder_path) if os.path.isdir(os.path.join(extract_folder_path, f))]
224
- if len(subfolders) == 1:
225
- subfolder_path = os.path.join(extract_folder_path, subfolders[0])
226
- for item in os.listdir(subfolder_path):
227
- s = os.path.join(subfolder_path, item)
228
- d = os.path.join(extract_folder_path, item)
229
- shutil.move(s, d)
230
- os.rmdir(subfolder_path)
231
-
232
- for item in os.listdir(extract_folder_path):
233
- if ".pth" in item:
234
- file_name = item.split(".pth")[0]
235
- if file_name != model_name:
236
- os.rename(
237
- os.path.join(extract_folder_path, item),
238
- os.path.join(extract_folder_path, model_name + ".pth"),
239
- )
240
- else:
241
- if "v2" not in item:
242
- file_name = item.split("_nprobe_1_")[1].split("_v1")[0]
243
  if file_name != model_name:
244
- new_file_name = item.split("_nprobe_1_")[0] + "_nprobe_1_" + model_name + "_v1"
245
  os.rename(
246
  os.path.join(extract_folder_path, item),
247
- os.path.join(extract_folder_path, new_file_name + ".index"),
248
  )
249
  else:
250
- file_name = item.split("_nprobe_1_")[1].split("_v2")[0]
251
- if file_name != model_name:
252
- new_file_name = item.split("_nprobe_1_")[0] + "_nprobe_1_" + model_name + "_v2"
253
- os.rename(
254
- os.path.join(extract_folder_path, item),
255
- os.path.join(extract_folder_path, new_file_name + ".index"),
256
- )
257
-
258
- if success:
259
- print(f"Model {model_name} downloaded!")
260
- else:
261
- print(f"Error downloading {model_name}")
262
- sys.exit()
263
- if extract_folder_path == "":
264
- print("Zip file was not found.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
265
  sys.exit()
266
- result = search_pth_index(extract_folder_path)
267
- else:
268
- message = "Error"
269
- sys.exit()
 
4
  import zipfile
5
  from bs4 import BeautifulSoup
6
  import requests
7
+ from urllib.parse import unquote, urlencode, parse_qs, urlparse
8
  import re
9
  import shutil
10
+ import six
11
+
12
 
13
  def find_folder_parent(search_dir, folder_name):
14
  for dirpath, dirnames, _ in os.walk(search_dir):
 
16
  return os.path.abspath(dirpath)
17
  return None
18
 
19
+
20
  now_dir = os.getcwd()
21
  sys.path.append(now_dir)
22
 
23
  from rvc.lib.utils import format_title
24
 
25
+ from rvc.lib.tools import gdown
26
 
27
  file_path = find_folder_parent(now_dir, "logs")
28
 
 
74
  try:
75
  gdown.download(
76
  f"https://drive.google.com/uc?id={file_id}",
77
+ quiet=True,
78
  fuzzy=True,
79
  )
80
  except Exception as error:
 
94
  print(error_message)
95
  os.chdir(now_dir)
96
  return None
97
+ elif "disk.yandex.ru" in url:
98
+ base_url = "https://cloud-api.yandex.net/v1/disk/public/resources/download?"
99
+ public_key = url
100
+ final_url = base_url + urlencode(dict(public_key=public_key))
101
+ response = requests.get(final_url)
102
+ download_url = response.json()["href"]
103
+ download_response = requests.get(download_url)
104
+
105
+ if download_response.status_code == 200:
106
+ filename = parse_qs(urlparse(unquote(download_url)).query).get(
107
+ "filename", [""]
108
+ )[0]
109
+ if filename:
110
+ os.chdir(zips_path)
111
+ with open(filename, "wb") as f:
112
+ f.write(download_response.content)
113
+ else:
114
+ print("Failed to get filename from URL.")
115
+ return None
116
 
117
+ elif "pixeldrain.com" in url:
118
+ try:
119
+ file_id = url.split("pixeldrain.com/u/")[1]
120
+ os.chdir(zips_path)
121
+ print(file_id)
122
+ response = requests.get(f"https://pixeldrain.com/api/file/{file_id}")
123
+ if response.status_code == 200:
124
+ file_name = (
125
+ response.headers.get("Content-Disposition")
126
+ .split("filename=")[-1]
127
+ .strip('";')
128
+ )
129
+ os.makedirs(zips_path, exist_ok=True)
130
+ with open(os.path.join(zips_path, file_name), "wb") as newfile:
131
+ newfile.write(response.content)
132
+ os.chdir(file_path)
133
+ return "downloaded"
134
+ else:
135
+ os.chdir(file_path)
136
+ return None
137
+ except Exception as e:
138
+ print(e)
139
+ os.chdir(file_path)
140
+ return None
141
+
142
+ elif "cdn.discordapp.com" in url:
143
+ file = requests.get(url)
144
+ os.chdir(zips_path)
145
+ if file.status_code == 200:
146
+ name = url.split("/")
147
+ with open(os.path.join(name[-1]), "wb") as newfile:
148
+ newfile.write(file.content)
149
+ else:
150
+ return None
151
  elif "/blob/" in url or "/resolve/" in url:
152
  os.chdir(zips_path)
153
  if "/blob/" in url:
 
155
 
156
  response = requests.get(url, stream=True)
157
  if response.status_code == 200:
158
+ content_disposition = six.moves.urllib_parse.unquote(
159
+ response.headers["Content-Disposition"]
160
+ )
161
+ m = re.search(r'filename="([^"]+)"', content_disposition)
162
+ file_name = m.groups()[0]
163
+ file_name = file_name.replace(os.path.sep, "_")
164
  total_size_in_bytes = int(response.headers.get("content-length", 0))
165
  block_size = 1024
166
  progress_bar_length = 50
 
209
  else:
210
  os.chdir(now_dir)
211
  return None
212
+ elif "applio.org" in url:
213
+ parts = url.split("/")
214
+ id_with_query = parts[-1]
215
+ id_parts = id_with_query.split("?")
216
+ id_number = id_parts[0]
217
+
218
+ url = "https://cjtfqzjfdimgpvpwhzlv.supabase.co/rest/v1/models"
219
+ headers = {
220
+ "apikey": "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZSIsInJlZiI6ImNqdGZxempmZGltZ3B2cHdoemx2Iiwicm9sZSI6ImFub24iLCJpYXQiOjE2OTUxNjczODgsImV4cCI6MjAxMDc0MzM4OH0.7z5WMIbjR99c2Ooc0ma7B_FyGq10G8X-alkCYTkKR10"
221
+ }
222
+
223
+ params = {"id": f"eq.{id_number}"}
224
+ response = requests.get(url, headers=headers, params=params)
225
+ if response.status_code == 200:
226
+ json_response = response.json()
227
+ print(json_response)
228
+ if json_response:
229
+ link = json_response[0]["link"]
230
+ verify = download_from_url(link)
231
+ if verify == "downloaded":
232
+ return "downloaded"
233
+ else:
234
+ return None
235
+ else:
236
+ return None
237
  else:
238
  try:
239
  os.chdir(zips_path)
 
279
  os.remove(zip_file_path)
280
 
281
 
282
+ def model_download_pipeline(url):
283
+ verify = download_from_url(url)
284
+ if verify == "downloaded":
285
+ extract_folder_path = ""
286
+ for filename in os.listdir(zips_path):
287
+ if filename.endswith(".zip"):
288
+ zipfile_path = os.path.join(zips_path, filename)
289
+ print("Proceeding with the extraction...")
290
+
291
+ model_zip = os.path.basename(zipfile_path)
292
+ model_name = format_title(model_zip.split(".zip")[0])
293
+ extract_folder_path = os.path.join(
294
+ "logs",
295
+ os.path.normpath(model_name),
296
+ )
297
+
298
+ success = extract_and_show_progress(zipfile_path, extract_folder_path)
299
+
300
+ subfolders = [
301
+ f
302
+ for f in os.listdir(extract_folder_path)
303
+ if os.path.isdir(os.path.join(extract_folder_path, f))
304
+ ]
305
+ if len(subfolders) == 1:
306
+ subfolder_path = os.path.join(extract_folder_path, subfolders[0])
307
+ for item in os.listdir(subfolder_path):
308
+ s = os.path.join(subfolder_path, item)
309
+ d = os.path.join(extract_folder_path, item)
310
+ shutil.move(s, d)
311
+ os.rmdir(subfolder_path)
312
+
313
+ for item in os.listdir(extract_folder_path):
314
+ if ".pth" in item:
315
+ file_name = item.split(".pth")[0]
316
  if file_name != model_name:
 
317
  os.rename(
318
  os.path.join(extract_folder_path, item),
319
+ os.path.join(extract_folder_path, model_name + ".pth"),
320
  )
321
  else:
322
+ if "v2" not in item:
323
+ file_name = item.split("_nprobe_1_")[1].split("_v1")[0]
324
+ if file_name != model_name:
325
+ new_file_name = (
326
+ item.split("_nprobe_1_")[0]
327
+ + "_nprobe_1_"
328
+ + model_name
329
+ + "_v1"
330
+ )
331
+ os.rename(
332
+ os.path.join(extract_folder_path, item),
333
+ os.path.join(
334
+ extract_folder_path, new_file_name + ".index"
335
+ ),
336
+ )
337
+ else:
338
+ file_name = item.split("_nprobe_1_")[1].split("_v2")[0]
339
+ if file_name != model_name:
340
+ new_file_name = (
341
+ item.split("_nprobe_1_")[0]
342
+ + "_nprobe_1_"
343
+ + model_name
344
+ + "_v2"
345
+ )
346
+ os.rename(
347
+ os.path.join(extract_folder_path, item),
348
+ os.path.join(
349
+ extract_folder_path, new_file_name + ".index"
350
+ ),
351
+ )
352
+
353
+ if success:
354
+ print(f"Model {model_name} downloaded!")
355
+ else:
356
+ print(f"Error downloading {model_name}")
357
+ sys.exit()
358
+ if extract_folder_path == "":
359
+ print("Zip file was not found.")
360
+ sys.exit()
361
+ result = search_pth_index(extract_folder_path)
362
+ else:
363
+ message = "Error"
364
  sys.exit()
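With the script logic wrapped in model_download_pipeline, the download can now be driven from Python rather than argv; a hedged usage sketch (the URL below is a placeholder).
# Hypothetical call; any of the supported hosts (Drive, Hugging Face, pixeldrain, applio.org, ...) should work.
from rvc.lib.tools.model_download import model_download_pipeline

model_download_pipeline("https://example.com/my_model.zip")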
 
 
 
 
rvc/lib/tools/prerequisites_download.py CHANGED
@@ -1,11 +1,10 @@
1
  import os
2
  import wget
3
- import sys
4
 
5
- url_base = "https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main"
6
- models_download = [
7
  (
8
- "pretrained/",
9
  [
10
  "D32k.pth",
11
  "D40k.pth",
@@ -21,6 +20,8 @@ models_download = [
21
  "f0G48k.pth",
22
  ],
23
  ),
 
 
24
  (
25
  "pretrained_v2/",
26
  [
@@ -40,45 +41,55 @@ models_download = [
40
  ),
41
  ]
42
 
43
- models_file = [
44
  "hubert_base.pt",
45
  "rmvpe.pt",
46
- # "rmvpe.onnx",
 
47
  ]
48
 
49
- executables_file = [
50
- "ffmpeg.exe",
51
- "ffprobe.exe",
52
- ]
53
 
54
- folder_mapping = {
55
- "pretrained/": "rvc/pretraineds/pretrained_v1/",
56
  "pretrained_v2/": "rvc/pretraineds/pretrained_v2/",
57
  }
58
 
59
- for file_name in models_file:
60
- destination_path = os.path.join(file_name)
61
- url = f"{url_base}/{file_name}"
62
- if not os.path.exists(destination_path):
63
- os.makedirs(os.path.dirname(destination_path) or ".", exist_ok=True)
64
- print(f"\nDownloading {url} to {destination_path}...")
65
- wget.download(url, out=destination_path)
66
 
67
- for file_name in executables_file:
68
- if sys.platform == "win32":
69
- destination_path = os.path.join(file_name)
70
- url = f"{url_base}/{file_name}"
71
- if not os.path.exists(destination_path):
72
- os.makedirs(os.path.dirname(destination_path) or ".", exist_ok=True)
73
- print(f"\nDownloading {url} to {destination_path}...")
74
- wget.download(url, out=destination_path)
75
 
76
- for remote_folder, file_list in models_download:
77
- local_folder = folder_mapping.get(remote_folder, "")
78
- for file in file_list:
79
- destination_path = os.path.join(local_folder, file)
80
- url = f"{url_base}/{remote_folder}{file}"
81
- if not os.path.exists(destination_path):
82
- os.makedirs(os.path.dirname(destination_path) or ".", exist_ok=True)
83
- print(f"\nDownloading {url} to {destination_path}...")
84
- wget.download(url, out=destination_path)
 
 
1
  import os
2
  import wget
 
3
 
4
+ url_base = "https://huggingface.co/IAHispano/Applio/resolve/main/Resources"
5
+ pretraineds_v1_list = [
6
  (
7
+ "pretrained_v1/",
8
  [
9
  "D32k.pth",
10
  "D40k.pth",
 
20
  "f0G48k.pth",
21
  ],
22
  ),
23
+ ]
24
+ pretraineds_v2_list = [
25
  (
26
  "pretrained_v2/",
27
  [
 
41
  ),
42
  ]
43
 
44
+ models_list = [
45
  "hubert_base.pt",
46
  "rmvpe.pt",
47
+ "fcpe.pt",
48
+ # "rmvpe.onnx"
49
  ]
50
 
51
+ executables_list = ["ffmpeg.exe", "ffprobe.exe"]
 
 
 
52
 
53
+ folder_mapping_list = {
54
+ "pretrained_v1/": "rvc/pretraineds/pretrained_v1/",
55
  "pretrained_v2/": "rvc/pretraineds/pretrained_v2/",
56
  }
57
 
 
 
 
 
 
 
 
58
 
59
+ def prequisites_download_pipeline(pretraineds_v1, pretraineds_v2, models, exe):
60
+ def download_files(file_list):
61
+ for file_name in file_list:
62
+ destination_path = os.path.join(file_name)
63
+ url = f"{url_base}/{file_name}"
64
+ if not os.path.exists(destination_path):
65
+ os.makedirs(os.path.dirname(destination_path) or ".", exist_ok=True)
66
+ print(f"\nDownloading {url} to {destination_path}...")
67
+ wget.download(url, out=destination_path)
68
+
69
+ if models == "True":
70
+ download_files(models_list)
71
+
72
+ if exe == "True" and os.name == "nt":
73
+ download_files(executables_list)
74
+
75
+ if pretraineds_v1 == "True":
76
+ for remote_folder, file_list in pretraineds_v1_list:
77
+ local_folder = folder_mapping_list.get(remote_folder, "")
78
+ for file in file_list:
79
+ destination_path = os.path.join(local_folder, file)
80
+ url = f"{url_base}/{remote_folder}{file}"
81
+ if not os.path.exists(destination_path):
82
+ os.makedirs(os.path.dirname(destination_path) or ".", exist_ok=True)
83
+ print(f"\nDownloading {url} to {destination_path}...")
84
+ wget.download(url, out=destination_path)
85
 
86
+ if pretraineds_v2 == "True":
87
+ for remote_folder, file_list in pretraineds_v2_list:
88
+ local_folder = folder_mapping_list.get(remote_folder, "")
89
+ for file in file_list:
90
+ destination_path = os.path.join(local_folder, file)
91
+ url = f"{url_base}/{remote_folder}{file}"
92
+ if not os.path.exists(destination_path):
93
+ os.makedirs(os.path.dirname(destination_path) or ".", exist_ok=True)
94
+ print(f"\nDownloading {url} to {destination_path}...")
95
+ wget.download(url, out=destination_path)
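The prerequisite downloads are likewise refactored into a function driven by string flags; a minimal sketch of a call that fetches only the base models, the executables, and the v2 pretraineds (flag values mirror the "True"/"False" strings the function compares against).
# Hypothetical invocation; note the flags are the literal strings "True"/"False", not booleans.
from rvc.lib.tools.prerequisites_download import prequisites_download_pipeline

prequisites_download_pipeline(pretraineds_v1="False", pretraineds_v2="True", models="True", exe="True")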
rvc/lib/tools/pretrained_selector.py CHANGED
@@ -60,4 +60,4 @@ def pretrained_selector(pitch_guidance):
60
  "rvc/pretraineds/pretrained_v2/D48k.pth",
61
  ),
62
  },
63
- }
 
60
  "rvc/pretraineds/pretrained_v2/D48k.pth",
61
  ),
62
  },
63
+ }
rvc/lib/tools/split_audio.py CHANGED
@@ -17,11 +17,13 @@ def process_audio(file_path):
17
  min_silence_len = 750 # ms, adjust as needed
18
 
19
  # detect nonsilent parts
20
- nonsilent_parts = detect_nonsilent(song, min_silence_len=min_silence_len, silence_thresh=silence_thresh)
 
 
21
 
22
  # Create a new directory to store chunks
23
  file_dir = os.path.dirname(file_path)
24
- file_name = os.path.basename(file_path).split('.')[0]
25
  file_name = format_title(file_name)
26
  new_dir_path = os.path.join(file_dir, file_name)
27
  os.makedirs(new_dir_path, exist_ok=True)
@@ -58,7 +60,7 @@ def process_audio(file_path):
58
  def merge_audio(timestamps_file):
59
  try:
60
  # Extract prefix from the timestamps filename
61
- prefix = os.path.basename(timestamps_file).replace('_timestamps.txt', '')
62
  timestamps_dir = os.path.dirname(timestamps_file)
63
 
64
  # Open the timestamps file
@@ -98,8 +100,8 @@ def merge_audio(timestamps_file):
98
  # Concatenate all audio_segments and export
99
  merged_audio = sum(audio_segments)
100
  merged_audio_np = np.array(merged_audio.get_array_of_samples())
101
- #print(f"Exported merged file: {merged_filename}\n")
102
  return merged_audio.frame_rate, merged_audio_np
103
 
104
  except Exception as e:
105
- print(f"An error occurred: {e}")
 
17
  min_silence_len = 750 # ms, adjust as needed
18
 
19
  # detect nonsilent parts
20
+ nonsilent_parts = detect_nonsilent(
21
+ song, min_silence_len=min_silence_len, silence_thresh=silence_thresh
22
+ )
23
 
24
  # Create a new directory to store chunks
25
  file_dir = os.path.dirname(file_path)
26
+ file_name = os.path.basename(file_path).split(".")[0]
27
  file_name = format_title(file_name)
28
  new_dir_path = os.path.join(file_dir, file_name)
29
  os.makedirs(new_dir_path, exist_ok=True)
 
60
  def merge_audio(timestamps_file):
61
  try:
62
  # Extract prefix from the timestamps filename
63
+ prefix = os.path.basename(timestamps_file).replace("_timestamps.txt", "")
64
  timestamps_dir = os.path.dirname(timestamps_file)
65
 
66
  # Open the timestamps file
 
100
  # Concatenate all audio_segments and export
101
  merged_audio = sum(audio_segments)
102
  merged_audio_np = np.array(merged_audio.get_array_of_samples())
103
+ # print(f"Exported merged file: {merged_filename}\n")
104
  return merged_audio.frame_rate, merged_audio_np
105
 
106
  except Exception as e:
107
+ print(f"An error occurred: {e}")
rvc/lib/utils.py CHANGED
@@ -19,8 +19,10 @@ def load_audio(file, sampling_rate):
19
 
20
 
21
  def format_title(title):
22
- formatted_title = unicodedata.normalize('NFKD', title).encode('ascii', 'ignore').decode('utf-8')
23
- formatted_title = re.sub(r'[\u2500-\u257F]+', '', formatted_title)
24
- formatted_title = re.sub(r'[^\w\s.-]', '', formatted_title)
25
- formatted_title = re.sub(r'\s+', '_', formatted_title)
26
- return formatted_title
 
 
 
19
 
20
 
21
  def format_title(title):
22
+ formatted_title = (
23
+ unicodedata.normalize("NFKD", title).encode("ascii", "ignore").decode("utf-8")
24
+ )
25
+ formatted_title = re.sub(r"[\u2500-\u257F]+", "", formatted_title)
26
+ formatted_title = re.sub(r"[^\w\s.-]", "", formatted_title)
27
+ formatted_title = re.sub(r"\s+", "_", formatted_title)
28
+ return formatted_title
rvc/train/extract/extract_feature_print.py CHANGED
@@ -7,6 +7,9 @@ import fairseq
7
  import soundfile as sf
8
  import numpy as np
9
 
 
 
 
10
 
11
  device = sys.argv[1]
12
  n_parts = int(sys.argv[2])
 
7
  import soundfile as sf
8
  import numpy as np
9
 
10
+ import logging
11
+
12
+ logging.getLogger("fairseq").setLevel(logging.WARNING)
13
 
14
  device = sys.argv[1]
15
  n_parts = int(sys.argv[2])
rvc/train/process/extract_index.py CHANGED
@@ -78,8 +78,11 @@ try:
78
  index_added.add(big_npy[i : i + batch_size_add])
79
 
80
  faiss.write_index(index_added, index_filepath_added)
 
81
 
82
  except Exception as error:
83
  print(f"Failed to train index: {error}")
84
-
85
- print(f"Saved index file '{index_filepath_added}'")
 
 
 
78
  index_added.add(big_npy[i : i + batch_size_add])
79
 
80
  faiss.write_index(index_added, index_filepath_added)
81
+ print(f"Saved index file '{index_filepath_added}'")
82
 
83
  except Exception as error:
84
  print(f"Failed to train index: {error}")
85
+ if "one array to concatenate" in str(error):
86
+ print(
87
+ "If you are running this code in a virtual environment, make sure you have enough GPU available to generate the Index file."
88
+ )
rvc/train/process/extract_model.py CHANGED
@@ -1,28 +1,27 @@
1
  import os
2
  import torch
 
 
3
  from collections import OrderedDict
4
 
5
 
6
  def replace_keys_in_dict(d, old_key_part, new_key_part):
7
- # Use OrderedDict if the original is an OrderedDict
8
  if isinstance(d, OrderedDict):
9
  updated_dict = OrderedDict()
10
  else:
11
  updated_dict = {}
12
  for key, value in d.items():
13
- # Replace the key part if found
14
  new_key = key.replace(old_key_part, new_key_part)
15
- # If the value is a dictionary, apply the function recursively
16
  if isinstance(value, dict):
17
  value = replace_keys_in_dict(value, old_key_part, new_key_part)
18
  updated_dict[new_key] = value
19
  return updated_dict
20
 
21
 
22
- def extract_model(ckpt, sr, if_f0, name, model_dir, epoch, version, hps):
23
  try:
24
- print(f"Saved model '{model_dir}' (epoch {epoch})")
25
- pth_file = f"{name}_{epoch}e.pth"
26
  pth_file_old_version_path = os.path.join(
27
  model_dir, f"{pth_file}_old_version.pth"
28
  )
@@ -51,7 +50,18 @@ def extract_model(ckpt, sr, if_f0, name, model_dir, epoch, version, hps):
51
  hps.model.gin_channels,
52
  hps.data.sampling_rate,
53
  ]
54
- opt["info"], opt["sr"], opt["f0"], opt["version"] = epoch, sr, if_f0, version
 
 
 
 
 
 
 
 
 
 
 
55
  torch.save(opt, model_dir)
56
 
57
  model = torch.load(model_dir, map_location=torch.device("cpu"))
 
1
  import os
2
  import torch
3
+ import hashlib
4
+ import datetime
5
  from collections import OrderedDict
6
 
7
 
8
  def replace_keys_in_dict(d, old_key_part, new_key_part):
 
9
  if isinstance(d, OrderedDict):
10
  updated_dict = OrderedDict()
11
  else:
12
  updated_dict = {}
13
  for key, value in d.items():
 
14
  new_key = key.replace(old_key_part, new_key_part)
 
15
  if isinstance(value, dict):
16
  value = replace_keys_in_dict(value, old_key_part, new_key_part)
17
  updated_dict[new_key] = value
18
  return updated_dict
19
 
20
 
21
+ def extract_model(ckpt, sr, if_f0, name, model_dir, epoch, step, version, hps):
22
  try:
23
+ print(f"Saved model '{model_dir}' (epoch {epoch} and step {step})")
24
+ pth_file = f"{name}_{epoch}e_{step}s.pth"
25
  pth_file_old_version_path = os.path.join(
26
  model_dir, f"{pth_file}_old_version.pth"
27
  )
 
50
  hps.model.gin_channels,
51
  hps.data.sampling_rate,
52
  ]
53
+
54
+ opt["epoch"] = epoch
55
+ opt["step"] = step
56
+ opt["sr"] = sr
57
+ opt["f0"] = if_f0
58
+ opt["version"] = version
59
+ opt["creation_date"] = datetime.datetime.now().isoformat()
60
+
61
+ hash_input = f"{str(ckpt)} {epoch} {step} {datetime.datetime.now().isoformat()}"
62
+ model_hash = hashlib.sha256(hash_input.encode()).hexdigest()
63
+ opt["model_hash"] = model_hash
64
+
65
  torch.save(opt, model_dir)
66
 
67
  model = torch.load(model_dir, map_location=torch.device("cpu"))
rvc/train/process/extract_small_model.py ADDED
@@ -0,0 +1,175 @@
1
+ import os
2
+ import torch
3
+ import hashlib
4
+ import datetime
5
+ from collections import OrderedDict
6
+
7
+
8
+ def replace_keys_in_dict(d, old_key_part, new_key_part):
9
+ # Use OrderedDict if the original is an OrderedDict
10
+ if isinstance(d, OrderedDict):
11
+ updated_dict = OrderedDict()
12
+ else:
13
+ updated_dict = {}
14
+ for key, value in d.items():
15
+ # Replace the key part if found
16
+ new_key = key.replace(old_key_part, new_key_part)
17
+ # If the value is a dictionary, apply the function recursively
18
+ if isinstance(value, dict):
19
+ value = replace_keys_in_dict(value, old_key_part, new_key_part)
20
+ updated_dict[new_key] = value
21
+ return updated_dict
22
+
23
+
24
+ def extract_small_model(path, name, sr, if_f0, version, epoch, step):
25
+ try:
26
+ ckpt = torch.load(path, map_location="cpu")
27
+ pth_file = f"{name}.pth"
28
+ pth_file_old_version_path = os.path.join("logs", f"{pth_file}_old_version.pth")
29
+ opt = OrderedDict(
30
+ weight={
31
+ key: value.half() for key, value in ckpt.items() if "enc_q" not in key
32
+ }
33
+ )
34
+ if "model" in ckpt:
35
+ ckpt = ckpt["model"]
36
+ opt = OrderedDict()
37
+ opt["weight"] = {}
38
+ for key in ckpt.keys():
39
+ if "enc_q" in key:
40
+ continue
41
+ opt["weight"][key] = ckpt[key].half()
42
+ if sr == "40k":
43
+ opt["config"] = [
44
+ 1025,
45
+ 32,
46
+ 192,
47
+ 192,
48
+ 768,
49
+ 2,
50
+ 6,
51
+ 3,
52
+ 0,
53
+ "1",
54
+ [3, 7, 11],
55
+ [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
56
+ [10, 10, 2, 2],
57
+ 512,
58
+ [16, 16, 4, 4],
59
+ 109,
60
+ 256,
61
+ 40000,
62
+ ]
63
+ elif sr == "48k":
64
+ if version == "v1":
65
+ opt["config"] = [
66
+ 1025,
67
+ 32,
68
+ 192,
69
+ 192,
70
+ 768,
71
+ 2,
72
+ 6,
73
+ 3,
74
+ 0,
75
+ "1",
76
+ [3, 7, 11],
77
+ [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
78
+ [10, 6, 2, 2, 2],
79
+ 512,
80
+ [16, 16, 4, 4, 4],
81
+ 109,
82
+ 256,
83
+ 48000,
84
+ ]
85
+ else:
86
+ opt["config"] = [
87
+ 1025,
88
+ 32,
89
+ 192,
90
+ 192,
91
+ 768,
92
+ 2,
93
+ 6,
94
+ 3,
95
+ 0,
96
+ "1",
97
+ [3, 7, 11],
98
+ [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
99
+ [12, 10, 2, 2],
100
+ 512,
101
+ [24, 20, 4, 4],
102
+ 109,
103
+ 256,
104
+ 48000,
105
+ ]
106
+ elif sr == "32k":
107
+ if version == "v1":
108
+ opt["config"] = [
109
+ 513,
110
+ 32,
111
+ 192,
112
+ 192,
113
+ 768,
114
+ 2,
115
+ 6,
116
+ 3,
117
+ 0,
118
+ "1",
119
+ [3, 7, 11],
120
+ [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
121
+ [10, 4, 2, 2, 2],
122
+ 512,
123
+ [16, 16, 4, 4, 4],
124
+ 109,
125
+ 256,
126
+ 32000,
127
+ ]
128
+ else:
129
+ opt["config"] = [
130
+ 513,
131
+ 32,
132
+ 192,
133
+ 192,
134
+ 768,
135
+ 2,
136
+ 6,
137
+ 3,
138
+ 0,
139
+ "1",
140
+ [3, 7, 11],
141
+ [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
142
+ [10, 8, 2, 2],
143
+ 512,
144
+ [20, 16, 4, 4],
145
+ 109,
146
+ 256,
147
+ 32000,
148
+ ]
149
+
150
+ opt["epoch"] = epoch
151
+ opt["step"] = step
152
+ opt["sr"] = sr
153
+ opt["f0"] = int(if_f0)
154
+ opt["version"] = version
155
+ opt["creation_date"] = datetime.datetime.now().isoformat()
156
+
157
+ hash_input = f"{str(ckpt)} {epoch} {step} {datetime.datetime.now().isoformat()}"
158
+ model_hash = hashlib.sha256(hash_input.encode()).hexdigest()
159
+ opt["model_hash"] = model_hash
160
+
161
+ model = torch.load(pth_file_old_version_path, map_location=torch.device("cpu"))
162
+ torch.save(
163
+ replace_keys_in_dict(
164
+ replace_keys_in_dict(
165
+ model, ".parametrizations.weight.original1", ".weight_v"
166
+ ),
167
+ ".parametrizations.weight.original0",
168
+ ".weight_g",
169
+ ),
170
+ pth_file_old_version_path,
171
+ )
172
+ os.remove(pth_file_old_version_path)
173
+ os.rename(pth_file_old_version_path, pth_file)
174
+ except Exception as error:
175
+ print(error)
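A hedged example of invoking the new small-model extractor; the checkpoint path and metadata values are placeholders.
# Hypothetical call; writes a stripped, half-precision .pth using the metadata passed in.
from rvc.train.process.extract_small_model import extract_small_model

extract_small_model(
    path="logs/my_model/G_2333333.pth",  # placeholder checkpoint path
    name="my_model", sr="40k", if_f0=1, version="v2", epoch=200, step=50000,
)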
rvc/train/process/model_blender.py ADDED
@@ -0,0 +1,63 @@
1
+ import os
2
+ import torch
3
+ from collections import OrderedDict
4
+
5
+
6
+ def extract(ckpt):
7
+ a = ckpt["model"]
8
+ opt = OrderedDict()
9
+ opt["weight"] = {}
10
+ for key in a.keys():
11
+ if "enc_q" in key:
12
+ continue
13
+ opt["weight"][key] = a[key]
14
+ return opt
15
+
16
+
17
+ def model_blender(name, path1, path2, ratio):
18
+ try:
19
+ message = f"Model {path1} and {path2} are merged with alpha {ratio}."
20
+ ckpt1 = torch.load(path1, map_location="cpu")
21
+ ckpt2 = torch.load(path2, map_location="cpu")
22
+ cfg = ckpt1["config"]
23
+ cfg_f0 = ckpt1["f0"]
24
+ cfg_version = ckpt1["version"]
25
+
26
+ if "model" in ckpt1:
27
+ ckpt1 = extract(ckpt1)
28
+ else:
29
+ ckpt1 = ckpt1["weight"]
30
+ if "model" in ckpt2:
31
+ ckpt2 = extract(ckpt2)
32
+ else:
33
+ ckpt2 = ckpt2["weight"]
34
+
35
+ if sorted(list(ckpt1.keys())) != sorted(list(ckpt2.keys())):
36
+ return "Fail to merge the models. The model architectures are not the same."
37
+
38
+ opt = OrderedDict()
39
+ opt["weight"] = {}
40
+ for key in ckpt1.keys():
41
+ if key == "emb_g.weight" and ckpt1[key].shape != ckpt2[key].shape:
42
+ min_shape0 = min(ckpt1[key].shape[0], ckpt2[key].shape[0])
43
+ opt["weight"][key] = (
44
+ ratio * (ckpt1[key][:min_shape0].float())
45
+ + (1 - ratio) * (ckpt2[key][:min_shape0].float())
46
+ ).half()
47
+ else:
48
+ opt["weight"][key] = (
49
+ ratio * (ckpt1[key].float()) + (1 - ratio) * (ckpt2[key].float())
50
+ ).half()
51
+
52
+ opt["config"] = cfg
53
+ opt["sr"] = message
54
+ opt["f0"] = cfg_f0
55
+ opt["version"] = cfg_version
56
+ opt["info"] = message
57
+
58
+ torch.save(opt, os.path.join("logs", "%s.pth" % name))
59
+ print(message)
60
+ return message, os.path.join("logs", "%s.pth" % name)
61
+ except Exception as error:
62
+ print(error)
63
+ return error
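A minimal sketch of blending two voice models with the new helper; the file names are placeholders and ratio is the weight given to the first model.
# Hypothetical call; on success it returns the status message and the path of the merged .pth in "logs/".
from rvc.train.process.model_blender import model_blender

message, merged_path = model_blender("blended_voice", "logs/voice_a.pth", "logs/voice_b.pth", ratio=0.5)
print(message)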
rvc/train/process/model_information.py ADDED
@@ -0,0 +1,33 @@
1
+ import torch
2
+ from datetime import datetime
3
+
4
+
5
+ def prettify_date(date_str):
6
+ date_time_obj = datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%S.%f")
7
+ return date_time_obj.strftime("%Y-%m-%d %H:%M:%S")
8
+
9
+
10
+ def model_information(path):
11
+ model_data = torch.load(path, map_location="cpu")
12
+
13
+ print(f"Loaded model from {path}")
14
+
15
+ epochs = model_data.get("epoch", "None")
16
+ steps = model_data.get("step", "None")
17
+ sr = model_data.get("sr", "None")
18
+ f0 = model_data.get("f0", "None")
19
+ version = model_data.get("version", "None")
20
+ creation_date = model_data.get("creation_date", "None")
21
+ model_hash = model_data.get("model_hash", "None")
22
+
23
+ pitch_guidance = "True" if f0 == 1 else "False"
24
+
25
+ return (
26
+ f"Epochs: {epochs}\n"
27
+ f"Steps: {steps}\n"
28
+ f"RVC Version: {version}\n"
29
+ f"Sampling Rate: {sr}\n"
30
+ f"Pitch Guidance: {pitch_guidance}\n"
31
+ f"Creation Date: {prettify_date(creation_date)}\n"
32
+ f"Hash (ID): {model_hash}"
33
+ )
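And a quick sketch of reading back the metadata that extract_model now stores; the path below is a placeholder.
# Hypothetical usage; prints epochs, steps, version, sampling rate, pitch guidance, creation date and hash.
from rvc.train.process.model_information import model_information

print(model_information("logs/my_model/my_model_200e_50000s.pth"))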
rvc/train/train.py CHANGED
@@ -70,15 +70,9 @@ torch.backends.cudnn.deterministic = False
70
  torch.backends.cudnn.benchmark = False
71
 
72
  global_step = 0
73
- bestEpochStep = 0
74
  last_loss_gen_all = 0
75
- lastValue = 1
76
- lowestValue = {"step": 0, "value": float("inf"), "epoch": 0}
77
- dirtyTb = []
78
- dirtyValues = []
79
- dirtySteps = []
80
- dirtyEpochs = []
81
- continued = False
82
 
83
 
84
  class EpochRecorder:
@@ -104,13 +98,16 @@ def main():
104
  print("GPU not detected, reverting to CPU (not recommended)")
105
  n_gpus = 1
106
  children = []
107
- for i in range(n_gpus):
108
- subproc = mp.Process(
109
- target=run,
110
- args=(i, n_gpus, hps),
111
- )
112
- children.append(subproc)
113
- subproc.start()
 
 
 
114
 
115
  for i in range(n_gpus):
116
  children[i].join()
@@ -287,9 +284,13 @@ def run(
287
 
288
 
289
  def train_and_evaluate(rank, epoch, hps, nets, optims, scaler, loaders, writers, cache):
290
- global global_step, last_loss_gen_all, lowestValue
 
291
  if epoch == 1:
292
- last_loss_gen_all = {}
 
 
 
293
  net_g, net_d = nets
294
  optim_g, optim_d = optims
295
  train_loader = loaders[0] if loaders is not None else None
@@ -467,10 +468,15 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, scaler, loaders, writers,
467
  loss_gen, losses_gen = generator_loss(y_d_hat_g)
468
  loss_gen_all = loss_gen + loss_fm + loss_mel + loss_kl
469
 
470
- if loss_gen_all < lowestValue["value"]:
471
- lowestValue["value"] = loss_gen_all
472
- lowestValue["step"] = global_step
473
- lowestValue["epoch"] = epoch
 
 
 
 
 
474
 
475
  optim_g.zero_grad()
476
  scaler.scale(loss_gen_all).backward()
@@ -558,25 +564,43 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, scaler, loaders, writers,
558
  ckpt = net_g.module.state_dict()
559
  else:
560
  ckpt = net_g.state_dict()
561
- extract_model(
562
- ckpt,
563
- hps.sample_rate,
564
- hps.if_f0,
565
- hps.name,
566
- os.path.join(hps.model_dir, "{}_{}e.pth".format(hps.name, epoch)),
567
- epoch,
568
- hps.version,
569
- hps,
570
- )
571
 
572
  if rank == 0:
573
  if epoch > 1:
574
- change = last_loss_gen_all - loss_gen_all
575
- change_str = ""
576
- if change != 0:
577
- change_str = f"({'decreased' if change > 0 else 'increased'} {abs(change)})" # decreased = good
578
  print(
579
- f"{hps.name} | epoch={epoch} | step={global_step} | {epoch_recorder.record()} | loss_gen_all={round(loss_gen_all.item(), 3)} {change_str}"
 
 
 
 
580
  )
581
  last_loss_gen_all = loss_gen_all
582
 
@@ -585,9 +609,12 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, scaler, loaders, writers,
585
  f"Training has been successfully completed with {epoch} epoch, {global_step} steps and {round(loss_gen_all.item(), 3)} loss gen."
586
  )
587
  print(
588
- f"Lowest generator loss: {lowestValue['value']} at epoch {lowestValue['epoch']}, step {lowestValue['step']}"
589
  )
590
 
 
 
 
591
  if hasattr(net_g, "module"):
592
  ckpt = net_g.module.state_dict()
593
  else:
@@ -598,8 +625,11 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, scaler, loaders, writers,
598
  hps.sample_rate,
599
  hps.if_f0,
600
  hps.name,
601
- os.path.join(hps.model_dir, "{}_{}e.pth".format(hps.name, epoch)),
 
 
602
  epoch,
 
603
  hps.version,
604
  hps,
605
  )
 
70
  torch.backends.cudnn.benchmark = False
71
 
72
  global_step = 0
73
+ lowest_value = {"step": 0, "value": float("inf"), "epoch": 0}
74
  last_loss_gen_all = 0
75
+ epochs_since_last_lowest = 0
 
 
 
 
 
 
76
 
77
 
78
  class EpochRecorder:
 
98
  print("GPU not detected, reverting to CPU (not recommended)")
99
  n_gpus = 1
100
  children = []
101
+ pid_file_path = os.path.join(now_dir, "rvc", "train", "train_pid.txt")
102
+ with open(pid_file_path, "w") as pid_file:
103
+ for i in range(n_gpus):
104
+ subproc = mp.Process(
105
+ target=run,
106
+ args=(i, n_gpus, hps),
107
+ )
108
+ children.append(subproc)
109
+ subproc.start()
110
+ pid_file.write(str(subproc.pid) + "\n")
111
 
112
  for i in range(n_gpus):
113
  children[i].join()
 
284
 
285
 
286
  def train_and_evaluate(rank, epoch, hps, nets, optims, scaler, loaders, writers, cache):
287
+ global global_step, last_loss_gen_all, lowest_value, epochs_since_last_lowest
288
+
289
  if epoch == 1:
290
+ lowest_value = {"step": 0, "value": float("inf"), "epoch": 0}
291
+ last_loss_gen_all = 0.0
292
+ epochs_since_last_lowest = 0
293
+
294
  net_g, net_d = nets
295
  optim_g, optim_d = optims
296
  train_loader = loaders[0] if loaders is not None else None
 
468
  loss_gen, losses_gen = generator_loss(y_d_hat_g)
469
  loss_gen_all = loss_gen + loss_fm + loss_mel + loss_kl
470
 
471
+ if loss_gen_all < lowest_value["value"]:
472
+ lowest_value["value"] = loss_gen_all
473
+ lowest_value["step"] = global_step
474
+ lowest_value["epoch"] = epoch
475
+ # print(f'Lowest generator loss updated: {lowest_value["value"]} at epoch {epoch}, step {global_step}')
476
+ if epoch > lowest_value["epoch"]:
477
+ print(
478
+ "Alert: The lower generating loss has been exceeded by a lower loss in a subsequent epoch."
479
+ )
480
 
481
  optim_g.zero_grad()
482
  scaler.scale(loss_gen_all).backward()
 
564
  ckpt = net_g.module.state_dict()
565
  else:
566
  ckpt = net_g.state_dict()
567
+ extract_model(
568
+ ckpt,
569
+ hps.sample_rate,
570
+ hps.if_f0,
571
+ hps.name,
572
+ os.path.join(
573
+ hps.model_dir, "{}_{}e_{}s.pth".format(hps.name, epoch, global_step)
574
+ ),
575
+ epoch,
576
+ global_step,
577
+ hps.version,
578
+ hps,
579
+ )
580
+
581
+ if hps.overtraining_detector == 1:
582
+ if lowest_value["value"] < last_loss_gen_all:
583
+ epochs_since_last_lowest += 1
584
+ else:
585
+ epochs_since_last_lowest = 0
586
+
587
+ if epochs_since_last_lowest >= hps.overtraining_threshold:
588
+ print(
589
+ "Stopping training due to possible overtraining. Lowest generator loss: {} at epoch {}, step {}".format(
590
+ lowest_value["value"], lowest_value["epoch"], lowest_value["step"]
591
+ )
592
+ )
593
+ os._exit(2333333)
594
 
595
  if rank == 0:
596
  if epoch > 1:
597
+ print(hps.overtraining_threshold)
 
 
 
598
  print(
599
+ f"{hps.name} | epoch={epoch} | step={global_step} | {epoch_recorder.record()} | lowest_value={lowest_value['value']} (epoch {lowest_value['epoch']} and step {lowest_value['step']})"
600
+ )
601
+ else:
602
+ print(
603
+ f"{hps.name} | epoch={epoch} | step={global_step} | {epoch_recorder.record()}"
604
  )
605
  last_loss_gen_all = loss_gen_all
606
 
 
609
  f"Training has been successfully completed with {epoch} epoch, {global_step} steps and {round(loss_gen_all.item(), 3)} loss gen."
610
  )
611
  print(
612
+ f"Lowest generator loss: {lowest_value['value']} at epoch {lowest_value['epoch']}, step {lowest_value['step']}"
613
  )
614
 
615
+ pid_file_path = os.path.join(now_dir, "rvc", "train", "train_pid.txt")
616
+ os.remove(pid_file_path)
617
+
618
  if hasattr(net_g, "module"):
619
  ckpt = net_g.module.state_dict()
620
  else:
 
625
  hps.sample_rate,
626
  hps.if_f0,
627
  hps.name,
628
+ os.path.join(
629
+ hps.model_dir, "{}_{}e_{}s.pth".format(hps.name, epoch, global_step)
630
+ ),
631
  epoch,
632
+ global_step,
633
  hps.version,
634
  hps,
635
  )
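
The new overtraining guard in rvc/train/train.py comes down to a small piece of bookkeeping: remember the lowest generator loss seen so far, count how many epochs go by without beating it, and stop once that count reaches the configured threshold. A stripped-down sketch of that idea follows; it is illustrative only (the names and the exact comparison differ slightly from the loop above, which updates the counter at checkpoint time):

import math

lowest_value = {"step": 0, "value": math.inf, "epoch": 0}
epochs_since_last_lowest = 0
OVERTRAINING_THRESHOLD = 50  # mirrors the --overtraining_threshold default


def overtraining_check(loss_gen_all, epoch, global_step):
    """Track the best generator loss and report when it has stagnated too long."""
    global epochs_since_last_lowest
    if loss_gen_all < lowest_value["value"]:
        lowest_value.update(value=loss_gen_all, epoch=epoch, step=global_step)
        epochs_since_last_lowest = 0
    else:
        epochs_since_last_lowest += 1
    return epochs_since_last_lowest >= OVERTRAINING_THRESHOLD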
rvc/train/utils.py CHANGED
@@ -7,49 +7,6 @@ import numpy as np
7
  from scipy.io.wavfile import read
8
 
9
 
10
- def load_checkpoint_d(checkpoint_path, combd, sbd, optimizer=None, load_opt=1):
11
- assert os.path.isfile(checkpoint_path)
12
- checkpoint_dict = torch.load(checkpoint_path, map_location="cpu")
13
-
14
- def go(model, bkey):
15
- saved_state_dict = checkpoint_dict[bkey]
16
- if hasattr(model, "module"):
17
- state_dict = model.module.state_dict()
18
- else:
19
- state_dict = model.state_dict()
20
- new_state_dict = {}
21
- for k, v in state_dict.items():
22
- try:
23
- new_state_dict[k] = saved_state_dict[k]
24
- if saved_state_dict[k].shape != state_dict[k].shape:
25
- print(
26
- "shape-%s-mismatch. need: %s, get: %s",
27
- k,
28
- state_dict[k].shape,
29
- saved_state_dict[k].shape,
30
- )
31
- raise KeyError
32
- except:
33
- print("%s is not in the checkpoint", k)
34
- new_state_dict[k] = v
35
- if hasattr(model, "module"):
36
- model.module.load_state_dict(new_state_dict, strict=False)
37
- else:
38
- model.load_state_dict(new_state_dict, strict=False)
39
- return model
40
-
41
- go(combd, "combd")
42
- model = go(sbd, "sbd")
43
-
44
- iteration = checkpoint_dict["iteration"]
45
- learning_rate = checkpoint_dict["learning_rate"]
46
- if optimizer is not None and load_opt == 1:
47
- optimizer.load_state_dict(checkpoint_dict["optimizer"])
48
-
49
- print("Loaded checkpoint '{}' (epoch {})".format(checkpoint_path, iteration))
50
- return model, optimizer, learning_rate, iteration
51
-
52
-
53
  def load_checkpoint(checkpoint_path, model, optimizer=None, load_opt=1):
54
  assert os.path.isfile(checkpoint_path)
55
  checkpoint_dict = torch.load(checkpoint_path, map_location="cpu")
@@ -218,6 +175,22 @@ def get_hparams():
218
  required=True,
219
  help="if caching the dataset in GPU memory, 1 or 0",
220
  )
 
 
221
  args = parser.parse_args()
222
  name = args.experiment_dir
223
  experiment_dir = os.path.join("./logs", args.experiment_dir)
@@ -240,6 +213,8 @@ def get_hparams():
240
  hparams.save_every_weights = args.save_every_weights
241
  hparams.if_cache_data_in_gpu = args.if_cache_data_in_gpu
242
  hparams.data.training_files = f"{experiment_dir}/filelist.txt"
 
 
243
  return hparams
244
 
245
 
 
7
  from scipy.io.wavfile import read
8
 
9
 
 
10
  def load_checkpoint(checkpoint_path, model, optimizer=None, load_opt=1):
11
  assert os.path.isfile(checkpoint_path)
12
  checkpoint_dict = torch.load(checkpoint_path, map_location="cpu")
 
175
  required=True,
176
  help="if caching the dataset in GPU memory, 1 or 0",
177
  )
178
+
179
+ parser.add_argument(
180
+ "-od",
181
+ "--overtraining_detector",
182
+ type=int,
183
+ required=True,
184
+ help="Detect overtraining or not, 1 or 0",
185
+ )
186
+ parser.add_argument(
187
+ "-ot",
188
+ "--overtraining_threshold",
189
+ type=int,
190
+ default=50,
191
+ help="overtraining_threshold",
192
+ )
193
+
194
  args = parser.parse_args()
195
  name = args.experiment_dir
196
  experiment_dir = os.path.join("./logs", args.experiment_dir)
 
213
  hparams.save_every_weights = args.save_every_weights
214
  hparams.if_cache_data_in_gpu = args.if_cache_data_in_gpu
215
  hparams.data.training_files = f"{experiment_dir}/filelist.txt"
216
+ hparams.overtraining_detector = args.overtraining_detector
217
+ hparams.overtraining_threshold = args.overtraining_threshold
218
  return hparams
219
 
220
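
The two flags added above are plain argparse integers, so get_hparams() keeps its existing pattern. A standalone sketch of just that parsing (the example argv values are made up):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("-od", "--overtraining_detector", type=int, required=True,
                    help="Detect overtraining or not, 1 or 0")
parser.add_argument("-ot", "--overtraining_threshold", type=int, default=50,
                    help="overtraining_threshold")

# Example invocation: enable the detector and stop after 30 stagnant epochs.
args = parser.parse_args(["-od", "1", "-ot", "30"])
print(args.overtraining_detector, args.overtraining_threshold)  # -> 1 30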
 
tabs/download/download.py CHANGED
@@ -1,6 +1,8 @@
1
  import os, sys, shutil
2
  import tempfile
3
  import gradio as gr
 
 
4
  from core import run_download_script
5
 
6
  from assets.i18n.i18n import I18nAuto
@@ -41,12 +43,30 @@ def save_drop_model(dropbox):
41
  os.makedirs(model_path)
42
  if os.path.exists(os.path.join(model_path, file_name)):
43
  os.remove(os.path.join(model_path, file_name))
44
- os.rename(dropbox, os.path.join(model_path, file_name))
45
  print(f"{file_name} saved in {model_path}")
46
  gr.Info(f"{file_name} saved in {model_path}")
47
  return None
48
 
49
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  def download_tab():
51
  with gr.Column():
52
  gr.Markdown(value=i18n("## Download Model"))
@@ -57,6 +77,7 @@ def download_tab():
57
  )
58
  model_download_output_info = gr.Textbox(
59
  label=i18n("Output Information"),
 
60
  value="",
61
  max_lines=8,
62
  interactive=False,
@@ -82,3 +103,18 @@ def download_tab():
82
  inputs=[dropbox],
83
  outputs=[dropbox],
84
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os, sys, shutil
2
  import tempfile
3
  import gradio as gr
4
+ import pandas as pd
5
+ import requests
6
  from core import run_download_script
7
 
8
  from assets.i18n.i18n import I18nAuto
 
43
  os.makedirs(model_path)
44
  if os.path.exists(os.path.join(model_path, file_name)):
45
  os.remove(os.path.join(model_path, file_name))
46
+ shutil.move(dropbox, os.path.join(model_path, file_name))
47
  print(f"{file_name} saved in {model_path}")
48
  gr.Info(f"{file_name} saved in {model_path}")
49
  return None
50
 
51
 
52
+ def search_models(name):
53
+ url = f"https://cjtfqzjfdimgpvpwhzlv.supabase.co/rest/v1/models?name=ilike.%25{name}%25&order=created_at.desc&limit=15"
54
+ headers = {
55
+ "apikey": "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZSIsInJlZiI6ImNqdGZxempmZGltZ3B2cHdoemx2Iiwicm9sZSI6ImFub24iLCJpYXQiOjE2OTUxNjczODgsImV4cCI6MjAxMDc0MzM4OH0.7z5WMIbjR99c2Ooc0ma7B_FyGq10G8X-alkCYTkKR10"
56
+ }
57
+ response = requests.get(url, headers=headers)
58
+ data = response.json()
59
+ if len(data) == 0:
60
+ gr.Info(i18n("We couldn't find models by that name."))
61
+ return None
62
+ else:
63
+ df = pd.DataFrame(data)[["name", "link", "epochs", "type"]]
64
+ df["link"] = df["link"].apply(
65
+ lambda x: f'<a href="{x}" target="_blank">{x}</a>'
66
+ )
67
+ return df
68
+
69
+
70
  def download_tab():
71
  with gr.Column():
72
  gr.Markdown(value=i18n("## Download Model"))
 
77
  )
78
  model_download_output_info = gr.Textbox(
79
  label=i18n("Output Information"),
80
+ info=i18n("The output information will be displayed here."),
81
  value="",
82
  max_lines=8,
83
  interactive=False,
 
103
  inputs=[dropbox],
104
  outputs=[dropbox],
105
  )
106
+ gr.Markdown(value=i18n("## Search Model"))
107
+ search_name = gr.Textbox(
108
+ label=i18n("Model Name"),
109
+ placeholder=i18n("Introduce the model name to search."),
110
+ interactive=True,
111
+ )
112
+ search_table = gr.Dataframe(datatype="markdown")
113
+ search = gr.Button(i18n("Search"))
114
+ search.click(
115
+ search_models,
116
+ [search_name],
117
+ search_table,
118
+ )
119
+
120
+ search_name.submit(search_models, [search_name], search_table)
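
The new search box is backed by a single filtered REST call against the public Supabase table and a small pandas table for display. A self-contained sketch of that helper (the apikey parameter stands in for the anon key hard-coded in the diff; column names follow the code above):

import pandas as pd
import requests


def search_models(name, apikey):
    url = (
        "https://cjtfqzjfdimgpvpwhzlv.supabase.co/rest/v1/models"
        f"?name=ilike.%25{name}%25&order=created_at.desc&limit=15"
    )
    data = requests.get(url, headers={"apikey": apikey}).json()
    if not data:
        return None  # the tab shows a gr.Info notice instead of a table
    df = pd.DataFrame(data)[["name", "link", "epochs", "type"]]
    # Wrapping links in anchors lets gr.Dataframe(datatype="markdown") render them as clickable.
    df["link"] = df["link"].apply(lambda x: f'<a href="{x}" target="_blank">{x}</a>')
    return df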
tabs/extra/analyzer/analyzer.py CHANGED
@@ -1,85 +1,32 @@
 
1
  import gradio as gr
2
- import matplotlib.pyplot as plt
3
- import soundfile as sf
4
- import numpy as np
5
- import os
6
 
 
 
 
 
7
  from assets.i18n.i18n import I18nAuto
8
 
9
  i18n = I18nAuto()
10
 
11
 
12
- def generate_spectrogram(audio_data, sample_rate, file_name):
13
- plt.clf()
14
-
15
- plt.specgram(
16
- audio_data,
17
- Fs=sample_rate / 1,
18
- NFFT=4096,
19
- sides="onesided",
20
- cmap="Reds_r",
21
- scale_by_freq=True,
22
- scale="dB",
23
- mode="magnitude",
24
- window=np.hanning(4096),
25
- )
26
-
27
- plt.title(file_name)
28
- plt.savefig("spectrogram.png")
29
-
30
-
31
- def get_audio_info(audio_file):
32
- audio_data, sample_rate = sf.read(audio_file)
33
-
34
- if len(audio_data.shape) > 1:
35
- audio_data = np.mean(audio_data, axis=1)
36
-
37
- generate_spectrogram(audio_data, sample_rate, os.path.basename(audio_file))
38
-
39
- audio_info = sf.info(audio_file)
40
- bit_depth = {"PCM_16": 16, "FLOAT": 32}.get(audio_info.subtype, 0)
41
-
42
- minutes, seconds = divmod(audio_info.duration, 60)
43
- seconds, milliseconds = divmod(seconds, 1)
44
- milliseconds *= 1000
45
-
46
- speed_in_kbps = audio_info.samplerate * bit_depth / 1000
47
-
48
- info_table = f"""
49
- - **File Name:** {os.path.basename(audio_file)}
50
- - **Duration:** {int(minutes)} minutes, {int(seconds)} seconds, {int(milliseconds)} milliseconds
51
- - **Bitrate:** {speed_in_kbps} kbp/s
52
- - **Audio Channels:** {audio_info.channels}
53
- - **Sampling rate:** {audio_info.samplerate} Hz
54
- - **Bit per second:** {audio_info.samplerate * audio_info.channels * bit_depth} bit/s
55
- """
56
-
57
- return info_table, "spectrogram.png"
58
-
59
-
60
  def analyzer():
61
  with gr.Column():
62
- gr.Markdown(
63
- "Tool inspired in the original [Ilaria-Audio-Analyzer](https://github.com/TheStingerX/Ilaria-Audio-Analyzer) code."
64
- )
65
  audio_input = gr.Audio(type="filepath")
 
 
 
 
 
 
 
66
  get_info_button = gr.Button(
67
  value=i18n("Get information about the audio"), variant="primary"
68
  )
69
- with gr.Column():
70
- with gr.Row():
71
- with gr.Column():
72
- gr.Markdown(
73
- value=i18n("Information about the audio file"),
74
- visible=True,
75
- )
76
- output_markdown = gr.Markdown(
77
- value=i18n("Waiting for information..."), visible=True
78
- )
79
- image_output = gr.Image(type="filepath", interactive=False)
80
 
81
  get_info_button.click(
82
- fn=get_audio_info,
83
  inputs=[audio_input],
84
- outputs=[output_markdown, image_output],
85
  )
 
1
+ import os, sys
2
  import gradio as gr
 
 
 
 
3
 
4
+ now_dir = os.getcwd()
5
+ sys.path.append(now_dir)
6
+
7
+ from core import run_audio_analyzer_script
8
  from assets.i18n.i18n import I18nAuto
9
 
10
  i18n = I18nAuto()
11
 
12
 
 
 
13
  def analyzer():
14
  with gr.Column():
 
 
 
15
  audio_input = gr.Audio(type="filepath")
16
+ output_info = gr.Textbox(
17
+ label=i18n("Output Information"),
18
+ info=i18n("The output information will be displayed here."),
19
+ value="",
20
+ max_lines=8,
21
+ interactive=False,
22
+ )
23
  get_info_button = gr.Button(
24
  value=i18n("Get information about the audio"), variant="primary"
25
  )
26
+ image_output = gr.Image(type="filepath", interactive=False)
 
 
 
 
 
 
 
 
 
 
27
 
28
  get_info_button.click(
29
+ fn=run_audio_analyzer_script,
30
  inputs=[audio_input],
31
+ outputs=[output_info, image_output],
32
  )
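
The rewritten analyzer tab is now a thin wrapper: one button click feeds the audio path to core.run_audio_analyzer_script and routes its two return values into a Textbox and an Image. A minimal standalone wiring in the same shape, with a stand-in callable instead of the real core function:

import gradio as gr


def fake_analyzer(audio_path):
    # Stand-in for run_audio_analyzer_script, assumed to return (info_text, spectrogram_path).
    return f"Received: {audio_path}", None


with gr.Blocks() as demo:
    audio_input = gr.Audio(type="filepath")
    output_info = gr.Textbox(label="Output Information", interactive=False)
    get_info_button = gr.Button("Get information about the audio")
    image_output = gr.Image(type="filepath", interactive=False)
    get_info_button.click(
        fn=fake_analyzer,
        inputs=[audio_input],
        outputs=[output_info, image_output],
    )

# demo.launch()  # uncomment to try it locally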
tabs/extra/extra.py CHANGED
@@ -15,8 +15,8 @@ def extra_tab():
         )
     )
 
-    # with gr.TabItem(i18n("Processing")):
-    #     processing.processing()
+    with gr.TabItem(i18n("Processing")):
+        processing.processing()
 
     with gr.TabItem(i18n("Audio Analyzer")):
         analyzer.analyzer()
tabs/extra/model_information.py CHANGED
@@ -9,12 +9,14 @@ i18n = I18nAuto()
 def model_information_tab():
     with gr.Column():
         model_name = gr.Textbox(
-            label=i18n("Model Path"),
-            placeholder=i18n("Introduce the model .pth path"),
+            label=i18n("Path to Model"),
+            info=i18n("Introduce the model pth path"),
+            placeholder=i18n("Introduce the model pth path"),
             interactive=True,
         )
         model_information_output_info = gr.Textbox(
             label=i18n("Output Information"),
+            info=i18n("The output information will be displayed here."),
             value="",
             max_lines=8,
             interactive=False,
tabs/extra/processing/processing.py CHANGED
@@ -1,18 +1,9 @@
1
- import sys
2
-
3
- sys.path.append("..")
4
- import os
5
 
6
  now_dir = os.getcwd()
7
- from rvc.train.process_ckpt import (
8
- extract_small_model,
9
- )
10
-
11
- from rvc.lib.process.model_fusion import model_fusion
12
- from rvc.lib.process.model_information import (
13
- model_information,
14
- )
15
 
 
16
  from assets.i18n.i18n import I18nAuto
17
 
18
  i18n = I18nAuto()
@@ -21,122 +12,27 @@ import gradio as gr
21
 
22
 
23
  def processing():
24
- with gr.Accordion(label=i18n("Model fusion (On progress)"), open=False):
25
- with gr.Column():
26
- model_fusion_name = gr.Textbox(
27
- label=i18n("Model Name"),
28
- value="",
29
- max_lines=1,
30
- interactive=True,
31
- placeholder=i18n("Enter model name"),
32
- )
33
- model_fusion_a = gr.Textbox(
34
- label=i18n("Path to Model A"),
35
- value="",
36
- interactive=True,
37
- placeholder=i18n("Path to model"),
38
- )
39
- model_fusion_b = gr.Textbox(
40
- label=i18n("Path to Model B"),
41
- value="",
42
- interactive=True,
43
- placeholder=i18n("Path to model"),
44
- )
45
- model_fusion_output_info = gr.Textbox(
46
- label=i18n("Output Information"),
47
- value="",
48
- )
49
-
50
- model_fusion_button = gr.Button(
51
- i18n("Fusion"), variant="primary", interactive=False
52
- )
53
-
54
- model_fusion_button.click(
55
- model_fusion,
56
- [
57
- model_fusion_name,
58
- model_fusion_a,
59
- model_fusion_b,
60
- ],
61
- model_fusion_output_info,
62
- api_name="model_fusion",
63
- )
64
-
65
  with gr.Accordion(label=i18n("View model information")):
66
  with gr.Row():
67
  with gr.Column():
68
  model_view_model_path = gr.Textbox(
69
  label=i18n("Path to Model"),
 
70
  value="",
71
  interactive=True,
72
- placeholder=i18n("Path to model"),
73
  )
74
 
75
  model_view_output_info = gr.Textbox(
76
- label=i18n("Output Information"), value="", max_lines=8
 
 
 
77
  )
78
  model_view_button = gr.Button(i18n("View"), variant="primary")
79
  model_view_button.click(
80
- model_information,
81
  [model_view_model_path],
82
  model_view_output_info,
83
  api_name="model_info",
84
  )
85
-
86
- with gr.Accordion(label=i18n("Model extraction")):
87
- with gr.Row():
88
- with gr.Column():
89
- model_extract_name = gr.Textbox(
90
- label=i18n("Model Name"),
91
- value="",
92
- interactive=True,
93
- placeholder=i18n("Enter model name"),
94
- )
95
- model_extract_path = gr.Textbox(
96
- label=i18n("Path to Model"),
97
- placeholder=i18n("Path to model"),
98
- interactive=True,
99
- )
100
- model_extract_info = gr.Textbox(
101
- label=i18n("Model information to be placed"),
102
- value="",
103
- max_lines=8,
104
- interactive=True,
105
- placeholder=i18n("Model information to be placed"),
106
- )
107
- with gr.Column():
108
- model_extract_pitch_guidance = gr.Checkbox(
109
- label=i18n("Pitch Guidance"),
110
- value=True,
111
- interactive=True,
112
- )
113
- model_extract_rvc_version = gr.Radio(
114
- label=i18n("RVC Version"),
115
- choices=["v1", "v2"],
116
- value="v2",
117
- interactive=True,
118
- )
119
- model_extract_sampling_rate = gr.Radio(
120
- label=i18n("Sampling Rate"),
121
- choices=["32000", "40000", "48000"],
122
- value="40000",
123
- interactive=True,
124
- )
125
- model_extract_output_info = gr.Textbox(
126
- label=i18n("Output Information"), value="", max_lines=8
127
- )
128
-
129
- model_extract_button = gr.Button(i18n("Extract"), variant="primary")
130
- model_extract_button.click(
131
- extract_small_model,
132
- [
133
- model_extract_path,
134
- model_extract_name,
135
- model_extract_sampling_rate,
136
- model_extract_pitch_guidance,
137
- model_extract_info,
138
- model_extract_rvc_version,
139
- ],
140
- model_extract_output_info,
141
- api_name="model_extract",
142
- )
 
1
+ import os, sys
 
 
 
2
 
3
  now_dir = os.getcwd()
4
+ sys.path.append(now_dir)
 
 
 
 
 
 
 
5
 
6
+ from core import run_model_information_script
7
  from assets.i18n.i18n import I18nAuto
8
 
9
  i18n = I18nAuto()
 
12
 
13
 
14
  def processing():
 
 
 
15
  with gr.Accordion(label=i18n("View model information")):
16
  with gr.Row():
17
  with gr.Column():
18
  model_view_model_path = gr.Textbox(
19
  label=i18n("Path to Model"),
20
+ info=i18n("Introduce the model pth path"),
21
  value="",
22
  interactive=True,
23
+ placeholder=i18n("Enter path to model"),
24
  )
25
 
26
  model_view_output_info = gr.Textbox(
27
+ label=i18n("Output Information"),
28
+ info=i18n("The output information will be displayed here."),
29
+ value="",
30
+ max_lines=8,
31
  )
32
  model_view_button = gr.Button(i18n("View"), variant="primary")
33
  model_view_button.click(
34
+ run_model_information_script,
35
  [model_view_model_path],
36
  model_view_output_info,
37
  api_name="model_info",
38
  )
 
 
 
 
tabs/inference/inference.py CHANGED
@@ -122,55 +122,6 @@ def get_indexes():
122
  return indexes_list if indexes_list else ""
123
 
124
 
125
- def match_index(model_file: str) -> tuple:
126
- model_files_trip = re.sub(r"\.pth|\.onnx$", "", model_file)
127
- model_file_name = os.path.split(model_files_trip)[
128
- -1
129
- ] # Extract only the name, not the directory
130
-
131
- # Check if the sid0strip has the specific ending format _eXXX_sXXX
132
- if re.match(r".+_e\d+_s\d+$", model_file_name):
133
- base_model_name = model_file_name.rsplit("_", 2)[0]
134
- else:
135
- base_model_name = model_file_name
136
-
137
- sid_directory = os.path.join(model_root_relative, base_model_name)
138
- directories_to_search = [sid_directory] if os.path.exists(sid_directory) else []
139
- directories_to_search.append(model_root_relative)
140
- matching_index_files = []
141
-
142
- for directory in directories_to_search:
143
- for filename in os.listdir(directory):
144
- if filename.endswith(".index") and "trained" not in filename:
145
- # Condition to match the name
146
- name_match = any(
147
- name.lower() in filename.lower()
148
- for name in [model_file_name, base_model_name]
149
- )
150
-
151
- # If in the specific directory, it's automatically a match
152
- folder_match = directory == sid_directory
153
-
154
- if name_match or folder_match:
155
- index_path = os.path.join(directory, filename)
156
- updated_indexes_list = get_indexes()
157
- if index_path in updated_indexes_list:
158
- matching_index_files.append(
159
- (
160
- index_path,
161
- os.path.getsize(index_path),
162
- " " not in filename,
163
- )
164
- )
165
- if matching_index_files:
166
- # Sort by favoring files without spaces and by size (largest size first)
167
- matching_index_files.sort(key=lambda x: (-x[2], -x[1]))
168
- best_match_index_path = matching_index_files[0][0]
169
- return best_match_index_path
170
-
171
- return ""
172
-
173
-
174
  def save_to_wav(record_button):
175
  if record_button is None:
176
  pass
@@ -196,11 +147,21 @@ def save_to_wav2(upload_audio):
196
 
197
 
198
  def delete_outputs():
 
199
  for root, _, files in os.walk(audio_root_relative, topdown=False):
200
  for name in files:
201
  if name.endswith(tuple(sup_audioext)) and name.__contains__("_output"):
202
  os.remove(os.path.join(root, name))
203
- gr.Info(f"Outputs cleared!")
 
 
 
 
 
 
 
 
 
204
 
205
 
206
  # Inference tab
@@ -210,6 +171,7 @@ def inference_tab():
210
  with gr.Row():
211
  model_file = gr.Dropdown(
212
  label=i18n("Voice Model"),
 
213
  choices=sorted(names, key=lambda path: os.path.getsize(path)),
214
  interactive=True,
215
  value=default_weight,
@@ -218,6 +180,7 @@ def inference_tab():
218
 
219
  index_file = gr.Dropdown(
220
  label=i18n("Index File"),
 
221
  choices=get_indexes(),
222
  value=match_index(default_weight) if default_weight else "",
223
  interactive=True,
@@ -228,13 +191,16 @@ def inference_tab():
228
  unload_button = gr.Button(i18n("Unload Voice"))
229
 
230
  unload_button.click(
231
- fn=lambda: ({"value": "", "__type__": "update"}),
 
 
 
232
  inputs=[],
233
- outputs=[model_file],
234
  )
235
 
236
  model_file.select(
237
- fn=match_index,
238
  inputs=[model_file],
239
  outputs=[index_file],
240
  )
@@ -248,6 +214,7 @@ def inference_tab():
248
  with gr.Row():
249
  audio = gr.Dropdown(
250
  label=i18n("Select Audio"),
 
251
  choices=sorted(audio_paths),
252
  value=audio_paths[0] if audio_paths else "",
253
  interactive=True,
@@ -256,12 +223,15 @@ def inference_tab():
256
 
257
  with gr.Accordion(i18n("Advanced Settings"), open=False):
258
  with gr.Column():
259
- clear_outputs = gr.Button(
260
  i18n("Clear Outputs (Deletes all audios in assets/audios)")
261
  )
262
  output_path = gr.Textbox(
263
  label=i18n("Output Path"),
264
  placeholder=i18n("Enter output path"),
 
 
 
265
  value=(
266
  output_path_fn(audio_paths[0])
267
  if audio_paths
@@ -269,25 +239,68 @@ def inference_tab():
269
  ),
270
  interactive=True,
271
  )
 
 
 
 
 
 
 
272
  split_audio = gr.Checkbox(
273
  label=i18n("Split Audio"),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
274
  visible=True,
275
  value=False,
276
  interactive=True,
277
  )
 
 
 
 
 
 
 
 
 
 
 
278
  pitch = gr.Slider(
279
  minimum=-24,
280
  maximum=24,
281
  step=1,
282
  label=i18n("Pitch"),
 
 
 
283
  value=0,
284
  interactive=True,
285
  )
286
  filter_radius = gr.Slider(
287
  minimum=0,
288
  maximum=7,
289
- label=i18n(
290
- "If >=3: apply median filtering to the harvested pitch results. The value represents the filter radius and can reduce breathiness"
 
291
  ),
292
  value=3,
293
  step=1,
@@ -297,20 +310,50 @@ def inference_tab():
297
  minimum=0,
298
  maximum=1,
299
  label=i18n("Search Feature Ratio"),
 
 
 
300
  value=0.75,
301
  interactive=True,
302
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
303
  hop_length = gr.Slider(
304
  minimum=1,
305
  maximum=512,
306
  step=1,
307
  label=i18n("Hop Length"),
 
 
 
 
308
  value=128,
309
  interactive=True,
310
  )
311
  with gr.Column():
312
  f0method = gr.Radio(
313
  label=i18n("Pitch extraction algorithm"),
 
 
 
314
  choices=[
315
  "pm",
316
  "harvest",
@@ -318,6 +361,8 @@ def inference_tab():
318
  "crepe",
319
  "crepe-tiny",
320
  "rmvpe",
 
 
321
  ],
322
  value="rmvpe",
323
  interactive=True,
@@ -326,7 +371,10 @@ def inference_tab():
326
  convert_button1 = gr.Button(i18n("Convert"))
327
 
328
  with gr.Row(): # Defines output info + output audio download after conversion
329
- vc_output1 = gr.Textbox(label=i18n("Output Information"))
 
 
 
330
  vc_output2 = gr.Audio(label=i18n("Export Audio"))
331
 
332
  # Batch inference tab
@@ -335,40 +383,87 @@ def inference_tab():
335
  with gr.Column():
336
  input_folder_batch = gr.Textbox(
337
  label=i18n("Input Folder"),
 
338
  placeholder=i18n("Enter input path"),
339
  value=os.path.join(now_dir, "assets", "audios"),
340
  interactive=True,
341
  )
342
  output_folder_batch = gr.Textbox(
343
  label=i18n("Output Folder"),
 
 
 
344
  placeholder=i18n("Enter output path"),
345
  value=os.path.join(now_dir, "assets", "audios"),
346
  interactive=True,
347
  )
348
  with gr.Accordion(i18n("Advanced Settings"), open=False):
349
  with gr.Column():
350
- clear_outputs = gr.Button(
351
  i18n("Clear Outputs (Deletes all audios in assets/audios)")
352
  )
 
 
 
 
 
 
 
353
  split_audio_batch = gr.Checkbox(
354
  label=i18n("Split Audio"),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
355
  visible=True,
356
  value=False,
357
  interactive=True,
358
  )
 
 
 
 
 
 
 
 
 
 
 
359
  pitch_batch = gr.Slider(
360
  minimum=-24,
361
  maximum=24,
362
  step=1,
363
  label=i18n("Pitch"),
 
 
 
364
  value=0,
365
  interactive=True,
366
  )
367
  filter_radius_batch = gr.Slider(
368
  minimum=0,
369
  maximum=7,
370
- label=i18n(
371
- "If >=3: apply median filtering to the harvested pitch results. The value represents the filter radius and can reduce breathiness"
 
372
  ),
373
  value=3,
374
  step=1,
@@ -378,20 +473,50 @@ def inference_tab():
378
  minimum=0,
379
  maximum=1,
380
  label=i18n("Search Feature Ratio"),
 
 
 
381
  value=0.75,
382
  interactive=True,
383
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
384
  hop_length_batch = gr.Slider(
385
  minimum=1,
386
  maximum=512,
387
  step=1,
388
  label=i18n("Hop Length"),
 
 
 
 
389
  value=128,
390
  interactive=True,
391
  )
392
  with gr.Column():
393
  f0method_batch = gr.Radio(
394
  label=i18n("Pitch extraction algorithm"),
 
 
 
395
  choices=[
396
  "pm",
397
  "harvest",
@@ -399,6 +524,8 @@ def inference_tab():
399
  "crepe",
400
  "crepe-tiny",
401
  "rmvpe",
 
 
402
  ],
403
  value="rmvpe",
404
  interactive=True,
@@ -407,11 +534,39 @@ def inference_tab():
407
  convert_button2 = gr.Button(i18n("Convert"))
408
 
409
  with gr.Row(): # Defines output info + output audio download after conversion
410
- vc_output3 = gr.Textbox(label=i18n("Output Information"))
 
 
 
411
 
412
  def toggle_visible(checkbox):
413
  return {"visible": checkbox, "__type__": "update"}
414
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
415
  refresh_button.click(
416
  fn=change_choices,
417
  inputs=[],
@@ -432,7 +587,12 @@ def inference_tab():
432
  inputs=[upload_audio],
433
  outputs=[audio, output_path],
434
  )
435
- clear_outputs.click(
 
 
 
 
 
436
  fn=delete_outputs,
437
  inputs=[],
438
  outputs=[],
@@ -443,6 +603,8 @@ def inference_tab():
443
  pitch,
444
  filter_radius,
445
  index_rate,
 
 
446
  hop_length,
447
  f0method,
448
  audio,
@@ -450,6 +612,10 @@ def inference_tab():
450
  model_file,
451
  index_file,
452
  split_audio,
 
 
 
 
453
  ],
454
  outputs=[vc_output1, vc_output2],
455
  )
@@ -459,6 +625,8 @@ def inference_tab():
459
  pitch_batch,
460
  filter_radius_batch,
461
  index_rate_batch,
 
 
462
  hop_length_batch,
463
  f0method_batch,
464
  input_folder_batch,
@@ -466,6 +634,10 @@ def inference_tab():
466
  model_file,
467
  index_file,
468
  split_audio_batch,
 
 
 
 
469
  ],
470
  outputs=[vc_output3],
471
  )
 
122
  return indexes_list if indexes_list else ""
123
 
124
 
 
 
 
 
125
  def save_to_wav(record_button):
126
  if record_button is None:
127
  pass
 
147
 
148
 
149
  def delete_outputs():
150
+ gr.Info(f"Outputs cleared!")
151
  for root, _, files in os.walk(audio_root_relative, topdown=False):
152
  for name in files:
153
  if name.endswith(tuple(sup_audioext)) and name.__contains__("_output"):
154
  os.remove(os.path.join(root, name))
155
+
156
+
157
+ def match_index(model_file_value):
158
+ if model_file_value:
159
+ model_folder = os.path.dirname(model_file_value)
160
+ index_files = get_indexes()
161
+ for index_file in index_files:
162
+ if os.path.dirname(index_file) == model_folder:
163
+ return index_file
164
+ return ""
165
 
166
 
167
  # Inference tab
 
171
  with gr.Row():
172
  model_file = gr.Dropdown(
173
  label=i18n("Voice Model"),
174
+ info=i18n("Select the voice model to use for the conversion."),
175
  choices=sorted(names, key=lambda path: os.path.getsize(path)),
176
  interactive=True,
177
  value=default_weight,
 
180
 
181
  index_file = gr.Dropdown(
182
  label=i18n("Index File"),
183
+ info=i18n("Select the index file to use for the conversion."),
184
  choices=get_indexes(),
185
  value=match_index(default_weight) if default_weight else "",
186
  interactive=True,
 
191
  unload_button = gr.Button(i18n("Unload Voice"))
192
 
193
  unload_button.click(
194
+ fn=lambda: (
195
+ {"value": "", "__type__": "update"},
196
+ {"value": "", "__type__": "update"},
197
+ ),
198
  inputs=[],
199
+ outputs=[model_file, index_file],
200
  )
201
 
202
  model_file.select(
203
+ fn=lambda model_file_value: match_index(model_file_value),
204
  inputs=[model_file],
205
  outputs=[index_file],
206
  )
 
214
  with gr.Row():
215
  audio = gr.Dropdown(
216
  label=i18n("Select Audio"),
217
+ info=i18n("Select the audio to convert."),
218
  choices=sorted(audio_paths),
219
  value=audio_paths[0] if audio_paths else "",
220
  interactive=True,
 
223
 
224
  with gr.Accordion(i18n("Advanced Settings"), open=False):
225
  with gr.Column():
226
+ clear_outputs_infer = gr.Button(
227
  i18n("Clear Outputs (Deletes all audios in assets/audios)")
228
  )
229
  output_path = gr.Textbox(
230
  label=i18n("Output Path"),
231
  placeholder=i18n("Enter output path"),
232
+ info=i18n(
233
+ "The path where the output audio will be saved, by default in assets/audios/output.wav"
234
+ ),
235
  value=(
236
  output_path_fn(audio_paths[0])
237
  if audio_paths
 
239
  ),
240
  interactive=True,
241
  )
242
+ export_format = gr.Radio(
243
+ label=i18n("Export Format"),
244
+ info=i18n("Select the format to export the audio."),
245
+ choices=["WAV", "MP3", "FLAC", "OGG", "M4A"],
246
+ value="WAV",
247
+ interactive=True,
248
+ )
249
  split_audio = gr.Checkbox(
250
  label=i18n("Split Audio"),
251
+ info=i18n(
252
+ "Split the audio into chunks for inference to obtain better results in some cases."
253
+ ),
254
+ visible=True,
255
+ value=False,
256
+ interactive=True,
257
+ )
258
+ autotune = gr.Checkbox(
259
+ label=i18n("Autotune"),
260
+ info=i18n(
261
+ "Apply a soft autotune to your inferences, recommended for singing conversions."
262
+ ),
263
+ visible=True,
264
+ value=False,
265
+ interactive=True,
266
+ )
267
+ clean_audio = gr.Checkbox(
268
+ label=i18n("Clean Audio"),
269
+ info=i18n(
270
+ "Clean your audio output using noise detection algorithms, recommended for speaking audios."
271
+ ),
272
  visible=True,
273
  value=False,
274
  interactive=True,
275
  )
276
+ clean_strength = gr.Slider(
277
+ minimum=0,
278
+ maximum=1,
279
+ label=i18n("Clean Strength"),
280
+ info=i18n(
281
+ "Set the clean-up level to the audio you want, the more you increase it the more it will clean up, but it is possible that the audio will be more compressed."
282
+ ),
283
+ visible=False,
284
+ value=0.5,
285
+ interactive=True,
286
+ )
287
  pitch = gr.Slider(
288
  minimum=-24,
289
  maximum=24,
290
  step=1,
291
  label=i18n("Pitch"),
292
+ info=i18n(
293
+ "Set the pitch of the audio, the higher the value, the higher the pitch."
294
+ ),
295
  value=0,
296
  interactive=True,
297
  )
298
  filter_radius = gr.Slider(
299
  minimum=0,
300
  maximum=7,
301
+ label=i18n("Filter Radius"),
302
+ info=i18n(
303
+ "If the number is greater than or equal to three, employing median filtering on the collected tone results has the potential to decrease respiration."
304
  ),
305
  value=3,
306
  step=1,
 
310
  minimum=0,
311
  maximum=1,
312
  label=i18n("Search Feature Ratio"),
313
+ info=i18n(
314
+ "Influence exerted by the index file; a higher value corresponds to greater influence. However, opting for lower values can help mitigate artifacts present in the audio."
315
+ ),
316
  value=0.75,
317
  interactive=True,
318
  )
319
+ rms_mix_rate = gr.Slider(
320
+ minimum=0,
321
+ maximum=1,
322
+ label=i18n("Volume Envelope"),
323
+ info=i18n(
324
+ "Substitute or blend with the volume envelope of the output. The closer the ratio is to 1, the more the output envelope is employed."
325
+ ),
326
+ value=1,
327
+ interactive=True,
328
+ )
329
+ protect = gr.Slider(
330
+ minimum=0,
331
+ maximum=0.5,
332
+ label=i18n("Protect Voiceless Consonants"),
333
+ info=i18n(
334
+ "Safeguard distinct consonants and breathing sounds to prevent electro-acoustic tearing and other artifacts. Pulling the parameter to its maximum value of 0.5 offers comprehensive protection. However, reducing this value might decrease the extent of protection while potentially mitigating the indexing effect."
335
+ ),
336
+ value=0.5,
337
+ interactive=True,
338
+ )
339
  hop_length = gr.Slider(
340
  minimum=1,
341
  maximum=512,
342
  step=1,
343
  label=i18n("Hop Length"),
344
+ info=i18n(
345
+ "Denotes the duration it takes for the system to transition to a significant pitch change. Smaller hop lengths require more time for inference but tend to yield higher pitch accuracy."
346
+ ),
347
+ visible=False,
348
  value=128,
349
  interactive=True,
350
  )
351
  with gr.Column():
352
  f0method = gr.Radio(
353
  label=i18n("Pitch extraction algorithm"),
354
+ info=i18n(
355
+ "Pitch extraction algorithm to use for the audio conversion. The default algorithm is rmvpe, which is recommended for most cases."
356
+ ),
357
  choices=[
358
  "pm",
359
  "harvest",
 
361
  "crepe",
362
  "crepe-tiny",
363
  "rmvpe",
364
+ "fcpe",
365
+ "hybrid[rmvpe+fcpe]",
366
  ],
367
  value="rmvpe",
368
  interactive=True,
 
371
  convert_button1 = gr.Button(i18n("Convert"))
372
 
373
  with gr.Row(): # Defines output info + output audio download after conversion
374
+ vc_output1 = gr.Textbox(
375
+ label=i18n("Output Information"),
376
+ info=i18n("The output information will be displayed here."),
377
+ )
378
  vc_output2 = gr.Audio(label=i18n("Export Audio"))
379
 
380
  # Batch inference tab
 
383
  with gr.Column():
384
  input_folder_batch = gr.Textbox(
385
  label=i18n("Input Folder"),
386
+ info=i18n("Select the folder containing the audios to convert."),
387
  placeholder=i18n("Enter input path"),
388
  value=os.path.join(now_dir, "assets", "audios"),
389
  interactive=True,
390
  )
391
  output_folder_batch = gr.Textbox(
392
  label=i18n("Output Folder"),
393
+ info=i18n(
394
+ "Select the folder where the output audios will be saved."
395
+ ),
396
  placeholder=i18n("Enter output path"),
397
  value=os.path.join(now_dir, "assets", "audios"),
398
  interactive=True,
399
  )
400
  with gr.Accordion(i18n("Advanced Settings"), open=False):
401
  with gr.Column():
402
+ clear_outputs_batch = gr.Button(
403
  i18n("Clear Outputs (Deletes all audios in assets/audios)")
404
  )
405
+ export_format_batch = gr.Radio(
406
+ label=i18n("Export Format"),
407
+ info=i18n("Select the format to export the audio."),
408
+ choices=["WAV", "MP3", "FLAC", "OGG", "M4A"],
409
+ value="WAV",
410
+ interactive=True,
411
+ )
412
  split_audio_batch = gr.Checkbox(
413
  label=i18n("Split Audio"),
414
+ info=i18n(
415
+ "Split the audio into chunks for inference to obtain better results in some cases."
416
+ ),
417
+ visible=True,
418
+ value=False,
419
+ interactive=True,
420
+ )
421
+ autotune_batch = gr.Checkbox(
422
+ label=i18n("Autotune"),
423
+ info=i18n(
424
+ "Apply a soft autotune to your inferences, recommended for singing conversions."
425
+ ),
426
+ visible=True,
427
+ value=False,
428
+ interactive=True,
429
+ )
430
+ clean_audio_batch = gr.Checkbox(
431
+ label=i18n("Clean Audio"),
432
+ info=i18n(
433
+ "Clean your audio output using noise detection algorithms, recommended for speaking audios."
434
+ ),
435
  visible=True,
436
  value=False,
437
  interactive=True,
438
  )
439
+ clean_strength_batch = gr.Slider(
440
+ minimum=0,
441
+ maximum=1,
442
+ label=i18n("Clean Strength"),
443
+ info=i18n(
444
+ "Set the clean-up level to the audio you want, the more you increase it the more it will clean up, but it is possible that the audio will be more compressed."
445
+ ),
446
+ visible=False,
447
+ value=0.5,
448
+ interactive=True,
449
+ )
450
  pitch_batch = gr.Slider(
451
  minimum=-24,
452
  maximum=24,
453
  step=1,
454
  label=i18n("Pitch"),
455
+ info=i18n(
456
+ "Set the pitch of the audio, the higher the value, the higher the pitch."
457
+ ),
458
  value=0,
459
  interactive=True,
460
  )
461
  filter_radius_batch = gr.Slider(
462
  minimum=0,
463
  maximum=7,
464
+ label=i18n("Filter Radius"),
465
+ info=i18n(
466
+ "If the number is greater than or equal to three, employing median filtering on the collected tone results has the potential to decrease respiration."
467
  ),
468
  value=3,
469
  step=1,
 
473
  minimum=0,
474
  maximum=1,
475
  label=i18n("Search Feature Ratio"),
476
+ info=i18n(
477
+ "Influence exerted by the index file; a higher value corresponds to greater influence. However, opting for lower values can help mitigate artifacts present in the audio."
478
+ ),
479
  value=0.75,
480
  interactive=True,
481
  )
482
+ rms_mix_rate_batch = gr.Slider(
483
+ minimum=0,
484
+ maximum=1,
485
+ label=i18n("Volume Envelope"),
486
+ info=i18n(
487
+ "Substitute or blend with the volume envelope of the output. The closer the ratio is to 1, the more the output envelope is employed."
488
+ ),
489
+ value=1,
490
+ interactive=True,
491
+ )
492
+ protect_batch = gr.Slider(
493
+ minimum=0,
494
+ maximum=0.5,
495
+ label=i18n("Protect Voiceless Consonants"),
496
+ info=i18n(
497
+ "Safeguard distinct consonants and breathing sounds to prevent electro-acoustic tearing and other artifacts. Pulling the parameter to its maximum value of 0.5 offers comprehensive protection. However, reducing this value might decrease the extent of protection while potentially mitigating the indexing effect."
498
+ ),
499
+ value=0.5,
500
+ interactive=True,
501
+ )
502
  hop_length_batch = gr.Slider(
503
  minimum=1,
504
  maximum=512,
505
  step=1,
506
  label=i18n("Hop Length"),
507
+ info=i18n(
508
+ "Denotes the duration it takes for the system to transition to a significant pitch change. Smaller hop lengths require more time for inference but tend to yield higher pitch accuracy."
509
+ ),
510
+ visible=False,
511
  value=128,
512
  interactive=True,
513
  )
514
  with gr.Column():
515
  f0method_batch = gr.Radio(
516
  label=i18n("Pitch extraction algorithm"),
517
+ info=i18n(
518
+ "Pitch extraction algorithm to use for the audio conversion. The default algorithm is rmvpe, which is recommended for most cases."
519
+ ),
520
  choices=[
521
  "pm",
522
  "harvest",
 
524
  "crepe",
525
  "crepe-tiny",
526
  "rmvpe",
527
+ "fcpe",
528
+ "hybrid[rmvpe+fcpe]",
529
  ],
530
  value="rmvpe",
531
  interactive=True,
 
534
  convert_button2 = gr.Button(i18n("Convert"))
535
 
536
  with gr.Row(): # Defines output info + output audio download after conversion
537
+ vc_output3 = gr.Textbox(
538
+ label=i18n("Output Information"),
539
+ info=i18n("The output information will be displayed here."),
540
+ )
541
 
542
  def toggle_visible(checkbox):
543
  return {"visible": checkbox, "__type__": "update"}
544
 
545
+ def toggle_visible_hop_length(f0method):
546
+ if f0method == "crepe" or f0method == "crepe-tiny":
547
+ return {"visible": True, "__type__": "update"}
548
+ return {"visible": False, "__type__": "update"}
549
+
550
+ clean_audio.change(
551
+ fn=toggle_visible,
552
+ inputs=[clean_audio],
553
+ outputs=[clean_strength],
554
+ )
555
+ clean_audio_batch.change(
556
+ fn=toggle_visible,
557
+ inputs=[clean_audio_batch],
558
+ outputs=[clean_strength_batch],
559
+ )
560
+ f0method.change(
561
+ fn=toggle_visible_hop_length,
562
+ inputs=[f0method],
563
+ outputs=[hop_length],
564
+ )
565
+ f0method_batch.change(
566
+ fn=toggle_visible_hop_length,
567
+ inputs=[f0method_batch],
568
+ outputs=[hop_length_batch],
569
+ )
570
  refresh_button.click(
571
  fn=change_choices,
572
  inputs=[],
 
587
  inputs=[upload_audio],
588
  outputs=[audio, output_path],
589
  )
590
+ clear_outputs_infer.click(
591
+ fn=delete_outputs,
592
+ inputs=[],
593
+ outputs=[],
594
+ )
595
+ clear_outputs_batch.click(
596
  fn=delete_outputs,
597
  inputs=[],
598
  outputs=[],
 
603
  pitch,
604
  filter_radius,
605
  index_rate,
606
+ rms_mix_rate,
607
+ protect,
608
  hop_length,
609
  f0method,
610
  audio,
 
612
  model_file,
613
  index_file,
614
  split_audio,
615
+ autotune,
616
+ clean_audio,
617
+ clean_strength,
618
+ export_format,
619
  ],
620
  outputs=[vc_output1, vc_output2],
621
  )
 
625
  pitch_batch,
626
  filter_radius_batch,
627
  index_rate_batch,
628
+ rms_mix_rate_batch,
629
+ protect_batch,
630
  hop_length_batch,
631
  f0method_batch,
632
  input_folder_batch,
 
634
  model_file,
635
  index_file,
636
  split_audio_batch,
637
+ autotune_batch,
638
+ clean_audio_batch,
639
+ clean_strength_batch,
640
+ export_format_batch,
641
  ],
642
  outputs=[vc_output3],
643
  )
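
Among the inference changes, match_index is now much simpler: instead of fuzzy name matching it returns the first .index file that lives in the same folder as the selected model. A sketch with the index list passed in explicitly (in the tab it comes from get_indexes(); the paths below are made up):

import os


def match_index(model_file_value, index_files):
    if model_file_value:
        model_folder = os.path.dirname(model_file_value)
        for index_file in index_files:
            if os.path.dirname(index_file) == model_folder:
                return index_file
    return ""


print(match_index(
    "logs/my-voice/my-voice.pth",
    ["logs/other/other.index", "logs/my-voice/added_my-voice.index"],
))
# -> logs/my-voice/added_my-voice.index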
tabs/plugins/plugins_core.py CHANGED
@@ -11,24 +11,30 @@ i18n = I18nAuto()
11
  now_dir = os.getcwd()
12
  sys.path.append(now_dir)
13
 
 
 
14
  plugins_path = os.path.join(now_dir, "tabs", "plugins", "installed")
15
  if not os.path.exists(plugins_path):
16
  os.makedirs(plugins_path)
17
- json_file_path = os.path.join(now_dir, "tabs", "plugins", "installed_list.json")
18
  current_folders = os.listdir(plugins_path)
19
 
20
 
21
  def get_existing_folders():
22
  if os.path.exists(json_file_path):
23
  with open(json_file_path, "r") as file:
24
- return json.load(file)
 
25
  else:
26
  return []
27
 
28
 
29
  def save_existing_folders(existing_folders):
 
 
 
30
  with open(json_file_path, "w") as file:
31
- json.dump(existing_folders, file)
32
 
33
 
34
  def save_plugin_dropbox(dropbox):
@@ -53,33 +59,47 @@ def save_plugin_dropbox(dropbox):
53
  os.remove(zip_file_path)
54
 
55
  if os.path.exists(os.path.join(folder_path, "requirements.txt")):
56
- subprocess.run(
57
- [
58
- os.path.join("env", "python.exe"),
59
- "-m",
60
- "pip",
61
- "install",
62
- "-r",
63
- os.path.join(folder_path, "requirements.txt"),
64
- ]
65
- )
 
 
 
 
 
 
 
 
 
 
 
 
66
  else:
67
  print("No requirements.txt file found in the plugin folder.")
68
 
69
  save_existing_folders(get_existing_folders() + [folder_name])
70
 
71
  print(
72
- f"{folder_name} plugin installed in {plugins_path}! Restart applio to see the changes."
73
  )
74
  gr.Info(
75
- f"{folder_name} plugin installed in {plugins_path}! Restart applio to see the changes."
76
  )
 
77
  return None
78
 
79
 
80
  def check_new_folders():
81
  existing_folders = get_existing_folders()
82
  new_folders = set(current_folders) - set(existing_folders)
 
83
  if new_folders:
84
  for new_folder in new_folders:
85
  complete_path = os.path.join(plugins_path, new_folder)
@@ -98,5 +118,5 @@ def check_new_folders():
98
  )
99
  else:
100
  print("No requirements.txt file found in the plugin folder.")
101
- print("Plugins checked and installed! Restart applio to see the changes.")
102
- save_existing_folders(current_folders)
 
11
  now_dir = os.getcwd()
12
  sys.path.append(now_dir)
13
 
14
+ from tabs.settings.restart import restart_applio
15
+
16
  plugins_path = os.path.join(now_dir, "tabs", "plugins", "installed")
17
  if not os.path.exists(plugins_path):
18
  os.makedirs(plugins_path)
19
+ json_file_path = os.path.join(now_dir, "assets", "config.json")
20
  current_folders = os.listdir(plugins_path)
21
 
22
 
23
  def get_existing_folders():
24
  if os.path.exists(json_file_path):
25
  with open(json_file_path, "r") as file:
26
+ config = json.load(file)
27
+ return config["plugins"]
28
  else:
29
  return []
30
 
31
 
32
  def save_existing_folders(existing_folders):
33
+ with open(json_file_path, "r") as file:
34
+ config = json.load(file)
35
+ config["plugins"] = existing_folders
36
  with open(json_file_path, "w") as file:
37
+ json.dump(config, file, indent=2)
38
 
39
 
40
  def save_plugin_dropbox(dropbox):
 
59
  os.remove(zip_file_path)
60
 
61
  if os.path.exists(os.path.join(folder_path, "requirements.txt")):
62
+ if os.name == "nt":
63
+ subprocess.run(
64
+ [
65
+ os.path.join("env", "python.exe"),
66
+ "-m",
67
+ "pip",
68
+ "install",
69
+ "-r",
70
+ os.path.join(folder_path, "requirements.txt"),
71
+ ]
72
+ )
73
+ else:
74
+ subprocess.run(
75
+ [
76
+ "python",
77
+ "-m",
78
+ "pip",
79
+ "install",
80
+ "-r",
81
+ os.path.join(folder_path, "requirements.txt"),
82
+ ]
83
+ )
84
  else:
85
  print("No requirements.txt file found in the plugin folder.")
86
 
87
  save_existing_folders(get_existing_folders() + [folder_name])
88
 
89
  print(
90
+ f"{folder_name} plugin installed in {plugins_path}! Restarting applio to apply the changes."
91
  )
92
  gr.Info(
93
+ f"{folder_name} plugin installed in {plugins_path}! Restarting applio to apply the changes."
94
  )
95
+ restart_applio()
96
  return None
97
 
98
 
99
  def check_new_folders():
100
  existing_folders = get_existing_folders()
101
  new_folders = set(current_folders) - set(existing_folders)
102
+ save_existing_folders(current_folders)
103
  if new_folders:
104
  for new_folder in new_folders:
105
  complete_path = os.path.join(plugins_path, new_folder)
 
118
  )
119
  else:
120
  print("No requirements.txt file found in the plugin folder.")
121
+ print("Plugins checked and installed! Restarting applio to apply the changes.")
122
+ restart_applio()
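
The installed-plugin list no longer lives in its own installed_list.json; it becomes a "plugins" key inside the shared assets/config.json, updated with the read-modify-write pattern used by save_existing_folders(). A compressed sketch of that pattern (the folder name is hypothetical):

import json
import os

config_path = os.path.join(os.getcwd(), "assets", "config.json")

with open(config_path, "r") as file:
    config = json.load(file)

# Append a newly installed plugin folder (illustrative name) and persist the whole config.
config["plugins"] = config.get("plugins", []) + ["my-new-plugin"]

with open(config_path, "w") as file:
    json.dump(config, file, indent=2)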
tabs/report/report.py CHANGED
@@ -8,7 +8,7 @@ import gradio as gr
 from assets.i18n.i18n import I18nAuto
 
 now_dir = os.getcwd()
-sys.path.append("..")
+sys.path.append(now_dir)
 
 i18n = I18nAuto()
 
tabs/settings/fake_gpu.py ADDED
@@ -0,0 +1,55 @@
+import os, sys
+import torch
+import json
+import gradio as gr
+from assets.i18n.i18n import I18nAuto
+from tabs.settings.restart import restart_applio
+
+now_dir = os.getcwd()
+sys.path.append(now_dir)
+i18n = I18nAuto()
+
+ngpu = torch.cuda.device_count()
+config_file = os.path.join(now_dir, "assets", "config.json")
+
+
+def gpu_available():
+    if torch.cuda.is_available() or ngpu != 0:
+        return True
+
+
+def load_fake_gpu():
+    with open(config_file, "r", encoding="utf8") as file:
+        config = json.load(file)
+        return config["fake_gpu"]
+
+
+def save_config(value):
+    with open(config_file, "r", encoding="utf8") as file:
+        config = json.load(file)
+        config["fake_gpu"] = value
+    with open(config_file, "w", encoding="utf8") as file:
+        json.dump(config, file, indent=2)
+
+
+def fake_gpu_tab():
+    with gr.Row():
+        with gr.Column():
+            presence = gr.Checkbox(
+                label=i18n("Enable fake GPU"),
+                info=i18n(
+                    "Activates the train tab. However, please note that this device lacks GPU capabilities, hence training is not supported. This option is only for testing purposes. (This option will restart Applio)"
+                ),
+                interactive=True,
+                value=load_fake_gpu(),
+            )
+            presence.change(
+                fn=toggle,
+                inputs=[presence],
+                outputs=[],
+            )
+
+
+def toggle(checkbox):
+    save_config(bool(checkbox))
+    restart_applio()
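
The fake-GPU toggle above, like the presence, plugin, and language settings elsewhere in this commit, persists to the shared assets/config.json. That file itself is not part of the diff, but from the keys read and written here its relevant portion looks roughly like the following (values are examples only):

# Approximate shape of the config keys used by the settings tabs; the real file may hold more.
example_config = {
    "discord_presence": True,      # read by load_config_presence()
    "fake_gpu": False,             # read by load_fake_gpu()
    "plugins": [],                 # installed plugin folder names (see plugins_core.py)
    "lang": {
        "override": False,         # False -> follow the system language
        "selected_lang": "en_US",  # illustrative code, used only when override is True
    },
}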
tabs/settings/flask_server.py ADDED
@@ -0,0 +1,43 @@
+import os
+import sys
+import gradio as gr
+from assets.i18n.i18n import I18nAuto
+import requests
+
+now_dir = os.getcwd()
+sys.path.append(now_dir)
+
+from assets.flask.server import start_flask, load_config_flask, save_config
+
+i18n = I18nAuto()
+
+
+def flask_server_tab():
+    with gr.Row():
+        with gr.Column():
+            flask_checkbox = gr.Checkbox(
+                label=i18n(
+                    "Enable Applio integration with applio.org/models using flask"
+                ),
+                info=i18n(
+                    "It will activate the possibility of downloading models with a click from the website."
+                ),
+                interactive=True,
+                value=load_config_flask(),
+            )
+            flask_checkbox.change(
+                fn=toggle,
+                inputs=[flask_checkbox],
+                outputs=[],
+            )
+
+
+def toggle(checkbox):
+    save_config(bool(checkbox))
+    if load_config_flask() == True:
+        start_flask()
+    else:
+        try:
+            requests.post("http://localhost:8000/shutdown")
+        except requests.exceptions.ConnectionError:
+            pass
tabs/settings/lang.py ADDED
@@ -0,0 +1,57 @@
+import os, sys
+import json
+import gradio as gr
+from assets.i18n.i18n import I18nAuto
+
+now_dir = os.getcwd()
+sys.path.append(now_dir)
+
+i18n = I18nAuto()
+
+config_file = os.path.join(now_dir, "assets", "config.json")
+
+
+def get_language_settings():
+    with open(config_file, "r", encoding="utf8") as file:
+        config = json.load(file)
+
+    if config["lang"]["override"] == False:
+        return "Language automatically detected in the system"
+    else:
+        return config["lang"]["selected_lang"]
+
+
+def save_lang_settings(selected_language):
+    with open(config_file, "r", encoding="utf8") as file:
+        config = json.load(file)
+
+    if selected_language == "Language automatically detected in the system":
+        config["lang"]["override"] = False
+    else:
+        config["lang"]["override"] = True
+        config["lang"]["selected_lang"] = selected_language
+
+    gr.Info("Language have been saved. Restart Applio to apply the changes.")
+
+    with open(config_file, "w", encoding="utf8") as file:
+        json.dump(config, file, indent=2)
+
+
+def lang_tab():
+    with gr.Column():
+        selected_language = gr.Dropdown(
+            label=i18n("Language"),
+            info=i18n(
+                "Select the language you want to use. (Requires restarting Applio)"
+            ),
+            value=get_language_settings(),
+            choices=["Language automatically detected in the system"]
+            + i18n._get_available_languages(),
+            interactive=True,
+        )
+
+        selected_language.change(
+            fn=save_lang_settings,
+            inputs=[selected_language],
+            outputs=[],
+        )
tabs/settings/presence.py CHANGED
@@ -1,17 +1,29 @@
 import os
 import sys
-import base64
-import pathlib
-import tempfile
 import gradio as gr
-import threading
+import json
 from assets.i18n.i18n import I18nAuto
 from assets.discord_presence import RPCManager
 
 now_dir = os.getcwd()
-sys.path.append("..")
+sys.path.append(now_dir)
 
 i18n = I18nAuto()
+config_file = os.path.join(now_dir, "assets", "config.json")
+
+
+def load_config_presence():
+    with open(config_file, "r", encoding="utf8") as file:
+        config = json.load(file)
+        return config["discord_presence"]
+
+
+def save_config(value):
+    with open(config_file, "r", encoding="utf8") as file:
+        config = json.load(file)
+        config["discord_presence"] = value
+    with open(config_file, "w", encoding="utf8") as file:
+        json.dump(config, file, indent=2)
 
 
 def presence_tab():
@@ -19,8 +31,11 @@ def presence_tab():
         with gr.Column():
             presence = gr.Checkbox(
                 label=i18n("Enable Applio integration with Discord presence"),
+                info=i18n(
+                    "It will activate the possibility of displaying the current Applio activity in Discord."
+                ),
                 interactive=True,
-                value=True,
+                value=load_config_presence(),
             )
             presence.change(
                 fn=toggle,
@@ -30,13 +45,11 @@ def presence_tab():
 
 
 def toggle(checkbox):
-
-    if bool(checkbox):
-        # print("Start Presence")
+    save_config(bool(checkbox))
+    if load_config_presence() == True:
        try:
            RPCManager.start_presence()
        except KeyboardInterrupt:
            RPCManager.stop_presence()
    else:
-        # print("Stop presence")
        RPCManager.stop_presence()
tabs/settings/restart.py ADDED
@@ -0,0 +1,39 @@
+import gradio as gr
+import os
+import sys
+
+now_dir = os.getcwd()
+pid_file_path = os.path.join(now_dir, "rvc", "train", "train_pid.txt")
+
+
+def restart_applio():
+    if os.name != "nt":
+        os.system("clear")
+    else:
+        os.system("cls")
+    try:
+        with open(pid_file_path, "r") as pid_file:
+            pids = [int(pid) for pid in pid_file.readlines()]
+        for pid in pids:
+            os.kill(pid, 9)
+        os.remove(pid_file_path)
+    except:
+        pass
+    python = sys.executable
+    os.execl(python, python, *sys.argv)
+
+
+from assets.i18n.i18n import I18nAuto
+
+i18n = I18nAuto()
+
+
+def restart_tab():
+    with gr.Row():
+        with gr.Column():
+            restart_button = gr.Button(i18n("Restart Applio"))
+            restart_button.click(
+                fn=restart_applio,
+                inputs=[],
+                outputs=[],
+            )
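
restart.py is the other half of the train_pid.txt bookkeeping added to rvc/train/train.py: training writes one PID per spawned worker, and the restart handler reads the file back, kills whatever is still listed, then replaces the current process with os.execl. A compressed sketch of the writer/reaper pair (illustrative, outside the Gradio context):

import os

pid_file_path = os.path.join(os.getcwd(), "rvc", "train", "train_pid.txt")


def record_pids(pids):
    # Training side: one PID per line, written when the worker processes are started.
    with open(pid_file_path, "w") as pid_file:
        for pid in pids:
            pid_file.write(f"{pid}\n")


def kill_recorded_pids():
    # Restart side: terminate anything still listed, then drop the file.
    try:
        with open(pid_file_path, "r") as pid_file:
            pids = [int(line) for line in pid_file if line.strip()]
        for pid in pids:
            os.kill(pid, 9)
        os.remove(pid_file_path)
    except (FileNotFoundError, ValueError, ProcessLookupError):
        pass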
tabs/settings/themes.py CHANGED
@@ -9,7 +9,7 @@ from assets.i18n.i18n import I18nAuto
 import assets.themes.loadThemes as loadThemes
 
 now_dir = os.getcwd()
-sys.path.append("..")
+sys.path.append(now_dir)
 
 i18n = I18nAuto()
 
@@ -21,6 +21,9 @@ def theme_tab():
         loadThemes.get_list(),
         value=loadThemes.read_json(),
         label=i18n("Theme"),
+        info=i18n(
+            "Select the theme you want to use. (Requires restarting Applio)"
+        ),
         visible=True,
     )
     themes_select.change(
tabs/settings/version.py ADDED
@@ -0,0 +1,24 @@
 
 
1
+ import gradio as gr
2
+
3
+ from assets.version_checker import compare_version
4
+ from assets.i18n.i18n import I18nAuto
5
+
6
+ i18n = I18nAuto()
7
+
8
+
9
+ def version_tab():
10
+ with gr.Row():
11
+ with gr.Column():
12
+ version_check = gr.Textbox(
13
+ label=i18n("Version Checker"),
14
+ info=i18n(
15
+ "Check which version of Applio is the latest to see if you need to update."
16
+ ),
17
+ interactive=False,
18
+ )
19
+ version_button = gr.Button(i18n("Check for updates"))
20
+ version_button.click(
21
+ fn=compare_version,
22
+ inputs=[],
23
+ outputs=[version_check],
24
+ )
tabs/train/train.py CHANGED
@@ -1,6 +1,7 @@
1
  import os
2
  import subprocess
3
  import sys
 
4
  import gradio as gr
5
  from assets.i18n.i18n import I18nAuto
6
  from core import (
@@ -8,14 +9,40 @@ from core import (
8
  run_extract_script,
9
  run_train_script,
10
  run_index_script,
 
11
  )
12
  from rvc.configs.config import max_vram_gpu, get_gpu_info
13
  from rvc.lib.utils import format_title
 
14
 
15
  i18n = I18nAuto()
16
  now_dir = os.getcwd()
17
  sys.path.append(now_dir)
 
 
 
19
  sup_audioext = {
20
  "wav",
21
  "mp3",
@@ -84,6 +111,31 @@ def refresh_datasets():
84
  return {"choices": sorted(get_datasets_list()), "__type__": "update"}
85
 
 
 
 
87
  # Drop Model
88
  def save_drop_model(dropbox):
89
  if ".pth" not in dropbox:
@@ -136,25 +188,92 @@ def save_drop_dataset_audio(dropbox, dataset_name):
136
  return None, relative_dataset_path
137
 
 
 
 
 
139
  # Train Tab
140
  def train_tab():
141
  with gr.Accordion(i18n("Preprocess")):
142
  with gr.Row():
143
  with gr.Column():
144
- model_name = gr.Textbox(
145
  label=i18n("Model Name"),
146
- placeholder=i18n("Enter model name"),
 
147
  value="my-project",
148
  interactive=True,
 
149
  )
150
  dataset_path = gr.Dropdown(
151
  label=i18n("Dataset Path"),
 
152
  # placeholder=i18n("Enter dataset path"),
153
  choices=get_datasets_list(),
154
  allow_custom_value=True,
155
  interactive=True,
156
  )
157
- refresh_datasets_button = gr.Button(i18n("Refresh Datasets"))
158
  dataset_creator = gr.Checkbox(
159
  label=i18n("Dataset Creator"),
160
  value=False,
@@ -163,9 +282,10 @@ def train_tab():
163
  )
164
 
165
  with gr.Column(visible=False) as dataset_creator_settings:
166
- with gr.Accordion("Dataset Creator"):
167
  dataset_name = gr.Textbox(
168
  label=i18n("Dataset Name"),
 
169
  placeholder=i18n("Enter dataset name"),
170
  interactive=True,
171
  )
@@ -178,6 +298,7 @@ def train_tab():
178
  with gr.Column():
179
  sampling_rate = gr.Radio(
180
  label=i18n("Sampling Rate"),
 
181
  choices=["32000", "40000", "48000"],
182
  value="40000",
183
  interactive=True,
@@ -185,6 +306,7 @@ def train_tab():
185
 
186
  rvc_version = gr.Radio(
187
  label=i18n("RVC Version"),
 
188
  choices=["v1", "v2"],
189
  value="v2",
190
  interactive=True,
@@ -192,6 +314,7 @@ def train_tab():
192
 
193
  preprocess_output_info = gr.Textbox(
194
  label=i18n("Output Information"),
 
195
  value="",
196
  max_lines=8,
197
  interactive=False,
@@ -209,12 +332,24 @@ def train_tab():
209
  with gr.Accordion(i18n("Extract")):
210
  with gr.Row():
211
  hop_length = gr.Slider(
212
- 1, 512, 128, step=1, label=i18n("Hop Length"), interactive=True
 
 
213
  )
214
  with gr.Row():
215
  with gr.Column():
216
  f0method = gr.Radio(
217
  label=i18n("Pitch extraction algorithm"),
 
 
 
218
  choices=["pm", "dio", "crepe", "crepe-tiny", "harvest", "rmvpe"],
219
  value="rmvpe",
220
  interactive=True,
@@ -222,6 +357,7 @@ def train_tab():
222
 
223
  extract_output_info = gr.Textbox(
224
  label=i18n("Output Information"),
 
225
  value="",
226
  max_lines=8,
227
  interactive=False,
@@ -242,39 +378,94 @@ def train_tab():
242
  max_vram_gpu(0),
243
  step=1,
244
  label=i18n("Batch Size"),
 
 
 
245
  interactive=True,
246
  )
247
  save_every_epoch = gr.Slider(
248
- 1, 100, 10, step=1, label=i18n("Save Every Epoch"), interactive=True
 
 
249
  )
250
  total_epoch = gr.Slider(
251
- 1, 1000, 500, step=1, label=i18n("Total Epoch"), interactive=True
 
 
 
252
  )
253
  with gr.Row():
254
  pitch_guidance = gr.Checkbox(
255
- label=i18n("Pitch Guidance"), value=True, interactive=True
 
 
 
 
 
256
  )
257
  pretrained = gr.Checkbox(
258
- label=i18n("Pretrained"), value=True, interactive=True
 
 
 
 
 
259
  )
260
  save_only_latest = gr.Checkbox(
261
- label=i18n("Save Only Latest"), value=False, interactive=True
 
 
 
 
 
262
  )
263
  save_every_weights = gr.Checkbox(
264
  label=i18n("Save Every Weights"),
 
 
 
265
  value=True,
266
  interactive=True,
267
  )
268
  custom_pretrained = gr.Checkbox(
269
- label=i18n("Custom Pretrained"), value=False, interactive=True
 
 
 
 
 
270
  )
271
  multiple_gpu = gr.Checkbox(
- label=i18n("GPU Settings"), value=False, interactive=True
 
 
 
 
 
273
  )
274
 
275
  with gr.Row():
276
  with gr.Column(visible=False) as pretrained_custom_settings:
277
- with gr.Accordion("Pretrained Custom Settings"):
278
  upload_pretrained = gr.File(
279
  label=i18n("Upload Pretrained Model"),
280
  type="filepath",
@@ -285,33 +476,57 @@ def train_tab():
285
  )
286
  g_pretrained_path = gr.Dropdown(
287
  label=i18n("Custom Pretrained G"),
 
 
 
288
  choices=sorted(pretraineds_list_g),
289
  interactive=True,
290
  allow_custom_value=True,
291
  )
292
  d_pretrained_path = gr.Dropdown(
293
  label=i18n("Custom Pretrained D"),
 
 
 
294
  choices=sorted(pretraineds_list_d),
295
  interactive=True,
296
  allow_custom_value=True,
297
  )
298
  with gr.Column(visible=False) as gpu_custom_settings:
299
- with gr.Accordion("GPU Settings"):
300
  gpu = gr.Textbox(
301
  label=i18n("GPU Number"),
 
 
 
302
  placeholder=i18n("0 to ∞ separated by -"),
303
  value="0",
304
  interactive=True,
305
  )
306
  gr.Textbox(
307
  label=i18n("GPU Information"),
 
308
  value=get_gpu_info(),
309
  interactive=False,
310
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
311
 
312
  with gr.Row():
313
  train_output_info = gr.Textbox(
314
  label=i18n("Output Information"),
 
315
  value="",
316
  max_lines=8,
317
  interactive=False,
@@ -332,6 +547,8 @@ def train_tab():
332
  batch_size,
333
  gpu,
334
  pitch_guidance,
 
 
335
  pretrained,
336
  custom_pretrained,
337
  g_pretrained_path,
@@ -341,6 +558,15 @@ def train_tab():
341
  api_name="start_training",
342
  )
343
 
 
 
 
 
 
 
 
 
 
344
  index_button = gr.Button(i18n("Generate Index"))
345
  index_button.click(
346
  run_index_script,
@@ -349,13 +575,114 @@ def train_tab():
  api_name="generate_index",
  )
 
 
 
 
 
 
 
 
 
 
 
352
  def toggle_visible(checkbox):
353
  return {"visible": checkbox, "__type__": "update"}
354
 
355
- refresh_datasets_button.click(
356
- fn=refresh_datasets,
 
 
 
 
 
 
 
 
 
 
 
 
 
357
  inputs=[],
358
- outputs=[dataset_path],
359
  )
360
 
361
  dataset_creator.change(
@@ -370,6 +697,18 @@ def train_tab():
370
  outputs=[upload_audio_dataset, dataset_path],
371
  )
372
 
 
 
 
 
 
 
 
 
 
 
 
 
373
  custom_pretrained.change(
374
  fn=toggle_visible,
375
  inputs=[custom_pretrained],
@@ -388,8 +727,44 @@ def train_tab():
388
  outputs=[upload_pretrained],
389
  )
390
 
 
 
 
 
 
 
391
  multiple_gpu.change(
392
  fn=toggle_visible,
393
  inputs=[multiple_gpu],
394
  outputs=[gpu_custom_settings],
395
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
  import subprocess
3
  import sys
4
+ import shutil
5
  import gradio as gr
6
  from assets.i18n.i18n import I18nAuto
7
  from core import (
 
9
  run_extract_script,
10
  run_train_script,
11
  run_index_script,
12
+ run_prerequisites_script,
13
  )
14
  from rvc.configs.config import max_vram_gpu, get_gpu_info
15
  from rvc.lib.utils import format_title
16
+ from tabs.settings.restart import restart_applio
17
 
18
  i18n = I18nAuto()
19
  now_dir = os.getcwd()
20
  sys.path.append(now_dir)
21
 
22
+ pretraineds_v1 = [
23
+ (
24
+ "pretrained_v1/",
25
+ [
26
+ "D32k.pth",
27
+ "D40k.pth",
28
+ "D48k.pth",
29
+ "G32k.pth",
30
+ "G40k.pth",
31
+ "G48k.pth",
32
+ "f0D32k.pth",
33
+ "f0D40k.pth",
34
+ "f0D48k.pth",
35
+ "f0G32k.pth",
36
+ "f0G40k.pth",
37
+ "f0G48k.pth",
38
+ ],
39
+ ),
40
+ ]
41
+
42
+ folder_mapping = {
43
+ "pretrained_v1/": "rvc/pretraineds/pretrained_v1/",
44
+ }
45
+
46
  sup_audioext = {
47
  "wav",
48
  "mp3",
 
111
  return {"choices": sorted(get_datasets_list()), "__type__": "update"}
112
 
113
 
114
+ # Model Names
115
+ models_path = os.path.join(now_dir, "logs")
116
+
117
+
118
+ def get_models_list():
119
+ return [
120
+ os.path.basename(dirpath)
121
+ for dirpath in os.listdir(models_path)
122
+ if os.path.isdir(os.path.join(models_path, dirpath))
123
+ and all(excluded not in dirpath for excluded in ["zips", "mute"])
124
+ ]
125
+
126
+
127
+ def refresh_models():
128
+ return {"choices": sorted(get_models_list()), "__type__": "update"}
129
+
130
+
131
+ # Refresh Models and Datasets
132
+ def refresh_models_and_datasets():
133
+ return (
134
+ {"choices": sorted(get_models_list()), "__type__": "update"},
135
+ {"choices": sorted(get_datasets_list()), "__type__": "update"},
136
+ )
137
+
138
+
139
  # Drop Model
140
  def save_drop_model(dropbox):
141
  if ".pth" not in dropbox:
 
188
  return None, relative_dataset_path
189
 
190
 
191
+ # Export
192
+ ## Get Pth and Index Files
193
+ def get_pth_list():
194
+ return [
195
+ os.path.relpath(os.path.join(dirpath, filename), now_dir)
196
+ for dirpath, _, filenames in os.walk(models_path)
197
+ for filename in filenames
198
+ if filename.endswith(".pth")
199
+ ]
200
+
201
+
202
+ def get_index_list():
203
+ return [
204
+ os.path.relpath(os.path.join(dirpath, filename), now_dir)
205
+ for dirpath, _, filenames in os.walk(models_path)
206
+ for filename in filenames
207
+ if filename.endswith(".index") and "trained" not in filename
208
+ ]
209
+
210
+
211
+ def refresh_pth_and_index_list():
212
+ return (
213
+ {"choices": sorted(get_pth_list()), "__type__": "update"},
214
+ {"choices": sorted(get_index_list()), "__type__": "update"},
215
+ )
216
+
217
+
218
+ ## Export Pth and Index Files
219
+ def export_pth(pth_path):
220
+ if pth_path and os.path.exists(pth_path):
221
+ return pth_path
222
+ return None
223
+
224
+
225
+ def export_index(index_path):
226
+ if index_path and os.path.exists(index_path):
227
+ return index_path
228
+ return None
229
+
230
+
231
+ ## Upload to Google Drive
232
+ def upload_to_google_drive(pth_path, index_path):
233
+ def upload_file(file_path):
234
+ if file_path:
235
+ try:
236
+ gr.Info(f"Uploading {pth_path} to Google Drive...")
237
+ google_drive_folder = "/content/drive/MyDrive/ApplioExported"
238
+ if not os.path.exists(google_drive_folder):
239
+ os.makedirs(google_drive_folder)
240
+ google_drive_file_path = os.path.join(
241
+ google_drive_folder, os.path.basename(file_path)
242
+ )
243
+ if os.path.exists(google_drive_file_path):
244
+ os.remove(google_drive_file_path)
245
+ shutil.copy2(file_path, google_drive_file_path)
246
+ gr.Info("File uploaded successfully.")
247
+ except Exception as error:
248
+ print(error)
249
+ gr.Info("Error uploading to Google Drive")
250
+
251
+ upload_file(pth_path)
252
+ upload_file(index_path)
253
+
254
+
255
  # Train Tab
256
  def train_tab():
257
  with gr.Accordion(i18n("Preprocess")):
258
  with gr.Row():
259
  with gr.Column():
260
+ model_name = gr.Dropdown(
261
  label=i18n("Model Name"),
262
+ info=i18n("Name of the new model."),
263
+ choices=get_models_list(),
264
  value="my-project",
265
  interactive=True,
266
+ allow_custom_value=True,
267
  )
268
  dataset_path = gr.Dropdown(
269
  label=i18n("Dataset Path"),
270
+ info=i18n("Path to the dataset folder."),
271
  # placeholder=i18n("Enter dataset path"),
272
  choices=get_datasets_list(),
273
  allow_custom_value=True,
274
  interactive=True,
275
  )
276
+ refresh = gr.Button(i18n("Refresh"))
277
  dataset_creator = gr.Checkbox(
278
  label=i18n("Dataset Creator"),
279
  value=False,
 
282
  )
283
 
284
  with gr.Column(visible=False) as dataset_creator_settings:
285
+ with gr.Accordion(i18n("Dataset Creator")):
286
  dataset_name = gr.Textbox(
287
  label=i18n("Dataset Name"),
288
+ info=i18n("Name of the new dataset."),
289
  placeholder=i18n("Enter dataset name"),
290
  interactive=True,
291
  )
 
298
  with gr.Column():
299
  sampling_rate = gr.Radio(
300
  label=i18n("Sampling Rate"),
301
+ info=i18n("The sampling rate of the audio files."),
302
  choices=["32000", "40000", "48000"],
303
  value="40000",
304
  interactive=True,
 
306
 
307
  rvc_version = gr.Radio(
308
  label=i18n("RVC Version"),
309
+ info=i18n("The RVC version of the model."),
310
  choices=["v1", "v2"],
311
  value="v2",
312
  interactive=True,
 
314
 
315
  preprocess_output_info = gr.Textbox(
316
  label=i18n("Output Information"),
317
+ info=i18n("The output information will be displayed here."),
318
  value="",
319
  max_lines=8,
320
  interactive=False,
 
332
  with gr.Accordion(i18n("Extract")):
333
  with gr.Row():
334
  hop_length = gr.Slider(
335
+ 1,
336
+ 512,
337
+ 128,
338
+ step=1,
339
+ label=i18n("Hop Length"),
340
+ info=i18n(
341
+ "Denotes the duration it takes for the system to transition to a significant pitch change. Smaller hop lengths require more time for inference but tend to yield higher pitch accuracy."
342
+ ),
343
+ interactive=True,
344
+ visible=False,
345
  )
346
  with gr.Row():
347
  with gr.Column():
348
  f0method = gr.Radio(
349
  label=i18n("Pitch extraction algorithm"),
350
+ info=i18n(
351
+ "Pitch extraction algorithm to use for the audio conversion. The default algorithm is rmvpe, which is recommended for most cases."
352
+ ),
353
  choices=["pm", "dio", "crepe", "crepe-tiny", "harvest", "rmvpe"],
354
  value="rmvpe",
355
  interactive=True,
 
357
 
358
  extract_output_info = gr.Textbox(
359
  label=i18n("Output Information"),
360
+ info=i18n("The output information will be displayed here."),
361
  value="",
362
  max_lines=8,
363
  interactive=False,
 
378
  max_vram_gpu(0),
379
  step=1,
380
  label=i18n("Batch Size"),
381
+ info=i18n(
382
+ "It's advisable to align it with the available VRAM of your GPU. A setting of 4 offers improved accuracy but slower processing, while 8 provides faster and standard results."
383
+ ),
384
  interactive=True,
385
  )
386
  save_every_epoch = gr.Slider(
387
+ 1,
388
+ 100,
389
+ 10,
390
+ step=1,
391
+ label=i18n("Save Every Epoch"),
392
+ info=i18n("Determine at how many epochs the model will saved at."),
393
+ interactive=True,
394
  )
395
  total_epoch = gr.Slider(
396
+ 1,
397
+ 10000,
398
+ 500,
399
+ step=1,
400
+ label=i18n("Total Epoch"),
401
+ info=i18n(
402
+ "Specifies the overall quantity of epochs for the model training process."
403
+ ),
404
+ interactive=True,
405
  )
406
  with gr.Row():
407
  pitch_guidance = gr.Checkbox(
408
+ label=i18n("Pitch Guidance"),
409
+ info=i18n(
410
+ "By employing pitch guidance, it becomes feasible to mirror the intonation of the original voice, including its pitch. This feature is particularly valuable for singing and other scenarios where preserving the original melody or pitch pattern is essential."
411
+ ),
412
+ value=True,
413
+ interactive=True,
414
  )
415
  pretrained = gr.Checkbox(
416
+ label=i18n("Pretrained"),
417
+ info=i18n(
418
+ "Utilize pretrained models when training your own. This approach reduces training duration and enhances overall quality."
419
+ ),
420
+ value=True,
421
+ interactive=True,
422
  )
423
  save_only_latest = gr.Checkbox(
424
+ label=i18n("Save Only Latest"),
425
+ info=i18n(
426
+ "Enabling this setting will result in the G and D files saving only their most recent versions, effectively conserving storage space."
427
+ ),
428
+ value=False,
429
+ interactive=True,
430
  )
431
  save_every_weights = gr.Checkbox(
432
  label=i18n("Save Every Weights"),
433
+ info=i18n(
434
+ "This setting enables you to save the weights of the model at the conclusion of each epoch."
435
+ ),
436
  value=True,
437
  interactive=True,
438
  )
439
  custom_pretrained = gr.Checkbox(
440
+ label=i18n("Custom Pretrained"),
441
+ info=i18n(
442
+ "Utilizing custom pretrained models can lead to superior results, as selecting the most suitable pretrained models tailored to the specific use case can significantly enhance performance."
443
+ ),
444
+ value=False,
445
+ interactive=True,
446
  )
447
  multiple_gpu = gr.Checkbox(
448
+ label=i18n("GPU Settings"),
449
+ info=(
450
+ i18n(
451
+ "Sets advanced GPU settings, recommended for users with better GPU architecture."
452
+ )
453
+ ),
454
+ value=False,
455
+ interactive=True,
456
+ )
457
+ overtraining_detector = gr.Checkbox(
458
+ label=i18n("Overtraining Detector"),
459
+ info=i18n(
460
+ "Detect overtraining to prevent the model from learning the training data too well and losing the ability to generalize to new data."
461
+ ),
462
+ value=False,
463
+ interactive=True,
464
  )
465
 
466
  with gr.Row():
467
  with gr.Column(visible=False) as pretrained_custom_settings:
468
+ with gr.Accordion(i18n("Pretrained Custom Settings")):
469
  upload_pretrained = gr.File(
470
  label=i18n("Upload Pretrained Model"),
471
  type="filepath",
 
476
  )
477
  g_pretrained_path = gr.Dropdown(
478
  label=i18n("Custom Pretrained G"),
479
+ info=i18n(
480
+ "Select the custom pretrained model for the generator."
481
+ ),
482
  choices=sorted(pretraineds_list_g),
483
  interactive=True,
484
  allow_custom_value=True,
485
  )
486
  d_pretrained_path = gr.Dropdown(
487
  label=i18n("Custom Pretrained D"),
488
+ info=i18n(
489
+ "Select the custom pretrained model for the discriminator."
490
+ ),
491
  choices=sorted(pretraineds_list_d),
492
  interactive=True,
493
  allow_custom_value=True,
494
  )
495
  with gr.Column(visible=False) as gpu_custom_settings:
496
+ with gr.Accordion(i18n("GPU Settings")):
497
  gpu = gr.Textbox(
498
  label=i18n("GPU Number"),
499
+ info=i18n(
500
+ "Specify the number of GPUs you wish to utilize for training by entering them separated by hyphens (-)."
501
+ ),
502
  placeholder=i18n("0 to ∞ separated by -"),
503
  value="0",
504
  interactive=True,
505
  )
506
  gr.Textbox(
507
  label=i18n("GPU Information"),
508
+ info=i18n("The GPU information will be displayed here."),
509
  value=get_gpu_info(),
510
  interactive=False,
511
  )
512
+ with gr.Column(visible=False) as overtraining_settings:
513
+ with gr.Accordion(i18n("Overtraining Detector Settings")):
514
+ overtraining_threshold = gr.Slider(
515
+ 1,
516
+ 100,
517
+ 50,
518
+ step=1,
519
+ label=i18n("Overtraining Threshold"),
520
+ info=i18n(
521
+ "Set the maximum number of epochs you want your model to stop training if no improvement is detected."
522
+ ),
523
+ interactive=True,
524
+ )
525
 
526
  with gr.Row():
527
  train_output_info = gr.Textbox(
528
  label=i18n("Output Information"),
529
+ info=i18n("The output information will be displayed here."),
530
  value="",
531
  max_lines=8,
532
  interactive=False,
 
547
  batch_size,
548
  gpu,
549
  pitch_guidance,
550
+ overtraining_detector,
551
+ overtraining_threshold,
552
  pretrained,
553
  custom_pretrained,
554
  g_pretrained_path,
 
558
  api_name="start_training",
559
  )
560
 
561
+ stop_train_button = gr.Button(
562
+ i18n("Stop Training & Restart Applio"), visible=False
563
+ )
564
+ stop_train_button.click(
565
+ fn=restart_applio,
566
+ inputs=[],
567
+ outputs=[],
568
+ )
569
+
570
  index_button = gr.Button(i18n("Generate Index"))
571
  index_button.click(
572
  run_index_script,
 
575
  api_name="generate_index",
576
  )
577
 
578
+ with gr.Accordion(i18n("Export Model"), open=False):
579
+ if not os.name == "nt":
580
+ gr.Markdown(
581
+ i18n(
582
+ "The button 'Upload' is only for google colab: Uploads the exported files to the ApplioExported folder in your Google Drive."
583
+ )
584
+ )
585
+ with gr.Row():
586
+ with gr.Column():
587
+ pth_file_export = gr.File(
588
+ label=i18n("Exported Pth file"),
589
+ type="filepath",
590
+ value=None,
591
+ interactive=False,
592
+ )
593
+ pth_dropdown_export = gr.Dropdown(
594
+ label=i18n("Pth file"),
595
+ info=i18n("Select the pth file to be exported"),
596
+ choices=get_pth_list(),
597
+ value=None,
598
+ interactive=True,
599
+ allow_custom_value=True,
600
+ )
601
+ with gr.Column():
602
+ index_file_export = gr.File(
603
+ label=i18n("Exported Index File"),
604
+ type="filepath",
605
+ value=None,
606
+ interactive=False,
607
+ )
608
+ index_dropdown_export = gr.Dropdown(
609
+ label=i18n("Index File"),
610
+ info=i18n("Select the index file to be exported"),
611
+ choices=get_index_list(),
612
+ value=None,
613
+ interactive=True,
614
+ allow_custom_value=True,
615
+ )
616
+ with gr.Row():
617
+ with gr.Column():
618
+ refresh_export = gr.Button(i18n("Refresh"))
619
+ if not os.name == "nt":
620
+ upload_exported = gr.Button(i18n("Upload"), variant="primary")
621
+ upload_exported.click(
622
+ fn=upload_to_google_drive,
623
+ inputs=[pth_dropdown_export, index_dropdown_export],
624
+ outputs=[],
625
+ )
626
+
627
  def toggle_visible(checkbox):
628
  return {"visible": checkbox, "__type__": "update"}
629
 
630
+ def toggle_visible_hop_length(f0method):
631
+ if f0method == "crepe" or f0method == "crepe-tiny":
632
+ return {"visible": True, "__type__": "update"}
633
+ return {"visible": False, "__type__": "update"}
634
+
635
+ def toggle_pretrained(pretrained, custom_pretrained):
636
+ if custom_pretrained == False:
637
+ return {"visible": pretrained, "__type__": "update"}, {
638
+ "visible": False,
639
+ "__type__": "update",
640
+ }
641
+ else:
642
+ return {"visible": pretrained, "__type__": "update"}, {
643
+ "visible": pretrained,
644
+ "__type__": "update",
645
+ }
646
+
647
+ def enable_stop_train_button():
648
+ return {"visible": False, "__type__": "update"}, {
649
+ "visible": True,
650
+ "__type__": "update",
651
+ }
652
+
653
+ def disable_stop_train_button():
654
+ return {"visible": True, "__type__": "update"}, {
655
+ "visible": False,
656
+ "__type__": "update",
657
+ }
658
+
659
+ def download_prerequisites(version):
660
+ for remote_folder, file_list in pretraineds_v1:
661
+ local_folder = folder_mapping.get(remote_folder, "")
662
+ missing = False
663
+ for file in file_list:
664
+ destination_path = os.path.join(local_folder, file)
665
+ if not os.path.exists(destination_path):
666
+ missing = True
667
+ if version == "v1" and missing == True:
668
+ gr.Info(
669
+ "Downloading prerequisites... Please wait till it finishes to start preprocessing."
670
+ )
671
+ run_prerequisites_script("True", "False", "True", "True")
672
+ gr.Info(
673
+ "Prerequisites downloaded successfully, you may now start preprocessing."
674
+ )
675
+
676
+ rvc_version.change(
677
+ fn=download_prerequisites,
678
+ inputs=[rvc_version],
679
+ outputs=[],
680
+ )
681
+
682
+ refresh.click(
683
+ fn=refresh_models_and_datasets,
684
  inputs=[],
685
+ outputs=[model_name, dataset_path],
686
  )
687
 
688
  dataset_creator.change(
 
697
  outputs=[upload_audio_dataset, dataset_path],
698
  )
699
 
700
+ f0method.change(
701
+ fn=toggle_visible_hop_length,
702
+ inputs=[f0method],
703
+ outputs=[hop_length],
704
+ )
705
+
706
+ pretrained.change(
707
+ fn=toggle_pretrained,
708
+ inputs=[pretrained, custom_pretrained],
709
+ outputs=[custom_pretrained, pretrained_custom_settings],
710
+ )
711
+
712
  custom_pretrained.change(
713
  fn=toggle_visible,
714
  inputs=[custom_pretrained],
 
727
  outputs=[upload_pretrained],
728
  )
729
 
730
+ overtraining_detector.change(
731
+ fn=toggle_visible,
732
+ inputs=[overtraining_detector],
733
+ outputs=[overtraining_settings],
734
+ )
735
+
736
  multiple_gpu.change(
737
  fn=toggle_visible,
738
  inputs=[multiple_gpu],
739
  outputs=[gpu_custom_settings],
740
  )
741
+
742
+ train_button.click(
743
+ fn=enable_stop_train_button,
744
+ inputs=[],
745
+ outputs=[train_button, stop_train_button],
746
+ )
747
+
748
+ train_output_info.change(
749
+ fn=disable_stop_train_button,
750
+ inputs=[],
751
+ outputs=[train_button, stop_train_button],
752
+ )
753
+
754
+ pth_dropdown_export.change(
755
+ fn=export_pth,
756
+ inputs=[pth_dropdown_export],
757
+ outputs=[pth_file_export],
758
+ )
759
+
760
+ index_dropdown_export.change(
761
+ fn=export_index,
762
+ inputs=[index_dropdown_export],
763
+ outputs=[index_file_export],
764
+ )
765
+
766
+ refresh_export.click(
767
+ fn=refresh_pth_and_index_list,
768
+ inputs=[],
769
+ outputs=[pth_dropdown_export, index_dropdown_export],
770
+ )
tabs/tts/tts.py CHANGED
@@ -2,8 +2,6 @@ import os, sys
2
  import gradio as gr
3
  import regex as re
4
  import json
5
- import shutil
6
- import datetime
7
  import random
8
 
9
  from core import (
@@ -18,26 +16,7 @@ now_dir = os.getcwd()
18
  sys.path.append(now_dir)
19
 
20
  model_root = os.path.join(now_dir, "logs")
21
- audio_root = os.path.join(now_dir, "assets", "audios")
22
-
23
  model_root_relative = os.path.relpath(model_root, now_dir)
24
- audio_root_relative = os.path.relpath(audio_root, now_dir)
25
-
26
- sup_audioext = {
27
- "wav",
28
- "mp3",
29
- "flac",
30
- "ogg",
31
- "opus",
32
- "m4a",
33
- "mp4",
34
- "aac",
35
- "alac",
36
- "wma",
37
- "aiff",
38
- "webm",
39
- "ac3",
40
- }
41
 
42
  names = [
43
  os.path.join(root, file)
@@ -56,15 +35,6 @@ indexes_list = [
56
  if name.endswith(".index") and "trained" not in name
57
  ]
58
 
59
- audio_paths = [
60
- os.path.join(root, name)
61
- for root, _, files in os.walk(audio_root_relative, topdown=False)
62
- for name in files
63
- if name.endswith(tuple(sup_audioext))
64
- and root == audio_root_relative
65
- and "_output" not in name
66
- ]
67
-
68
 
69
  def change_choices():
70
  names = [
@@ -83,19 +53,9 @@ def change_choices():
83
  for name in files
84
  if name.endswith(".index") and "trained" not in name
85
  ]
86
-
87
- audio_paths = [
88
- os.path.join(root, name)
89
- for root, _, files in os.walk(audio_root_relative, topdown=False)
90
- for name in files
91
- if name.endswith(tuple(sup_audioext))
92
- and root == audio_root_relative
93
- and "_output" not in name
94
- ]
95
  return (
96
  {"choices": sorted(names), "__type__": "update"},
97
  {"choices": sorted(indexes_list), "__type__": "update"},
98
- {"choices": sorted(audio_paths), "__type__": "update"},
99
  )
100
 
101
 
@@ -110,93 +70,30 @@ def get_indexes():
110
  return indexes_list if indexes_list else ""
111
 
112
 
113
- def match_index(model_file: str) -> tuple:
114
- model_files_trip = re.sub(r"\.pth|\.onnx$", "", model_file)
115
- model_file_name = os.path.split(model_files_trip)[
116
- -1
117
- ] # Extract only the name, not the directory
118
-
119
- # Check if the sid0strip has the specific ending format _eXXX_sXXX
120
- if re.match(r".+_e\d+_s\d+$", model_file_name):
121
- base_model_name = model_file_name.rsplit("_", 2)[0]
122
- else:
123
- base_model_name = model_file_name
124
-
125
- sid_directory = os.path.join(model_root_relative, base_model_name)
126
- directories_to_search = [sid_directory] if os.path.exists(sid_directory) else []
127
- directories_to_search.append(model_root_relative)
128
-
129
- matching_index_files = []
130
-
131
- for directory in directories_to_search:
132
- for filename in os.listdir(directory):
133
- if filename.endswith(".index") and "trained" not in filename:
134
- # Condition to match the name
135
- name_match = any(
136
- name.lower() in filename.lower()
137
- for name in [model_file_name, base_model_name]
138
- )
139
-
140
- # If in the specific directory, it's automatically a match
141
- folder_match = directory == sid_directory
142
-
143
- if name_match or folder_match:
144
- index_path = os.path.join(directory, filename)
145
- if index_path in indexes_list:
146
- matching_index_files.append(
147
- (
148
- index_path,
149
- os.path.getsize(index_path),
150
- " " not in filename,
151
- )
152
- )
153
 
154
- if matching_index_files:
155
- # Sort by favoring files without spaces and by size (largest size first)
156
- matching_index_files.sort(key=lambda x: (-x[2], -x[1]))
157
- best_match_index_path = matching_index_files[0][0]
158
- return best_match_index_path
159
 
 
 
 
 
 
 
 
160
  return ""
161
 
162
 
163
- def save_to_wav(record_button):
164
- if record_button is None:
165
- pass
166
- else:
167
- path_to_file = record_button
168
- new_name = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + ".wav"
169
- target_path = os.path.join(audio_root_relative, os.path.basename(new_name))
170
-
171
- shutil.move(path_to_file, target_path)
172
- return target_path
173
-
174
-
175
- def save_to_wav2(upload_audio):
176
- file_path = upload_audio
177
- target_path = os.path.join(audio_root_relative, os.path.basename(file_path))
178
-
179
- if os.path.exists(target_path):
180
- os.remove(target_path)
181
-
182
- shutil.copy(file_path, target_path)
183
- return target_path
184
-
185
-
186
- def delete_outputs():
187
- for root, _, files in os.walk(audio_root_relative, topdown=False):
188
- for name in files:
189
- if name.endswith(tuple(sup_audioext)) and name.__contains__("_output"):
190
- os.remove(os.path.join(root, name))
191
- gr.Info(f"Outputs cleared!")
192
-
193
-
194
  def tts_tab():
195
  default_weight = random.choice(names) if names else ""
196
  with gr.Row():
197
  with gr.Row():
198
  model_file = gr.Dropdown(
199
  label=i18n("Voice Model"),
 
200
  choices=sorted(names, key=lambda path: os.path.getsize(path)),
201
  interactive=True,
202
  value=default_weight,
@@ -205,6 +102,7 @@ def tts_tab():
205
  best_default_index_path = match_index(model_file.value)
206
  index_file = gr.Dropdown(
207
  label=i18n("Index File"),
 
208
  choices=get_indexes(),
209
  value=best_default_index_path,
210
  interactive=True,
@@ -215,13 +113,16 @@ def tts_tab():
215
  unload_button = gr.Button(i18n("Unload Voice"))
216
 
217
  unload_button.click(
218
- fn=lambda: ({"value": "", "__type__": "update"}),
 
 
 
219
  inputs=[],
220
- outputs=[model_file],
221
  )
222
 
223
  model_file.select(
224
- fn=match_index,
225
  inputs=[model_file],
226
  outputs=[index_file],
227
  )
@@ -234,6 +135,7 @@ def tts_tab():
234
 
235
  tts_voice = gr.Dropdown(
236
  label=i18n("TTS Voices"),
 
237
  choices=short_names,
238
  interactive=True,
239
  value=None,
@@ -241,10 +143,16 @@ def tts_tab():
241
 
242
  tts_text = gr.Textbox(
243
  label=i18n("Text to Synthesize"),
 
244
  placeholder=i18n("Enter text to synthesize"),
245
  lines=3,
246
  )
247
 
 
 
 
 
 
248
  with gr.Accordion(i18n("Advanced Settings"), open=False):
249
  with gr.Column():
250
  output_tts_path = gr.Textbox(
@@ -253,27 +161,74 @@ def tts_tab():
253
  value=os.path.join(now_dir, "assets", "audios", "tts_output.wav"),
254
  interactive=True,
255
  )
256
-
257
  output_rvc_path = gr.Textbox(
258
  label=i18n("Output Path for RVC Audio"),
259
  placeholder=i18n("Enter output path"),
260
  value=os.path.join(now_dir, "assets", "audios", "tts_rvc_output.wav"),
261
  interactive=True,
262
  )
263
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
264
  pitch = gr.Slider(
265
  minimum=-24,
266
  maximum=24,
267
  step=1,
268
  label=i18n("Pitch"),
 
 
 
269
  value=0,
270
  interactive=True,
271
  )
272
  filter_radius = gr.Slider(
273
  minimum=0,
274
  maximum=7,
275
- label=i18n(
276
- "If >=3: apply median filtering to the harvested pitch results. The value represents the filter radius and can reduce breathiness"
 
277
  ),
278
  value=3,
279
  step=1,
@@ -283,43 +238,90 @@ def tts_tab():
283
  minimum=0,
284
  maximum=1,
285
  label=i18n("Search Feature Ratio"),
 
 
 
286
  value=0.75,
287
  interactive=True,
288
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
289
  hop_length = gr.Slider(
290
  minimum=1,
291
  maximum=512,
292
  step=1,
293
  label=i18n("Hop Length"),
 
 
 
294
  value=128,
295
  interactive=True,
296
  )
297
- with gr.Column():
298
- f0method = gr.Radio(
299
- label=i18n("Pitch extraction algorithm"),
300
- choices=[
301
- "pm",
302
- "harvest",
303
- "dio",
304
- "crepe",
305
- "crepe-tiny",
306
- "rmvpe",
307
- ],
308
- value="rmvpe",
309
- interactive=True,
310
- )
 
 
 
 
 
311
 
312
  convert_button1 = gr.Button(i18n("Convert"))
313
 
314
  with gr.Row(): # Defines output info + output audio download after conversion
315
- vc_output1 = gr.Textbox(label=i18n("Output Information"))
 
 
 
316
  vc_output2 = gr.Audio(label=i18n("Export Audio"))
317
 
 
 
 
 
 
 
 
 
318
  refresh_button.click(
319
  fn=change_choices,
320
  inputs=[],
321
  outputs=[model_file, index_file],
322
  )
 
 
 
 
 
323
  convert_button1.click(
324
  fn=run_tts_script,
325
  inputs=[
@@ -328,12 +330,19 @@ def tts_tab():
328
  pitch,
329
  filter_radius,
330
  index_rate,
 
 
331
  hop_length,
332
  f0method,
333
  output_tts_path,
334
  output_rvc_path,
335
  model_file,
336
  index_file,
 
 
 
 
 
337
  ],
338
  outputs=[vc_output1, vc_output2],
339
  )
 
2
  import gradio as gr
3
  import regex as re
4
  import json
 
 
5
  import random
6
 
7
  from core import (
 
16
  sys.path.append(now_dir)
17
 
18
  model_root = os.path.join(now_dir, "logs")
 
 
19
  model_root_relative = os.path.relpath(model_root, now_dir)
 
 
 
 
 
20
 
21
  names = [
22
  os.path.join(root, file)
 
35
  if name.endswith(".index") and "trained" not in name
36
  ]
37
 
 
 
 
 
 
 
 
 
 
38
 
39
  def change_choices():
40
  names = [
 
53
  for name in files
54
  if name.endswith(".index") and "trained" not in name
55
  ]
 
 
 
 
 
 
 
 
 
56
  return (
57
  {"choices": sorted(names), "__type__": "update"},
58
  {"choices": sorted(indexes_list), "__type__": "update"},
 
59
  )
60
 
61
 
 
70
  return indexes_list if indexes_list else ""
71
 
72
 
73
+ def process_input(file_path):
74
+ with open(file_path, "r") as file:
75
+ file_contents = file.read()
76
+ gr.Info(f"The text from the txt file has been loaded!")
77
+ return file_contents, None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
 
 
 
 
 
 
79
 
80
+ def match_index(model_file_value):
81
+ if model_file_value:
82
+ model_folder = os.path.dirname(model_file_value)
83
+ index_files = get_indexes()
84
+ for index_file in index_files:
85
+ if os.path.dirname(index_file) == model_folder:
86
+ return index_file
87
  return ""
88
 
89
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
  def tts_tab():
91
  default_weight = random.choice(names) if names else ""
92
  with gr.Row():
93
  with gr.Row():
94
  model_file = gr.Dropdown(
95
  label=i18n("Voice Model"),
96
+ info=i18n("Select the voice model to use for the conversion."),
97
  choices=sorted(names, key=lambda path: os.path.getsize(path)),
98
  interactive=True,
99
  value=default_weight,
 
102
  best_default_index_path = match_index(model_file.value)
103
  index_file = gr.Dropdown(
104
  label=i18n("Index File"),
105
+ info=i18n("Select the index file to use for the conversion."),
106
  choices=get_indexes(),
107
  value=best_default_index_path,
108
  interactive=True,
 
113
  unload_button = gr.Button(i18n("Unload Voice"))
114
 
115
  unload_button.click(
116
+ fn=lambda: (
117
+ {"value": "", "__type__": "update"},
118
+ {"value": "", "__type__": "update"},
119
+ ),
120
  inputs=[],
121
+ outputs=[model_file, index_file],
122
  )
123
 
124
  model_file.select(
125
+ fn=lambda model_file_value: match_index(model_file_value),
126
  inputs=[model_file],
127
  outputs=[index_file],
128
  )
 
135
 
136
  tts_voice = gr.Dropdown(
137
  label=i18n("TTS Voices"),
138
+ info=i18n("Select the TTS voice to use for the conversion."),
139
  choices=short_names,
140
  interactive=True,
141
  value=None,
 
143
 
144
  tts_text = gr.Textbox(
145
  label=i18n("Text to Synthesize"),
146
+ info=i18n("Enter the text to synthesize."),
147
  placeholder=i18n("Enter text to synthesize"),
148
  lines=3,
149
  )
150
 
151
+ txt_file = gr.File(
152
+ label=i18n("Or you can upload a .txt file"),
153
+ type="filepath",
154
+ )
155
+
156
  with gr.Accordion(i18n("Advanced Settings"), open=False):
157
  with gr.Column():
158
  output_tts_path = gr.Textbox(
 
161
  value=os.path.join(now_dir, "assets", "audios", "tts_output.wav"),
162
  interactive=True,
163
  )
 
164
  output_rvc_path = gr.Textbox(
165
  label=i18n("Output Path for RVC Audio"),
166
  placeholder=i18n("Enter output path"),
167
  value=os.path.join(now_dir, "assets", "audios", "tts_rvc_output.wav"),
168
  interactive=True,
169
  )
170
+ export_format = gr.Radio(
171
+ label=i18n("Export Format"),
172
+ info=i18n("Select the format to export the audio."),
173
+ choices=["WAV", "MP3", "FLAC", "OGG", "M4A"],
174
+ value="WAV",
175
+ interactive=True,
176
+ )
177
+ split_audio = gr.Checkbox(
178
+ label=i18n("Split Audio"),
179
+ info=i18n(
180
+ "Split the audio into chunks for inference to obtain better results in some cases."
181
+ ),
182
+ visible=True,
183
+ value=False,
184
+ interactive=True,
185
+ )
186
+ autotune = gr.Checkbox(
187
+ label=i18n("Autotune"),
188
+ info=i18n(
189
+ "Apply a soft autotune to your inferences, recommended for singing conversions."
190
+ ),
191
+ visible=True,
192
+ value=False,
193
+ interactive=True,
194
+ )
195
+ clean_audio = gr.Checkbox(
196
+ label=i18n("Clean Audio"),
197
+ info=i18n(
198
+ "Clean your audio output using noise detection algorithms, recommended for speaking audios."
199
+ ),
200
+ visible=True,
201
+ value=True,
202
+ interactive=True,
203
+ )
204
+ clean_strength = gr.Slider(
205
+ minimum=0,
206
+ maximum=1,
207
+ label=i18n("Clean Strength"),
208
+ info=i18n(
209
+ "Set the clean-up level to the audio you want, the more you increase it the more it will clean up, but it is possible that the audio will be more compressed."
210
+ ),
211
+ visible=True,
212
+ value=0.5,
213
+ interactive=True,
214
+ )
215
  pitch = gr.Slider(
216
  minimum=-24,
217
  maximum=24,
218
  step=1,
219
  label=i18n("Pitch"),
220
+ info=i18n(
221
+ "Set the pitch of the audio, the higher the value, the higher the pitch."
222
+ ),
223
  value=0,
224
  interactive=True,
225
  )
226
  filter_radius = gr.Slider(
227
  minimum=0,
228
  maximum=7,
229
+ label=i18n("Filter Radius"),
230
+ info=i18n(
231
+ "If the number is greater than or equal to three, employing median filtering on the collected tone results has the potential to decrease respiration."
232
  ),
233
  value=3,
234
  step=1,
 
238
  minimum=0,
239
  maximum=1,
240
  label=i18n("Search Feature Ratio"),
241
+ info=i18n(
242
+ "Influence exerted by the index file; a higher value corresponds to greater influence. However, opting for lower values can help mitigate artifacts present in the audio."
243
+ ),
244
  value=0.75,
245
  interactive=True,
246
  )
247
+ rms_mix_rate = gr.Slider(
248
+ minimum=0,
249
+ maximum=1,
250
+ label=i18n("Volume Envelope"),
251
+ info=i18n(
252
+ "Substitute or blend with the volume envelope of the output. The closer the ratio is to 1, the more the output envelope is employed."
253
+ ),
254
+ value=1,
255
+ interactive=True,
256
+ )
257
+ protect = gr.Slider(
258
+ minimum=0,
259
+ maximum=0.5,
260
+ label=i18n("Protect Voiceless Consonants"),
261
+ info=i18n(
262
+ "Safeguard distinct consonants and breathing sounds to prevent electro-acoustic tearing and other artifacts. Pulling the parameter to its maximum value of 0.5 offers comprehensive protection. However, reducing this value might decrease the extent of protection while potentially mitigating the indexing effect."
263
+ ),
264
+ value=0.5,
265
+ interactive=True,
266
+ )
267
  hop_length = gr.Slider(
268
  minimum=1,
269
  maximum=512,
270
  step=1,
271
  label=i18n("Hop Length"),
272
+ info=i18n(
273
+ "Denotes the duration it takes for the system to transition to a significant pitch change. Smaller hop lengths require more time for inference but tend to yield higher pitch accuracy."
274
+ ),
275
  value=128,
276
  interactive=True,
277
  )
278
+ with gr.Column():
279
+ f0method = gr.Radio(
280
+ label=i18n("Pitch extraction algorithm"),
281
+ info=i18n(
282
+ "Pitch extraction algorithm to use for the audio conversion. The default algorithm is rmvpe, which is recommended for most cases."
283
+ ),
284
+ choices=[
285
+ "pm",
286
+ "harvest",
287
+ "dio",
288
+ "crepe",
289
+ "crepe-tiny",
290
+ "rmvpe",
291
+ "fcpe",
292
+ "hybrid[rmvpe+fcpe]",
293
+ ],
294
+ value="rmvpe",
295
+ interactive=True,
296
+ )
297
 
298
  convert_button1 = gr.Button(i18n("Convert"))
299
 
300
  with gr.Row(): # Defines output info + output audio download after conversion
301
+ vc_output1 = gr.Textbox(
302
+ label=i18n("Output Information"),
303
+ info=i18n("The output information will be displayed here."),
304
+ )
305
  vc_output2 = gr.Audio(label=i18n("Export Audio"))
306
 
307
+ def toggle_visible(checkbox):
308
+ return {"visible": checkbox, "__type__": "update"}
309
+
310
+ clean_audio.change(
311
+ fn=toggle_visible,
312
+ inputs=[clean_audio],
313
+ outputs=[clean_strength],
314
+ )
315
  refresh_button.click(
316
  fn=change_choices,
317
  inputs=[],
318
  outputs=[model_file, index_file],
319
  )
320
+ txt_file.upload(
321
+ fn=process_input,
322
+ inputs=[txt_file],
323
+ outputs=[tts_text, txt_file],
324
+ )
325
  convert_button1.click(
326
  fn=run_tts_script,
327
  inputs=[
 
330
  pitch,
331
  filter_radius,
332
  index_rate,
333
+ rms_mix_rate,
334
+ protect,
335
  hop_length,
336
  f0method,
337
  output_tts_path,
338
  output_rvc_path,
339
  model_file,
340
  index_file,
341
+ split_audio,
342
+ autotune,
343
+ clean_audio,
344
+ clean_strength,
345
+ export_format,
346
  ],
347
  outputs=[vc_output1, vc_output2],
348
  )
tabs/voice_blender/voice_blender.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, sys
2
+ import gradio as gr
3
+ import shutil
4
+
5
+ now_dir = os.getcwd()
6
+ sys.path.append(now_dir)
7
+
8
+ from assets.i18n.i18n import I18nAuto
9
+ from core import run_model_blender_script
10
+
11
+ i18n = I18nAuto()
12
+
13
+
14
+ def update_model_fusion(dropbox):
15
+ return dropbox, None
16
+
17
+
18
+ def voice_blender_tab():
19
+ gr.Markdown(i18n("## Voice Blender"))
20
+ gr.Markdown(
21
+ i18n(
22
+ "Select two voice models, set your desired blend percentage, and blend them into an entirely new voice."
23
+ )
24
+ )
25
+ with gr.Column():
26
+ model_fusion_name = gr.Textbox(
27
+ label=i18n("Model Name"),
28
+ info=i18n("Name of the new model."),
29
+ value="",
30
+ max_lines=1,
31
+ interactive=True,
32
+ placeholder=i18n("Enter model name"),
33
+ )
34
+ with gr.Row():
35
+ with gr.Column():
36
+ model_fusion_a_dropbox = gr.File(
37
+ label=i18n("Drag and drop your model here"), type="filepath"
38
+ )
39
+ model_fusion_a = gr.Textbox(
40
+ label=i18n("Path to Model"),
41
+ value="",
42
+ interactive=True,
43
+ placeholder=i18n("Enter path to model"),
44
+ info=i18n("You can also use a custom path."),
45
+ )
46
+ with gr.Column():
47
+ model_fusion_b_dropbox = gr.File(
48
+ label=i18n("Drag and drop your model here"), type="filepath"
49
+ )
50
+ model_fusion_b = gr.Textbox(
51
+ label=i18n("Path to Model"),
52
+ value="",
53
+ interactive=True,
54
+ placeholder=i18n("Enter path to model"),
55
+ info=i18n("You can also use a custom path."),
56
+ )
57
+ alpha_a = gr.Slider(
58
+ minimum=0,
59
+ maximum=1,
60
+ label=i18n("Blend Ratio"),
61
+ value=0.5,
62
+ interactive=True,
63
+ info=i18n(
64
+ "Adjusting the position more towards one side or the other will make the model more similar to the first or second."
65
+ ),
66
+ )
67
+ model_fusion_button = gr.Button(i18n("Fusion"), variant="primary")
68
+ with gr.Row():
69
+ model_fusion_output_info = gr.Textbox(
70
+ label=i18n("Output Information"),
71
+ info=i18n("The output information will be displayed here."),
72
+ value="",
73
+ )
74
+ model_fusion_pth_output = gr.File(
75
+ label=i18n("Download Model"), type="filepath", interactive=False
76
+ )
77
+
78
+ model_fusion_button.click(
79
+ fn=run_model_blender_script,
80
+ inputs=[
81
+ model_fusion_name,
82
+ model_fusion_a,
83
+ model_fusion_b,
84
+ alpha_a,
85
+ ],
86
+ outputs=[model_fusion_output_info, model_fusion_pth_output],
87
+ )
88
+
89
+ model_fusion_a_dropbox.upload(
90
+ fn=update_model_fusion,
91
+ inputs=model_fusion_a_dropbox,
92
+ outputs=[model_fusion_a, model_fusion_a_dropbox],
93
+ )
94
+
95
+ model_fusion_b_dropbox.upload(
96
+ fn=update_model_fusion,
97
+ inputs=model_fusion_b_dropbox,
98
+ outputs=[model_fusion_b, model_fusion_b_dropbox],
99
+ )