diff --git a/.gitignore b/.gitignore index dde3895fc112ad34a839b2fed9210ac2288a959b..9ab76a12bc96eff64a46cc52cd9a22f7aa9ae58f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ .DS_Store *.pyc +__pycache__ diff --git a/README.md b/README.md index b3dfbd275aa433bc627da1fb3db77ab694736732..bd0ea5ec405147f732262cd1d65ef33a204a8dd9 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ --- -title: RVC Trainer +title: ZeroRVC emoji: 🦀 colorFrom: gray colorTo: gray @@ -9,4 +9,6 @@ app_file: app.py pinned: false --- -Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference +# ZeroRVC + +Run Retrieval-based Voice Conversion training and inference on HuggingFace ZeroGPU. diff --git a/app.py b/app.py index 084948dca95c5841b84ed94c694089eda5f19a78..92c14588e9a58dad43c47f210415cf359bcd3aa5 100644 --- a/app.py +++ b/app.py @@ -1,11 +1,12 @@ +from typing import Tuple +from prelude import prelude + +prelude() + import os import traceback - import numpy as np from sklearn.cluster import MiniBatchKMeans - -os.environ["PYTORCH_JIT"] = "0v" - from random import shuffle import gradio as gr import zipfile @@ -18,23 +19,12 @@ from infer.modules.train.extract.extract_f0_rmvpe import FeatureInput from infer.modules.train.extract_feature_print import HubertFeatureExtractor from infer.modules.train.train import train from infer.lib.train.process_ckpt import extract_small_model +from infer.modules.vc.modules import VC +from configs.config import Config +import demucs.separate +import soundfile as sf from zero import zero - -# patch for jit script -# if we find `def expand_2d_or_3d_tensor(x,` in /usr/local/lib/python3.10/site-packages/fairseq/models/model_utils.py -# patch it with `def expand_2d_or_3d_tensor(x: Tensor,` -FAIRSEQ_CODE = "/usr/local/lib/python3.10/site-packages/fairseq/models/model_utils.py" -if os.path.exists(FAIRSEQ_CODE): - with open(FAIRSEQ_CODE, "r") as f: - lines = f.readlines() - with open(FAIRSEQ_CODE, "w") as f: - for line in lines: - if "def expand_2d_or_3d_tensor(x, trg_dim: int, padding_idx: int):" in line: - f.write( - "def expand_2d_or_3d_tensor(x: Tensor, trg_dim: int, padding_idx: int) -> Tensor:\n" - ) - else: - f.write(line) +from model import device def extract_audio_files(zip_file: str, target_dir: str) -> list[str]: @@ -189,13 +179,15 @@ def download_weight(exp_dir: str) -> str: raise gr.Error("No model found") latest_model = max(models, key=os.path.getctime) + print(f"Latest model: {latest_model}") name = os.path.basename(exp_dir) + out = os.path.join(exp_dir, f"{name}.pth") extract_small_model( - latest_model, name, "40k", True, "Model trained by ZeroGPU.", "v2" + latest_model, out, "40k", True, "Model trained by ZeroGPU.", "v2" ) - return "assets/weights/%s.pth" % name + return out def train_index(exp_dir: str) -> str: @@ -269,9 +261,70 @@ def restore_expdir(zip: str) -> str: return exp_dir +@zero(duration=120) +def infer(exp_dir: str, original_audio: str, f0add: int) -> Tuple[int, np.ndarray]: + name = os.path.basename(exp_dir) + model = os.path.join(exp_dir, f"{name}.pth") + if not os.path.exists(model): + raise gr.Error("Model not found") + + index = glob(f"{exp_dir}/added_*.index") + if not index: + raise gr.Error("Index not found") + + base = os.path.basename(original_audio) + base = os.path.splitext(base)[0] + demucs.separate.main( + ["--two-stems", "vocals", "-d", str(device), "-n", "htdemucs", original_audio] + ) + out = os.path.join("separated", "htdemucs", base, "vocals.wav") + + cfg = Config() + vc = VC(cfg) + 
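# Descriptive gloss of the conversion step below, based on the visible call sites: + # get_vc() loads the small {name}.pth extracted after training, and vc_single() + # converts the demucs-separated vocal track using the rmvpe pitch estimator, the + # trained feature index (added_*.index, index rate 0.5), and the f0add transpose + # chosen in the UI. +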
vc.get_vc(model) + _, wav_opt = vc.vc_single( + 0, + out, + f0add, + None, + "rmvpe", + index, + None, + 0.5, + 3, + 0, + 1, + 0.33, + ) + + sr = wav_opt[0] + data = wav_opt[1] + + return sr, data + + +def merge(exp_dir: str, original_audio: str, vocal: Tuple[int, np.ndarray]) -> str: + base = os.path.basename(original_audio) + base = os.path.splitext(base)[0] + music = os.path.join("separated", "htdemucs", base, "no-vocals.wav") + + tmp = os.path.join(exp_dir, "tmp.wav") + sf.write(tmp, vocal[1], vocal[0]) + + os.system( + f"ffmpeg -i {music} -i {tmp} -filter_complex '[1]volume=2[a];[0][a]amix=inputs=2:duration=first:dropout_transition=2' {tmp}.merged.mp3" + ) + + return f"{tmp}.merged.mp3" + + with gr.Blocks() as app: # allow user to manually select the experiment directory - exp_dir = gr.Textbox(label="Experiment directory (don't touch it unless you know what you are doing)", visible=True, interactive=True) + exp_dir = gr.Textbox( + label="Experiment directory (don't touch it unless you know what you are doing)", + visible=True, + interactive=True, + ) with gr.Tabs(): with gr.Tab(label="New / Restore"): @@ -284,10 +337,10 @@ with gr.Blocks() as app: preprocess_output = gr.Textbox( label="Preprocessing output", lines=5 ) - with gr.Column(): - preprocess_btn = gr.Button( - value="Start New Experiment", variant="primary" - ) + + preprocess_btn = gr.Button( + value="Start New Experiment", variant="primary" + ) with gr.Row(): restore_zip_file = gr.File( @@ -327,6 +380,26 @@ with gr.Blocks() as app: ) download_expdir_output = gr.File(label="Download experiment directory") + with gr.Tab(label="Inference"): + with gr.Row(): + original_audio = gr.Audio( + label="Upload original audio", + type="filepath", + show_download_button=True, + ) + f0add = gr.Slider( + label="F0 add", + minimum=-16, + maximum=16, + step=1, + value=0, + ) + infer_btn = gr.Button(value="Infer", variant="primary") + with gr.Row(): + infer_output = gr.Audio(label="Inferred audio") + with gr.Row(): + merge_output = gr.Audio(label="Merged audio") + preprocess_btn.click( fn=preprocess, inputs=[zip_file], @@ -343,6 +416,10 @@ with gr.Blocks() as app: fn=train_model, inputs=[exp_dir], outputs=[latest_model], + ).success( + fn=train_model, + inputs=[exp_dir], + outputs=[latest_model], ) train_index_btn.click( @@ -369,4 +446,14 @@ with gr.Blocks() as app: outputs=[exp_dir], ) + infer_btn.click( + fn=infer, + inputs=[exp_dir, original_audio, f0add], + outputs=[infer_output], + ).success( + fn=merge, + inputs=[exp_dir, original_audio, infer_output], + outputs=[merge_output], + ) + app.launch() diff --git a/assets/pretrained_v2/D40k.pth b/assets/pretrained_v2/D40k.pth deleted file mode 100644 index 6d13aea9208310573b59309a9c80310ef71c5547..0000000000000000000000000000000000000000 --- a/assets/pretrained_v2/D40k.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:471378e894e7191f89a94eda8288c5947b16bbe0b10c3f1f17efdb7a1d998242 -size 142875703 diff --git a/assets/pretrained_v2/G40k.pth b/assets/pretrained_v2/G40k.pth deleted file mode 100644 index ee39bf64a1fc1d0d8154e242a3b60ef3e2abf0ca..0000000000000000000000000000000000000000 --- a/assets/pretrained_v2/G40k.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a3843da7fde33db1dab176146c70d6c2df06eafe9457f4e3aa10024e9c6a4b69 -size 72959671 diff --git a/config.json b/config.json index 5ce52f4aa8b16e161e1edb4e57f4d8aeec835e74..7c98e5bad3607000dc35dccf0288b38f3a29b0cf 100644 --- a/config.json +++ b/config.json @@ 
-67,7 +67,7 @@ "c_mel": 45, "epochs": 20000, "eps": 1e-09, - "fp16_run": false, + "fp16_run": true, "init_lr_ratio": 1, "learning_rate": 0.0001, "log_interval": 200, diff --git a/configs/config.py b/configs/config.py new file mode 100644 index 0000000000000000000000000000000000000000..c3c16599d94254d9ff04ba38e0a3757af5ed06f3 --- /dev/null +++ b/configs/config.py @@ -0,0 +1,245 @@ +import argparse +import os +import sys +import json +import shutil +from multiprocessing import cpu_count + +import torch +import logging +from model import device, fp16 + +logger = logging.getLogger(__name__) + + +version_config_list = [ + "v1/32k.json", + "v1/40k.json", + "v1/48k.json", + "v2/48k.json", + "v2/32k.json", +] + + +def singleton_variable(func): + def wrapper(*args, **kwargs): + if not wrapper.instance: + wrapper.instance = func(*args, **kwargs) + return wrapper.instance + + wrapper.instance = None + return wrapper + + +@singleton_variable +class Config: + def __init__(self): + self.device = str(device) + self.is_half = fp16 + self.use_jit = False + self.n_cpu = 0 + self.gpu_name = None + self.json_config = self.load_config_json() + self.gpu_mem = None + ( + self.python_cmd, + self.listen_port, + self.iscolab, + self.noparallel, + self.noautoopen, + self.dml, + ) = self.arg_parse() + self.instead = "" + self.preprocess_per = 3.7 + self.x_pad, self.x_query, self.x_center, self.x_max = self.device_config() + + @staticmethod + def load_config_json() -> dict: + d = {} + # for config_file in version_config_list: + # p = f"configs/inuse/{config_file}" + # if not os.path.exists(p): + # shutil.copy(f"configs/{config_file}", p) + # with open(f"configs/inuse/{config_file}", "r") as f: + # d[config_file] = json.load(f) + return d + + @staticmethod + def arg_parse() -> tuple: + exe = sys.executable or "python" + parser = argparse.ArgumentParser() + parser.add_argument("--port", type=int, default=7865, help="Listen port") + parser.add_argument("--pycmd", type=str, default=exe, help="Python command") + parser.add_argument("--colab", action="store_true", help="Launch in colab") + parser.add_argument( + "--noparallel", action="store_true", help="Disable parallel processing" + ) + parser.add_argument( + "--noautoopen", + action="store_true", + help="Do not open in browser automatically", + ) + parser.add_argument( + "--dml", + action="store_true", + help="Use the DirectML backend (torch_directml)", + ) + cmd_opts = parser.parse_args() + + cmd_opts.port = cmd_opts.port if 0 <= cmd_opts.port <= 65535 else 7865 + + return ( + cmd_opts.pycmd, + cmd_opts.port, + cmd_opts.colab, + cmd_opts.noparallel, + cmd_opts.noautoopen, + cmd_opts.dml, + ) + + # has_mps is only available in nightly pytorch (for now) and macOS 12.3+.
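+ # Note: is_available() can return True on builds that still fail at the first + # allocation, so has_mps() also probes with torch.zeros(1).to("mps").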
+ # check `getattr` and try it for compatibility + @staticmethod + def has_mps() -> bool: + if not torch.backends.mps.is_available(): + return False + try: + torch.zeros(1).to(torch.device("mps")) + return True + except Exception: + return False + + @staticmethod + def has_xpu() -> bool: + if hasattr(torch, "xpu") and torch.xpu.is_available(): + return True + else: + return False + + def use_fp32_config(self): + for config_file in version_config_list: + # json_config is an empty stub here (see load_config_json), so guard + # the lookup and skip configs that were never copied to configs/inuse + if config_file in self.json_config: + self.json_config[config_file]["train"]["fp16_run"] = False + p = f"configs/inuse/{config_file}" + if not os.path.exists(p): + continue + with open(p, "r") as f: + strr = f.read().replace("true", "false") + with open(p, "w") as f: + f.write(strr) + logger.info("overwrite " + config_file) + self.preprocess_per = 3.0 + logger.info("overwrite preprocess_per to %s" % (self.preprocess_per)) + + def device_config(self) -> tuple: + if torch.cuda.is_available(): + if self.has_xpu(): + self.device = self.instead = "xpu:0" + self.is_half = True + i_device = int(self.device.split(":")[-1]) + self.gpu_name = torch.cuda.get_device_name(i_device) + if ( + ("16" in self.gpu_name and "V100" not in self.gpu_name.upper()) + or "P40" in self.gpu_name.upper() + or "P10" in self.gpu_name.upper() + or "1060" in self.gpu_name + or "1070" in self.gpu_name + or "1080" in self.gpu_name + ): + logger.info("Found GPU %s, force to fp32", self.gpu_name) + self.is_half = False + self.use_fp32_config() + else: + logger.info("Found GPU %s", self.gpu_name) + self.gpu_mem = int( + torch.cuda.get_device_properties(i_device).total_memory + / 1024 + / 1024 + / 1024 + + 0.4 + ) + if self.gpu_mem <= 4: + self.preprocess_per = 3.0 + elif self.has_mps(): + logger.info("No supported Nvidia GPU found") + self.device = self.instead = "mps" + self.is_half = False + self.use_fp32_config() + else: + logger.info("No supported Nvidia GPU found") + self.device = self.instead = "cpu" + self.is_half = False + self.use_fp32_config() + + if self.n_cpu == 0: + self.n_cpu = cpu_count() + + if self.is_half: + # 6 GB VRAM configuration + x_pad = 3 + x_query = 10 + x_center = 60 + x_max = 65 + else: + # 5 GB VRAM configuration + x_pad = 1 + x_query = 6 + x_center = 38 + x_max = 41 + + if self.gpu_mem is not None and self.gpu_mem <= 4: + x_pad = 1 + x_query = 5 + x_center = 30 + x_max = 32 + if self.dml: + logger.info("Use DirectML instead") + if not os.path.exists( + r"runtime\Lib\site-packages\onnxruntime\capi\DirectML.dll" + ): + try: + os.rename( + r"runtime\Lib\site-packages\onnxruntime", + r"runtime\Lib\site-packages\onnxruntime-cuda", + ) + except OSError: + pass + try: + os.rename( + r"runtime\Lib\site-packages\onnxruntime-dml", + r"runtime\Lib\site-packages\onnxruntime", + ) + except OSError: + pass + # if self.device != "cpu": + import torch_directml + + self.device = torch_directml.device(torch_directml.default_device()) + self.is_half = False + else: + if self.instead: + logger.info(f"Use {self.instead} instead") + if not os.path.exists( + r"runtime\Lib\site-packages\onnxruntime\capi\onnxruntime_providers_cuda.dll" + ): + try: + os.rename( + r"runtime\Lib\site-packages\onnxruntime", + r"runtime\Lib\site-packages\onnxruntime-dml", + ) + except OSError: + pass + try: + os.rename( + r"runtime\Lib\site-packages\onnxruntime-cuda", + r"runtime\Lib\site-packages\onnxruntime", + ) + except OSError: + pass + logger.info( + "Half-precision floating-point: %s, device: %s" + % (self.is_half, self.device) + ) + return x_pad, x_query, x_center, x_max diff --git a/infer/lib/audio.py b/infer/lib/audio.py index
90c825e82a6ba8b7d511e5d07d171d058de452aa..d43e5d033275cc9f8159a8470efa2180105a576a 100644 --- a/infer/lib/audio.py +++ b/infer/lib/audio.py @@ -1,8 +1,8 @@ import platform, os +import traceback import ffmpeg import numpy as np import av -from io import BytesIO def wav2(i, o, format): diff --git a/infer/lib/rmvpe.py b/infer/lib/rmvpe.py index 86c6899e3a4c55fc5cef8f195e994e026aa1345a..6a46ac80a38ca6766f883deeea1c1d4a5a097b6a 100644 --- a/infer/lib/rmvpe.py +++ b/infer/lib/rmvpe.py @@ -1,24 +1,14 @@ from io import BytesIO import os -from typing import List, Optional, Tuple +from typing import List import numpy as np import torch from infer.lib import jit -try: - # Fix "Torch not compiled with CUDA enabled" - import intel_extension_for_pytorch as ipex # pylint: disable=import-error, unused-import - - if torch.xpu.is_available(): - from infer.modules.ipex import ipex_init - - ipex_init() -except Exception: # pylint: disable=broad-exception-caught - pass import torch.nn as nn import torch.nn.functional as F -from librosa.util import normalize, pad_center, tiny +from librosa.util import pad_center from scipy.signal import get_window import logging diff --git a/infer/lib/train/process_ckpt.py b/infer/lib/train/process_ckpt.py index 2529ccf6fb05935258af44bf9f3aa204532696ba..3f131e1e5a95adc2cf0eac2b503c8492b5bbf351 100644 --- a/infer/lib/train/process_ckpt.py +++ b/infer/lib/train/process_ckpt.py @@ -61,7 +61,7 @@ def show_info(path): return traceback.format_exc() -def extract_small_model(path, name, sr, if_f0, info, version): +def extract_small_model(path, out, sr, if_f0, info, version): try: ckpt = torch.load(path, map_location="cpu") if "model" in ckpt: @@ -185,7 +185,7 @@ def extract_small_model(path, name, sr, if_f0, info, version): opt["version"] = version opt["sr"] = sr opt["f0"] = int(if_f0) - torch.save(opt, "assets/weights/%s.pth" % name) + torch.save(opt, out) return "Success." except: return traceback.format_exc() diff --git a/infer/lib/uvr5_pack/lib_v5/dataset.py b/infer/lib/uvr5_pack/lib_v5/dataset.py deleted file mode 100644 index cfd01a174978d97180a897e40cb59ecadec1d12e..0000000000000000000000000000000000000000 --- a/infer/lib/uvr5_pack/lib_v5/dataset.py +++ /dev/null @@ -1,183 +0,0 @@ -import os -import random - -import numpy as np -import torch -import torch.utils.data -from tqdm import tqdm - -from . 
import spec_utils - - -class VocalRemoverValidationSet(torch.utils.data.Dataset): - def __init__(self, patch_list): - self.patch_list = patch_list - - def __len__(self): - return len(self.patch_list) - - def __getitem__(self, idx): - path = self.patch_list[idx] - data = np.load(path) - - X, y = data["X"], data["y"] - - X_mag = np.abs(X) - y_mag = np.abs(y) - - return X_mag, y_mag - - -def make_pair(mix_dir, inst_dir): - input_exts = [".wav", ".m4a", ".mp3", ".mp4", ".flac"] - - X_list = sorted( - [ - os.path.join(mix_dir, fname) - for fname in os.listdir(mix_dir) - if os.path.splitext(fname)[1] in input_exts - ] - ) - y_list = sorted( - [ - os.path.join(inst_dir, fname) - for fname in os.listdir(inst_dir) - if os.path.splitext(fname)[1] in input_exts - ] - ) - - filelist = list(zip(X_list, y_list)) - - return filelist - - -def train_val_split(dataset_dir, split_mode, val_rate, val_filelist): - if split_mode == "random": - filelist = make_pair( - os.path.join(dataset_dir, "mixtures"), - os.path.join(dataset_dir, "instruments"), - ) - - random.shuffle(filelist) - - if len(val_filelist) == 0: - val_size = int(len(filelist) * val_rate) - train_filelist = filelist[:-val_size] - val_filelist = filelist[-val_size:] - else: - train_filelist = [ - pair for pair in filelist if list(pair) not in val_filelist - ] - elif split_mode == "subdirs": - if len(val_filelist) != 0: - raise ValueError( - "The `val_filelist` option is not available in `subdirs` mode" - ) - - train_filelist = make_pair( - os.path.join(dataset_dir, "training/mixtures"), - os.path.join(dataset_dir, "training/instruments"), - ) - - val_filelist = make_pair( - os.path.join(dataset_dir, "validation/mixtures"), - os.path.join(dataset_dir, "validation/instruments"), - ) - - return train_filelist, val_filelist - - -def augment(X, y, reduction_rate, reduction_mask, mixup_rate, mixup_alpha): - perm = np.random.permutation(len(X)) - for i, idx in enumerate(tqdm(perm)): - if np.random.uniform() < reduction_rate: - y[idx] = spec_utils.reduce_vocal_aggressively( - X[idx], y[idx], reduction_mask - ) - - if np.random.uniform() < 0.5: - # swap channel - X[idx] = X[idx, ::-1] - y[idx] = y[idx, ::-1] - if np.random.uniform() < 0.02: - # mono - X[idx] = X[idx].mean(axis=0, keepdims=True) - y[idx] = y[idx].mean(axis=0, keepdims=True) - if np.random.uniform() < 0.02: - # inst - X[idx] = y[idx] - - if np.random.uniform() < mixup_rate and i < len(perm) - 1: - lam = np.random.beta(mixup_alpha, mixup_alpha) - X[idx] = lam * X[idx] + (1 - lam) * X[perm[i + 1]] - y[idx] = lam * y[idx] + (1 - lam) * y[perm[i + 1]] - - return X, y - - -def make_padding(width, cropsize, offset): - left = offset - roi_size = cropsize - left * 2 - if roi_size == 0: - roi_size = cropsize - right = roi_size - (width % roi_size) + left - - return left, right, roi_size - - -def make_training_set(filelist, cropsize, patches, sr, hop_length, n_fft, offset): - len_dataset = patches * len(filelist) - - X_dataset = np.zeros((len_dataset, 2, n_fft // 2 + 1, cropsize), dtype=np.complex64) - y_dataset = np.zeros((len_dataset, 2, n_fft // 2 + 1, cropsize), dtype=np.complex64) - - for i, (X_path, y_path) in enumerate(tqdm(filelist)): - X, y = spec_utils.cache_or_load(X_path, y_path, sr, hop_length, n_fft) - coef = np.max([np.abs(X).max(), np.abs(y).max()]) - X, y = X / coef, y / coef - - l, r, roi_size = make_padding(X.shape[2], cropsize, offset) - X_pad = np.pad(X, ((0, 0), (0, 0), (l, r)), mode="constant") - y_pad = np.pad(y, ((0, 0), (0, 0), (l, r)), mode="constant") - - starts = 
np.random.randint(0, X_pad.shape[2] - cropsize, patches) - ends = starts + cropsize - for j in range(patches): - idx = i * patches + j - X_dataset[idx] = X_pad[:, :, starts[j] : ends[j]] - y_dataset[idx] = y_pad[:, :, starts[j] : ends[j]] - - return X_dataset, y_dataset - - -def make_validation_set(filelist, cropsize, sr, hop_length, n_fft, offset): - patch_list = [] - patch_dir = "cs{}_sr{}_hl{}_nf{}_of{}".format( - cropsize, sr, hop_length, n_fft, offset - ) - os.makedirs(patch_dir, exist_ok=True) - - for i, (X_path, y_path) in enumerate(tqdm(filelist)): - basename = os.path.splitext(os.path.basename(X_path))[0] - - X, y = spec_utils.cache_or_load(X_path, y_path, sr, hop_length, n_fft) - coef = np.max([np.abs(X).max(), np.abs(y).max()]) - X, y = X / coef, y / coef - - l, r, roi_size = make_padding(X.shape[2], cropsize, offset) - X_pad = np.pad(X, ((0, 0), (0, 0), (l, r)), mode="constant") - y_pad = np.pad(y, ((0, 0), (0, 0), (l, r)), mode="constant") - - len_dataset = int(np.ceil(X.shape[2] / roi_size)) - for j in range(len_dataset): - outpath = os.path.join(patch_dir, "{}_p{}.npz".format(basename, j)) - start = j * roi_size - if not os.path.exists(outpath): - np.savez( - outpath, - X=X_pad[:, :, start : start + cropsize], - y=y_pad[:, :, start : start + cropsize], - ) - patch_list.append(outpath) - - return VocalRemoverValidationSet(patch_list) diff --git a/infer/lib/uvr5_pack/lib_v5/layers.py b/infer/lib/uvr5_pack/lib_v5/layers.py deleted file mode 100644 index 4fc1b5cb85a3327f60cbb9f5deffbeeaaac516ad..0000000000000000000000000000000000000000 --- a/infer/lib/uvr5_pack/lib_v5/layers.py +++ /dev/null @@ -1,118 +0,0 @@ -import torch -import torch.nn.functional as F -from torch import nn - -from . import spec_utils - - -class Conv2DBNActiv(nn.Module): - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): - super(Conv2DBNActiv, self).__init__() - self.conv = nn.Sequential( - nn.Conv2d( - nin, - nout, - kernel_size=ksize, - stride=stride, - padding=pad, - dilation=dilation, - bias=False, - ), - nn.BatchNorm2d(nout), - activ(), - ) - - def __call__(self, x): - return self.conv(x) - - -class SeperableConv2DBNActiv(nn.Module): - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): - super(SeperableConv2DBNActiv, self).__init__() - self.conv = nn.Sequential( - nn.Conv2d( - nin, - nin, - kernel_size=ksize, - stride=stride, - padding=pad, - dilation=dilation, - groups=nin, - bias=False, - ), - nn.Conv2d(nin, nout, kernel_size=1, bias=False), - nn.BatchNorm2d(nout), - activ(), - ) - - def __call__(self, x): - return self.conv(x) - - -class Encoder(nn.Module): - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): - super(Encoder, self).__init__() - self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) - self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) - - def __call__(self, x): - skip = self.conv1(x) - h = self.conv2(skip) - - return h, skip - - -class Decoder(nn.Module): - def __init__( - self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False - ): - super(Decoder, self).__init__() - self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) - self.dropout = nn.Dropout2d(0.1) if dropout else None - - def __call__(self, x, skip=None): - x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) - if skip is not None: - skip = spec_utils.crop_center(skip, x) - x = torch.cat([x, skip], dim=1) - h = self.conv(x) - - if self.dropout is not None: 
- h = self.dropout(h) - - return h - - -class ASPPModule(nn.Module): - def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU): - super(ASPPModule, self).__init__() - self.conv1 = nn.Sequential( - nn.AdaptiveAvgPool2d((1, None)), - Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), - ) - self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) - self.conv3 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[0], dilations[0], activ=activ - ) - self.conv4 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[1], dilations[1], activ=activ - ) - self.conv5 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[2], dilations[2], activ=activ - ) - self.bottleneck = nn.Sequential( - Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) - ) - - def forward(self, x): - _, _, h, w = x.size() - feat1 = F.interpolate( - self.conv1(x), size=(h, w), mode="bilinear", align_corners=True - ) - feat2 = self.conv2(x) - feat3 = self.conv3(x) - feat4 = self.conv4(x) - feat5 = self.conv5(x) - out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1) - bottle = self.bottleneck(out) - return bottle diff --git a/infer/lib/uvr5_pack/lib_v5/layers_123812KB .py b/infer/lib/uvr5_pack/lib_v5/layers_123812KB .py deleted file mode 100644 index 4fc1b5cb85a3327f60cbb9f5deffbeeaaac516ad..0000000000000000000000000000000000000000 --- a/infer/lib/uvr5_pack/lib_v5/layers_123812KB .py +++ /dev/null @@ -1,118 +0,0 @@ -import torch -import torch.nn.functional as F -from torch import nn - -from . import spec_utils - - -class Conv2DBNActiv(nn.Module): - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): - super(Conv2DBNActiv, self).__init__() - self.conv = nn.Sequential( - nn.Conv2d( - nin, - nout, - kernel_size=ksize, - stride=stride, - padding=pad, - dilation=dilation, - bias=False, - ), - nn.BatchNorm2d(nout), - activ(), - ) - - def __call__(self, x): - return self.conv(x) - - -class SeperableConv2DBNActiv(nn.Module): - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): - super(SeperableConv2DBNActiv, self).__init__() - self.conv = nn.Sequential( - nn.Conv2d( - nin, - nin, - kernel_size=ksize, - stride=stride, - padding=pad, - dilation=dilation, - groups=nin, - bias=False, - ), - nn.Conv2d(nin, nout, kernel_size=1, bias=False), - nn.BatchNorm2d(nout), - activ(), - ) - - def __call__(self, x): - return self.conv(x) - - -class Encoder(nn.Module): - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): - super(Encoder, self).__init__() - self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) - self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) - - def __call__(self, x): - skip = self.conv1(x) - h = self.conv2(skip) - - return h, skip - - -class Decoder(nn.Module): - def __init__( - self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False - ): - super(Decoder, self).__init__() - self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) - self.dropout = nn.Dropout2d(0.1) if dropout else None - - def __call__(self, x, skip=None): - x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) - if skip is not None: - skip = spec_utils.crop_center(skip, x) - x = torch.cat([x, skip], dim=1) - h = self.conv(x) - - if self.dropout is not None: - h = self.dropout(h) - - return h - - -class ASPPModule(nn.Module): - def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU): - super(ASPPModule, self).__init__() - self.conv1 = nn.Sequential( - 
nn.AdaptiveAvgPool2d((1, None)), - Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), - ) - self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) - self.conv3 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[0], dilations[0], activ=activ - ) - self.conv4 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[1], dilations[1], activ=activ - ) - self.conv5 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[2], dilations[2], activ=activ - ) - self.bottleneck = nn.Sequential( - Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) - ) - - def forward(self, x): - _, _, h, w = x.size() - feat1 = F.interpolate( - self.conv1(x), size=(h, w), mode="bilinear", align_corners=True - ) - feat2 = self.conv2(x) - feat3 = self.conv3(x) - feat4 = self.conv4(x) - feat5 = self.conv5(x) - out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1) - bottle = self.bottleneck(out) - return bottle diff --git a/infer/lib/uvr5_pack/lib_v5/layers_123821KB.py b/infer/lib/uvr5_pack/lib_v5/layers_123821KB.py deleted file mode 100644 index 4fc1b5cb85a3327f60cbb9f5deffbeeaaac516ad..0000000000000000000000000000000000000000 --- a/infer/lib/uvr5_pack/lib_v5/layers_123821KB.py +++ /dev/null @@ -1,118 +0,0 @@ -import torch -import torch.nn.functional as F -from torch import nn - -from . import spec_utils - - -class Conv2DBNActiv(nn.Module): - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): - super(Conv2DBNActiv, self).__init__() - self.conv = nn.Sequential( - nn.Conv2d( - nin, - nout, - kernel_size=ksize, - stride=stride, - padding=pad, - dilation=dilation, - bias=False, - ), - nn.BatchNorm2d(nout), - activ(), - ) - - def __call__(self, x): - return self.conv(x) - - -class SeperableConv2DBNActiv(nn.Module): - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): - super(SeperableConv2DBNActiv, self).__init__() - self.conv = nn.Sequential( - nn.Conv2d( - nin, - nin, - kernel_size=ksize, - stride=stride, - padding=pad, - dilation=dilation, - groups=nin, - bias=False, - ), - nn.Conv2d(nin, nout, kernel_size=1, bias=False), - nn.BatchNorm2d(nout), - activ(), - ) - - def __call__(self, x): - return self.conv(x) - - -class Encoder(nn.Module): - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): - super(Encoder, self).__init__() - self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) - self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) - - def __call__(self, x): - skip = self.conv1(x) - h = self.conv2(skip) - - return h, skip - - -class Decoder(nn.Module): - def __init__( - self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False - ): - super(Decoder, self).__init__() - self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) - self.dropout = nn.Dropout2d(0.1) if dropout else None - - def __call__(self, x, skip=None): - x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) - if skip is not None: - skip = spec_utils.crop_center(skip, x) - x = torch.cat([x, skip], dim=1) - h = self.conv(x) - - if self.dropout is not None: - h = self.dropout(h) - - return h - - -class ASPPModule(nn.Module): - def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU): - super(ASPPModule, self).__init__() - self.conv1 = nn.Sequential( - nn.AdaptiveAvgPool2d((1, None)), - Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), - ) - self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) - self.conv3 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[0], 
dilations[0], activ=activ - ) - self.conv4 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[1], dilations[1], activ=activ - ) - self.conv5 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[2], dilations[2], activ=activ - ) - self.bottleneck = nn.Sequential( - Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) - ) - - def forward(self, x): - _, _, h, w = x.size() - feat1 = F.interpolate( - self.conv1(x), size=(h, w), mode="bilinear", align_corners=True - ) - feat2 = self.conv2(x) - feat3 = self.conv3(x) - feat4 = self.conv4(x) - feat5 = self.conv5(x) - out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1) - bottle = self.bottleneck(out) - return bottle diff --git a/infer/lib/uvr5_pack/lib_v5/layers_33966KB.py b/infer/lib/uvr5_pack/lib_v5/layers_33966KB.py deleted file mode 100644 index 9b127bc6427f5c60c8cf85603a3d8a093c3501c4..0000000000000000000000000000000000000000 --- a/infer/lib/uvr5_pack/lib_v5/layers_33966KB.py +++ /dev/null @@ -1,126 +0,0 @@ -import torch -import torch.nn.functional as F -from torch import nn - -from . import spec_utils - - -class Conv2DBNActiv(nn.Module): - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): - super(Conv2DBNActiv, self).__init__() - self.conv = nn.Sequential( - nn.Conv2d( - nin, - nout, - kernel_size=ksize, - stride=stride, - padding=pad, - dilation=dilation, - bias=False, - ), - nn.BatchNorm2d(nout), - activ(), - ) - - def __call__(self, x): - return self.conv(x) - - -class SeperableConv2DBNActiv(nn.Module): - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): - super(SeperableConv2DBNActiv, self).__init__() - self.conv = nn.Sequential( - nn.Conv2d( - nin, - nin, - kernel_size=ksize, - stride=stride, - padding=pad, - dilation=dilation, - groups=nin, - bias=False, - ), - nn.Conv2d(nin, nout, kernel_size=1, bias=False), - nn.BatchNorm2d(nout), - activ(), - ) - - def __call__(self, x): - return self.conv(x) - - -class Encoder(nn.Module): - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): - super(Encoder, self).__init__() - self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) - self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) - - def __call__(self, x): - skip = self.conv1(x) - h = self.conv2(skip) - - return h, skip - - -class Decoder(nn.Module): - def __init__( - self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False - ): - super(Decoder, self).__init__() - self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) - self.dropout = nn.Dropout2d(0.1) if dropout else None - - def __call__(self, x, skip=None): - x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) - if skip is not None: - skip = spec_utils.crop_center(skip, x) - x = torch.cat([x, skip], dim=1) - h = self.conv(x) - - if self.dropout is not None: - h = self.dropout(h) - - return h - - -class ASPPModule(nn.Module): - def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU): - super(ASPPModule, self).__init__() - self.conv1 = nn.Sequential( - nn.AdaptiveAvgPool2d((1, None)), - Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), - ) - self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) - self.conv3 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[0], dilations[0], activ=activ - ) - self.conv4 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[1], dilations[1], activ=activ - ) - self.conv5 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[2], dilations[2], 
activ=activ - ) - self.conv6 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[2], dilations[2], activ=activ - ) - self.conv7 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[2], dilations[2], activ=activ - ) - self.bottleneck = nn.Sequential( - Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) - ) - - def forward(self, x): - _, _, h, w = x.size() - feat1 = F.interpolate( - self.conv1(x), size=(h, w), mode="bilinear", align_corners=True - ) - feat2 = self.conv2(x) - feat3 = self.conv3(x) - feat4 = self.conv4(x) - feat5 = self.conv5(x) - feat6 = self.conv6(x) - feat7 = self.conv7(x) - out = torch.cat((feat1, feat2, feat3, feat4, feat5, feat6, feat7), dim=1) - bottle = self.bottleneck(out) - return bottle diff --git a/infer/lib/uvr5_pack/lib_v5/layers_537227KB.py b/infer/lib/uvr5_pack/lib_v5/layers_537227KB.py deleted file mode 100644 index 9b127bc6427f5c60c8cf85603a3d8a093c3501c4..0000000000000000000000000000000000000000 --- a/infer/lib/uvr5_pack/lib_v5/layers_537227KB.py +++ /dev/null @@ -1,126 +0,0 @@ -import torch -import torch.nn.functional as F -from torch import nn - -from . import spec_utils - - -class Conv2DBNActiv(nn.Module): - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): - super(Conv2DBNActiv, self).__init__() - self.conv = nn.Sequential( - nn.Conv2d( - nin, - nout, - kernel_size=ksize, - stride=stride, - padding=pad, - dilation=dilation, - bias=False, - ), - nn.BatchNorm2d(nout), - activ(), - ) - - def __call__(self, x): - return self.conv(x) - - -class SeperableConv2DBNActiv(nn.Module): - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): - super(SeperableConv2DBNActiv, self).__init__() - self.conv = nn.Sequential( - nn.Conv2d( - nin, - nin, - kernel_size=ksize, - stride=stride, - padding=pad, - dilation=dilation, - groups=nin, - bias=False, - ), - nn.Conv2d(nin, nout, kernel_size=1, bias=False), - nn.BatchNorm2d(nout), - activ(), - ) - - def __call__(self, x): - return self.conv(x) - - -class Encoder(nn.Module): - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): - super(Encoder, self).__init__() - self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) - self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) - - def __call__(self, x): - skip = self.conv1(x) - h = self.conv2(skip) - - return h, skip - - -class Decoder(nn.Module): - def __init__( - self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False - ): - super(Decoder, self).__init__() - self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) - self.dropout = nn.Dropout2d(0.1) if dropout else None - - def __call__(self, x, skip=None): - x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) - if skip is not None: - skip = spec_utils.crop_center(skip, x) - x = torch.cat([x, skip], dim=1) - h = self.conv(x) - - if self.dropout is not None: - h = self.dropout(h) - - return h - - -class ASPPModule(nn.Module): - def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU): - super(ASPPModule, self).__init__() - self.conv1 = nn.Sequential( - nn.AdaptiveAvgPool2d((1, None)), - Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), - ) - self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) - self.conv3 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[0], dilations[0], activ=activ - ) - self.conv4 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[1], dilations[1], activ=activ - ) - self.conv5 = 
SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[2], dilations[2], activ=activ - ) - self.conv6 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[2], dilations[2], activ=activ - ) - self.conv7 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[2], dilations[2], activ=activ - ) - self.bottleneck = nn.Sequential( - Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) - ) - - def forward(self, x): - _, _, h, w = x.size() - feat1 = F.interpolate( - self.conv1(x), size=(h, w), mode="bilinear", align_corners=True - ) - feat2 = self.conv2(x) - feat3 = self.conv3(x) - feat4 = self.conv4(x) - feat5 = self.conv5(x) - feat6 = self.conv6(x) - feat7 = self.conv7(x) - out = torch.cat((feat1, feat2, feat3, feat4, feat5, feat6, feat7), dim=1) - bottle = self.bottleneck(out) - return bottle diff --git a/infer/lib/uvr5_pack/lib_v5/layers_537238KB.py b/infer/lib/uvr5_pack/lib_v5/layers_537238KB.py deleted file mode 100644 index 9b127bc6427f5c60c8cf85603a3d8a093c3501c4..0000000000000000000000000000000000000000 --- a/infer/lib/uvr5_pack/lib_v5/layers_537238KB.py +++ /dev/null @@ -1,126 +0,0 @@ -import torch -import torch.nn.functional as F -from torch import nn - -from . import spec_utils - - -class Conv2DBNActiv(nn.Module): - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): - super(Conv2DBNActiv, self).__init__() - self.conv = nn.Sequential( - nn.Conv2d( - nin, - nout, - kernel_size=ksize, - stride=stride, - padding=pad, - dilation=dilation, - bias=False, - ), - nn.BatchNorm2d(nout), - activ(), - ) - - def __call__(self, x): - return self.conv(x) - - -class SeperableConv2DBNActiv(nn.Module): - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): - super(SeperableConv2DBNActiv, self).__init__() - self.conv = nn.Sequential( - nn.Conv2d( - nin, - nin, - kernel_size=ksize, - stride=stride, - padding=pad, - dilation=dilation, - groups=nin, - bias=False, - ), - nn.Conv2d(nin, nout, kernel_size=1, bias=False), - nn.BatchNorm2d(nout), - activ(), - ) - - def __call__(self, x): - return self.conv(x) - - -class Encoder(nn.Module): - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): - super(Encoder, self).__init__() - self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) - self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) - - def __call__(self, x): - skip = self.conv1(x) - h = self.conv2(skip) - - return h, skip - - -class Decoder(nn.Module): - def __init__( - self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False - ): - super(Decoder, self).__init__() - self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) - self.dropout = nn.Dropout2d(0.1) if dropout else None - - def __call__(self, x, skip=None): - x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) - if skip is not None: - skip = spec_utils.crop_center(skip, x) - x = torch.cat([x, skip], dim=1) - h = self.conv(x) - - if self.dropout is not None: - h = self.dropout(h) - - return h - - -class ASPPModule(nn.Module): - def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU): - super(ASPPModule, self).__init__() - self.conv1 = nn.Sequential( - nn.AdaptiveAvgPool2d((1, None)), - Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), - ) - self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) - self.conv3 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[0], dilations[0], activ=activ - ) - self.conv4 = SeperableConv2DBNActiv( - nin, nin, 3, 1, 
dilations[1], dilations[1], activ=activ - ) - self.conv5 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[2], dilations[2], activ=activ - ) - self.conv6 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[2], dilations[2], activ=activ - ) - self.conv7 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[2], dilations[2], activ=activ - ) - self.bottleneck = nn.Sequential( - Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) - ) - - def forward(self, x): - _, _, h, w = x.size() - feat1 = F.interpolate( - self.conv1(x), size=(h, w), mode="bilinear", align_corners=True - ) - feat2 = self.conv2(x) - feat3 = self.conv3(x) - feat4 = self.conv4(x) - feat5 = self.conv5(x) - feat6 = self.conv6(x) - feat7 = self.conv7(x) - out = torch.cat((feat1, feat2, feat3, feat4, feat5, feat6, feat7), dim=1) - bottle = self.bottleneck(out) - return bottle diff --git a/infer/lib/uvr5_pack/lib_v5/layers_new.py b/infer/lib/uvr5_pack/lib_v5/layers_new.py deleted file mode 100644 index 44153b6a23399c6938affc61c71919eaa172bcee..0000000000000000000000000000000000000000 --- a/infer/lib/uvr5_pack/lib_v5/layers_new.py +++ /dev/null @@ -1,125 +0,0 @@ -import torch -import torch.nn.functional as F -from torch import nn - -from . import spec_utils - - -class Conv2DBNActiv(nn.Module): - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): - super(Conv2DBNActiv, self).__init__() - self.conv = nn.Sequential( - nn.Conv2d( - nin, - nout, - kernel_size=ksize, - stride=stride, - padding=pad, - dilation=dilation, - bias=False, - ), - nn.BatchNorm2d(nout), - activ(), - ) - - def __call__(self, x): - return self.conv(x) - - -class Encoder(nn.Module): - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): - super(Encoder, self).__init__() - self.conv1 = Conv2DBNActiv(nin, nout, ksize, stride, pad, activ=activ) - self.conv2 = Conv2DBNActiv(nout, nout, ksize, 1, pad, activ=activ) - - def __call__(self, x): - h = self.conv1(x) - h = self.conv2(h) - - return h - - -class Decoder(nn.Module): - def __init__( - self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False - ): - super(Decoder, self).__init__() - self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) - # self.conv2 = Conv2DBNActiv(nout, nout, ksize, 1, pad, activ=activ) - self.dropout = nn.Dropout2d(0.1) if dropout else None - - def __call__(self, x, skip=None): - x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) - - if skip is not None: - skip = spec_utils.crop_center(skip, x) - x = torch.cat([x, skip], dim=1) - - h = self.conv1(x) - # h = self.conv2(h) - - if self.dropout is not None: - h = self.dropout(h) - - return h - - -class ASPPModule(nn.Module): - def __init__(self, nin, nout, dilations=(4, 8, 12), activ=nn.ReLU, dropout=False): - super(ASPPModule, self).__init__() - self.conv1 = nn.Sequential( - nn.AdaptiveAvgPool2d((1, None)), - Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ), - ) - self.conv2 = Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ) - self.conv3 = Conv2DBNActiv( - nin, nout, 3, 1, dilations[0], dilations[0], activ=activ - ) - self.conv4 = Conv2DBNActiv( - nin, nout, 3, 1, dilations[1], dilations[1], activ=activ - ) - self.conv5 = Conv2DBNActiv( - nin, nout, 3, 1, dilations[2], dilations[2], activ=activ - ) - self.bottleneck = Conv2DBNActiv(nout * 5, nout, 1, 1, 0, activ=activ) - self.dropout = nn.Dropout2d(0.1) if dropout else None - - def forward(self, x): - _, _, h, w = x.size() - feat1 = F.interpolate( - self.conv1(x), size=(h, w), 
mode="bilinear", align_corners=True - ) - feat2 = self.conv2(x) - feat3 = self.conv3(x) - feat4 = self.conv4(x) - feat5 = self.conv5(x) - out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1) - out = self.bottleneck(out) - - if self.dropout is not None: - out = self.dropout(out) - - return out - - -class LSTMModule(nn.Module): - def __init__(self, nin_conv, nin_lstm, nout_lstm): - super(LSTMModule, self).__init__() - self.conv = Conv2DBNActiv(nin_conv, 1, 1, 1, 0) - self.lstm = nn.LSTM( - input_size=nin_lstm, hidden_size=nout_lstm // 2, bidirectional=True - ) - self.dense = nn.Sequential( - nn.Linear(nout_lstm, nin_lstm), nn.BatchNorm1d(nin_lstm), nn.ReLU() - ) - - def forward(self, x): - N, _, nbins, nframes = x.size() - h = self.conv(x)[:, 0] # N, nbins, nframes - h = h.permute(2, 0, 1) # nframes, N, nbins - h, _ = self.lstm(h) - h = self.dense(h.reshape(-1, h.size()[-1])) # nframes * N, nbins - h = h.reshape(nframes, N, 1, nbins) - h = h.permute(1, 2, 3, 0) - - return h diff --git a/infer/lib/uvr5_pack/lib_v5/model_param_init.py b/infer/lib/uvr5_pack/lib_v5/model_param_init.py deleted file mode 100644 index b995c0bfb1194746187692e2ab1c2a6dbaaaec6c..0000000000000000000000000000000000000000 --- a/infer/lib/uvr5_pack/lib_v5/model_param_init.py +++ /dev/null @@ -1,69 +0,0 @@ -import json -import os -import pathlib - -default_param = {} -default_param["bins"] = 768 -default_param["unstable_bins"] = 9 # training only -default_param["reduction_bins"] = 762 # training only -default_param["sr"] = 44100 -default_param["pre_filter_start"] = 757 -default_param["pre_filter_stop"] = 768 -default_param["band"] = {} - - -default_param["band"][1] = { - "sr": 11025, - "hl": 128, - "n_fft": 960, - "crop_start": 0, - "crop_stop": 245, - "lpf_start": 61, # inference only - "res_type": "polyphase", -} - -default_param["band"][2] = { - "sr": 44100, - "hl": 512, - "n_fft": 1536, - "crop_start": 24, - "crop_stop": 547, - "hpf_start": 81, # inference only - "res_type": "sinc_best", -} - - -def int_keys(d): - r = {} - for k, v in d: - if k.isdigit(): - k = int(k) - r[k] = v - return r - - -class ModelParameters(object): - def __init__(self, config_path=""): - if ".pth" == pathlib.Path(config_path).suffix: - import zipfile - - with zipfile.ZipFile(config_path, "r") as zip: - self.param = json.loads( - zip.read("param.json"), object_pairs_hook=int_keys - ) - elif ".json" == pathlib.Path(config_path).suffix: - with open(config_path, "r") as f: - self.param = json.loads(f.read(), object_pairs_hook=int_keys) - else: - self.param = default_param - - for k in [ - "mid_side", - "mid_side_b", - "mid_side_b2", - "stereo_w", - "stereo_n", - "reverse", - ]: - if not k in self.param: - self.param[k] = False diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json b/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json deleted file mode 100644 index 72cb4499867ad2827185e85687f06fb73d33eced..0000000000000000000000000000000000000000 --- a/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json +++ /dev/null @@ -1,19 +0,0 @@ -{ - "bins": 1024, - "unstable_bins": 0, - "reduction_bins": 0, - "band": { - "1": { - "sr": 16000, - "hl": 512, - "n_fft": 2048, - "crop_start": 0, - "crop_stop": 1024, - "hpf_start": -1, - "res_type": "sinc_best" - } - }, - "sr": 16000, - "pre_filter_start": 1023, - "pre_filter_stop": 1024 -} \ No newline at end of file diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json b/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json 
deleted file mode 100644 index 3c00ecf0a105e55a6a86a3c32db301a2635b5b41..0000000000000000000000000000000000000000 --- a/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json +++ /dev/null @@ -1,19 +0,0 @@ -{ - "bins": 1024, - "unstable_bins": 0, - "reduction_bins": 0, - "band": { - "1": { - "sr": 32000, - "hl": 512, - "n_fft": 2048, - "crop_start": 0, - "crop_stop": 1024, - "hpf_start": -1, - "res_type": "kaiser_fast" - } - }, - "sr": 32000, - "pre_filter_start": 1000, - "pre_filter_stop": 1021 -} \ No newline at end of file diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json b/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json deleted file mode 100644 index 55666ac9a8d0547751fb4b4d3bffb1ee2c956913..0000000000000000000000000000000000000000 --- a/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json +++ /dev/null @@ -1,19 +0,0 @@ -{ - "bins": 1024, - "unstable_bins": 0, - "reduction_bins": 0, - "band": { - "1": { - "sr": 33075, - "hl": 384, - "n_fft": 2048, - "crop_start": 0, - "crop_stop": 1024, - "hpf_start": -1, - "res_type": "sinc_best" - } - }, - "sr": 33075, - "pre_filter_start": 1000, - "pre_filter_stop": 1021 -} \ No newline at end of file diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json b/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json deleted file mode 100644 index 665abe20eb3cc39fe0f8493dad8f25f6ef634a14..0000000000000000000000000000000000000000 --- a/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json +++ /dev/null @@ -1,19 +0,0 @@ -{ - "bins": 1024, - "unstable_bins": 0, - "reduction_bins": 0, - "band": { - "1": { - "sr": 44100, - "hl": 1024, - "n_fft": 2048, - "crop_start": 0, - "crop_stop": 1024, - "hpf_start": -1, - "res_type": "sinc_best" - } - }, - "sr": 44100, - "pre_filter_start": 1023, - "pre_filter_stop": 1024 -} \ No newline at end of file diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl256.json b/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl256.json deleted file mode 100644 index 0e8b16f89b0231d06eabe8d2f7c2670c7caa2272..0000000000000000000000000000000000000000 --- a/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl256.json +++ /dev/null @@ -1,19 +0,0 @@ -{ - "bins": 256, - "unstable_bins": 0, - "reduction_bins": 0, - "band": { - "1": { - "sr": 44100, - "hl": 256, - "n_fft": 512, - "crop_start": 0, - "crop_stop": 256, - "hpf_start": -1, - "res_type": "sinc_best" - } - }, - "sr": 44100, - "pre_filter_start": 256, - "pre_filter_stop": 256 -} \ No newline at end of file diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json b/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json deleted file mode 100644 index 3b38fcaf60ba204e03a47f5bd3f5bcfe75e1983a..0000000000000000000000000000000000000000 --- a/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json +++ /dev/null @@ -1,19 +0,0 @@ -{ - "bins": 1024, - "unstable_bins": 0, - "reduction_bins": 0, - "band": { - "1": { - "sr": 44100, - "hl": 512, - "n_fft": 2048, - "crop_start": 0, - "crop_stop": 1024, - "hpf_start": -1, - "res_type": "sinc_best" - } - }, - "sr": 44100, - "pre_filter_start": 1023, - "pre_filter_stop": 1024 -} \ No newline at end of file diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512_cut.json b/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512_cut.json deleted file mode 100644 index 630df3524e340f43a1ddb7b33ff02cc91fc1cb47..0000000000000000000000000000000000000000 --- 
a/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512_cut.json +++ /dev/null @@ -1,19 +0,0 @@ -{ - "bins": 1024, - "unstable_bins": 0, - "reduction_bins": 0, - "band": { - "1": { - "sr": 44100, - "hl": 512, - "n_fft": 2048, - "crop_start": 0, - "crop_stop": 700, - "hpf_start": -1, - "res_type": "sinc_best" - } - }, - "sr": 44100, - "pre_filter_start": 1023, - "pre_filter_stop": 700 -} \ No newline at end of file diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/2band_32000.json b/infer/lib/uvr5_pack/lib_v5/modelparams/2band_32000.json deleted file mode 100644 index ab9cf1150a818eb6252105408311be0a40d423b3..0000000000000000000000000000000000000000 --- a/infer/lib/uvr5_pack/lib_v5/modelparams/2band_32000.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "bins": 768, - "unstable_bins": 7, - "reduction_bins": 705, - "band": { - "1": { - "sr": 6000, - "hl": 66, - "n_fft": 512, - "crop_start": 0, - "crop_stop": 240, - "lpf_start": 60, - "lpf_stop": 118, - "res_type": "sinc_fastest" - }, - "2": { - "sr": 32000, - "hl": 352, - "n_fft": 1024, - "crop_start": 22, - "crop_stop": 505, - "hpf_start": 44, - "hpf_stop": 23, - "res_type": "sinc_medium" - } - }, - "sr": 32000, - "pre_filter_start": 710, - "pre_filter_stop": 731 -} diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/2band_44100_lofi.json b/infer/lib/uvr5_pack/lib_v5/modelparams/2band_44100_lofi.json deleted file mode 100644 index 7faa216d7b49aeece24123dbdd868847a1dbc03c..0000000000000000000000000000000000000000 --- a/infer/lib/uvr5_pack/lib_v5/modelparams/2band_44100_lofi.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "bins": 512, - "unstable_bins": 7, - "reduction_bins": 510, - "band": { - "1": { - "sr": 11025, - "hl": 160, - "n_fft": 768, - "crop_start": 0, - "crop_stop": 192, - "lpf_start": 41, - "lpf_stop": 139, - "res_type": "sinc_fastest" - }, - "2": { - "sr": 44100, - "hl": 640, - "n_fft": 1024, - "crop_start": 10, - "crop_stop": 320, - "hpf_start": 47, - "hpf_stop": 15, - "res_type": "sinc_medium" - } - }, - "sr": 44100, - "pre_filter_start": 510, - "pre_filter_stop": 512 -} diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/2band_48000.json b/infer/lib/uvr5_pack/lib_v5/modelparams/2band_48000.json deleted file mode 100644 index 7e78175052b09cb1a32345e54006475992712f9a..0000000000000000000000000000000000000000 --- a/infer/lib/uvr5_pack/lib_v5/modelparams/2band_48000.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "bins": 768, - "unstable_bins": 7, - "reduction_bins": 705, - "band": { - "1": { - "sr": 6000, - "hl": 66, - "n_fft": 512, - "crop_start": 0, - "crop_stop": 240, - "lpf_start": 60, - "lpf_stop": 240, - "res_type": "sinc_fastest" - }, - "2": { - "sr": 48000, - "hl": 528, - "n_fft": 1536, - "crop_start": 22, - "crop_stop": 505, - "hpf_start": 82, - "hpf_stop": 22, - "res_type": "sinc_medium" - } - }, - "sr": 48000, - "pre_filter_start": 710, - "pre_filter_stop": 731 -} \ No newline at end of file diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100.json b/infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100.json deleted file mode 100644 index d881d767ff83fbac0e18dfe2587ef16925b29b3c..0000000000000000000000000000000000000000 --- a/infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100.json +++ /dev/null @@ -1,42 +0,0 @@ -{ - "bins": 768, - "unstable_bins": 5, - "reduction_bins": 733, - "band": { - "1": { - "sr": 11025, - "hl": 128, - "n_fft": 768, - "crop_start": 0, - "crop_stop": 278, - "lpf_start": 28, - "lpf_stop": 140, - "res_type": "polyphase" - }, - "2": { - "sr": 22050, - "hl": 256, - "n_fft": 768, - "crop_start": 14, - 
"crop_stop": 322, - "hpf_start": 70, - "hpf_stop": 14, - "lpf_start": 283, - "lpf_stop": 314, - "res_type": "polyphase" - }, - "3": { - "sr": 44100, - "hl": 512, - "n_fft": 768, - "crop_start": 131, - "crop_stop": 313, - "hpf_start": 154, - "hpf_stop": 141, - "res_type": "sinc_medium" - } - }, - "sr": 44100, - "pre_filter_start": 757, - "pre_filter_stop": 768 -} diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_mid.json b/infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_mid.json deleted file mode 100644 index 77ec198573b19f36519a028a509767d30764c0e2..0000000000000000000000000000000000000000 --- a/infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_mid.json +++ /dev/null @@ -1,43 +0,0 @@ -{ - "mid_side": true, - "bins": 768, - "unstable_bins": 5, - "reduction_bins": 733, - "band": { - "1": { - "sr": 11025, - "hl": 128, - "n_fft": 768, - "crop_start": 0, - "crop_stop": 278, - "lpf_start": 28, - "lpf_stop": 140, - "res_type": "polyphase" - }, - "2": { - "sr": 22050, - "hl": 256, - "n_fft": 768, - "crop_start": 14, - "crop_stop": 322, - "hpf_start": 70, - "hpf_stop": 14, - "lpf_start": 283, - "lpf_stop": 314, - "res_type": "polyphase" - }, - "3": { - "sr": 44100, - "hl": 512, - "n_fft": 768, - "crop_start": 131, - "crop_stop": 313, - "hpf_start": 154, - "hpf_stop": 141, - "res_type": "sinc_medium" - } - }, - "sr": 44100, - "pre_filter_start": 757, - "pre_filter_stop": 768 -} diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json b/infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json deleted file mode 100644 index 85ee8a7d44541c9176e85ea3dce8728d34990938..0000000000000000000000000000000000000000 --- a/infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json +++ /dev/null @@ -1,43 +0,0 @@ -{ - "mid_side_b2": true, - "bins": 640, - "unstable_bins": 7, - "reduction_bins": 565, - "band": { - "1": { - "sr": 11025, - "hl": 108, - "n_fft": 1024, - "crop_start": 0, - "crop_stop": 187, - "lpf_start": 92, - "lpf_stop": 186, - "res_type": "polyphase" - }, - "2": { - "sr": 22050, - "hl": 216, - "n_fft": 768, - "crop_start": 0, - "crop_stop": 212, - "hpf_start": 68, - "hpf_stop": 34, - "lpf_start": 174, - "lpf_stop": 209, - "res_type": "polyphase" - }, - "3": { - "sr": 44100, - "hl": 432, - "n_fft": 640, - "crop_start": 66, - "crop_stop": 307, - "hpf_start": 86, - "hpf_stop": 72, - "res_type": "kaiser_fast" - } - }, - "sr": 44100, - "pre_filter_start": 639, - "pre_filter_stop": 640 -} diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json b/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json deleted file mode 100644 index df123754204372aa50d464fbe9102a401f48cc73..0000000000000000000000000000000000000000 --- a/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json +++ /dev/null @@ -1,54 +0,0 @@ -{ - "bins": 768, - "unstable_bins": 7, - "reduction_bins": 668, - "band": { - "1": { - "sr": 11025, - "hl": 128, - "n_fft": 1024, - "crop_start": 0, - "crop_stop": 186, - "lpf_start": 37, - "lpf_stop": 73, - "res_type": "polyphase" - }, - "2": { - "sr": 11025, - "hl": 128, - "n_fft": 512, - "crop_start": 4, - "crop_stop": 185, - "hpf_start": 36, - "hpf_stop": 18, - "lpf_start": 93, - "lpf_stop": 185, - "res_type": "polyphase" - }, - "3": { - "sr": 22050, - "hl": 256, - "n_fft": 512, - "crop_start": 46, - "crop_stop": 186, - "hpf_start": 93, - "hpf_stop": 46, - "lpf_start": 164, - "lpf_stop": 186, - "res_type": "polyphase" - }, - "4": { - "sr": 44100, - "hl": 512, - "n_fft": 768, - "crop_start": 121, - "crop_stop": 382, - "hpf_start": 138, - "hpf_stop": 
123, - "res_type": "sinc_medium" - } - }, - "sr": 44100, - "pre_filter_start": 740, - "pre_filter_stop": 768 -} diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_mid.json b/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_mid.json deleted file mode 100644 index e91b699eb63d3382c3b9e9edf46d40ed91d6122b..0000000000000000000000000000000000000000 --- a/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_mid.json +++ /dev/null @@ -1,55 +0,0 @@ -{ - "bins": 768, - "unstable_bins": 7, - "mid_side": true, - "reduction_bins": 668, - "band": { - "1": { - "sr": 11025, - "hl": 128, - "n_fft": 1024, - "crop_start": 0, - "crop_stop": 186, - "lpf_start": 37, - "lpf_stop": 73, - "res_type": "polyphase" - }, - "2": { - "sr": 11025, - "hl": 128, - "n_fft": 512, - "crop_start": 4, - "crop_stop": 185, - "hpf_start": 36, - "hpf_stop": 18, - "lpf_start": 93, - "lpf_stop": 185, - "res_type": "polyphase" - }, - "3": { - "sr": 22050, - "hl": 256, - "n_fft": 512, - "crop_start": 46, - "crop_stop": 186, - "hpf_start": 93, - "hpf_stop": 46, - "lpf_start": 164, - "lpf_stop": 186, - "res_type": "polyphase" - }, - "4": { - "sr": 44100, - "hl": 512, - "n_fft": 768, - "crop_start": 121, - "crop_stop": 382, - "hpf_start": 138, - "hpf_stop": 123, - "res_type": "sinc_medium" - } - }, - "sr": 44100, - "pre_filter_start": 740, - "pre_filter_stop": 768 -} diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb.json b/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb.json deleted file mode 100644 index f852f280ec9d98fc1b65cec688290eaafec61b84..0000000000000000000000000000000000000000 --- a/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb.json +++ /dev/null @@ -1,55 +0,0 @@ -{ - "mid_side_b": true, - "bins": 768, - "unstable_bins": 7, - "reduction_bins": 668, - "band": { - "1": { - "sr": 11025, - "hl": 128, - "n_fft": 1024, - "crop_start": 0, - "crop_stop": 186, - "lpf_start": 37, - "lpf_stop": 73, - "res_type": "polyphase" - }, - "2": { - "sr": 11025, - "hl": 128, - "n_fft": 512, - "crop_start": 4, - "crop_stop": 185, - "hpf_start": 36, - "hpf_stop": 18, - "lpf_start": 93, - "lpf_stop": 185, - "res_type": "polyphase" - }, - "3": { - "sr": 22050, - "hl": 256, - "n_fft": 512, - "crop_start": 46, - "crop_stop": 186, - "hpf_start": 93, - "hpf_stop": 46, - "lpf_start": 164, - "lpf_stop": 186, - "res_type": "polyphase" - }, - "4": { - "sr": 44100, - "hl": 512, - "n_fft": 768, - "crop_start": 121, - "crop_stop": 382, - "hpf_start": 138, - "hpf_stop": 123, - "res_type": "sinc_medium" - } - }, - "sr": 44100, - "pre_filter_start": 740, - "pre_filter_stop": 768 -} \ No newline at end of file diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb2.json b/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb2.json deleted file mode 100644 index f852f280ec9d98fc1b65cec688290eaafec61b84..0000000000000000000000000000000000000000 --- a/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb2.json +++ /dev/null @@ -1,55 +0,0 @@ -{ - "mid_side_b": true, - "bins": 768, - "unstable_bins": 7, - "reduction_bins": 668, - "band": { - "1": { - "sr": 11025, - "hl": 128, - "n_fft": 1024, - "crop_start": 0, - "crop_stop": 186, - "lpf_start": 37, - "lpf_stop": 73, - "res_type": "polyphase" - }, - "2": { - "sr": 11025, - "hl": 128, - "n_fft": 512, - "crop_start": 4, - "crop_stop": 185, - "hpf_start": 36, - "hpf_stop": 18, - "lpf_start": 93, - "lpf_stop": 185, - "res_type": "polyphase" - }, - "3": { - "sr": 22050, - "hl": 256, - "n_fft": 512, - "crop_start": 46, - "crop_stop": 186, - "hpf_start": 93, - 
"hpf_stop": 46, - "lpf_start": 164, - "lpf_stop": 186, - "res_type": "polyphase" - }, - "4": { - "sr": 44100, - "hl": 512, - "n_fft": 768, - "crop_start": 121, - "crop_stop": 382, - "hpf_start": 138, - "hpf_stop": 123, - "res_type": "sinc_medium" - } - }, - "sr": 44100, - "pre_filter_start": 740, - "pre_filter_stop": 768 -} \ No newline at end of file diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_reverse.json b/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_reverse.json deleted file mode 100644 index 7a07d5541bd83dc1caa20b531c3b43a2ffccac88..0000000000000000000000000000000000000000 --- a/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_reverse.json +++ /dev/null @@ -1,55 +0,0 @@ -{ - "reverse": true, - "bins": 768, - "unstable_bins": 7, - "reduction_bins": 668, - "band": { - "1": { - "sr": 11025, - "hl": 128, - "n_fft": 1024, - "crop_start": 0, - "crop_stop": 186, - "lpf_start": 37, - "lpf_stop": 73, - "res_type": "polyphase" - }, - "2": { - "sr": 11025, - "hl": 128, - "n_fft": 512, - "crop_start": 4, - "crop_stop": 185, - "hpf_start": 36, - "hpf_stop": 18, - "lpf_start": 93, - "lpf_stop": 185, - "res_type": "polyphase" - }, - "3": { - "sr": 22050, - "hl": 256, - "n_fft": 512, - "crop_start": 46, - "crop_stop": 186, - "hpf_start": 93, - "hpf_stop": 46, - "lpf_start": 164, - "lpf_stop": 186, - "res_type": "polyphase" - }, - "4": { - "sr": 44100, - "hl": 512, - "n_fft": 768, - "crop_start": 121, - "crop_stop": 382, - "hpf_start": 138, - "hpf_stop": 123, - "res_type": "sinc_medium" - } - }, - "sr": 44100, - "pre_filter_start": 740, - "pre_filter_stop": 768 -} \ No newline at end of file diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_sw.json b/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_sw.json deleted file mode 100644 index ba0cf342106de793e6ec3e876854c7fd451fbf76..0000000000000000000000000000000000000000 --- a/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_sw.json +++ /dev/null @@ -1,55 +0,0 @@ -{ - "stereo_w": true, - "bins": 768, - "unstable_bins": 7, - "reduction_bins": 668, - "band": { - "1": { - "sr": 11025, - "hl": 128, - "n_fft": 1024, - "crop_start": 0, - "crop_stop": 186, - "lpf_start": 37, - "lpf_stop": 73, - "res_type": "polyphase" - }, - "2": { - "sr": 11025, - "hl": 128, - "n_fft": 512, - "crop_start": 4, - "crop_stop": 185, - "hpf_start": 36, - "hpf_stop": 18, - "lpf_start": 93, - "lpf_stop": 185, - "res_type": "polyphase" - }, - "3": { - "sr": 22050, - "hl": 256, - "n_fft": 512, - "crop_start": 46, - "crop_stop": 186, - "hpf_start": 93, - "hpf_stop": 46, - "lpf_start": 164, - "lpf_stop": 186, - "res_type": "polyphase" - }, - "4": { - "sr": 44100, - "hl": 512, - "n_fft": 768, - "crop_start": 121, - "crop_stop": 382, - "hpf_start": 138, - "hpf_stop": 123, - "res_type": "sinc_medium" - } - }, - "sr": 44100, - "pre_filter_start": 740, - "pre_filter_stop": 768 -} \ No newline at end of file diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2.json b/infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2.json deleted file mode 100644 index 33281a0cf9916fc33558ddfda7a0287a2547faf4..0000000000000000000000000000000000000000 --- a/infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2.json +++ /dev/null @@ -1,54 +0,0 @@ -{ - "bins": 672, - "unstable_bins": 8, - "reduction_bins": 637, - "band": { - "1": { - "sr": 7350, - "hl": 80, - "n_fft": 640, - "crop_start": 0, - "crop_stop": 85, - "lpf_start": 25, - "lpf_stop": 53, - "res_type": "polyphase" - }, - "2": { - "sr": 7350, - "hl": 80, - "n_fft": 320, - "crop_start": 4, - "crop_stop": 87, - 
"hpf_start": 25, - "hpf_stop": 12, - "lpf_start": 31, - "lpf_stop": 62, - "res_type": "polyphase" - }, - "3": { - "sr": 14700, - "hl": 160, - "n_fft": 512, - "crop_start": 17, - "crop_stop": 216, - "hpf_start": 48, - "hpf_stop": 24, - "lpf_start": 139, - "lpf_stop": 210, - "res_type": "polyphase" - }, - "4": { - "sr": 44100, - "hl": 480, - "n_fft": 960, - "crop_start": 78, - "crop_stop": 383, - "hpf_start": 130, - "hpf_stop": 86, - "res_type": "kaiser_fast" - } - }, - "sr": 44100, - "pre_filter_start": 668, - "pre_filter_stop": 672 -} \ No newline at end of file diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2_sn.json b/infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2_sn.json deleted file mode 100644 index 2e5c770fe188779bf6b0873190b7a324d6a867b2..0000000000000000000000000000000000000000 --- a/infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2_sn.json +++ /dev/null @@ -1,55 +0,0 @@ -{ - "bins": 672, - "unstable_bins": 8, - "reduction_bins": 637, - "band": { - "1": { - "sr": 7350, - "hl": 80, - "n_fft": 640, - "crop_start": 0, - "crop_stop": 85, - "lpf_start": 25, - "lpf_stop": 53, - "res_type": "polyphase" - }, - "2": { - "sr": 7350, - "hl": 80, - "n_fft": 320, - "crop_start": 4, - "crop_stop": 87, - "hpf_start": 25, - "hpf_stop": 12, - "lpf_start": 31, - "lpf_stop": 62, - "res_type": "polyphase" - }, - "3": { - "sr": 14700, - "hl": 160, - "n_fft": 512, - "crop_start": 17, - "crop_stop": 216, - "hpf_start": 48, - "hpf_stop": 24, - "lpf_start": 139, - "lpf_stop": 210, - "res_type": "polyphase" - }, - "4": { - "sr": 44100, - "hl": 480, - "n_fft": 960, - "crop_start": 78, - "crop_stop": 383, - "hpf_start": 130, - "hpf_stop": 86, - "convert_channels": "stereo_n", - "res_type": "kaiser_fast" - } - }, - "sr": 44100, - "pre_filter_start": 668, - "pre_filter_stop": 672 -} \ No newline at end of file diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/4band_v3.json b/infer/lib/uvr5_pack/lib_v5/modelparams/4band_v3.json deleted file mode 100644 index 2a73bc97ac545145a75bdca7addc5d59f5b8574b..0000000000000000000000000000000000000000 --- a/infer/lib/uvr5_pack/lib_v5/modelparams/4band_v3.json +++ /dev/null @@ -1,54 +0,0 @@ -{ - "bins": 672, - "unstable_bins": 8, - "reduction_bins": 530, - "band": { - "1": { - "sr": 7350, - "hl": 80, - "n_fft": 640, - "crop_start": 0, - "crop_stop": 85, - "lpf_start": 25, - "lpf_stop": 53, - "res_type": "polyphase" - }, - "2": { - "sr": 7350, - "hl": 80, - "n_fft": 320, - "crop_start": 4, - "crop_stop": 87, - "hpf_start": 25, - "hpf_stop": 12, - "lpf_start": 31, - "lpf_stop": 62, - "res_type": "polyphase" - }, - "3": { - "sr": 14700, - "hl": 160, - "n_fft": 512, - "crop_start": 17, - "crop_stop": 216, - "hpf_start": 48, - "hpf_stop": 24, - "lpf_start": 139, - "lpf_stop": 210, - "res_type": "polyphase" - }, - "4": { - "sr": 44100, - "hl": 480, - "n_fft": 960, - "crop_start": 78, - "crop_stop": 383, - "hpf_start": 130, - "hpf_stop": 86, - "res_type": "kaiser_fast" - } - }, - "sr": 44100, - "pre_filter_start": 668, - "pre_filter_stop": 672 -} \ No newline at end of file diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/ensemble.json b/infer/lib/uvr5_pack/lib_v5/modelparams/ensemble.json deleted file mode 100644 index ee69beb46fc82f34619c5e48761e329fcabbbd00..0000000000000000000000000000000000000000 --- a/infer/lib/uvr5_pack/lib_v5/modelparams/ensemble.json +++ /dev/null @@ -1,43 +0,0 @@ -{ - "mid_side_b2": true, - "bins": 1280, - "unstable_bins": 7, - "reduction_bins": 565, - "band": { - "1": { - "sr": 11025, - "hl": 108, - "n_fft": 2048, - "crop_start": 0, - 
"crop_stop": 374, - "lpf_start": 92, - "lpf_stop": 186, - "res_type": "polyphase" - }, - "2": { - "sr": 22050, - "hl": 216, - "n_fft": 1536, - "crop_start": 0, - "crop_stop": 424, - "hpf_start": 68, - "hpf_stop": 34, - "lpf_start": 348, - "lpf_stop": 418, - "res_type": "polyphase" - }, - "3": { - "sr": 44100, - "hl": 432, - "n_fft": 1280, - "crop_start": 132, - "crop_stop": 614, - "hpf_start": 172, - "hpf_stop": 144, - "res_type": "polyphase" - } - }, - "sr": 44100, - "pre_filter_start": 1280, - "pre_filter_stop": 1280 -} \ No newline at end of file diff --git a/infer/lib/uvr5_pack/lib_v5/nets.py b/infer/lib/uvr5_pack/lib_v5/nets.py deleted file mode 100644 index 5da3948c2f2e9edcc3cdac49bdf9f738e403de40..0000000000000000000000000000000000000000 --- a/infer/lib/uvr5_pack/lib_v5/nets.py +++ /dev/null @@ -1,123 +0,0 @@ -import layers -import torch -import torch.nn.functional as F -from torch import nn - -from . import spec_utils - - -class BaseASPPNet(nn.Module): - def __init__(self, nin, ch, dilations=(4, 8, 16)): - super(BaseASPPNet, self).__init__() - self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) - self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) - self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) - self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) - - self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) - - self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) - self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) - self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) - self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) - - def __call__(self, x): - h, e1 = self.enc1(x) - h, e2 = self.enc2(h) - h, e3 = self.enc3(h) - h, e4 = self.enc4(h) - - h = self.aspp(h) - - h = self.dec4(h, e4) - h = self.dec3(h, e3) - h = self.dec2(h, e2) - h = self.dec1(h, e1) - - return h - - -class CascadedASPPNet(nn.Module): - def __init__(self, n_fft): - super(CascadedASPPNet, self).__init__() - self.stg1_low_band_net = BaseASPPNet(2, 16) - self.stg1_high_band_net = BaseASPPNet(2, 16) - - self.stg2_bridge = layers.Conv2DBNActiv(18, 8, 1, 1, 0) - self.stg2_full_band_net = BaseASPPNet(8, 16) - - self.stg3_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0) - self.stg3_full_band_net = BaseASPPNet(16, 32) - - self.out = nn.Conv2d(32, 2, 1, bias=False) - self.aux1_out = nn.Conv2d(16, 2, 1, bias=False) - self.aux2_out = nn.Conv2d(16, 2, 1, bias=False) - - self.max_bin = n_fft // 2 - self.output_bin = n_fft // 2 + 1 - - self.offset = 128 - - def forward(self, x, aggressiveness=None): - mix = x.detach() - x = x.clone() - - x = x[:, :, : self.max_bin] - - bandw = x.size()[2] // 2 - aux1 = torch.cat( - [ - self.stg1_low_band_net(x[:, :, :bandw]), - self.stg1_high_band_net(x[:, :, bandw:]), - ], - dim=2, - ) - - h = torch.cat([x, aux1], dim=1) - aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) - - h = torch.cat([x, aux1, aux2], dim=1) - h = self.stg3_full_band_net(self.stg3_bridge(h)) - - mask = torch.sigmoid(self.out(h)) - mask = F.pad( - input=mask, - pad=(0, 0, 0, self.output_bin - mask.size()[2]), - mode="replicate", - ) - - if self.training: - aux1 = torch.sigmoid(self.aux1_out(aux1)) - aux1 = F.pad( - input=aux1, - pad=(0, 0, 0, self.output_bin - aux1.size()[2]), - mode="replicate", - ) - aux2 = torch.sigmoid(self.aux2_out(aux2)) - aux2 = F.pad( - input=aux2, - pad=(0, 0, 0, self.output_bin - aux2.size()[2]), - mode="replicate", - ) - return mask * mix, aux1 * mix, aux2 * mix - else: - if aggressiveness: - mask[:, :, : aggressiveness["split_bin"]] = torch.pow( - mask[:, :, : 
aggressiveness["split_bin"]], - 1 + aggressiveness["value"] / 3, - ) - mask[:, :, aggressiveness["split_bin"] :] = torch.pow( - mask[:, :, aggressiveness["split_bin"] :], - 1 + aggressiveness["value"], - ) - - return mask * mix - - def predict(self, x_mag, aggressiveness=None): - h = self.forward(x_mag, aggressiveness) - - if self.offset > 0: - h = h[:, :, :, self.offset : -self.offset] - assert h.size()[3] > 0 - - return h diff --git a/infer/lib/uvr5_pack/lib_v5/nets_123812KB.py b/infer/lib/uvr5_pack/lib_v5/nets_123812KB.py deleted file mode 100644 index 167d4cb2198863cf43e93440f7e63c5342fc7605..0000000000000000000000000000000000000000 --- a/infer/lib/uvr5_pack/lib_v5/nets_123812KB.py +++ /dev/null @@ -1,122 +0,0 @@ -import torch -import torch.nn.functional as F -from torch import nn - -from . import layers_123821KB as layers - - -class BaseASPPNet(nn.Module): - def __init__(self, nin, ch, dilations=(4, 8, 16)): - super(BaseASPPNet, self).__init__() - self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) - self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) - self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) - self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) - - self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) - - self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) - self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) - self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) - self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) - - def __call__(self, x): - h, e1 = self.enc1(x) - h, e2 = self.enc2(h) - h, e3 = self.enc3(h) - h, e4 = self.enc4(h) - - h = self.aspp(h) - - h = self.dec4(h, e4) - h = self.dec3(h, e3) - h = self.dec2(h, e2) - h = self.dec1(h, e1) - - return h - - -class CascadedASPPNet(nn.Module): - def __init__(self, n_fft): - super(CascadedASPPNet, self).__init__() - self.stg1_low_band_net = BaseASPPNet(2, 32) - self.stg1_high_band_net = BaseASPPNet(2, 32) - - self.stg2_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0) - self.stg2_full_band_net = BaseASPPNet(16, 32) - - self.stg3_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0) - self.stg3_full_band_net = BaseASPPNet(32, 64) - - self.out = nn.Conv2d(64, 2, 1, bias=False) - self.aux1_out = nn.Conv2d(32, 2, 1, bias=False) - self.aux2_out = nn.Conv2d(32, 2, 1, bias=False) - - self.max_bin = n_fft // 2 - self.output_bin = n_fft // 2 + 1 - - self.offset = 128 - - def forward(self, x, aggressiveness=None): - mix = x.detach() - x = x.clone() - - x = x[:, :, : self.max_bin] - - bandw = x.size()[2] // 2 - aux1 = torch.cat( - [ - self.stg1_low_band_net(x[:, :, :bandw]), - self.stg1_high_band_net(x[:, :, bandw:]), - ], - dim=2, - ) - - h = torch.cat([x, aux1], dim=1) - aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) - - h = torch.cat([x, aux1, aux2], dim=1) - h = self.stg3_full_band_net(self.stg3_bridge(h)) - - mask = torch.sigmoid(self.out(h)) - mask = F.pad( - input=mask, - pad=(0, 0, 0, self.output_bin - mask.size()[2]), - mode="replicate", - ) - - if self.training: - aux1 = torch.sigmoid(self.aux1_out(aux1)) - aux1 = F.pad( - input=aux1, - pad=(0, 0, 0, self.output_bin - aux1.size()[2]), - mode="replicate", - ) - aux2 = torch.sigmoid(self.aux2_out(aux2)) - aux2 = F.pad( - input=aux2, - pad=(0, 0, 0, self.output_bin - aux2.size()[2]), - mode="replicate", - ) - return mask * mix, aux1 * mix, aux2 * mix - else: - if aggressiveness: - mask[:, :, : aggressiveness["split_bin"]] = torch.pow( - mask[:, :, : aggressiveness["split_bin"]], - 1 + aggressiveness["value"] / 3, - ) - mask[:, :, aggressiveness["split_bin"] :] = 
torch.pow( - mask[:, :, aggressiveness["split_bin"] :], - 1 + aggressiveness["value"], - ) - - return mask * mix - - def predict(self, x_mag, aggressiveness=None): - h = self.forward(x_mag, aggressiveness) - - if self.offset > 0: - h = h[:, :, :, self.offset : -self.offset] - assert h.size()[3] > 0 - - return h diff --git a/infer/lib/uvr5_pack/lib_v5/nets_123821KB.py b/infer/lib/uvr5_pack/lib_v5/nets_123821KB.py deleted file mode 100644 index 167d4cb2198863cf43e93440f7e63c5342fc7605..0000000000000000000000000000000000000000 --- a/infer/lib/uvr5_pack/lib_v5/nets_123821KB.py +++ /dev/null @@ -1,122 +0,0 @@ -import torch -import torch.nn.functional as F -from torch import nn - -from . import layers_123821KB as layers - - -class BaseASPPNet(nn.Module): - def __init__(self, nin, ch, dilations=(4, 8, 16)): - super(BaseASPPNet, self).__init__() - self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) - self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) - self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) - self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) - - self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) - - self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) - self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) - self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) - self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) - - def __call__(self, x): - h, e1 = self.enc1(x) - h, e2 = self.enc2(h) - h, e3 = self.enc3(h) - h, e4 = self.enc4(h) - - h = self.aspp(h) - - h = self.dec4(h, e4) - h = self.dec3(h, e3) - h = self.dec2(h, e2) - h = self.dec1(h, e1) - - return h - - -class CascadedASPPNet(nn.Module): - def __init__(self, n_fft): - super(CascadedASPPNet, self).__init__() - self.stg1_low_band_net = BaseASPPNet(2, 32) - self.stg1_high_band_net = BaseASPPNet(2, 32) - - self.stg2_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0) - self.stg2_full_band_net = BaseASPPNet(16, 32) - - self.stg3_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0) - self.stg3_full_band_net = BaseASPPNet(32, 64) - - self.out = nn.Conv2d(64, 2, 1, bias=False) - self.aux1_out = nn.Conv2d(32, 2, 1, bias=False) - self.aux2_out = nn.Conv2d(32, 2, 1, bias=False) - - self.max_bin = n_fft // 2 - self.output_bin = n_fft // 2 + 1 - - self.offset = 128 - - def forward(self, x, aggressiveness=None): - mix = x.detach() - x = x.clone() - - x = x[:, :, : self.max_bin] - - bandw = x.size()[2] // 2 - aux1 = torch.cat( - [ - self.stg1_low_band_net(x[:, :, :bandw]), - self.stg1_high_band_net(x[:, :, bandw:]), - ], - dim=2, - ) - - h = torch.cat([x, aux1], dim=1) - aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) - - h = torch.cat([x, aux1, aux2], dim=1) - h = self.stg3_full_band_net(self.stg3_bridge(h)) - - mask = torch.sigmoid(self.out(h)) - mask = F.pad( - input=mask, - pad=(0, 0, 0, self.output_bin - mask.size()[2]), - mode="replicate", - ) - - if self.training: - aux1 = torch.sigmoid(self.aux1_out(aux1)) - aux1 = F.pad( - input=aux1, - pad=(0, 0, 0, self.output_bin - aux1.size()[2]), - mode="replicate", - ) - aux2 = torch.sigmoid(self.aux2_out(aux2)) - aux2 = F.pad( - input=aux2, - pad=(0, 0, 0, self.output_bin - aux2.size()[2]), - mode="replicate", - ) - return mask * mix, aux1 * mix, aux2 * mix - else: - if aggressiveness: - mask[:, :, : aggressiveness["split_bin"]] = torch.pow( - mask[:, :, : aggressiveness["split_bin"]], - 1 + aggressiveness["value"] / 3, - ) - mask[:, :, aggressiveness["split_bin"] :] = torch.pow( - mask[:, :, aggressiveness["split_bin"] :], - 1 + aggressiveness["value"], - ) - - return mask * mix - - 
def predict(self, x_mag, aggressiveness=None): - h = self.forward(x_mag, aggressiveness) - - if self.offset > 0: - h = h[:, :, :, self.offset : -self.offset] - assert h.size()[3] > 0 - - return h diff --git a/infer/lib/uvr5_pack/lib_v5/nets_33966KB.py b/infer/lib/uvr5_pack/lib_v5/nets_33966KB.py deleted file mode 100644 index 73a5b836177b706c306e27875f8391c1aed4b948..0000000000000000000000000000000000000000 --- a/infer/lib/uvr5_pack/lib_v5/nets_33966KB.py +++ /dev/null @@ -1,122 +0,0 @@ -import torch -import torch.nn.functional as F -from torch import nn - -from . import layers_33966KB as layers - - -class BaseASPPNet(nn.Module): - def __init__(self, nin, ch, dilations=(4, 8, 16, 32)): - super(BaseASPPNet, self).__init__() - self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) - self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) - self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) - self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) - - self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) - - self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) - self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) - self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) - self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) - - def __call__(self, x): - h, e1 = self.enc1(x) - h, e2 = self.enc2(h) - h, e3 = self.enc3(h) - h, e4 = self.enc4(h) - - h = self.aspp(h) - - h = self.dec4(h, e4) - h = self.dec3(h, e3) - h = self.dec2(h, e2) - h = self.dec1(h, e1) - - return h - - -class CascadedASPPNet(nn.Module): - def __init__(self, n_fft): - super(CascadedASPPNet, self).__init__() - self.stg1_low_band_net = BaseASPPNet(2, 16) - self.stg1_high_band_net = BaseASPPNet(2, 16) - - self.stg2_bridge = layers.Conv2DBNActiv(18, 8, 1, 1, 0) - self.stg2_full_band_net = BaseASPPNet(8, 16) - - self.stg3_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0) - self.stg3_full_band_net = BaseASPPNet(16, 32) - - self.out = nn.Conv2d(32, 2, 1, bias=False) - self.aux1_out = nn.Conv2d(16, 2, 1, bias=False) - self.aux2_out = nn.Conv2d(16, 2, 1, bias=False) - - self.max_bin = n_fft // 2 - self.output_bin = n_fft // 2 + 1 - - self.offset = 128 - - def forward(self, x, aggressiveness=None): - mix = x.detach() - x = x.clone() - - x = x[:, :, : self.max_bin] - - bandw = x.size()[2] // 2 - aux1 = torch.cat( - [ - self.stg1_low_band_net(x[:, :, :bandw]), - self.stg1_high_band_net(x[:, :, bandw:]), - ], - dim=2, - ) - - h = torch.cat([x, aux1], dim=1) - aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) - - h = torch.cat([x, aux1, aux2], dim=1) - h = self.stg3_full_band_net(self.stg3_bridge(h)) - - mask = torch.sigmoid(self.out(h)) - mask = F.pad( - input=mask, - pad=(0, 0, 0, self.output_bin - mask.size()[2]), - mode="replicate", - ) - - if self.training: - aux1 = torch.sigmoid(self.aux1_out(aux1)) - aux1 = F.pad( - input=aux1, - pad=(0, 0, 0, self.output_bin - aux1.size()[2]), - mode="replicate", - ) - aux2 = torch.sigmoid(self.aux2_out(aux2)) - aux2 = F.pad( - input=aux2, - pad=(0, 0, 0, self.output_bin - aux2.size()[2]), - mode="replicate", - ) - return mask * mix, aux1 * mix, aux2 * mix - else: - if aggressiveness: - mask[:, :, : aggressiveness["split_bin"]] = torch.pow( - mask[:, :, : aggressiveness["split_bin"]], - 1 + aggressiveness["value"] / 3, - ) - mask[:, :, aggressiveness["split_bin"] :] = torch.pow( - mask[:, :, aggressiveness["split_bin"] :], - 1 + aggressiveness["value"], - ) - - return mask * mix - - def predict(self, x_mag, aggressiveness=None): - h = self.forward(x_mag, aggressiveness) - - if self.offset > 0: - h = 
h[:, :, :, self.offset : -self.offset] - assert h.size()[3] > 0 - - return h diff --git a/infer/lib/uvr5_pack/lib_v5/nets_537227KB.py b/infer/lib/uvr5_pack/lib_v5/nets_537227KB.py deleted file mode 100644 index 823b44fb64898e8dcbb12180ba45d1718f9b03f7..0000000000000000000000000000000000000000 --- a/infer/lib/uvr5_pack/lib_v5/nets_537227KB.py +++ /dev/null @@ -1,123 +0,0 @@ -import numpy as np -import torch -import torch.nn.functional as F -from torch import nn - -from . import layers_537238KB as layers - - -class BaseASPPNet(nn.Module): - def __init__(self, nin, ch, dilations=(4, 8, 16)): - super(BaseASPPNet, self).__init__() - self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) - self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) - self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) - self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) - - self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) - - self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) - self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) - self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) - self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) - - def __call__(self, x): - h, e1 = self.enc1(x) - h, e2 = self.enc2(h) - h, e3 = self.enc3(h) - h, e4 = self.enc4(h) - - h = self.aspp(h) - - h = self.dec4(h, e4) - h = self.dec3(h, e3) - h = self.dec2(h, e2) - h = self.dec1(h, e1) - - return h - - -class CascadedASPPNet(nn.Module): - def __init__(self, n_fft): - super(CascadedASPPNet, self).__init__() - self.stg1_low_band_net = BaseASPPNet(2, 64) - self.stg1_high_band_net = BaseASPPNet(2, 64) - - self.stg2_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0) - self.stg2_full_band_net = BaseASPPNet(32, 64) - - self.stg3_bridge = layers.Conv2DBNActiv(130, 64, 1, 1, 0) - self.stg3_full_band_net = BaseASPPNet(64, 128) - - self.out = nn.Conv2d(128, 2, 1, bias=False) - self.aux1_out = nn.Conv2d(64, 2, 1, bias=False) - self.aux2_out = nn.Conv2d(64, 2, 1, bias=False) - - self.max_bin = n_fft // 2 - self.output_bin = n_fft // 2 + 1 - - self.offset = 128 - - def forward(self, x, aggressiveness=None): - mix = x.detach() - x = x.clone() - - x = x[:, :, : self.max_bin] - - bandw = x.size()[2] // 2 - aux1 = torch.cat( - [ - self.stg1_low_band_net(x[:, :, :bandw]), - self.stg1_high_band_net(x[:, :, bandw:]), - ], - dim=2, - ) - - h = torch.cat([x, aux1], dim=1) - aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) - - h = torch.cat([x, aux1, aux2], dim=1) - h = self.stg3_full_band_net(self.stg3_bridge(h)) - - mask = torch.sigmoid(self.out(h)) - mask = F.pad( - input=mask, - pad=(0, 0, 0, self.output_bin - mask.size()[2]), - mode="replicate", - ) - - if self.training: - aux1 = torch.sigmoid(self.aux1_out(aux1)) - aux1 = F.pad( - input=aux1, - pad=(0, 0, 0, self.output_bin - aux1.size()[2]), - mode="replicate", - ) - aux2 = torch.sigmoid(self.aux2_out(aux2)) - aux2 = F.pad( - input=aux2, - pad=(0, 0, 0, self.output_bin - aux2.size()[2]), - mode="replicate", - ) - return mask * mix, aux1 * mix, aux2 * mix - else: - if aggressiveness: - mask[:, :, : aggressiveness["split_bin"]] = torch.pow( - mask[:, :, : aggressiveness["split_bin"]], - 1 + aggressiveness["value"] / 3, - ) - mask[:, :, aggressiveness["split_bin"] :] = torch.pow( - mask[:, :, aggressiveness["split_bin"] :], - 1 + aggressiveness["value"], - ) - - return mask * mix - - def predict(self, x_mag, aggressiveness=None): - h = self.forward(x_mag, aggressiveness) - - if self.offset > 0: - h = h[:, :, :, self.offset : -self.offset] - assert h.size()[3] > 0 - - return h diff --git 
a/infer/lib/uvr5_pack/lib_v5/nets_537238KB.py b/infer/lib/uvr5_pack/lib_v5/nets_537238KB.py deleted file mode 100644 index 823b44fb64898e8dcbb12180ba45d1718f9b03f7..0000000000000000000000000000000000000000 --- a/infer/lib/uvr5_pack/lib_v5/nets_537238KB.py +++ /dev/null @@ -1,123 +0,0 @@ -import numpy as np -import torch -import torch.nn.functional as F -from torch import nn - -from . import layers_537238KB as layers - - -class BaseASPPNet(nn.Module): - def __init__(self, nin, ch, dilations=(4, 8, 16)): - super(BaseASPPNet, self).__init__() - self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) - self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) - self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) - self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) - - self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) - - self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) - self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) - self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) - self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) - - def __call__(self, x): - h, e1 = self.enc1(x) - h, e2 = self.enc2(h) - h, e3 = self.enc3(h) - h, e4 = self.enc4(h) - - h = self.aspp(h) - - h = self.dec4(h, e4) - h = self.dec3(h, e3) - h = self.dec2(h, e2) - h = self.dec1(h, e1) - - return h - - -class CascadedASPPNet(nn.Module): - def __init__(self, n_fft): - super(CascadedASPPNet, self).__init__() - self.stg1_low_band_net = BaseASPPNet(2, 64) - self.stg1_high_band_net = BaseASPPNet(2, 64) - - self.stg2_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0) - self.stg2_full_band_net = BaseASPPNet(32, 64) - - self.stg3_bridge = layers.Conv2DBNActiv(130, 64, 1, 1, 0) - self.stg3_full_band_net = BaseASPPNet(64, 128) - - self.out = nn.Conv2d(128, 2, 1, bias=False) - self.aux1_out = nn.Conv2d(64, 2, 1, bias=False) - self.aux2_out = nn.Conv2d(64, 2, 1, bias=False) - - self.max_bin = n_fft // 2 - self.output_bin = n_fft // 2 + 1 - - self.offset = 128 - - def forward(self, x, aggressiveness=None): - mix = x.detach() - x = x.clone() - - x = x[:, :, : self.max_bin] - - bandw = x.size()[2] // 2 - aux1 = torch.cat( - [ - self.stg1_low_band_net(x[:, :, :bandw]), - self.stg1_high_band_net(x[:, :, bandw:]), - ], - dim=2, - ) - - h = torch.cat([x, aux1], dim=1) - aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) - - h = torch.cat([x, aux1, aux2], dim=1) - h = self.stg3_full_band_net(self.stg3_bridge(h)) - - mask = torch.sigmoid(self.out(h)) - mask = F.pad( - input=mask, - pad=(0, 0, 0, self.output_bin - mask.size()[2]), - mode="replicate", - ) - - if self.training: - aux1 = torch.sigmoid(self.aux1_out(aux1)) - aux1 = F.pad( - input=aux1, - pad=(0, 0, 0, self.output_bin - aux1.size()[2]), - mode="replicate", - ) - aux2 = torch.sigmoid(self.aux2_out(aux2)) - aux2 = F.pad( - input=aux2, - pad=(0, 0, 0, self.output_bin - aux2.size()[2]), - mode="replicate", - ) - return mask * mix, aux1 * mix, aux2 * mix - else: - if aggressiveness: - mask[:, :, : aggressiveness["split_bin"]] = torch.pow( - mask[:, :, : aggressiveness["split_bin"]], - 1 + aggressiveness["value"] / 3, - ) - mask[:, :, aggressiveness["split_bin"] :] = torch.pow( - mask[:, :, aggressiveness["split_bin"] :], - 1 + aggressiveness["value"], - ) - - return mask * mix - - def predict(self, x_mag, aggressiveness=None): - h = self.forward(x_mag, aggressiveness) - - if self.offset > 0: - h = h[:, :, :, self.offset : -self.offset] - assert h.size()[3] > 0 - - return h diff --git a/infer/lib/uvr5_pack/lib_v5/nets_61968KB.py b/infer/lib/uvr5_pack/lib_v5/nets_61968KB.py 
deleted file mode 100644 index 167d4cb2198863cf43e93440f7e63c5342fc7605..0000000000000000000000000000000000000000 --- a/infer/lib/uvr5_pack/lib_v5/nets_61968KB.py +++ /dev/null @@ -1,122 +0,0 @@ -import torch -import torch.nn.functional as F -from torch import nn - -from . import layers_123821KB as layers - - -class BaseASPPNet(nn.Module): - def __init__(self, nin, ch, dilations=(4, 8, 16)): - super(BaseASPPNet, self).__init__() - self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) - self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) - self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) - self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) - - self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) - - self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) - self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) - self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) - self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) - - def __call__(self, x): - h, e1 = self.enc1(x) - h, e2 = self.enc2(h) - h, e3 = self.enc3(h) - h, e4 = self.enc4(h) - - h = self.aspp(h) - - h = self.dec4(h, e4) - h = self.dec3(h, e3) - h = self.dec2(h, e2) - h = self.dec1(h, e1) - - return h - - -class CascadedASPPNet(nn.Module): - def __init__(self, n_fft): - super(CascadedASPPNet, self).__init__() - self.stg1_low_band_net = BaseASPPNet(2, 32) - self.stg1_high_band_net = BaseASPPNet(2, 32) - - self.stg2_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0) - self.stg2_full_band_net = BaseASPPNet(16, 32) - - self.stg3_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0) - self.stg3_full_band_net = BaseASPPNet(32, 64) - - self.out = nn.Conv2d(64, 2, 1, bias=False) - self.aux1_out = nn.Conv2d(32, 2, 1, bias=False) - self.aux2_out = nn.Conv2d(32, 2, 1, bias=False) - - self.max_bin = n_fft // 2 - self.output_bin = n_fft // 2 + 1 - - self.offset = 128 - - def forward(self, x, aggressiveness=None): - mix = x.detach() - x = x.clone() - - x = x[:, :, : self.max_bin] - - bandw = x.size()[2] // 2 - aux1 = torch.cat( - [ - self.stg1_low_band_net(x[:, :, :bandw]), - self.stg1_high_band_net(x[:, :, bandw:]), - ], - dim=2, - ) - - h = torch.cat([x, aux1], dim=1) - aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) - - h = torch.cat([x, aux1, aux2], dim=1) - h = self.stg3_full_band_net(self.stg3_bridge(h)) - - mask = torch.sigmoid(self.out(h)) - mask = F.pad( - input=mask, - pad=(0, 0, 0, self.output_bin - mask.size()[2]), - mode="replicate", - ) - - if self.training: - aux1 = torch.sigmoid(self.aux1_out(aux1)) - aux1 = F.pad( - input=aux1, - pad=(0, 0, 0, self.output_bin - aux1.size()[2]), - mode="replicate", - ) - aux2 = torch.sigmoid(self.aux2_out(aux2)) - aux2 = F.pad( - input=aux2, - pad=(0, 0, 0, self.output_bin - aux2.size()[2]), - mode="replicate", - ) - return mask * mix, aux1 * mix, aux2 * mix - else: - if aggressiveness: - mask[:, :, : aggressiveness["split_bin"]] = torch.pow( - mask[:, :, : aggressiveness["split_bin"]], - 1 + aggressiveness["value"] / 3, - ) - mask[:, :, aggressiveness["split_bin"] :] = torch.pow( - mask[:, :, aggressiveness["split_bin"] :], - 1 + aggressiveness["value"], - ) - - return mask * mix - - def predict(self, x_mag, aggressiveness=None): - h = self.forward(x_mag, aggressiveness) - - if self.offset > 0: - h = h[:, :, :, self.offset : -self.offset] - assert h.size()[3] > 0 - - return h diff --git a/infer/lib/uvr5_pack/lib_v5/nets_new.py b/infer/lib/uvr5_pack/lib_v5/nets_new.py deleted file mode 100644 index 1c0f4fa96d921e979fe31bd4151701b7783fbcea..0000000000000000000000000000000000000000 --- 
a/infer/lib/uvr5_pack/lib_v5/nets_new.py +++ /dev/null @@ -1,133 +0,0 @@ -import torch -import torch.nn.functional as F -from torch import nn - -from . import layers_new - - -class BaseNet(nn.Module): - def __init__( - self, nin, nout, nin_lstm, nout_lstm, dilations=((4, 2), (8, 4), (12, 6)) - ): - super(BaseNet, self).__init__() - self.enc1 = layers_new.Conv2DBNActiv(nin, nout, 3, 1, 1) - self.enc2 = layers_new.Encoder(nout, nout * 2, 3, 2, 1) - self.enc3 = layers_new.Encoder(nout * 2, nout * 4, 3, 2, 1) - self.enc4 = layers_new.Encoder(nout * 4, nout * 6, 3, 2, 1) - self.enc5 = layers_new.Encoder(nout * 6, nout * 8, 3, 2, 1) - - self.aspp = layers_new.ASPPModule(nout * 8, nout * 8, dilations, dropout=True) - - self.dec4 = layers_new.Decoder(nout * (6 + 8), nout * 6, 3, 1, 1) - self.dec3 = layers_new.Decoder(nout * (4 + 6), nout * 4, 3, 1, 1) - self.dec2 = layers_new.Decoder(nout * (2 + 4), nout * 2, 3, 1, 1) - self.lstm_dec2 = layers_new.LSTMModule(nout * 2, nin_lstm, nout_lstm) - self.dec1 = layers_new.Decoder(nout * (1 + 2) + 1, nout * 1, 3, 1, 1) - - def __call__(self, x): - e1 = self.enc1(x) - e2 = self.enc2(e1) - e3 = self.enc3(e2) - e4 = self.enc4(e3) - e5 = self.enc5(e4) - - h = self.aspp(e5) - - h = self.dec4(h, e4) - h = self.dec3(h, e3) - h = self.dec2(h, e2) - h = torch.cat([h, self.lstm_dec2(h)], dim=1) - h = self.dec1(h, e1) - - return h - - -class CascadedNet(nn.Module): - def __init__(self, n_fft, nout=32, nout_lstm=128): - super(CascadedNet, self).__init__() - - self.max_bin = n_fft // 2 - self.output_bin = n_fft // 2 + 1 - self.nin_lstm = self.max_bin // 2 - self.offset = 64 - - self.stg1_low_band_net = nn.Sequential( - BaseNet(2, nout // 2, self.nin_lstm // 2, nout_lstm), - layers_new.Conv2DBNActiv(nout // 2, nout // 4, 1, 1, 0), - ) - - self.stg1_high_band_net = BaseNet( - 2, nout // 4, self.nin_lstm // 2, nout_lstm // 2 - ) - - self.stg2_low_band_net = nn.Sequential( - BaseNet(nout // 4 + 2, nout, self.nin_lstm // 2, nout_lstm), - layers_new.Conv2DBNActiv(nout, nout // 2, 1, 1, 0), - ) - self.stg2_high_band_net = BaseNet( - nout // 4 + 2, nout // 2, self.nin_lstm // 2, nout_lstm // 2 - ) - - self.stg3_full_band_net = BaseNet( - 3 * nout // 4 + 2, nout, self.nin_lstm, nout_lstm - ) - - self.out = nn.Conv2d(nout, 2, 1, bias=False) - self.aux_out = nn.Conv2d(3 * nout // 4, 2, 1, bias=False) - - def forward(self, x): - x = x[:, :, : self.max_bin] - - bandw = x.size()[2] // 2 - l1_in = x[:, :, :bandw] - h1_in = x[:, :, bandw:] - l1 = self.stg1_low_band_net(l1_in) - h1 = self.stg1_high_band_net(h1_in) - aux1 = torch.cat([l1, h1], dim=2) - - l2_in = torch.cat([l1_in, l1], dim=1) - h2_in = torch.cat([h1_in, h1], dim=1) - l2 = self.stg2_low_band_net(l2_in) - h2 = self.stg2_high_band_net(h2_in) - aux2 = torch.cat([l2, h2], dim=2) - - f3_in = torch.cat([x, aux1, aux2], dim=1) - f3 = self.stg3_full_band_net(f3_in) - - mask = torch.sigmoid(self.out(f3)) - mask = F.pad( - input=mask, - pad=(0, 0, 0, self.output_bin - mask.size()[2]), - mode="replicate", - ) - - if self.training: - aux = torch.cat([aux1, aux2], dim=1) - aux = torch.sigmoid(self.aux_out(aux)) - aux = F.pad( - input=aux, - pad=(0, 0, 0, self.output_bin - aux.size()[2]), - mode="replicate", - ) - return mask, aux - else: - return mask - - def predict_mask(self, x): - mask = self.forward(x) - - if self.offset > 0: - mask = mask[:, :, :, self.offset : -self.offset] - assert mask.size()[3] > 0 - - return mask - - def predict(self, x, aggressiveness=None): - mask = self.forward(x) - pred_mag = x * mask - - if self.offset > 
0: - pred_mag = pred_mag[:, :, :, self.offset : -self.offset] - assert pred_mag.size()[3] > 0 - - return pred_mag diff --git a/infer/lib/uvr5_pack/lib_v5/spec_utils.py b/infer/lib/uvr5_pack/lib_v5/spec_utils.py deleted file mode 100644 index a9634fd51ff47bf90211839231774719154c37cf..0000000000000000000000000000000000000000 --- a/infer/lib/uvr5_pack/lib_v5/spec_utils.py +++ /dev/null @@ -1,672 +0,0 @@ -import hashlib -import json -import math -import os - -import librosa -import numpy as np -import soundfile as sf -from tqdm import tqdm - - -def crop_center(h1, h2): - h1_shape = h1.size() - h2_shape = h2.size() - - if h1_shape[3] == h2_shape[3]: - return h1 - elif h1_shape[3] < h2_shape[3]: - raise ValueError("h1_shape[3] must be greater than h2_shape[3]") - - # s_freq = (h2_shape[2] - h1_shape[2]) // 2 - # e_freq = s_freq + h1_shape[2] - s_time = (h1_shape[3] - h2_shape[3]) // 2 - e_time = s_time + h2_shape[3] - h1 = h1[:, :, :, s_time:e_time] - - return h1 - - -def wave_to_spectrogram( - wave, hop_length, n_fft, mid_side=False, mid_side_b2=False, reverse=False -): - if reverse: - wave_left = np.flip(np.asfortranarray(wave[0])) - wave_right = np.flip(np.asfortranarray(wave[1])) - elif mid_side: - wave_left = np.asfortranarray(np.add(wave[0], wave[1]) / 2) - wave_right = np.asfortranarray(np.subtract(wave[0], wave[1])) - elif mid_side_b2: - wave_left = np.asfortranarray(np.add(wave[1], wave[0] * 0.5)) - wave_right = np.asfortranarray(np.subtract(wave[0], wave[1] * 0.5)) - else: - wave_left = np.asfortranarray(wave[0]) - wave_right = np.asfortranarray(wave[1]) - - spec_left = librosa.stft(wave_left, n_fft, hop_length=hop_length) - spec_right = librosa.stft(wave_right, n_fft, hop_length=hop_length) - - spec = np.asfortranarray([spec_left, spec_right]) - - return spec - - -def wave_to_spectrogram_mt( - wave, hop_length, n_fft, mid_side=False, mid_side_b2=False, reverse=False -): - import threading - - if reverse: - wave_left = np.flip(np.asfortranarray(wave[0])) - wave_right = np.flip(np.asfortranarray(wave[1])) - elif mid_side: - wave_left = np.asfortranarray(np.add(wave[0], wave[1]) / 2) - wave_right = np.asfortranarray(np.subtract(wave[0], wave[1])) - elif mid_side_b2: - wave_left = np.asfortranarray(np.add(wave[1], wave[0] * 0.5)) - wave_right = np.asfortranarray(np.subtract(wave[0], wave[1] * 0.5)) - else: - wave_left = np.asfortranarray(wave[0]) - wave_right = np.asfortranarray(wave[1]) - - def run_thread(**kwargs): - global spec_left - spec_left = librosa.stft(**kwargs) - - thread = threading.Thread( - target=run_thread, - kwargs={"y": wave_left, "n_fft": n_fft, "hop_length": hop_length}, - ) - thread.start() - spec_right = librosa.stft(wave_right, n_fft, hop_length=hop_length) - thread.join() - - spec = np.asfortranarray([spec_left, spec_right]) - - return spec - - -def combine_spectrograms(specs, mp): - l = min([specs[i].shape[2] for i in specs]) - spec_c = np.zeros(shape=(2, mp.param["bins"] + 1, l), dtype=np.complex64) - offset = 0 - bands_n = len(mp.param["band"]) - - for d in range(1, bands_n + 1): - h = mp.param["band"][d]["crop_stop"] - mp.param["band"][d]["crop_start"] - spec_c[:, offset : offset + h, :l] = specs[d][ - :, mp.param["band"][d]["crop_start"] : mp.param["band"][d]["crop_stop"], :l - ] - offset += h - - if offset > mp.param["bins"]: - raise ValueError("Too much bins") - - # lowpass fiter - if ( - mp.param["pre_filter_start"] > 0 - ): # and mp.param['band'][bands_n]['res_type'] in ['scipy', 'polyphase']: - if bands_n == 1: - spec_c = fft_lp_filter( - spec_c, 
mp.param["pre_filter_start"], mp.param["pre_filter_stop"] - ) - else: - gp = 1 - for b in range( - mp.param["pre_filter_start"] + 1, mp.param["pre_filter_stop"] - ): - g = math.pow( - 10, -(b - mp.param["pre_filter_start"]) * (3.5 - gp) / 20.0 - ) - gp = g - spec_c[:, b, :] *= g - - return np.asfortranarray(spec_c) - - -def spectrogram_to_image(spec, mode="magnitude"): - if mode == "magnitude": - if np.iscomplexobj(spec): - y = np.abs(spec) - else: - y = spec - y = np.log10(y**2 + 1e-8) - elif mode == "phase": - if np.iscomplexobj(spec): - y = np.angle(spec) - else: - y = spec - - y -= y.min() - y *= 255 / y.max() - img = np.uint8(y) - - if y.ndim == 3: - img = img.transpose(1, 2, 0) - img = np.concatenate([np.max(img, axis=2, keepdims=True), img], axis=2) - - return img - - -def reduce_vocal_aggressively(X, y, softmask): - v = X - y - y_mag_tmp = np.abs(y) - v_mag_tmp = np.abs(v) - - v_mask = v_mag_tmp > y_mag_tmp - y_mag = np.clip(y_mag_tmp - v_mag_tmp * v_mask * softmask, 0, np.inf) - - return y_mag * np.exp(1.0j * np.angle(y)) - - -def mask_silence(mag, ref, thres=0.2, min_range=64, fade_size=32): - if min_range < fade_size * 2: - raise ValueError("min_range must be >= fade_area * 2") - - mag = mag.copy() - - idx = np.where(ref.mean(axis=(0, 1)) < thres)[0] - starts = np.insert(idx[np.where(np.diff(idx) != 1)[0] + 1], 0, idx[0]) - ends = np.append(idx[np.where(np.diff(idx) != 1)[0]], idx[-1]) - uninformative = np.where(ends - starts > min_range)[0] - if len(uninformative) > 0: - starts = starts[uninformative] - ends = ends[uninformative] - old_e = None - for s, e in zip(starts, ends): - if old_e is not None and s - old_e < fade_size: - s = old_e - fade_size * 2 - - if s != 0: - weight = np.linspace(0, 1, fade_size) - mag[:, :, s : s + fade_size] += weight * ref[:, :, s : s + fade_size] - else: - s -= fade_size - - if e != mag.shape[2]: - weight = np.linspace(1, 0, fade_size) - mag[:, :, e - fade_size : e] += weight * ref[:, :, e - fade_size : e] - else: - e += fade_size - - mag[:, :, s + fade_size : e - fade_size] += ref[ - :, :, s + fade_size : e - fade_size - ] - old_e = e - - return mag - - -def align_wave_head_and_tail(a, b): - l = min([a[0].size, b[0].size]) - - return a[:l, :l], b[:l, :l] - - -def cache_or_load(mix_path, inst_path, mp): - mix_basename = os.path.splitext(os.path.basename(mix_path))[0] - inst_basename = os.path.splitext(os.path.basename(inst_path))[0] - - cache_dir = "mph{}".format( - hashlib.sha1(json.dumps(mp.param, sort_keys=True).encode("utf-8")).hexdigest() - ) - mix_cache_dir = os.path.join("cache", cache_dir) - inst_cache_dir = os.path.join("cache", cache_dir) - - os.makedirs(mix_cache_dir, exist_ok=True) - os.makedirs(inst_cache_dir, exist_ok=True) - - mix_cache_path = os.path.join(mix_cache_dir, mix_basename + ".npy") - inst_cache_path = os.path.join(inst_cache_dir, inst_basename + ".npy") - - if os.path.exists(mix_cache_path) and os.path.exists(inst_cache_path): - X_spec_m = np.load(mix_cache_path) - y_spec_m = np.load(inst_cache_path) - else: - X_wave, y_wave, X_spec_s, y_spec_s = {}, {}, {}, {} - - for d in range(len(mp.param["band"]), 0, -1): - bp = mp.param["band"][d] - - if d == len(mp.param["band"]): # high-end band - X_wave[d], _ = librosa.load( - mix_path, bp["sr"], False, dtype=np.float32, res_type=bp["res_type"] - ) - y_wave[d], _ = librosa.load( - inst_path, - bp["sr"], - False, - dtype=np.float32, - res_type=bp["res_type"], - ) - else: # lower bands - X_wave[d] = librosa.resample( - X_wave[d + 1], - mp.param["band"][d + 1]["sr"], - bp["sr"], - 
res_type=bp["res_type"], - ) - y_wave[d] = librosa.resample( - y_wave[d + 1], - mp.param["band"][d + 1]["sr"], - bp["sr"], - res_type=bp["res_type"], - ) - - X_wave[d], y_wave[d] = align_wave_head_and_tail(X_wave[d], y_wave[d]) - - X_spec_s[d] = wave_to_spectrogram( - X_wave[d], - bp["hl"], - bp["n_fft"], - mp.param["mid_side"], - mp.param["mid_side_b2"], - mp.param["reverse"], - ) - y_spec_s[d] = wave_to_spectrogram( - y_wave[d], - bp["hl"], - bp["n_fft"], - mp.param["mid_side"], - mp.param["mid_side_b2"], - mp.param["reverse"], - ) - - del X_wave, y_wave - - X_spec_m = combine_spectrograms(X_spec_s, mp) - y_spec_m = combine_spectrograms(y_spec_s, mp) - - if X_spec_m.shape != y_spec_m.shape: - raise ValueError("The combined spectrograms are different: " + mix_path) - - _, ext = os.path.splitext(mix_path) - - np.save(mix_cache_path, X_spec_m) - np.save(inst_cache_path, y_spec_m) - - return X_spec_m, y_spec_m - - -def spectrogram_to_wave(spec, hop_length, mid_side, mid_side_b2, reverse): - spec_left = np.asfortranarray(spec[0]) - spec_right = np.asfortranarray(spec[1]) - - wave_left = librosa.istft(spec_left, hop_length=hop_length) - wave_right = librosa.istft(spec_right, hop_length=hop_length) - - if reverse: - return np.asfortranarray([np.flip(wave_left), np.flip(wave_right)]) - elif mid_side: - return np.asfortranarray( - [np.add(wave_left, wave_right / 2), np.subtract(wave_left, wave_right / 2)] - ) - elif mid_side_b2: - return np.asfortranarray( - [ - np.add(wave_right / 1.25, 0.4 * wave_left), - np.subtract(wave_left / 1.25, 0.4 * wave_right), - ] - ) - else: - return np.asfortranarray([wave_left, wave_right]) - - -def spectrogram_to_wave_mt(spec, hop_length, mid_side, reverse, mid_side_b2): - import threading - - spec_left = np.asfortranarray(spec[0]) - spec_right = np.asfortranarray(spec[1]) - - def run_thread(**kwargs): - global wave_left - wave_left = librosa.istft(**kwargs) - - thread = threading.Thread( - target=run_thread, kwargs={"stft_matrix": spec_left, "hop_length": hop_length} - ) - thread.start() - wave_right = librosa.istft(spec_right, hop_length=hop_length) - thread.join() - - if reverse: - return np.asfortranarray([np.flip(wave_left), np.flip(wave_right)]) - elif mid_side: - return np.asfortranarray( - [np.add(wave_left, wave_right / 2), np.subtract(wave_left, wave_right / 2)] - ) - elif mid_side_b2: - return np.asfortranarray( - [ - np.add(wave_right / 1.25, 0.4 * wave_left), - np.subtract(wave_left / 1.25, 0.4 * wave_right), - ] - ) - else: - return np.asfortranarray([wave_left, wave_right]) - - -def cmb_spectrogram_to_wave(spec_m, mp, extra_bins_h=None, extra_bins=None): - wave_band = {} - bands_n = len(mp.param["band"]) - offset = 0 - - for d in range(1, bands_n + 1): - bp = mp.param["band"][d] - spec_s = np.ndarray( - shape=(2, bp["n_fft"] // 2 + 1, spec_m.shape[2]), dtype=complex - ) - h = bp["crop_stop"] - bp["crop_start"] - spec_s[:, bp["crop_start"] : bp["crop_stop"], :] = spec_m[ - :, offset : offset + h, : - ] - - offset += h - if d == bands_n: # higher - if extra_bins_h: # if --high_end_process bypass - max_bin = bp["n_fft"] // 2 - spec_s[:, max_bin - extra_bins_h : max_bin, :] = extra_bins[ - :, :extra_bins_h, : - ] - if bp["hpf_start"] > 0: - spec_s = fft_hp_filter(spec_s, bp["hpf_start"], bp["hpf_stop"] - 1) - if bands_n == 1: - wave = spectrogram_to_wave( - spec_s, - bp["hl"], - mp.param["mid_side"], - mp.param["mid_side_b2"], - mp.param["reverse"], - ) - else: - wave = np.add( - wave, - spectrogram_to_wave( - spec_s, - bp["hl"], - mp.param["mid_side"], 
- mp.param["mid_side_b2"], - mp.param["reverse"], - ), - ) - else: - sr = mp.param["band"][d + 1]["sr"] - if d == 1: # lower - spec_s = fft_lp_filter(spec_s, bp["lpf_start"], bp["lpf_stop"]) - wave = librosa.resample( - spectrogram_to_wave( - spec_s, - bp["hl"], - mp.param["mid_side"], - mp.param["mid_side_b2"], - mp.param["reverse"], - ), - bp["sr"], - sr, - res_type="sinc_fastest", - ) - else: # mid - spec_s = fft_hp_filter(spec_s, bp["hpf_start"], bp["hpf_stop"] - 1) - spec_s = fft_lp_filter(spec_s, bp["lpf_start"], bp["lpf_stop"]) - wave2 = np.add( - wave, - spectrogram_to_wave( - spec_s, - bp["hl"], - mp.param["mid_side"], - mp.param["mid_side_b2"], - mp.param["reverse"], - ), - ) - # wave = librosa.core.resample(wave2, bp['sr'], sr, res_type="sinc_fastest") - wave = librosa.core.resample(wave2, bp["sr"], sr, res_type="scipy") - - return wave.T - - -def fft_lp_filter(spec, bin_start, bin_stop): - g = 1.0 - for b in range(bin_start, bin_stop): - g -= 1 / (bin_stop - bin_start) - spec[:, b, :] = g * spec[:, b, :] - - spec[:, bin_stop:, :] *= 0 - - return spec - - -def fft_hp_filter(spec, bin_start, bin_stop): - g = 1.0 - for b in range(bin_start, bin_stop, -1): - g -= 1 / (bin_start - bin_stop) - spec[:, b, :] = g * spec[:, b, :] - - spec[:, 0 : bin_stop + 1, :] *= 0 - - return spec - - -def mirroring(a, spec_m, input_high_end, mp): - if "mirroring" == a: - mirror = np.flip( - np.abs( - spec_m[ - :, - mp.param["pre_filter_start"] - - 10 - - input_high_end.shape[1] : mp.param["pre_filter_start"] - - 10, - :, - ] - ), - 1, - ) - mirror = mirror * np.exp(1.0j * np.angle(input_high_end)) - - return np.where( - np.abs(input_high_end) <= np.abs(mirror), input_high_end, mirror - ) - - if "mirroring2" == a: - mirror = np.flip( - np.abs( - spec_m[ - :, - mp.param["pre_filter_start"] - - 10 - - input_high_end.shape[1] : mp.param["pre_filter_start"] - - 10, - :, - ] - ), - 1, - ) - mi = np.multiply(mirror, input_high_end * 1.7) - - return np.where(np.abs(input_high_end) <= np.abs(mi), input_high_end, mi) - - -def ensembling(a, specs): - for i in range(1, len(specs)): - if i == 1: - spec = specs[0] - - ln = min([spec.shape[2], specs[i].shape[2]]) - spec = spec[:, :, :ln] - specs[i] = specs[i][:, :, :ln] - - if "min_mag" == a: - spec = np.where(np.abs(specs[i]) <= np.abs(spec), specs[i], spec) - if "max_mag" == a: - spec = np.where(np.abs(specs[i]) >= np.abs(spec), specs[i], spec) - - return spec - - -def stft(wave, nfft, hl): - wave_left = np.asfortranarray(wave[0]) - wave_right = np.asfortranarray(wave[1]) - spec_left = librosa.stft(wave_left, nfft, hop_length=hl) - spec_right = librosa.stft(wave_right, nfft, hop_length=hl) - spec = np.asfortranarray([spec_left, spec_right]) - - return spec - - -def istft(spec, hl): - spec_left = np.asfortranarray(spec[0]) - spec_right = np.asfortranarray(spec[1]) - - wave_left = librosa.istft(spec_left, hop_length=hl) - wave_right = librosa.istft(spec_right, hop_length=hl) - wave = np.asfortranarray([wave_left, wave_right]) - - -if __name__ == "__main__": - import argparse - import sys - import time - - import cv2 - from model_param_init import ModelParameters - - p = argparse.ArgumentParser() - p.add_argument( - "--algorithm", - "-a", - type=str, - choices=["invert", "invert_p", "min_mag", "max_mag", "deep", "align"], - default="min_mag", - ) - p.add_argument( - "--model_params", - "-m", - type=str, - default=os.path.join("modelparams", "1band_sr44100_hl512.json"), - ) - p.add_argument("--output_name", "-o", type=str, default="output") - 
p.add_argument("--vocals_only", "-v", action="store_true") - p.add_argument("input", nargs="+") - args = p.parse_args() - - start_time = time.time() - - if args.algorithm.startswith("invert") and len(args.input) != 2: - raise ValueError("There should be two input files.") - - if not args.algorithm.startswith("invert") and len(args.input) < 2: - raise ValueError("There must be at least two input files.") - - wave, specs = {}, {} - mp = ModelParameters(args.model_params) - - for i in range(len(args.input)): - spec = {} - - for d in range(len(mp.param["band"]), 0, -1): - bp = mp.param["band"][d] - - if d == len(mp.param["band"]): # high-end band - wave[d], _ = librosa.load( - args.input[i], - bp["sr"], - False, - dtype=np.float32, - res_type=bp["res_type"], - ) - - if len(wave[d].shape) == 1: # mono to stereo - wave[d] = np.array([wave[d], wave[d]]) - else: # lower bands - wave[d] = librosa.resample( - wave[d + 1], - mp.param["band"][d + 1]["sr"], - bp["sr"], - res_type=bp["res_type"], - ) - - spec[d] = wave_to_spectrogram( - wave[d], - bp["hl"], - bp["n_fft"], - mp.param["mid_side"], - mp.param["mid_side_b2"], - mp.param["reverse"], - ) - - specs[i] = combine_spectrograms(spec, mp) - - del wave - - if args.algorithm == "deep": - d_spec = np.where(np.abs(specs[0]) <= np.abs(spec[1]), specs[0], spec[1]) - v_spec = d_spec - specs[1] - sf.write( - os.path.join("{}.wav".format(args.output_name)), - cmb_spectrogram_to_wave(v_spec, mp), - mp.param["sr"], - ) - - if args.algorithm.startswith("invert"): - ln = min([specs[0].shape[2], specs[1].shape[2]]) - specs[0] = specs[0][:, :, :ln] - specs[1] = specs[1][:, :, :ln] - - if "invert_p" == args.algorithm: - X_mag = np.abs(specs[0]) - y_mag = np.abs(specs[1]) - max_mag = np.where(X_mag >= y_mag, X_mag, y_mag) - v_spec = specs[1] - max_mag * np.exp(1.0j * np.angle(specs[0])) - else: - specs[1] = reduce_vocal_aggressively(specs[0], specs[1], 0.2) - v_spec = specs[0] - specs[1] - - if not args.vocals_only: - X_mag = np.abs(specs[0]) - y_mag = np.abs(specs[1]) - v_mag = np.abs(v_spec) - - X_image = spectrogram_to_image(X_mag) - y_image = spectrogram_to_image(y_mag) - v_image = spectrogram_to_image(v_mag) - - cv2.imwrite("{}_X.png".format(args.output_name), X_image) - cv2.imwrite("{}_y.png".format(args.output_name), y_image) - cv2.imwrite("{}_v.png".format(args.output_name), v_image) - - sf.write( - "{}_X.wav".format(args.output_name), - cmb_spectrogram_to_wave(specs[0], mp), - mp.param["sr"], - ) - sf.write( - "{}_y.wav".format(args.output_name), - cmb_spectrogram_to_wave(specs[1], mp), - mp.param["sr"], - ) - - sf.write( - "{}_v.wav".format(args.output_name), - cmb_spectrogram_to_wave(v_spec, mp), - mp.param["sr"], - ) - else: - if not args.algorithm == "deep": - sf.write( - os.path.join("ensembled", "{}.wav".format(args.output_name)), - cmb_spectrogram_to_wave(ensembling(args.algorithm, specs), mp), - mp.param["sr"], - ) - - if args.algorithm == "align": - trackalignment = [ - { - "file1": '"{}"'.format(args.input[0]), - "file2": '"{}"'.format(args.input[1]), - } - ] - - for i, e in tqdm(enumerate(trackalignment), desc="Performing Alignment..."): - os.system(f"python lib/align_tracks.py {e['file1']} {e['file2']}") - - # print('Total time: {0:.{1}f}s'.format(time.time() - start_time, 1)) diff --git a/infer/lib/uvr5_pack/name_params.json b/infer/lib/uvr5_pack/name_params.json deleted file mode 100644 index 8ed51a68370607a7a8693b99cfb35fc5d92b04af..0000000000000000000000000000000000000000 --- a/infer/lib/uvr5_pack/name_params.json +++ /dev/null @@ -1,263 
+0,0 @@ -{ - "equivalent" : [ - { - "model_hash_name" : [ - { - "hash_name": "47939caf0cfe52a0e81442b85b971dfd", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json", - "param_name": "4band_44100" - }, - { - "hash_name": "4e4ecb9764c50a8c414fee6e10395bbe", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2.json", - "param_name": "4band_v2" - }, - { - "hash_name": "ca106edd563e034bde0bdec4bb7a4b36", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2.json", - "param_name": "4band_v2" - }, - { - "hash_name": "e60a1e84803ce4efc0a6551206cc4b71", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json", - "param_name": "4band_44100" - }, - { - "hash_name": "a82f14e75892e55e994376edbf0c8435", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json", - "param_name": "4band_44100" - }, - { - "hash_name": "6dd9eaa6f0420af9f1d403aaafa4cc06", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2_sn.json", - "param_name": "4band_v2_sn" - }, - { - "hash_name": "08611fb99bd59eaa79ad27c58d137727", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2_sn.json", - "param_name": "4band_v2_sn" - }, - { - "hash_name": "5c7bbca45a187e81abbbd351606164e5", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json", - "param_name": "3band_44100_msb2" - }, - { - "hash_name": "d6b2cb685a058a091e5e7098192d3233", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json", - "param_name": "3band_44100_msb2" - }, - { - "hash_name": "c1b9f38170a7c90e96f027992eb7c62b", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json", - "param_name": "4band_44100" - }, - { - "hash_name": "c3448ec923fa0edf3d03a19e633faa53", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json", - "param_name": "4band_44100" - }, - { - "hash_name": "68aa2c8093d0080704b200d140f59e54", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100.json", - "param_name": "3band_44100" - }, - { - "hash_name": "fdc83be5b798e4bd29fe00fe6600e147", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_mid.json", - "param_name": "3band_44100_mid.json" - }, - { - "hash_name": "2ce34bc92fd57f55db16b7a4def3d745", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_mid.json", - "param_name": "3band_44100_mid.json" - }, - { - "hash_name": "52fdca89576f06cf4340b74a4730ee5f", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json", - "param_name": "4band_44100.json" - }, - { - "hash_name": "41191165b05d38fc77f072fa9e8e8a30", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json", - "param_name": "4band_44100.json" - }, - { - "hash_name": "89e83b511ad474592689e562d5b1f80e", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/2band_32000.json", - "param_name": "2band_32000.json" - }, - { - "hash_name": "0b954da81d453b716b114d6d7c95177f", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/2band_32000.json", - "param_name": "2band_32000.json" - } - - ], - "v4 Models": [ - { - "hash_name": "6a00461c51c2920fd68937d4609ed6c8", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json", - "param_name": "1band_sr16000_hl512" - }, - { - "hash_name": "0ab504864d20f1bd378fe9c81ef37140", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json", - "param_name": "1band_sr32000_hl512" - }, - { - "hash_name": 
"7dd21065bf91c10f7fccb57d7d83b07f", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json", - "param_name": "1band_sr32000_hl512" - }, - { - "hash_name": "80ab74d65e515caa3622728d2de07d23", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json", - "param_name": "1band_sr32000_hl512" - }, - { - "hash_name": "edc115e7fc523245062200c00caa847f", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json", - "param_name": "1band_sr33075_hl384" - }, - { - "hash_name": "28063e9f6ab5b341c5f6d3c67f2045b7", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json", - "param_name": "1band_sr33075_hl384" - }, - { - "hash_name": "b58090534c52cbc3e9b5104bad666ef2", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json", - "param_name": "1band_sr44100_hl512" - }, - { - "hash_name": "0cdab9947f1b0928705f518f3c78ea8f", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json", - "param_name": "1band_sr44100_hl512" - }, - { - "hash_name": "ae702fed0238afb5346db8356fe25f13", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json", - "param_name": "1band_sr44100_hl1024" - } - ] - } - ], - "User Models" : [ - { - "1 Band": [ - { - "hash_name": "1band_sr16000_hl512", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json", - "param_name": "1band_sr16000_hl512" - }, - { - "hash_name": "1band_sr32000_hl512", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json", - "param_name": "1band_sr16000_hl512" - }, - { - "hash_name": "1band_sr33075_hl384", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json", - "param_name": "1band_sr33075_hl384" - }, - { - "hash_name": "1band_sr44100_hl256", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl256.json", - "param_name": "1band_sr44100_hl256" - }, - { - "hash_name": "1band_sr44100_hl512", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json", - "param_name": "1band_sr44100_hl512" - }, - { - "hash_name": "1band_sr44100_hl1024", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json", - "param_name": "1band_sr44100_hl1024" - } - ], - "2 Band": [ - { - "hash_name": "2band_44100_lofi", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/2band_44100_lofi.json", - "param_name": "2band_44100_lofi" - }, - { - "hash_name": "2band_32000", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/2band_32000.json", - "param_name": "2band_32000" - }, - { - "hash_name": "2band_48000", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/2band_48000.json", - "param_name": "2band_48000" - } - ], - "3 Band": [ - { - "hash_name": "3band_44100", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100.json", - "param_name": "3band_44100" - }, - { - "hash_name": "3band_44100_mid", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_mid.json", - "param_name": "3band_44100_mid" - }, - { - "hash_name": "3band_44100_msb2", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json", - "param_name": "3band_44100_msb2" - } - ], - "4 Band": [ - { - "hash_name": "4band_44100", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json", - "param_name": "4band_44100" - }, - { - "hash_name": "4band_44100_mid", - "model_params": 
"infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_mid.json", - "param_name": "4band_44100_mid" - }, - { - "hash_name": "4band_44100_msb", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb.json", - "param_name": "4band_44100_msb" - }, - { - "hash_name": "4band_44100_msb2", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb2.json", - "param_name": "4band_44100_msb2" - }, - { - "hash_name": "4band_44100_reverse", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_reverse.json", - "param_name": "4band_44100_reverse" - }, - { - "hash_name": "4band_44100_sw", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_sw.json", - "param_name": "4band_44100_sw" - }, - { - "hash_name": "4band_v2", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2.json", - "param_name": "4band_v2" - }, - { - "hash_name": "4band_v2_sn", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2_sn.json", - "param_name": "4band_v2_sn" - }, - { - "hash_name": "tmodelparam", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/tmodelparam.json", - "param_name": "User Model Param Set" - } - ] - } - ] -} \ No newline at end of file diff --git a/infer/lib/uvr5_pack/utils.py b/infer/lib/uvr5_pack/utils.py deleted file mode 100644 index f4805cdb25e7c50611412a19340ad525d1251d7b..0000000000000000000000000000000000000000 --- a/infer/lib/uvr5_pack/utils.py +++ /dev/null @@ -1,121 +0,0 @@ -import json - -import numpy as np -import torch -from tqdm import tqdm - - -def load_data(file_name: str = "./infer/lib/uvr5_pack/name_params.json") -> dict: - with open(file_name, "r") as f: - data = json.load(f) - - return data - - -def make_padding(width, cropsize, offset): - left = offset - roi_size = cropsize - left * 2 - if roi_size == 0: - roi_size = cropsize - right = roi_size - (width % roi_size) + left - - return left, right, roi_size - - -def inference(X_spec, device, model, aggressiveness, data): - """ - data : dic configs - """ - - def _execute( - X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half=True - ): - model.eval() - with torch.no_grad(): - preds = [] - - iterations = [n_window] - - total_iterations = sum(iterations) - for i in tqdm(range(n_window)): - start = i * roi_size - X_mag_window = X_mag_pad[ - None, :, :, start : start + data["window_size"] - ] - X_mag_window = torch.from_numpy(X_mag_window) - if is_half: - X_mag_window = X_mag_window.half() - X_mag_window = X_mag_window.to(device) - - pred = model.predict(X_mag_window, aggressiveness) - - pred = pred.detach().cpu().numpy() - preds.append(pred[0]) - - pred = np.concatenate(preds, axis=2) - return pred - - def preprocess(X_spec): - X_mag = np.abs(X_spec) - X_phase = np.angle(X_spec) - - return X_mag, X_phase - - X_mag, X_phase = preprocess(X_spec) - - coef = X_mag.max() - X_mag_pre = X_mag / coef - - n_frame = X_mag_pre.shape[2] - pad_l, pad_r, roi_size = make_padding(n_frame, data["window_size"], model.offset) - n_window = int(np.ceil(n_frame / roi_size)) - - X_mag_pad = np.pad(X_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode="constant") - - if list(model.state_dict().values())[0].dtype == torch.float16: - is_half = True - else: - is_half = False - pred = _execute( - X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half - ) - pred = pred[:, :, :n_frame] - - if data["tta"]: - pad_l += roi_size // 2 - pad_r += roi_size // 2 - n_window += 1 - - X_mag_pad = np.pad(X_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode="constant") - 
- pred_tta = _execute( - X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half - ) - pred_tta = pred_tta[:, :, roi_size // 2 :] - pred_tta = pred_tta[:, :, :n_frame] - - return (pred + pred_tta) * 0.5 * coef, X_mag, np.exp(1.0j * X_phase) - else: - return pred * coef, X_mag, np.exp(1.0j * X_phase) - - -def _get_name_params(model_path, model_hash): - data = load_data() - flag = False - ModelName = model_path - for type in list(data): - for model in list(data[type][0]): - for i in range(len(data[type][0][model])): - if str(data[type][0][model][i]["hash_name"]) == model_hash: - flag = True - elif str(data[type][0][model][i]["hash_name"]) in ModelName: - flag = True - - if flag: - model_params_auto = data[type][0][model][i]["model_params"] - param_name_auto = data[type][0][model][i]["param_name"] - if type == "equivalent": - return param_name_auto, model_params_auto - else: - flag = False - return param_name_auto, model_params_auto diff --git a/infer/modules/ipex/__init__.py b/infer/modules/ipex/__init__.py deleted file mode 100644 index cd27bc172f28a20a0378f8e91e4fa463d4118a72..0000000000000000000000000000000000000000 --- a/infer/modules/ipex/__init__.py +++ /dev/null @@ -1,190 +0,0 @@ -import os -import sys -import contextlib -import torch -import intel_extension_for_pytorch as ipex # pylint: disable=import-error, unused-import -from .hijacks import ipex_hijacks -from .attention import attention_init - -# pylint: disable=protected-access, missing-function-docstring, line-too-long - - -def ipex_init(): # pylint: disable=too-many-statements - try: - # Replace cuda with xpu: - torch.cuda.current_device = torch.xpu.current_device - torch.cuda.current_stream = torch.xpu.current_stream - torch.cuda.device = torch.xpu.device - torch.cuda.device_count = torch.xpu.device_count - torch.cuda.device_of = torch.xpu.device_of - torch.cuda.get_device_name = torch.xpu.get_device_name - torch.cuda.get_device_properties = torch.xpu.get_device_properties - torch.cuda.init = torch.xpu.init - torch.cuda.is_available = torch.xpu.is_available - torch.cuda.is_initialized = torch.xpu.is_initialized - torch.cuda.is_current_stream_capturing = lambda: False - torch.cuda.set_device = torch.xpu.set_device - torch.cuda.stream = torch.xpu.stream - torch.cuda.synchronize = torch.xpu.synchronize - torch.cuda.Event = torch.xpu.Event - torch.cuda.Stream = torch.xpu.Stream - torch.cuda.FloatTensor = torch.xpu.FloatTensor - torch.Tensor.cuda = torch.Tensor.xpu - torch.Tensor.is_cuda = torch.Tensor.is_xpu - torch.cuda._initialization_lock = torch.xpu.lazy_init._initialization_lock - torch.cuda._initialized = torch.xpu.lazy_init._initialized - torch.cuda._lazy_seed_tracker = torch.xpu.lazy_init._lazy_seed_tracker - torch.cuda._queued_calls = torch.xpu.lazy_init._queued_calls - torch.cuda._tls = torch.xpu.lazy_init._tls - torch.cuda.threading = torch.xpu.lazy_init.threading - torch.cuda.traceback = torch.xpu.lazy_init.traceback - torch.cuda.Optional = torch.xpu.Optional - torch.cuda.__cached__ = torch.xpu.__cached__ - torch.cuda.__loader__ = torch.xpu.__loader__ - torch.cuda.ComplexFloatStorage = torch.xpu.ComplexFloatStorage - torch.cuda.Tuple = torch.xpu.Tuple - torch.cuda.streams = torch.xpu.streams - torch.cuda._lazy_new = torch.xpu._lazy_new - torch.cuda.FloatStorage = torch.xpu.FloatStorage - torch.cuda.Any = torch.xpu.Any - torch.cuda.__doc__ = torch.xpu.__doc__ - torch.cuda.default_generators = torch.xpu.default_generators - torch.cuda.HalfTensor = torch.xpu.HalfTensor - torch.cuda._get_device_index = 
torch.xpu._get_device_index - torch.cuda.__path__ = torch.xpu.__path__ - torch.cuda.Device = torch.xpu.Device - torch.cuda.IntTensor = torch.xpu.IntTensor - torch.cuda.ByteStorage = torch.xpu.ByteStorage - torch.cuda.set_stream = torch.xpu.set_stream - torch.cuda.BoolStorage = torch.xpu.BoolStorage - torch.cuda.os = torch.xpu.os - torch.cuda.torch = torch.xpu.torch - torch.cuda.BFloat16Storage = torch.xpu.BFloat16Storage - torch.cuda.Union = torch.xpu.Union - torch.cuda.DoubleTensor = torch.xpu.DoubleTensor - torch.cuda.ShortTensor = torch.xpu.ShortTensor - torch.cuda.LongTensor = torch.xpu.LongTensor - torch.cuda.IntStorage = torch.xpu.IntStorage - torch.cuda.LongStorage = torch.xpu.LongStorage - torch.cuda.__annotations__ = torch.xpu.__annotations__ - torch.cuda.__package__ = torch.xpu.__package__ - torch.cuda.__builtins__ = torch.xpu.__builtins__ - torch.cuda.CharTensor = torch.xpu.CharTensor - torch.cuda.List = torch.xpu.List - torch.cuda._lazy_init = torch.xpu._lazy_init - torch.cuda.BFloat16Tensor = torch.xpu.BFloat16Tensor - torch.cuda.DoubleStorage = torch.xpu.DoubleStorage - torch.cuda.ByteTensor = torch.xpu.ByteTensor - torch.cuda.StreamContext = torch.xpu.StreamContext - torch.cuda.ComplexDoubleStorage = torch.xpu.ComplexDoubleStorage - torch.cuda.ShortStorage = torch.xpu.ShortStorage - torch.cuda._lazy_call = torch.xpu._lazy_call - torch.cuda.HalfStorage = torch.xpu.HalfStorage - torch.cuda.random = torch.xpu.random - torch.cuda._device = torch.xpu._device - torch.cuda.classproperty = torch.xpu.classproperty - torch.cuda.__name__ = torch.xpu.__name__ - torch.cuda._device_t = torch.xpu._device_t - torch.cuda.warnings = torch.xpu.warnings - torch.cuda.__spec__ = torch.xpu.__spec__ - torch.cuda.BoolTensor = torch.xpu.BoolTensor - torch.cuda.CharStorage = torch.xpu.CharStorage - torch.cuda.__file__ = torch.xpu.__file__ - torch.cuda._is_in_bad_fork = torch.xpu.lazy_init._is_in_bad_fork - # torch.cuda.is_current_stream_capturing = torch.xpu.is_current_stream_capturing - - # Memory: - torch.cuda.memory = torch.xpu.memory - if "linux" in sys.platform and "WSL2" in os.popen("uname -a").read(): - torch.xpu.empty_cache = lambda: None - torch.cuda.empty_cache = torch.xpu.empty_cache - torch.cuda.memory_stats = torch.xpu.memory_stats - torch.cuda.memory_summary = torch.xpu.memory_summary - torch.cuda.memory_snapshot = torch.xpu.memory_snapshot - torch.cuda.memory_allocated = torch.xpu.memory_allocated - torch.cuda.max_memory_allocated = torch.xpu.max_memory_allocated - torch.cuda.memory_reserved = torch.xpu.memory_reserved - torch.cuda.memory_cached = torch.xpu.memory_reserved - torch.cuda.max_memory_reserved = torch.xpu.max_memory_reserved - torch.cuda.max_memory_cached = torch.xpu.max_memory_reserved - torch.cuda.reset_peak_memory_stats = torch.xpu.reset_peak_memory_stats - torch.cuda.reset_max_memory_cached = torch.xpu.reset_peak_memory_stats - torch.cuda.reset_max_memory_allocated = torch.xpu.reset_peak_memory_stats - torch.cuda.memory_stats_as_nested_dict = torch.xpu.memory_stats_as_nested_dict - torch.cuda.reset_accumulated_memory_stats = ( - torch.xpu.reset_accumulated_memory_stats - ) - - # RNG: - torch.cuda.get_rng_state = torch.xpu.get_rng_state - torch.cuda.get_rng_state_all = torch.xpu.get_rng_state_all - torch.cuda.set_rng_state = torch.xpu.set_rng_state - torch.cuda.set_rng_state_all = torch.xpu.set_rng_state_all - torch.cuda.manual_seed = torch.xpu.manual_seed - torch.cuda.manual_seed_all = torch.xpu.manual_seed_all - torch.cuda.seed = torch.xpu.seed - torch.cuda.seed_all = 
torch.xpu.seed_all - torch.cuda.initial_seed = torch.xpu.initial_seed - - # AMP: - torch.cuda.amp = torch.xpu.amp - if not hasattr(torch.cuda.amp, "common"): - torch.cuda.amp.common = contextlib.nullcontext() - torch.cuda.amp.common.amp_definitely_not_available = lambda: False - try: - torch.cuda.amp.GradScaler = torch.xpu.amp.GradScaler - except Exception: # pylint: disable=broad-exception-caught - try: - from .gradscaler import ( - gradscaler_init, - ) # pylint: disable=import-outside-toplevel, import-error - - gradscaler_init() - torch.cuda.amp.GradScaler = torch.xpu.amp.GradScaler - except Exception: # pylint: disable=broad-exception-caught - torch.cuda.amp.GradScaler = ipex.cpu.autocast._grad_scaler.GradScaler - - # C - torch._C._cuda_getCurrentRawStream = ipex._C._getCurrentStream - ipex._C._DeviceProperties.major = 2023 - ipex._C._DeviceProperties.minor = 2 - - # Fix functions with ipex: - torch.cuda.mem_get_info = lambda device=None: [ - ( - torch.xpu.get_device_properties(device).total_memory - - torch.xpu.memory_allocated(device) - ), - torch.xpu.get_device_properties(device).total_memory, - ] - torch._utils._get_available_device_type = lambda: "xpu" - torch.has_cuda = True - torch.cuda.has_half = True - torch.cuda.is_bf16_supported = lambda *args, **kwargs: True - torch.cuda.is_fp16_supported = lambda *args, **kwargs: True - torch.version.cuda = "11.7" - torch.cuda.get_device_capability = lambda *args, **kwargs: [11, 7] - torch.cuda.get_device_properties.major = 11 - torch.cuda.get_device_properties.minor = 7 - torch.cuda.ipc_collect = lambda *args, **kwargs: None - torch.cuda.utilization = lambda *args, **kwargs: 0 - if hasattr(torch.xpu, "getDeviceIdListForCard"): - torch.cuda.getDeviceIdListForCard = torch.xpu.getDeviceIdListForCard - torch.cuda.get_device_id_list_per_card = torch.xpu.getDeviceIdListForCard - else: - torch.cuda.getDeviceIdListForCard = torch.xpu.get_device_id_list_per_card - torch.cuda.get_device_id_list_per_card = ( - torch.xpu.get_device_id_list_per_card - ) - - ipex_hijacks() - attention_init() - try: - from .diffusers import ipex_diffusers - - ipex_diffusers() - except Exception: # pylint: disable=broad-exception-caught - pass - except Exception as e: - return False, e - return True, None diff --git a/infer/modules/ipex/attention.py b/infer/modules/ipex/attention.py deleted file mode 100644 index 78a4775ccf95ded03a953e07e5ffccc7bb4f29b5..0000000000000000000000000000000000000000 --- a/infer/modules/ipex/attention.py +++ /dev/null @@ -1,218 +0,0 @@ -import torch -import intel_extension_for_pytorch as ipex # pylint: disable=import-error, unused-import - -# pylint: disable=protected-access, missing-function-docstring, line-too-long - -original_torch_bmm = torch.bmm - - -def torch_bmm(input, mat2, *, out=None): - if input.dtype != mat2.dtype: - mat2 = mat2.to(input.dtype) - - # ARC GPUs can't allocate more than 4GB to a single block, Slice it: - batch_size_attention, input_tokens, mat2_shape = ( - input.shape[0], - input.shape[1], - mat2.shape[2], - ) - block_multiply = input.element_size() - slice_block_size = input_tokens * mat2_shape / 1024 / 1024 * block_multiply - block_size = batch_size_attention * slice_block_size - - split_slice_size = batch_size_attention - if block_size > 4: - do_split = True - # Find something divisible with the input_tokens - while (split_slice_size * slice_block_size) > 4: - split_slice_size = split_slice_size // 2 - if split_slice_size <= 1: - split_slice_size = 1 - break - else: - do_split = False - - split_2_slice_size = 
input_tokens - if split_slice_size * slice_block_size > 4: - slice_block_size2 = split_slice_size * mat2_shape / 1024 / 1024 * block_multiply - do_split_2 = True - # Find something divisible with the input_tokens - while (split_2_slice_size * slice_block_size2) > 4: - split_2_slice_size = split_2_slice_size // 2 - if split_2_slice_size <= 1: - split_2_slice_size = 1 - break - else: - do_split_2 = False - - if do_split: - hidden_states = torch.zeros( - input.shape[0], - input.shape[1], - mat2.shape[2], - device=input.device, - dtype=input.dtype, - ) - for i in range(batch_size_attention // split_slice_size): - start_idx = i * split_slice_size - end_idx = (i + 1) * split_slice_size - if do_split_2: - for i2 in range( - input_tokens // split_2_slice_size - ): # pylint: disable=invalid-name - start_idx_2 = i2 * split_2_slice_size - end_idx_2 = (i2 + 1) * split_2_slice_size - hidden_states[start_idx:end_idx, start_idx_2:end_idx_2] = ( - original_torch_bmm( - input[start_idx:end_idx, start_idx_2:end_idx_2], - mat2[start_idx:end_idx, start_idx_2:end_idx_2], - out=out, - ) - ) - else: - hidden_states[start_idx:end_idx] = original_torch_bmm( - input[start_idx:end_idx], mat2[start_idx:end_idx], out=out - ) - else: - return original_torch_bmm(input, mat2, out=out) - return hidden_states - - -original_scaled_dot_product_attention = torch.nn.functional.scaled_dot_product_attention - - -def scaled_dot_product_attention( - query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False -): - # ARC GPUs can't allocate more than 4GB to a single block, Slice it: - if len(query.shape) == 3: - batch_size_attention, query_tokens, shape_four = query.shape - shape_one = 1 - no_shape_one = True - else: - shape_one, batch_size_attention, query_tokens, shape_four = query.shape - no_shape_one = False - - block_multiply = query.element_size() - slice_block_size = ( - shape_one * query_tokens * shape_four / 1024 / 1024 * block_multiply - ) - block_size = batch_size_attention * slice_block_size - - split_slice_size = batch_size_attention - if block_size > 4: - do_split = True - # Find something divisible with the shape_one - while (split_slice_size * slice_block_size) > 4: - split_slice_size = split_slice_size // 2 - if split_slice_size <= 1: - split_slice_size = 1 - break - else: - do_split = False - - split_2_slice_size = query_tokens - if split_slice_size * slice_block_size > 4: - slice_block_size2 = ( - shape_one * split_slice_size * shape_four / 1024 / 1024 * block_multiply - ) - do_split_2 = True - # Find something divisible with the batch_size_attention - while (split_2_slice_size * slice_block_size2) > 4: - split_2_slice_size = split_2_slice_size // 2 - if split_2_slice_size <= 1: - split_2_slice_size = 1 - break - else: - do_split_2 = False - - if do_split: - hidden_states = torch.zeros(query.shape, device=query.device, dtype=query.dtype) - for i in range(batch_size_attention // split_slice_size): - start_idx = i * split_slice_size - end_idx = (i + 1) * split_slice_size - if do_split_2: - for i2 in range( - query_tokens // split_2_slice_size - ): # pylint: disable=invalid-name - start_idx_2 = i2 * split_2_slice_size - end_idx_2 = (i2 + 1) * split_2_slice_size - if no_shape_one: - hidden_states[start_idx:end_idx, start_idx_2:end_idx_2] = ( - original_scaled_dot_product_attention( - query[start_idx:end_idx, start_idx_2:end_idx_2], - key[start_idx:end_idx, start_idx_2:end_idx_2], - value[start_idx:end_idx, start_idx_2:end_idx_2], - attn_mask=( - attn_mask[start_idx:end_idx, start_idx_2:end_idx_2] - if attn_mask 
is not None - else attn_mask - ), - dropout_p=dropout_p, - is_causal=is_causal, - ) - ) - else: - hidden_states[:, start_idx:end_idx, start_idx_2:end_idx_2] = ( - original_scaled_dot_product_attention( - query[:, start_idx:end_idx, start_idx_2:end_idx_2], - key[:, start_idx:end_idx, start_idx_2:end_idx_2], - value[:, start_idx:end_idx, start_idx_2:end_idx_2], - attn_mask=( - attn_mask[ - :, start_idx:end_idx, start_idx_2:end_idx_2 - ] - if attn_mask is not None - else attn_mask - ), - dropout_p=dropout_p, - is_causal=is_causal, - ) - ) - else: - if no_shape_one: - hidden_states[start_idx:end_idx] = ( - original_scaled_dot_product_attention( - query[start_idx:end_idx], - key[start_idx:end_idx], - value[start_idx:end_idx], - attn_mask=( - attn_mask[start_idx:end_idx] - if attn_mask is not None - else attn_mask - ), - dropout_p=dropout_p, - is_causal=is_causal, - ) - ) - else: - hidden_states[:, start_idx:end_idx] = ( - original_scaled_dot_product_attention( - query[:, start_idx:end_idx], - key[:, start_idx:end_idx], - value[:, start_idx:end_idx], - attn_mask=( - attn_mask[:, start_idx:end_idx] - if attn_mask is not None - else attn_mask - ), - dropout_p=dropout_p, - is_causal=is_causal, - ) - ) - else: - return original_scaled_dot_product_attention( - query, - key, - value, - attn_mask=attn_mask, - dropout_p=dropout_p, - is_causal=is_causal, - ) - return hidden_states - - -def attention_init(): - # ARC GPUs can't allocate more than 4GB to a single block: - torch.bmm = torch_bmm - torch.nn.functional.scaled_dot_product_attention = scaled_dot_product_attention diff --git a/infer/modules/ipex/gradscaler.py b/infer/modules/ipex/gradscaler.py deleted file mode 100644 index 7875151d17c390aca2f8116293c63b0879b7d4c4..0000000000000000000000000000000000000000 --- a/infer/modules/ipex/gradscaler.py +++ /dev/null @@ -1,187 +0,0 @@ -from collections import defaultdict -import torch -import intel_extension_for_pytorch as ipex # pylint: disable=import-error, unused-import -import intel_extension_for_pytorch._C as core # pylint: disable=import-error, unused-import - -# pylint: disable=protected-access, missing-function-docstring, line-too-long - -OptState = ipex.cpu.autocast._grad_scaler.OptState -_MultiDeviceReplicator = ipex.cpu.autocast._grad_scaler._MultiDeviceReplicator -_refresh_per_optimizer_state = ( - ipex.cpu.autocast._grad_scaler._refresh_per_optimizer_state -) - - -def _unscale_grads_( - self, optimizer, inv_scale, found_inf, allow_fp16 -): # pylint: disable=unused-argument - per_device_inv_scale = _MultiDeviceReplicator(inv_scale) - per_device_found_inf = _MultiDeviceReplicator(found_inf) - - # To set up _amp_foreach_non_finite_check_and_unscale_, split grads by device and dtype. - # There could be hundreds of grads, so we'd like to iterate through them just once. - # However, we don't know their devices or dtypes in advance. - - # https://stackoverflow.com/questions/5029934/defaultdict-of-defaultdict - # Google says mypy struggles with defaultdicts type annotations. 
- per_device_and_dtype_grads = defaultdict(lambda: defaultdict(list)) # type: ignore[var-annotated] - # sync grad to master weight - if hasattr(optimizer, "sync_grad"): - optimizer.sync_grad() - with torch.no_grad(): - for group in optimizer.param_groups: - for param in group["params"]: - if param.grad is None: - continue - if (not allow_fp16) and param.grad.dtype == torch.float16: - raise ValueError("Attempting to unscale FP16 gradients.") - if param.grad.is_sparse: - # is_coalesced() == False means the sparse grad has values with duplicate indices. - # coalesce() deduplicates indices and adds all values that have the same index. - # For scaled fp16 values, there's a good chance coalescing will cause overflow, - # so we should check the coalesced _values(). - if param.grad.dtype is torch.float16: - param.grad = param.grad.coalesce() - to_unscale = param.grad._values() - else: - to_unscale = param.grad - - # -: is there a way to split by device and dtype without appending in the inner loop? - to_unscale = to_unscale.to("cpu") - per_device_and_dtype_grads[to_unscale.device][to_unscale.dtype].append( - to_unscale - ) - - for _, per_dtype_grads in per_device_and_dtype_grads.items(): - for grads in per_dtype_grads.values(): - core._amp_foreach_non_finite_check_and_unscale_( - grads, - per_device_found_inf.get("cpu"), - per_device_inv_scale.get("cpu"), - ) - - return per_device_found_inf._per_device_tensors - - -def unscale_(self, optimizer): - """ - Divides ("unscales") the optimizer's gradient tensors by the scale factor. - :meth:`unscale_` is optional, serving cases where you need to - :ref:`modify or inspect gradients` - between the backward pass(es) and :meth:`step`. - If :meth:`unscale_` is not called explicitly, gradients will be unscaled automatically during :meth:`step`. - Simple example, using :meth:`unscale_` to enable clipping of unscaled gradients:: - ... - scaler.scale(loss).backward() - scaler.unscale_(optimizer) - torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm) - scaler.step(optimizer) - scaler.update() - Args: - optimizer (torch.optim.Optimizer): Optimizer that owns the gradients to be unscaled. - .. warning:: - :meth:`unscale_` should only be called once per optimizer per :meth:`step` call, - and only after all gradients for that optimizer's assigned parameters have been accumulated. - Calling :meth:`unscale_` twice for a given optimizer between each :meth:`step` triggers a RuntimeError. - .. warning:: - :meth:`unscale_` may unscale sparse gradients out of place, replacing the ``.grad`` attribute. - """ - if not self._enabled: - return - - self._check_scale_growth_tracker("unscale_") - - optimizer_state = self._per_optimizer_states[id(optimizer)] - - if optimizer_state["stage"] is OptState.UNSCALED: # pylint: disable=no-else-raise - raise RuntimeError( - "unscale_() has already been called on this optimizer since the last update()." - ) - elif optimizer_state["stage"] is OptState.STEPPED: - raise RuntimeError("unscale_() is being called after step().") - - # FP32 division can be imprecise for certain compile options, so we carry out the reciprocal in FP64. 
- assert self._scale is not None - inv_scale = ( - self._scale.to("cpu").double().reciprocal().float().to(self._scale.device) - ) - found_inf = torch.full((1,), 0.0, dtype=torch.float32, device=self._scale.device) - - optimizer_state["found_inf_per_device"] = self._unscale_grads_( - optimizer, inv_scale, found_inf, False - ) - optimizer_state["stage"] = OptState.UNSCALED - - -def update(self, new_scale=None): - """ - Updates the scale factor. - If any optimizer steps were skipped the scale is multiplied by ``backoff_factor`` - to reduce it. If ``growth_interval`` unskipped iterations occurred consecutively, - the scale is multiplied by ``growth_factor`` to increase it. - Passing ``new_scale`` sets the new scale value manually. (``new_scale`` is not - used directly, it's used to fill GradScaler's internal scale tensor. So if - ``new_scale`` was a tensor, later in-place changes to that tensor will not further - affect the scale GradScaler uses internally.) - Args: - new_scale (float or :class:`torch.FloatTensor`, optional, default=None): New scale factor. - .. warning:: - :meth:`update` should only be called at the end of the iteration, after ``scaler.step(optimizer)`` has - been invoked for all optimizers used this iteration. - """ - if not self._enabled: - return - - _scale, _growth_tracker = self._check_scale_growth_tracker("update") - - if new_scale is not None: - # Accept a new user-defined scale. - if isinstance(new_scale, float): - self._scale.fill_(new_scale) # type: ignore[union-attr] - else: - reason = "new_scale should be a float or a 1-element torch.FloatTensor with requires_grad=False." - assert isinstance(new_scale, torch.FloatTensor), reason # type: ignore[attr-defined] - assert new_scale.numel() == 1, reason - assert new_scale.requires_grad is False, reason - self._scale.copy_(new_scale) # type: ignore[union-attr] - else: - # Consume shared inf/nan data collected from optimizers to update the scale. - # If all found_inf tensors are on the same device as self._scale, this operation is asynchronous. - found_infs = [ - found_inf.to(device="cpu", non_blocking=True) - for state in self._per_optimizer_states.values() - for found_inf in state["found_inf_per_device"].values() - ] - - assert len(found_infs) > 0, "No inf checks were recorded prior to update." - - found_inf_combined = found_infs[0] - if len(found_infs) > 1: - for i in range(1, len(found_infs)): - found_inf_combined += found_infs[i] - - to_device = _scale.device - _scale = _scale.to("cpu") - _growth_tracker = _growth_tracker.to("cpu") - - core._amp_update_scale_( - _scale, - _growth_tracker, - found_inf_combined, - self._growth_factor, - self._backoff_factor, - self._growth_interval, - ) - - _scale = _scale.to(to_device) - _growth_tracker = _growth_tracker.to(to_device) - # To prepare for next iteration, clear the data collected from optimizers this iteration. 
- self._per_optimizer_states = defaultdict(_refresh_per_optimizer_state) - - -def gradscaler_init(): - torch.xpu.amp.GradScaler = ipex.cpu.autocast._grad_scaler.GradScaler - torch.xpu.amp.GradScaler._unscale_grads_ = _unscale_grads_ - torch.xpu.amp.GradScaler.unscale_ = unscale_ - torch.xpu.amp.GradScaler.update = update - return torch.xpu.amp.GradScaler diff --git a/infer/modules/ipex/hijacks.py b/infer/modules/ipex/hijacks.py deleted file mode 100644 index fc75f0c7cbfa41f145db95a05296f0668400e981..0000000000000000000000000000000000000000 --- a/infer/modules/ipex/hijacks.py +++ /dev/null @@ -1,365 +0,0 @@ -import contextlib -import importlib -import torch -import intel_extension_for_pytorch as ipex # pylint: disable=import-error, unused-import - -# pylint: disable=protected-access, missing-function-docstring, line-too-long, unnecessary-lambda, no-else-return - - -class CondFunc: # pylint: disable=missing-class-docstring - def __new__(cls, orig_func, sub_func, cond_func): - self = super(CondFunc, cls).__new__(cls) - if isinstance(orig_func, str): - func_path = orig_func.split(".") - for i in range(len(func_path) - 1, -1, -1): - try: - resolved_obj = importlib.import_module(".".join(func_path[:i])) - break - except ImportError: - pass - for attr_name in func_path[i:-1]: - resolved_obj = getattr(resolved_obj, attr_name) - orig_func = getattr(resolved_obj, func_path[-1]) - setattr( - resolved_obj, - func_path[-1], - lambda *args, **kwargs: self(*args, **kwargs), - ) - self.__init__(orig_func, sub_func, cond_func) - return lambda *args, **kwargs: self(*args, **kwargs) - - def __init__(self, orig_func, sub_func, cond_func): - self.__orig_func = orig_func - self.__sub_func = sub_func - self.__cond_func = cond_func - - def __call__(self, *args, **kwargs): - if not self.__cond_func or self.__cond_func(self.__orig_func, *args, **kwargs): - return self.__sub_func(self.__orig_func, *args, **kwargs) - else: - return self.__orig_func(*args, **kwargs) - - -_utils = torch.utils.data._utils - - -def _shutdown_workers(self): - if ( - torch.utils.data._utils is None - or torch.utils.data._utils.python_exit_status is True - or torch.utils.data._utils.python_exit_status is None - ): - return - if hasattr(self, "_shutdown") and not self._shutdown: - self._shutdown = True - try: - if hasattr(self, "_pin_memory_thread"): - self._pin_memory_thread_done_event.set() - self._worker_result_queue.put((None, None)) - self._pin_memory_thread.join() - self._worker_result_queue.cancel_join_thread() - self._worker_result_queue.close() - self._workers_done_event.set() - for worker_id in range(len(self._workers)): - if self._persistent_workers or self._workers_status[worker_id]: - self._mark_worker_as_unavailable(worker_id, shutdown=True) - for w in self._workers: # pylint: disable=invalid-name - w.join(timeout=torch.utils.data._utils.MP_STATUS_CHECK_INTERVAL) - for q in self._index_queues: # pylint: disable=invalid-name - q.cancel_join_thread() - q.close() - finally: - if self._worker_pids_set: - torch.utils.data._utils.signal_handling._remove_worker_pids(id(self)) - self._worker_pids_set = False - for w in self._workers: # pylint: disable=invalid-name - if w.is_alive(): - w.terminate() - - -class DummyDataParallel( - torch.nn.Module -): # pylint: disable=missing-class-docstring, unused-argument, too-few-public-methods - def __new__( - cls, module, device_ids=None, output_device=None, dim=0 - ): # pylint: disable=unused-argument - if isinstance(device_ids, list) and len(device_ids) > 1: - print("IPEX backend doesn't support 
DataParallel on multiple XPU devices") - return module.to("xpu") - - -def return_null_context(*args, **kwargs): # pylint: disable=unused-argument - return contextlib.nullcontext() - - -def check_device(device): - return bool( - (isinstance(device, torch.device) and device.type == "cuda") - or (isinstance(device, str) and "cuda" in device) - or isinstance(device, int) - ) - - -def return_xpu(device): - return ( - f"xpu:{device[-1]}" - if isinstance(device, str) and ":" in device - else ( - f"xpu:{device}" - if isinstance(device, int) - else torch.device("xpu") if isinstance(device, torch.device) else "xpu" - ) - ) - - -def ipex_no_cuda(orig_func, *args, **kwargs): - torch.cuda.is_available = lambda: False - orig_func(*args, **kwargs) - torch.cuda.is_available = torch.xpu.is_available - - -original_autocast = torch.autocast - - -def ipex_autocast(*args, **kwargs): - if len(args) > 0 and args[0] == "cuda": - return original_autocast("xpu", *args[1:], **kwargs) - else: - return original_autocast(*args, **kwargs) - - -original_torch_cat = torch.cat - - -def torch_cat(tensor, *args, **kwargs): - if len(tensor) == 3 and ( - tensor[0].dtype != tensor[1].dtype or tensor[2].dtype != tensor[1].dtype - ): - return original_torch_cat( - [tensor[0].to(tensor[1].dtype), tensor[1], tensor[2].to(tensor[1].dtype)], - *args, - **kwargs, - ) - else: - return original_torch_cat(tensor, *args, **kwargs) - - -original_interpolate = torch.nn.functional.interpolate - - -def interpolate( - tensor, - size=None, - scale_factor=None, - mode="nearest", - align_corners=None, - recompute_scale_factor=None, - antialias=False, -): # pylint: disable=too-many-arguments - if antialias or align_corners is not None: - return_device = tensor.device - return_dtype = tensor.dtype - return original_interpolate( - tensor.to("cpu", dtype=torch.float32), - size=size, - scale_factor=scale_factor, - mode=mode, - align_corners=align_corners, - recompute_scale_factor=recompute_scale_factor, - antialias=antialias, - ).to(return_device, dtype=return_dtype) - else: - return original_interpolate( - tensor, - size=size, - scale_factor=scale_factor, - mode=mode, - align_corners=align_corners, - recompute_scale_factor=recompute_scale_factor, - antialias=antialias, - ) - - -original_linalg_solve = torch.linalg.solve - - -def linalg_solve(A, B, *args, **kwargs): # pylint: disable=invalid-name - if A.device != torch.device("cpu") or B.device != torch.device("cpu"): - return_device = A.device - return original_linalg_solve(A.to("cpu"), B.to("cpu"), *args, **kwargs).to( - return_device - ) - else: - return original_linalg_solve(A, B, *args, **kwargs) - - -def ipex_hijacks(): - CondFunc( - "torch.Tensor.to", - lambda orig_func, self, device=None, *args, **kwargs: orig_func( - self, return_xpu(device), *args, **kwargs - ), - lambda orig_func, self, device=None, *args, **kwargs: check_device(device), - ) - CondFunc( - "torch.Tensor.cuda", - lambda orig_func, self, device=None, *args, **kwargs: orig_func( - self, return_xpu(device), *args, **kwargs - ), - lambda orig_func, self, device=None, *args, **kwargs: check_device(device), - ) - CondFunc( - "torch.empty", - lambda orig_func, *args, device=None, **kwargs: orig_func( - *args, device=return_xpu(device), **kwargs - ), - lambda orig_func, *args, device=None, **kwargs: check_device(device), - ) - CondFunc( - "torch.load", - lambda orig_func, *args, map_location=None, **kwargs: orig_func( - *args, return_xpu(map_location), **kwargs - ), - lambda orig_func, *args, map_location=None, **kwargs: map_location 
is None - or check_device(map_location), - ) - CondFunc( - "torch.randn", - lambda orig_func, *args, device=None, **kwargs: orig_func( - *args, device=return_xpu(device), **kwargs - ), - lambda orig_func, *args, device=None, **kwargs: check_device(device), - ) - CondFunc( - "torch.ones", - lambda orig_func, *args, device=None, **kwargs: orig_func( - *args, device=return_xpu(device), **kwargs - ), - lambda orig_func, *args, device=None, **kwargs: check_device(device), - ) - CondFunc( - "torch.zeros", - lambda orig_func, *args, device=None, **kwargs: orig_func( - *args, device=return_xpu(device), **kwargs - ), - lambda orig_func, *args, device=None, **kwargs: check_device(device), - ) - CondFunc( - "torch.tensor", - lambda orig_func, *args, device=None, **kwargs: orig_func( - *args, device=return_xpu(device), **kwargs - ), - lambda orig_func, *args, device=None, **kwargs: check_device(device), - ) - CondFunc( - "torch.linspace", - lambda orig_func, *args, device=None, **kwargs: orig_func( - *args, device=return_xpu(device), **kwargs - ), - lambda orig_func, *args, device=None, **kwargs: check_device(device), - ) - - CondFunc( - "torch.Generator", - lambda orig_func, device=None: torch.xpu.Generator(device), - lambda orig_func, device=None: device is not None - and device != torch.device("cpu") - and device != "cpu", - ) - - CondFunc( - "torch.batch_norm", - lambda orig_func, input, weight, bias, *args, **kwargs: orig_func( - input, - ( - weight - if weight is not None - else torch.ones(input.size()[1], device=input.device) - ), - ( - bias - if bias is not None - else torch.zeros(input.size()[1], device=input.device) - ), - *args, - **kwargs, - ), - lambda orig_func, input, *args, **kwargs: input.device != torch.device("cpu"), - ) - CondFunc( - "torch.instance_norm", - lambda orig_func, input, weight, bias, *args, **kwargs: orig_func( - input, - ( - weight - if weight is not None - else torch.ones(input.size()[1], device=input.device) - ), - ( - bias - if bias is not None - else torch.zeros(input.size()[1], device=input.device) - ), - *args, - **kwargs, - ), - lambda orig_func, input, *args, **kwargs: input.device != torch.device("cpu"), - ) - - # Functions with dtype errors: - CondFunc( - "torch.nn.modules.GroupNorm.forward", - lambda orig_func, self, input: orig_func( - self, input.to(self.weight.data.dtype) - ), - lambda orig_func, self, input: input.dtype != self.weight.data.dtype, - ) - CondFunc( - "torch.nn.modules.linear.Linear.forward", - lambda orig_func, self, input: orig_func( - self, input.to(self.weight.data.dtype) - ), - lambda orig_func, self, input: input.dtype != self.weight.data.dtype, - ) - CondFunc( - "torch.nn.modules.conv.Conv2d.forward", - lambda orig_func, self, input: orig_func( - self, input.to(self.weight.data.dtype) - ), - lambda orig_func, self, input: input.dtype != self.weight.data.dtype, - ) - CondFunc( - "torch.nn.functional.layer_norm", - lambda orig_func, input, normalized_shape=None, weight=None, *args, **kwargs: orig_func( - input.to(weight.data.dtype), normalized_shape, weight, *args, **kwargs - ), - lambda orig_func, input, normalized_shape=None, weight=None, *args, **kwargs: weight - is not None - and input.dtype != weight.data.dtype, - ) - - # Diffusers Float64 (ARC GPUs doesn't support double or Float64): - if not torch.xpu.has_fp64_dtype(): - CondFunc( - "torch.from_numpy", - lambda orig_func, ndarray: orig_func(ndarray.astype("float32")), - lambda orig_func, ndarray: ndarray.dtype == float, - ) - - # Broken functions when torch.cuda.is_available is 
True:
-    CondFunc(
-        "torch.utils.data.dataloader._BaseDataLoaderIter.__init__",
-        lambda orig_func, *args, **kwargs: ipex_no_cuda(orig_func, *args, **kwargs),
-        lambda orig_func, *args, **kwargs: True,
-    )
-
-    # Functions that make compile mad with CondFunc:
-    torch.utils.data.dataloader._MultiProcessingDataLoaderIter._shutdown_workers = (
-        _shutdown_workers
-    )
-    torch.nn.DataParallel = DummyDataParallel
-    torch.autocast = ipex_autocast
-    torch.cat = torch_cat
-    torch.linalg.solve = linalg_solve
-    torch.nn.functional.interpolate = interpolate
-    torch.backends.cuda.sdp_kernel = return_null_context
diff --git a/infer/modules/onnx/export.py b/infer/modules/onnx/export.py
deleted file mode 100644
index ed4a4162ff04b7e12642fcbe96847f8ea9db06aa..0000000000000000000000000000000000000000
--- a/infer/modules/onnx/export.py
+++ /dev/null
@@ -1,52 +0,0 @@
-import torch
-
-from infer.lib.infer_pack.models_onnx import SynthesizerTrnMsNSFsidM
-
-
-def export_onnx(ModelPath, ExportedPath):
-    cpt = torch.load(ModelPath, map_location="cpu")
-    cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]
-    vec_channels = 256 if cpt.get("version", "v1") == "v1" else 768
-
-    test_phone = torch.rand(1, 200, vec_channels)  # hidden unit
-    test_phone_lengths = torch.tensor([200]).long()  # hidden unit length (seemingly unused)
-    test_pitch = torch.randint(size=(1, 200), low=5, high=255)  # fundamental frequency (Hz)
-    test_pitchf = torch.rand(1, 200)  # NSF fundamental frequency
-    test_ds = torch.LongTensor([0])  # speaker ID
-    test_rnd = torch.rand(1, 192, 200)  # noise (adds a random factor)
-
-    device = "cpu"  # device used for export (does not affect inference)
-
-    net_g = SynthesizerTrnMsNSFsidM(
-        *cpt["config"], is_half=False, version=cpt.get("version", "v1")
-    )  # fp32 export (fp16 support in C++ would require manually rearranging memory, so fp16 is skipped for now)
-    net_g.load_state_dict(cpt["weight"], strict=False)
-    input_names = ["phone", "phone_lengths", "pitch", "pitchf", "ds", "rnd"]
-    output_names = [
-        "audio",
-    ]
-    # net_g.construct_spkmixmap(n_speaker)  # multi-speaker mix-track export
-    torch.onnx.export(
-        net_g,
-        (
-            test_phone.to(device),
-            test_phone_lengths.to(device),
-            test_pitch.to(device),
-            test_pitchf.to(device),
-            test_ds.to(device),
-            test_rnd.to(device),
-        ),
-        ExportedPath,
-        dynamic_axes={
-            "phone": [1],
-            "pitch": [1],
-            "pitchf": [1],
-            "rnd": [2],
-        },
-        do_constant_folding=False,
-        opset_version=13,
-        verbose=False,
-        input_names=input_names,
-        output_names=output_names,
-    )
-    return "Finished"
diff --git a/infer/modules/train/extract/extract_f0_print.py b/infer/modules/train/extract/extract_f0_print.py
index 9d231e4e86db204704ce894a2b12ebad38665064..2aa7a789ac1818f21fd94299034fdffb447f2fb1 100644
--- a/infer/modules/train/extract/extract_f0_print.py
+++ b/infer/modules/train/extract/extract_f0_print.py
@@ -15,6 +15,7 @@ from infer.lib.audio import load_audio
 logging.getLogger("numba").setLevel(logging.WARNING)

 from multiprocessing import Process
+from model import rmvpe

 exp_dir = sys.argv[1]
 f = open("%s/extract_f0_feature.log" % exp_dir, "a+")
@@ -83,12 +84,7 @@ class FeatureInput(object):
             f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.fs)
         elif f0_method == "rmvpe":
             if hasattr(self, "model_rmvpe") == False:
-                from infer.lib.rmvpe import RMVPE
-
-                print("Loading rmvpe model")
-                self.model_rmvpe = RMVPE(
-                    "assets/rmvpe/rmvpe.pt", is_half=False, device="cpu"
-                )
+                self.model_rmvpe = rmvpe
             f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)
         return f0
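Editor's note (not part of the diff): the hunk above and the extraction hunks that follow replace per-script model loading with imports from a shared `model` module (`from model import rmvpe`, `from model import rmvpe, fp16`, and later `from model import hubert, hubert_cfg, device, fp16`). That module is not shown in this diff, so the sketch below is only a guess at the minimal shape those imports imply, reconstructed from the loading code being deleted here; the device/precision policy and the asset paths are assumptions, not the actual file contents.

# model.py -- hypothetical sketch, reconstructed from the deleted loaders
import torch
from fairseq import checkpoint_utils
from infer.lib.rmvpe import RMVPE

# assumed device/precision policy (the deleted code used cpu/cuda/mps variants)
device = "cuda" if torch.cuda.is_available() else "cpu"
fp16 = device == "cuda"  # assumption: half precision only when CUDA is available

# one RMVPE pitch estimator shared by all extraction scripts
# (each deleted hunk previously constructed its own instance lazily)
rmvpe = RMVPE("assets/rmvpe/rmvpe.pt", is_half=fp16, device=device)

# one shared HuBERT feature extractor, loaded the same way the deleted
# extract_feature_print.py code did
models, hubert_cfg, _task = checkpoint_utils.load_model_ensemble_and_task(
    ["assets/hubert/hubert_base.pt"], suffix=""
)
hubert = models[0].to(device)
if fp16:
    hubert = hubert.half()
hubert.eval()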
diff --git a/infer/modules/train/extract/extract_f0_rmvpe.py b/infer/modules/train/extract/extract_f0_rmvpe.py
index 358bc8cad72c58202ba186d65f6cb925ddd76dc3..114bfd5f57ef7412991dc2236cf302e73a415923 100644
--- a/infer/modules/train/extract/extract_f0_rmvpe.py
+++ b/infer/modules/train/extract/extract_f0_rmvpe.py
@@ -6,17 +6,18 @@ import logging

 import numpy as np

 from infer.lib.audio import load_audio
+from model import rmvpe, fp16

 logging.getLogger("numba").setLevel(logging.WARNING)


 class FeatureInput(object):
-    def __init__(self, exp_dir, samplerate=16000, hop_size=160, is_half=False):
+    def __init__(self, exp_dir, samplerate=16000, hop_size=160):
         self.exp_dir = exp_dir
         self.logfile = open("%s/extract_f0_feature.log" % exp_dir, "a+")
         self.fs = samplerate
         self.hop = hop_size
-        self.is_half = is_half
+        self.is_half = fp16

         self.f0_bin = 256
         self.f0_max = 1100.0
@@ -34,12 +35,7 @@ class FeatureInput(object):
         # p_len = x.shape[0] // self.hop
         if f0_method == "rmvpe":
             if hasattr(self, "model_rmvpe") == False:
-                from infer.lib.rmvpe import RMVPE
-
-                print("Loading rmvpe model")
-                self.model_rmvpe = RMVPE(
-                    "assets/rmvpe/rmvpe.pt", is_half=self.is_half, device="cuda"
-                )
+                self.model_rmvpe = rmvpe
             f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)
         return f0
diff --git a/infer/modules/train/extract/extract_f0_rmvpe_dml.py b/infer/modules/train/extract/extract_f0_rmvpe_dml.py
deleted file mode 100644
index 243e825005bd46dfd464f6d49ecf78f0abf03dc2..0000000000000000000000000000000000000000
--- a/infer/modules/train/extract/extract_f0_rmvpe_dml.py
+++ /dev/null
@@ -1,139 +0,0 @@
-import os
-import sys
-import traceback
-
-import parselmouth
-
-now_dir = os.getcwd()
-sys.path.append(now_dir)
-import logging
-
-import numpy as np
-import pyworld
-
-from infer.lib.audio import load_audio
-
-logging.getLogger("numba").setLevel(logging.WARNING)
-
-exp_dir = sys.argv[1]
-import torch_directml
-
-device = torch_directml.device(torch_directml.default_device())
-f = open("%s/extract_f0_feature.log" % exp_dir, "a+")
-
-
-def printt(strr):
-    print(strr)
-    f.write("%s\n" % strr)
-    f.flush()
-
-
-class FeatureInput(object):
-    def __init__(self, samplerate=16000, hop_size=160):
-        self.fs = samplerate
-        self.hop = hop_size
-
-        self.f0_bin = 256
-        self.f0_max = 1100.0
-        self.f0_min = 50.0
-        self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700)
-        self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700)
-
-    def compute_f0(self, path, f0_method):
-        x = load_audio(path, self.fs)
-        # p_len = x.shape[0] // self.hop
-        if f0_method == "rmvpe":
-            if hasattr(self, "model_rmvpe") == False:
-                from infer.lib.rmvpe import RMVPE
-
-                print("Loading rmvpe model")
-                self.model_rmvpe = RMVPE(
-                    "assets/rmvpe/rmvpe.pt", is_half=False, device=device
-                )
-            f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)
-        return f0
-
-    def coarse_f0(self, f0):
-        f0_mel = 1127 * np.log(1 + f0 / 700)
-        f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * (
-            self.f0_bin - 2
-        ) / (self.f0_mel_max - self.f0_mel_min) + 1
-
-        # use 0 or 1
-        f0_mel[f0_mel <= 1] = 1
-        f0_mel[f0_mel > self.f0_bin - 1] = self.f0_bin - 1
-        f0_coarse = np.rint(f0_mel).astype(int)
-        assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, (
-            f0_coarse.max(),
-            f0_coarse.min(),
-        )
-        return f0_coarse
-
-    def go(self, paths, f0_method):
-        if len(paths) == 0:
-            printt("no-f0-todo")
-        else:
-            printt("todo-f0-%s" % len(paths))
-            n = max(len(paths) // 5, 1)  # print at most 5 progress lines per process
-            for idx, (inp_path, opt_path1, opt_path2) in enumerate(paths):
-                try:
-                    if idx % n == 0:
-                        printt("f0ing,now-%s,all-%s,-%s" % (idx, len(paths), inp_path))
-                    if (
-                        os.path.exists(opt_path1 + ".npy") == True
-                        and os.path.exists(opt_path2 + ".npy") == True
-                    ):
-                        continue
-                    featur_pit = 
self.compute_f0(inp_path, f0_method) - np.save( - opt_path2, - featur_pit, - allow_pickle=False, - ) # nsf - coarse_pit = self.coarse_f0(featur_pit) - np.save( - opt_path1, - coarse_pit, - allow_pickle=False, - ) # ori - except: - printt("f0fail-%s-%s-%s" % (idx, inp_path, traceback.format_exc())) - - -if __name__ == "__main__": - # exp_dir=r"E:\codes\py39\dataset\mi-test" - # n_p=16 - # f = open("%s/log_extract_f0.log"%exp_dir, "w") - printt(" ".join(sys.argv)) - featureInput = FeatureInput() - paths = [] - inp_root = "%s/1_16k_wavs" % (exp_dir) - opt_root1 = "%s/2a_f0" % (exp_dir) - opt_root2 = "%s/2b-f0nsf" % (exp_dir) - - os.makedirs(opt_root1, exist_ok=True) - os.makedirs(opt_root2, exist_ok=True) - for name in sorted(list(os.listdir(inp_root))): - inp_path = "%s/%s" % (inp_root, name) - if "spec" in inp_path: - continue - opt_path1 = "%s/%s" % (opt_root1, name) - opt_path2 = "%s/%s" % (opt_root2, name) - paths.append([inp_path, opt_path1, opt_path2]) - try: - featureInput.go(paths, "rmvpe") - except: - printt("f0_all_fail-%s" % (traceback.format_exc())) - # ps = [] - # for i in range(n_p): - # p = Process( - # target=featureInput.go, - # args=( - # paths[i::n_p], - # f0method, - # ), - # ) - # ps.append(p) - # p.start() - # for i in range(n_p): - # ps[i].join() diff --git a/infer/modules/train/extract_feature_print.py b/infer/modules/train/extract_feature_print.py index 8a21deb6a16212b1b04056a1ca4c3b15974bbdab..c5a5f0374f5f33d0944142674f945e0fa9c0657f 100644 --- a/infer/modules/train/extract_feature_print.py +++ b/infer/modules/train/extract_feature_print.py @@ -5,26 +5,7 @@ import numpy as np import soundfile as sf import torch import torch.nn.functional as F - - -device = "cpu" -if torch.cuda.is_available(): - device = "cuda" -elif torch.backends.mps.is_available(): - device = "mps" - -model_path = "assets/hubert/hubert_base.pt" -models, saved_cfg, task = fairseq.checkpoint_utils.load_model_ensemble_and_task( - [model_path], - suffix="", -) -model = models[0] -model = model.to(device) -is_half = False -if is_half: - if device not in ["mps", "cpu"]: - model = model.half() -model.eval() +from model import hubert, hubert_cfg, device, fp16 as is_half # wave must be 16k, hop_size=320 @@ -71,19 +52,17 @@ class HubertFeatureExtractor: if os.path.exists(out_path): continue - feats = readwave(wav_path, normalize=saved_cfg.task.normalize) + feats = readwave(wav_path, normalize=hubert_cfg.task.normalize) padding_mask = torch.BoolTensor(feats.shape).fill_(False) inputs = { "source": ( - feats.half().to(device) - if is_half and device not in ["mps", "cpu"] - else feats.to(device) + feats.half().to(device) if is_half else feats.to(device) ), "padding_mask": padding_mask.to(device), "output_layer": 12, } with torch.no_grad(): - logits = model.extract_features(**inputs) + logits = hubert.extract_features(**inputs) feats = logits[0] feats = feats.squeeze(0).float().cpu().numpy() diff --git a/infer/modules/train/train.py b/infer/modules/train/train.py index 2b900ec34ecdb246838660822bdee4bc9566dde9..c953150118035b01b5d8866a6fa7b359c65a4779 100644 --- a/infer/modules/train/train.py +++ b/infer/modules/train/train.py @@ -35,13 +35,9 @@ except Exception: torch.backends.cudnn.deterministic = False torch.backends.cudnn.benchmark = False -from time import sleep from time import time as ttime -import torch.distributed as dist -import torch.multiprocessing as mp from torch.nn import functional as F -from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.data import DataLoader from 
torch.utils.tensorboard import SummaryWriter diff --git a/infer/modules/uvr5/mdxnet.py b/infer/modules/uvr5/mdxnet.py deleted file mode 100644 index 2f246db7a7c3186afd60f0b99b8089814331f4ba..0000000000000000000000000000000000000000 --- a/infer/modules/uvr5/mdxnet.py +++ /dev/null @@ -1,256 +0,0 @@ -import os -import logging - -logger = logging.getLogger(__name__) - -import librosa -import numpy as np -import soundfile as sf -import torch -from tqdm import tqdm - -cpu = torch.device("cpu") - - -class ConvTDFNetTrim: - def __init__( - self, device, model_name, target_name, L, dim_f, dim_t, n_fft, hop=1024 - ): - super(ConvTDFNetTrim, self).__init__() - - self.dim_f = dim_f - self.dim_t = 2**dim_t - self.n_fft = n_fft - self.hop = hop - self.n_bins = self.n_fft // 2 + 1 - self.chunk_size = hop * (self.dim_t - 1) - self.window = torch.hann_window(window_length=self.n_fft, periodic=True).to( - device - ) - self.target_name = target_name - self.blender = "blender" in model_name - - self.dim_c = 4 - out_c = self.dim_c * 4 if target_name == "*" else self.dim_c - self.freq_pad = torch.zeros( - [1, out_c, self.n_bins - self.dim_f, self.dim_t] - ).to(device) - - self.n = L // 2 - - def stft(self, x): - x = x.reshape([-1, self.chunk_size]) - x = torch.stft( - x, - n_fft=self.n_fft, - hop_length=self.hop, - window=self.window, - center=True, - return_complex=True, - ) - x = torch.view_as_real(x) - x = x.permute([0, 3, 1, 2]) - x = x.reshape([-1, 2, 2, self.n_bins, self.dim_t]).reshape( - [-1, self.dim_c, self.n_bins, self.dim_t] - ) - return x[:, :, : self.dim_f] - - def istft(self, x, freq_pad=None): - freq_pad = ( - self.freq_pad.repeat([x.shape[0], 1, 1, 1]) - if freq_pad is None - else freq_pad - ) - x = torch.cat([x, freq_pad], -2) - c = 4 * 2 if self.target_name == "*" else 2 - x = x.reshape([-1, c, 2, self.n_bins, self.dim_t]).reshape( - [-1, 2, self.n_bins, self.dim_t] - ) - x = x.permute([0, 2, 3, 1]) - x = x.contiguous() - x = torch.view_as_complex(x) - x = torch.istft( - x, n_fft=self.n_fft, hop_length=self.hop, window=self.window, center=True - ) - return x.reshape([-1, c, self.chunk_size]) - - -def get_models(device, dim_f, dim_t, n_fft): - return ConvTDFNetTrim( - device=device, - model_name="Conv-TDF", - target_name="vocals", - L=11, - dim_f=dim_f, - dim_t=dim_t, - n_fft=n_fft, - ) - - -class Predictor: - def __init__(self, args): - import onnxruntime as ort - - logger.info(ort.get_available_providers()) - self.args = args - self.model_ = get_models( - device=cpu, dim_f=args.dim_f, dim_t=args.dim_t, n_fft=args.n_fft - ) - self.model = ort.InferenceSession( - os.path.join(args.onnx, self.model_.target_name + ".onnx"), - providers=[ - "CUDAExecutionProvider", - "DmlExecutionProvider", - "CPUExecutionProvider", - ], - ) - logger.info("ONNX load done") - - def demix(self, mix): - samples = mix.shape[-1] - margin = self.args.margin - chunk_size = self.args.chunks * 44100 - assert not margin == 0, "margin cannot be zero!" 
- if margin > chunk_size: - margin = chunk_size - - segmented_mix = {} - - if self.args.chunks == 0 or samples < chunk_size: - chunk_size = samples - - counter = -1 - for skip in range(0, samples, chunk_size): - counter += 1 - - s_margin = 0 if counter == 0 else margin - end = min(skip + chunk_size + margin, samples) - - start = skip - s_margin - - segmented_mix[skip] = mix[:, start:end].copy() - if end == samples: - break - - sources = self.demix_base(segmented_mix, margin_size=margin) - """ - mix:(2,big_sample) - segmented_mix:offset->(2,small_sample) - sources:(1,2,big_sample) - """ - return sources - - def demix_base(self, mixes, margin_size): - chunked_sources = [] - progress_bar = tqdm(total=len(mixes)) - progress_bar.set_description("Processing") - for mix in mixes: - cmix = mixes[mix] - sources = [] - n_sample = cmix.shape[1] - model = self.model_ - trim = model.n_fft // 2 - gen_size = model.chunk_size - 2 * trim - pad = gen_size - n_sample % gen_size - mix_p = np.concatenate( - (np.zeros((2, trim)), cmix, np.zeros((2, pad)), np.zeros((2, trim))), 1 - ) - mix_waves = [] - i = 0 - while i < n_sample + pad: - waves = np.array(mix_p[:, i : i + model.chunk_size]) - mix_waves.append(waves) - i += gen_size - mix_waves = torch.tensor(mix_waves, dtype=torch.float32).to(cpu) - with torch.no_grad(): - _ort = self.model - spek = model.stft(mix_waves) - if self.args.denoise: - spec_pred = ( - -_ort.run(None, {"input": -spek.cpu().numpy()})[0] * 0.5 - + _ort.run(None, {"input": spek.cpu().numpy()})[0] * 0.5 - ) - tar_waves = model.istft(torch.tensor(spec_pred)) - else: - tar_waves = model.istft( - torch.tensor(_ort.run(None, {"input": spek.cpu().numpy()})[0]) - ) - tar_signal = ( - tar_waves[:, :, trim:-trim] - .transpose(0, 1) - .reshape(2, -1) - .numpy()[:, :-pad] - ) - - start = 0 if mix == 0 else margin_size - end = None if mix == list(mixes.keys())[::-1][0] else -margin_size - if margin_size == 0: - end = None - sources.append(tar_signal[:, start:end]) - - progress_bar.update(1) - - chunked_sources.append(sources) - _sources = np.concatenate(chunked_sources, axis=-1) - # del self.model - progress_bar.close() - return _sources - - def prediction(self, m, vocal_root, others_root, format): - os.makedirs(vocal_root, exist_ok=True) - os.makedirs(others_root, exist_ok=True) - basename = os.path.basename(m) - mix, rate = librosa.load(m, mono=False, sr=44100) - if mix.ndim == 1: - mix = np.asfortranarray([mix, mix]) - mix = mix.T - sources = self.demix(mix.T) - opt = sources[0].T - if format in ["wav", "flac"]: - sf.write( - "%s/%s_main_vocal.%s" % (vocal_root, basename, format), mix - opt, rate - ) - sf.write("%s/%s_others.%s" % (others_root, basename, format), opt, rate) - else: - path_vocal = "%s/%s_main_vocal.wav" % (vocal_root, basename) - path_other = "%s/%s_others.wav" % (others_root, basename) - sf.write(path_vocal, mix - opt, rate) - sf.write(path_other, opt, rate) - opt_path_vocal = path_vocal[:-4] + ".%s" % format - opt_path_other = path_other[:-4] + ".%s" % format - if os.path.exists(path_vocal): - os.system( - "ffmpeg -i %s -vn %s -q:a 2 -y" % (path_vocal, opt_path_vocal) - ) - if os.path.exists(opt_path_vocal): - try: - os.remove(path_vocal) - except: - pass - if os.path.exists(path_other): - os.system( - "ffmpeg -i %s -vn %s -q:a 2 -y" % (path_other, opt_path_other) - ) - if os.path.exists(opt_path_other): - try: - os.remove(path_other) - except: - pass - - -class MDXNetDereverb: - def __init__(self, chunks, device): - self.onnx = "assets/uvr5_weights/onnx_dereverb_By_FoxJoy" - 
self.shifts = 10 # 'Predict with randomised equivariant stabilisation' - self.mixing = "min_mag" # ['default','min_mag','max_mag'] - self.chunks = chunks - self.margin = 44100 - self.dim_t = 9 - self.dim_f = 3072 - self.n_fft = 6144 - self.denoise = True - self.pred = Predictor(self) - self.device = device - - def _path_audio_(self, input, vocal_root, others_root, format, is_hp3=False): - self.pred.prediction(input, vocal_root, others_root, format) diff --git a/infer/modules/uvr5/modules.py b/infer/modules/uvr5/modules.py deleted file mode 100644 index bce3cef4eb83797e9ea196a7c6252abebd106a20..0000000000000000000000000000000000000000 --- a/infer/modules/uvr5/modules.py +++ /dev/null @@ -1,108 +0,0 @@ -import os -import traceback -import logging - -logger = logging.getLogger(__name__) - -import ffmpeg -import torch - -from configs.config import Config -from infer.modules.uvr5.mdxnet import MDXNetDereverb -from infer.modules.uvr5.vr import AudioPre, AudioPreDeEcho - -config = Config() - - -def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format0): - infos = [] - try: - inp_root = inp_root.strip(" ").strip('"').strip("\n").strip('"').strip(" ") - save_root_vocal = ( - save_root_vocal.strip(" ").strip('"').strip("\n").strip('"').strip(" ") - ) - save_root_ins = ( - save_root_ins.strip(" ").strip('"').strip("\n").strip('"').strip(" ") - ) - if model_name == "onnx_dereverb_By_FoxJoy": - pre_fun = MDXNetDereverb(15, config.device) - else: - func = AudioPre if "DeEcho" not in model_name else AudioPreDeEcho - pre_fun = func( - agg=int(agg), - model_path=os.path.join( - os.getenv("weight_uvr5_root"), model_name + ".pth" - ), - device=config.device, - is_half=config.is_half, - ) - is_hp3 = "HP3" in model_name - if inp_root != "": - paths = [os.path.join(inp_root, name) for name in os.listdir(inp_root)] - else: - paths = [path.name for path in paths] - for path in paths: - inp_path = os.path.join(inp_root, path) - need_reformat = 1 - done = 0 - try: - info = ffmpeg.probe(inp_path, cmd="ffprobe") - if ( - info["streams"][0]["channels"] == 2 - and info["streams"][0]["sample_rate"] == "44100" - ): - need_reformat = 0 - pre_fun._path_audio_( - inp_path, save_root_ins, save_root_vocal, format0, is_hp3=is_hp3 - ) - done = 1 - except: - need_reformat = 1 - traceback.print_exc() - if need_reformat == 1: - tmp_path = "%s/%s.reformatted.wav" % ( - os.path.join(os.environ["TEMP"]), - os.path.basename(inp_path), - ) - os.system( - "ffmpeg -i %s -vn -acodec pcm_s16le -ac 2 -ar 44100 %s -y" - % (inp_path, tmp_path) - ) - inp_path = tmp_path - try: - if done == 0: - pre_fun._path_audio_( - inp_path, save_root_ins, save_root_vocal, format0 - ) - infos.append("%s->Success" % (os.path.basename(inp_path))) - yield "\n".join(infos) - except: - try: - if done == 0: - pre_fun._path_audio_( - inp_path, save_root_ins, save_root_vocal, format0 - ) - infos.append("%s->Success" % (os.path.basename(inp_path))) - yield "\n".join(infos) - except: - infos.append( - "%s->%s" % (os.path.basename(inp_path), traceback.format_exc()) - ) - yield "\n".join(infos) - except: - infos.append(traceback.format_exc()) - yield "\n".join(infos) - finally: - try: - if model_name == "onnx_dereverb_By_FoxJoy": - del pre_fun.pred.model - del pre_fun.pred.model_ - else: - del pre_fun.model - del pre_fun - except: - traceback.print_exc() - if torch.cuda.is_available(): - torch.cuda.empty_cache() - logger.info("Executed torch.cuda.empty_cache()") - yield "\n".join(infos) diff --git a/infer/modules/uvr5/vr.py 
b/infer/modules/uvr5/vr.py deleted file mode 100644 index ed5778438a799c98b138dfa35d0a7f81911c3855..0000000000000000000000000000000000000000 --- a/infer/modules/uvr5/vr.py +++ /dev/null @@ -1,368 +0,0 @@ -import os -import logging - -logger = logging.getLogger(__name__) - -import librosa -import numpy as np -import soundfile as sf -import torch - -from infer.lib.uvr5_pack.lib_v5 import nets_61968KB as Nets -from infer.lib.uvr5_pack.lib_v5 import spec_utils -from infer.lib.uvr5_pack.lib_v5.model_param_init import ModelParameters -from infer.lib.uvr5_pack.lib_v5.nets_new import CascadedNet -from infer.lib.uvr5_pack.utils import inference - - -class AudioPre: - def __init__(self, agg, model_path, device, is_half, tta=False): - self.model_path = model_path - self.device = device - self.data = { - # Processing Options - "postprocess": False, - "tta": tta, - # Constants - "window_size": 512, - "agg": agg, - "high_end_process": "mirroring", - } - mp = ModelParameters("infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2.json") - model = Nets.CascadedASPPNet(mp.param["bins"] * 2) - cpk = torch.load(model_path, map_location="cpu") - model.load_state_dict(cpk) - model.eval() - if is_half: - model = model.half().to(device) - else: - model = model.to(device) - - self.mp = mp - self.model = model - - def _path_audio_( - self, music_file, ins_root=None, vocal_root=None, format="flac", is_hp3=False - ): - if ins_root is None and vocal_root is None: - return "No save root." - name = os.path.basename(music_file) - if ins_root is not None: - os.makedirs(ins_root, exist_ok=True) - if vocal_root is not None: - os.makedirs(vocal_root, exist_ok=True) - X_wave, y_wave, X_spec_s, y_spec_s = {}, {}, {}, {} - bands_n = len(self.mp.param["band"]) - # print(bands_n) - for d in range(bands_n, 0, -1): - bp = self.mp.param["band"][d] - if d == bands_n: # high-end band - ( - X_wave[d], - _, - ) = librosa.core.load( # 理论上librosa读取可能对某些音频有bug,应该上ffmpeg读取,但是太麻烦了弃坑 - music_file, - bp["sr"], - False, - dtype=np.float32, - res_type=bp["res_type"], - ) - if X_wave[d].ndim == 1: - X_wave[d] = np.asfortranarray([X_wave[d], X_wave[d]]) - else: # lower bands - X_wave[d] = librosa.core.resample( - X_wave[d + 1], - self.mp.param["band"][d + 1]["sr"], - bp["sr"], - res_type=bp["res_type"], - ) - # Stft of wave source - X_spec_s[d] = spec_utils.wave_to_spectrogram_mt( - X_wave[d], - bp["hl"], - bp["n_fft"], - self.mp.param["mid_side"], - self.mp.param["mid_side_b2"], - self.mp.param["reverse"], - ) - # pdb.set_trace() - if d == bands_n and self.data["high_end_process"] != "none": - input_high_end_h = (bp["n_fft"] // 2 - bp["crop_stop"]) + ( - self.mp.param["pre_filter_stop"] - self.mp.param["pre_filter_start"] - ) - input_high_end = X_spec_s[d][ - :, bp["n_fft"] // 2 - input_high_end_h : bp["n_fft"] // 2, : - ] - - X_spec_m = spec_utils.combine_spectrograms(X_spec_s, self.mp) - aggresive_set = float(self.data["agg"] / 100) - aggressiveness = { - "value": aggresive_set, - "split_bin": self.mp.param["band"][1]["crop_stop"], - } - with torch.no_grad(): - pred, X_mag, X_phase = inference( - X_spec_m, self.device, self.model, aggressiveness, self.data - ) - # Postprocess - if self.data["postprocess"]: - pred_inv = np.clip(X_mag - pred, 0, np.inf) - pred = spec_utils.mask_silence(pred, pred_inv) - y_spec_m = pred * X_phase - v_spec_m = X_spec_m - y_spec_m - - if ins_root is not None: - if self.data["high_end_process"].startswith("mirroring"): - input_high_end_ = spec_utils.mirroring( - self.data["high_end_process"], y_spec_m, input_high_end, 
self.mp - ) - wav_instrument = spec_utils.cmb_spectrogram_to_wave( - y_spec_m, self.mp, input_high_end_h, input_high_end_ - ) - else: - wav_instrument = spec_utils.cmb_spectrogram_to_wave(y_spec_m, self.mp) - logger.info("%s instruments done" % name) - if is_hp3 == True: - head = "vocal_" - else: - head = "instrument_" - if format in ["wav", "flac"]: - sf.write( - os.path.join( - ins_root, - head + "{}_{}.{}".format(name, self.data["agg"], format), - ), - (np.array(wav_instrument) * 32768).astype("int16"), - self.mp.param["sr"], - ) # - else: - path = os.path.join( - ins_root, head + "{}_{}.wav".format(name, self.data["agg"]) - ) - sf.write( - path, - (np.array(wav_instrument) * 32768).astype("int16"), - self.mp.param["sr"], - ) - if os.path.exists(path): - opt_format_path = path[:-4] + ".%s" % format - os.system("ffmpeg -i %s -vn %s -q:a 2 -y" % (path, opt_format_path)) - if os.path.exists(opt_format_path): - try: - os.remove(path) - except: - pass - if vocal_root is not None: - if is_hp3 == True: - head = "instrument_" - else: - head = "vocal_" - if self.data["high_end_process"].startswith("mirroring"): - input_high_end_ = spec_utils.mirroring( - self.data["high_end_process"], v_spec_m, input_high_end, self.mp - ) - wav_vocals = spec_utils.cmb_spectrogram_to_wave( - v_spec_m, self.mp, input_high_end_h, input_high_end_ - ) - else: - wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp) - logger.info("%s vocals done" % name) - if format in ["wav", "flac"]: - sf.write( - os.path.join( - vocal_root, - head + "{}_{}.{}".format(name, self.data["agg"], format), - ), - (np.array(wav_vocals) * 32768).astype("int16"), - self.mp.param["sr"], - ) - else: - path = os.path.join( - vocal_root, head + "{}_{}.wav".format(name, self.data["agg"]) - ) - sf.write( - path, - (np.array(wav_vocals) * 32768).astype("int16"), - self.mp.param["sr"], - ) - if os.path.exists(path): - opt_format_path = path[:-4] + ".%s" % format - os.system("ffmpeg -i %s -vn %s -q:a 2 -y" % (path, opt_format_path)) - if os.path.exists(opt_format_path): - try: - os.remove(path) - except: - pass - - -class AudioPreDeEcho: - def __init__(self, agg, model_path, device, is_half, tta=False): - self.model_path = model_path - self.device = device - self.data = { - # Processing Options - "postprocess": False, - "tta": tta, - # Constants - "window_size": 512, - "agg": agg, - "high_end_process": "mirroring", - } - mp = ModelParameters("infer/lib/uvr5_pack/lib_v5/modelparams/4band_v3.json") - nout = 64 if "DeReverb" in model_path else 48 - model = CascadedNet(mp.param["bins"] * 2, nout) - cpk = torch.load(model_path, map_location="cpu") - model.load_state_dict(cpk) - model.eval() - if is_half: - model = model.half().to(device) - else: - model = model.to(device) - - self.mp = mp - self.model = model - - def _path_audio_( - self, music_file, vocal_root=None, ins_root=None, format="flac", is_hp3=False - ): # 3个VR模型vocal和ins是反的 - if ins_root is None and vocal_root is None: - return "No save root." 
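# NOTE: the band loop below loads the audio once at the top band's
# sample rate, derives each lower band by resampling the band above
# it, and merges the per-band STFTs via spec_utils.combine_spectrograms
# before inference; with high_end_process="mirroring", the untouched
# input high end is spliced back into the reconstructed waveform.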
- name = os.path.basename(music_file) - if ins_root is not None: - os.makedirs(ins_root, exist_ok=True) - if vocal_root is not None: - os.makedirs(vocal_root, exist_ok=True) - X_wave, y_wave, X_spec_s, y_spec_s = {}, {}, {}, {} - bands_n = len(self.mp.param["band"]) - # print(bands_n) - for d in range(bands_n, 0, -1): - bp = self.mp.param["band"][d] - if d == bands_n: # high-end band - ( - X_wave[d], - _, - ) = librosa.core.load( # 理论上librosa读取可能对某些音频有bug,应该上ffmpeg读取,但是太麻烦了弃坑 - music_file, - bp["sr"], - False, - dtype=np.float32, - res_type=bp["res_type"], - ) - if X_wave[d].ndim == 1: - X_wave[d] = np.asfortranarray([X_wave[d], X_wave[d]]) - else: # lower bands - X_wave[d] = librosa.core.resample( - X_wave[d + 1], - self.mp.param["band"][d + 1]["sr"], - bp["sr"], - res_type=bp["res_type"], - ) - # Stft of wave source - X_spec_s[d] = spec_utils.wave_to_spectrogram_mt( - X_wave[d], - bp["hl"], - bp["n_fft"], - self.mp.param["mid_side"], - self.mp.param["mid_side_b2"], - self.mp.param["reverse"], - ) - # pdb.set_trace() - if d == bands_n and self.data["high_end_process"] != "none": - input_high_end_h = (bp["n_fft"] // 2 - bp["crop_stop"]) + ( - self.mp.param["pre_filter_stop"] - self.mp.param["pre_filter_start"] - ) - input_high_end = X_spec_s[d][ - :, bp["n_fft"] // 2 - input_high_end_h : bp["n_fft"] // 2, : - ] - - X_spec_m = spec_utils.combine_spectrograms(X_spec_s, self.mp) - aggresive_set = float(self.data["agg"] / 100) - aggressiveness = { - "value": aggresive_set, - "split_bin": self.mp.param["band"][1]["crop_stop"], - } - with torch.no_grad(): - pred, X_mag, X_phase = inference( - X_spec_m, self.device, self.model, aggressiveness, self.data - ) - # Postprocess - if self.data["postprocess"]: - pred_inv = np.clip(X_mag - pred, 0, np.inf) - pred = spec_utils.mask_silence(pred, pred_inv) - y_spec_m = pred * X_phase - v_spec_m = X_spec_m - y_spec_m - - if ins_root is not None: - if self.data["high_end_process"].startswith("mirroring"): - input_high_end_ = spec_utils.mirroring( - self.data["high_end_process"], y_spec_m, input_high_end, self.mp - ) - wav_instrument = spec_utils.cmb_spectrogram_to_wave( - y_spec_m, self.mp, input_high_end_h, input_high_end_ - ) - else: - wav_instrument = spec_utils.cmb_spectrogram_to_wave(y_spec_m, self.mp) - logger.info("%s instruments done" % name) - if format in ["wav", "flac"]: - sf.write( - os.path.join( - ins_root, - "vocal_{}_{}.{}".format(name, self.data["agg"], format), - ), - (np.array(wav_instrument) * 32768).astype("int16"), - self.mp.param["sr"], - ) # - else: - path = os.path.join( - ins_root, "vocal_{}_{}.wav".format(name, self.data["agg"]) - ) - sf.write( - path, - (np.array(wav_instrument) * 32768).astype("int16"), - self.mp.param["sr"], - ) - if os.path.exists(path): - opt_format_path = path[:-4] + ".%s" % format - os.system("ffmpeg -i %s -vn %s -q:a 2 -y" % (path, opt_format_path)) - if os.path.exists(opt_format_path): - try: - os.remove(path) - except: - pass - if vocal_root is not None: - if self.data["high_end_process"].startswith("mirroring"): - input_high_end_ = spec_utils.mirroring( - self.data["high_end_process"], v_spec_m, input_high_end, self.mp - ) - wav_vocals = spec_utils.cmb_spectrogram_to_wave( - v_spec_m, self.mp, input_high_end_h, input_high_end_ - ) - else: - wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp) - logger.info("%s vocals done" % name) - if format in ["wav", "flac"]: - sf.write( - os.path.join( - vocal_root, - "instrument_{}_{}.{}".format(name, self.data["agg"], format), - ), - 
(np.array(wav_vocals) * 32768).astype("int16"), - self.mp.param["sr"], - ) - else: - path = os.path.join( - vocal_root, "instrument_{}_{}.wav".format(name, self.data["agg"]) - ) - sf.write( - path, - (np.array(wav_vocals) * 32768).astype("int16"), - self.mp.param["sr"], - ) - if os.path.exists(path): - opt_format_path = path[:-4] + ".%s" % format - os.system("ffmpeg -i %s -vn %s -q:a 2 -y" % (path, opt_format_path)) - if os.path.exists(opt_format_path): - try: - os.remove(path) - except: - pass diff --git a/infer/modules/vc/__init__.py b/infer/modules/vc/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/infer/modules/vc/modules.py b/infer/modules/vc/modules.py new file mode 100644 index 0000000000000000000000000000000000000000..7706537dca0d0c94167e3a8da7a96b9bf18d95b9 --- /dev/null +++ b/infer/modules/vc/modules.py @@ -0,0 +1,305 @@ +import traceback +import logging + +logger = logging.getLogger(__name__) + +import numpy as np +import soundfile as sf +import torch +from io import BytesIO + +from infer.lib.audio import load_audio, wav2 +from infer.lib.infer_pack.models import ( + SynthesizerTrnMs256NSFsid, + SynthesizerTrnMs256NSFsid_nono, + SynthesizerTrnMs768NSFsid, + SynthesizerTrnMs768NSFsid_nono, +) +from infer.modules.vc.pipeline import Pipeline +from infer.modules.vc.utils import * +from model import hubert + + +class VC: + def __init__(self, config): + self.n_spk = None + self.tgt_sr = None + self.net_g = None + self.pipeline = None + self.cpt = None + self.version = None + self.if_f0 = None + self.version = None + self.hubert_model = None + + self.config = config + + def get_vc(self, sid, *to_return_protect): + logger.info("Get sid: " + sid) + + to_return_protect0 = { + "visible": self.if_f0 != 0, + "value": ( + to_return_protect[0] if self.if_f0 != 0 and to_return_protect else 0.5 + ), + "__type__": "update", + } + to_return_protect1 = { + "visible": self.if_f0 != 0, + "value": ( + to_return_protect[1] if self.if_f0 != 0 and to_return_protect else 0.33 + ), + "__type__": "update", + } + + if sid == "" or sid == []: + if ( + self.hubert_model is not None + ): # 考虑到轮询, 需要加个判断看是否 sid 是由有模型切换到无模型的 + logger.info("Clean model cache") + del (self.net_g, self.n_spk, self.hubert_model, self.tgt_sr) # ,cpt + self.hubert_model = self.net_g = self.n_spk = self.hubert_model = ( + self.tgt_sr + ) = None + if torch.cuda.is_available(): + torch.cuda.empty_cache() + ###楼下不这么折腾清理不干净 + self.if_f0 = self.cpt.get("f0", 1) + self.version = self.cpt.get("version", "v1") + if self.version == "v1": + if self.if_f0 == 1: + self.net_g = SynthesizerTrnMs256NSFsid( + *self.cpt["config"], is_half=self.config.is_half + ) + else: + self.net_g = SynthesizerTrnMs256NSFsid_nono(*self.cpt["config"]) + elif self.version == "v2": + if self.if_f0 == 1: + self.net_g = SynthesizerTrnMs768NSFsid( + *self.cpt["config"], is_half=self.config.is_half + ) + else: + self.net_g = SynthesizerTrnMs768NSFsid_nono(*self.cpt["config"]) + del self.net_g, self.cpt + if torch.cuda.is_available(): + torch.cuda.empty_cache() + return ( + {"visible": False, "__type__": "update"}, + { + "visible": True, + "value": to_return_protect0, + "__type__": "update", + }, + { + "visible": True, + "value": to_return_protect1, + "__type__": "update", + }, + "", + "", + ) + person = sid + logger.info(f"Loading: {person}") + + self.cpt = torch.load(person, map_location="cpu") + self.tgt_sr = self.cpt["config"][-1] + self.cpt["config"][-3] = 
self.cpt["weight"]["emb_g.weight"].shape[0] # n_spk + self.if_f0 = self.cpt.get("f0", 1) + self.version = self.cpt.get("version", "v1") + + synthesizer_class = { + ("v1", 1): SynthesizerTrnMs256NSFsid, + ("v1", 0): SynthesizerTrnMs256NSFsid_nono, + ("v2", 1): SynthesizerTrnMs768NSFsid, + ("v2", 0): SynthesizerTrnMs768NSFsid_nono, + } + + self.net_g = synthesizer_class.get( + (self.version, self.if_f0), SynthesizerTrnMs256NSFsid + )(*self.cpt["config"], is_half=self.config.is_half) + + del self.net_g.enc_q + + self.net_g.load_state_dict(self.cpt["weight"], strict=False) + self.net_g.eval().to(self.config.device) + if self.config.is_half: + self.net_g = self.net_g.half() + else: + self.net_g = self.net_g.float() + + self.pipeline = Pipeline(self.tgt_sr, self.config) + n_spk = self.cpt["config"][-3] + index = {"value": get_index_path_from_model(sid), "__type__": "update"} + logger.info("Select index: " + index["value"]) + + return ( + ( + {"visible": True, "maximum": n_spk, "__type__": "update"}, + to_return_protect0, + to_return_protect1, + index, + index, + ) + if to_return_protect + else {"visible": True, "maximum": n_spk, "__type__": "update"} + ) + + def vc_single( + self, + sid, + input_audio_path, + f0_up_key, + f0_file, + f0_method, + file_index, + file_index2, + index_rate, + filter_radius, + resample_sr, + rms_mix_rate, + protect, + ): + if input_audio_path is None: + return "You need to upload an audio", None + f0_up_key = int(f0_up_key) + try: + audio = load_audio(input_audio_path, 16000) + audio_max = np.abs(audio).max() / 0.95 + if audio_max > 1: + audio /= audio_max + times = [0, 0, 0] + + if self.hubert_model is None: + self.hubert_model = hubert + + if file_index: + file_index = ( + file_index.strip(" ") + .strip('"') + .strip("\n") + .strip('"') + .strip(" ") + .replace("trained", "added") + ) + elif file_index2: + file_index = file_index2 + else: + file_index = "" # 防止小白写错,自动帮他替换掉 + + audio_opt = self.pipeline.pipeline( + self.hubert_model, + self.net_g, + sid, + audio, + input_audio_path, + times, + f0_up_key, + f0_method, + file_index, + index_rate, + self.if_f0, + filter_radius, + self.tgt_sr, + resample_sr, + rms_mix_rate, + self.version, + protect, + f0_file, + ) + if self.tgt_sr != resample_sr >= 16000: + tgt_sr = resample_sr + else: + tgt_sr = self.tgt_sr + index_info = ( + "Index:\n%s." % file_index + if os.path.exists(file_index) + else "Index not used." + ) + return ( + "Success.\n%s\nTime:\nnpy: %.2fs, f0: %.2fs, infer: %.2fs." 
+ % (index_info, *times), + (tgt_sr, audio_opt), + ) + except: + info = traceback.format_exc() + logger.warning(info) + return info, (None, None) + + def vc_multi( + self, + sid, + dir_path, + opt_root, + paths, + f0_up_key, + f0_method, + file_index, + file_index2, + index_rate, + filter_radius, + resample_sr, + rms_mix_rate, + protect, + format1, + ): + try: + dir_path = ( + dir_path.strip(" ").strip('"').strip("\n").strip('"').strip(" ") + ) # 防止小白拷路径头尾带了空格和"和回车 + opt_root = opt_root.strip(" ").strip('"').strip("\n").strip('"').strip(" ") + os.makedirs(opt_root, exist_ok=True) + try: + if dir_path != "": + paths = [ + os.path.join(dir_path, name) for name in os.listdir(dir_path) + ] + else: + paths = [path.name for path in paths] + except: + traceback.print_exc() + paths = [path.name for path in paths] + infos = [] + for path in paths: + info, opt = self.vc_single( + sid, + path, + f0_up_key, + None, + f0_method, + file_index, + file_index2, + # file_big_npy, + index_rate, + filter_radius, + resample_sr, + rms_mix_rate, + protect, + ) + if "Success" in info: + try: + tgt_sr, audio_opt = opt + if format1 in ["wav", "flac"]: + sf.write( + "%s/%s.%s" + % (opt_root, os.path.basename(path), format1), + audio_opt, + tgt_sr, + ) + else: + path = "%s/%s.%s" % ( + opt_root, + os.path.basename(path), + format1, + ) + with BytesIO() as wavf: + sf.write(wavf, audio_opt, tgt_sr, format="wav") + wavf.seek(0, 0) + with open(path, "wb") as outf: + wav2(wavf, outf, format1) + except: + info += traceback.format_exc() + infos.append("%s->%s" % (os.path.basename(path), info)) + yield "\n".join(infos) + yield "\n".join(infos) + except: + yield traceback.format_exc() diff --git a/infer/modules/vc/pipeline.py b/infer/modules/vc/pipeline.py new file mode 100644 index 0000000000000000000000000000000000000000..15037e6103c1d8b2cbfa1668b5ffa81f98f12302 --- /dev/null +++ b/infer/modules/vc/pipeline.py @@ -0,0 +1,449 @@ +import os +import sys +import traceback +import logging + +logger = logging.getLogger(__name__) + +from functools import lru_cache +from time import time as ttime + +import faiss +import librosa +import numpy as np +import parselmouth +import pyworld +import torch +import torch.nn.functional as F +import torchcrepe +from scipy import signal +from model import rmvpe, device, fp16 + +now_dir = os.getcwd() +sys.path.append(now_dir) + +bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000) + +input_audio_path2wav = {} + + +@lru_cache +def cache_harvest_f0(input_audio_path, fs, f0max, f0min, frame_period): + audio = input_audio_path2wav[input_audio_path] + f0, t = pyworld.harvest( + audio, + fs=fs, + f0_ceil=f0max, + f0_floor=f0min, + frame_period=frame_period, + ) + f0 = pyworld.stonemask(audio, f0, t, fs) + return f0 + + +def change_rms(data1, sr1, data2, sr2, rate): # 1是输入音频,2是输出音频,rate是2的占比 + # print(data1.max(),data2.max()) + rms1 = librosa.feature.rms( + y=data1, frame_length=sr1 // 2 * 2, hop_length=sr1 // 2 + ) # 每半秒一个点 + rms2 = librosa.feature.rms(y=data2, frame_length=sr2 // 2 * 2, hop_length=sr2 // 2) + rms1 = torch.from_numpy(rms1) + rms1 = F.interpolate( + rms1.unsqueeze(0), size=data2.shape[0], mode="linear" + ).squeeze() + rms2 = torch.from_numpy(rms2) + rms2 = F.interpolate( + rms2.unsqueeze(0), size=data2.shape[0], mode="linear" + ).squeeze() + rms2 = torch.max(rms2, torch.zeros_like(rms2) + 1e-6) + data2 *= ( + torch.pow(rms1, torch.tensor(1 - rate)) + * torch.pow(rms2, torch.tensor(rate - 1)) + ).numpy() + return data2 + + +class Pipeline(object): + def __init__(self, tgt_sr, 
config): + self.x_pad, self.x_query, self.x_center, self.x_max = ( + config.x_pad, + config.x_query, + config.x_center, + config.x_max, + ) + self.is_half = fp16 + self.sr = 16000 # hubert输入采样率 + self.window = 160 # 每帧点数 + self.t_pad = self.sr * self.x_pad # 每条前后pad时间 + self.t_pad_tgt = tgt_sr * self.x_pad + self.t_pad2 = self.t_pad * 2 + self.t_query = self.sr * self.x_query # 查询切点前后查询时间 + self.t_center = self.sr * self.x_center # 查询切点位置 + self.t_max = self.sr * self.x_max # 免查询时长阈值 + self.device = device + + def get_f0( + self, + input_audio_path, + x, + p_len, + f0_up_key, + f0_method, + filter_radius, + inp_f0=None, + ): + global input_audio_path2wav + time_step = self.window / self.sr * 1000 + f0_min = 50 + f0_max = 1100 + f0_mel_min = 1127 * np.log(1 + f0_min / 700) + f0_mel_max = 1127 * np.log(1 + f0_max / 700) + if f0_method == "pm": + f0 = ( + parselmouth.Sound(x, self.sr) + .to_pitch_ac( + time_step=time_step / 1000, + voicing_threshold=0.6, + pitch_floor=f0_min, + pitch_ceiling=f0_max, + ) + .selected_array["frequency"] + ) + pad_size = (p_len - len(f0) + 1) // 2 + if pad_size > 0 or p_len - len(f0) - pad_size > 0: + f0 = np.pad( + f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant" + ) + elif f0_method == "harvest": + input_audio_path2wav[input_audio_path] = x.astype(np.double) + f0 = cache_harvest_f0(input_audio_path, self.sr, f0_max, f0_min, 10) + if filter_radius > 2: + f0 = signal.medfilt(f0, 3) + elif f0_method == "crepe": + model = "full" + # Pick a batch size that doesn't cause memory errors on your gpu + batch_size = 512 + # Compute pitch using first gpu + audio = torch.tensor(np.copy(x))[None].float() + f0, pd = torchcrepe.predict( + audio, + self.sr, + self.window, + f0_min, + f0_max, + model, + batch_size=batch_size, + device=self.device, + return_periodicity=True, + ) + pd = torchcrepe.filter.median(pd, 3) + f0 = torchcrepe.filter.mean(f0, 3) + f0[pd < 0.1] = 0 + f0 = f0[0].cpu().numpy() + elif f0_method == "rmvpe": + if not hasattr(self, "model_rmvpe"): + self.model_rmvpe = rmvpe + f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03) + + if "privateuseone" in str(self.device): # clean ortruntime memory + del self.model_rmvpe.model + del self.model_rmvpe + logger.info("Cleaning ortruntime memory") + + f0 *= pow(2, f0_up_key / 12) + # with open("test.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()])) + tf0 = self.sr // self.window # 每秒f0点数 + if inp_f0 is not None: + delta_t = np.round( + (inp_f0[:, 0].max() - inp_f0[:, 0].min()) * tf0 + 1 + ).astype("int16") + replace_f0 = np.interp( + list(range(delta_t)), inp_f0[:, 0] * 100, inp_f0[:, 1] + ) + shape = f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)].shape[0] + f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)] = replace_f0[ + :shape + ] + # with open("test_opt.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()])) + f0bak = f0.copy() + f0_mel = 1127 * np.log(1 + f0 / 700) + f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / ( + f0_mel_max - f0_mel_min + ) + 1 + f0_mel[f0_mel <= 1] = 1 + f0_mel[f0_mel > 255] = 255 + f0_coarse = np.rint(f0_mel).astype(np.int32) + return f0_coarse, f0bak # 1-0 + + def vc( + self, + model, + net_g, + sid, + audio0, + pitch, + pitchf, + times, + index, + big_npy, + index_rate, + version, + protect, + ): # ,file_index,file_big_npy + feats = torch.from_numpy(audio0) + if self.is_half: + feats = feats.half() + else: + feats = feats.float() + if feats.dim() == 2: # double channels + feats = feats.mean(-1) + assert feats.dim() == 1, 
feats.dim() + feats = feats.view(1, -1) + padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False) + + inputs = { + "source": feats.to(self.device), + "padding_mask": padding_mask, + "output_layer": 9 if version == "v1" else 12, + } + t0 = ttime() + with torch.no_grad(): + logits = model.extract_features(**inputs) + feats = model.final_proj(logits[0]) if version == "v1" else logits[0] + if protect < 0.5 and pitch is not None and pitchf is not None: + feats0 = feats.clone() + if ( + not isinstance(index, type(None)) + and not isinstance(big_npy, type(None)) + and index_rate != 0 + ): + npy = feats[0].cpu().numpy() + if self.is_half: + npy = npy.astype("float32") + + # _, I = index.search(npy, 1) + # npy = big_npy[I.squeeze()] + + score, ix = index.search(npy, k=8) + weight = np.square(1 / score) + weight /= weight.sum(axis=1, keepdims=True) + npy = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1) + + if self.is_half: + npy = npy.astype("float16") + feats = ( + torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate + + (1 - index_rate) * feats + ) + + feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1) + if protect < 0.5 and pitch is not None and pitchf is not None: + feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute( + 0, 2, 1 + ) + t1 = ttime() + p_len = audio0.shape[0] // self.window + if feats.shape[1] < p_len: + p_len = feats.shape[1] + if pitch is not None and pitchf is not None: + pitch = pitch[:, :p_len] + pitchf = pitchf[:, :p_len] + + if protect < 0.5 and pitch is not None and pitchf is not None: + pitchff = pitchf.clone() + pitchff[pitchf > 0] = 1 + pitchff[pitchf < 1] = protect + pitchff = pitchff.unsqueeze(-1) + feats = feats * pitchff + feats0 * (1 - pitchff) + feats = feats.to(feats0.dtype) + p_len = torch.tensor([p_len], device=self.device).long() + with torch.no_grad(): + hasp = pitch is not None and pitchf is not None + arg = (feats, p_len, pitch, pitchf, sid) if hasp else (feats, p_len, sid) + audio1 = (net_g.infer(*arg)[0][0, 0]).data.cpu().float().numpy() + del hasp, arg + del feats, p_len, padding_mask + if torch.cuda.is_available(): + torch.cuda.empty_cache() + t2 = ttime() + times[0] += t1 - t0 + times[2] += t2 - t1 + return audio1 + + def pipeline( + self, + model, + net_g, + sid, + audio, + input_audio_path, + times, + f0_up_key, + f0_method, + file_index, + index_rate, + if_f0, + filter_radius, + tgt_sr, + resample_sr, + rms_mix_rate, + version, + protect, + f0_file=None, + ): + if ( + file_index != "" + # and file_big_npy != "" + # and os.path.exists(file_big_npy) == True + and os.path.exists(file_index) + and index_rate != 0 + ): + try: + index = faiss.read_index(file_index) + # big_npy = np.load(file_big_npy) + big_npy = index.reconstruct_n(0, index.ntotal) + except: + traceback.print_exc() + index = big_npy = None + else: + index = big_npy = None + audio = signal.filtfilt(bh, ah, audio) + audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect") + opt_ts = [] + if audio_pad.shape[0] > self.t_max: + audio_sum = np.zeros_like(audio) + for i in range(self.window): + audio_sum += np.abs(audio_pad[i : i - self.window]) + for t in range(self.t_center, audio.shape[0], self.t_center): + opt_ts.append( + t + - self.t_query + + np.where( + audio_sum[t - self.t_query : t + self.t_query] + == audio_sum[t - self.t_query : t + self.t_query].min() + )[0][0] + ) + s = 0 + audio_opt = [] + t = None + t1 = ttime() + audio_pad = np.pad(audio, (self.t_pad, self.t_pad), 
mode="reflect") + p_len = audio_pad.shape[0] // self.window + inp_f0 = None + if hasattr(f0_file, "name"): + try: + with open(f0_file.name, "r") as f: + lines = f.read().strip("\n").split("\n") + inp_f0 = [] + for line in lines: + inp_f0.append([float(i) for i in line.split(",")]) + inp_f0 = np.array(inp_f0, dtype="float32") + except: + traceback.print_exc() + sid = torch.tensor(sid, device=self.device).unsqueeze(0).long() + pitch, pitchf = None, None + if if_f0 == 1: + pitch, pitchf = self.get_f0( + input_audio_path, + audio_pad, + p_len, + f0_up_key, + f0_method, + filter_radius, + inp_f0, + ) + pitch = pitch[:p_len] + pitchf = pitchf[:p_len] + if "mps" not in str(self.device) or "xpu" not in str(self.device): + pitchf = pitchf.astype(np.float32) + pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long() + pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float() + t2 = ttime() + times[1] += t2 - t1 + for t in opt_ts: + t = t // self.window * self.window + if if_f0 == 1: + audio_opt.append( + self.vc( + model, + net_g, + sid, + audio_pad[s : t + self.t_pad2 + self.window], + pitch[:, s // self.window : (t + self.t_pad2) // self.window], + pitchf[:, s // self.window : (t + self.t_pad2) // self.window], + times, + index, + big_npy, + index_rate, + version, + protect, + )[self.t_pad_tgt : -self.t_pad_tgt] + ) + else: + audio_opt.append( + self.vc( + model, + net_g, + sid, + audio_pad[s : t + self.t_pad2 + self.window], + None, + None, + times, + index, + big_npy, + index_rate, + version, + protect, + )[self.t_pad_tgt : -self.t_pad_tgt] + ) + s = t + if if_f0 == 1: + audio_opt.append( + self.vc( + model, + net_g, + sid, + audio_pad[t:], + pitch[:, t // self.window :] if t is not None else pitch, + pitchf[:, t // self.window :] if t is not None else pitchf, + times, + index, + big_npy, + index_rate, + version, + protect, + )[self.t_pad_tgt : -self.t_pad_tgt] + ) + else: + audio_opt.append( + self.vc( + model, + net_g, + sid, + audio_pad[t:], + None, + None, + times, + index, + big_npy, + index_rate, + version, + protect, + )[self.t_pad_tgt : -self.t_pad_tgt] + ) + audio_opt = np.concatenate(audio_opt) + if rms_mix_rate != 1: + audio_opt = change_rms(audio, 16000, audio_opt, tgt_sr, rms_mix_rate) + if tgt_sr != resample_sr >= 16000: + audio_opt = librosa.resample( + audio_opt, orig_sr=tgt_sr, target_sr=resample_sr + ) + audio_max = np.abs(audio_opt).max() / 0.99 + max_int16 = 32768 + if audio_max > 1: + max_int16 /= audio_max + audio_opt = (audio_opt * max_int16).astype(np.int16) + del pitch, pitchf, sid + if torch.cuda.is_available(): + torch.cuda.empty_cache() + return audio_opt diff --git a/infer/modules/vc/utils.py b/infer/modules/vc/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..7df184fccf1482d9d4f812a5f1b32ef9756fffa0 --- /dev/null +++ b/infer/modules/vc/utils.py @@ -0,0 +1,17 @@ +import os + + +def get_index_path_from_model(sid): + return next( + ( + f + for f in [ + os.path.join(root, name) + for root, _, files in os.walk(os.getenv("index_root"), topdown=False) + for name in files + if name.endswith(".index") and "trained" not in name + ] + if sid.split(".")[0] in f + ), + "", + ) diff --git a/model.py b/model.py new file mode 100644 index 0000000000000000000000000000000000000000..3846765ec92389e155d97acfc077d2aa4225b649 --- /dev/null +++ b/model.py @@ -0,0 +1,23 @@ +from accelerate import Accelerator +from infer.lib.rmvpe import RMVPE +from fairseq.checkpoint_utils import load_model_ensemble_and_task + +accelerator = Accelerator() 
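+# Accelerator() picks the runtime device (CUDA when available,
+# otherwise MPS or CPU) and reports the configured mixed-precision
+# mode; the two values below decide where the RMVPE and HuBERT
+# weights live and whether they are cast to fp16.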
+device = accelerator.device +print(f"Using device: {device}") + +fp16 = accelerator.mixed_precision == "fp16" +print(f"Using fp16: {fp16}") + +rmvpe_model_path = "assets/rmvpe/rmvpe.pt" +rmvpe = RMVPE(rmvpe_model_path, is_half=fp16, device=device) +print("RMVPE model loaded.") + +hubert_model_path = "assets/hubert/hubert_base.pt" +models, hubert_cfg, _ = load_model_ensemble_and_task([hubert_model_path]) +hubert = models[0] +hubert = hubert.to(device) +if fp16: + hubert = hubert.half() +hubert.eval() +print("Hubert model loaded.") diff --git a/prelude.py b/prelude.py new file mode 100644 index 0000000000000000000000000000000000000000..8160b6a60c995fd0a4e5748e8bb578283d7a09c5 --- /dev/null +++ b/prelude.py @@ -0,0 +1,26 @@ +import os + + +def prelude(): + os.environ["PYTORCH_JIT"] = "0v" + + # patch for jit script + # if we find `def expand_2d_or_3d_tensor(x,` in /usr/local/lib/python3.10/site-packages/fairseq/models/model_utils.py + # patch it with `def expand_2d_or_3d_tensor(x: Tensor,` + FAIRSEQ_CODE = ( + "/usr/local/lib/python3.10/site-packages/fairseq/models/model_utils.py" + ) + if os.path.exists(FAIRSEQ_CODE): + with open(FAIRSEQ_CODE, "r") as f: + lines = f.readlines() + with open(FAIRSEQ_CODE, "w") as f: + for line in lines: + if ( + "def expand_2d_or_3d_tensor(x, trg_dim: int, padding_idx: int):" + in line + ): + f.write( + "def expand_2d_or_3d_tensor(x: Tensor, trg_dim: int, padding_idx: int) -> Tensor:\n" + ) + else: + f.write(line) diff --git a/requirements.txt b/requirements.txt index 94a1876f4b2db37bdf396afb7d7cdbf83465308d..dce058017906a038ccba745b2eb2dd9e09b51446 100644 --- a/requirements.txt +++ b/requirements.txt @@ -25,6 +25,8 @@ tensorboard tqdm>=4.63.1 pyworld==0.3.2 httpx -onnxruntime-gpu python-dotenv>=1.0.0 av +accelerate==0.32.0 +demucs==4.0.1 +torchcrepe
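The retrieval step in the new infer/modules/vc/pipeline.py blends each HuBERT frame with its k=8 nearest neighbours from the FAISS index, weighted by inverse squared distance. A minimal standalone sketch of that blend, assuming `feats` is a (T, 768) float32 feature matrix (v2 models) and `big_npy` is the (N, 768) matrix reconstructed from the index, as in Pipeline.pipeline():

    import numpy as np
    import faiss

    index = faiss.IndexFlatL2(768)        # pipeline.py reads a prebuilt index from disk
    index.add(big_npy)

    score, ix = index.search(feats, k=8)  # squared L2 distances and row ids
    weight = np.square(1 / score)         # inverse-square weighting (no epsilon, as in the source)
    weight /= weight.sum(axis=1, keepdims=True)
    retrieved = np.sum(big_npy[ix] * weight[:, :, None], axis=1)

    index_rate = 0.5                      # blend ratio; index_rate == 0 skips retrieval entirely
    feats = index_rate * retrieved + (1 - index_rate) * feats

pipeline.py performs the same arithmetic on-device with torch tensors and casts the result back to fp16 when mixed precision is active.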
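On the pitch path, get_f0() first transposes by f0_up_key semitones (f0 *= 2 ** (f0_up_key / 12)) and then quantizes Hz to the 1..255 coarse bins the synthesizer embeds, using a mel-style log curve over the 50..1100 Hz range. A short recap of that mapping:

    import numpy as np

    f0_min, f0_max = 50.0, 1100.0
    f0_mel_min = 1127 * np.log(1 + f0_min / 700)
    f0_mel_max = 1127 * np.log(1 + f0_max / 700)

    def coarse_f0(f0: np.ndarray) -> np.ndarray:
        # Unvoiced frames (f0 == 0) stay at bin 1; voiced frames span 1..255.
        f0_mel = 1127 * np.log(1 + f0 / 700)
        voiced = f0_mel > 0
        f0_mel[voiced] = (f0_mel[voiced] - f0_mel_min) * 254 / (f0_mel_max - f0_mel_min) + 1
        return np.rint(np.clip(f0_mel, 1, 255)).astype(np.int32)

Pipeline.vc() passes both tensors to net_g.infer(): the coarse bins as `pitch` and the raw Hz curve as `pitchf`.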