diff --git a/.gitignore b/.gitignore index dde3895fc112ad34a839b2fed9210ac2288a959b..9ab76a12bc96eff64a46cc52cd9a22f7aa9ae58f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ .DS_Store *.pyc +__pycache__ diff --git a/README.md b/README.md index b3dfbd275aa433bc627da1fb3db77ab694736732..bd0ea5ec405147f732262cd1d65ef33a204a8dd9 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ --- -title: RVC Trainer +title: ZeroRVC emoji: 🦀 colorFrom: gray colorTo: gray @@ -9,4 +9,6 @@ app_file: app.py pinned: false --- -Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference +# ZeroRVC + +Run Retrieval-based Voice Conversion training and inference on HuggingFace ZeroGPU. diff --git a/app.py b/app.py index 084948dca95c5841b84ed94c694089eda5f19a78..92c14588e9a58dad43c47f210415cf359bcd3aa5 100644 --- a/app.py +++ b/app.py @@ -1,11 +1,12 @@ +from typing import Tuple +from prelude import prelude + +prelude() + import os import traceback - import numpy as np from sklearn.cluster import MiniBatchKMeans - -os.environ["PYTORCH_JIT"] = "0v" - from random import shuffle import gradio as gr import zipfile @@ -18,23 +19,12 @@ from infer.modules.train.extract.extract_f0_rmvpe import FeatureInput from infer.modules.train.extract_feature_print import HubertFeatureExtractor from infer.modules.train.train import train from infer.lib.train.process_ckpt import extract_small_model +from infer.modules.vc.modules import VC +from configs.config import Config +import demucs.separate +import soundfile as sf from zero import zero - -# patch for jit script -# if we find `def expand_2d_or_3d_tensor(x,` in /usr/local/lib/python3.10/site-packages/fairseq/models/model_utils.py -# patch it with `def expand_2d_or_3d_tensor(x: Tensor,` -FAIRSEQ_CODE = "/usr/local/lib/python3.10/site-packages/fairseq/models/model_utils.py" -if os.path.exists(FAIRSEQ_CODE): - with open(FAIRSEQ_CODE, "r") as f: - lines = f.readlines() - with open(FAIRSEQ_CODE, "w") as f: - for line in lines: - if "def expand_2d_or_3d_tensor(x, trg_dim: int, padding_idx: int):" in line: - f.write( - "def expand_2d_or_3d_tensor(x: Tensor, trg_dim: int, padding_idx: int) -> Tensor:\n" - ) - else: - f.write(line) +from model import device def extract_audio_files(zip_file: str, target_dir: str) -> list[str]: @@ -189,13 +179,15 @@ def download_weight(exp_dir: str) -> str: raise gr.Error("No model found") latest_model = max(models, key=os.path.getctime) + print(f"Latest model: {latest_model}") name = os.path.basename(exp_dir) + out = os.path.join(exp_dir, f"{name}.pth") extract_small_model( - latest_model, name, "40k", True, "Model trained by ZeroGPU.", "v2" + latest_model, out, "40k", True, "Model trained by ZeroGPU.", "v2" ) - return "assets/weights/%s.pth" % name + return out def train_index(exp_dir: str) -> str: @@ -269,9 +261,70 @@ def restore_expdir(zip: str) -> str: return exp_dir +@zero(duration=120) +def infer(exp_dir: str, original_audio: str, f0add: int) -> Tuple[int, np.ndarray]: + name = os.path.basename(exp_dir) + model = os.path.join(exp_dir, f"{name}.pth") + if not os.path.exists(model): + raise gr.Error("Model not found") + + index = glob(f"{exp_dir}/added_*.index") + if not index: + raise gr.Error("Index not found") + + base = os.path.basename(original_audio) + base = os.path.splitext(base)[0] + demucs.separate.main( + ["--two-stems", "vocals", "-d", str(device), "-n", "htdemucs", original_audio] + ) + out = os.path.join("separated", "htdemucs", base, "vocals.wav") + + cfg = Config() + vc = VC(cfg) + 
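# Descriptive gloss of the conversion step below, based on the visible call sites: + # get_vc() loads the small {name}.pth extracted after training, and vc_single() + # converts the demucs-separated vocal track using the rmvpe pitch estimator, the + # trained feature index (added_*.index, index rate 0.5), and the f0add transpose + # chosen in the UI. +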
vc.get_vc(model) + _, wav_opt = vc.vc_single( + 0, + out, + f0add, + None, + "rmvpe", + index, + None, + 0.5, + 3, + 0, + 1, + 0.33, + ) + + sr = wav_opt[0] + data = wav_opt[1] + + return sr, data + + +def merge(exp_dir: str, original_audio: str, vocal: Tuple[int, np.ndarray]) -> str: + base = os.path.basename(original_audio) + base = os.path.splitext(base)[0] + music = os.path.join("separated", "htdemucs", base, "no-vocals.wav") + + tmp = os.path.join(exp_dir, "tmp.wav") + sf.write(tmp, vocal[1], vocal[0]) + + os.system( + f"ffmpeg -i {music} -i {tmp} -filter_complex '[1]volume=2[a];[0][a]amix=inputs=2:duration=first:dropout_transition=2' {tmp}.merged.mp3" + ) + + return f"{tmp}.merged.mp3" + + with gr.Blocks() as app: # allow user to manually select the experiment directory - exp_dir = gr.Textbox(label="Experiment directory (don't touch it unless you know what you are doing)", visible=True, interactive=True) + exp_dir = gr.Textbox( + label="Experiment directory (don't touch it unless you know what you are doing)", + visible=True, + interactive=True, + ) with gr.Tabs(): with gr.Tab(label="New / Restore"): @@ -284,10 +337,10 @@ with gr.Blocks() as app: preprocess_output = gr.Textbox( label="Preprocessing output", lines=5 ) - with gr.Column(): - preprocess_btn = gr.Button( - value="Start New Experiment", variant="primary" - ) + + preprocess_btn = gr.Button( + value="Start New Experiment", variant="primary" + ) with gr.Row(): restore_zip_file = gr.File( @@ -327,6 +380,26 @@ with gr.Blocks() as app: ) download_expdir_output = gr.File(label="Download experiment directory") + with gr.Tab(label="Inference"): + with gr.Row(): + original_audio = gr.Audio( + label="Upload original audio", + type="filepath", + show_download_button=True, + ) + f0add = gr.Slider( + label="F0 add", + minimum=-16, + maximum=16, + step=1, + value=0, + ) + infer_btn = gr.Button(value="Infer", variant="primary") + with gr.Row(): + infer_output = gr.Audio(label="Inferred audio") + with gr.Row(): + merge_output = gr.Audio(label="Merged audio") + preprocess_btn.click( fn=preprocess, inputs=[zip_file], @@ -343,6 +416,10 @@ with gr.Blocks() as app: fn=train_model, inputs=[exp_dir], outputs=[latest_model], + ).success( + fn=train_model, + inputs=[exp_dir], + outputs=[latest_model], ) train_index_btn.click( @@ -369,4 +446,14 @@ with gr.Blocks() as app: outputs=[exp_dir], ) + infer_btn.click( + fn=infer, + inputs=[exp_dir, original_audio, f0add], + outputs=[infer_output], + ).success( + fn=merge, + inputs=[exp_dir, original_audio, infer_output], + outputs=[merge_output], + ) + app.launch() diff --git a/assets/pretrained_v2/D40k.pth b/assets/pretrained_v2/D40k.pth deleted file mode 100644 index 6d13aea9208310573b59309a9c80310ef71c5547..0000000000000000000000000000000000000000 --- a/assets/pretrained_v2/D40k.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:471378e894e7191f89a94eda8288c5947b16bbe0b10c3f1f17efdb7a1d998242 -size 142875703 diff --git a/assets/pretrained_v2/G40k.pth b/assets/pretrained_v2/G40k.pth deleted file mode 100644 index ee39bf64a1fc1d0d8154e242a3b60ef3e2abf0ca..0000000000000000000000000000000000000000 --- a/assets/pretrained_v2/G40k.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a3843da7fde33db1dab176146c70d6c2df06eafe9457f4e3aa10024e9c6a4b69 -size 72959671 diff --git a/config.json b/config.json index 5ce52f4aa8b16e161e1edb4e57f4d8aeec835e74..7c98e5bad3607000dc35dccf0288b38f3a29b0cf 100644 --- a/config.json +++ b/config.json @@ 
-67,7 +67,7 @@ "c_mel": 45, "epochs": 20000, "eps": 1e-09, - "fp16_run": false, + "fp16_run": true, "init_lr_ratio": 1, "learning_rate": 0.0001, "log_interval": 200, diff --git a/configs/config.py b/configs/config.py new file mode 100644 index 0000000000000000000000000000000000000000..c3c16599d94254d9ff04ba38e0a3757af5ed06f3 --- /dev/null +++ b/configs/config.py @@ -0,0 +1,245 @@ +import argparse +import os +import sys +import json +import shutil +from multiprocessing import cpu_count + +import torch +import logging +from model import device, fp16 + +logger = logging.getLogger(__name__) + + +version_config_list = [ + "v1/32k.json", + "v1/40k.json", + "v1/48k.json", + "v2/48k.json", + "v2/32k.json", +] + + +def singleton_variable(func): + def wrapper(*args, **kwargs): + if not wrapper.instance: + wrapper.instance = func(*args, **kwargs) + return wrapper.instance + + wrapper.instance = None + return wrapper + + +@singleton_variable +class Config: + def __init__(self): + self.device = str(device) + self.is_half = fp16 + self.use_jit = False + self.n_cpu = 0 + self.gpu_name = None + self.json_config = self.load_config_json() + self.gpu_mem = None + ( + self.python_cmd, + self.listen_port, + self.iscolab, + self.noparallel, + self.noautoopen, + self.dml, + ) = self.arg_parse() + self.instead = "" + self.preprocess_per = 3.7 + self.x_pad, self.x_query, self.x_center, self.x_max = self.device_config() + + @staticmethod + def load_config_json() -> dict: + d = {} + # for config_file in version_config_list: + # p = f"configs/inuse/{config_file}" + # if not os.path.exists(p): + # shutil.copy(f"configs/{config_file}", p) + # with open(f"configs/inuse/{config_file}", "r") as f: + # d[config_file] = json.load(f) + return d + + @staticmethod + def arg_parse() -> tuple: + exe = sys.executable or "python" + parser = argparse.ArgumentParser() + parser.add_argument("--port", type=int, default=7865, help="Listen port") + parser.add_argument("--pycmd", type=str, default=exe, help="Python command") + parser.add_argument("--colab", action="store_true", help="Launch in colab") + parser.add_argument( + "--noparallel", action="store_true", help="Disable parallel processing" + ) + parser.add_argument( + "--noautoopen", + action="store_true", + help="Do not open in browser automatically", + ) + parser.add_argument( + "--dml", + action="store_true", + help="Use the DirectML backend (torch_directml)", + ) + cmd_opts = parser.parse_args() + + cmd_opts.port = cmd_opts.port if 0 <= cmd_opts.port <= 65535 else 7865 + + return ( + cmd_opts.pycmd, + cmd_opts.port, + cmd_opts.colab, + cmd_opts.noparallel, + cmd_opts.noautoopen, + cmd_opts.dml, + ) + + # has_mps is only available in nightly pytorch (for now) and macOS 12.3+.
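+ # Note: is_available() can return True on builds that still fail at the first + # allocation, so has_mps() also probes with torch.zeros(1).to("mps").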
+ # check `getattr` and try it for compatibility + @staticmethod + def has_mps() -> bool: + if not torch.backends.mps.is_available(): + return False + try: + torch.zeros(1).to(torch.device("mps")) + return True + except Exception: + return False + + @staticmethod + def has_xpu() -> bool: + if hasattr(torch, "xpu") and torch.xpu.is_available(): + return True + else: + return False + + def use_fp32_config(self): + for config_file in version_config_list: + # json_config is an empty stub here (see load_config_json), so guard + # the lookup and skip configs that were never copied to configs/inuse + if config_file in self.json_config: + self.json_config[config_file]["train"]["fp16_run"] = False + p = f"configs/inuse/{config_file}" + if not os.path.exists(p): + continue + with open(p, "r") as f: + strr = f.read().replace("true", "false") + with open(p, "w") as f: + f.write(strr) + logger.info("overwrite " + config_file) + self.preprocess_per = 3.0 + logger.info("overwrite preprocess_per to %s" % (self.preprocess_per)) + + def device_config(self) -> tuple: + if torch.cuda.is_available(): + if self.has_xpu(): + self.device = self.instead = "xpu:0" + self.is_half = True + i_device = int(self.device.split(":")[-1]) + self.gpu_name = torch.cuda.get_device_name(i_device) + if ( + ("16" in self.gpu_name and "V100" not in self.gpu_name.upper()) + or "P40" in self.gpu_name.upper() + or "P10" in self.gpu_name.upper() + or "1060" in self.gpu_name + or "1070" in self.gpu_name + or "1080" in self.gpu_name + ): + logger.info("Found GPU %s, force to fp32", self.gpu_name) + self.is_half = False + self.use_fp32_config() + else: + logger.info("Found GPU %s", self.gpu_name) + self.gpu_mem = int( + torch.cuda.get_device_properties(i_device).total_memory + / 1024 + / 1024 + / 1024 + + 0.4 + ) + if self.gpu_mem <= 4: + self.preprocess_per = 3.0 + elif self.has_mps(): + logger.info("No supported Nvidia GPU found") + self.device = self.instead = "mps" + self.is_half = False + self.use_fp32_config() + else: + logger.info("No supported Nvidia GPU found") + self.device = self.instead = "cpu" + self.is_half = False + self.use_fp32_config() + + if self.n_cpu == 0: + self.n_cpu = cpu_count() + + if self.is_half: + # 6 GB VRAM configuration + x_pad = 3 + x_query = 10 + x_center = 60 + x_max = 65 + else: + # 5 GB VRAM configuration + x_pad = 1 + x_query = 6 + x_center = 38 + x_max = 41 + + if self.gpu_mem is not None and self.gpu_mem <= 4: + x_pad = 1 + x_query = 5 + x_center = 30 + x_max = 32 + if self.dml: + logger.info("Use DirectML instead") + if not os.path.exists( + r"runtime\Lib\site-packages\onnxruntime\capi\DirectML.dll" + ): + try: + os.rename( + r"runtime\Lib\site-packages\onnxruntime", + r"runtime\Lib\site-packages\onnxruntime-cuda", + ) + except OSError: + pass + try: + os.rename( + r"runtime\Lib\site-packages\onnxruntime-dml", + r"runtime\Lib\site-packages\onnxruntime", + ) + except OSError: + pass + # if self.device != "cpu": + import torch_directml + + self.device = torch_directml.device(torch_directml.default_device()) + self.is_half = False + else: + if self.instead: + logger.info(f"Use {self.instead} instead") + if not os.path.exists( + r"runtime\Lib\site-packages\onnxruntime\capi\onnxruntime_providers_cuda.dll" + ): + try: + os.rename( + r"runtime\Lib\site-packages\onnxruntime", + r"runtime\Lib\site-packages\onnxruntime-dml", + ) + except OSError: + pass + try: + os.rename( + r"runtime\Lib\site-packages\onnxruntime-cuda", + r"runtime\Lib\site-packages\onnxruntime", + ) + except OSError: + pass + logger.info( + "Half-precision floating-point: %s, device: %s" + % (self.is_half, self.device) + ) + return x_pad, x_query, x_center, x_max diff --git a/infer/lib/audio.py b/infer/lib/audio.py index
90c825e82a6ba8b7d511e5d07d171d058de452aa..d43e5d033275cc9f8159a8470efa2180105a576a 100644 --- a/infer/lib/audio.py +++ b/infer/lib/audio.py @@ -1,8 +1,8 @@ import platform, os +import traceback import ffmpeg import numpy as np import av -from io import BytesIO def wav2(i, o, format): diff --git a/infer/lib/rmvpe.py b/infer/lib/rmvpe.py index 86c6899e3a4c55fc5cef8f195e994e026aa1345a..6a46ac80a38ca6766f883deeea1c1d4a5a097b6a 100644 --- a/infer/lib/rmvpe.py +++ b/infer/lib/rmvpe.py @@ -1,24 +1,14 @@ from io import BytesIO import os -from typing import List, Optional, Tuple +from typing import List import numpy as np import torch from infer.lib import jit -try: - # Fix "Torch not compiled with CUDA enabled" - import intel_extension_for_pytorch as ipex # pylint: disable=import-error, unused-import - - if torch.xpu.is_available(): - from infer.modules.ipex import ipex_init - - ipex_init() -except Exception: # pylint: disable=broad-exception-caught - pass import torch.nn as nn import torch.nn.functional as F -from librosa.util import normalize, pad_center, tiny +from librosa.util import pad_center from scipy.signal import get_window import logging diff --git a/infer/lib/train/process_ckpt.py b/infer/lib/train/process_ckpt.py index 2529ccf6fb05935258af44bf9f3aa204532696ba..3f131e1e5a95adc2cf0eac2b503c8492b5bbf351 100644 --- a/infer/lib/train/process_ckpt.py +++ b/infer/lib/train/process_ckpt.py @@ -61,7 +61,7 @@ def show_info(path): return traceback.format_exc() -def extract_small_model(path, name, sr, if_f0, info, version): +def extract_small_model(path, out, sr, if_f0, info, version): try: ckpt = torch.load(path, map_location="cpu") if "model" in ckpt: @@ -185,7 +185,7 @@ def extract_small_model(path, name, sr, if_f0, info, version): opt["version"] = version opt["sr"] = sr opt["f0"] = int(if_f0) - torch.save(opt, "assets/weights/%s.pth" % name) + torch.save(opt, out) return "Success." except: return traceback.format_exc() diff --git a/infer/lib/uvr5_pack/lib_v5/dataset.py b/infer/lib/uvr5_pack/lib_v5/dataset.py deleted file mode 100644 index cfd01a174978d97180a897e40cb59ecadec1d12e..0000000000000000000000000000000000000000 --- a/infer/lib/uvr5_pack/lib_v5/dataset.py +++ /dev/null @@ -1,183 +0,0 @@ -import os -import random - -import numpy as np -import torch -import torch.utils.data -from tqdm import tqdm - -from . 
import spec_utils - - -class VocalRemoverValidationSet(torch.utils.data.Dataset): - def __init__(self, patch_list): - self.patch_list = patch_list - - def __len__(self): - return len(self.patch_list) - - def __getitem__(self, idx): - path = self.patch_list[idx] - data = np.load(path) - - X, y = data["X"], data["y"] - - X_mag = np.abs(X) - y_mag = np.abs(y) - - return X_mag, y_mag - - -def make_pair(mix_dir, inst_dir): - input_exts = [".wav", ".m4a", ".mp3", ".mp4", ".flac"] - - X_list = sorted( - [ - os.path.join(mix_dir, fname) - for fname in os.listdir(mix_dir) - if os.path.splitext(fname)[1] in input_exts - ] - ) - y_list = sorted( - [ - os.path.join(inst_dir, fname) - for fname in os.listdir(inst_dir) - if os.path.splitext(fname)[1] in input_exts - ] - ) - - filelist = list(zip(X_list, y_list)) - - return filelist - - -def train_val_split(dataset_dir, split_mode, val_rate, val_filelist): - if split_mode == "random": - filelist = make_pair( - os.path.join(dataset_dir, "mixtures"), - os.path.join(dataset_dir, "instruments"), - ) - - random.shuffle(filelist) - - if len(val_filelist) == 0: - val_size = int(len(filelist) * val_rate) - train_filelist = filelist[:-val_size] - val_filelist = filelist[-val_size:] - else: - train_filelist = [ - pair for pair in filelist if list(pair) not in val_filelist - ] - elif split_mode == "subdirs": - if len(val_filelist) != 0: - raise ValueError( - "The `val_filelist` option is not available in `subdirs` mode" - ) - - train_filelist = make_pair( - os.path.join(dataset_dir, "training/mixtures"), - os.path.join(dataset_dir, "training/instruments"), - ) - - val_filelist = make_pair( - os.path.join(dataset_dir, "validation/mixtures"), - os.path.join(dataset_dir, "validation/instruments"), - ) - - return train_filelist, val_filelist - - -def augment(X, y, reduction_rate, reduction_mask, mixup_rate, mixup_alpha): - perm = np.random.permutation(len(X)) - for i, idx in enumerate(tqdm(perm)): - if np.random.uniform() < reduction_rate: - y[idx] = spec_utils.reduce_vocal_aggressively( - X[idx], y[idx], reduction_mask - ) - - if np.random.uniform() < 0.5: - # swap channel - X[idx] = X[idx, ::-1] - y[idx] = y[idx, ::-1] - if np.random.uniform() < 0.02: - # mono - X[idx] = X[idx].mean(axis=0, keepdims=True) - y[idx] = y[idx].mean(axis=0, keepdims=True) - if np.random.uniform() < 0.02: - # inst - X[idx] = y[idx] - - if np.random.uniform() < mixup_rate and i < len(perm) - 1: - lam = np.random.beta(mixup_alpha, mixup_alpha) - X[idx] = lam * X[idx] + (1 - lam) * X[perm[i + 1]] - y[idx] = lam * y[idx] + (1 - lam) * y[perm[i + 1]] - - return X, y - - -def make_padding(width, cropsize, offset): - left = offset - roi_size = cropsize - left * 2 - if roi_size == 0: - roi_size = cropsize - right = roi_size - (width % roi_size) + left - - return left, right, roi_size - - -def make_training_set(filelist, cropsize, patches, sr, hop_length, n_fft, offset): - len_dataset = patches * len(filelist) - - X_dataset = np.zeros((len_dataset, 2, n_fft // 2 + 1, cropsize), dtype=np.complex64) - y_dataset = np.zeros((len_dataset, 2, n_fft // 2 + 1, cropsize), dtype=np.complex64) - - for i, (X_path, y_path) in enumerate(tqdm(filelist)): - X, y = spec_utils.cache_or_load(X_path, y_path, sr, hop_length, n_fft) - coef = np.max([np.abs(X).max(), np.abs(y).max()]) - X, y = X / coef, y / coef - - l, r, roi_size = make_padding(X.shape[2], cropsize, offset) - X_pad = np.pad(X, ((0, 0), (0, 0), (l, r)), mode="constant") - y_pad = np.pad(y, ((0, 0), (0, 0), (l, r)), mode="constant") - - starts = 
np.random.randint(0, X_pad.shape[2] - cropsize, patches) - ends = starts + cropsize - for j in range(patches): - idx = i * patches + j - X_dataset[idx] = X_pad[:, :, starts[j] : ends[j]] - y_dataset[idx] = y_pad[:, :, starts[j] : ends[j]] - - return X_dataset, y_dataset - - -def make_validation_set(filelist, cropsize, sr, hop_length, n_fft, offset): - patch_list = [] - patch_dir = "cs{}_sr{}_hl{}_nf{}_of{}".format( - cropsize, sr, hop_length, n_fft, offset - ) - os.makedirs(patch_dir, exist_ok=True) - - for i, (X_path, y_path) in enumerate(tqdm(filelist)): - basename = os.path.splitext(os.path.basename(X_path))[0] - - X, y = spec_utils.cache_or_load(X_path, y_path, sr, hop_length, n_fft) - coef = np.max([np.abs(X).max(), np.abs(y).max()]) - X, y = X / coef, y / coef - - l, r, roi_size = make_padding(X.shape[2], cropsize, offset) - X_pad = np.pad(X, ((0, 0), (0, 0), (l, r)), mode="constant") - y_pad = np.pad(y, ((0, 0), (0, 0), (l, r)), mode="constant") - - len_dataset = int(np.ceil(X.shape[2] / roi_size)) - for j in range(len_dataset): - outpath = os.path.join(patch_dir, "{}_p{}.npz".format(basename, j)) - start = j * roi_size - if not os.path.exists(outpath): - np.savez( - outpath, - X=X_pad[:, :, start : start + cropsize], - y=y_pad[:, :, start : start + cropsize], - ) - patch_list.append(outpath) - - return VocalRemoverValidationSet(patch_list) diff --git a/infer/lib/uvr5_pack/lib_v5/layers.py b/infer/lib/uvr5_pack/lib_v5/layers.py deleted file mode 100644 index 4fc1b5cb85a3327f60cbb9f5deffbeeaaac516ad..0000000000000000000000000000000000000000 --- a/infer/lib/uvr5_pack/lib_v5/layers.py +++ /dev/null @@ -1,118 +0,0 @@ -import torch -import torch.nn.functional as F -from torch import nn - -from . import spec_utils - - -class Conv2DBNActiv(nn.Module): - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): - super(Conv2DBNActiv, self).__init__() - self.conv = nn.Sequential( - nn.Conv2d( - nin, - nout, - kernel_size=ksize, - stride=stride, - padding=pad, - dilation=dilation, - bias=False, - ), - nn.BatchNorm2d(nout), - activ(), - ) - - def __call__(self, x): - return self.conv(x) - - -class SeperableConv2DBNActiv(nn.Module): - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): - super(SeperableConv2DBNActiv, self).__init__() - self.conv = nn.Sequential( - nn.Conv2d( - nin, - nin, - kernel_size=ksize, - stride=stride, - padding=pad, - dilation=dilation, - groups=nin, - bias=False, - ), - nn.Conv2d(nin, nout, kernel_size=1, bias=False), - nn.BatchNorm2d(nout), - activ(), - ) - - def __call__(self, x): - return self.conv(x) - - -class Encoder(nn.Module): - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): - super(Encoder, self).__init__() - self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) - self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) - - def __call__(self, x): - skip = self.conv1(x) - h = self.conv2(skip) - - return h, skip - - -class Decoder(nn.Module): - def __init__( - self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False - ): - super(Decoder, self).__init__() - self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) - self.dropout = nn.Dropout2d(0.1) if dropout else None - - def __call__(self, x, skip=None): - x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) - if skip is not None: - skip = spec_utils.crop_center(skip, x) - x = torch.cat([x, skip], dim=1) - h = self.conv(x) - - if self.dropout is not None: 
- h = self.dropout(h) - - return h - - -class ASPPModule(nn.Module): - def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU): - super(ASPPModule, self).__init__() - self.conv1 = nn.Sequential( - nn.AdaptiveAvgPool2d((1, None)), - Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), - ) - self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) - self.conv3 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[0], dilations[0], activ=activ - ) - self.conv4 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[1], dilations[1], activ=activ - ) - self.conv5 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[2], dilations[2], activ=activ - ) - self.bottleneck = nn.Sequential( - Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) - ) - - def forward(self, x): - _, _, h, w = x.size() - feat1 = F.interpolate( - self.conv1(x), size=(h, w), mode="bilinear", align_corners=True - ) - feat2 = self.conv2(x) - feat3 = self.conv3(x) - feat4 = self.conv4(x) - feat5 = self.conv5(x) - out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1) - bottle = self.bottleneck(out) - return bottle diff --git a/infer/lib/uvr5_pack/lib_v5/layers_123812KB .py b/infer/lib/uvr5_pack/lib_v5/layers_123812KB .py deleted file mode 100644 index 4fc1b5cb85a3327f60cbb9f5deffbeeaaac516ad..0000000000000000000000000000000000000000 --- a/infer/lib/uvr5_pack/lib_v5/layers_123812KB .py +++ /dev/null @@ -1,118 +0,0 @@ -import torch -import torch.nn.functional as F -from torch import nn - -from . import spec_utils - - -class Conv2DBNActiv(nn.Module): - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): - super(Conv2DBNActiv, self).__init__() - self.conv = nn.Sequential( - nn.Conv2d( - nin, - nout, - kernel_size=ksize, - stride=stride, - padding=pad, - dilation=dilation, - bias=False, - ), - nn.BatchNorm2d(nout), - activ(), - ) - - def __call__(self, x): - return self.conv(x) - - -class SeperableConv2DBNActiv(nn.Module): - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): - super(SeperableConv2DBNActiv, self).__init__() - self.conv = nn.Sequential( - nn.Conv2d( - nin, - nin, - kernel_size=ksize, - stride=stride, - padding=pad, - dilation=dilation, - groups=nin, - bias=False, - ), - nn.Conv2d(nin, nout, kernel_size=1, bias=False), - nn.BatchNorm2d(nout), - activ(), - ) - - def __call__(self, x): - return self.conv(x) - - -class Encoder(nn.Module): - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): - super(Encoder, self).__init__() - self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) - self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) - - def __call__(self, x): - skip = self.conv1(x) - h = self.conv2(skip) - - return h, skip - - -class Decoder(nn.Module): - def __init__( - self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False - ): - super(Decoder, self).__init__() - self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) - self.dropout = nn.Dropout2d(0.1) if dropout else None - - def __call__(self, x, skip=None): - x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) - if skip is not None: - skip = spec_utils.crop_center(skip, x) - x = torch.cat([x, skip], dim=1) - h = self.conv(x) - - if self.dropout is not None: - h = self.dropout(h) - - return h - - -class ASPPModule(nn.Module): - def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU): - super(ASPPModule, self).__init__() - self.conv1 = nn.Sequential( - 
nn.AdaptiveAvgPool2d((1, None)), - Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), - ) - self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) - self.conv3 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[0], dilations[0], activ=activ - ) - self.conv4 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[1], dilations[1], activ=activ - ) - self.conv5 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[2], dilations[2], activ=activ - ) - self.bottleneck = nn.Sequential( - Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) - ) - - def forward(self, x): - _, _, h, w = x.size() - feat1 = F.interpolate( - self.conv1(x), size=(h, w), mode="bilinear", align_corners=True - ) - feat2 = self.conv2(x) - feat3 = self.conv3(x) - feat4 = self.conv4(x) - feat5 = self.conv5(x) - out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1) - bottle = self.bottleneck(out) - return bottle diff --git a/infer/lib/uvr5_pack/lib_v5/layers_123821KB.py b/infer/lib/uvr5_pack/lib_v5/layers_123821KB.py deleted file mode 100644 index 4fc1b5cb85a3327f60cbb9f5deffbeeaaac516ad..0000000000000000000000000000000000000000 --- a/infer/lib/uvr5_pack/lib_v5/layers_123821KB.py +++ /dev/null @@ -1,118 +0,0 @@ -import torch -import torch.nn.functional as F -from torch import nn - -from . import spec_utils - - -class Conv2DBNActiv(nn.Module): - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): - super(Conv2DBNActiv, self).__init__() - self.conv = nn.Sequential( - nn.Conv2d( - nin, - nout, - kernel_size=ksize, - stride=stride, - padding=pad, - dilation=dilation, - bias=False, - ), - nn.BatchNorm2d(nout), - activ(), - ) - - def __call__(self, x): - return self.conv(x) - - -class SeperableConv2DBNActiv(nn.Module): - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): - super(SeperableConv2DBNActiv, self).__init__() - self.conv = nn.Sequential( - nn.Conv2d( - nin, - nin, - kernel_size=ksize, - stride=stride, - padding=pad, - dilation=dilation, - groups=nin, - bias=False, - ), - nn.Conv2d(nin, nout, kernel_size=1, bias=False), - nn.BatchNorm2d(nout), - activ(), - ) - - def __call__(self, x): - return self.conv(x) - - -class Encoder(nn.Module): - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): - super(Encoder, self).__init__() - self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) - self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) - - def __call__(self, x): - skip = self.conv1(x) - h = self.conv2(skip) - - return h, skip - - -class Decoder(nn.Module): - def __init__( - self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False - ): - super(Decoder, self).__init__() - self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) - self.dropout = nn.Dropout2d(0.1) if dropout else None - - def __call__(self, x, skip=None): - x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) - if skip is not None: - skip = spec_utils.crop_center(skip, x) - x = torch.cat([x, skip], dim=1) - h = self.conv(x) - - if self.dropout is not None: - h = self.dropout(h) - - return h - - -class ASPPModule(nn.Module): - def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU): - super(ASPPModule, self).__init__() - self.conv1 = nn.Sequential( - nn.AdaptiveAvgPool2d((1, None)), - Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), - ) - self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) - self.conv3 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[0], 
dilations[0], activ=activ - ) - self.conv4 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[1], dilations[1], activ=activ - ) - self.conv5 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[2], dilations[2], activ=activ - ) - self.bottleneck = nn.Sequential( - Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) - ) - - def forward(self, x): - _, _, h, w = x.size() - feat1 = F.interpolate( - self.conv1(x), size=(h, w), mode="bilinear", align_corners=True - ) - feat2 = self.conv2(x) - feat3 = self.conv3(x) - feat4 = self.conv4(x) - feat5 = self.conv5(x) - out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1) - bottle = self.bottleneck(out) - return bottle diff --git a/infer/lib/uvr5_pack/lib_v5/layers_33966KB.py b/infer/lib/uvr5_pack/lib_v5/layers_33966KB.py deleted file mode 100644 index 9b127bc6427f5c60c8cf85603a3d8a093c3501c4..0000000000000000000000000000000000000000 --- a/infer/lib/uvr5_pack/lib_v5/layers_33966KB.py +++ /dev/null @@ -1,126 +0,0 @@ -import torch -import torch.nn.functional as F -from torch import nn - -from . import spec_utils - - -class Conv2DBNActiv(nn.Module): - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): - super(Conv2DBNActiv, self).__init__() - self.conv = nn.Sequential( - nn.Conv2d( - nin, - nout, - kernel_size=ksize, - stride=stride, - padding=pad, - dilation=dilation, - bias=False, - ), - nn.BatchNorm2d(nout), - activ(), - ) - - def __call__(self, x): - return self.conv(x) - - -class SeperableConv2DBNActiv(nn.Module): - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): - super(SeperableConv2DBNActiv, self).__init__() - self.conv = nn.Sequential( - nn.Conv2d( - nin, - nin, - kernel_size=ksize, - stride=stride, - padding=pad, - dilation=dilation, - groups=nin, - bias=False, - ), - nn.Conv2d(nin, nout, kernel_size=1, bias=False), - nn.BatchNorm2d(nout), - activ(), - ) - - def __call__(self, x): - return self.conv(x) - - -class Encoder(nn.Module): - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): - super(Encoder, self).__init__() - self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) - self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) - - def __call__(self, x): - skip = self.conv1(x) - h = self.conv2(skip) - - return h, skip - - -class Decoder(nn.Module): - def __init__( - self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False - ): - super(Decoder, self).__init__() - self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) - self.dropout = nn.Dropout2d(0.1) if dropout else None - - def __call__(self, x, skip=None): - x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) - if skip is not None: - skip = spec_utils.crop_center(skip, x) - x = torch.cat([x, skip], dim=1) - h = self.conv(x) - - if self.dropout is not None: - h = self.dropout(h) - - return h - - -class ASPPModule(nn.Module): - def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU): - super(ASPPModule, self).__init__() - self.conv1 = nn.Sequential( - nn.AdaptiveAvgPool2d((1, None)), - Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), - ) - self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) - self.conv3 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[0], dilations[0], activ=activ - ) - self.conv4 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[1], dilations[1], activ=activ - ) - self.conv5 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[2], dilations[2], 
activ=activ - ) - self.conv6 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[2], dilations[2], activ=activ - ) - self.conv7 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[2], dilations[2], activ=activ - ) - self.bottleneck = nn.Sequential( - Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) - ) - - def forward(self, x): - _, _, h, w = x.size() - feat1 = F.interpolate( - self.conv1(x), size=(h, w), mode="bilinear", align_corners=True - ) - feat2 = self.conv2(x) - feat3 = self.conv3(x) - feat4 = self.conv4(x) - feat5 = self.conv5(x) - feat6 = self.conv6(x) - feat7 = self.conv7(x) - out = torch.cat((feat1, feat2, feat3, feat4, feat5, feat6, feat7), dim=1) - bottle = self.bottleneck(out) - return bottle diff --git a/infer/lib/uvr5_pack/lib_v5/layers_537227KB.py b/infer/lib/uvr5_pack/lib_v5/layers_537227KB.py deleted file mode 100644 index 9b127bc6427f5c60c8cf85603a3d8a093c3501c4..0000000000000000000000000000000000000000 --- a/infer/lib/uvr5_pack/lib_v5/layers_537227KB.py +++ /dev/null @@ -1,126 +0,0 @@ -import torch -import torch.nn.functional as F -from torch import nn - -from . import spec_utils - - -class Conv2DBNActiv(nn.Module): - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): - super(Conv2DBNActiv, self).__init__() - self.conv = nn.Sequential( - nn.Conv2d( - nin, - nout, - kernel_size=ksize, - stride=stride, - padding=pad, - dilation=dilation, - bias=False, - ), - nn.BatchNorm2d(nout), - activ(), - ) - - def __call__(self, x): - return self.conv(x) - - -class SeperableConv2DBNActiv(nn.Module): - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): - super(SeperableConv2DBNActiv, self).__init__() - self.conv = nn.Sequential( - nn.Conv2d( - nin, - nin, - kernel_size=ksize, - stride=stride, - padding=pad, - dilation=dilation, - groups=nin, - bias=False, - ), - nn.Conv2d(nin, nout, kernel_size=1, bias=False), - nn.BatchNorm2d(nout), - activ(), - ) - - def __call__(self, x): - return self.conv(x) - - -class Encoder(nn.Module): - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): - super(Encoder, self).__init__() - self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) - self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) - - def __call__(self, x): - skip = self.conv1(x) - h = self.conv2(skip) - - return h, skip - - -class Decoder(nn.Module): - def __init__( - self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False - ): - super(Decoder, self).__init__() - self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) - self.dropout = nn.Dropout2d(0.1) if dropout else None - - def __call__(self, x, skip=None): - x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) - if skip is not None: - skip = spec_utils.crop_center(skip, x) - x = torch.cat([x, skip], dim=1) - h = self.conv(x) - - if self.dropout is not None: - h = self.dropout(h) - - return h - - -class ASPPModule(nn.Module): - def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU): - super(ASPPModule, self).__init__() - self.conv1 = nn.Sequential( - nn.AdaptiveAvgPool2d((1, None)), - Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), - ) - self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) - self.conv3 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[0], dilations[0], activ=activ - ) - self.conv4 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[1], dilations[1], activ=activ - ) - self.conv5 = 
SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[2], dilations[2], activ=activ - ) - self.conv6 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[2], dilations[2], activ=activ - ) - self.conv7 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[2], dilations[2], activ=activ - ) - self.bottleneck = nn.Sequential( - Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) - ) - - def forward(self, x): - _, _, h, w = x.size() - feat1 = F.interpolate( - self.conv1(x), size=(h, w), mode="bilinear", align_corners=True - ) - feat2 = self.conv2(x) - feat3 = self.conv3(x) - feat4 = self.conv4(x) - feat5 = self.conv5(x) - feat6 = self.conv6(x) - feat7 = self.conv7(x) - out = torch.cat((feat1, feat2, feat3, feat4, feat5, feat6, feat7), dim=1) - bottle = self.bottleneck(out) - return bottle diff --git a/infer/lib/uvr5_pack/lib_v5/layers_537238KB.py b/infer/lib/uvr5_pack/lib_v5/layers_537238KB.py deleted file mode 100644 index 9b127bc6427f5c60c8cf85603a3d8a093c3501c4..0000000000000000000000000000000000000000 --- a/infer/lib/uvr5_pack/lib_v5/layers_537238KB.py +++ /dev/null @@ -1,126 +0,0 @@ -import torch -import torch.nn.functional as F -from torch import nn - -from . import spec_utils - - -class Conv2DBNActiv(nn.Module): - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): - super(Conv2DBNActiv, self).__init__() - self.conv = nn.Sequential( - nn.Conv2d( - nin, - nout, - kernel_size=ksize, - stride=stride, - padding=pad, - dilation=dilation, - bias=False, - ), - nn.BatchNorm2d(nout), - activ(), - ) - - def __call__(self, x): - return self.conv(x) - - -class SeperableConv2DBNActiv(nn.Module): - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): - super(SeperableConv2DBNActiv, self).__init__() - self.conv = nn.Sequential( - nn.Conv2d( - nin, - nin, - kernel_size=ksize, - stride=stride, - padding=pad, - dilation=dilation, - groups=nin, - bias=False, - ), - nn.Conv2d(nin, nout, kernel_size=1, bias=False), - nn.BatchNorm2d(nout), - activ(), - ) - - def __call__(self, x): - return self.conv(x) - - -class Encoder(nn.Module): - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): - super(Encoder, self).__init__() - self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) - self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) - - def __call__(self, x): - skip = self.conv1(x) - h = self.conv2(skip) - - return h, skip - - -class Decoder(nn.Module): - def __init__( - self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False - ): - super(Decoder, self).__init__() - self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) - self.dropout = nn.Dropout2d(0.1) if dropout else None - - def __call__(self, x, skip=None): - x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) - if skip is not None: - skip = spec_utils.crop_center(skip, x) - x = torch.cat([x, skip], dim=1) - h = self.conv(x) - - if self.dropout is not None: - h = self.dropout(h) - - return h - - -class ASPPModule(nn.Module): - def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU): - super(ASPPModule, self).__init__() - self.conv1 = nn.Sequential( - nn.AdaptiveAvgPool2d((1, None)), - Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), - ) - self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) - self.conv3 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[0], dilations[0], activ=activ - ) - self.conv4 = SeperableConv2DBNActiv( - nin, nin, 3, 1, 
dilations[1], dilations[1], activ=activ - ) - self.conv5 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[2], dilations[2], activ=activ - ) - self.conv6 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[2], dilations[2], activ=activ - ) - self.conv7 = SeperableConv2DBNActiv( - nin, nin, 3, 1, dilations[2], dilations[2], activ=activ - ) - self.bottleneck = nn.Sequential( - Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) - ) - - def forward(self, x): - _, _, h, w = x.size() - feat1 = F.interpolate( - self.conv1(x), size=(h, w), mode="bilinear", align_corners=True - ) - feat2 = self.conv2(x) - feat3 = self.conv3(x) - feat4 = self.conv4(x) - feat5 = self.conv5(x) - feat6 = self.conv6(x) - feat7 = self.conv7(x) - out = torch.cat((feat1, feat2, feat3, feat4, feat5, feat6, feat7), dim=1) - bottle = self.bottleneck(out) - return bottle diff --git a/infer/lib/uvr5_pack/lib_v5/layers_new.py b/infer/lib/uvr5_pack/lib_v5/layers_new.py deleted file mode 100644 index 44153b6a23399c6938affc61c71919eaa172bcee..0000000000000000000000000000000000000000 --- a/infer/lib/uvr5_pack/lib_v5/layers_new.py +++ /dev/null @@ -1,125 +0,0 @@ -import torch -import torch.nn.functional as F -from torch import nn - -from . import spec_utils - - -class Conv2DBNActiv(nn.Module): - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): - super(Conv2DBNActiv, self).__init__() - self.conv = nn.Sequential( - nn.Conv2d( - nin, - nout, - kernel_size=ksize, - stride=stride, - padding=pad, - dilation=dilation, - bias=False, - ), - nn.BatchNorm2d(nout), - activ(), - ) - - def __call__(self, x): - return self.conv(x) - - -class Encoder(nn.Module): - def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): - super(Encoder, self).__init__() - self.conv1 = Conv2DBNActiv(nin, nout, ksize, stride, pad, activ=activ) - self.conv2 = Conv2DBNActiv(nout, nout, ksize, 1, pad, activ=activ) - - def __call__(self, x): - h = self.conv1(x) - h = self.conv2(h) - - return h - - -class Decoder(nn.Module): - def __init__( - self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False - ): - super(Decoder, self).__init__() - self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) - # self.conv2 = Conv2DBNActiv(nout, nout, ksize, 1, pad, activ=activ) - self.dropout = nn.Dropout2d(0.1) if dropout else None - - def __call__(self, x, skip=None): - x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) - - if skip is not None: - skip = spec_utils.crop_center(skip, x) - x = torch.cat([x, skip], dim=1) - - h = self.conv1(x) - # h = self.conv2(h) - - if self.dropout is not None: - h = self.dropout(h) - - return h - - -class ASPPModule(nn.Module): - def __init__(self, nin, nout, dilations=(4, 8, 12), activ=nn.ReLU, dropout=False): - super(ASPPModule, self).__init__() - self.conv1 = nn.Sequential( - nn.AdaptiveAvgPool2d((1, None)), - Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ), - ) - self.conv2 = Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ) - self.conv3 = Conv2DBNActiv( - nin, nout, 3, 1, dilations[0], dilations[0], activ=activ - ) - self.conv4 = Conv2DBNActiv( - nin, nout, 3, 1, dilations[1], dilations[1], activ=activ - ) - self.conv5 = Conv2DBNActiv( - nin, nout, 3, 1, dilations[2], dilations[2], activ=activ - ) - self.bottleneck = Conv2DBNActiv(nout * 5, nout, 1, 1, 0, activ=activ) - self.dropout = nn.Dropout2d(0.1) if dropout else None - - def forward(self, x): - _, _, h, w = x.size() - feat1 = F.interpolate( - self.conv1(x), size=(h, w), 
mode="bilinear", align_corners=True - ) - feat2 = self.conv2(x) - feat3 = self.conv3(x) - feat4 = self.conv4(x) - feat5 = self.conv5(x) - out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1) - out = self.bottleneck(out) - - if self.dropout is not None: - out = self.dropout(out) - - return out - - -class LSTMModule(nn.Module): - def __init__(self, nin_conv, nin_lstm, nout_lstm): - super(LSTMModule, self).__init__() - self.conv = Conv2DBNActiv(nin_conv, 1, 1, 1, 0) - self.lstm = nn.LSTM( - input_size=nin_lstm, hidden_size=nout_lstm // 2, bidirectional=True - ) - self.dense = nn.Sequential( - nn.Linear(nout_lstm, nin_lstm), nn.BatchNorm1d(nin_lstm), nn.ReLU() - ) - - def forward(self, x): - N, _, nbins, nframes = x.size() - h = self.conv(x)[:, 0] # N, nbins, nframes - h = h.permute(2, 0, 1) # nframes, N, nbins - h, _ = self.lstm(h) - h = self.dense(h.reshape(-1, h.size()[-1])) # nframes * N, nbins - h = h.reshape(nframes, N, 1, nbins) - h = h.permute(1, 2, 3, 0) - - return h diff --git a/infer/lib/uvr5_pack/lib_v5/model_param_init.py b/infer/lib/uvr5_pack/lib_v5/model_param_init.py deleted file mode 100644 index b995c0bfb1194746187692e2ab1c2a6dbaaaec6c..0000000000000000000000000000000000000000 --- a/infer/lib/uvr5_pack/lib_v5/model_param_init.py +++ /dev/null @@ -1,69 +0,0 @@ -import json -import os -import pathlib - -default_param = {} -default_param["bins"] = 768 -default_param["unstable_bins"] = 9 # training only -default_param["reduction_bins"] = 762 # training only -default_param["sr"] = 44100 -default_param["pre_filter_start"] = 757 -default_param["pre_filter_stop"] = 768 -default_param["band"] = {} - - -default_param["band"][1] = { - "sr": 11025, - "hl": 128, - "n_fft": 960, - "crop_start": 0, - "crop_stop": 245, - "lpf_start": 61, # inference only - "res_type": "polyphase", -} - -default_param["band"][2] = { - "sr": 44100, - "hl": 512, - "n_fft": 1536, - "crop_start": 24, - "crop_stop": 547, - "hpf_start": 81, # inference only - "res_type": "sinc_best", -} - - -def int_keys(d): - r = {} - for k, v in d: - if k.isdigit(): - k = int(k) - r[k] = v - return r - - -class ModelParameters(object): - def __init__(self, config_path=""): - if ".pth" == pathlib.Path(config_path).suffix: - import zipfile - - with zipfile.ZipFile(config_path, "r") as zip: - self.param = json.loads( - zip.read("param.json"), object_pairs_hook=int_keys - ) - elif ".json" == pathlib.Path(config_path).suffix: - with open(config_path, "r") as f: - self.param = json.loads(f.read(), object_pairs_hook=int_keys) - else: - self.param = default_param - - for k in [ - "mid_side", - "mid_side_b", - "mid_side_b2", - "stereo_w", - "stereo_n", - "reverse", - ]: - if not k in self.param: - self.param[k] = False diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json b/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json deleted file mode 100644 index 72cb4499867ad2827185e85687f06fb73d33eced..0000000000000000000000000000000000000000 --- a/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json +++ /dev/null @@ -1,19 +0,0 @@ -{ - "bins": 1024, - "unstable_bins": 0, - "reduction_bins": 0, - "band": { - "1": { - "sr": 16000, - "hl": 512, - "n_fft": 2048, - "crop_start": 0, - "crop_stop": 1024, - "hpf_start": -1, - "res_type": "sinc_best" - } - }, - "sr": 16000, - "pre_filter_start": 1023, - "pre_filter_stop": 1024 -} \ No newline at end of file diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json b/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json 
deleted file mode 100644 index 3c00ecf0a105e55a6a86a3c32db301a2635b5b41..0000000000000000000000000000000000000000 --- a/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json +++ /dev/null @@ -1,19 +0,0 @@ -{ - "bins": 1024, - "unstable_bins": 0, - "reduction_bins": 0, - "band": { - "1": { - "sr": 32000, - "hl": 512, - "n_fft": 2048, - "crop_start": 0, - "crop_stop": 1024, - "hpf_start": -1, - "res_type": "kaiser_fast" - } - }, - "sr": 32000, - "pre_filter_start": 1000, - "pre_filter_stop": 1021 -} \ No newline at end of file diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json b/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json deleted file mode 100644 index 55666ac9a8d0547751fb4b4d3bffb1ee2c956913..0000000000000000000000000000000000000000 --- a/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json +++ /dev/null @@ -1,19 +0,0 @@ -{ - "bins": 1024, - "unstable_bins": 0, - "reduction_bins": 0, - "band": { - "1": { - "sr": 33075, - "hl": 384, - "n_fft": 2048, - "crop_start": 0, - "crop_stop": 1024, - "hpf_start": -1, - "res_type": "sinc_best" - } - }, - "sr": 33075, - "pre_filter_start": 1000, - "pre_filter_stop": 1021 -} \ No newline at end of file diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json b/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json deleted file mode 100644 index 665abe20eb3cc39fe0f8493dad8f25f6ef634a14..0000000000000000000000000000000000000000 --- a/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json +++ /dev/null @@ -1,19 +0,0 @@ -{ - "bins": 1024, - "unstable_bins": 0, - "reduction_bins": 0, - "band": { - "1": { - "sr": 44100, - "hl": 1024, - "n_fft": 2048, - "crop_start": 0, - "crop_stop": 1024, - "hpf_start": -1, - "res_type": "sinc_best" - } - }, - "sr": 44100, - "pre_filter_start": 1023, - "pre_filter_stop": 1024 -} \ No newline at end of file diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl256.json b/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl256.json deleted file mode 100644 index 0e8b16f89b0231d06eabe8d2f7c2670c7caa2272..0000000000000000000000000000000000000000 --- a/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl256.json +++ /dev/null @@ -1,19 +0,0 @@ -{ - "bins": 256, - "unstable_bins": 0, - "reduction_bins": 0, - "band": { - "1": { - "sr": 44100, - "hl": 256, - "n_fft": 512, - "crop_start": 0, - "crop_stop": 256, - "hpf_start": -1, - "res_type": "sinc_best" - } - }, - "sr": 44100, - "pre_filter_start": 256, - "pre_filter_stop": 256 -} \ No newline at end of file diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json b/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json deleted file mode 100644 index 3b38fcaf60ba204e03a47f5bd3f5bcfe75e1983a..0000000000000000000000000000000000000000 --- a/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json +++ /dev/null @@ -1,19 +0,0 @@ -{ - "bins": 1024, - "unstable_bins": 0, - "reduction_bins": 0, - "band": { - "1": { - "sr": 44100, - "hl": 512, - "n_fft": 2048, - "crop_start": 0, - "crop_stop": 1024, - "hpf_start": -1, - "res_type": "sinc_best" - } - }, - "sr": 44100, - "pre_filter_start": 1023, - "pre_filter_stop": 1024 -} \ No newline at end of file diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512_cut.json b/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512_cut.json deleted file mode 100644 index 630df3524e340f43a1ddb7b33ff02cc91fc1cb47..0000000000000000000000000000000000000000 --- 
a/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512_cut.json +++ /dev/null @@ -1,19 +0,0 @@ -{ - "bins": 1024, - "unstable_bins": 0, - "reduction_bins": 0, - "band": { - "1": { - "sr": 44100, - "hl": 512, - "n_fft": 2048, - "crop_start": 0, - "crop_stop": 700, - "hpf_start": -1, - "res_type": "sinc_best" - } - }, - "sr": 44100, - "pre_filter_start": 1023, - "pre_filter_stop": 700 -} \ No newline at end of file diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/2band_32000.json b/infer/lib/uvr5_pack/lib_v5/modelparams/2band_32000.json deleted file mode 100644 index ab9cf1150a818eb6252105408311be0a40d423b3..0000000000000000000000000000000000000000 --- a/infer/lib/uvr5_pack/lib_v5/modelparams/2band_32000.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "bins": 768, - "unstable_bins": 7, - "reduction_bins": 705, - "band": { - "1": { - "sr": 6000, - "hl": 66, - "n_fft": 512, - "crop_start": 0, - "crop_stop": 240, - "lpf_start": 60, - "lpf_stop": 118, - "res_type": "sinc_fastest" - }, - "2": { - "sr": 32000, - "hl": 352, - "n_fft": 1024, - "crop_start": 22, - "crop_stop": 505, - "hpf_start": 44, - "hpf_stop": 23, - "res_type": "sinc_medium" - } - }, - "sr": 32000, - "pre_filter_start": 710, - "pre_filter_stop": 731 -} diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/2band_44100_lofi.json b/infer/lib/uvr5_pack/lib_v5/modelparams/2band_44100_lofi.json deleted file mode 100644 index 7faa216d7b49aeece24123dbdd868847a1dbc03c..0000000000000000000000000000000000000000 --- a/infer/lib/uvr5_pack/lib_v5/modelparams/2band_44100_lofi.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "bins": 512, - "unstable_bins": 7, - "reduction_bins": 510, - "band": { - "1": { - "sr": 11025, - "hl": 160, - "n_fft": 768, - "crop_start": 0, - "crop_stop": 192, - "lpf_start": 41, - "lpf_stop": 139, - "res_type": "sinc_fastest" - }, - "2": { - "sr": 44100, - "hl": 640, - "n_fft": 1024, - "crop_start": 10, - "crop_stop": 320, - "hpf_start": 47, - "hpf_stop": 15, - "res_type": "sinc_medium" - } - }, - "sr": 44100, - "pre_filter_start": 510, - "pre_filter_stop": 512 -} diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/2band_48000.json b/infer/lib/uvr5_pack/lib_v5/modelparams/2band_48000.json deleted file mode 100644 index 7e78175052b09cb1a32345e54006475992712f9a..0000000000000000000000000000000000000000 --- a/infer/lib/uvr5_pack/lib_v5/modelparams/2band_48000.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "bins": 768, - "unstable_bins": 7, - "reduction_bins": 705, - "band": { - "1": { - "sr": 6000, - "hl": 66, - "n_fft": 512, - "crop_start": 0, - "crop_stop": 240, - "lpf_start": 60, - "lpf_stop": 240, - "res_type": "sinc_fastest" - }, - "2": { - "sr": 48000, - "hl": 528, - "n_fft": 1536, - "crop_start": 22, - "crop_stop": 505, - "hpf_start": 82, - "hpf_stop": 22, - "res_type": "sinc_medium" - } - }, - "sr": 48000, - "pre_filter_start": 710, - "pre_filter_stop": 731 -} \ No newline at end of file diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100.json b/infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100.json deleted file mode 100644 index d881d767ff83fbac0e18dfe2587ef16925b29b3c..0000000000000000000000000000000000000000 --- a/infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100.json +++ /dev/null @@ -1,42 +0,0 @@ -{ - "bins": 768, - "unstable_bins": 5, - "reduction_bins": 733, - "band": { - "1": { - "sr": 11025, - "hl": 128, - "n_fft": 768, - "crop_start": 0, - "crop_stop": 278, - "lpf_start": 28, - "lpf_stop": 140, - "res_type": "polyphase" - }, - "2": { - "sr": 22050, - "hl": 256, - "n_fft": 768, - "crop_start": 14, - 
"crop_stop": 322, - "hpf_start": 70, - "hpf_stop": 14, - "lpf_start": 283, - "lpf_stop": 314, - "res_type": "polyphase" - }, - "3": { - "sr": 44100, - "hl": 512, - "n_fft": 768, - "crop_start": 131, - "crop_stop": 313, - "hpf_start": 154, - "hpf_stop": 141, - "res_type": "sinc_medium" - } - }, - "sr": 44100, - "pre_filter_start": 757, - "pre_filter_stop": 768 -} diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_mid.json b/infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_mid.json deleted file mode 100644 index 77ec198573b19f36519a028a509767d30764c0e2..0000000000000000000000000000000000000000 --- a/infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_mid.json +++ /dev/null @@ -1,43 +0,0 @@ -{ - "mid_side": true, - "bins": 768, - "unstable_bins": 5, - "reduction_bins": 733, - "band": { - "1": { - "sr": 11025, - "hl": 128, - "n_fft": 768, - "crop_start": 0, - "crop_stop": 278, - "lpf_start": 28, - "lpf_stop": 140, - "res_type": "polyphase" - }, - "2": { - "sr": 22050, - "hl": 256, - "n_fft": 768, - "crop_start": 14, - "crop_stop": 322, - "hpf_start": 70, - "hpf_stop": 14, - "lpf_start": 283, - "lpf_stop": 314, - "res_type": "polyphase" - }, - "3": { - "sr": 44100, - "hl": 512, - "n_fft": 768, - "crop_start": 131, - "crop_stop": 313, - "hpf_start": 154, - "hpf_stop": 141, - "res_type": "sinc_medium" - } - }, - "sr": 44100, - "pre_filter_start": 757, - "pre_filter_stop": 768 -} diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json b/infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json deleted file mode 100644 index 85ee8a7d44541c9176e85ea3dce8728d34990938..0000000000000000000000000000000000000000 --- a/infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json +++ /dev/null @@ -1,43 +0,0 @@ -{ - "mid_side_b2": true, - "bins": 640, - "unstable_bins": 7, - "reduction_bins": 565, - "band": { - "1": { - "sr": 11025, - "hl": 108, - "n_fft": 1024, - "crop_start": 0, - "crop_stop": 187, - "lpf_start": 92, - "lpf_stop": 186, - "res_type": "polyphase" - }, - "2": { - "sr": 22050, - "hl": 216, - "n_fft": 768, - "crop_start": 0, - "crop_stop": 212, - "hpf_start": 68, - "hpf_stop": 34, - "lpf_start": 174, - "lpf_stop": 209, - "res_type": "polyphase" - }, - "3": { - "sr": 44100, - "hl": 432, - "n_fft": 640, - "crop_start": 66, - "crop_stop": 307, - "hpf_start": 86, - "hpf_stop": 72, - "res_type": "kaiser_fast" - } - }, - "sr": 44100, - "pre_filter_start": 639, - "pre_filter_stop": 640 -} diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json b/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json deleted file mode 100644 index df123754204372aa50d464fbe9102a401f48cc73..0000000000000000000000000000000000000000 --- a/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json +++ /dev/null @@ -1,54 +0,0 @@ -{ - "bins": 768, - "unstable_bins": 7, - "reduction_bins": 668, - "band": { - "1": { - "sr": 11025, - "hl": 128, - "n_fft": 1024, - "crop_start": 0, - "crop_stop": 186, - "lpf_start": 37, - "lpf_stop": 73, - "res_type": "polyphase" - }, - "2": { - "sr": 11025, - "hl": 128, - "n_fft": 512, - "crop_start": 4, - "crop_stop": 185, - "hpf_start": 36, - "hpf_stop": 18, - "lpf_start": 93, - "lpf_stop": 185, - "res_type": "polyphase" - }, - "3": { - "sr": 22050, - "hl": 256, - "n_fft": 512, - "crop_start": 46, - "crop_stop": 186, - "hpf_start": 93, - "hpf_stop": 46, - "lpf_start": 164, - "lpf_stop": 186, - "res_type": "polyphase" - }, - "4": { - "sr": 44100, - "hl": 512, - "n_fft": 768, - "crop_start": 121, - "crop_stop": 382, - "hpf_start": 138, - "hpf_stop": 
123, - "res_type": "sinc_medium" - } - }, - "sr": 44100, - "pre_filter_start": 740, - "pre_filter_stop": 768 -} diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_mid.json b/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_mid.json deleted file mode 100644 index e91b699eb63d3382c3b9e9edf46d40ed91d6122b..0000000000000000000000000000000000000000 --- a/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_mid.json +++ /dev/null @@ -1,55 +0,0 @@ -{ - "bins": 768, - "unstable_bins": 7, - "mid_side": true, - "reduction_bins": 668, - "band": { - "1": { - "sr": 11025, - "hl": 128, - "n_fft": 1024, - "crop_start": 0, - "crop_stop": 186, - "lpf_start": 37, - "lpf_stop": 73, - "res_type": "polyphase" - }, - "2": { - "sr": 11025, - "hl": 128, - "n_fft": 512, - "crop_start": 4, - "crop_stop": 185, - "hpf_start": 36, - "hpf_stop": 18, - "lpf_start": 93, - "lpf_stop": 185, - "res_type": "polyphase" - }, - "3": { - "sr": 22050, - "hl": 256, - "n_fft": 512, - "crop_start": 46, - "crop_stop": 186, - "hpf_start": 93, - "hpf_stop": 46, - "lpf_start": 164, - "lpf_stop": 186, - "res_type": "polyphase" - }, - "4": { - "sr": 44100, - "hl": 512, - "n_fft": 768, - "crop_start": 121, - "crop_stop": 382, - "hpf_start": 138, - "hpf_stop": 123, - "res_type": "sinc_medium" - } - }, - "sr": 44100, - "pre_filter_start": 740, - "pre_filter_stop": 768 -} diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb.json b/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb.json deleted file mode 100644 index f852f280ec9d98fc1b65cec688290eaafec61b84..0000000000000000000000000000000000000000 --- a/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb.json +++ /dev/null @@ -1,55 +0,0 @@ -{ - "mid_side_b": true, - "bins": 768, - "unstable_bins": 7, - "reduction_bins": 668, - "band": { - "1": { - "sr": 11025, - "hl": 128, - "n_fft": 1024, - "crop_start": 0, - "crop_stop": 186, - "lpf_start": 37, - "lpf_stop": 73, - "res_type": "polyphase" - }, - "2": { - "sr": 11025, - "hl": 128, - "n_fft": 512, - "crop_start": 4, - "crop_stop": 185, - "hpf_start": 36, - "hpf_stop": 18, - "lpf_start": 93, - "lpf_stop": 185, - "res_type": "polyphase" - }, - "3": { - "sr": 22050, - "hl": 256, - "n_fft": 512, - "crop_start": 46, - "crop_stop": 186, - "hpf_start": 93, - "hpf_stop": 46, - "lpf_start": 164, - "lpf_stop": 186, - "res_type": "polyphase" - }, - "4": { - "sr": 44100, - "hl": 512, - "n_fft": 768, - "crop_start": 121, - "crop_stop": 382, - "hpf_start": 138, - "hpf_stop": 123, - "res_type": "sinc_medium" - } - }, - "sr": 44100, - "pre_filter_start": 740, - "pre_filter_stop": 768 -} \ No newline at end of file diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb2.json b/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb2.json deleted file mode 100644 index f852f280ec9d98fc1b65cec688290eaafec61b84..0000000000000000000000000000000000000000 --- a/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb2.json +++ /dev/null @@ -1,55 +0,0 @@ -{ - "mid_side_b": true, - "bins": 768, - "unstable_bins": 7, - "reduction_bins": 668, - "band": { - "1": { - "sr": 11025, - "hl": 128, - "n_fft": 1024, - "crop_start": 0, - "crop_stop": 186, - "lpf_start": 37, - "lpf_stop": 73, - "res_type": "polyphase" - }, - "2": { - "sr": 11025, - "hl": 128, - "n_fft": 512, - "crop_start": 4, - "crop_stop": 185, - "hpf_start": 36, - "hpf_stop": 18, - "lpf_start": 93, - "lpf_stop": 185, - "res_type": "polyphase" - }, - "3": { - "sr": 22050, - "hl": 256, - "n_fft": 512, - "crop_start": 46, - "crop_stop": 186, - "hpf_start": 93, - 
"hpf_stop": 46, - "lpf_start": 164, - "lpf_stop": 186, - "res_type": "polyphase" - }, - "4": { - "sr": 44100, - "hl": 512, - "n_fft": 768, - "crop_start": 121, - "crop_stop": 382, - "hpf_start": 138, - "hpf_stop": 123, - "res_type": "sinc_medium" - } - }, - "sr": 44100, - "pre_filter_start": 740, - "pre_filter_stop": 768 -} \ No newline at end of file diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_reverse.json b/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_reverse.json deleted file mode 100644 index 7a07d5541bd83dc1caa20b531c3b43a2ffccac88..0000000000000000000000000000000000000000 --- a/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_reverse.json +++ /dev/null @@ -1,55 +0,0 @@ -{ - "reverse": true, - "bins": 768, - "unstable_bins": 7, - "reduction_bins": 668, - "band": { - "1": { - "sr": 11025, - "hl": 128, - "n_fft": 1024, - "crop_start": 0, - "crop_stop": 186, - "lpf_start": 37, - "lpf_stop": 73, - "res_type": "polyphase" - }, - "2": { - "sr": 11025, - "hl": 128, - "n_fft": 512, - "crop_start": 4, - "crop_stop": 185, - "hpf_start": 36, - "hpf_stop": 18, - "lpf_start": 93, - "lpf_stop": 185, - "res_type": "polyphase" - }, - "3": { - "sr": 22050, - "hl": 256, - "n_fft": 512, - "crop_start": 46, - "crop_stop": 186, - "hpf_start": 93, - "hpf_stop": 46, - "lpf_start": 164, - "lpf_stop": 186, - "res_type": "polyphase" - }, - "4": { - "sr": 44100, - "hl": 512, - "n_fft": 768, - "crop_start": 121, - "crop_stop": 382, - "hpf_start": 138, - "hpf_stop": 123, - "res_type": "sinc_medium" - } - }, - "sr": 44100, - "pre_filter_start": 740, - "pre_filter_stop": 768 -} \ No newline at end of file diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_sw.json b/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_sw.json deleted file mode 100644 index ba0cf342106de793e6ec3e876854c7fd451fbf76..0000000000000000000000000000000000000000 --- a/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_sw.json +++ /dev/null @@ -1,55 +0,0 @@ -{ - "stereo_w": true, - "bins": 768, - "unstable_bins": 7, - "reduction_bins": 668, - "band": { - "1": { - "sr": 11025, - "hl": 128, - "n_fft": 1024, - "crop_start": 0, - "crop_stop": 186, - "lpf_start": 37, - "lpf_stop": 73, - "res_type": "polyphase" - }, - "2": { - "sr": 11025, - "hl": 128, - "n_fft": 512, - "crop_start": 4, - "crop_stop": 185, - "hpf_start": 36, - "hpf_stop": 18, - "lpf_start": 93, - "lpf_stop": 185, - "res_type": "polyphase" - }, - "3": { - "sr": 22050, - "hl": 256, - "n_fft": 512, - "crop_start": 46, - "crop_stop": 186, - "hpf_start": 93, - "hpf_stop": 46, - "lpf_start": 164, - "lpf_stop": 186, - "res_type": "polyphase" - }, - "4": { - "sr": 44100, - "hl": 512, - "n_fft": 768, - "crop_start": 121, - "crop_stop": 382, - "hpf_start": 138, - "hpf_stop": 123, - "res_type": "sinc_medium" - } - }, - "sr": 44100, - "pre_filter_start": 740, - "pre_filter_stop": 768 -} \ No newline at end of file diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2.json b/infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2.json deleted file mode 100644 index 33281a0cf9916fc33558ddfda7a0287a2547faf4..0000000000000000000000000000000000000000 --- a/infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2.json +++ /dev/null @@ -1,54 +0,0 @@ -{ - "bins": 672, - "unstable_bins": 8, - "reduction_bins": 637, - "band": { - "1": { - "sr": 7350, - "hl": 80, - "n_fft": 640, - "crop_start": 0, - "crop_stop": 85, - "lpf_start": 25, - "lpf_stop": 53, - "res_type": "polyphase" - }, - "2": { - "sr": 7350, - "hl": 80, - "n_fft": 320, - "crop_start": 4, - "crop_stop": 87, - 
"hpf_start": 25, - "hpf_stop": 12, - "lpf_start": 31, - "lpf_stop": 62, - "res_type": "polyphase" - }, - "3": { - "sr": 14700, - "hl": 160, - "n_fft": 512, - "crop_start": 17, - "crop_stop": 216, - "hpf_start": 48, - "hpf_stop": 24, - "lpf_start": 139, - "lpf_stop": 210, - "res_type": "polyphase" - }, - "4": { - "sr": 44100, - "hl": 480, - "n_fft": 960, - "crop_start": 78, - "crop_stop": 383, - "hpf_start": 130, - "hpf_stop": 86, - "res_type": "kaiser_fast" - } - }, - "sr": 44100, - "pre_filter_start": 668, - "pre_filter_stop": 672 -} \ No newline at end of file diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2_sn.json b/infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2_sn.json deleted file mode 100644 index 2e5c770fe188779bf6b0873190b7a324d6a867b2..0000000000000000000000000000000000000000 --- a/infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2_sn.json +++ /dev/null @@ -1,55 +0,0 @@ -{ - "bins": 672, - "unstable_bins": 8, - "reduction_bins": 637, - "band": { - "1": { - "sr": 7350, - "hl": 80, - "n_fft": 640, - "crop_start": 0, - "crop_stop": 85, - "lpf_start": 25, - "lpf_stop": 53, - "res_type": "polyphase" - }, - "2": { - "sr": 7350, - "hl": 80, - "n_fft": 320, - "crop_start": 4, - "crop_stop": 87, - "hpf_start": 25, - "hpf_stop": 12, - "lpf_start": 31, - "lpf_stop": 62, - "res_type": "polyphase" - }, - "3": { - "sr": 14700, - "hl": 160, - "n_fft": 512, - "crop_start": 17, - "crop_stop": 216, - "hpf_start": 48, - "hpf_stop": 24, - "lpf_start": 139, - "lpf_stop": 210, - "res_type": "polyphase" - }, - "4": { - "sr": 44100, - "hl": 480, - "n_fft": 960, - "crop_start": 78, - "crop_stop": 383, - "hpf_start": 130, - "hpf_stop": 86, - "convert_channels": "stereo_n", - "res_type": "kaiser_fast" - } - }, - "sr": 44100, - "pre_filter_start": 668, - "pre_filter_stop": 672 -} \ No newline at end of file diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/4band_v3.json b/infer/lib/uvr5_pack/lib_v5/modelparams/4band_v3.json deleted file mode 100644 index 2a73bc97ac545145a75bdca7addc5d59f5b8574b..0000000000000000000000000000000000000000 --- a/infer/lib/uvr5_pack/lib_v5/modelparams/4band_v3.json +++ /dev/null @@ -1,54 +0,0 @@ -{ - "bins": 672, - "unstable_bins": 8, - "reduction_bins": 530, - "band": { - "1": { - "sr": 7350, - "hl": 80, - "n_fft": 640, - "crop_start": 0, - "crop_stop": 85, - "lpf_start": 25, - "lpf_stop": 53, - "res_type": "polyphase" - }, - "2": { - "sr": 7350, - "hl": 80, - "n_fft": 320, - "crop_start": 4, - "crop_stop": 87, - "hpf_start": 25, - "hpf_stop": 12, - "lpf_start": 31, - "lpf_stop": 62, - "res_type": "polyphase" - }, - "3": { - "sr": 14700, - "hl": 160, - "n_fft": 512, - "crop_start": 17, - "crop_stop": 216, - "hpf_start": 48, - "hpf_stop": 24, - "lpf_start": 139, - "lpf_stop": 210, - "res_type": "polyphase" - }, - "4": { - "sr": 44100, - "hl": 480, - "n_fft": 960, - "crop_start": 78, - "crop_stop": 383, - "hpf_start": 130, - "hpf_stop": 86, - "res_type": "kaiser_fast" - } - }, - "sr": 44100, - "pre_filter_start": 668, - "pre_filter_stop": 672 -} \ No newline at end of file diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/ensemble.json b/infer/lib/uvr5_pack/lib_v5/modelparams/ensemble.json deleted file mode 100644 index ee69beb46fc82f34619c5e48761e329fcabbbd00..0000000000000000000000000000000000000000 --- a/infer/lib/uvr5_pack/lib_v5/modelparams/ensemble.json +++ /dev/null @@ -1,43 +0,0 @@ -{ - "mid_side_b2": true, - "bins": 1280, - "unstable_bins": 7, - "reduction_bins": 565, - "band": { - "1": { - "sr": 11025, - "hl": 108, - "n_fft": 2048, - "crop_start": 0, - 
"crop_stop": 374, - "lpf_start": 92, - "lpf_stop": 186, - "res_type": "polyphase" - }, - "2": { - "sr": 22050, - "hl": 216, - "n_fft": 1536, - "crop_start": 0, - "crop_stop": 424, - "hpf_start": 68, - "hpf_stop": 34, - "lpf_start": 348, - "lpf_stop": 418, - "res_type": "polyphase" - }, - "3": { - "sr": 44100, - "hl": 432, - "n_fft": 1280, - "crop_start": 132, - "crop_stop": 614, - "hpf_start": 172, - "hpf_stop": 144, - "res_type": "polyphase" - } - }, - "sr": 44100, - "pre_filter_start": 1280, - "pre_filter_stop": 1280 -} \ No newline at end of file diff --git a/infer/lib/uvr5_pack/lib_v5/nets.py b/infer/lib/uvr5_pack/lib_v5/nets.py deleted file mode 100644 index 5da3948c2f2e9edcc3cdac49bdf9f738e403de40..0000000000000000000000000000000000000000 --- a/infer/lib/uvr5_pack/lib_v5/nets.py +++ /dev/null @@ -1,123 +0,0 @@ -import layers -import torch -import torch.nn.functional as F -from torch import nn - -from . import spec_utils - - -class BaseASPPNet(nn.Module): - def __init__(self, nin, ch, dilations=(4, 8, 16)): - super(BaseASPPNet, self).__init__() - self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) - self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) - self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) - self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) - - self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) - - self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) - self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) - self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) - self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) - - def __call__(self, x): - h, e1 = self.enc1(x) - h, e2 = self.enc2(h) - h, e3 = self.enc3(h) - h, e4 = self.enc4(h) - - h = self.aspp(h) - - h = self.dec4(h, e4) - h = self.dec3(h, e3) - h = self.dec2(h, e2) - h = self.dec1(h, e1) - - return h - - -class CascadedASPPNet(nn.Module): - def __init__(self, n_fft): - super(CascadedASPPNet, self).__init__() - self.stg1_low_band_net = BaseASPPNet(2, 16) - self.stg1_high_band_net = BaseASPPNet(2, 16) - - self.stg2_bridge = layers.Conv2DBNActiv(18, 8, 1, 1, 0) - self.stg2_full_band_net = BaseASPPNet(8, 16) - - self.stg3_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0) - self.stg3_full_band_net = BaseASPPNet(16, 32) - - self.out = nn.Conv2d(32, 2, 1, bias=False) - self.aux1_out = nn.Conv2d(16, 2, 1, bias=False) - self.aux2_out = nn.Conv2d(16, 2, 1, bias=False) - - self.max_bin = n_fft // 2 - self.output_bin = n_fft // 2 + 1 - - self.offset = 128 - - def forward(self, x, aggressiveness=None): - mix = x.detach() - x = x.clone() - - x = x[:, :, : self.max_bin] - - bandw = x.size()[2] // 2 - aux1 = torch.cat( - [ - self.stg1_low_band_net(x[:, :, :bandw]), - self.stg1_high_band_net(x[:, :, bandw:]), - ], - dim=2, - ) - - h = torch.cat([x, aux1], dim=1) - aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) - - h = torch.cat([x, aux1, aux2], dim=1) - h = self.stg3_full_band_net(self.stg3_bridge(h)) - - mask = torch.sigmoid(self.out(h)) - mask = F.pad( - input=mask, - pad=(0, 0, 0, self.output_bin - mask.size()[2]), - mode="replicate", - ) - - if self.training: - aux1 = torch.sigmoid(self.aux1_out(aux1)) - aux1 = F.pad( - input=aux1, - pad=(0, 0, 0, self.output_bin - aux1.size()[2]), - mode="replicate", - ) - aux2 = torch.sigmoid(self.aux2_out(aux2)) - aux2 = F.pad( - input=aux2, - pad=(0, 0, 0, self.output_bin - aux2.size()[2]), - mode="replicate", - ) - return mask * mix, aux1 * mix, aux2 * mix - else: - if aggressiveness: - mask[:, :, : aggressiveness["split_bin"]] = torch.pow( - mask[:, :, : 
aggressiveness["split_bin"]], - 1 + aggressiveness["value"] / 3, - ) - mask[:, :, aggressiveness["split_bin"] :] = torch.pow( - mask[:, :, aggressiveness["split_bin"] :], - 1 + aggressiveness["value"], - ) - - return mask * mix - - def predict(self, x_mag, aggressiveness=None): - h = self.forward(x_mag, aggressiveness) - - if self.offset > 0: - h = h[:, :, :, self.offset : -self.offset] - assert h.size()[3] > 0 - - return h diff --git a/infer/lib/uvr5_pack/lib_v5/nets_123812KB.py b/infer/lib/uvr5_pack/lib_v5/nets_123812KB.py deleted file mode 100644 index 167d4cb2198863cf43e93440f7e63c5342fc7605..0000000000000000000000000000000000000000 --- a/infer/lib/uvr5_pack/lib_v5/nets_123812KB.py +++ /dev/null @@ -1,122 +0,0 @@ -import torch -import torch.nn.functional as F -from torch import nn - -from . import layers_123821KB as layers - - -class BaseASPPNet(nn.Module): - def __init__(self, nin, ch, dilations=(4, 8, 16)): - super(BaseASPPNet, self).__init__() - self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) - self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) - self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) - self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) - - self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) - - self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) - self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) - self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) - self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) - - def __call__(self, x): - h, e1 = self.enc1(x) - h, e2 = self.enc2(h) - h, e3 = self.enc3(h) - h, e4 = self.enc4(h) - - h = self.aspp(h) - - h = self.dec4(h, e4) - h = self.dec3(h, e3) - h = self.dec2(h, e2) - h = self.dec1(h, e1) - - return h - - -class CascadedASPPNet(nn.Module): - def __init__(self, n_fft): - super(CascadedASPPNet, self).__init__() - self.stg1_low_band_net = BaseASPPNet(2, 32) - self.stg1_high_band_net = BaseASPPNet(2, 32) - - self.stg2_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0) - self.stg2_full_band_net = BaseASPPNet(16, 32) - - self.stg3_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0) - self.stg3_full_band_net = BaseASPPNet(32, 64) - - self.out = nn.Conv2d(64, 2, 1, bias=False) - self.aux1_out = nn.Conv2d(32, 2, 1, bias=False) - self.aux2_out = nn.Conv2d(32, 2, 1, bias=False) - - self.max_bin = n_fft // 2 - self.output_bin = n_fft // 2 + 1 - - self.offset = 128 - - def forward(self, x, aggressiveness=None): - mix = x.detach() - x = x.clone() - - x = x[:, :, : self.max_bin] - - bandw = x.size()[2] // 2 - aux1 = torch.cat( - [ - self.stg1_low_band_net(x[:, :, :bandw]), - self.stg1_high_band_net(x[:, :, bandw:]), - ], - dim=2, - ) - - h = torch.cat([x, aux1], dim=1) - aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) - - h = torch.cat([x, aux1, aux2], dim=1) - h = self.stg3_full_band_net(self.stg3_bridge(h)) - - mask = torch.sigmoid(self.out(h)) - mask = F.pad( - input=mask, - pad=(0, 0, 0, self.output_bin - mask.size()[2]), - mode="replicate", - ) - - if self.training: - aux1 = torch.sigmoid(self.aux1_out(aux1)) - aux1 = F.pad( - input=aux1, - pad=(0, 0, 0, self.output_bin - aux1.size()[2]), - mode="replicate", - ) - aux2 = torch.sigmoid(self.aux2_out(aux2)) - aux2 = F.pad( - input=aux2, - pad=(0, 0, 0, self.output_bin - aux2.size()[2]), - mode="replicate", - ) - return mask * mix, aux1 * mix, aux2 * mix - else: - if aggressiveness: - mask[:, :, : aggressiveness["split_bin"]] = torch.pow( - mask[:, :, : aggressiveness["split_bin"]], - 1 + aggressiveness["value"] / 3, - ) - mask[:, :, aggressiveness["split_bin"] :] = 
torch.pow( - mask[:, :, aggressiveness["split_bin"] :], - 1 + aggressiveness["value"], - ) - - return mask * mix - - def predict(self, x_mag, aggressiveness=None): - h = self.forward(x_mag, aggressiveness) - - if self.offset > 0: - h = h[:, :, :, self.offset : -self.offset] - assert h.size()[3] > 0 - - return h diff --git a/infer/lib/uvr5_pack/lib_v5/nets_123821KB.py b/infer/lib/uvr5_pack/lib_v5/nets_123821KB.py deleted file mode 100644 index 167d4cb2198863cf43e93440f7e63c5342fc7605..0000000000000000000000000000000000000000 --- a/infer/lib/uvr5_pack/lib_v5/nets_123821KB.py +++ /dev/null @@ -1,122 +0,0 @@ -import torch -import torch.nn.functional as F -from torch import nn - -from . import layers_123821KB as layers - - -class BaseASPPNet(nn.Module): - def __init__(self, nin, ch, dilations=(4, 8, 16)): - super(BaseASPPNet, self).__init__() - self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) - self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) - self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) - self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) - - self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) - - self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) - self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) - self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) - self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) - - def __call__(self, x): - h, e1 = self.enc1(x) - h, e2 = self.enc2(h) - h, e3 = self.enc3(h) - h, e4 = self.enc4(h) - - h = self.aspp(h) - - h = self.dec4(h, e4) - h = self.dec3(h, e3) - h = self.dec2(h, e2) - h = self.dec1(h, e1) - - return h - - -class CascadedASPPNet(nn.Module): - def __init__(self, n_fft): - super(CascadedASPPNet, self).__init__() - self.stg1_low_band_net = BaseASPPNet(2, 32) - self.stg1_high_band_net = BaseASPPNet(2, 32) - - self.stg2_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0) - self.stg2_full_band_net = BaseASPPNet(16, 32) - - self.stg3_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0) - self.stg3_full_band_net = BaseASPPNet(32, 64) - - self.out = nn.Conv2d(64, 2, 1, bias=False) - self.aux1_out = nn.Conv2d(32, 2, 1, bias=False) - self.aux2_out = nn.Conv2d(32, 2, 1, bias=False) - - self.max_bin = n_fft // 2 - self.output_bin = n_fft // 2 + 1 - - self.offset = 128 - - def forward(self, x, aggressiveness=None): - mix = x.detach() - x = x.clone() - - x = x[:, :, : self.max_bin] - - bandw = x.size()[2] // 2 - aux1 = torch.cat( - [ - self.stg1_low_band_net(x[:, :, :bandw]), - self.stg1_high_band_net(x[:, :, bandw:]), - ], - dim=2, - ) - - h = torch.cat([x, aux1], dim=1) - aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) - - h = torch.cat([x, aux1, aux2], dim=1) - h = self.stg3_full_band_net(self.stg3_bridge(h)) - - mask = torch.sigmoid(self.out(h)) - mask = F.pad( - input=mask, - pad=(0, 0, 0, self.output_bin - mask.size()[2]), - mode="replicate", - ) - - if self.training: - aux1 = torch.sigmoid(self.aux1_out(aux1)) - aux1 = F.pad( - input=aux1, - pad=(0, 0, 0, self.output_bin - aux1.size()[2]), - mode="replicate", - ) - aux2 = torch.sigmoid(self.aux2_out(aux2)) - aux2 = F.pad( - input=aux2, - pad=(0, 0, 0, self.output_bin - aux2.size()[2]), - mode="replicate", - ) - return mask * mix, aux1 * mix, aux2 * mix - else: - if aggressiveness: - mask[:, :, : aggressiveness["split_bin"]] = torch.pow( - mask[:, :, : aggressiveness["split_bin"]], - 1 + aggressiveness["value"] / 3, - ) - mask[:, :, aggressiveness["split_bin"] :] = torch.pow( - mask[:, :, aggressiveness["split_bin"] :], - 1 + aggressiveness["value"], - ) - - return mask * mix - - 
def predict(self, x_mag, aggressiveness=None): - h = self.forward(x_mag, aggressiveness) - - if self.offset > 0: - h = h[:, :, :, self.offset : -self.offset] - assert h.size()[3] > 0 - - return h diff --git a/infer/lib/uvr5_pack/lib_v5/nets_33966KB.py b/infer/lib/uvr5_pack/lib_v5/nets_33966KB.py deleted file mode 100644 index 73a5b836177b706c306e27875f8391c1aed4b948..0000000000000000000000000000000000000000 --- a/infer/lib/uvr5_pack/lib_v5/nets_33966KB.py +++ /dev/null @@ -1,122 +0,0 @@ -import torch -import torch.nn.functional as F -from torch import nn - -from . import layers_33966KB as layers - - -class BaseASPPNet(nn.Module): - def __init__(self, nin, ch, dilations=(4, 8, 16, 32)): - super(BaseASPPNet, self).__init__() - self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) - self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) - self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) - self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) - - self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) - - self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) - self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) - self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) - self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) - - def __call__(self, x): - h, e1 = self.enc1(x) - h, e2 = self.enc2(h) - h, e3 = self.enc3(h) - h, e4 = self.enc4(h) - - h = self.aspp(h) - - h = self.dec4(h, e4) - h = self.dec3(h, e3) - h = self.dec2(h, e2) - h = self.dec1(h, e1) - - return h - - -class CascadedASPPNet(nn.Module): - def __init__(self, n_fft): - super(CascadedASPPNet, self).__init__() - self.stg1_low_band_net = BaseASPPNet(2, 16) - self.stg1_high_band_net = BaseASPPNet(2, 16) - - self.stg2_bridge = layers.Conv2DBNActiv(18, 8, 1, 1, 0) - self.stg2_full_band_net = BaseASPPNet(8, 16) - - self.stg3_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0) - self.stg3_full_band_net = BaseASPPNet(16, 32) - - self.out = nn.Conv2d(32, 2, 1, bias=False) - self.aux1_out = nn.Conv2d(16, 2, 1, bias=False) - self.aux2_out = nn.Conv2d(16, 2, 1, bias=False) - - self.max_bin = n_fft // 2 - self.output_bin = n_fft // 2 + 1 - - self.offset = 128 - - def forward(self, x, aggressiveness=None): - mix = x.detach() - x = x.clone() - - x = x[:, :, : self.max_bin] - - bandw = x.size()[2] // 2 - aux1 = torch.cat( - [ - self.stg1_low_band_net(x[:, :, :bandw]), - self.stg1_high_band_net(x[:, :, bandw:]), - ], - dim=2, - ) - - h = torch.cat([x, aux1], dim=1) - aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) - - h = torch.cat([x, aux1, aux2], dim=1) - h = self.stg3_full_band_net(self.stg3_bridge(h)) - - mask = torch.sigmoid(self.out(h)) - mask = F.pad( - input=mask, - pad=(0, 0, 0, self.output_bin - mask.size()[2]), - mode="replicate", - ) - - if self.training: - aux1 = torch.sigmoid(self.aux1_out(aux1)) - aux1 = F.pad( - input=aux1, - pad=(0, 0, 0, self.output_bin - aux1.size()[2]), - mode="replicate", - ) - aux2 = torch.sigmoid(self.aux2_out(aux2)) - aux2 = F.pad( - input=aux2, - pad=(0, 0, 0, self.output_bin - aux2.size()[2]), - mode="replicate", - ) - return mask * mix, aux1 * mix, aux2 * mix - else: - if aggressiveness: - mask[:, :, : aggressiveness["split_bin"]] = torch.pow( - mask[:, :, : aggressiveness["split_bin"]], - 1 + aggressiveness["value"] / 3, - ) - mask[:, :, aggressiveness["split_bin"] :] = torch.pow( - mask[:, :, aggressiveness["split_bin"] :], - 1 + aggressiveness["value"], - ) - - return mask * mix - - def predict(self, x_mag, aggressiveness=None): - h = self.forward(x_mag, aggressiveness) - - if self.offset > 0: - h = 
h[:, :, :, self.offset : -self.offset] - assert h.size()[3] > 0 - - return h diff --git a/infer/lib/uvr5_pack/lib_v5/nets_537227KB.py b/infer/lib/uvr5_pack/lib_v5/nets_537227KB.py deleted file mode 100644 index 823b44fb64898e8dcbb12180ba45d1718f9b03f7..0000000000000000000000000000000000000000 --- a/infer/lib/uvr5_pack/lib_v5/nets_537227KB.py +++ /dev/null @@ -1,123 +0,0 @@ -import numpy as np -import torch -import torch.nn.functional as F -from torch import nn - -from . import layers_537238KB as layers - - -class BaseASPPNet(nn.Module): - def __init__(self, nin, ch, dilations=(4, 8, 16)): - super(BaseASPPNet, self).__init__() - self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) - self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) - self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) - self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) - - self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) - - self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) - self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) - self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) - self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) - - def __call__(self, x): - h, e1 = self.enc1(x) - h, e2 = self.enc2(h) - h, e3 = self.enc3(h) - h, e4 = self.enc4(h) - - h = self.aspp(h) - - h = self.dec4(h, e4) - h = self.dec3(h, e3) - h = self.dec2(h, e2) - h = self.dec1(h, e1) - - return h - - -class CascadedASPPNet(nn.Module): - def __init__(self, n_fft): - super(CascadedASPPNet, self).__init__() - self.stg1_low_band_net = BaseASPPNet(2, 64) - self.stg1_high_band_net = BaseASPPNet(2, 64) - - self.stg2_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0) - self.stg2_full_band_net = BaseASPPNet(32, 64) - - self.stg3_bridge = layers.Conv2DBNActiv(130, 64, 1, 1, 0) - self.stg3_full_band_net = BaseASPPNet(64, 128) - - self.out = nn.Conv2d(128, 2, 1, bias=False) - self.aux1_out = nn.Conv2d(64, 2, 1, bias=False) - self.aux2_out = nn.Conv2d(64, 2, 1, bias=False) - - self.max_bin = n_fft // 2 - self.output_bin = n_fft // 2 + 1 - - self.offset = 128 - - def forward(self, x, aggressiveness=None): - mix = x.detach() - x = x.clone() - - x = x[:, :, : self.max_bin] - - bandw = x.size()[2] // 2 - aux1 = torch.cat( - [ - self.stg1_low_band_net(x[:, :, :bandw]), - self.stg1_high_band_net(x[:, :, bandw:]), - ], - dim=2, - ) - - h = torch.cat([x, aux1], dim=1) - aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) - - h = torch.cat([x, aux1, aux2], dim=1) - h = self.stg3_full_band_net(self.stg3_bridge(h)) - - mask = torch.sigmoid(self.out(h)) - mask = F.pad( - input=mask, - pad=(0, 0, 0, self.output_bin - mask.size()[2]), - mode="replicate", - ) - - if self.training: - aux1 = torch.sigmoid(self.aux1_out(aux1)) - aux1 = F.pad( - input=aux1, - pad=(0, 0, 0, self.output_bin - aux1.size()[2]), - mode="replicate", - ) - aux2 = torch.sigmoid(self.aux2_out(aux2)) - aux2 = F.pad( - input=aux2, - pad=(0, 0, 0, self.output_bin - aux2.size()[2]), - mode="replicate", - ) - return mask * mix, aux1 * mix, aux2 * mix - else: - if aggressiveness: - mask[:, :, : aggressiveness["split_bin"]] = torch.pow( - mask[:, :, : aggressiveness["split_bin"]], - 1 + aggressiveness["value"] / 3, - ) - mask[:, :, aggressiveness["split_bin"] :] = torch.pow( - mask[:, :, aggressiveness["split_bin"] :], - 1 + aggressiveness["value"], - ) - - return mask * mix - - def predict(self, x_mag, aggressiveness=None): - h = self.forward(x_mag, aggressiveness) - - if self.offset > 0: - h = h[:, :, :, self.offset : -self.offset] - assert h.size()[3] > 0 - - return h diff --git 
a/infer/lib/uvr5_pack/lib_v5/nets_537238KB.py b/infer/lib/uvr5_pack/lib_v5/nets_537238KB.py deleted file mode 100644 index 823b44fb64898e8dcbb12180ba45d1718f9b03f7..0000000000000000000000000000000000000000 --- a/infer/lib/uvr5_pack/lib_v5/nets_537238KB.py +++ /dev/null @@ -1,123 +0,0 @@ -import numpy as np -import torch -import torch.nn.functional as F -from torch import nn - -from . import layers_537238KB as layers - - -class BaseASPPNet(nn.Module): - def __init__(self, nin, ch, dilations=(4, 8, 16)): - super(BaseASPPNet, self).__init__() - self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) - self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) - self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) - self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) - - self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) - - self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) - self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) - self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) - self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) - - def __call__(self, x): - h, e1 = self.enc1(x) - h, e2 = self.enc2(h) - h, e3 = self.enc3(h) - h, e4 = self.enc4(h) - - h = self.aspp(h) - - h = self.dec4(h, e4) - h = self.dec3(h, e3) - h = self.dec2(h, e2) - h = self.dec1(h, e1) - - return h - - -class CascadedASPPNet(nn.Module): - def __init__(self, n_fft): - super(CascadedASPPNet, self).__init__() - self.stg1_low_band_net = BaseASPPNet(2, 64) - self.stg1_high_band_net = BaseASPPNet(2, 64) - - self.stg2_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0) - self.stg2_full_band_net = BaseASPPNet(32, 64) - - self.stg3_bridge = layers.Conv2DBNActiv(130, 64, 1, 1, 0) - self.stg3_full_band_net = BaseASPPNet(64, 128) - - self.out = nn.Conv2d(128, 2, 1, bias=False) - self.aux1_out = nn.Conv2d(64, 2, 1, bias=False) - self.aux2_out = nn.Conv2d(64, 2, 1, bias=False) - - self.max_bin = n_fft // 2 - self.output_bin = n_fft // 2 + 1 - - self.offset = 128 - - def forward(self, x, aggressiveness=None): - mix = x.detach() - x = x.clone() - - x = x[:, :, : self.max_bin] - - bandw = x.size()[2] // 2 - aux1 = torch.cat( - [ - self.stg1_low_band_net(x[:, :, :bandw]), - self.stg1_high_band_net(x[:, :, bandw:]), - ], - dim=2, - ) - - h = torch.cat([x, aux1], dim=1) - aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) - - h = torch.cat([x, aux1, aux2], dim=1) - h = self.stg3_full_band_net(self.stg3_bridge(h)) - - mask = torch.sigmoid(self.out(h)) - mask = F.pad( - input=mask, - pad=(0, 0, 0, self.output_bin - mask.size()[2]), - mode="replicate", - ) - - if self.training: - aux1 = torch.sigmoid(self.aux1_out(aux1)) - aux1 = F.pad( - input=aux1, - pad=(0, 0, 0, self.output_bin - aux1.size()[2]), - mode="replicate", - ) - aux2 = torch.sigmoid(self.aux2_out(aux2)) - aux2 = F.pad( - input=aux2, - pad=(0, 0, 0, self.output_bin - aux2.size()[2]), - mode="replicate", - ) - return mask * mix, aux1 * mix, aux2 * mix - else: - if aggressiveness: - mask[:, :, : aggressiveness["split_bin"]] = torch.pow( - mask[:, :, : aggressiveness["split_bin"]], - 1 + aggressiveness["value"] / 3, - ) - mask[:, :, aggressiveness["split_bin"] :] = torch.pow( - mask[:, :, aggressiveness["split_bin"] :], - 1 + aggressiveness["value"], - ) - - return mask * mix - - def predict(self, x_mag, aggressiveness=None): - h = self.forward(x_mag, aggressiveness) - - if self.offset > 0: - h = h[:, :, :, self.offset : -self.offset] - assert h.size()[3] > 0 - - return h diff --git a/infer/lib/uvr5_pack/lib_v5/nets_61968KB.py b/infer/lib/uvr5_pack/lib_v5/nets_61968KB.py 
deleted file mode 100644 index 167d4cb2198863cf43e93440f7e63c5342fc7605..0000000000000000000000000000000000000000 --- a/infer/lib/uvr5_pack/lib_v5/nets_61968KB.py +++ /dev/null @@ -1,122 +0,0 @@ -import torch -import torch.nn.functional as F -from torch import nn - -from . import layers_123821KB as layers - - -class BaseASPPNet(nn.Module): - def __init__(self, nin, ch, dilations=(4, 8, 16)): - super(BaseASPPNet, self).__init__() - self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) - self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) - self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) - self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) - - self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) - - self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) - self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) - self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) - self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) - - def __call__(self, x): - h, e1 = self.enc1(x) - h, e2 = self.enc2(h) - h, e3 = self.enc3(h) - h, e4 = self.enc4(h) - - h = self.aspp(h) - - h = self.dec4(h, e4) - h = self.dec3(h, e3) - h = self.dec2(h, e2) - h = self.dec1(h, e1) - - return h - - -class CascadedASPPNet(nn.Module): - def __init__(self, n_fft): - super(CascadedASPPNet, self).__init__() - self.stg1_low_band_net = BaseASPPNet(2, 32) - self.stg1_high_band_net = BaseASPPNet(2, 32) - - self.stg2_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0) - self.stg2_full_band_net = BaseASPPNet(16, 32) - - self.stg3_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0) - self.stg3_full_band_net = BaseASPPNet(32, 64) - - self.out = nn.Conv2d(64, 2, 1, bias=False) - self.aux1_out = nn.Conv2d(32, 2, 1, bias=False) - self.aux2_out = nn.Conv2d(32, 2, 1, bias=False) - - self.max_bin = n_fft // 2 - self.output_bin = n_fft // 2 + 1 - - self.offset = 128 - - def forward(self, x, aggressiveness=None): - mix = x.detach() - x = x.clone() - - x = x[:, :, : self.max_bin] - - bandw = x.size()[2] // 2 - aux1 = torch.cat( - [ - self.stg1_low_band_net(x[:, :, :bandw]), - self.stg1_high_band_net(x[:, :, bandw:]), - ], - dim=2, - ) - - h = torch.cat([x, aux1], dim=1) - aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) - - h = torch.cat([x, aux1, aux2], dim=1) - h = self.stg3_full_band_net(self.stg3_bridge(h)) - - mask = torch.sigmoid(self.out(h)) - mask = F.pad( - input=mask, - pad=(0, 0, 0, self.output_bin - mask.size()[2]), - mode="replicate", - ) - - if self.training: - aux1 = torch.sigmoid(self.aux1_out(aux1)) - aux1 = F.pad( - input=aux1, - pad=(0, 0, 0, self.output_bin - aux1.size()[2]), - mode="replicate", - ) - aux2 = torch.sigmoid(self.aux2_out(aux2)) - aux2 = F.pad( - input=aux2, - pad=(0, 0, 0, self.output_bin - aux2.size()[2]), - mode="replicate", - ) - return mask * mix, aux1 * mix, aux2 * mix - else: - if aggressiveness: - mask[:, :, : aggressiveness["split_bin"]] = torch.pow( - mask[:, :, : aggressiveness["split_bin"]], - 1 + aggressiveness["value"] / 3, - ) - mask[:, :, aggressiveness["split_bin"] :] = torch.pow( - mask[:, :, aggressiveness["split_bin"] :], - 1 + aggressiveness["value"], - ) - - return mask * mix - - def predict(self, x_mag, aggressiveness=None): - h = self.forward(x_mag, aggressiveness) - - if self.offset > 0: - h = h[:, :, :, self.offset : -self.offset] - assert h.size()[3] > 0 - - return h diff --git a/infer/lib/uvr5_pack/lib_v5/nets_new.py b/infer/lib/uvr5_pack/lib_v5/nets_new.py deleted file mode 100644 index 1c0f4fa96d921e979fe31bd4151701b7783fbcea..0000000000000000000000000000000000000000 --- 
a/infer/lib/uvr5_pack/lib_v5/nets_new.py +++ /dev/null @@ -1,133 +0,0 @@ -import torch -import torch.nn.functional as F -from torch import nn - -from . import layers_new - - -class BaseNet(nn.Module): - def __init__( - self, nin, nout, nin_lstm, nout_lstm, dilations=((4, 2), (8, 4), (12, 6)) - ): - super(BaseNet, self).__init__() - self.enc1 = layers_new.Conv2DBNActiv(nin, nout, 3, 1, 1) - self.enc2 = layers_new.Encoder(nout, nout * 2, 3, 2, 1) - self.enc3 = layers_new.Encoder(nout * 2, nout * 4, 3, 2, 1) - self.enc4 = layers_new.Encoder(nout * 4, nout * 6, 3, 2, 1) - self.enc5 = layers_new.Encoder(nout * 6, nout * 8, 3, 2, 1) - - self.aspp = layers_new.ASPPModule(nout * 8, nout * 8, dilations, dropout=True) - - self.dec4 = layers_new.Decoder(nout * (6 + 8), nout * 6, 3, 1, 1) - self.dec3 = layers_new.Decoder(nout * (4 + 6), nout * 4, 3, 1, 1) - self.dec2 = layers_new.Decoder(nout * (2 + 4), nout * 2, 3, 1, 1) - self.lstm_dec2 = layers_new.LSTMModule(nout * 2, nin_lstm, nout_lstm) - self.dec1 = layers_new.Decoder(nout * (1 + 2) + 1, nout * 1, 3, 1, 1) - - def __call__(self, x): - e1 = self.enc1(x) - e2 = self.enc2(e1) - e3 = self.enc3(e2) - e4 = self.enc4(e3) - e5 = self.enc5(e4) - - h = self.aspp(e5) - - h = self.dec4(h, e4) - h = self.dec3(h, e3) - h = self.dec2(h, e2) - h = torch.cat([h, self.lstm_dec2(h)], dim=1) - h = self.dec1(h, e1) - - return h - - -class CascadedNet(nn.Module): - def __init__(self, n_fft, nout=32, nout_lstm=128): - super(CascadedNet, self).__init__() - - self.max_bin = n_fft // 2 - self.output_bin = n_fft // 2 + 1 - self.nin_lstm = self.max_bin // 2 - self.offset = 64 - - self.stg1_low_band_net = nn.Sequential( - BaseNet(2, nout // 2, self.nin_lstm // 2, nout_lstm), - layers_new.Conv2DBNActiv(nout // 2, nout // 4, 1, 1, 0), - ) - - self.stg1_high_band_net = BaseNet( - 2, nout // 4, self.nin_lstm // 2, nout_lstm // 2 - ) - - self.stg2_low_band_net = nn.Sequential( - BaseNet(nout // 4 + 2, nout, self.nin_lstm // 2, nout_lstm), - layers_new.Conv2DBNActiv(nout, nout // 2, 1, 1, 0), - ) - self.stg2_high_band_net = BaseNet( - nout // 4 + 2, nout // 2, self.nin_lstm // 2, nout_lstm // 2 - ) - - self.stg3_full_band_net = BaseNet( - 3 * nout // 4 + 2, nout, self.nin_lstm, nout_lstm - ) - - self.out = nn.Conv2d(nout, 2, 1, bias=False) - self.aux_out = nn.Conv2d(3 * nout // 4, 2, 1, bias=False) - - def forward(self, x): - x = x[:, :, : self.max_bin] - - bandw = x.size()[2] // 2 - l1_in = x[:, :, :bandw] - h1_in = x[:, :, bandw:] - l1 = self.stg1_low_band_net(l1_in) - h1 = self.stg1_high_band_net(h1_in) - aux1 = torch.cat([l1, h1], dim=2) - - l2_in = torch.cat([l1_in, l1], dim=1) - h2_in = torch.cat([h1_in, h1], dim=1) - l2 = self.stg2_low_band_net(l2_in) - h2 = self.stg2_high_band_net(h2_in) - aux2 = torch.cat([l2, h2], dim=2) - - f3_in = torch.cat([x, aux1, aux2], dim=1) - f3 = self.stg3_full_band_net(f3_in) - - mask = torch.sigmoid(self.out(f3)) - mask = F.pad( - input=mask, - pad=(0, 0, 0, self.output_bin - mask.size()[2]), - mode="replicate", - ) - - if self.training: - aux = torch.cat([aux1, aux2], dim=1) - aux = torch.sigmoid(self.aux_out(aux)) - aux = F.pad( - input=aux, - pad=(0, 0, 0, self.output_bin - aux.size()[2]), - mode="replicate", - ) - return mask, aux - else: - return mask - - def predict_mask(self, x): - mask = self.forward(x) - - if self.offset > 0: - mask = mask[:, :, :, self.offset : -self.offset] - assert mask.size()[3] > 0 - - return mask - - def predict(self, x, aggressiveness=None): - mask = self.forward(x) - pred_mag = x * mask - - if self.offset > 
0: - pred_mag = pred_mag[:, :, :, self.offset : -self.offset] - assert pred_mag.size()[3] > 0 - - return pred_mag diff --git a/infer/lib/uvr5_pack/lib_v5/spec_utils.py b/infer/lib/uvr5_pack/lib_v5/spec_utils.py deleted file mode 100644 index a9634fd51ff47bf90211839231774719154c37cf..0000000000000000000000000000000000000000 --- a/infer/lib/uvr5_pack/lib_v5/spec_utils.py +++ /dev/null @@ -1,672 +0,0 @@ -import hashlib -import json -import math -import os - -import librosa -import numpy as np -import soundfile as sf -from tqdm import tqdm - - -def crop_center(h1, h2): - h1_shape = h1.size() - h2_shape = h2.size() - - if h1_shape[3] == h2_shape[3]: - return h1 - elif h1_shape[3] < h2_shape[3]: - raise ValueError("h1_shape[3] must be greater than h2_shape[3]") - - # s_freq = (h2_shape[2] - h1_shape[2]) // 2 - # e_freq = s_freq + h1_shape[2] - s_time = (h1_shape[3] - h2_shape[3]) // 2 - e_time = s_time + h2_shape[3] - h1 = h1[:, :, :, s_time:e_time] - - return h1 - - -def wave_to_spectrogram( - wave, hop_length, n_fft, mid_side=False, mid_side_b2=False, reverse=False -): - if reverse: - wave_left = np.flip(np.asfortranarray(wave[0])) - wave_right = np.flip(np.asfortranarray(wave[1])) - elif mid_side: - wave_left = np.asfortranarray(np.add(wave[0], wave[1]) / 2) - wave_right = np.asfortranarray(np.subtract(wave[0], wave[1])) - elif mid_side_b2: - wave_left = np.asfortranarray(np.add(wave[1], wave[0] * 0.5)) - wave_right = np.asfortranarray(np.subtract(wave[0], wave[1] * 0.5)) - else: - wave_left = np.asfortranarray(wave[0]) - wave_right = np.asfortranarray(wave[1]) - - spec_left = librosa.stft(wave_left, n_fft, hop_length=hop_length) - spec_right = librosa.stft(wave_right, n_fft, hop_length=hop_length) - - spec = np.asfortranarray([spec_left, spec_right]) - - return spec - - -def wave_to_spectrogram_mt( - wave, hop_length, n_fft, mid_side=False, mid_side_b2=False, reverse=False -): - import threading - - if reverse: - wave_left = np.flip(np.asfortranarray(wave[0])) - wave_right = np.flip(np.asfortranarray(wave[1])) - elif mid_side: - wave_left = np.asfortranarray(np.add(wave[0], wave[1]) / 2) - wave_right = np.asfortranarray(np.subtract(wave[0], wave[1])) - elif mid_side_b2: - wave_left = np.asfortranarray(np.add(wave[1], wave[0] * 0.5)) - wave_right = np.asfortranarray(np.subtract(wave[0], wave[1] * 0.5)) - else: - wave_left = np.asfortranarray(wave[0]) - wave_right = np.asfortranarray(wave[1]) - - def run_thread(**kwargs): - global spec_left - spec_left = librosa.stft(**kwargs) - - thread = threading.Thread( - target=run_thread, - kwargs={"y": wave_left, "n_fft": n_fft, "hop_length": hop_length}, - ) - thread.start() - spec_right = librosa.stft(wave_right, n_fft, hop_length=hop_length) - thread.join() - - spec = np.asfortranarray([spec_left, spec_right]) - - return spec - - -def combine_spectrograms(specs, mp): - l = min([specs[i].shape[2] for i in specs]) - spec_c = np.zeros(shape=(2, mp.param["bins"] + 1, l), dtype=np.complex64) - offset = 0 - bands_n = len(mp.param["band"]) - - for d in range(1, bands_n + 1): - h = mp.param["band"][d]["crop_stop"] - mp.param["band"][d]["crop_start"] - spec_c[:, offset : offset + h, :l] = specs[d][ - :, mp.param["band"][d]["crop_start"] : mp.param["band"][d]["crop_stop"], :l - ] - offset += h - - if offset > mp.param["bins"]: - raise ValueError("Too much bins") - - # lowpass fiter - if ( - mp.param["pre_filter_start"] > 0 - ): # and mp.param['band'][bands_n]['res_type'] in ['scipy', 'polyphase']: - if bands_n == 1: - spec_c = fft_lp_filter( - spec_c, 
mp.param["pre_filter_start"], mp.param["pre_filter_stop"] - ) - else: - gp = 1 - for b in range( - mp.param["pre_filter_start"] + 1, mp.param["pre_filter_stop"] - ): - g = math.pow( - 10, -(b - mp.param["pre_filter_start"]) * (3.5 - gp) / 20.0 - ) - gp = g - spec_c[:, b, :] *= g - - return np.asfortranarray(spec_c) - - -def spectrogram_to_image(spec, mode="magnitude"): - if mode == "magnitude": - if np.iscomplexobj(spec): - y = np.abs(spec) - else: - y = spec - y = np.log10(y**2 + 1e-8) - elif mode == "phase": - if np.iscomplexobj(spec): - y = np.angle(spec) - else: - y = spec - - y -= y.min() - y *= 255 / y.max() - img = np.uint8(y) - - if y.ndim == 3: - img = img.transpose(1, 2, 0) - img = np.concatenate([np.max(img, axis=2, keepdims=True), img], axis=2) - - return img - - -def reduce_vocal_aggressively(X, y, softmask): - v = X - y - y_mag_tmp = np.abs(y) - v_mag_tmp = np.abs(v) - - v_mask = v_mag_tmp > y_mag_tmp - y_mag = np.clip(y_mag_tmp - v_mag_tmp * v_mask * softmask, 0, np.inf) - - return y_mag * np.exp(1.0j * np.angle(y)) - - -def mask_silence(mag, ref, thres=0.2, min_range=64, fade_size=32): - if min_range < fade_size * 2: - raise ValueError("min_range must be >= fade_area * 2") - - mag = mag.copy() - - idx = np.where(ref.mean(axis=(0, 1)) < thres)[0] - starts = np.insert(idx[np.where(np.diff(idx) != 1)[0] + 1], 0, idx[0]) - ends = np.append(idx[np.where(np.diff(idx) != 1)[0]], idx[-1]) - uninformative = np.where(ends - starts > min_range)[0] - if len(uninformative) > 0: - starts = starts[uninformative] - ends = ends[uninformative] - old_e = None - for s, e in zip(starts, ends): - if old_e is not None and s - old_e < fade_size: - s = old_e - fade_size * 2 - - if s != 0: - weight = np.linspace(0, 1, fade_size) - mag[:, :, s : s + fade_size] += weight * ref[:, :, s : s + fade_size] - else: - s -= fade_size - - if e != mag.shape[2]: - weight = np.linspace(1, 0, fade_size) - mag[:, :, e - fade_size : e] += weight * ref[:, :, e - fade_size : e] - else: - e += fade_size - - mag[:, :, s + fade_size : e - fade_size] += ref[ - :, :, s + fade_size : e - fade_size - ] - old_e = e - - return mag - - -def align_wave_head_and_tail(a, b): - l = min([a[0].size, b[0].size]) - - return a[:l, :l], b[:l, :l] - - -def cache_or_load(mix_path, inst_path, mp): - mix_basename = os.path.splitext(os.path.basename(mix_path))[0] - inst_basename = os.path.splitext(os.path.basename(inst_path))[0] - - cache_dir = "mph{}".format( - hashlib.sha1(json.dumps(mp.param, sort_keys=True).encode("utf-8")).hexdigest() - ) - mix_cache_dir = os.path.join("cache", cache_dir) - inst_cache_dir = os.path.join("cache", cache_dir) - - os.makedirs(mix_cache_dir, exist_ok=True) - os.makedirs(inst_cache_dir, exist_ok=True) - - mix_cache_path = os.path.join(mix_cache_dir, mix_basename + ".npy") - inst_cache_path = os.path.join(inst_cache_dir, inst_basename + ".npy") - - if os.path.exists(mix_cache_path) and os.path.exists(inst_cache_path): - X_spec_m = np.load(mix_cache_path) - y_spec_m = np.load(inst_cache_path) - else: - X_wave, y_wave, X_spec_s, y_spec_s = {}, {}, {}, {} - - for d in range(len(mp.param["band"]), 0, -1): - bp = mp.param["band"][d] - - if d == len(mp.param["band"]): # high-end band - X_wave[d], _ = librosa.load( - mix_path, bp["sr"], False, dtype=np.float32, res_type=bp["res_type"] - ) - y_wave[d], _ = librosa.load( - inst_path, - bp["sr"], - False, - dtype=np.float32, - res_type=bp["res_type"], - ) - else: # lower bands - X_wave[d] = librosa.resample( - X_wave[d + 1], - mp.param["band"][d + 1]["sr"], - bp["sr"], - 
res_type=bp["res_type"], - ) - y_wave[d] = librosa.resample( - y_wave[d + 1], - mp.param["band"][d + 1]["sr"], - bp["sr"], - res_type=bp["res_type"], - ) - - X_wave[d], y_wave[d] = align_wave_head_and_tail(X_wave[d], y_wave[d]) - - X_spec_s[d] = wave_to_spectrogram( - X_wave[d], - bp["hl"], - bp["n_fft"], - mp.param["mid_side"], - mp.param["mid_side_b2"], - mp.param["reverse"], - ) - y_spec_s[d] = wave_to_spectrogram( - y_wave[d], - bp["hl"], - bp["n_fft"], - mp.param["mid_side"], - mp.param["mid_side_b2"], - mp.param["reverse"], - ) - - del X_wave, y_wave - - X_spec_m = combine_spectrograms(X_spec_s, mp) - y_spec_m = combine_spectrograms(y_spec_s, mp) - - if X_spec_m.shape != y_spec_m.shape: - raise ValueError("The combined spectrograms are different: " + mix_path) - - _, ext = os.path.splitext(mix_path) - - np.save(mix_cache_path, X_spec_m) - np.save(inst_cache_path, y_spec_m) - - return X_spec_m, y_spec_m - - -def spectrogram_to_wave(spec, hop_length, mid_side, mid_side_b2, reverse): - spec_left = np.asfortranarray(spec[0]) - spec_right = np.asfortranarray(spec[1]) - - wave_left = librosa.istft(spec_left, hop_length=hop_length) - wave_right = librosa.istft(spec_right, hop_length=hop_length) - - if reverse: - return np.asfortranarray([np.flip(wave_left), np.flip(wave_right)]) - elif mid_side: - return np.asfortranarray( - [np.add(wave_left, wave_right / 2), np.subtract(wave_left, wave_right / 2)] - ) - elif mid_side_b2: - return np.asfortranarray( - [ - np.add(wave_right / 1.25, 0.4 * wave_left), - np.subtract(wave_left / 1.25, 0.4 * wave_right), - ] - ) - else: - return np.asfortranarray([wave_left, wave_right]) - - -def spectrogram_to_wave_mt(spec, hop_length, mid_side, reverse, mid_side_b2): - import threading - - spec_left = np.asfortranarray(spec[0]) - spec_right = np.asfortranarray(spec[1]) - - def run_thread(**kwargs): - global wave_left - wave_left = librosa.istft(**kwargs) - - thread = threading.Thread( - target=run_thread, kwargs={"stft_matrix": spec_left, "hop_length": hop_length} - ) - thread.start() - wave_right = librosa.istft(spec_right, hop_length=hop_length) - thread.join() - - if reverse: - return np.asfortranarray([np.flip(wave_left), np.flip(wave_right)]) - elif mid_side: - return np.asfortranarray( - [np.add(wave_left, wave_right / 2), np.subtract(wave_left, wave_right / 2)] - ) - elif mid_side_b2: - return np.asfortranarray( - [ - np.add(wave_right / 1.25, 0.4 * wave_left), - np.subtract(wave_left / 1.25, 0.4 * wave_right), - ] - ) - else: - return np.asfortranarray([wave_left, wave_right]) - - -def cmb_spectrogram_to_wave(spec_m, mp, extra_bins_h=None, extra_bins=None): - wave_band = {} - bands_n = len(mp.param["band"]) - offset = 0 - - for d in range(1, bands_n + 1): - bp = mp.param["band"][d] - spec_s = np.ndarray( - shape=(2, bp["n_fft"] // 2 + 1, spec_m.shape[2]), dtype=complex - ) - h = bp["crop_stop"] - bp["crop_start"] - spec_s[:, bp["crop_start"] : bp["crop_stop"], :] = spec_m[ - :, offset : offset + h, : - ] - - offset += h - if d == bands_n: # higher - if extra_bins_h: # if --high_end_process bypass - max_bin = bp["n_fft"] // 2 - spec_s[:, max_bin - extra_bins_h : max_bin, :] = extra_bins[ - :, :extra_bins_h, : - ] - if bp["hpf_start"] > 0: - spec_s = fft_hp_filter(spec_s, bp["hpf_start"], bp["hpf_stop"] - 1) - if bands_n == 1: - wave = spectrogram_to_wave( - spec_s, - bp["hl"], - mp.param["mid_side"], - mp.param["mid_side_b2"], - mp.param["reverse"], - ) - else: - wave = np.add( - wave, - spectrogram_to_wave( - spec_s, - bp["hl"], - mp.param["mid_side"], 
- mp.param["mid_side_b2"], - mp.param["reverse"], - ), - ) - else: - sr = mp.param["band"][d + 1]["sr"] - if d == 1: # lower - spec_s = fft_lp_filter(spec_s, bp["lpf_start"], bp["lpf_stop"]) - wave = librosa.resample( - spectrogram_to_wave( - spec_s, - bp["hl"], - mp.param["mid_side"], - mp.param["mid_side_b2"], - mp.param["reverse"], - ), - bp["sr"], - sr, - res_type="sinc_fastest", - ) - else: # mid - spec_s = fft_hp_filter(spec_s, bp["hpf_start"], bp["hpf_stop"] - 1) - spec_s = fft_lp_filter(spec_s, bp["lpf_start"], bp["lpf_stop"]) - wave2 = np.add( - wave, - spectrogram_to_wave( - spec_s, - bp["hl"], - mp.param["mid_side"], - mp.param["mid_side_b2"], - mp.param["reverse"], - ), - ) - # wave = librosa.core.resample(wave2, bp['sr'], sr, res_type="sinc_fastest") - wave = librosa.core.resample(wave2, bp["sr"], sr, res_type="scipy") - - return wave.T - - -def fft_lp_filter(spec, bin_start, bin_stop): - g = 1.0 - for b in range(bin_start, bin_stop): - g -= 1 / (bin_stop - bin_start) - spec[:, b, :] = g * spec[:, b, :] - - spec[:, bin_stop:, :] *= 0 - - return spec - - -def fft_hp_filter(spec, bin_start, bin_stop): - g = 1.0 - for b in range(bin_start, bin_stop, -1): - g -= 1 / (bin_start - bin_stop) - spec[:, b, :] = g * spec[:, b, :] - - spec[:, 0 : bin_stop + 1, :] *= 0 - - return spec - - -def mirroring(a, spec_m, input_high_end, mp): - if "mirroring" == a: - mirror = np.flip( - np.abs( - spec_m[ - :, - mp.param["pre_filter_start"] - - 10 - - input_high_end.shape[1] : mp.param["pre_filter_start"] - - 10, - :, - ] - ), - 1, - ) - mirror = mirror * np.exp(1.0j * np.angle(input_high_end)) - - return np.where( - np.abs(input_high_end) <= np.abs(mirror), input_high_end, mirror - ) - - if "mirroring2" == a: - mirror = np.flip( - np.abs( - spec_m[ - :, - mp.param["pre_filter_start"] - - 10 - - input_high_end.shape[1] : mp.param["pre_filter_start"] - - 10, - :, - ] - ), - 1, - ) - mi = np.multiply(mirror, input_high_end * 1.7) - - return np.where(np.abs(input_high_end) <= np.abs(mi), input_high_end, mi) - - -def ensembling(a, specs): - for i in range(1, len(specs)): - if i == 1: - spec = specs[0] - - ln = min([spec.shape[2], specs[i].shape[2]]) - spec = spec[:, :, :ln] - specs[i] = specs[i][:, :, :ln] - - if "min_mag" == a: - spec = np.where(np.abs(specs[i]) <= np.abs(spec), specs[i], spec) - if "max_mag" == a: - spec = np.where(np.abs(specs[i]) >= np.abs(spec), specs[i], spec) - - return spec - - -def stft(wave, nfft, hl): - wave_left = np.asfortranarray(wave[0]) - wave_right = np.asfortranarray(wave[1]) - spec_left = librosa.stft(wave_left, nfft, hop_length=hl) - spec_right = librosa.stft(wave_right, nfft, hop_length=hl) - spec = np.asfortranarray([spec_left, spec_right]) - - return spec - - -def istft(spec, hl): - spec_left = np.asfortranarray(spec[0]) - spec_right = np.asfortranarray(spec[1]) - - wave_left = librosa.istft(spec_left, hop_length=hl) - wave_right = librosa.istft(spec_right, hop_length=hl) - wave = np.asfortranarray([wave_left, wave_right]) - - -if __name__ == "__main__": - import argparse - import sys - import time - - import cv2 - from model_param_init import ModelParameters - - p = argparse.ArgumentParser() - p.add_argument( - "--algorithm", - "-a", - type=str, - choices=["invert", "invert_p", "min_mag", "max_mag", "deep", "align"], - default="min_mag", - ) - p.add_argument( - "--model_params", - "-m", - type=str, - default=os.path.join("modelparams", "1band_sr44100_hl512.json"), - ) - p.add_argument("--output_name", "-o", type=str, default="output") - 
p.add_argument("--vocals_only", "-v", action="store_true") - p.add_argument("input", nargs="+") - args = p.parse_args() - - start_time = time.time() - - if args.algorithm.startswith("invert") and len(args.input) != 2: - raise ValueError("There should be two input files.") - - if not args.algorithm.startswith("invert") and len(args.input) < 2: - raise ValueError("There must be at least two input files.") - - wave, specs = {}, {} - mp = ModelParameters(args.model_params) - - for i in range(len(args.input)): - spec = {} - - for d in range(len(mp.param["band"]), 0, -1): - bp = mp.param["band"][d] - - if d == len(mp.param["band"]): # high-end band - wave[d], _ = librosa.load( - args.input[i], - bp["sr"], - False, - dtype=np.float32, - res_type=bp["res_type"], - ) - - if len(wave[d].shape) == 1: # mono to stereo - wave[d] = np.array([wave[d], wave[d]]) - else: # lower bands - wave[d] = librosa.resample( - wave[d + 1], - mp.param["band"][d + 1]["sr"], - bp["sr"], - res_type=bp["res_type"], - ) - - spec[d] = wave_to_spectrogram( - wave[d], - bp["hl"], - bp["n_fft"], - mp.param["mid_side"], - mp.param["mid_side_b2"], - mp.param["reverse"], - ) - - specs[i] = combine_spectrograms(spec, mp) - - del wave - - if args.algorithm == "deep": - d_spec = np.where(np.abs(specs[0]) <= np.abs(spec[1]), specs[0], spec[1]) - v_spec = d_spec - specs[1] - sf.write( - os.path.join("{}.wav".format(args.output_name)), - cmb_spectrogram_to_wave(v_spec, mp), - mp.param["sr"], - ) - - if args.algorithm.startswith("invert"): - ln = min([specs[0].shape[2], specs[1].shape[2]]) - specs[0] = specs[0][:, :, :ln] - specs[1] = specs[1][:, :, :ln] - - if "invert_p" == args.algorithm: - X_mag = np.abs(specs[0]) - y_mag = np.abs(specs[1]) - max_mag = np.where(X_mag >= y_mag, X_mag, y_mag) - v_spec = specs[1] - max_mag * np.exp(1.0j * np.angle(specs[0])) - else: - specs[1] = reduce_vocal_aggressively(specs[0], specs[1], 0.2) - v_spec = specs[0] - specs[1] - - if not args.vocals_only: - X_mag = np.abs(specs[0]) - y_mag = np.abs(specs[1]) - v_mag = np.abs(v_spec) - - X_image = spectrogram_to_image(X_mag) - y_image = spectrogram_to_image(y_mag) - v_image = spectrogram_to_image(v_mag) - - cv2.imwrite("{}_X.png".format(args.output_name), X_image) - cv2.imwrite("{}_y.png".format(args.output_name), y_image) - cv2.imwrite("{}_v.png".format(args.output_name), v_image) - - sf.write( - "{}_X.wav".format(args.output_name), - cmb_spectrogram_to_wave(specs[0], mp), - mp.param["sr"], - ) - sf.write( - "{}_y.wav".format(args.output_name), - cmb_spectrogram_to_wave(specs[1], mp), - mp.param["sr"], - ) - - sf.write( - "{}_v.wav".format(args.output_name), - cmb_spectrogram_to_wave(v_spec, mp), - mp.param["sr"], - ) - else: - if not args.algorithm == "deep": - sf.write( - os.path.join("ensembled", "{}.wav".format(args.output_name)), - cmb_spectrogram_to_wave(ensembling(args.algorithm, specs), mp), - mp.param["sr"], - ) - - if args.algorithm == "align": - trackalignment = [ - { - "file1": '"{}"'.format(args.input[0]), - "file2": '"{}"'.format(args.input[1]), - } - ] - - for i, e in tqdm(enumerate(trackalignment), desc="Performing Alignment..."): - os.system(f"python lib/align_tracks.py {e['file1']} {e['file2']}") - - # print('Total time: {0:.{1}f}s'.format(time.time() - start_time, 1)) diff --git a/infer/lib/uvr5_pack/name_params.json b/infer/lib/uvr5_pack/name_params.json deleted file mode 100644 index 8ed51a68370607a7a8693b99cfb35fc5d92b04af..0000000000000000000000000000000000000000 --- a/infer/lib/uvr5_pack/name_params.json +++ /dev/null @@ -1,263 
+0,0 @@ -{ - "equivalent" : [ - { - "model_hash_name" : [ - { - "hash_name": "47939caf0cfe52a0e81442b85b971dfd", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json", - "param_name": "4band_44100" - }, - { - "hash_name": "4e4ecb9764c50a8c414fee6e10395bbe", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2.json", - "param_name": "4band_v2" - }, - { - "hash_name": "ca106edd563e034bde0bdec4bb7a4b36", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2.json", - "param_name": "4band_v2" - }, - { - "hash_name": "e60a1e84803ce4efc0a6551206cc4b71", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json", - "param_name": "4band_44100" - }, - { - "hash_name": "a82f14e75892e55e994376edbf0c8435", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json", - "param_name": "4band_44100" - }, - { - "hash_name": "6dd9eaa6f0420af9f1d403aaafa4cc06", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2_sn.json", - "param_name": "4band_v2_sn" - }, - { - "hash_name": "08611fb99bd59eaa79ad27c58d137727", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2_sn.json", - "param_name": "4band_v2_sn" - }, - { - "hash_name": "5c7bbca45a187e81abbbd351606164e5", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json", - "param_name": "3band_44100_msb2" - }, - { - "hash_name": "d6b2cb685a058a091e5e7098192d3233", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json", - "param_name": "3band_44100_msb2" - }, - { - "hash_name": "c1b9f38170a7c90e96f027992eb7c62b", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json", - "param_name": "4band_44100" - }, - { - "hash_name": "c3448ec923fa0edf3d03a19e633faa53", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json", - "param_name": "4band_44100" - }, - { - "hash_name": "68aa2c8093d0080704b200d140f59e54", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100.json", - "param_name": "3band_44100" - }, - { - "hash_name": "fdc83be5b798e4bd29fe00fe6600e147", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_mid.json", - "param_name": "3band_44100_mid.json" - }, - { - "hash_name": "2ce34bc92fd57f55db16b7a4def3d745", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_mid.json", - "param_name": "3band_44100_mid.json" - }, - { - "hash_name": "52fdca89576f06cf4340b74a4730ee5f", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json", - "param_name": "4band_44100.json" - }, - { - "hash_name": "41191165b05d38fc77f072fa9e8e8a30", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json", - "param_name": "4band_44100.json" - }, - { - "hash_name": "89e83b511ad474592689e562d5b1f80e", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/2band_32000.json", - "param_name": "2band_32000.json" - }, - { - "hash_name": "0b954da81d453b716b114d6d7c95177f", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/2band_32000.json", - "param_name": "2band_32000.json" - } - - ], - "v4 Models": [ - { - "hash_name": "6a00461c51c2920fd68937d4609ed6c8", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json", - "param_name": "1band_sr16000_hl512" - }, - { - "hash_name": "0ab504864d20f1bd378fe9c81ef37140", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json", - "param_name": "1band_sr32000_hl512" - }, - { - "hash_name": 
"7dd21065bf91c10f7fccb57d7d83b07f", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json", - "param_name": "1band_sr32000_hl512" - }, - { - "hash_name": "80ab74d65e515caa3622728d2de07d23", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json", - "param_name": "1band_sr32000_hl512" - }, - { - "hash_name": "edc115e7fc523245062200c00caa847f", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json", - "param_name": "1band_sr33075_hl384" - }, - { - "hash_name": "28063e9f6ab5b341c5f6d3c67f2045b7", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json", - "param_name": "1band_sr33075_hl384" - }, - { - "hash_name": "b58090534c52cbc3e9b5104bad666ef2", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json", - "param_name": "1band_sr44100_hl512" - }, - { - "hash_name": "0cdab9947f1b0928705f518f3c78ea8f", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json", - "param_name": "1band_sr44100_hl512" - }, - { - "hash_name": "ae702fed0238afb5346db8356fe25f13", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json", - "param_name": "1band_sr44100_hl1024" - } - ] - } - ], - "User Models" : [ - { - "1 Band": [ - { - "hash_name": "1band_sr16000_hl512", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json", - "param_name": "1band_sr16000_hl512" - }, - { - "hash_name": "1band_sr32000_hl512", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json", - "param_name": "1band_sr16000_hl512" - }, - { - "hash_name": "1band_sr33075_hl384", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json", - "param_name": "1band_sr33075_hl384" - }, - { - "hash_name": "1band_sr44100_hl256", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl256.json", - "param_name": "1band_sr44100_hl256" - }, - { - "hash_name": "1band_sr44100_hl512", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json", - "param_name": "1band_sr44100_hl512" - }, - { - "hash_name": "1band_sr44100_hl1024", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json", - "param_name": "1band_sr44100_hl1024" - } - ], - "2 Band": [ - { - "hash_name": "2band_44100_lofi", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/2band_44100_lofi.json", - "param_name": "2band_44100_lofi" - }, - { - "hash_name": "2band_32000", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/2band_32000.json", - "param_name": "2band_32000" - }, - { - "hash_name": "2band_48000", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/2band_48000.json", - "param_name": "2band_48000" - } - ], - "3 Band": [ - { - "hash_name": "3band_44100", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100.json", - "param_name": "3band_44100" - }, - { - "hash_name": "3band_44100_mid", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_mid.json", - "param_name": "3band_44100_mid" - }, - { - "hash_name": "3band_44100_msb2", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json", - "param_name": "3band_44100_msb2" - } - ], - "4 Band": [ - { - "hash_name": "4band_44100", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json", - "param_name": "4band_44100" - }, - { - "hash_name": "4band_44100_mid", - "model_params": 
"infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_mid.json", - "param_name": "4band_44100_mid" - }, - { - "hash_name": "4band_44100_msb", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb.json", - "param_name": "4band_44100_msb" - }, - { - "hash_name": "4band_44100_msb2", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb2.json", - "param_name": "4band_44100_msb2" - }, - { - "hash_name": "4band_44100_reverse", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_reverse.json", - "param_name": "4band_44100_reverse" - }, - { - "hash_name": "4band_44100_sw", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_sw.json", - "param_name": "4band_44100_sw" - }, - { - "hash_name": "4band_v2", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2.json", - "param_name": "4band_v2" - }, - { - "hash_name": "4band_v2_sn", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2_sn.json", - "param_name": "4band_v2_sn" - }, - { - "hash_name": "tmodelparam", - "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/tmodelparam.json", - "param_name": "User Model Param Set" - } - ] - } - ] -} \ No newline at end of file diff --git a/infer/lib/uvr5_pack/utils.py b/infer/lib/uvr5_pack/utils.py deleted file mode 100644 index f4805cdb25e7c50611412a19340ad525d1251d7b..0000000000000000000000000000000000000000 --- a/infer/lib/uvr5_pack/utils.py +++ /dev/null @@ -1,121 +0,0 @@ -import json - -import numpy as np -import torch -from tqdm import tqdm - - -def load_data(file_name: str = "./infer/lib/uvr5_pack/name_params.json") -> dict: - with open(file_name, "r") as f: - data = json.load(f) - - return data - - -def make_padding(width, cropsize, offset): - left = offset - roi_size = cropsize - left * 2 - if roi_size == 0: - roi_size = cropsize - right = roi_size - (width % roi_size) + left - - return left, right, roi_size - - -def inference(X_spec, device, model, aggressiveness, data): - """ - data : dic configs - """ - - def _execute( - X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half=True - ): - model.eval() - with torch.no_grad(): - preds = [] - - iterations = [n_window] - - total_iterations = sum(iterations) - for i in tqdm(range(n_window)): - start = i * roi_size - X_mag_window = X_mag_pad[ - None, :, :, start : start + data["window_size"] - ] - X_mag_window = torch.from_numpy(X_mag_window) - if is_half: - X_mag_window = X_mag_window.half() - X_mag_window = X_mag_window.to(device) - - pred = model.predict(X_mag_window, aggressiveness) - - pred = pred.detach().cpu().numpy() - preds.append(pred[0]) - - pred = np.concatenate(preds, axis=2) - return pred - - def preprocess(X_spec): - X_mag = np.abs(X_spec) - X_phase = np.angle(X_spec) - - return X_mag, X_phase - - X_mag, X_phase = preprocess(X_spec) - - coef = X_mag.max() - X_mag_pre = X_mag / coef - - n_frame = X_mag_pre.shape[2] - pad_l, pad_r, roi_size = make_padding(n_frame, data["window_size"], model.offset) - n_window = int(np.ceil(n_frame / roi_size)) - - X_mag_pad = np.pad(X_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode="constant") - - if list(model.state_dict().values())[0].dtype == torch.float16: - is_half = True - else: - is_half = False - pred = _execute( - X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half - ) - pred = pred[:, :, :n_frame] - - if data["tta"]: - pad_l += roi_size // 2 - pad_r += roi_size // 2 - n_window += 1 - - X_mag_pad = np.pad(X_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode="constant") - 
- pred_tta = _execute( - X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half - ) - pred_tta = pred_tta[:, :, roi_size // 2 :] - pred_tta = pred_tta[:, :, :n_frame] - - return (pred + pred_tta) * 0.5 * coef, X_mag, np.exp(1.0j * X_phase) - else: - return pred * coef, X_mag, np.exp(1.0j * X_phase) - - -def _get_name_params(model_path, model_hash): - data = load_data() - flag = False - ModelName = model_path - for type in list(data): - for model in list(data[type][0]): - for i in range(len(data[type][0][model])): - if str(data[type][0][model][i]["hash_name"]) == model_hash: - flag = True - elif str(data[type][0][model][i]["hash_name"]) in ModelName: - flag = True - - if flag: - model_params_auto = data[type][0][model][i]["model_params"] - param_name_auto = data[type][0][model][i]["param_name"] - if type == "equivalent": - return param_name_auto, model_params_auto - else: - flag = False - return param_name_auto, model_params_auto diff --git a/infer/modules/ipex/__init__.py b/infer/modules/ipex/__init__.py deleted file mode 100644 index cd27bc172f28a20a0378f8e91e4fa463d4118a72..0000000000000000000000000000000000000000 --- a/infer/modules/ipex/__init__.py +++ /dev/null @@ -1,190 +0,0 @@ -import os -import sys -import contextlib -import torch -import intel_extension_for_pytorch as ipex # pylint: disable=import-error, unused-import -from .hijacks import ipex_hijacks -from .attention import attention_init - -# pylint: disable=protected-access, missing-function-docstring, line-too-long - - -def ipex_init(): # pylint: disable=too-many-statements - try: - # Replace cuda with xpu: - torch.cuda.current_device = torch.xpu.current_device - torch.cuda.current_stream = torch.xpu.current_stream - torch.cuda.device = torch.xpu.device - torch.cuda.device_count = torch.xpu.device_count - torch.cuda.device_of = torch.xpu.device_of - torch.cuda.get_device_name = torch.xpu.get_device_name - torch.cuda.get_device_properties = torch.xpu.get_device_properties - torch.cuda.init = torch.xpu.init - torch.cuda.is_available = torch.xpu.is_available - torch.cuda.is_initialized = torch.xpu.is_initialized - torch.cuda.is_current_stream_capturing = lambda: False - torch.cuda.set_device = torch.xpu.set_device - torch.cuda.stream = torch.xpu.stream - torch.cuda.synchronize = torch.xpu.synchronize - torch.cuda.Event = torch.xpu.Event - torch.cuda.Stream = torch.xpu.Stream - torch.cuda.FloatTensor = torch.xpu.FloatTensor - torch.Tensor.cuda = torch.Tensor.xpu - torch.Tensor.is_cuda = torch.Tensor.is_xpu - torch.cuda._initialization_lock = torch.xpu.lazy_init._initialization_lock - torch.cuda._initialized = torch.xpu.lazy_init._initialized - torch.cuda._lazy_seed_tracker = torch.xpu.lazy_init._lazy_seed_tracker - torch.cuda._queued_calls = torch.xpu.lazy_init._queued_calls - torch.cuda._tls = torch.xpu.lazy_init._tls - torch.cuda.threading = torch.xpu.lazy_init.threading - torch.cuda.traceback = torch.xpu.lazy_init.traceback - torch.cuda.Optional = torch.xpu.Optional - torch.cuda.__cached__ = torch.xpu.__cached__ - torch.cuda.__loader__ = torch.xpu.__loader__ - torch.cuda.ComplexFloatStorage = torch.xpu.ComplexFloatStorage - torch.cuda.Tuple = torch.xpu.Tuple - torch.cuda.streams = torch.xpu.streams - torch.cuda._lazy_new = torch.xpu._lazy_new - torch.cuda.FloatStorage = torch.xpu.FloatStorage - torch.cuda.Any = torch.xpu.Any - torch.cuda.__doc__ = torch.xpu.__doc__ - torch.cuda.default_generators = torch.xpu.default_generators - torch.cuda.HalfTensor = torch.xpu.HalfTensor - torch.cuda._get_device_index = 
torch.xpu._get_device_index - torch.cuda.__path__ = torch.xpu.__path__ - torch.cuda.Device = torch.xpu.Device - torch.cuda.IntTensor = torch.xpu.IntTensor - torch.cuda.ByteStorage = torch.xpu.ByteStorage - torch.cuda.set_stream = torch.xpu.set_stream - torch.cuda.BoolStorage = torch.xpu.BoolStorage - torch.cuda.os = torch.xpu.os - torch.cuda.torch = torch.xpu.torch - torch.cuda.BFloat16Storage = torch.xpu.BFloat16Storage - torch.cuda.Union = torch.xpu.Union - torch.cuda.DoubleTensor = torch.xpu.DoubleTensor - torch.cuda.ShortTensor = torch.xpu.ShortTensor - torch.cuda.LongTensor = torch.xpu.LongTensor - torch.cuda.IntStorage = torch.xpu.IntStorage - torch.cuda.LongStorage = torch.xpu.LongStorage - torch.cuda.__annotations__ = torch.xpu.__annotations__ - torch.cuda.__package__ = torch.xpu.__package__ - torch.cuda.__builtins__ = torch.xpu.__builtins__ - torch.cuda.CharTensor = torch.xpu.CharTensor - torch.cuda.List = torch.xpu.List - torch.cuda._lazy_init = torch.xpu._lazy_init - torch.cuda.BFloat16Tensor = torch.xpu.BFloat16Tensor - torch.cuda.DoubleStorage = torch.xpu.DoubleStorage - torch.cuda.ByteTensor = torch.xpu.ByteTensor - torch.cuda.StreamContext = torch.xpu.StreamContext - torch.cuda.ComplexDoubleStorage = torch.xpu.ComplexDoubleStorage - torch.cuda.ShortStorage = torch.xpu.ShortStorage - torch.cuda._lazy_call = torch.xpu._lazy_call - torch.cuda.HalfStorage = torch.xpu.HalfStorage - torch.cuda.random = torch.xpu.random - torch.cuda._device = torch.xpu._device - torch.cuda.classproperty = torch.xpu.classproperty - torch.cuda.__name__ = torch.xpu.__name__ - torch.cuda._device_t = torch.xpu._device_t - torch.cuda.warnings = torch.xpu.warnings - torch.cuda.__spec__ = torch.xpu.__spec__ - torch.cuda.BoolTensor = torch.xpu.BoolTensor - torch.cuda.CharStorage = torch.xpu.CharStorage - torch.cuda.__file__ = torch.xpu.__file__ - torch.cuda._is_in_bad_fork = torch.xpu.lazy_init._is_in_bad_fork - # torch.cuda.is_current_stream_capturing = torch.xpu.is_current_stream_capturing - - # Memory: - torch.cuda.memory = torch.xpu.memory - if "linux" in sys.platform and "WSL2" in os.popen("uname -a").read(): - torch.xpu.empty_cache = lambda: None - torch.cuda.empty_cache = torch.xpu.empty_cache - torch.cuda.memory_stats = torch.xpu.memory_stats - torch.cuda.memory_summary = torch.xpu.memory_summary - torch.cuda.memory_snapshot = torch.xpu.memory_snapshot - torch.cuda.memory_allocated = torch.xpu.memory_allocated - torch.cuda.max_memory_allocated = torch.xpu.max_memory_allocated - torch.cuda.memory_reserved = torch.xpu.memory_reserved - torch.cuda.memory_cached = torch.xpu.memory_reserved - torch.cuda.max_memory_reserved = torch.xpu.max_memory_reserved - torch.cuda.max_memory_cached = torch.xpu.max_memory_reserved - torch.cuda.reset_peak_memory_stats = torch.xpu.reset_peak_memory_stats - torch.cuda.reset_max_memory_cached = torch.xpu.reset_peak_memory_stats - torch.cuda.reset_max_memory_allocated = torch.xpu.reset_peak_memory_stats - torch.cuda.memory_stats_as_nested_dict = torch.xpu.memory_stats_as_nested_dict - torch.cuda.reset_accumulated_memory_stats = ( - torch.xpu.reset_accumulated_memory_stats - ) - - # RNG: - torch.cuda.get_rng_state = torch.xpu.get_rng_state - torch.cuda.get_rng_state_all = torch.xpu.get_rng_state_all - torch.cuda.set_rng_state = torch.xpu.set_rng_state - torch.cuda.set_rng_state_all = torch.xpu.set_rng_state_all - torch.cuda.manual_seed = torch.xpu.manual_seed - torch.cuda.manual_seed_all = torch.xpu.manual_seed_all - torch.cuda.seed = torch.xpu.seed - torch.cuda.seed_all = 
torch.xpu.seed_all - torch.cuda.initial_seed = torch.xpu.initial_seed - - # AMP: - torch.cuda.amp = torch.xpu.amp - if not hasattr(torch.cuda.amp, "common"): - torch.cuda.amp.common = contextlib.nullcontext() - torch.cuda.amp.common.amp_definitely_not_available = lambda: False - try: - torch.cuda.amp.GradScaler = torch.xpu.amp.GradScaler - except Exception: # pylint: disable=broad-exception-caught - try: - from .gradscaler import ( - gradscaler_init, - ) # pylint: disable=import-outside-toplevel, import-error - - gradscaler_init() - torch.cuda.amp.GradScaler = torch.xpu.amp.GradScaler - except Exception: # pylint: disable=broad-exception-caught - torch.cuda.amp.GradScaler = ipex.cpu.autocast._grad_scaler.GradScaler - - # C - torch._C._cuda_getCurrentRawStream = ipex._C._getCurrentStream - ipex._C._DeviceProperties.major = 2023 - ipex._C._DeviceProperties.minor = 2 - - # Fix functions with ipex: - torch.cuda.mem_get_info = lambda device=None: [ - ( - torch.xpu.get_device_properties(device).total_memory - - torch.xpu.memory_allocated(device) - ), - torch.xpu.get_device_properties(device).total_memory, - ] - torch._utils._get_available_device_type = lambda: "xpu" - torch.has_cuda = True - torch.cuda.has_half = True - torch.cuda.is_bf16_supported = lambda *args, **kwargs: True - torch.cuda.is_fp16_supported = lambda *args, **kwargs: True - torch.version.cuda = "11.7" - torch.cuda.get_device_capability = lambda *args, **kwargs: [11, 7] - torch.cuda.get_device_properties.major = 11 - torch.cuda.get_device_properties.minor = 7 - torch.cuda.ipc_collect = lambda *args, **kwargs: None - torch.cuda.utilization = lambda *args, **kwargs: 0 - if hasattr(torch.xpu, "getDeviceIdListForCard"): - torch.cuda.getDeviceIdListForCard = torch.xpu.getDeviceIdListForCard - torch.cuda.get_device_id_list_per_card = torch.xpu.getDeviceIdListForCard - else: - torch.cuda.getDeviceIdListForCard = torch.xpu.get_device_id_list_per_card - torch.cuda.get_device_id_list_per_card = ( - torch.xpu.get_device_id_list_per_card - ) - - ipex_hijacks() - attention_init() - try: - from .diffusers import ipex_diffusers - - ipex_diffusers() - except Exception: # pylint: disable=broad-exception-caught - pass - except Exception as e: - return False, e - return True, None diff --git a/infer/modules/ipex/attention.py b/infer/modules/ipex/attention.py deleted file mode 100644 index 78a4775ccf95ded03a953e07e5ffccc7bb4f29b5..0000000000000000000000000000000000000000 --- a/infer/modules/ipex/attention.py +++ /dev/null @@ -1,218 +0,0 @@ -import torch -import intel_extension_for_pytorch as ipex # pylint: disable=import-error, unused-import - -# pylint: disable=protected-access, missing-function-docstring, line-too-long - -original_torch_bmm = torch.bmm - - -def torch_bmm(input, mat2, *, out=None): - if input.dtype != mat2.dtype: - mat2 = mat2.to(input.dtype) - - # ARC GPUs can't allocate more than 4GB to a single block, Slice it: - batch_size_attention, input_tokens, mat2_shape = ( - input.shape[0], - input.shape[1], - mat2.shape[2], - ) - block_multiply = input.element_size() - slice_block_size = input_tokens * mat2_shape / 1024 / 1024 * block_multiply - block_size = batch_size_attention * slice_block_size - - split_slice_size = batch_size_attention - if block_size > 4: - do_split = True - # Find something divisible with the input_tokens - while (split_slice_size * slice_block_size) > 4: - split_slice_size = split_slice_size // 2 - if split_slice_size <= 1: - split_slice_size = 1 - break - else: - do_split = False - - split_2_slice_size = 
input_tokens - if split_slice_size * slice_block_size > 4: - slice_block_size2 = split_slice_size * mat2_shape / 1024 / 1024 * block_multiply - do_split_2 = True - # Find something divisible with the input_tokens - while (split_2_slice_size * slice_block_size2) > 4: - split_2_slice_size = split_2_slice_size // 2 - if split_2_slice_size <= 1: - split_2_slice_size = 1 - break - else: - do_split_2 = False - - if do_split: - hidden_states = torch.zeros( - input.shape[0], - input.shape[1], - mat2.shape[2], - device=input.device, - dtype=input.dtype, - ) - for i in range(batch_size_attention // split_slice_size): - start_idx = i * split_slice_size - end_idx = (i + 1) * split_slice_size - if do_split_2: - for i2 in range( - input_tokens // split_2_slice_size - ): # pylint: disable=invalid-name - start_idx_2 = i2 * split_2_slice_size - end_idx_2 = (i2 + 1) * split_2_slice_size - hidden_states[start_idx:end_idx, start_idx_2:end_idx_2] = ( - original_torch_bmm( - input[start_idx:end_idx, start_idx_2:end_idx_2], - mat2[start_idx:end_idx, start_idx_2:end_idx_2], - out=out, - ) - ) - else: - hidden_states[start_idx:end_idx] = original_torch_bmm( - input[start_idx:end_idx], mat2[start_idx:end_idx], out=out - ) - else: - return original_torch_bmm(input, mat2, out=out) - return hidden_states - - -original_scaled_dot_product_attention = torch.nn.functional.scaled_dot_product_attention - - -def scaled_dot_product_attention( - query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False -): - # ARC GPUs can't allocate more than 4GB to a single block, Slice it: - if len(query.shape) == 3: - batch_size_attention, query_tokens, shape_four = query.shape - shape_one = 1 - no_shape_one = True - else: - shape_one, batch_size_attention, query_tokens, shape_four = query.shape - no_shape_one = False - - block_multiply = query.element_size() - slice_block_size = ( - shape_one * query_tokens * shape_four / 1024 / 1024 * block_multiply - ) - block_size = batch_size_attention * slice_block_size - - split_slice_size = batch_size_attention - if block_size > 4: - do_split = True - # Find something divisible with the shape_one - while (split_slice_size * slice_block_size) > 4: - split_slice_size = split_slice_size // 2 - if split_slice_size <= 1: - split_slice_size = 1 - break - else: - do_split = False - - split_2_slice_size = query_tokens - if split_slice_size * slice_block_size > 4: - slice_block_size2 = ( - shape_one * split_slice_size * shape_four / 1024 / 1024 * block_multiply - ) - do_split_2 = True - # Find something divisible with the batch_size_attention - while (split_2_slice_size * slice_block_size2) > 4: - split_2_slice_size = split_2_slice_size // 2 - if split_2_slice_size <= 1: - split_2_slice_size = 1 - break - else: - do_split_2 = False - - if do_split: - hidden_states = torch.zeros(query.shape, device=query.device, dtype=query.dtype) - for i in range(batch_size_attention // split_slice_size): - start_idx = i * split_slice_size - end_idx = (i + 1) * split_slice_size - if do_split_2: - for i2 in range( - query_tokens // split_2_slice_size - ): # pylint: disable=invalid-name - start_idx_2 = i2 * split_2_slice_size - end_idx_2 = (i2 + 1) * split_2_slice_size - if no_shape_one: - hidden_states[start_idx:end_idx, start_idx_2:end_idx_2] = ( - original_scaled_dot_product_attention( - query[start_idx:end_idx, start_idx_2:end_idx_2], - key[start_idx:end_idx, start_idx_2:end_idx_2], - value[start_idx:end_idx, start_idx_2:end_idx_2], - attn_mask=( - attn_mask[start_idx:end_idx, start_idx_2:end_idx_2] - if attn_mask 
is not None - else attn_mask - ), - dropout_p=dropout_p, - is_causal=is_causal, - ) - ) - else: - hidden_states[:, start_idx:end_idx, start_idx_2:end_idx_2] = ( - original_scaled_dot_product_attention( - query[:, start_idx:end_idx, start_idx_2:end_idx_2], - key[:, start_idx:end_idx, start_idx_2:end_idx_2], - value[:, start_idx:end_idx, start_idx_2:end_idx_2], - attn_mask=( - attn_mask[ - :, start_idx:end_idx, start_idx_2:end_idx_2 - ] - if attn_mask is not None - else attn_mask - ), - dropout_p=dropout_p, - is_causal=is_causal, - ) - ) - else: - if no_shape_one: - hidden_states[start_idx:end_idx] = ( - original_scaled_dot_product_attention( - query[start_idx:end_idx], - key[start_idx:end_idx], - value[start_idx:end_idx], - attn_mask=( - attn_mask[start_idx:end_idx] - if attn_mask is not None - else attn_mask - ), - dropout_p=dropout_p, - is_causal=is_causal, - ) - ) - else: - hidden_states[:, start_idx:end_idx] = ( - original_scaled_dot_product_attention( - query[:, start_idx:end_idx], - key[:, start_idx:end_idx], - value[:, start_idx:end_idx], - attn_mask=( - attn_mask[:, start_idx:end_idx] - if attn_mask is not None - else attn_mask - ), - dropout_p=dropout_p, - is_causal=is_causal, - ) - ) - else: - return original_scaled_dot_product_attention( - query, - key, - value, - attn_mask=attn_mask, - dropout_p=dropout_p, - is_causal=is_causal, - ) - return hidden_states - - -def attention_init(): - # ARC GPUs can't allocate more than 4GB to a single block: - torch.bmm = torch_bmm - torch.nn.functional.scaled_dot_product_attention = scaled_dot_product_attention diff --git a/infer/modules/ipex/gradscaler.py b/infer/modules/ipex/gradscaler.py deleted file mode 100644 index 7875151d17c390aca2f8116293c63b0879b7d4c4..0000000000000000000000000000000000000000 --- a/infer/modules/ipex/gradscaler.py +++ /dev/null @@ -1,187 +0,0 @@ -from collections import defaultdict -import torch -import intel_extension_for_pytorch as ipex # pylint: disable=import-error, unused-import -import intel_extension_for_pytorch._C as core # pylint: disable=import-error, unused-import - -# pylint: disable=protected-access, missing-function-docstring, line-too-long - -OptState = ipex.cpu.autocast._grad_scaler.OptState -_MultiDeviceReplicator = ipex.cpu.autocast._grad_scaler._MultiDeviceReplicator -_refresh_per_optimizer_state = ( - ipex.cpu.autocast._grad_scaler._refresh_per_optimizer_state -) - - -def _unscale_grads_( - self, optimizer, inv_scale, found_inf, allow_fp16 -): # pylint: disable=unused-argument - per_device_inv_scale = _MultiDeviceReplicator(inv_scale) - per_device_found_inf = _MultiDeviceReplicator(found_inf) - - # To set up _amp_foreach_non_finite_check_and_unscale_, split grads by device and dtype. - # There could be hundreds of grads, so we'd like to iterate through them just once. - # However, we don't know their devices or dtypes in advance. - - # https://stackoverflow.com/questions/5029934/defaultdict-of-defaultdict - # Google says mypy struggles with defaultdicts type annotations. 
- per_device_and_dtype_grads = defaultdict(lambda: defaultdict(list)) # type: ignore[var-annotated] - # sync grad to master weight - if hasattr(optimizer, "sync_grad"): - optimizer.sync_grad() - with torch.no_grad(): - for group in optimizer.param_groups: - for param in group["params"]: - if param.grad is None: - continue - if (not allow_fp16) and param.grad.dtype == torch.float16: - raise ValueError("Attempting to unscale FP16 gradients.") - if param.grad.is_sparse: - # is_coalesced() == False means the sparse grad has values with duplicate indices. - # coalesce() deduplicates indices and adds all values that have the same index. - # For scaled fp16 values, there's a good chance coalescing will cause overflow, - # so we should check the coalesced _values(). - if param.grad.dtype is torch.float16: - param.grad = param.grad.coalesce() - to_unscale = param.grad._values() - else: - to_unscale = param.grad - - # -: is there a way to split by device and dtype without appending in the inner loop? - to_unscale = to_unscale.to("cpu") - per_device_and_dtype_grads[to_unscale.device][to_unscale.dtype].append( - to_unscale - ) - - for _, per_dtype_grads in per_device_and_dtype_grads.items(): - for grads in per_dtype_grads.values(): - core._amp_foreach_non_finite_check_and_unscale_( - grads, - per_device_found_inf.get("cpu"), - per_device_inv_scale.get("cpu"), - ) - - return per_device_found_inf._per_device_tensors - - -def unscale_(self, optimizer): - """ - Divides ("unscales") the optimizer's gradient tensors by the scale factor. - :meth:`unscale_` is optional, serving cases where you need to - :ref:`modify or inspect gradients` - between the backward pass(es) and :meth:`step`. - If :meth:`unscale_` is not called explicitly, gradients will be unscaled automatically during :meth:`step`. - Simple example, using :meth:`unscale_` to enable clipping of unscaled gradients:: - ... - scaler.scale(loss).backward() - scaler.unscale_(optimizer) - torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm) - scaler.step(optimizer) - scaler.update() - Args: - optimizer (torch.optim.Optimizer): Optimizer that owns the gradients to be unscaled. - .. warning:: - :meth:`unscale_` should only be called once per optimizer per :meth:`step` call, - and only after all gradients for that optimizer's assigned parameters have been accumulated. - Calling :meth:`unscale_` twice for a given optimizer between each :meth:`step` triggers a RuntimeError. - .. warning:: - :meth:`unscale_` may unscale sparse gradients out of place, replacing the ``.grad`` attribute. - """ - if not self._enabled: - return - - self._check_scale_growth_tracker("unscale_") - - optimizer_state = self._per_optimizer_states[id(optimizer)] - - if optimizer_state["stage"] is OptState.UNSCALED: # pylint: disable=no-else-raise - raise RuntimeError( - "unscale_() has already been called on this optimizer since the last update()." - ) - elif optimizer_state["stage"] is OptState.STEPPED: - raise RuntimeError("unscale_() is being called after step().") - - # FP32 division can be imprecise for certain compile options, so we carry out the reciprocal in FP64. 
- assert self._scale is not None - inv_scale = ( - self._scale.to("cpu").double().reciprocal().float().to(self._scale.device) - ) - found_inf = torch.full((1,), 0.0, dtype=torch.float32, device=self._scale.device) - - optimizer_state["found_inf_per_device"] = self._unscale_grads_( - optimizer, inv_scale, found_inf, False - ) - optimizer_state["stage"] = OptState.UNSCALED - - -def update(self, new_scale=None): - """ - Updates the scale factor. - If any optimizer steps were skipped the scale is multiplied by ``backoff_factor`` - to reduce it. If ``growth_interval`` unskipped iterations occurred consecutively, - the scale is multiplied by ``growth_factor`` to increase it. - Passing ``new_scale`` sets the new scale value manually. (``new_scale`` is not - used directly, it's used to fill GradScaler's internal scale tensor. So if - ``new_scale`` was a tensor, later in-place changes to that tensor will not further - affect the scale GradScaler uses internally.) - Args: - new_scale (float or :class:`torch.FloatTensor`, optional, default=None): New scale factor. - .. warning:: - :meth:`update` should only be called at the end of the iteration, after ``scaler.step(optimizer)`` has - been invoked for all optimizers used this iteration. - """ - if not self._enabled: - return - - _scale, _growth_tracker = self._check_scale_growth_tracker("update") - - if new_scale is not None: - # Accept a new user-defined scale. - if isinstance(new_scale, float): - self._scale.fill_(new_scale) # type: ignore[union-attr] - else: - reason = "new_scale should be a float or a 1-element torch.FloatTensor with requires_grad=False." - assert isinstance(new_scale, torch.FloatTensor), reason # type: ignore[attr-defined] - assert new_scale.numel() == 1, reason - assert new_scale.requires_grad is False, reason - self._scale.copy_(new_scale) # type: ignore[union-attr] - else: - # Consume shared inf/nan data collected from optimizers to update the scale. - # If all found_inf tensors are on the same device as self._scale, this operation is asynchronous. - found_infs = [ - found_inf.to(device="cpu", non_blocking=True) - for state in self._per_optimizer_states.values() - for found_inf in state["found_inf_per_device"].values() - ] - - assert len(found_infs) > 0, "No inf checks were recorded prior to update." - - found_inf_combined = found_infs[0] - if len(found_infs) > 1: - for i in range(1, len(found_infs)): - found_inf_combined += found_infs[i] - - to_device = _scale.device - _scale = _scale.to("cpu") - _growth_tracker = _growth_tracker.to("cpu") - - core._amp_update_scale_( - _scale, - _growth_tracker, - found_inf_combined, - self._growth_factor, - self._backoff_factor, - self._growth_interval, - ) - - _scale = _scale.to(to_device) - _growth_tracker = _growth_tracker.to(to_device) - # To prepare for next iteration, clear the data collected from optimizers this iteration. 
- self._per_optimizer_states = defaultdict(_refresh_per_optimizer_state) - - -def gradscaler_init(): - torch.xpu.amp.GradScaler = ipex.cpu.autocast._grad_scaler.GradScaler - torch.xpu.amp.GradScaler._unscale_grads_ = _unscale_grads_ - torch.xpu.amp.GradScaler.unscale_ = unscale_ - torch.xpu.amp.GradScaler.update = update - return torch.xpu.amp.GradScaler diff --git a/infer/modules/ipex/hijacks.py b/infer/modules/ipex/hijacks.py deleted file mode 100644 index fc75f0c7cbfa41f145db95a05296f0668400e981..0000000000000000000000000000000000000000 --- a/infer/modules/ipex/hijacks.py +++ /dev/null @@ -1,365 +0,0 @@ -import contextlib -import importlib -import torch -import intel_extension_for_pytorch as ipex # pylint: disable=import-error, unused-import - -# pylint: disable=protected-access, missing-function-docstring, line-too-long, unnecessary-lambda, no-else-return - - -class CondFunc: # pylint: disable=missing-class-docstring - def __new__(cls, orig_func, sub_func, cond_func): - self = super(CondFunc, cls).__new__(cls) - if isinstance(orig_func, str): - func_path = orig_func.split(".") - for i in range(len(func_path) - 1, -1, -1): - try: - resolved_obj = importlib.import_module(".".join(func_path[:i])) - break - except ImportError: - pass - for attr_name in func_path[i:-1]: - resolved_obj = getattr(resolved_obj, attr_name) - orig_func = getattr(resolved_obj, func_path[-1]) - setattr( - resolved_obj, - func_path[-1], - lambda *args, **kwargs: self(*args, **kwargs), - ) - self.__init__(orig_func, sub_func, cond_func) - return lambda *args, **kwargs: self(*args, **kwargs) - - def __init__(self, orig_func, sub_func, cond_func): - self.__orig_func = orig_func - self.__sub_func = sub_func - self.__cond_func = cond_func - - def __call__(self, *args, **kwargs): - if not self.__cond_func or self.__cond_func(self.__orig_func, *args, **kwargs): - return self.__sub_func(self.__orig_func, *args, **kwargs) - else: - return self.__orig_func(*args, **kwargs) - - -_utils = torch.utils.data._utils - - -def _shutdown_workers(self): - if ( - torch.utils.data._utils is None - or torch.utils.data._utils.python_exit_status is True - or torch.utils.data._utils.python_exit_status is None - ): - return - if hasattr(self, "_shutdown") and not self._shutdown: - self._shutdown = True - try: - if hasattr(self, "_pin_memory_thread"): - self._pin_memory_thread_done_event.set() - self._worker_result_queue.put((None, None)) - self._pin_memory_thread.join() - self._worker_result_queue.cancel_join_thread() - self._worker_result_queue.close() - self._workers_done_event.set() - for worker_id in range(len(self._workers)): - if self._persistent_workers or self._workers_status[worker_id]: - self._mark_worker_as_unavailable(worker_id, shutdown=True) - for w in self._workers: # pylint: disable=invalid-name - w.join(timeout=torch.utils.data._utils.MP_STATUS_CHECK_INTERVAL) - for q in self._index_queues: # pylint: disable=invalid-name - q.cancel_join_thread() - q.close() - finally: - if self._worker_pids_set: - torch.utils.data._utils.signal_handling._remove_worker_pids(id(self)) - self._worker_pids_set = False - for w in self._workers: # pylint: disable=invalid-name - if w.is_alive(): - w.terminate() - - -class DummyDataParallel( - torch.nn.Module -): # pylint: disable=missing-class-docstring, unused-argument, too-few-public-methods - def __new__( - cls, module, device_ids=None, output_device=None, dim=0 - ): # pylint: disable=unused-argument - if isinstance(device_ids, list) and len(device_ids) > 1: - print("IPEX backend doesn't support 
DataParallel on multiple XPU devices") - return module.to("xpu") - - -def return_null_context(*args, **kwargs): # pylint: disable=unused-argument - return contextlib.nullcontext() - - -def check_device(device): - return bool( - (isinstance(device, torch.device) and device.type == "cuda") - or (isinstance(device, str) and "cuda" in device) - or isinstance(device, int) - ) - - -def return_xpu(device): - return ( - f"xpu:{device[-1]}" - if isinstance(device, str) and ":" in device - else ( - f"xpu:{device}" - if isinstance(device, int) - else torch.device("xpu") if isinstance(device, torch.device) else "xpu" - ) - ) - - -def ipex_no_cuda(orig_func, *args, **kwargs): - torch.cuda.is_available = lambda: False - orig_func(*args, **kwargs) - torch.cuda.is_available = torch.xpu.is_available - - -original_autocast = torch.autocast - - -def ipex_autocast(*args, **kwargs): - if len(args) > 0 and args[0] == "cuda": - return original_autocast("xpu", *args[1:], **kwargs) - else: - return original_autocast(*args, **kwargs) - - -original_torch_cat = torch.cat - - -def torch_cat(tensor, *args, **kwargs): - if len(tensor) == 3 and ( - tensor[0].dtype != tensor[1].dtype or tensor[2].dtype != tensor[1].dtype - ): - return original_torch_cat( - [tensor[0].to(tensor[1].dtype), tensor[1], tensor[2].to(tensor[1].dtype)], - *args, - **kwargs, - ) - else: - return original_torch_cat(tensor, *args, **kwargs) - - -original_interpolate = torch.nn.functional.interpolate - - -def interpolate( - tensor, - size=None, - scale_factor=None, - mode="nearest", - align_corners=None, - recompute_scale_factor=None, - antialias=False, -): # pylint: disable=too-many-arguments - if antialias or align_corners is not None: - return_device = tensor.device - return_dtype = tensor.dtype - return original_interpolate( - tensor.to("cpu", dtype=torch.float32), - size=size, - scale_factor=scale_factor, - mode=mode, - align_corners=align_corners, - recompute_scale_factor=recompute_scale_factor, - antialias=antialias, - ).to(return_device, dtype=return_dtype) - else: - return original_interpolate( - tensor, - size=size, - scale_factor=scale_factor, - mode=mode, - align_corners=align_corners, - recompute_scale_factor=recompute_scale_factor, - antialias=antialias, - ) - - -original_linalg_solve = torch.linalg.solve - - -def linalg_solve(A, B, *args, **kwargs): # pylint: disable=invalid-name - if A.device != torch.device("cpu") or B.device != torch.device("cpu"): - return_device = A.device - return original_linalg_solve(A.to("cpu"), B.to("cpu"), *args, **kwargs).to( - return_device - ) - else: - return original_linalg_solve(A, B, *args, **kwargs) - - -def ipex_hijacks(): - CondFunc( - "torch.Tensor.to", - lambda orig_func, self, device=None, *args, **kwargs: orig_func( - self, return_xpu(device), *args, **kwargs - ), - lambda orig_func, self, device=None, *args, **kwargs: check_device(device), - ) - CondFunc( - "torch.Tensor.cuda", - lambda orig_func, self, device=None, *args, **kwargs: orig_func( - self, return_xpu(device), *args, **kwargs - ), - lambda orig_func, self, device=None, *args, **kwargs: check_device(device), - ) - CondFunc( - "torch.empty", - lambda orig_func, *args, device=None, **kwargs: orig_func( - *args, device=return_xpu(device), **kwargs - ), - lambda orig_func, *args, device=None, **kwargs: check_device(device), - ) - CondFunc( - "torch.load", - lambda orig_func, *args, map_location=None, **kwargs: orig_func( - *args, return_xpu(map_location), **kwargs - ), - lambda orig_func, *args, map_location=None, **kwargs: map_location 
is None - or check_device(map_location), - ) - CondFunc( - "torch.randn", - lambda orig_func, *args, device=None, **kwargs: orig_func( - *args, device=return_xpu(device), **kwargs - ), - lambda orig_func, *args, device=None, **kwargs: check_device(device), - ) - CondFunc( - "torch.ones", - lambda orig_func, *args, device=None, **kwargs: orig_func( - *args, device=return_xpu(device), **kwargs - ), - lambda orig_func, *args, device=None, **kwargs: check_device(device), - ) - CondFunc( - "torch.zeros", - lambda orig_func, *args, device=None, **kwargs: orig_func( - *args, device=return_xpu(device), **kwargs - ), - lambda orig_func, *args, device=None, **kwargs: check_device(device), - ) - CondFunc( - "torch.tensor", - lambda orig_func, *args, device=None, **kwargs: orig_func( - *args, device=return_xpu(device), **kwargs - ), - lambda orig_func, *args, device=None, **kwargs: check_device(device), - ) - CondFunc( - "torch.linspace", - lambda orig_func, *args, device=None, **kwargs: orig_func( - *args, device=return_xpu(device), **kwargs - ), - lambda orig_func, *args, device=None, **kwargs: check_device(device), - ) - - CondFunc( - "torch.Generator", - lambda orig_func, device=None: torch.xpu.Generator(device), - lambda orig_func, device=None: device is not None - and device != torch.device("cpu") - and device != "cpu", - ) - - CondFunc( - "torch.batch_norm", - lambda orig_func, input, weight, bias, *args, **kwargs: orig_func( - input, - ( - weight - if weight is not None - else torch.ones(input.size()[1], device=input.device) - ), - ( - bias - if bias is not None - else torch.zeros(input.size()[1], device=input.device) - ), - *args, - **kwargs, - ), - lambda orig_func, input, *args, **kwargs: input.device != torch.device("cpu"), - ) - CondFunc( - "torch.instance_norm", - lambda orig_func, input, weight, bias, *args, **kwargs: orig_func( - input, - ( - weight - if weight is not None - else torch.ones(input.size()[1], device=input.device) - ), - ( - bias - if bias is not None - else torch.zeros(input.size()[1], device=input.device) - ), - *args, - **kwargs, - ), - lambda orig_func, input, *args, **kwargs: input.device != torch.device("cpu"), - ) - - # Functions with dtype errors: - CondFunc( - "torch.nn.modules.GroupNorm.forward", - lambda orig_func, self, input: orig_func( - self, input.to(self.weight.data.dtype) - ), - lambda orig_func, self, input: input.dtype != self.weight.data.dtype, - ) - CondFunc( - "torch.nn.modules.linear.Linear.forward", - lambda orig_func, self, input: orig_func( - self, input.to(self.weight.data.dtype) - ), - lambda orig_func, self, input: input.dtype != self.weight.data.dtype, - ) - CondFunc( - "torch.nn.modules.conv.Conv2d.forward", - lambda orig_func, self, input: orig_func( - self, input.to(self.weight.data.dtype) - ), - lambda orig_func, self, input: input.dtype != self.weight.data.dtype, - ) - CondFunc( - "torch.nn.functional.layer_norm", - lambda orig_func, input, normalized_shape=None, weight=None, *args, **kwargs: orig_func( - input.to(weight.data.dtype), normalized_shape, weight, *args, **kwargs - ), - lambda orig_func, input, normalized_shape=None, weight=None, *args, **kwargs: weight - is not None - and input.dtype != weight.data.dtype, - ) - - # Diffusers Float64 (ARC GPUs doesn't support double or Float64): - if not torch.xpu.has_fp64_dtype(): - CondFunc( - "torch.from_numpy", - lambda orig_func, ndarray: orig_func(ndarray.astype("float32")), - lambda orig_func, ndarray: ndarray.dtype == float, - ) - - # Broken functions when torch.cuda.is_available is 
True:
-    CondFunc(
-        "torch.utils.data.dataloader._BaseDataLoaderIter.__init__",
-        lambda orig_func, *args, **kwargs: ipex_no_cuda(orig_func, *args, **kwargs),
-        lambda orig_func, *args, **kwargs: True,
-    )
-
-    # Functions that make compile mad with CondFunc:
-    torch.utils.data.dataloader._MultiProcessingDataLoaderIter._shutdown_workers = (
-        _shutdown_workers
-    )
-    torch.nn.DataParallel = DummyDataParallel
-    torch.autocast = ipex_autocast
-    torch.cat = torch_cat
-    torch.linalg.solve = linalg_solve
-    torch.nn.functional.interpolate = interpolate
-    torch.backends.cuda.sdp_kernel = return_null_context
diff --git a/infer/modules/onnx/export.py b/infer/modules/onnx/export.py
deleted file mode 100644
index ed4a4162ff04b7e12642fcbe96847f8ea9db06aa..0000000000000000000000000000000000000000
--- a/infer/modules/onnx/export.py
+++ /dev/null
@@ -1,52 +0,0 @@
-import torch
-
-from infer.lib.infer_pack.models_onnx import SynthesizerTrnMsNSFsidM
-
-
-def export_onnx(ModelPath, ExportedPath):
-    cpt = torch.load(ModelPath, map_location="cpu")
-    cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]
-    vec_channels = 256 if cpt.get("version", "v1") == "v1" else 768
-
-    test_phone = torch.rand(1, 200, vec_channels)  # hidden unit
-    test_phone_lengths = torch.tensor([200]).long()  # hidden unit length (seemingly unused)
-    test_pitch = torch.randint(size=(1, 200), low=5, high=255)  # fundamental frequency (Hz)
-    test_pitchf = torch.rand(1, 200)  # NSF fundamental frequency
-    test_ds = torch.LongTensor([0])  # speaker ID
-    test_rnd = torch.rand(1, 192, 200)  # noise (adds a random factor)
-
-    device = "cpu"  # device used for export (does not affect inference)
-
-    net_g = SynthesizerTrnMsNSFsidM(
-        *cpt["config"], is_half=False, version=cpt.get("version", "v1")
-    )  # fp32 export (fp16 support in C++ would require manually rearranging memory, so fp16 is skipped for now)
-    net_g.load_state_dict(cpt["weight"], strict=False)
-    input_names = ["phone", "phone_lengths", "pitch", "pitchf", "ds", "rnd"]
-    output_names = [
-        "audio",
-    ]
-    # net_g.construct_spkmixmap(n_speaker)  # multi-speaker mix-track export
-    torch.onnx.export(
-        net_g,
-        (
-            test_phone.to(device),
-            test_phone_lengths.to(device),
-            test_pitch.to(device),
-            test_pitchf.to(device),
-            test_ds.to(device),
-            test_rnd.to(device),
-        ),
-        ExportedPath,
-        dynamic_axes={
-            "phone": [1],
-            "pitch": [1],
-            "pitchf": [1],
-            "rnd": [2],
-        },
-        do_constant_folding=False,
-        opset_version=13,
-        verbose=False,
-        input_names=input_names,
-        output_names=output_names,
-    )
-    return "Finished"
diff --git a/infer/modules/train/extract/extract_f0_print.py b/infer/modules/train/extract/extract_f0_print.py
index 9d231e4e86db204704ce894a2b12ebad38665064..2aa7a789ac1818f21fd94299034fdffb447f2fb1 100644
--- a/infer/modules/train/extract/extract_f0_print.py
+++ b/infer/modules/train/extract/extract_f0_print.py
@@ -15,6 +15,7 @@ from infer.lib.audio import load_audio
 logging.getLogger("numba").setLevel(logging.WARNING)

 from multiprocessing import Process
+from model import rmvpe

 exp_dir = sys.argv[1]
 f = open("%s/extract_f0_feature.log" % exp_dir, "a+")
@@ -83,12 +84,7 @@ class FeatureInput(object):
             f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.fs)
         elif f0_method == "rmvpe":
             if hasattr(self, "model_rmvpe") == False:
-                from infer.lib.rmvpe import RMVPE
-
-                print("Loading rmvpe model")
-                self.model_rmvpe = RMVPE(
-                    "assets/rmvpe/rmvpe.pt", is_half=False, device="cpu"
-                )
+                self.model_rmvpe = rmvpe
             f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)
         return f0
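Editor's note (not part of the diff): the hunk above and the extraction hunks that follow replace per-script model loading with imports from a shared `model` module (`from model import rmvpe`, `from model import rmvpe, fp16`, and later `from model import hubert, hubert_cfg, device, fp16`). That module is not shown in this diff, so the sketch below is only a guess at the minimal shape those imports imply, reconstructed from the loading code being deleted here; the device/precision policy and the asset paths are assumptions, not the actual file contents.

# model.py -- hypothetical sketch, reconstructed from the deleted loaders
import torch
from fairseq import checkpoint_utils
from infer.lib.rmvpe import RMVPE

# assumed device/precision policy (the deleted code used cpu/cuda/mps variants)
device = "cuda" if torch.cuda.is_available() else "cpu"
fp16 = device == "cuda"  # assumption: half precision only when CUDA is available

# one RMVPE pitch estimator shared by all extraction scripts
# (each deleted hunk previously constructed its own instance lazily)
rmvpe = RMVPE("assets/rmvpe/rmvpe.pt", is_half=fp16, device=device)

# one shared HuBERT feature extractor, loaded the same way the deleted
# extract_feature_print.py code did
models, hubert_cfg, _task = checkpoint_utils.load_model_ensemble_and_task(
    ["assets/hubert/hubert_base.pt"], suffix=""
)
hubert = models[0].to(device)
if fp16:
    hubert = hubert.half()
hubert.eval()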
diff --git a/infer/modules/train/extract/extract_f0_rmvpe.py b/infer/modules/train/extract/extract_f0_rmvpe.py
index 358bc8cad72c58202ba186d65f6cb925ddd76dc3..114bfd5f57ef7412991dc2236cf302e73a415923 100644
--- a/infer/modules/train/extract/extract_f0_rmvpe.py
+++ b/infer/modules/train/extract/extract_f0_rmvpe.py
@@ -6,17 +6,18 @@ import logging

 import numpy as np

 from infer.lib.audio import load_audio
+from model import rmvpe, fp16

 logging.getLogger("numba").setLevel(logging.WARNING)


 class FeatureInput(object):
-    def __init__(self, exp_dir, samplerate=16000, hop_size=160, is_half=False):
+    def __init__(self, exp_dir, samplerate=16000, hop_size=160):
         self.exp_dir = exp_dir
         self.logfile = open("%s/extract_f0_feature.log" % exp_dir, "a+")
         self.fs = samplerate
         self.hop = hop_size
-        self.is_half = is_half
+        self.is_half = fp16

         self.f0_bin = 256
         self.f0_max = 1100.0
@@ -34,12 +35,7 @@ class FeatureInput(object):
         # p_len = x.shape[0] // self.hop
         if f0_method == "rmvpe":
             if hasattr(self, "model_rmvpe") == False:
-                from infer.lib.rmvpe import RMVPE
-
-                print("Loading rmvpe model")
-                self.model_rmvpe = RMVPE(
-                    "assets/rmvpe/rmvpe.pt", is_half=self.is_half, device="cuda"
-                )
+                self.model_rmvpe = rmvpe
             f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)
         return f0
diff --git a/infer/modules/train/extract/extract_f0_rmvpe_dml.py b/infer/modules/train/extract/extract_f0_rmvpe_dml.py
deleted file mode 100644
index 243e825005bd46dfd464f6d49ecf78f0abf03dc2..0000000000000000000000000000000000000000
--- a/infer/modules/train/extract/extract_f0_rmvpe_dml.py
+++ /dev/null
@@ -1,139 +0,0 @@
-import os
-import sys
-import traceback
-
-import parselmouth
-
-now_dir = os.getcwd()
-sys.path.append(now_dir)
-import logging
-
-import numpy as np
-import pyworld
-
-from infer.lib.audio import load_audio
-
-logging.getLogger("numba").setLevel(logging.WARNING)
-
-exp_dir = sys.argv[1]
-import torch_directml
-
-device = torch_directml.device(torch_directml.default_device())
-f = open("%s/extract_f0_feature.log" % exp_dir, "a+")
-
-
-def printt(strr):
-    print(strr)
-    f.write("%s\n" % strr)
-    f.flush()
-
-
-class FeatureInput(object):
-    def __init__(self, samplerate=16000, hop_size=160):
-        self.fs = samplerate
-        self.hop = hop_size
-
-        self.f0_bin = 256
-        self.f0_max = 1100.0
-        self.f0_min = 50.0
-        self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700)
-        self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700)
-
-    def compute_f0(self, path, f0_method):
-        x = load_audio(path, self.fs)
-        # p_len = x.shape[0] // self.hop
-        if f0_method == "rmvpe":
-            if hasattr(self, "model_rmvpe") == False:
-                from infer.lib.rmvpe import RMVPE
-
-                print("Loading rmvpe model")
-                self.model_rmvpe = RMVPE(
-                    "assets/rmvpe/rmvpe.pt", is_half=False, device=device
-                )
-            f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)
-        return f0
-
-    def coarse_f0(self, f0):
-        f0_mel = 1127 * np.log(1 + f0 / 700)
-        f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * (
-            self.f0_bin - 2
-        ) / (self.f0_mel_max - self.f0_mel_min) + 1
-
-        # use 0 or 1
-        f0_mel[f0_mel <= 1] = 1
-        f0_mel[f0_mel > self.f0_bin - 1] = self.f0_bin - 1
-        f0_coarse = np.rint(f0_mel).astype(int)
-        assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, (
-            f0_coarse.max(),
-            f0_coarse.min(),
-        )
-        return f0_coarse
-
-    def go(self, paths, f0_method):
-        if len(paths) == 0:
-            printt("no-f0-todo")
-        else:
-            printt("todo-f0-%s" % len(paths))
-            n = max(len(paths) // 5, 1)  # print at most 5 progress lines per process
-            for idx, (inp_path, opt_path1, opt_path2) in enumerate(paths):
-                try:
-                    if idx % n == 0:
-                        printt("f0ing,now-%s,all-%s,-%s" % (idx, len(paths), inp_path))
-                    if (
-                        os.path.exists(opt_path1 + ".npy") == True
-                        and os.path.exists(opt_path2 + ".npy") == True
-                    ):
-                        continue
-                    featur_pit = 
self.compute_f0(inp_path, f0_method) - np.save( - opt_path2, - featur_pit, - allow_pickle=False, - ) # nsf - coarse_pit = self.coarse_f0(featur_pit) - np.save( - opt_path1, - coarse_pit, - allow_pickle=False, - ) # ori - except: - printt("f0fail-%s-%s-%s" % (idx, inp_path, traceback.format_exc())) - - -if __name__ == "__main__": - # exp_dir=r"E:\codes\py39\dataset\mi-test" - # n_p=16 - # f = open("%s/log_extract_f0.log"%exp_dir, "w") - printt(" ".join(sys.argv)) - featureInput = FeatureInput() - paths = [] - inp_root = "%s/1_16k_wavs" % (exp_dir) - opt_root1 = "%s/2a_f0" % (exp_dir) - opt_root2 = "%s/2b-f0nsf" % (exp_dir) - - os.makedirs(opt_root1, exist_ok=True) - os.makedirs(opt_root2, exist_ok=True) - for name in sorted(list(os.listdir(inp_root))): - inp_path = "%s/%s" % (inp_root, name) - if "spec" in inp_path: - continue - opt_path1 = "%s/%s" % (opt_root1, name) - opt_path2 = "%s/%s" % (opt_root2, name) - paths.append([inp_path, opt_path1, opt_path2]) - try: - featureInput.go(paths, "rmvpe") - except: - printt("f0_all_fail-%s" % (traceback.format_exc())) - # ps = [] - # for i in range(n_p): - # p = Process( - # target=featureInput.go, - # args=( - # paths[i::n_p], - # f0method, - # ), - # ) - # ps.append(p) - # p.start() - # for i in range(n_p): - # ps[i].join() diff --git a/infer/modules/train/extract_feature_print.py b/infer/modules/train/extract_feature_print.py index 8a21deb6a16212b1b04056a1ca4c3b15974bbdab..c5a5f0374f5f33d0944142674f945e0fa9c0657f 100644 --- a/infer/modules/train/extract_feature_print.py +++ b/infer/modules/train/extract_feature_print.py @@ -5,26 +5,7 @@ import numpy as np import soundfile as sf import torch import torch.nn.functional as F - - -device = "cpu" -if torch.cuda.is_available(): - device = "cuda" -elif torch.backends.mps.is_available(): - device = "mps" - -model_path = "assets/hubert/hubert_base.pt" -models, saved_cfg, task = fairseq.checkpoint_utils.load_model_ensemble_and_task( - [model_path], - suffix="", -) -model = models[0] -model = model.to(device) -is_half = False -if is_half: - if device not in ["mps", "cpu"]: - model = model.half() -model.eval() +from model import hubert, hubert_cfg, device, fp16 as is_half # wave must be 16k, hop_size=320 @@ -71,19 +52,17 @@ class HubertFeatureExtractor: if os.path.exists(out_path): continue - feats = readwave(wav_path, normalize=saved_cfg.task.normalize) + feats = readwave(wav_path, normalize=hubert_cfg.task.normalize) padding_mask = torch.BoolTensor(feats.shape).fill_(False) inputs = { "source": ( - feats.half().to(device) - if is_half and device not in ["mps", "cpu"] - else feats.to(device) + feats.half().to(device) if is_half else feats.to(device) ), "padding_mask": padding_mask.to(device), "output_layer": 12, } with torch.no_grad(): - logits = model.extract_features(**inputs) + logits = hubert.extract_features(**inputs) feats = logits[0] feats = feats.squeeze(0).float().cpu().numpy() diff --git a/infer/modules/train/train.py b/infer/modules/train/train.py index 2b900ec34ecdb246838660822bdee4bc9566dde9..c953150118035b01b5d8866a6fa7b359c65a4779 100644 --- a/infer/modules/train/train.py +++ b/infer/modules/train/train.py @@ -35,13 +35,9 @@ except Exception: torch.backends.cudnn.deterministic = False torch.backends.cudnn.benchmark = False -from time import sleep from time import time as ttime -import torch.distributed as dist -import torch.multiprocessing as mp from torch.nn import functional as F -from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.data import DataLoader from 
torch.utils.tensorboard import SummaryWriter diff --git a/infer/modules/uvr5/mdxnet.py b/infer/modules/uvr5/mdxnet.py deleted file mode 100644 index 2f246db7a7c3186afd60f0b99b8089814331f4ba..0000000000000000000000000000000000000000 --- a/infer/modules/uvr5/mdxnet.py +++ /dev/null @@ -1,256 +0,0 @@ -import os -import logging - -logger = logging.getLogger(__name__) - -import librosa -import numpy as np -import soundfile as sf -import torch -from tqdm import tqdm - -cpu = torch.device("cpu") - - -class ConvTDFNetTrim: - def __init__( - self, device, model_name, target_name, L, dim_f, dim_t, n_fft, hop=1024 - ): - super(ConvTDFNetTrim, self).__init__() - - self.dim_f = dim_f - self.dim_t = 2**dim_t - self.n_fft = n_fft - self.hop = hop - self.n_bins = self.n_fft // 2 + 1 - self.chunk_size = hop * (self.dim_t - 1) - self.window = torch.hann_window(window_length=self.n_fft, periodic=True).to( - device - ) - self.target_name = target_name - self.blender = "blender" in model_name - - self.dim_c = 4 - out_c = self.dim_c * 4 if target_name == "*" else self.dim_c - self.freq_pad = torch.zeros( - [1, out_c, self.n_bins - self.dim_f, self.dim_t] - ).to(device) - - self.n = L // 2 - - def stft(self, x): - x = x.reshape([-1, self.chunk_size]) - x = torch.stft( - x, - n_fft=self.n_fft, - hop_length=self.hop, - window=self.window, - center=True, - return_complex=True, - ) - x = torch.view_as_real(x) - x = x.permute([0, 3, 1, 2]) - x = x.reshape([-1, 2, 2, self.n_bins, self.dim_t]).reshape( - [-1, self.dim_c, self.n_bins, self.dim_t] - ) - return x[:, :, : self.dim_f] - - def istft(self, x, freq_pad=None): - freq_pad = ( - self.freq_pad.repeat([x.shape[0], 1, 1, 1]) - if freq_pad is None - else freq_pad - ) - x = torch.cat([x, freq_pad], -2) - c = 4 * 2 if self.target_name == "*" else 2 - x = x.reshape([-1, c, 2, self.n_bins, self.dim_t]).reshape( - [-1, 2, self.n_bins, self.dim_t] - ) - x = x.permute([0, 2, 3, 1]) - x = x.contiguous() - x = torch.view_as_complex(x) - x = torch.istft( - x, n_fft=self.n_fft, hop_length=self.hop, window=self.window, center=True - ) - return x.reshape([-1, c, self.chunk_size]) - - -def get_models(device, dim_f, dim_t, n_fft): - return ConvTDFNetTrim( - device=device, - model_name="Conv-TDF", - target_name="vocals", - L=11, - dim_f=dim_f, - dim_t=dim_t, - n_fft=n_fft, - ) - - -class Predictor: - def __init__(self, args): - import onnxruntime as ort - - logger.info(ort.get_available_providers()) - self.args = args - self.model_ = get_models( - device=cpu, dim_f=args.dim_f, dim_t=args.dim_t, n_fft=args.n_fft - ) - self.model = ort.InferenceSession( - os.path.join(args.onnx, self.model_.target_name + ".onnx"), - providers=[ - "CUDAExecutionProvider", - "DmlExecutionProvider", - "CPUExecutionProvider", - ], - ) - logger.info("ONNX load done") - - def demix(self, mix): - samples = mix.shape[-1] - margin = self.args.margin - chunk_size = self.args.chunks * 44100 - assert not margin == 0, "margin cannot be zero!" 
- if margin > chunk_size: - margin = chunk_size - - segmented_mix = {} - - if self.args.chunks == 0 or samples < chunk_size: - chunk_size = samples - - counter = -1 - for skip in range(0, samples, chunk_size): - counter += 1 - - s_margin = 0 if counter == 0 else margin - end = min(skip + chunk_size + margin, samples) - - start = skip - s_margin - - segmented_mix[skip] = mix[:, start:end].copy() - if end == samples: - break - - sources = self.demix_base(segmented_mix, margin_size=margin) - """ - mix:(2,big_sample) - segmented_mix:offset->(2,small_sample) - sources:(1,2,big_sample) - """ - return sources - - def demix_base(self, mixes, margin_size): - chunked_sources = [] - progress_bar = tqdm(total=len(mixes)) - progress_bar.set_description("Processing") - for mix in mixes: - cmix = mixes[mix] - sources = [] - n_sample = cmix.shape[1] - model = self.model_ - trim = model.n_fft // 2 - gen_size = model.chunk_size - 2 * trim - pad = gen_size - n_sample % gen_size - mix_p = np.concatenate( - (np.zeros((2, trim)), cmix, np.zeros((2, pad)), np.zeros((2, trim))), 1 - ) - mix_waves = [] - i = 0 - while i < n_sample + pad: - waves = np.array(mix_p[:, i : i + model.chunk_size]) - mix_waves.append(waves) - i += gen_size - mix_waves = torch.tensor(mix_waves, dtype=torch.float32).to(cpu) - with torch.no_grad(): - _ort = self.model - spek = model.stft(mix_waves) - if self.args.denoise: - spec_pred = ( - -_ort.run(None, {"input": -spek.cpu().numpy()})[0] * 0.5 - + _ort.run(None, {"input": spek.cpu().numpy()})[0] * 0.5 - ) - tar_waves = model.istft(torch.tensor(spec_pred)) - else: - tar_waves = model.istft( - torch.tensor(_ort.run(None, {"input": spek.cpu().numpy()})[0]) - ) - tar_signal = ( - tar_waves[:, :, trim:-trim] - .transpose(0, 1) - .reshape(2, -1) - .numpy()[:, :-pad] - ) - - start = 0 if mix == 0 else margin_size - end = None if mix == list(mixes.keys())[::-1][0] else -margin_size - if margin_size == 0: - end = None - sources.append(tar_signal[:, start:end]) - - progress_bar.update(1) - - chunked_sources.append(sources) - _sources = np.concatenate(chunked_sources, axis=-1) - # del self.model - progress_bar.close() - return _sources - - def prediction(self, m, vocal_root, others_root, format): - os.makedirs(vocal_root, exist_ok=True) - os.makedirs(others_root, exist_ok=True) - basename = os.path.basename(m) - mix, rate = librosa.load(m, mono=False, sr=44100) - if mix.ndim == 1: - mix = np.asfortranarray([mix, mix]) - mix = mix.T - sources = self.demix(mix.T) - opt = sources[0].T - if format in ["wav", "flac"]: - sf.write( - "%s/%s_main_vocal.%s" % (vocal_root, basename, format), mix - opt, rate - ) - sf.write("%s/%s_others.%s" % (others_root, basename, format), opt, rate) - else: - path_vocal = "%s/%s_main_vocal.wav" % (vocal_root, basename) - path_other = "%s/%s_others.wav" % (others_root, basename) - sf.write(path_vocal, mix - opt, rate) - sf.write(path_other, opt, rate) - opt_path_vocal = path_vocal[:-4] + ".%s" % format - opt_path_other = path_other[:-4] + ".%s" % format - if os.path.exists(path_vocal): - os.system( - "ffmpeg -i %s -vn %s -q:a 2 -y" % (path_vocal, opt_path_vocal) - ) - if os.path.exists(opt_path_vocal): - try: - os.remove(path_vocal) - except: - pass - if os.path.exists(path_other): - os.system( - "ffmpeg -i %s -vn %s -q:a 2 -y" % (path_other, opt_path_other) - ) - if os.path.exists(opt_path_other): - try: - os.remove(path_other) - except: - pass - - -class MDXNetDereverb: - def __init__(self, chunks, device): - self.onnx = "assets/uvr5_weights/onnx_dereverb_By_FoxJoy" - 
self.shifts = 10 # 'Predict with randomised equivariant stabilisation' - self.mixing = "min_mag" # ['default','min_mag','max_mag'] - self.chunks = chunks - self.margin = 44100 - self.dim_t = 9 - self.dim_f = 3072 - self.n_fft = 6144 - self.denoise = True - self.pred = Predictor(self) - self.device = device - - def _path_audio_(self, input, vocal_root, others_root, format, is_hp3=False): - self.pred.prediction(input, vocal_root, others_root, format) diff --git a/infer/modules/uvr5/modules.py b/infer/modules/uvr5/modules.py deleted file mode 100644 index bce3cef4eb83797e9ea196a7c6252abebd106a20..0000000000000000000000000000000000000000 --- a/infer/modules/uvr5/modules.py +++ /dev/null @@ -1,108 +0,0 @@ -import os -import traceback -import logging - -logger = logging.getLogger(__name__) - -import ffmpeg -import torch - -from configs.config import Config -from infer.modules.uvr5.mdxnet import MDXNetDereverb -from infer.modules.uvr5.vr import AudioPre, AudioPreDeEcho - -config = Config() - - -def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format0): - infos = [] - try: - inp_root = inp_root.strip(" ").strip('"').strip("\n").strip('"').strip(" ") - save_root_vocal = ( - save_root_vocal.strip(" ").strip('"').strip("\n").strip('"').strip(" ") - ) - save_root_ins = ( - save_root_ins.strip(" ").strip('"').strip("\n").strip('"').strip(" ") - ) - if model_name == "onnx_dereverb_By_FoxJoy": - pre_fun = MDXNetDereverb(15, config.device) - else: - func = AudioPre if "DeEcho" not in model_name else AudioPreDeEcho - pre_fun = func( - agg=int(agg), - model_path=os.path.join( - os.getenv("weight_uvr5_root"), model_name + ".pth" - ), - device=config.device, - is_half=config.is_half, - ) - is_hp3 = "HP3" in model_name - if inp_root != "": - paths = [os.path.join(inp_root, name) for name in os.listdir(inp_root)] - else: - paths = [path.name for path in paths] - for path in paths: - inp_path = os.path.join(inp_root, path) - need_reformat = 1 - done = 0 - try: - info = ffmpeg.probe(inp_path, cmd="ffprobe") - if ( - info["streams"][0]["channels"] == 2 - and info["streams"][0]["sample_rate"] == "44100" - ): - need_reformat = 0 - pre_fun._path_audio_( - inp_path, save_root_ins, save_root_vocal, format0, is_hp3=is_hp3 - ) - done = 1 - except: - need_reformat = 1 - traceback.print_exc() - if need_reformat == 1: - tmp_path = "%s/%s.reformatted.wav" % ( - os.path.join(os.environ["TEMP"]), - os.path.basename(inp_path), - ) - os.system( - "ffmpeg -i %s -vn -acodec pcm_s16le -ac 2 -ar 44100 %s -y" - % (inp_path, tmp_path) - ) - inp_path = tmp_path - try: - if done == 0: - pre_fun._path_audio_( - inp_path, save_root_ins, save_root_vocal, format0 - ) - infos.append("%s->Success" % (os.path.basename(inp_path))) - yield "\n".join(infos) - except: - try: - if done == 0: - pre_fun._path_audio_( - inp_path, save_root_ins, save_root_vocal, format0 - ) - infos.append("%s->Success" % (os.path.basename(inp_path))) - yield "\n".join(infos) - except: - infos.append( - "%s->%s" % (os.path.basename(inp_path), traceback.format_exc()) - ) - yield "\n".join(infos) - except: - infos.append(traceback.format_exc()) - yield "\n".join(infos) - finally: - try: - if model_name == "onnx_dereverb_By_FoxJoy": - del pre_fun.pred.model - del pre_fun.pred.model_ - else: - del pre_fun.model - del pre_fun - except: - traceback.print_exc() - if torch.cuda.is_available(): - torch.cuda.empty_cache() - logger.info("Executed torch.cuda.empty_cache()") - yield "\n".join(infos) diff --git a/infer/modules/uvr5/vr.py 
b/infer/modules/uvr5/vr.py deleted file mode 100644 index ed5778438a799c98b138dfa35d0a7f81911c3855..0000000000000000000000000000000000000000 --- a/infer/modules/uvr5/vr.py +++ /dev/null @@ -1,368 +0,0 @@ -import os -import logging - -logger = logging.getLogger(__name__) - -import librosa -import numpy as np -import soundfile as sf -import torch - -from infer.lib.uvr5_pack.lib_v5 import nets_61968KB as Nets -from infer.lib.uvr5_pack.lib_v5 import spec_utils -from infer.lib.uvr5_pack.lib_v5.model_param_init import ModelParameters -from infer.lib.uvr5_pack.lib_v5.nets_new import CascadedNet -from infer.lib.uvr5_pack.utils import inference - - -class AudioPre: - def __init__(self, agg, model_path, device, is_half, tta=False): - self.model_path = model_path - self.device = device - self.data = { - # Processing Options - "postprocess": False, - "tta": tta, - # Constants - "window_size": 512, - "agg": agg, - "high_end_process": "mirroring", - } - mp = ModelParameters("infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2.json") - model = Nets.CascadedASPPNet(mp.param["bins"] * 2) - cpk = torch.load(model_path, map_location="cpu") - model.load_state_dict(cpk) - model.eval() - if is_half: - model = model.half().to(device) - else: - model = model.to(device) - - self.mp = mp - self.model = model - - def _path_audio_( - self, music_file, ins_root=None, vocal_root=None, format="flac", is_hp3=False - ): - if ins_root is None and vocal_root is None: - return "No save root." - name = os.path.basename(music_file) - if ins_root is not None: - os.makedirs(ins_root, exist_ok=True) - if vocal_root is not None: - os.makedirs(vocal_root, exist_ok=True) - X_wave, y_wave, X_spec_s, y_spec_s = {}, {}, {}, {} - bands_n = len(self.mp.param["band"]) - # print(bands_n) - for d in range(bands_n, 0, -1): - bp = self.mp.param["band"][d] - if d == bands_n: # high-end band - ( - X_wave[d], - _, - ) = librosa.core.load( # 理论上librosa读取可能对某些音频有bug,应该上ffmpeg读取,但是太麻烦了弃坑 - music_file, - bp["sr"], - False, - dtype=np.float32, - res_type=bp["res_type"], - ) - if X_wave[d].ndim == 1: - X_wave[d] = np.asfortranarray([X_wave[d], X_wave[d]]) - else: # lower bands - X_wave[d] = librosa.core.resample( - X_wave[d + 1], - self.mp.param["band"][d + 1]["sr"], - bp["sr"], - res_type=bp["res_type"], - ) - # Stft of wave source - X_spec_s[d] = spec_utils.wave_to_spectrogram_mt( - X_wave[d], - bp["hl"], - bp["n_fft"], - self.mp.param["mid_side"], - self.mp.param["mid_side_b2"], - self.mp.param["reverse"], - ) - # pdb.set_trace() - if d == bands_n and self.data["high_end_process"] != "none": - input_high_end_h = (bp["n_fft"] // 2 - bp["crop_stop"]) + ( - self.mp.param["pre_filter_stop"] - self.mp.param["pre_filter_start"] - ) - input_high_end = X_spec_s[d][ - :, bp["n_fft"] // 2 - input_high_end_h : bp["n_fft"] // 2, : - ] - - X_spec_m = spec_utils.combine_spectrograms(X_spec_s, self.mp) - aggresive_set = float(self.data["agg"] / 100) - aggressiveness = { - "value": aggresive_set, - "split_bin": self.mp.param["band"][1]["crop_stop"], - } - with torch.no_grad(): - pred, X_mag, X_phase = inference( - X_spec_m, self.device, self.model, aggressiveness, self.data - ) - # Postprocess - if self.data["postprocess"]: - pred_inv = np.clip(X_mag - pred, 0, np.inf) - pred = spec_utils.mask_silence(pred, pred_inv) - y_spec_m = pred * X_phase - v_spec_m = X_spec_m - y_spec_m - - if ins_root is not None: - if self.data["high_end_process"].startswith("mirroring"): - input_high_end_ = spec_utils.mirroring( - self.data["high_end_process"], y_spec_m, input_high_end, 
self.mp - ) - wav_instrument = spec_utils.cmb_spectrogram_to_wave( - y_spec_m, self.mp, input_high_end_h, input_high_end_ - ) - else: - wav_instrument = spec_utils.cmb_spectrogram_to_wave(y_spec_m, self.mp) - logger.info("%s instruments done" % name) - if is_hp3 == True: - head = "vocal_" - else: - head = "instrument_" - if format in ["wav", "flac"]: - sf.write( - os.path.join( - ins_root, - head + "{}_{}.{}".format(name, self.data["agg"], format), - ), - (np.array(wav_instrument) * 32768).astype("int16"), - self.mp.param["sr"], - ) # - else: - path = os.path.join( - ins_root, head + "{}_{}.wav".format(name, self.data["agg"]) - ) - sf.write( - path, - (np.array(wav_instrument) * 32768).astype("int16"), - self.mp.param["sr"], - ) - if os.path.exists(path): - opt_format_path = path[:-4] + ".%s" % format - os.system("ffmpeg -i %s -vn %s -q:a 2 -y" % (path, opt_format_path)) - if os.path.exists(opt_format_path): - try: - os.remove(path) - except: - pass - if vocal_root is not None: - if is_hp3 == True: - head = "instrument_" - else: - head = "vocal_" - if self.data["high_end_process"].startswith("mirroring"): - input_high_end_ = spec_utils.mirroring( - self.data["high_end_process"], v_spec_m, input_high_end, self.mp - ) - wav_vocals = spec_utils.cmb_spectrogram_to_wave( - v_spec_m, self.mp, input_high_end_h, input_high_end_ - ) - else: - wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp) - logger.info("%s vocals done" % name) - if format in ["wav", "flac"]: - sf.write( - os.path.join( - vocal_root, - head + "{}_{}.{}".format(name, self.data["agg"], format), - ), - (np.array(wav_vocals) * 32768).astype("int16"), - self.mp.param["sr"], - ) - else: - path = os.path.join( - vocal_root, head + "{}_{}.wav".format(name, self.data["agg"]) - ) - sf.write( - path, - (np.array(wav_vocals) * 32768).astype("int16"), - self.mp.param["sr"], - ) - if os.path.exists(path): - opt_format_path = path[:-4] + ".%s" % format - os.system("ffmpeg -i %s -vn %s -q:a 2 -y" % (path, opt_format_path)) - if os.path.exists(opt_format_path): - try: - os.remove(path) - except: - pass - - -class AudioPreDeEcho: - def __init__(self, agg, model_path, device, is_half, tta=False): - self.model_path = model_path - self.device = device - self.data = { - # Processing Options - "postprocess": False, - "tta": tta, - # Constants - "window_size": 512, - "agg": agg, - "high_end_process": "mirroring", - } - mp = ModelParameters("infer/lib/uvr5_pack/lib_v5/modelparams/4band_v3.json") - nout = 64 if "DeReverb" in model_path else 48 - model = CascadedNet(mp.param["bins"] * 2, nout) - cpk = torch.load(model_path, map_location="cpu") - model.load_state_dict(cpk) - model.eval() - if is_half: - model = model.half().to(device) - else: - model = model.to(device) - - self.mp = mp - self.model = model - - def _path_audio_( - self, music_file, vocal_root=None, ins_root=None, format="flac", is_hp3=False - ): # 3个VR模型vocal和ins是反的 - if ins_root is None and vocal_root is None: - return "No save root." 
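# NOTE: the band loop below loads the audio once at the top band's
# sample rate, derives each lower band by resampling the band above
# it, and merges the per-band STFTs via spec_utils.combine_spectrograms
# before inference; with high_end_process="mirroring", the untouched
# input high end is spliced back into the reconstructed waveform.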
- name = os.path.basename(music_file) - if ins_root is not None: - os.makedirs(ins_root, exist_ok=True) - if vocal_root is not None: - os.makedirs(vocal_root, exist_ok=True) - X_wave, y_wave, X_spec_s, y_spec_s = {}, {}, {}, {} - bands_n = len(self.mp.param["band"]) - # print(bands_n) - for d in range(bands_n, 0, -1): - bp = self.mp.param["band"][d] - if d == bands_n: # high-end band - ( - X_wave[d], - _, - ) = librosa.core.load( # 理论上librosa读取可能对某些音频有bug,应该上ffmpeg读取,但是太麻烦了弃坑 - music_file, - bp["sr"], - False, - dtype=np.float32, - res_type=bp["res_type"], - ) - if X_wave[d].ndim == 1: - X_wave[d] = np.asfortranarray([X_wave[d], X_wave[d]]) - else: # lower bands - X_wave[d] = librosa.core.resample( - X_wave[d + 1], - self.mp.param["band"][d + 1]["sr"], - bp["sr"], - res_type=bp["res_type"], - ) - # Stft of wave source - X_spec_s[d] = spec_utils.wave_to_spectrogram_mt( - X_wave[d], - bp["hl"], - bp["n_fft"], - self.mp.param["mid_side"], - self.mp.param["mid_side_b2"], - self.mp.param["reverse"], - ) - # pdb.set_trace() - if d == bands_n and self.data["high_end_process"] != "none": - input_high_end_h = (bp["n_fft"] // 2 - bp["crop_stop"]) + ( - self.mp.param["pre_filter_stop"] - self.mp.param["pre_filter_start"] - ) - input_high_end = X_spec_s[d][ - :, bp["n_fft"] // 2 - input_high_end_h : bp["n_fft"] // 2, : - ] - - X_spec_m = spec_utils.combine_spectrograms(X_spec_s, self.mp) - aggresive_set = float(self.data["agg"] / 100) - aggressiveness = { - "value": aggresive_set, - "split_bin": self.mp.param["band"][1]["crop_stop"], - } - with torch.no_grad(): - pred, X_mag, X_phase = inference( - X_spec_m, self.device, self.model, aggressiveness, self.data - ) - # Postprocess - if self.data["postprocess"]: - pred_inv = np.clip(X_mag - pred, 0, np.inf) - pred = spec_utils.mask_silence(pred, pred_inv) - y_spec_m = pred * X_phase - v_spec_m = X_spec_m - y_spec_m - - if ins_root is not None: - if self.data["high_end_process"].startswith("mirroring"): - input_high_end_ = spec_utils.mirroring( - self.data["high_end_process"], y_spec_m, input_high_end, self.mp - ) - wav_instrument = spec_utils.cmb_spectrogram_to_wave( - y_spec_m, self.mp, input_high_end_h, input_high_end_ - ) - else: - wav_instrument = spec_utils.cmb_spectrogram_to_wave(y_spec_m, self.mp) - logger.info("%s instruments done" % name) - if format in ["wav", "flac"]: - sf.write( - os.path.join( - ins_root, - "vocal_{}_{}.{}".format(name, self.data["agg"], format), - ), - (np.array(wav_instrument) * 32768).astype("int16"), - self.mp.param["sr"], - ) # - else: - path = os.path.join( - ins_root, "vocal_{}_{}.wav".format(name, self.data["agg"]) - ) - sf.write( - path, - (np.array(wav_instrument) * 32768).astype("int16"), - self.mp.param["sr"], - ) - if os.path.exists(path): - opt_format_path = path[:-4] + ".%s" % format - os.system("ffmpeg -i %s -vn %s -q:a 2 -y" % (path, opt_format_path)) - if os.path.exists(opt_format_path): - try: - os.remove(path) - except: - pass - if vocal_root is not None: - if self.data["high_end_process"].startswith("mirroring"): - input_high_end_ = spec_utils.mirroring( - self.data["high_end_process"], v_spec_m, input_high_end, self.mp - ) - wav_vocals = spec_utils.cmb_spectrogram_to_wave( - v_spec_m, self.mp, input_high_end_h, input_high_end_ - ) - else: - wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp) - logger.info("%s vocals done" % name) - if format in ["wav", "flac"]: - sf.write( - os.path.join( - vocal_root, - "instrument_{}_{}.{}".format(name, self.data["agg"], format), - ), - 
(np.array(wav_vocals) * 32768).astype("int16"), - self.mp.param["sr"], - ) - else: - path = os.path.join( - vocal_root, "instrument_{}_{}.wav".format(name, self.data["agg"]) - ) - sf.write( - path, - (np.array(wav_vocals) * 32768).astype("int16"), - self.mp.param["sr"], - ) - if os.path.exists(path): - opt_format_path = path[:-4] + ".%s" % format - os.system("ffmpeg -i %s -vn %s -q:a 2 -y" % (path, opt_format_path)) - if os.path.exists(opt_format_path): - try: - os.remove(path) - except: - pass diff --git a/infer/modules/vc/__init__.py b/infer/modules/vc/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/infer/modules/vc/modules.py b/infer/modules/vc/modules.py new file mode 100644 index 0000000000000000000000000000000000000000..7706537dca0d0c94167e3a8da7a96b9bf18d95b9 --- /dev/null +++ b/infer/modules/vc/modules.py @@ -0,0 +1,305 @@ +import traceback +import logging + +logger = logging.getLogger(__name__) + +import numpy as np +import soundfile as sf +import torch +from io import BytesIO + +from infer.lib.audio import load_audio, wav2 +from infer.lib.infer_pack.models import ( + SynthesizerTrnMs256NSFsid, + SynthesizerTrnMs256NSFsid_nono, + SynthesizerTrnMs768NSFsid, + SynthesizerTrnMs768NSFsid_nono, +) +from infer.modules.vc.pipeline import Pipeline +from infer.modules.vc.utils import * +from model import hubert + + +class VC: + def __init__(self, config): + self.n_spk = None + self.tgt_sr = None + self.net_g = None + self.pipeline = None + self.cpt = None + self.version = None + self.if_f0 = None + self.version = None + self.hubert_model = None + + self.config = config + + def get_vc(self, sid, *to_return_protect): + logger.info("Get sid: " + sid) + + to_return_protect0 = { + "visible": self.if_f0 != 0, + "value": ( + to_return_protect[0] if self.if_f0 != 0 and to_return_protect else 0.5 + ), + "__type__": "update", + } + to_return_protect1 = { + "visible": self.if_f0 != 0, + "value": ( + to_return_protect[1] if self.if_f0 != 0 and to_return_protect else 0.33 + ), + "__type__": "update", + } + + if sid == "" or sid == []: + if ( + self.hubert_model is not None + ): # 考虑到轮询, 需要加个判断看是否 sid 是由有模型切换到无模型的 + logger.info("Clean model cache") + del (self.net_g, self.n_spk, self.hubert_model, self.tgt_sr) # ,cpt + self.hubert_model = self.net_g = self.n_spk = self.hubert_model = ( + self.tgt_sr + ) = None + if torch.cuda.is_available(): + torch.cuda.empty_cache() + ###楼下不这么折腾清理不干净 + self.if_f0 = self.cpt.get("f0", 1) + self.version = self.cpt.get("version", "v1") + if self.version == "v1": + if self.if_f0 == 1: + self.net_g = SynthesizerTrnMs256NSFsid( + *self.cpt["config"], is_half=self.config.is_half + ) + else: + self.net_g = SynthesizerTrnMs256NSFsid_nono(*self.cpt["config"]) + elif self.version == "v2": + if self.if_f0 == 1: + self.net_g = SynthesizerTrnMs768NSFsid( + *self.cpt["config"], is_half=self.config.is_half + ) + else: + self.net_g = SynthesizerTrnMs768NSFsid_nono(*self.cpt["config"]) + del self.net_g, self.cpt + if torch.cuda.is_available(): + torch.cuda.empty_cache() + return ( + {"visible": False, "__type__": "update"}, + { + "visible": True, + "value": to_return_protect0, + "__type__": "update", + }, + { + "visible": True, + "value": to_return_protect1, + "__type__": "update", + }, + "", + "", + ) + person = sid + logger.info(f"Loading: {person}") + + self.cpt = torch.load(person, map_location="cpu") + self.tgt_sr = self.cpt["config"][-1] + self.cpt["config"][-3] = 
self.cpt["weight"]["emb_g.weight"].shape[0] # n_spk + self.if_f0 = self.cpt.get("f0", 1) + self.version = self.cpt.get("version", "v1") + + synthesizer_class = { + ("v1", 1): SynthesizerTrnMs256NSFsid, + ("v1", 0): SynthesizerTrnMs256NSFsid_nono, + ("v2", 1): SynthesizerTrnMs768NSFsid, + ("v2", 0): SynthesizerTrnMs768NSFsid_nono, + } + + self.net_g = synthesizer_class.get( + (self.version, self.if_f0), SynthesizerTrnMs256NSFsid + )(*self.cpt["config"], is_half=self.config.is_half) + + del self.net_g.enc_q + + self.net_g.load_state_dict(self.cpt["weight"], strict=False) + self.net_g.eval().to(self.config.device) + if self.config.is_half: + self.net_g = self.net_g.half() + else: + self.net_g = self.net_g.float() + + self.pipeline = Pipeline(self.tgt_sr, self.config) + n_spk = self.cpt["config"][-3] + index = {"value": get_index_path_from_model(sid), "__type__": "update"} + logger.info("Select index: " + index["value"]) + + return ( + ( + {"visible": True, "maximum": n_spk, "__type__": "update"}, + to_return_protect0, + to_return_protect1, + index, + index, + ) + if to_return_protect + else {"visible": True, "maximum": n_spk, "__type__": "update"} + ) + + def vc_single( + self, + sid, + input_audio_path, + f0_up_key, + f0_file, + f0_method, + file_index, + file_index2, + index_rate, + filter_radius, + resample_sr, + rms_mix_rate, + protect, + ): + if input_audio_path is None: + return "You need to upload an audio", None + f0_up_key = int(f0_up_key) + try: + audio = load_audio(input_audio_path, 16000) + audio_max = np.abs(audio).max() / 0.95 + if audio_max > 1: + audio /= audio_max + times = [0, 0, 0] + + if self.hubert_model is None: + self.hubert_model = hubert + + if file_index: + file_index = ( + file_index.strip(" ") + .strip('"') + .strip("\n") + .strip('"') + .strip(" ") + .replace("trained", "added") + ) + elif file_index2: + file_index = file_index2 + else: + file_index = "" # 防止小白写错,自动帮他替换掉 + + audio_opt = self.pipeline.pipeline( + self.hubert_model, + self.net_g, + sid, + audio, + input_audio_path, + times, + f0_up_key, + f0_method, + file_index, + index_rate, + self.if_f0, + filter_radius, + self.tgt_sr, + resample_sr, + rms_mix_rate, + self.version, + protect, + f0_file, + ) + if self.tgt_sr != resample_sr >= 16000: + tgt_sr = resample_sr + else: + tgt_sr = self.tgt_sr + index_info = ( + "Index:\n%s." % file_index + if os.path.exists(file_index) + else "Index not used." + ) + return ( + "Success.\n%s\nTime:\nnpy: %.2fs, f0: %.2fs, infer: %.2fs." 
+ % (index_info, *times), + (tgt_sr, audio_opt), + ) + except: + info = traceback.format_exc() + logger.warning(info) + return info, (None, None) + + def vc_multi( + self, + sid, + dir_path, + opt_root, + paths, + f0_up_key, + f0_method, + file_index, + file_index2, + index_rate, + filter_radius, + resample_sr, + rms_mix_rate, + protect, + format1, + ): + try: + dir_path = ( + dir_path.strip(" ").strip('"').strip("\n").strip('"').strip(" ") + ) # 防止小白拷路径头尾带了空格和"和回车 + opt_root = opt_root.strip(" ").strip('"').strip("\n").strip('"').strip(" ") + os.makedirs(opt_root, exist_ok=True) + try: + if dir_path != "": + paths = [ + os.path.join(dir_path, name) for name in os.listdir(dir_path) + ] + else: + paths = [path.name for path in paths] + except: + traceback.print_exc() + paths = [path.name for path in paths] + infos = [] + for path in paths: + info, opt = self.vc_single( + sid, + path, + f0_up_key, + None, + f0_method, + file_index, + file_index2, + # file_big_npy, + index_rate, + filter_radius, + resample_sr, + rms_mix_rate, + protect, + ) + if "Success" in info: + try: + tgt_sr, audio_opt = opt + if format1 in ["wav", "flac"]: + sf.write( + "%s/%s.%s" + % (opt_root, os.path.basename(path), format1), + audio_opt, + tgt_sr, + ) + else: + path = "%s/%s.%s" % ( + opt_root, + os.path.basename(path), + format1, + ) + with BytesIO() as wavf: + sf.write(wavf, audio_opt, tgt_sr, format="wav") + wavf.seek(0, 0) + with open(path, "wb") as outf: + wav2(wavf, outf, format1) + except: + info += traceback.format_exc() + infos.append("%s->%s" % (os.path.basename(path), info)) + yield "\n".join(infos) + yield "\n".join(infos) + except: + yield traceback.format_exc() diff --git a/infer/modules/vc/pipeline.py b/infer/modules/vc/pipeline.py new file mode 100644 index 0000000000000000000000000000000000000000..15037e6103c1d8b2cbfa1668b5ffa81f98f12302 --- /dev/null +++ b/infer/modules/vc/pipeline.py @@ -0,0 +1,449 @@ +import os +import sys +import traceback +import logging + +logger = logging.getLogger(__name__) + +from functools import lru_cache +from time import time as ttime + +import faiss +import librosa +import numpy as np +import parselmouth +import pyworld +import torch +import torch.nn.functional as F +import torchcrepe +from scipy import signal +from model import rmvpe, device, fp16 + +now_dir = os.getcwd() +sys.path.append(now_dir) + +bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000) + +input_audio_path2wav = {} + + +@lru_cache +def cache_harvest_f0(input_audio_path, fs, f0max, f0min, frame_period): + audio = input_audio_path2wav[input_audio_path] + f0, t = pyworld.harvest( + audio, + fs=fs, + f0_ceil=f0max, + f0_floor=f0min, + frame_period=frame_period, + ) + f0 = pyworld.stonemask(audio, f0, t, fs) + return f0 + + +def change_rms(data1, sr1, data2, sr2, rate): # 1是输入音频,2是输出音频,rate是2的占比 + # print(data1.max(),data2.max()) + rms1 = librosa.feature.rms( + y=data1, frame_length=sr1 // 2 * 2, hop_length=sr1 // 2 + ) # 每半秒一个点 + rms2 = librosa.feature.rms(y=data2, frame_length=sr2 // 2 * 2, hop_length=sr2 // 2) + rms1 = torch.from_numpy(rms1) + rms1 = F.interpolate( + rms1.unsqueeze(0), size=data2.shape[0], mode="linear" + ).squeeze() + rms2 = torch.from_numpy(rms2) + rms2 = F.interpolate( + rms2.unsqueeze(0), size=data2.shape[0], mode="linear" + ).squeeze() + rms2 = torch.max(rms2, torch.zeros_like(rms2) + 1e-6) + data2 *= ( + torch.pow(rms1, torch.tensor(1 - rate)) + * torch.pow(rms2, torch.tensor(rate - 1)) + ).numpy() + return data2 + + +class Pipeline(object): + def __init__(self, tgt_sr, 
config): + self.x_pad, self.x_query, self.x_center, self.x_max = ( + config.x_pad, + config.x_query, + config.x_center, + config.x_max, + ) + self.is_half = fp16 + self.sr = 16000 # hubert输入采样率 + self.window = 160 # 每帧点数 + self.t_pad = self.sr * self.x_pad # 每条前后pad时间 + self.t_pad_tgt = tgt_sr * self.x_pad + self.t_pad2 = self.t_pad * 2 + self.t_query = self.sr * self.x_query # 查询切点前后查询时间 + self.t_center = self.sr * self.x_center # 查询切点位置 + self.t_max = self.sr * self.x_max # 免查询时长阈值 + self.device = device + + def get_f0( + self, + input_audio_path, + x, + p_len, + f0_up_key, + f0_method, + filter_radius, + inp_f0=None, + ): + global input_audio_path2wav + time_step = self.window / self.sr * 1000 + f0_min = 50 + f0_max = 1100 + f0_mel_min = 1127 * np.log(1 + f0_min / 700) + f0_mel_max = 1127 * np.log(1 + f0_max / 700) + if f0_method == "pm": + f0 = ( + parselmouth.Sound(x, self.sr) + .to_pitch_ac( + time_step=time_step / 1000, + voicing_threshold=0.6, + pitch_floor=f0_min, + pitch_ceiling=f0_max, + ) + .selected_array["frequency"] + ) + pad_size = (p_len - len(f0) + 1) // 2 + if pad_size > 0 or p_len - len(f0) - pad_size > 0: + f0 = np.pad( + f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant" + ) + elif f0_method == "harvest": + input_audio_path2wav[input_audio_path] = x.astype(np.double) + f0 = cache_harvest_f0(input_audio_path, self.sr, f0_max, f0_min, 10) + if filter_radius > 2: + f0 = signal.medfilt(f0, 3) + elif f0_method == "crepe": + model = "full" + # Pick a batch size that doesn't cause memory errors on your gpu + batch_size = 512 + # Compute pitch using first gpu + audio = torch.tensor(np.copy(x))[None].float() + f0, pd = torchcrepe.predict( + audio, + self.sr, + self.window, + f0_min, + f0_max, + model, + batch_size=batch_size, + device=self.device, + return_periodicity=True, + ) + pd = torchcrepe.filter.median(pd, 3) + f0 = torchcrepe.filter.mean(f0, 3) + f0[pd < 0.1] = 0 + f0 = f0[0].cpu().numpy() + elif f0_method == "rmvpe": + if not hasattr(self, "model_rmvpe"): + self.model_rmvpe = rmvpe + f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03) + + if "privateuseone" in str(self.device): # clean ortruntime memory + del self.model_rmvpe.model + del self.model_rmvpe + logger.info("Cleaning ortruntime memory") + + f0 *= pow(2, f0_up_key / 12) + # with open("test.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()])) + tf0 = self.sr // self.window # 每秒f0点数 + if inp_f0 is not None: + delta_t = np.round( + (inp_f0[:, 0].max() - inp_f0[:, 0].min()) * tf0 + 1 + ).astype("int16") + replace_f0 = np.interp( + list(range(delta_t)), inp_f0[:, 0] * 100, inp_f0[:, 1] + ) + shape = f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)].shape[0] + f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)] = replace_f0[ + :shape + ] + # with open("test_opt.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()])) + f0bak = f0.copy() + f0_mel = 1127 * np.log(1 + f0 / 700) + f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / ( + f0_mel_max - f0_mel_min + ) + 1 + f0_mel[f0_mel <= 1] = 1 + f0_mel[f0_mel > 255] = 255 + f0_coarse = np.rint(f0_mel).astype(np.int32) + return f0_coarse, f0bak # 1-0 + + def vc( + self, + model, + net_g, + sid, + audio0, + pitch, + pitchf, + times, + index, + big_npy, + index_rate, + version, + protect, + ): # ,file_index,file_big_npy + feats = torch.from_numpy(audio0) + if self.is_half: + feats = feats.half() + else: + feats = feats.float() + if feats.dim() == 2: # double channels + feats = feats.mean(-1) + assert feats.dim() == 1, 
feats.dim() + feats = feats.view(1, -1) + padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False) + + inputs = { + "source": feats.to(self.device), + "padding_mask": padding_mask, + "output_layer": 9 if version == "v1" else 12, + } + t0 = ttime() + with torch.no_grad(): + logits = model.extract_features(**inputs) + feats = model.final_proj(logits[0]) if version == "v1" else logits[0] + if protect < 0.5 and pitch is not None and pitchf is not None: + feats0 = feats.clone() + if ( + not isinstance(index, type(None)) + and not isinstance(big_npy, type(None)) + and index_rate != 0 + ): + npy = feats[0].cpu().numpy() + if self.is_half: + npy = npy.astype("float32") + + # _, I = index.search(npy, 1) + # npy = big_npy[I.squeeze()] + + score, ix = index.search(npy, k=8) + weight = np.square(1 / score) + weight /= weight.sum(axis=1, keepdims=True) + npy = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1) + + if self.is_half: + npy = npy.astype("float16") + feats = ( + torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate + + (1 - index_rate) * feats + ) + + feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1) + if protect < 0.5 and pitch is not None and pitchf is not None: + feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute( + 0, 2, 1 + ) + t1 = ttime() + p_len = audio0.shape[0] // self.window + if feats.shape[1] < p_len: + p_len = feats.shape[1] + if pitch is not None and pitchf is not None: + pitch = pitch[:, :p_len] + pitchf = pitchf[:, :p_len] + + if protect < 0.5 and pitch is not None and pitchf is not None: + pitchff = pitchf.clone() + pitchff[pitchf > 0] = 1 + pitchff[pitchf < 1] = protect + pitchff = pitchff.unsqueeze(-1) + feats = feats * pitchff + feats0 * (1 - pitchff) + feats = feats.to(feats0.dtype) + p_len = torch.tensor([p_len], device=self.device).long() + with torch.no_grad(): + hasp = pitch is not None and pitchf is not None + arg = (feats, p_len, pitch, pitchf, sid) if hasp else (feats, p_len, sid) + audio1 = (net_g.infer(*arg)[0][0, 0]).data.cpu().float().numpy() + del hasp, arg + del feats, p_len, padding_mask + if torch.cuda.is_available(): + torch.cuda.empty_cache() + t2 = ttime() + times[0] += t1 - t0 + times[2] += t2 - t1 + return audio1 + + def pipeline( + self, + model, + net_g, + sid, + audio, + input_audio_path, + times, + f0_up_key, + f0_method, + file_index, + index_rate, + if_f0, + filter_radius, + tgt_sr, + resample_sr, + rms_mix_rate, + version, + protect, + f0_file=None, + ): + if ( + file_index != "" + # and file_big_npy != "" + # and os.path.exists(file_big_npy) == True + and os.path.exists(file_index) + and index_rate != 0 + ): + try: + index = faiss.read_index(file_index) + # big_npy = np.load(file_big_npy) + big_npy = index.reconstruct_n(0, index.ntotal) + except: + traceback.print_exc() + index = big_npy = None + else: + index = big_npy = None + audio = signal.filtfilt(bh, ah, audio) + audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect") + opt_ts = [] + if audio_pad.shape[0] > self.t_max: + audio_sum = np.zeros_like(audio) + for i in range(self.window): + audio_sum += np.abs(audio_pad[i : i - self.window]) + for t in range(self.t_center, audio.shape[0], self.t_center): + opt_ts.append( + t + - self.t_query + + np.where( + audio_sum[t - self.t_query : t + self.t_query] + == audio_sum[t - self.t_query : t + self.t_query].min() + )[0][0] + ) + s = 0 + audio_opt = [] + t = None + t1 = ttime() + audio_pad = np.pad(audio, (self.t_pad, self.t_pad), 
mode="reflect") + p_len = audio_pad.shape[0] // self.window + inp_f0 = None + if hasattr(f0_file, "name"): + try: + with open(f0_file.name, "r") as f: + lines = f.read().strip("\n").split("\n") + inp_f0 = [] + for line in lines: + inp_f0.append([float(i) for i in line.split(",")]) + inp_f0 = np.array(inp_f0, dtype="float32") + except: + traceback.print_exc() + sid = torch.tensor(sid, device=self.device).unsqueeze(0).long() + pitch, pitchf = None, None + if if_f0 == 1: + pitch, pitchf = self.get_f0( + input_audio_path, + audio_pad, + p_len, + f0_up_key, + f0_method, + filter_radius, + inp_f0, + ) + pitch = pitch[:p_len] + pitchf = pitchf[:p_len] + if "mps" not in str(self.device) or "xpu" not in str(self.device): + pitchf = pitchf.astype(np.float32) + pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long() + pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float() + t2 = ttime() + times[1] += t2 - t1 + for t in opt_ts: + t = t // self.window * self.window + if if_f0 == 1: + audio_opt.append( + self.vc( + model, + net_g, + sid, + audio_pad[s : t + self.t_pad2 + self.window], + pitch[:, s // self.window : (t + self.t_pad2) // self.window], + pitchf[:, s // self.window : (t + self.t_pad2) // self.window], + times, + index, + big_npy, + index_rate, + version, + protect, + )[self.t_pad_tgt : -self.t_pad_tgt] + ) + else: + audio_opt.append( + self.vc( + model, + net_g, + sid, + audio_pad[s : t + self.t_pad2 + self.window], + None, + None, + times, + index, + big_npy, + index_rate, + version, + protect, + )[self.t_pad_tgt : -self.t_pad_tgt] + ) + s = t + if if_f0 == 1: + audio_opt.append( + self.vc( + model, + net_g, + sid, + audio_pad[t:], + pitch[:, t // self.window :] if t is not None else pitch, + pitchf[:, t // self.window :] if t is not None else pitchf, + times, + index, + big_npy, + index_rate, + version, + protect, + )[self.t_pad_tgt : -self.t_pad_tgt] + ) + else: + audio_opt.append( + self.vc( + model, + net_g, + sid, + audio_pad[t:], + None, + None, + times, + index, + big_npy, + index_rate, + version, + protect, + )[self.t_pad_tgt : -self.t_pad_tgt] + ) + audio_opt = np.concatenate(audio_opt) + if rms_mix_rate != 1: + audio_opt = change_rms(audio, 16000, audio_opt, tgt_sr, rms_mix_rate) + if tgt_sr != resample_sr >= 16000: + audio_opt = librosa.resample( + audio_opt, orig_sr=tgt_sr, target_sr=resample_sr + ) + audio_max = np.abs(audio_opt).max() / 0.99 + max_int16 = 32768 + if audio_max > 1: + max_int16 /= audio_max + audio_opt = (audio_opt * max_int16).astype(np.int16) + del pitch, pitchf, sid + if torch.cuda.is_available(): + torch.cuda.empty_cache() + return audio_opt diff --git a/infer/modules/vc/utils.py b/infer/modules/vc/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..7df184fccf1482d9d4f812a5f1b32ef9756fffa0 --- /dev/null +++ b/infer/modules/vc/utils.py @@ -0,0 +1,17 @@ +import os + + +def get_index_path_from_model(sid): + return next( + ( + f + for f in [ + os.path.join(root, name) + for root, _, files in os.walk(os.getenv("index_root"), topdown=False) + for name in files + if name.endswith(".index") and "trained" not in name + ] + if sid.split(".")[0] in f + ), + "", + ) diff --git a/model.py b/model.py new file mode 100644 index 0000000000000000000000000000000000000000..3846765ec92389e155d97acfc077d2aa4225b649 --- /dev/null +++ b/model.py @@ -0,0 +1,23 @@ +from accelerate import Accelerator +from infer.lib.rmvpe import RMVPE +from fairseq.checkpoint_utils import load_model_ensemble_and_task + +accelerator = Accelerator() 
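+# Accelerator() picks the runtime device (CUDA when available,
+# otherwise MPS or CPU) and reports the configured mixed-precision
+# mode; the two values below decide where the RMVPE and HuBERT
+# weights live and whether they are cast to fp16.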
+device = accelerator.device +print(f"Using device: {device}") + +fp16 = accelerator.mixed_precision == "fp16" +print(f"Using fp16: {fp16}") + +rmvpe_model_path = "assets/rmvpe/rmvpe.pt" +rmvpe = RMVPE(rmvpe_model_path, is_half=fp16, device=device) +print("RMVPE model loaded.") + +hubert_model_path = "assets/hubert/hubert_base.pt" +models, hubert_cfg, _ = load_model_ensemble_and_task([hubert_model_path]) +hubert = models[0] +hubert = hubert.to(device) +if fp16: + hubert = hubert.half() +hubert.eval() +print("Hubert model loaded.") diff --git a/prelude.py b/prelude.py new file mode 100644 index 0000000000000000000000000000000000000000..8160b6a60c995fd0a4e5748e8bb578283d7a09c5 --- /dev/null +++ b/prelude.py @@ -0,0 +1,26 @@ +import os + + +def prelude(): + os.environ["PYTORCH_JIT"] = "0v" + + # patch for jit script + # if we find `def expand_2d_or_3d_tensor(x,` in /usr/local/lib/python3.10/site-packages/fairseq/models/model_utils.py + # patch it with `def expand_2d_or_3d_tensor(x: Tensor,` + FAIRSEQ_CODE = ( + "/usr/local/lib/python3.10/site-packages/fairseq/models/model_utils.py" + ) + if os.path.exists(FAIRSEQ_CODE): + with open(FAIRSEQ_CODE, "r") as f: + lines = f.readlines() + with open(FAIRSEQ_CODE, "w") as f: + for line in lines: + if ( + "def expand_2d_or_3d_tensor(x, trg_dim: int, padding_idx: int):" + in line + ): + f.write( + "def expand_2d_or_3d_tensor(x: Tensor, trg_dim: int, padding_idx: int) -> Tensor:\n" + ) + else: + f.write(line) diff --git a/requirements.txt b/requirements.txt index 94a1876f4b2db37bdf396afb7d7cdbf83465308d..dce058017906a038ccba745b2eb2dd9e09b51446 100644 --- a/requirements.txt +++ b/requirements.txt @@ -25,6 +25,8 @@ tensorboard tqdm>=4.63.1 pyworld==0.3.2 httpx -onnxruntime-gpu python-dotenv>=1.0.0 av +accelerate==0.32.0 +demucs==4.0.1 +torchcrepe
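The retrieval step in the new infer/modules/vc/pipeline.py blends each HuBERT frame with its k=8 nearest neighbours from the FAISS index, weighted by inverse squared distance. A minimal standalone sketch of that blend, assuming `feats` is a (T, 768) float32 feature matrix (v2 models) and `big_npy` is the (N, 768) matrix reconstructed from the index, as in Pipeline.pipeline():

    import numpy as np
    import faiss

    index = faiss.IndexFlatL2(768)        # pipeline.py reads a prebuilt index from disk
    index.add(big_npy)

    score, ix = index.search(feats, k=8)  # squared L2 distances and row ids
    weight = np.square(1 / score)         # inverse-square weighting (no epsilon, as in the source)
    weight /= weight.sum(axis=1, keepdims=True)
    retrieved = np.sum(big_npy[ix] * weight[:, :, None], axis=1)

    index_rate = 0.5                      # blend ratio; index_rate == 0 skips retrieval entirely
    feats = index_rate * retrieved + (1 - index_rate) * feats

pipeline.py performs the same arithmetic on-device with torch tensors and casts the result back to fp16 when mixed precision is active.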
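On the pitch path, get_f0() first transposes by f0_up_key semitones (f0 *= 2 ** (f0_up_key / 12)) and then quantizes Hz to the 1..255 coarse bins the synthesizer embeds, using a mel-style log curve over the 50..1100 Hz range. A short recap of that mapping:

    import numpy as np

    f0_min, f0_max = 50.0, 1100.0
    f0_mel_min = 1127 * np.log(1 + f0_min / 700)
    f0_mel_max = 1127 * np.log(1 + f0_max / 700)

    def coarse_f0(f0: np.ndarray) -> np.ndarray:
        # Unvoiced frames (f0 == 0) stay at bin 1; voiced frames span 1..255.
        f0_mel = 1127 * np.log(1 + f0 / 700)
        voiced = f0_mel > 0
        f0_mel[voiced] = (f0_mel[voiced] - f0_mel_min) * 254 / (f0_mel_max - f0_mel_min) + 1
        return np.rint(np.clip(f0_mel, 1, 255)).astype(np.int32)

Pipeline.vc() passes both tensors to net_g.infer(): the coarse bins as `pitch` and the raw Hz curve as `pitchf`.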