upload onnx model files

Browse files

Files changed (12) hide show

uvronnx/onnx/uvr-sim.onnx +3 -0
uvronnx/src/__pycache__/config.cpython-38.pyc +0 -0
uvronnx/src/__pycache__/ortInferSession.cpython-38.pyc +0 -0
uvronnx/src/config.py +65 -0
uvronnx/src/ortInferSession.py +90 -0
uvronnx/src/utils/AudioHelper.py +87 -0
uvronnx/src/utils/__pycache__/AudioHelper.cpython-38.pyc +0 -0
uvronnx/src/utils/__pycache__/logger.cpython-38.pyc +0 -0
uvronnx/src/utils/__pycache__/spec_utils.cpython-38.pyc +0 -0
uvronnx/src/utils/logger.py +299 -0
uvronnx/src/utils/spec_utils.py +388 -0
uvronnx/src/uvr.py +121 -0

uvronnx/onnx/uvr-sim.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ceeb5f59af63e70ae9ef131844c2cf123b1bbec75f6866e633f7f3efee0bada7
+size 127044627

uvronnx/src/__pycache__/config.cpython-38.pyc ADDED Viewed

Binary file (939 Bytes). View file

uvronnx/src/__pycache__/ortInferSession.cpython-38.pyc ADDED Viewed

Binary file (3.52 kB). View file

uvronnx/src/config.py ADDED Viewed

	@@ -0,0 +1,65 @@

+# -*- coding:utf-8 -*-
+# @FileName  :config.py
+# @Time      :2023/8/2 10:54
+# @Author    :lovemefan
+# @Email     :lovemefan@outlook.com
# Multi-band spectrogram layout used by the UVR spectrogram helpers.
#
# "band" maps a 1-based band index (low band first, high band last) to:
#   sr                      sample rate the band is processed at
#   hl                      STFT hop length
#   n_fft                   STFT window size
#   crop_start / crop_stop  slice of STFT bins copied into the combined spectrogram
#   hpf_* / lpf_*           bin ranges used by the high-pass / low-pass fades
#   res_type                resampler name handed to librosa
#
# The crop widths of the four bands (85 + 83 + 199 + 305) add up to "bins".
UVR_CONFIG = dict(
    bins=672,
    unstable_bins=8,
    reduction_bins=637,
    band={
        1: dict(
            sr=7350, hl=80, n_fft=640,
            crop_start=0, crop_stop=85,
            lpf_start=25, lpf_stop=53,
            res_type="polyphase",
        ),
        2: dict(
            sr=7350, hl=80, n_fft=320,
            crop_start=4, crop_stop=87,
            hpf_start=25, hpf_stop=12,
            lpf_start=31, lpf_stop=62,
            res_type="polyphase",
        ),
        3: dict(
            sr=14700, hl=160, n_fft=512,
            crop_start=17, crop_stop=216,
            hpf_start=48, hpf_stop=24,
            lpf_start=139, lpf_stop=210,
            res_type="polyphase",
        ),
        4: dict(
            sr=44100, hl=480, n_fft=960,
            crop_start=78, crop_stop=383,
            hpf_start=130, hpf_stop=86,
            res_type="kaiser_fast",
        ),
    },
    sr=44100,
    pre_filter_start=668,
    pre_filter_stop=672,
    mid_side=False,
    mid_side_b=False,
    mid_side_b2=False,
    stereo_w=False,
    reverse=False,
)

uvronnx/src/ortInferSession.py ADDED Viewed

	@@ -0,0 +1,90 @@

+# -*- coding:utf-8 -*-
+# @FileName  :ortInferSession.py
+# @Time      :2023/8/3 00:20
+# @Author    :lovemefan
+# @Email     :lovemefan@outlook.com
+from pathlib import Path
+from typing import List, Union
+from uvronnx.src.utils.logger import logger
+import numpy as np
+from onnxruntime import (GraphOptimizationLevel, InferenceSession,
+                         SessionOptions, get_available_providers, get_device)
class UVROrtInferSession:
    """Wrapper around an onnxruntime ``InferenceSession`` for the UVR model.

    Args:
        config: Mapping with the following keys:
            ``model_path``: path of the ``.onnx`` file (required).
            ``use_cuda``: request the CUDA execution provider
                (optional, treated as ``False`` when missing).
            ``CUDAExecutionProvider``: provider options used when CUDA is
                selected (optional, defaults to no options).

    Raises:
        FileNotFoundError: ``model_path`` does not exist.
        FileExistsError: ``model_path`` exists but is not a regular file.
    """

    def __init__(self, config):
        sess_opt = SessionOptions()
        sess_opt.log_severity_level = 4
        sess_opt.enable_cpu_mem_arena = False
        sess_opt.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL

        cuda_ep = "CUDAExecutionProvider"
        cpu_ep = "CPUExecutionProvider"
        cpu_provider_options = {
            "arena_extend_strategy": "kSameAsRequested",
        }
        use_cuda = bool(config.get("use_cuda", False))

        # CUDA goes first when requested and usable; CPU is always appended
        # so the session has a provider to fall back to.
        EP_list = []
        if (
            use_cuda
            and get_device() == "GPU"
            and cuda_ep in get_available_providers()
        ):
            EP_list = [(cuda_ep, config.get(cuda_ep, {}))]
        EP_list.append((cpu_ep, cpu_provider_options))

        self._verify_model(config["model_path"])
        logger.info(f"Loading onnx model at {str(config['model_path'])}")
        self.session = InferenceSession(
            str(config["model_path"]), sess_options=sess_opt, providers=EP_list
        )

        if use_cuda and cuda_ep not in self.session.get_providers():
            # Only the message is passed: any extra positional argument is
            # treated by ``logging`` as a %-format argument, and this message
            # has no placeholders, so formatting would fail and the warning
            # would never be shown.
            logger.warning(
                f"{cuda_ep} is not available for current env, "
                f"the inference part is automatically shifted to be "
                f"executed under {cpu_ep}.\n "
                "Please ensure the installed onnxruntime-gpu version"
                " matches your cuda and cudnn version, "
                "you can check their relations from the offical web site: "
                "https://onnxruntime.ai/docs/execution-providers/CUDA-ExecutionProvider.html"
            )

    def __call__(
        self, input_chunk: np.ndarray
    ) -> np.ndarray:
        """Run the model on ``input_chunk`` (fed as ``"input"``) and return its first output."""
        input_dict = {
            "input": input_chunk,
        }
        return self.session.run(None, input_dict)[0]

    def get_input_names(
        self,
    ):
        """Return the names of the model inputs."""
        return [v.name for v in self.session.get_inputs()]

    def get_output_names(
        self,
    ):
        """Return the names of the model outputs."""
        return [v.name for v in self.session.get_outputs()]

    def _load_metadata(self):
        """Read the model's custom metadata map and cache it on ``self.meta_dict``."""
        self.meta_dict = self.session.get_modelmeta().custom_metadata_map
        return self.meta_dict

    def get_character_list(self, key: str = "character"):
        """Return the metadata entry ``key`` split into lines.

        The metadata is loaded on demand, so this no longer depends on
        ``have_key`` having been called first.

        Raises:
            KeyError: ``key`` is not present in the model metadata.
        """
        meta = getattr(self, "meta_dict", None)
        if meta is None:
            meta = self._load_metadata()
        return meta[key].splitlines()

    def have_key(self, key: str = "character") -> bool:
        """Return True when the model metadata contains ``key`` (metadata is re-read)."""
        return key in self._load_metadata()

    @staticmethod
    def _verify_model(model_path):
        """Raise unless ``model_path`` points at an existing regular file."""
        model_path = Path(model_path)
        if not model_path.exists():
            raise FileNotFoundError(f"{model_path} does not exists.")
        if not model_path.is_file():
            raise FileExistsError(f"{model_path} is not a file.")

uvronnx/src/utils/AudioHelper.py ADDED Viewed

	@@ -0,0 +1,87 @@

+# -*- coding:utf-8 -*-
+# @FileName  :AudioHelper.py
+# @Time      :2023/8/3 00:34
+# @Author    :lovemefan
+# @Email     :lovemefan@outlook.com
+import array
+import math
+import struct
+import numpy as np
+from numpy import where
+class AudioReader:
+    """
+    read audio from sanic request
+    """
+    def __init__(self):
+        pass
+    @staticmethod
+    def get_info(self, path: str):
+        with open(path, "rb") as f:
+            (
+                name,
+                data_lengths,
+                _,
+                _,
+                _,
+                _,
+                channels,
+                sample_rate,
+                bit_rate,
+                block_length,
+                sample_bit,
+                _,
+                pcm_length,
+            ) = struct.unpack_from("<4sL4s4sLHHLLHH4sL", f.read(44))
+            assert sample_rate == 16000, "sample rate must be 16000"
+            nframes = pcm_length // (channels * 2)
+        return nframes
+    @staticmethod
+    def read_wav_bytes(data: bytes):
+        """
+        convert bytes into array of pcm_s16le data
+        :param data: PCM format bytes
+        :return:
+        """
+        # header of wav file
+        info = data[:44]
+        frames = data[44:]
+        (
+            name,
+            data_lengths,
+            _,
+            _,
+            _,
+            _,
+            channels,
+            sample_rate,
+            bit_rate,
+            block_length,
+            sample_bit,
+            _,
+            pcm_length,
+        ) = struct.unpack_from("<4sL4s4sLHHLLHH4sL", info)
+        # shortArray each element is 16bit
+        data = AudioReader.read_pcm_byte(frames)
+        return data, sample_rate
+    @staticmethod
+    def read_wav_file(audio_path: str):
+        with open(audio_path, "rb") as f:
+            data = f.read()
+        return AudioReader.read_wav_bytes(data)
+    @staticmethod
+    def read_pcm_byte(data: bytes):
+        short_array = array.array("h")
+        short_array.frombytes(data)
+        data = np.array(short_array, dtype="float16") / (1 << 15)
+        return data

uvronnx/src/utils/__pycache__/AudioHelper.cpython-38.pyc ADDED Viewed

Binary file (2.09 kB). View file

uvronnx/src/utils/__pycache__/logger.cpython-38.pyc ADDED Viewed

Binary file (9.73 kB). View file

uvronnx/src/utils/__pycache__/spec_utils.cpython-38.pyc ADDED Viewed

Binary file (10.3 kB). View file

uvronnx/src/utils/logger.py ADDED Viewed

	@@ -0,0 +1,299 @@

+# -*- coding:utf-8 -*-
+# @FileName  :logger.py
+# @Time      :2023/8/1 10:44
+# @Author    :lovemefan
+# @Email     :lovemefan@outlook.com
+"""LOGGER Module"""
+import logging
+import logging.config
+import logging.handlers
+import os
+import sys
+from functools import wraps
+from typing import Dict, List, Tuple, Union
# Names of the loggers that get_logger() has already configured.
logger_list = []
# Accepted level names, lowest severity first.
LEVEL = ('DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL')
_LOG_FILE_DIR = '~/.cache/speech-webui/'
# The base directory can be overridden with the LOCAL_DEFAULT_PATH environment
# variable. "~" is expanded here: os.path.join/realpath do not do it, so an
# unexpanded value would create a literal "~" directory under the current
# working directory.
LOCAL_DEFAULT_LOG_FILE_DIR = os.path.join(
    os.path.expanduser(os.getenv("LOCAL_DEFAULT_PATH", _LOG_FILE_DIR)), 'log')
DEFAULT_FILEHANDLER_FORMAT = '[%(levelname)s] %(asctime)s ' \
                             '[%(pathname)s:%(lineno)d] %(funcName)s: %(message)s'
DEFAULT_STDOUT_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
DEFAULT_REDIRECT_FILE_NAME = 'mindspore.log'
class StreamRedirector:
    """Redirect one stream's file descriptor to another stream's.

    Usable as a decorator or as a context manager. Both streams must expose a
    real ``fileno()``.
    """

    def __init__(self, source_stream, target_stream):
        """Remember both streams and keep a duplicate of the source descriptor.

        Args:
            source_stream: Stream whose descriptor gets redirected.
            target_stream: Stream that receives the output while redirected.
        """
        super(StreamRedirector, self).__init__()
        self.source_stream = source_stream
        self.target_stream = target_stream
        # Duplicate kept so stop() can point the source descriptor back.
        self.save_source_stream_fd = os.dup(self.source_stream.fileno())

    def __call__(self, func):
        """Decorate ``func`` so it runs with the redirection active.

        The wrapped function's return value is passed through, and the
        redirection is undone even when ``func`` raises.
        """
        @wraps(func)
        def wrapper(*args, **kwargs):
            self.start()
            try:
                return func(*args, **kwargs)
            finally:
                self.stop()
        return wrapper

    def __enter__(self):
        self.start()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.stop()

    def start(self):
        """Flush the source stream and point its descriptor at the target."""
        self.source_stream.flush()
        os.dup2(self.target_stream.fileno(), self.source_stream.fileno())

    def stop(self):
        """Flush pending output and restore the source stream's descriptor."""
        self.source_stream.flush()
        os.dup2(self.save_source_stream_fd, self.source_stream.fileno())
        self.target_stream.flush()
def validate_nodes_devices_input(var_name: str, var):
    """Validate a nodes/devices argument.

    Accepted values are ``None``, a dict, or a list/tuple whose elements are
    all ints.

    Args:
        var_name (str): Name used in the error message.
        var: Value to check.

    Raises:
        TypeError: ``var`` has another type, or a list/tuple element is not
            an int.
    """
    if var is None:
        return
    if not isinstance(var, (list, tuple, dict)):
        raise TypeError(
            'The value of {} can be None or a value of type tuple, list, or dict.'.format(var_name))
    if isinstance(var, dict):
        return
    if any(not isinstance(element, int) for element in var):
        raise TypeError('The elements of a variable of type list or tuple must be of type int.')
def validate_level(var_name: str, var):
    """Check that ``var`` is one of the level names listed in ``LEVEL``.

    Args:
        var_name (str): Name used in the error message.
        var: Value to check.

    Raises:
        TypeError: ``var`` is not a str.
        ValueError: ``var`` is a str that is not in ``LEVEL``.
    """
    if isinstance(var, str):
        if var in LEVEL:
            return
        raise ValueError('{}={} needs to be in {}'.format(var_name, var, LEVEL))
    raise TypeError('The format of {} must be of type str.'.format(var_name))
def validate_std_input_format(to_std: bool, stdout_nodes: Union[List, Tuple, None],
                              stdout_devices: Union[List, Tuple, None], stdout_level: str):
    """Validate the stdout-related arguments of ``get_logger``.

    Raises:
        TypeError: ``to_std`` is not a bool, or one of the delegated checks
            rejects a value's type.
        ValueError: ``stdout_level`` is not a known level name.
    """
    if isinstance(to_std, bool) is False:
        raise TypeError('The format of the to_std must be of type bool.')
    for label, value in (('stdout_nodes', stdout_nodes), ('stdout_devices', stdout_devices)):
        validate_nodes_devices_input(label, value)
    validate_level('stdout_level', stdout_level)
def validate_file_input_format(file_level: Union[List[str], Tuple[str]], file_save_dir: str, append_rank_dir: bool,
                               file_name: Union[List[str], Tuple[str]]):
    """Validate the file-related arguments of ``get_logger``.

    The type of ``file_name`` is checked before its length is compared with
    ``file_level``, so a wrong type is reported with the dedicated message
    instead of failing inside ``len()`` (or slipping past it for a string).

    Args:
        file_level: Level names, one per log file.
        file_save_dir: Directory for the log files.
        append_rank_dir: Flag; must be a bool.
        file_name: Log file names, one per entry of ``file_level``.

    Raises:
        TypeError: An argument (or an element of ``file_name``) has the
            wrong type.
        ValueError: A level name is unknown, or ``file_level`` and
            ``file_name`` differ in length.
    """
    if not isinstance(file_level, (tuple, list)):
        raise TypeError('The value of file_level should be list or a tuple.')
    for level in file_level:
        validate_level('level in file_level', level)
    if not isinstance(file_name, (tuple, list)):
        raise TypeError('The value of file_name should be list or a tuple.')
    if len(file_level) != len(file_name):
        raise ValueError('The length of file_level and file_name should be equal.')
    if not isinstance(file_save_dir, str):
        raise TypeError('The value of file_save_dir should be a value of type str.')
    if not isinstance(append_rank_dir, bool):
        raise TypeError('The value of append_rank_dir should be a value of type bool.')
    for name in file_name:
        if not isinstance(name, str):
            raise TypeError('The value of name in file_name should be a value of type str.')
+def _convert_level(level: str) -> int:
+    """Convert the format of the log to logging level.
+    Args:
+        level (str): User log level.
+    Returns:
+        level (str): Logging level.
+    """
+    level_convert = {
+        'DEBUG': logging.DEBUG,
+        'INFO': logging.INFO,
+        'WARNING': logging.WARNING,
+        'ERROR': logging.ERROR,
+        'CRITICAL': logging.CRITICAL
+    }
+    level = level_convert.get(level, logging.INFO)
+    return level
def get_logger(logger_name: str = 'uvr-onnx', **kwargs) -> logging.Logger:
    """Return the logger ``logger_name``, configuring it on first use.

    A name that was configured before is returned as-is; ``kwargs`` are then
    ignored.

    Args:
        logger_name (str): Logger name.
        kwargs (dict): Other input.
            to_std (bool): If set to True, output the log to stdout.
            stdout_nodes / stdout_devices: Only validated here. When the
                STDOUT_DEVICES environment variable is set (for example
                "0,1" or "(0, 1)") it takes precedence over stdout_devices.
            stdout_level (str): The level of the log output to stdout.
                The options are DEBUG, INFO, WARNING, ERROR, CRITICAL.
            stdout_format (str): Log format for stdout.
            file_level (list[str] or tuple[str]): One level per log file,
                eg: ['INFO', 'ERROR']. Must have as many entries as file_name.
            file_save_dir (str): The folder where the log files are stored.
                A leading "~" is expanded. Defaults to
                LOCAL_DEFAULT_LOG_FILE_DIR.
            append_rank_dir (bool): Only validated here.
            file_name (list[str] or tuple[str]): Output file names.
            max_file_size (int): The maximum size of a single log file. Unit: MB.
            max_num_of_files (int): Number of rotated backups kept per file.

    Returns:
        logger (logging.Logger): Logger.

    Raises:
        TypeError, ValueError: An argument fails validation.
    """
    mf_logger = logging.getLogger(logger_name)
    if logger_name in logger_list:
        return mf_logger

    def get_stdout_devices():
        devices = os.getenv("STDOUT_DEVICES")
        if not devices:
            return kwargs.get('stdout_devices', None)
        if devices.startswith(("(", "[")) and devices.endswith((")", "]")):
            devices = devices[1:-1]
        return tuple(int(item.strip()) for item in devices.split(","))

    to_std = kwargs.get('to_std', True)
    stdout_nodes = kwargs.get('stdout_nodes', None)
    stdout_devices = get_stdout_devices()
    stdout_level = kwargs.get('stdout_level', 'INFO')
    stdout_format = kwargs.get('stdout_format', '')
    file_level = kwargs.get('file_level', ('INFO', 'ERROR'))
    file_save_dir = kwargs.get('file_save_dir', '')
    append_rank_dir = kwargs.get('append_rank_dir', True)
    file_name = kwargs.get('file_name', ('info.log', 'error.log'))
    max_file_size = kwargs.get('max_file_size', 50)
    max_num_of_files = kwargs.get('max_num_of_files', 5)

    validate_std_input_format(to_std, stdout_nodes, stdout_devices, stdout_level)
    validate_file_input_format(file_level, file_save_dir, append_rank_dir, file_name)

    if to_std:
        stream_handler = logging.StreamHandler(sys.stdout)
        stream_handler.setLevel(_convert_level(stdout_level))
        stream_handler.setFormatter(logging.Formatter(stdout_format or DEFAULT_STDOUT_FORMAT))
        mf_logger.addHandler(stream_handler)

    # realpath() does not expand "~"; without expanduser() a "~/..." directory
    # would be created literally below the current working directory.
    file_save_dir = os.path.expanduser(file_save_dir or LOCAL_DEFAULT_LOG_FILE_DIR)
    file_path = []
    for name in file_name:
        path = os.path.realpath(os.path.join(file_save_dir, name))
        os.makedirs(os.path.dirname(path), exist_ok=True)
        file_path.append(path)

    max_bytes = max_file_size * 1024 * 1024  # max_file_size is given in MB
    file_formatter = logging.Formatter(DEFAULT_FILEHANDLER_FORMAT)
    for path, level in zip(file_path, file_level):
        file_handler = logging.handlers.RotatingFileHandler(filename=path,
                                                            maxBytes=max_bytes,
                                                            backupCount=max_num_of_files)
        file_handler.setLevel(_convert_level(level))
        file_handler.setFormatter(file_formatter)
        mf_logger.addHandler(file_handler)

    mf_logger.setLevel(_convert_level('INFO'))
    mf_logger.propagate = False
    logger_list.append(logger_name)
    return mf_logger
+class _LogActionOnce:
+    """
+    A wrapper for modify the warning logging to an empty function. This is used when we want to only log
+    once to avoid the repeated logging.
+    Args:
+        logger (logging): The logger object.
+    """
+    is_logged = dict()
+    def __init__(self, m_logger, key, no_warning=False):
+        self.logger = m_logger
+        self.key = key
+        self.no_warning = no_warning
+    def __call__(self, func):
+        def wrapper(*args, **kwargs):
+            if not hasattr(self.logger, 'warning'):
+                return func(*args, **kwargs)
+            old_func = self.logger.warning
+            if self.no_warning or self.key in _LogActionOnce.is_logged:
+                self.logger.warning = lambda x: x
+            else:
+                _LogActionOnce.is_logged[self.key] = True
+            res = func(*args, **kwargs)
+            if hasattr(self.logger, 'warning'):
+                self.logger.warning = old_func
+            return res
+        return wrapper
# Module-level logger shared by the package; configured on import.
logger = get_logger(logger_name='uvr-onnx')

uvronnx/src/utils/spec_utils.py ADDED Viewed

	@@ -0,0 +1,388 @@

+# -*- coding:utf-8 -*-
+# @FileName  :spec_utils.py
+# @Time      :2023/8/2 17:16
+# @Author    :lovemefan
+# @Email     :lovemefan@outlook.com
+import os, librosa
+import numpy as np
+import json, math, hashlib
def crop_center(h1, h2):
    """Center-crop ``h1`` along axis 3 to the length of ``h2``.

    Uses ``.shape``, which both numpy arrays and torch tensors provide
    (``.size()`` is callable only on tensors).

    Args:
        h1: 4-D array/tensor to crop.
        h2: 4-D array/tensor supplying the target length of axis 3.

    Returns:
        ``h1`` itself when the lengths already match, otherwise the centered
        slice of ``h1``.

    Raises:
        ValueError: ``h1`` is shorter than ``h2`` along axis 3.
    """
    h1_len = h1.shape[3]
    h2_len = h2.shape[3]

    if h1_len == h2_len:
        return h1
    if h1_len < h2_len:
        raise ValueError('h1_shape[3] must be greater than h2_shape[3]')

    s_time = (h1_len - h2_len) // 2
    e_time = s_time + h2_len
    return h1[:, :, :, s_time:e_time]
def wave_to_spectrogram(wave, hop_length, n_fft, mid_side=False, mid_side_b2=False, reverse=False):
    """Compute the STFT of both channels of ``wave``.

    Exactly one channel preparation is applied, checked in this order:
    ``reverse`` (flip each channel), ``mid_side`` ((L + R) / 2 and L - R),
    ``mid_side_b2`` (R + 0.5 L and L - 0.5 R), otherwise the channels are
    used unchanged.

    Args:
        wave: Indexable with two channels, ``wave[0]`` and ``wave[1]``.
        hop_length: STFT hop length.
        n_fft: STFT window size.
        mid_side, mid_side_b2, reverse: Channel preparation flags.

    Returns:
        Fortran-ordered array stacking the two channel spectrograms.
    """
    if reverse:
        wave_left = np.flip(np.asfortranarray(wave[0]))
        wave_right = np.flip(np.asfortranarray(wave[1]))
    elif mid_side:
        wave_left = np.asfortranarray(np.add(wave[0], wave[1]) / 2)
        wave_right = np.asfortranarray(np.subtract(wave[0], wave[1]))
    elif mid_side_b2:
        wave_left = np.asfortranarray(np.add(wave[1], wave[0] * .5))
        wave_right = np.asfortranarray(np.subtract(wave[0], wave[1] * .5))
    else:
        wave_left = np.asfortranarray(wave[0])
        wave_right = np.asfortranarray(wave[1])

    # n_fft is passed by keyword: librosa >= 0.10 accepts only the signal
    # positionally.
    spec_left = librosa.stft(wave_left, n_fft=n_fft, hop_length=hop_length)
    spec_right = librosa.stft(wave_right, n_fft=n_fft, hop_length=hop_length)

    return np.asfortranarray([spec_left, spec_right])
def wave_to_spectrogram_mt(wave, hop_length, n_fft, mid_side=False, mid_side_b2=False, reverse=False):
    """Two-thread variant of ``wave_to_spectrogram``.

    The left channel's STFT runs on a worker thread while the right channel
    is processed on the calling thread. The worker's result is handed back
    through a local container rather than a module-level global, so
    overlapping calls cannot see each other's data, and an error raised in
    the worker is re-raised here instead of being lost.

    Args:
        wave: Indexable with two channels, ``wave[0]`` and ``wave[1]``.
        hop_length: STFT hop length.
        n_fft: STFT window size.
        mid_side, mid_side_b2, reverse: Channel preparation flags, checked in
            the order reverse, mid_side, mid_side_b2.

    Returns:
        Fortran-ordered array stacking the two channel spectrograms.
    """
    import threading

    if reverse:
        wave_left = np.flip(np.asfortranarray(wave[0]))
        wave_right = np.flip(np.asfortranarray(wave[1]))
    elif mid_side:
        wave_left = np.asfortranarray(np.add(wave[0], wave[1]) / 2)
        wave_right = np.asfortranarray(np.subtract(wave[0], wave[1]))
    elif mid_side_b2:
        wave_left = np.asfortranarray(np.add(wave[1], wave[0] * .5))
        wave_right = np.asfortranarray(np.subtract(wave[0], wave[1] * .5))
    else:
        wave_left = np.asfortranarray(wave[0])
        wave_right = np.asfortranarray(wave[1])

    outcome = {}

    def run_thread(**kwargs):
        try:
            outcome['spec'] = librosa.stft(**kwargs)
        except BaseException as exc:  # handed to the caller below
            outcome['error'] = exc

    thread = threading.Thread(target=run_thread, kwargs={'y': wave_left, 'n_fft': n_fft, 'hop_length': hop_length})
    thread.start()
    try:
        spec_right = librosa.stft(wave_right, n_fft=n_fft, hop_length=hop_length)
    finally:
        thread.join()

    if 'error' in outcome:
        raise outcome['error']

    return np.asfortranarray([outcome['spec'], spec_right])
def combine_spectrograms(specs, param):
    """Stack the cropped bins of every band into one spectrogram.

    Args:
        specs: Mapping from band index (1..N) to an array indexed as
            ``[channel, bin, frame]``.
        param: Configuration dict providing ``bins``, ``band`` (per-band
            ``crop_start`` / ``crop_stop``), ``pre_filter_start`` and
            ``pre_filter_stop``.

    Returns:
        Fortran-ordered complex64 array of shape ``(2, bins + 1, frames)``
        where ``frames`` is the shortest frame count among ``specs``.

    Raises:
        ValueError: The cropped bands add up to more than ``bins`` rows.
    """
    bands = param['band']
    band_count = len(bands)
    frames = min(specs[key].shape[2] for key in specs)
    combined = np.zeros(shape=(2, param['bins'] + 1, frames), dtype=np.complex64)

    row = 0
    for d in range(1, band_count + 1):
        lo = bands[d]['crop_start']
        hi = bands[d]['crop_stop']
        height = hi - lo
        combined[:, row:row + height, :frames] = specs[d][:, lo:hi, :frames]
        row += height

    if row > param['bins']:
        raise ValueError('Too much bins')

    # Low-pass fade over the bins between pre_filter_start and pre_filter_stop.
    start = param['pre_filter_start']
    if start > 0:
        if band_count == 1:
            combined = fft_lp_filter(combined, start, param['pre_filter_stop'])
        else:
            previous_gain = 1
            for b in range(start + 1, param['pre_filter_stop']):
                # Each gain depends on the gain applied to the previous bin.
                gain = math.pow(10, -(b - start) * (3.5 - previous_gain) / 20.0)
                previous_gain = gain
                combined[:, b, :] *= gain

    return np.asfortranarray(combined)
def spectrogram_to_image(spec, mode='magnitude'):
    """Render a spectrogram as an 8-bit image array.

    Args:
        spec: Real or complex array. The caller's array is never modified.
        mode: ``'magnitude'`` (log10 of the squared magnitude) or ``'phase'``
            (angle for complex input, the values themselves for real input).

    Returns:
        uint8 array scaled so the minimum maps to 0 and the maximum to 255
        (all zeros for constant input). For 3-D input the first axis is moved
        last and an extra leading channel holding the per-pixel maximum is
        added.

    Raises:
        ValueError: ``mode`` is not one of the two supported values.
    """
    if mode == 'magnitude':
        magnitude = np.abs(spec) if np.iscomplexobj(spec) else spec
        y = np.log10(magnitude ** 2 + 1e-8)
    elif mode == 'phase':
        # Copy real input: the scaling below works in place and must not
        # touch the caller's data.
        y = np.angle(spec) if np.iscomplexobj(spec) else np.array(spec)
    else:
        raise ValueError("mode must be 'magnitude' or 'phase', got {!r}".format(mode))

    y -= y.min()
    peak = y.max()
    if peak > 0:  # constant input would otherwise divide by zero
        y *= 255 / peak
    img = np.uint8(y)

    if y.ndim == 3:
        img = img.transpose(1, 2, 0)
        img = np.concatenate([
            np.max(img, axis=2, keepdims=True), img
        ], axis=2)

    return img
def reduce_vocal_aggressively(X, y, softmask):
    """Attenuate ``y`` wherever the residual ``X - y`` is louder than ``y``.

    Args:
        X: Complex spectrogram of the mixture.
        y: Complex spectrogram of the estimate; same shape as ``X``.
        softmask: Scale applied to the residual magnitude before subtracting.

    Returns:
        Complex array with the reduced magnitude (clipped at 0) and the
        phase of ``y``.
    """
    estimate_mag = np.abs(y)
    residual_mag = np.abs(X - y)
    residual_dominates = residual_mag > estimate_mag
    reduced_mag = np.clip(estimate_mag - residual_mag * residual_dominates * softmask, 0, np.inf)
    return reduced_mag * np.exp(1.j * np.angle(y))
def mask_silence(mag, ref, thres=0.2, min_range=64, fade_size=32):
    """Add ``ref`` back into ``mag`` over long quiet stretches of ``ref``.

    A frame is quiet when the mean of ``ref`` over the first two axes is
    below ``thres``. Runs of quiet frames longer than ``min_range`` receive
    ``ref``, faded in and out over ``fade_size`` frames.

    Args:
        mag: Array indexed as ``[a, b, frame]``; not modified.
        ref: Array with the same shape as ``mag``.
        thres: Quietness threshold.
        min_range: Minimum run length (in frames) that gets processed.
        fade_size: Length of the linear fades at both ends of a run.

    Returns:
        A modified copy of ``mag``; an unmodified copy when ``ref`` has no
        quiet frame.

    Raises:
        ValueError: ``min_range < fade_size * 2``.
    """
    if min_range < fade_size * 2:
        raise ValueError('min_range must be >= fade_area * 2')

    mag = mag.copy()

    idx = np.where(ref.mean(axis=(0, 1)) < thres)[0]
    if idx.size == 0:
        # No quiet frame: idx[0] / idx[-1] below would raise IndexError.
        return mag

    # Split the quiet frame indices into runs of consecutive frames.
    starts = np.insert(idx[np.where(np.diff(idx) != 1)[0] + 1], 0, idx[0])
    ends = np.append(idx[np.where(np.diff(idx) != 1)[0]], idx[-1])
    uninformative = np.where(ends - starts > min_range)[0]
    if len(uninformative) > 0:
        starts = starts[uninformative]
        ends = ends[uninformative]
        old_e = None
        for s, e in zip(starts, ends):
            if old_e is not None and s - old_e < fade_size:
                s = old_e - fade_size * 2

            if s != 0:
                weight = np.linspace(0, 1, fade_size)
                mag[:, :, s:s + fade_size] += weight * ref[:, :, s:s + fade_size]
            else:
                # Run starts at frame 0: no fade-in, shift s so the full-copy
                # slice below begins at frame 0.
                s -= fade_size

            if e != mag.shape[2]:
                weight = np.linspace(1, 0, fade_size)
                mag[:, :, e - fade_size:e] += weight * ref[:, :, e - fade_size:e]
            else:
                e += fade_size

            mag[:, :, s + fade_size:e - fade_size] += ref[:, :, s + fade_size:e - fade_size]
            old_e = e

    return mag
def align_wave_head_and_tail(a, b):
    """Trim two 2-D waves to the same length along axis 1.

    Only axis 1 is trimmed. The previous slicing (``a[:l, :l]``) also cut
    axis 0, which dropped rows whenever the common length was smaller than
    the number of rows.

    Args:
        a: 2-D array; ``a[0].size`` gives its length.
        b: 2-D array; ``b[0].size`` gives its length.

    Returns:
        Tuple ``(a, b)`` cut to the shorter of the two lengths.
    """
    length = min(a[0].size, b[0].size)
    return a[:, :length], b[:, :length]
def cache_or_load(mix_path, inst_path, mp):
    """Return the combined spectrograms of a mix/instrumental pair, cached on disk.

    Results are stored as ``.npy`` files under ``cache/mph<sha1>/`` where the
    hash is taken over the JSON dump of ``mp.param``; existing cache files
    are loaded instead of recomputed.

    Args:
        mix_path: Audio file of the mixture.
        inst_path: Audio file of the instrumental.
        mp: Object whose ``param`` attribute is the band configuration dict.

    Returns:
        Tuple ``(X_spec_m, y_spec_m)`` of combined spectrograms.

    Raises:
        ValueError: The two combined spectrograms differ in shape.
    """
    mix_basename = os.path.splitext(os.path.basename(mix_path))[0]
    inst_basename = os.path.splitext(os.path.basename(inst_path))[0]

    cache_dir = 'mph{}'.format(hashlib.sha1(json.dumps(mp.param, sort_keys=True).encode('utf-8')).hexdigest())
    mix_cache_dir = os.path.join('cache', cache_dir)
    inst_cache_dir = os.path.join('cache', cache_dir)

    os.makedirs(mix_cache_dir, exist_ok=True)
    os.makedirs(inst_cache_dir, exist_ok=True)

    mix_cache_path = os.path.join(mix_cache_dir, mix_basename + '.npy')
    inst_cache_path = os.path.join(inst_cache_dir, inst_basename + '.npy')

    if os.path.exists(mix_cache_path) and os.path.exists(inst_cache_path):
        X_spec_m = np.load(mix_cache_path)
        y_spec_m = np.load(inst_cache_path)
    else:
        X_wave, y_wave, X_spec_s, y_spec_s = {}, {}, {}, {}

        # Walk from the highest band down: each lower band is resampled from
        # the band above it.
        for d in range(len(mp.param['band']), 0, -1):
            bp = mp.param['band'][d]

            # librosa arguments are passed by keyword: librosa >= 0.10 accepts
            # only the first argument positionally.
            if d == len(mp.param['band']):  # high-end band
                X_wave[d], _ = librosa.load(
                    mix_path, sr=bp['sr'], mono=False, dtype=np.float32, res_type=bp['res_type'])
                y_wave[d], _ = librosa.load(
                    inst_path, sr=bp['sr'], mono=False, dtype=np.float32, res_type=bp['res_type'])
            else:  # lower bands
                upper_sr = mp.param['band'][d + 1]['sr']
                X_wave[d] = librosa.resample(X_wave[d + 1], orig_sr=upper_sr, target_sr=bp['sr'],
                                             res_type=bp['res_type'])
                y_wave[d] = librosa.resample(y_wave[d + 1], orig_sr=upper_sr, target_sr=bp['sr'],
                                             res_type=bp['res_type'])

            X_wave[d], y_wave[d] = align_wave_head_and_tail(X_wave[d], y_wave[d])

            X_spec_s[d] = wave_to_spectrogram(X_wave[d], bp['hl'], bp['n_fft'], mp.param['mid_side'],
                                              mp.param['mid_side_b2'], mp.param['reverse'])
            y_spec_s[d] = wave_to_spectrogram(y_wave[d], bp['hl'], bp['n_fft'], mp.param['mid_side'],
                                              mp.param['mid_side_b2'], mp.param['reverse'])

        del X_wave, y_wave

        # combine_spectrograms indexes its second argument as a dict
        # (param['bins'], param['band']), so it needs mp.param, not mp.
        X_spec_m = combine_spectrograms(X_spec_s, mp.param)
        y_spec_m = combine_spectrograms(y_spec_s, mp.param)

        if X_spec_m.shape != y_spec_m.shape:
            raise ValueError('The combined spectrograms are different: ' + mix_path)

        np.save(mix_cache_path, X_spec_m)
        np.save(inst_cache_path, y_spec_m)

    return X_spec_m, y_spec_m
def spectrogram_to_wave(spec, hop_length, mid_side, mid_side_b2, reverse):
    """Invert a two-channel spectrogram back to a two-channel wave.

    Args:
        spec: Indexable with two channel spectrograms, ``spec[0]`` and
            ``spec[1]``.
        hop_length: Hop length passed to the inverse STFT.
        mid_side, mid_side_b2, reverse: Select how the two inverted channels
            are recombined; checked in the order reverse, mid_side,
            mid_side_b2, with the plain channels as fallback.

    Returns:
        Fortran-ordered array holding the two output channels.
    """
    first, second = (
        librosa.istft(np.asfortranarray(spec[channel]), hop_length=hop_length)
        for channel in (0, 1)
    )

    if reverse:
        channels = [np.flip(first), np.flip(second)]
    elif mid_side:
        channels = [np.add(first, second / 2), np.subtract(first, second / 2)]
    elif mid_side_b2:
        channels = [np.add(second / 1.25, .4 * first), np.subtract(first / 1.25, .4 * second)]
    else:
        channels = [first, second]

    return np.asfortranarray(channels)
def cmb_spectrogram_to_wave(spec_m, param, extra_bins_h=None, extra_bins=None):
    """Convert a combined multi-band spectrogram back into a wave.

    Bands are processed from 1 upwards. Each band's rows are copied out of
    ``spec_m`` into a zero-initialised per-band spectrogram, filtered,
    inverted, added to the wave accumulated so far, and (except for the last
    band) resampled to the next band's sample rate.

    The per-band buffer is created with ``np.zeros``: only the
    ``crop_start:crop_stop`` rows are assigned, and not every remaining row
    is cleared by the filters, so an uninitialised buffer could leak
    arbitrary memory content into the output.

    Args:
        spec_m: Combined spectrogram indexed as ``[channel, bin, frame]``.
        param: Configuration dict (``band``, ``mid_side``, ``mid_side_b2``,
            ``reverse``).
        extra_bins_h: Optional number of bins to overwrite at the top of the
            last band.
        extra_bins: Source of those bins; used only with ``extra_bins_h``.

    Returns:
        The transposed result wave (``wave.T``).
    """
    bands_n = len(param['band'])
    offset = 0

    for d in range(1, bands_n + 1):
        bp = param['band'][d]
        spec_s = np.zeros(shape=(2, bp['n_fft'] // 2 + 1, spec_m.shape[2]), dtype=complex)
        h = bp['crop_stop'] - bp['crop_start']
        spec_s[:, bp['crop_start']:bp['crop_stop'], :] = spec_m[:, offset:offset + h, :]

        offset += h
        if d == bands_n:  # higher
            if extra_bins_h:  # if --high_end_process bypass
                max_bin = bp['n_fft'] // 2
                spec_s[:, max_bin - extra_bins_h:max_bin, :] = extra_bins[:, :extra_bins_h, :]
            if bp['hpf_start'] > 0:
                spec_s = fft_hp_filter(spec_s, bp['hpf_start'], bp['hpf_stop'] - 1)
            band_wave = spectrogram_to_wave(spec_s, bp['hl'], param['mid_side'], param['mid_side_b2'],
                                            param['reverse'])
            if bands_n == 1:
                wave = band_wave
            else:
                wave = np.add(wave, band_wave)
        else:
            sr = param['band'][d + 1]['sr']
            if d == 1:  # lower
                spec_s = fft_lp_filter(spec_s, bp['lpf_start'], bp['lpf_stop'])
                wave = librosa.resample(
                    spectrogram_to_wave(spec_s, bp['hl'], param['mid_side'], param['mid_side_b2'],
                                        param['reverse']), orig_sr=bp['sr'], target_sr=sr, res_type="sinc_fastest")
            else:  # mid
                spec_s = fft_hp_filter(spec_s, bp['hpf_start'], bp['hpf_stop'] - 1)
                spec_s = fft_lp_filter(spec_s, bp['lpf_start'], bp['lpf_stop'])
                wave2 = np.add(wave,
                               spectrogram_to_wave(spec_s, bp['hl'], param['mid_side'], param['mid_side_b2'],
                                                   param['reverse']))
                wave = librosa.resample(wave2, orig_sr=bp['sr'], target_sr=sr, res_type='scipy')

    return wave.T
def fft_lp_filter(spec, bin_start, bin_stop):
    """Low-pass ``spec`` in place along axis 1 and return it.

    Bins ``bin_start`` .. ``bin_stop - 1`` are scaled by a gain that drops
    linearly by ``1 / (bin_stop - bin_start)`` per bin; bins from
    ``bin_stop`` upwards are multiplied by 0.

    Args:
        spec: Array indexed as ``[channel, bin, frame]``; modified in place.
        bin_start: First bin of the fade.
        bin_stop: First bin that is silenced.

    Returns:
        The same ``spec`` object.
    """
    gain = 1.0
    bin_index = bin_start
    while bin_index < bin_stop:
        gain -= 1 / (bin_stop - bin_start)
        spec[:, bin_index, :] = gain * spec[:, bin_index, :]
        bin_index += 1

    spec[:, bin_stop:, :] *= 0

    return spec
def fft_hp_filter(spec, bin_start, bin_stop):
    """High-pass ``spec`` in place along axis 1 and return it.

    Walking downwards from ``bin_start`` to ``bin_stop + 1``, each bin is
    scaled by a gain that drops linearly by ``1 / (bin_start - bin_stop)``
    per bin; bins ``0`` .. ``bin_stop`` are multiplied by 0.

    Args:
        spec: Array indexed as ``[channel, bin, frame]``; modified in place.
        bin_start: Highest bin of the fade.
        bin_stop: Highest bin that is silenced.

    Returns:
        The same ``spec`` object.
    """
    gain = 1.0
    bin_index = bin_start
    while bin_index > bin_stop:
        gain -= 1 / (bin_start - bin_stop)
        spec[:, bin_index, :] = gain * spec[:, bin_index, :]
        bin_index -= 1

    spec[:, 0:bin_stop + 1, :] *= 0

    return spec
def mirroring(a, spec_m, input_high_end, param):
    """Limit ``input_high_end`` using a mirrored slice of ``spec_m``.

    The slice holds as many bins as ``input_high_end`` (axis 1) and ends 10
    bins below ``param['pre_filter_start']``; its magnitude is flipped along
    axis 1.

    Args:
        a: ``'mirroring'`` (mirror magnitude with the phase of
            ``input_high_end``) or ``'mirroring2'`` (mirror magnitude times
            ``input_high_end * 1.7``).
        spec_m: Combined spectrogram indexed as ``[channel, bin, frame]``.
        input_high_end: High-end bins to limit.
        param: Configuration dict providing ``pre_filter_start``.

    Returns:
        Element-wise, ``input_high_end`` where its magnitude does not exceed
        the candidate's, otherwise the candidate. ``None`` for any other
        value of ``a``.
    """
    if a not in ('mirroring', 'mirroring2'):
        return None

    stop = param['pre_filter_start'] - 10
    start = stop - input_high_end.shape[1]
    mirror = np.flip(np.abs(spec_m[:, start:stop, :]), 1)

    if a == 'mirroring':
        candidate = mirror * np.exp(1.j * np.angle(input_high_end))
    else:
        candidate = np.multiply(mirror, input_high_end * 1.7)

    return np.where(np.abs(input_high_end) <= np.abs(candidate), input_high_end, candidate)
def ensembling(a, specs):
    """Merge several spectrograms element-wise by magnitude.

    The spectrograms are folded left to right; before each step both
    operands are cut to their common length along axis 2. The caller's
    ``specs`` sequence is left untouched.

    Args:
        a: ``'min_mag'`` keeps the smaller-magnitude value, ``'max_mag'`` the
            larger one. Any other value only trims lengths.
        specs: Sequence of arrays indexed as ``[channel, bin, frame]``.

    Returns:
        The merged spectrogram; ``specs[0]`` itself when only one is given.

    Raises:
        ValueError: ``specs`` is empty.
    """
    if len(specs) == 0:
        raise ValueError('specs must contain at least one spectrogram')

    spec = specs[0]
    for i in range(1, len(specs)):
        other = specs[i]
        ln = min([spec.shape[2], other.shape[2]])
        spec = spec[:, :, :ln]
        other = other[:, :, :ln]

        if 'min_mag' == a:
            spec = np.where(np.abs(other) <= np.abs(spec), other, spec)
        if 'max_mag' == a:
            spec = np.where(np.abs(other) >= np.abs(spec), other, spec)

    return spec
def stft(wave, nfft, hl):
    """Compute the STFT of a two-channel waveform.

    Args:
        wave: Sequence whose items ``wave[0]`` and ``wave[1]`` are the two
            channel signals (named left and right here).
        nfft: FFT size forwarded to ``librosa.stft`` as ``n_fft``.
        hl: Hop length forwarded to ``librosa.stft``.

    Returns:
        Fortran-ordered array stacking the left and right spectrograms.
    """
    wave_left = np.asfortranarray(wave[0])
    wave_right = np.asfortranarray(wave[1])
    # Pass n_fft by keyword: recent librosa releases accept only the signal
    # positionally, and the keyword form also works on older releases.
    spec_left = librosa.stft(wave_left, n_fft=nfft, hop_length=hl)
    spec_right = librosa.stft(wave_right, n_fft=nfft, hop_length=hl)
    spec = np.asfortranarray([spec_left, spec_right])
    return spec
def istft(spec, hl):
    """Invert a two-channel spectrogram back to a waveform.

    Args:
        spec: Sequence whose items ``spec[0]`` and ``spec[1]`` are the two
            channel spectrograms (named left and right here).
        hl: Hop length forwarded to ``librosa.istft``.

    Returns:
        Fortran-ordered array stacking the left and right waveforms.
    """
    spec_left = np.asfortranarray(spec[0])
    spec_right = np.asfortranarray(spec[1])
    wave_left = librosa.istft(spec_left, hop_length=hl)
    wave_right = librosa.istft(spec_right, hop_length=hl)
    wave = np.asfortranarray([wave_left, wave_right])
    # The result was previously computed but never returned.
    return wave
def make_padding(width, cropsize, offset):
    """Compute left/right padding and the usable window stride.

    Args:
        width: Length of the axis that will be padded.
        cropsize: Full window length.
        offset: Margin reserved on each side of a window.

    Returns:
        Tuple ``(left, right, roi_size)`` where ``left`` equals ``offset``,
        ``roi_size`` is ``cropsize - 2 * offset`` (or ``cropsize`` when that
        difference is zero) and ``right`` is
        ``roi_size - width % roi_size + left``.
    """
    # A zero-sized region of interest falls back to the whole crop.
    roi_size = (cropsize - offset * 2) or cropsize
    remainder = width % roi_size
    return offset, roi_size - remainder + offset, roi_size

uvronnx/src/uvr.py ADDED Viewed

	@@ -0,0 +1,121 @@

+# -*- coding:utf-8 -*-
+# @FileName  :uvr.py
+# @Time      :2023/8/2 10:47
+# @Author    :lovemefan
+# @Email     :lovemefan@outlook.com
+import os.path
+import numpy as np
+import librosa
+from tqdm import tqdm
+from uvronnx.src.config import UVR_CONFIG
+from uvronnx.src.ortInferSession import UVROrtInferSession
+from uvronnx.src.utils import spec_utils
+from uvronnx.src.utils.AudioHelper import AudioReader
+from uvronnx.src.utils.spec_utils import make_padding
class UVRModel:
    """Instrument / vocal separator driven by an ONNX UVR model.

    The input is split into the bands described by ``UVR_CONFIG['band']``,
    the combined magnitude spectrogram is run through the ONNX session in
    overlapping windows, and the prediction is turned back into two
    waveforms (instrument and vocals).
    """

    def __init__(self, model_path=None):
        """Create the inference session.

        Args:
            model_path: Path to the ONNX model. Defaults to
                ``onnx/uvr-sim.onnx`` under the package directory.
        """
        project_dir = os.path.dirname(os.path.dirname(__file__))
        model_path = model_path or os.path.join(project_dir, 'onnx/uvr-sim.onnx')
        assert os.path.exists(model_path), f"{model_path} is not exist"
        self.model = UVROrtInferSession({
            'model_path': model_path,
            'use_cuda': False
        })
        # Frames cropped from each side of every model output window.
        self.offset = 128
        # Number of frames fed to the model per window.
        self.window_size = 512

    @staticmethod
    def preprocess(x_spec):
        """Split a complex spectrogram into magnitude and phase.

        Args:
            x_spec: Complex-valued array (or array-like).

        Returns:
            Tuple ``(magnitude, phase)`` as given by ``np.abs`` and ``np.angle``.
        """
        return np.abs(x_spec), np.angle(x_spec)

    def separate_offline(self, mixed_audio, sample_rate=44100):
        """Separate a mixed recording into instrument and vocal waveforms.

        Args:
            mixed_audio: Either a path to a wav file (``str``) or an array of
                samples. A 1-D array is duplicated into two channels.
            sample_rate: Sample rate of ``mixed_audio``.

        Returns:
            Tuple ``(wav_instrument, wav_vocals)`` as produced by
            ``spec_utils.cmb_spectrogram_to_wave``.
        """
        if isinstance(mixed_audio, str):
            mixed_audio, sample_rate = AudioReader.read_wav_file(mixed_audio)
        # NOTE(review): sample_rate is not used below; the input appears to be
        # assumed to already be at the highest band's 'sr' - confirm with callers.

        bands = UVR_CONFIG['band']
        bands_n = len(bands)
        x_wave, x_spec_s = {}, {}

        # Walk from the highest band down; each lower band is a resampled
        # copy of the band above it.
        for d in range(bands_n, 0, -1):
            bp = bands[d]
            if d == bands_n:  # high-end band
                x_wave[d] = mixed_audio
                if x_wave[d].ndim == 1:
                    x_wave[d] = np.asfortranarray([x_wave[d], x_wave[d]])
            else:  # lower bands
                x_wave[d] = librosa.core.resample(x_wave[d + 1],
                                                  orig_sr=bands[d + 1]['sr'],
                                                  target_sr=bp['sr'],
                                                  res_type=bp['res_type'])
            # Stft of wave source
            x_spec_s[d] = spec_utils.wave_to_spectrogram_mt(x_wave[d], bp['hl'], bp['n_fft'],
                                                            UVR_CONFIG['mid_side'],
                                                            UVR_CONFIG['mid_side_b2'],
                                                            UVR_CONFIG['reverse'])
            if d == bands_n:
                # Keep the top bins of the highest band so they can be mixed
                # back in when the waveforms are rebuilt.
                input_high_end_h = (bp['n_fft'] // 2 - bp['crop_stop']) + (
                        UVR_CONFIG['pre_filter_stop'] - UVR_CONFIG['pre_filter_start'])
                input_high_end = x_spec_s[d][:, bp['n_fft'] // 2 - input_high_end_h:bp['n_fft'] // 2, :]

        x_spec_m = spec_utils.combine_spectrograms(x_spec_s, UVR_CONFIG)
        x_mag, x_phase = self.preprocess(x_spec_m)

        # Normalise by the peak magnitude; an all-zero (silent) input would
        # otherwise divide by zero and fill the model input with NaNs.
        coef = x_mag.max()
        scale = coef if coef > 0 else 1.0
        x_mag_pre = x_mag / scale

        n_frame = x_mag_pre.shape[2]
        pad_l, pad_r, roi_size = make_padding(n_frame,
                                              self.window_size, self.offset)
        n_window = int(np.ceil(n_frame / roi_size))
        x_mag_pad = np.pad(
            x_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode='constant')

        preds = []
        for i in tqdm(range(n_window)):
            start = i * roi_size
            x_mag_window = x_mag_pad[None, :, :, start:start + self.window_size]
            h = self.model(x_mag_window)
            # Drop the margin on both sides; an explicit end index keeps the
            # slice valid even when the offset is 0.
            pred = h[:, :, :, self.offset:h.shape[3] - self.offset]
            assert pred.shape[3] > 0
            preds.append(pred[0])
        pred = np.concatenate(preds, axis=2)[:, :, :n_frame]

        # Undo the normalisation and re-attach the mixture phase.
        y_spec_m = pred * coef * np.exp(1.j * x_phase)
        v_spec_m = x_spec_m - y_spec_m

        input_high_end_ = spec_utils.mirroring('mirroring', y_spec_m, input_high_end, UVR_CONFIG)
        wav_instrument = spec_utils.cmb_spectrogram_to_wave(y_spec_m, UVR_CONFIG, input_high_end_h,
                                                            input_high_end_)
        print('instruments done')
        input_high_end_ = spec_utils.mirroring('mirroring', v_spec_m, input_high_end, UVR_CONFIG)
        wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, UVR_CONFIG, input_high_end_h, input_high_end_)
        return wav_instrument, wav_vocals
if __name__ == '__main__':
    import sys

    # Take the input wav from the command line; fall back to the original
    # demo path so running the script without arguments behaves as before.
    wav_path = sys.argv[1] if len(sys.argv) > 1 else '/Users/cenglingfan/Downloads/晴天.wav_-4key_fumin.wav'
    model = UVRModel()
    audio, sample_rate = AudioReader.read_wav_file(wav_path)
    instrument, vocal = model.separate_offline(audio, sample_rate)
    print(instrument)
    print(vocal)