upload onnx model files
Browse files- uvronnx/onnx/uvr-sim.onnx +3 -0
- uvronnx/src/__pycache__/config.cpython-38.pyc +0 -0
- uvronnx/src/__pycache__/ortInferSession.cpython-38.pyc +0 -0
- uvronnx/src/config.py +65 -0
- uvronnx/src/ortInferSession.py +90 -0
- uvronnx/src/utils/AudioHelper.py +87 -0
- uvronnx/src/utils/__pycache__/AudioHelper.cpython-38.pyc +0 -0
- uvronnx/src/utils/__pycache__/logger.cpython-38.pyc +0 -0
- uvronnx/src/utils/__pycache__/spec_utils.cpython-38.pyc +0 -0
- uvronnx/src/utils/logger.py +299 -0
- uvronnx/src/utils/spec_utils.py +388 -0
- uvronnx/src/uvr.py +121 -0
uvronnx/onnx/uvr-sim.onnx
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ceeb5f59af63e70ae9ef131844c2cf123b1bbec75f6866e633f7f3efee0bada7
|
3 |
+
size 127044627
|
uvronnx/src/__pycache__/config.cpython-38.pyc
ADDED
Binary file (939 Bytes). View file
|
|
uvronnx/src/__pycache__/ortInferSession.cpython-38.pyc
ADDED
Binary file (3.52 kB). View file
|
|
uvronnx/src/config.py
ADDED
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding:utf-8 -*-
|
2 |
+
# @FileName :config.py
|
3 |
+
# @Time :2023/8/2 10:54
|
4 |
+
# @Author :lovemefan
|
5 |
+
# @Email :lovemefan@outlook.com
|
6 |
+
|
7 |
+
# Multi-band spectrogram settings consumed by the spec_utils helpers.
# Per band: "sr" is the band's sample rate, "hl" the STFT hop length,
# "n_fft" the FFT size, and crop_start/crop_stop the slice of frequency
# bins copied into the combined spectrogram.
UVR_CONFIG = {
    "bins": 672,
    "unstable_bins": 8,
    "reduction_bins": 637,
    "band": {
        1: {
            "sr": 7350,
            "hl": 80,
            "n_fft": 640,
            "crop_start": 0,
            "crop_stop": 85,
            "lpf_start": 25,
            "lpf_stop": 53,
            "res_type": "polyphase",
        },
        2: {
            "sr": 7350,
            "hl": 80,
            "n_fft": 320,
            "crop_start": 4,
            "crop_stop": 87,
            "hpf_start": 25,
            "hpf_stop": 12,
            "lpf_start": 31,
            "lpf_stop": 62,
            "res_type": "polyphase",
        },
        3: {
            "sr": 14700,
            "hl": 160,
            "n_fft": 512,
            "crop_start": 17,
            "crop_stop": 216,
            "hpf_start": 48,
            "hpf_stop": 24,
            "lpf_start": 139,
            "lpf_stop": 210,
            "res_type": "polyphase",
        },
        4: {
            "sr": 44100,
            "hl": 480,
            "n_fft": 960,
            "crop_start": 78,
            "crop_stop": 383,
            "hpf_start": 130,
            "hpf_stop": 86,
            "res_type": "kaiser_fast",
        },
    },
    "sr": 44100,
    "pre_filter_start": 668,
    "pre_filter_stop": 672,
    "mid_side": False,
    "mid_side_b": False,
    "mid_side_b2": False,
    "stereo_w": False,
    "reverse": False,
}
|
uvronnx/src/ortInferSession.py
ADDED
@@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding:utf-8 -*-
|
2 |
+
# @FileName :ortInferSession.py
|
3 |
+
# @Time :2023/8/3 00:20
|
4 |
+
# @Author :lovemefan
|
5 |
+
# @Email :lovemefan@outlook.com
|
6 |
+
|
7 |
+
from pathlib import Path
|
8 |
+
from typing import List, Union
|
9 |
+
from uvronnx.src.utils.logger import logger
|
10 |
+
import numpy as np
|
11 |
+
from onnxruntime import (GraphOptimizationLevel, InferenceSession,
|
12 |
+
SessionOptions, get_available_providers, get_device)
|
13 |
+
|
14 |
+
|
15 |
+
class UVROrtInferSession:
    """Wrapper around an onnxruntime ``InferenceSession`` for the UVR model.

    Args:
        config: Mapping read with the keys ``"model_path"`` and ``"use_cuda"``.
            When CUDA is requested and usable, ``config["CUDAExecutionProvider"]``
            supplies the options passed to that provider.

    Raises:
        FileNotFoundError: ``config["model_path"]`` does not exist.
        FileExistsError: ``config["model_path"]`` exists but is not a file.
    """

    def __init__(self, config):
        sess_opt = SessionOptions()
        sess_opt.log_severity_level = 4
        sess_opt.enable_cpu_mem_arena = False
        sess_opt.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL

        cuda_ep = "CUDAExecutionProvider"
        cpu_ep = "CPUExecutionProvider"
        cpu_provider_options = {
            "arena_extend_strategy": "kSameAsRequested",
        }

        # CUDA goes first only when it is requested, the build targets a GPU
        # and the provider is actually registered; CPU is always the fallback.
        EP_list = []
        if (
            config["use_cuda"]
            and get_device() == "GPU"
            and cuda_ep in get_available_providers()
        ):
            EP_list.append((cuda_ep, config[cuda_ep]))
        EP_list.append((cpu_ep, cpu_provider_options))

        self._verify_model(config["model_path"])
        logger.info(f"Loading onnx model at {str(config['model_path'])}")
        self.session = InferenceSession(
            str(config["model_path"]), sess_options=sess_opt, providers=EP_list
        )

        if config["use_cuda"] and cuda_ep not in self.session.get_providers():
            # The message is passed alone: an extra positional argument would
            # be treated as a %-format argument by the logging module and the
            # record would fail to format.
            logger.warning(
                f"{cuda_ep} is not available for current env, "
                f"the inference part is automatically shifted to be "
                f"executed under {cpu_ep}.\n "
                "Please ensure the installed onnxruntime-gpu version"
                " matches your cuda and cudnn version, "
                "you can check their relations from the offical web site: "
                "https://onnxruntime.ai/docs/execution-providers/CUDA-ExecutionProvider.html"
            )

    def __call__(self, input_chunk: np.ndarray) -> np.ndarray:
        """Run the model on ``input_chunk`` (fed as ``"input"``); return the first output."""
        return self.session.run(None, {"input": input_chunk})[0]

    def get_input_names(self) -> List[str]:
        """Return the names of the session's inputs."""
        return [v.name for v in self.session.get_inputs()]

    def get_output_names(self) -> List[str]:
        """Return the names of the session's outputs."""
        return [v.name for v in self.session.get_outputs()]

    def get_character_list(self, key: str = "character"):
        """Return the metadata entry ``key`` split into lines.

        The metadata map is loaded on first use, so this no longer depends on
        ``have_key`` having been called beforehand.

        Raises:
            KeyError: ``key`` is not present in the model metadata.
        """
        if getattr(self, "meta_dict", None) is None:
            self.meta_dict = self.session.get_modelmeta().custom_metadata_map
        return self.meta_dict[key].splitlines()

    def have_key(self, key: str = "character") -> bool:
        """Refresh the cached metadata map and report whether ``key`` is in it."""
        self.meta_dict = self.session.get_modelmeta().custom_metadata_map
        return key in self.meta_dict

    @staticmethod
    def _verify_model(model_path):
        """Raise unless ``model_path`` points at an existing regular file."""
        model_path = Path(model_path)
        if not model_path.exists():
            raise FileNotFoundError(f"{model_path} does not exists.")
        if not model_path.is_file():
            raise FileExistsError(f"{model_path} is not a file.")
|
uvronnx/src/utils/AudioHelper.py
ADDED
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding:utf-8 -*-
|
2 |
+
# @FileName :AudioHelper.py
|
3 |
+
# @Time :2023/8/3 00:34
|
4 |
+
# @Author :lovemefan
|
5 |
+
# @Email :lovemefan@outlook.com
|
6 |
+
|
7 |
+
import array
|
8 |
+
import math
|
9 |
+
import struct
|
10 |
+
import numpy as np
|
11 |
+
from numpy import where
|
12 |
+
|
13 |
+
|
14 |
+
class AudioReader:
|
15 |
+
"""
|
16 |
+
|
17 |
+
read audio from sanic request
|
18 |
+
"""
|
19 |
+
|
20 |
+
def __init__(self):
|
21 |
+
pass
|
22 |
+
|
23 |
+
@staticmethod
|
24 |
+
def get_info(self, path: str):
|
25 |
+
with open(path, "rb") as f:
|
26 |
+
(
|
27 |
+
name,
|
28 |
+
data_lengths,
|
29 |
+
_,
|
30 |
+
_,
|
31 |
+
_,
|
32 |
+
_,
|
33 |
+
channels,
|
34 |
+
sample_rate,
|
35 |
+
bit_rate,
|
36 |
+
block_length,
|
37 |
+
sample_bit,
|
38 |
+
_,
|
39 |
+
pcm_length,
|
40 |
+
) = struct.unpack_from("<4sL4s4sLHHLLHH4sL", f.read(44))
|
41 |
+
assert sample_rate == 16000, "sample rate must be 16000"
|
42 |
+
nframes = pcm_length // (channels * 2)
|
43 |
+
return nframes
|
44 |
+
|
45 |
+
@staticmethod
|
46 |
+
def read_wav_bytes(data: bytes):
|
47 |
+
"""
|
48 |
+
convert bytes into array of pcm_s16le data
|
49 |
+
:param data: PCM format bytes
|
50 |
+
:return:
|
51 |
+
"""
|
52 |
+
|
53 |
+
# header of wav file
|
54 |
+
info = data[:44]
|
55 |
+
frames = data[44:]
|
56 |
+
(
|
57 |
+
name,
|
58 |
+
data_lengths,
|
59 |
+
_,
|
60 |
+
_,
|
61 |
+
_,
|
62 |
+
_,
|
63 |
+
channels,
|
64 |
+
sample_rate,
|
65 |
+
bit_rate,
|
66 |
+
block_length,
|
67 |
+
sample_bit,
|
68 |
+
_,
|
69 |
+
pcm_length,
|
70 |
+
) = struct.unpack_from("<4sL4s4sLHHLLHH4sL", info)
|
71 |
+
# shortArray each element is 16bit
|
72 |
+
data = AudioReader.read_pcm_byte(frames)
|
73 |
+
return data, sample_rate
|
74 |
+
|
75 |
+
@staticmethod
|
76 |
+
def read_wav_file(audio_path: str):
|
77 |
+
with open(audio_path, "rb") as f:
|
78 |
+
data = f.read()
|
79 |
+
return AudioReader.read_wav_bytes(data)
|
80 |
+
|
81 |
+
@staticmethod
|
82 |
+
def read_pcm_byte(data: bytes):
|
83 |
+
short_array = array.array("h")
|
84 |
+
short_array.frombytes(data)
|
85 |
+
data = np.array(short_array, dtype="float16") / (1 << 15)
|
86 |
+
return data
|
87 |
+
|
uvronnx/src/utils/__pycache__/AudioHelper.cpython-38.pyc
ADDED
Binary file (2.09 kB). View file
|
|
uvronnx/src/utils/__pycache__/logger.cpython-38.pyc
ADDED
Binary file (9.73 kB). View file
|
|
uvronnx/src/utils/__pycache__/spec_utils.cpython-38.pyc
ADDED
Binary file (10.3 kB). View file
|
|
uvronnx/src/utils/logger.py
ADDED
@@ -0,0 +1,299 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding:utf-8 -*-
|
2 |
+
# @FileName :logger.py
|
3 |
+
# @Time :2023/8/1 10:44
|
4 |
+
# @Author :lovemefan
|
5 |
+
# @Email :lovemefan@outlook.com
|
6 |
+
"""LOGGER Module"""
|
7 |
+
import logging
|
8 |
+
import logging.config
|
9 |
+
import logging.handlers
|
10 |
+
import os
|
11 |
+
import sys
|
12 |
+
|
13 |
+
from functools import wraps
|
14 |
+
from typing import Dict, List, Tuple, Union
|
15 |
+
|
16 |
+
|
17 |
+
logger_list = []
|
18 |
+
LEVEL = ('DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL')
|
19 |
+
_LOG_FILE_DIR = '~/.cache/speech-webui/'
|
20 |
+
LOCAL_DEFAULT_LOG_FILE_DIR = os.path.join(
|
21 |
+
os.getenv("LOCAL_DEFAULT_PATH", _LOG_FILE_DIR), 'log')
|
22 |
+
|
23 |
+
DEFAULT_FILEHANDLER_FORMAT = '[%(levelname)s] %(asctime)s ' \
|
24 |
+
'[%(pathname)s:%(lineno)d] %(funcName)s: %(message)s'
|
25 |
+
DEFAULT_STDOUT_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
|
26 |
+
DEFAULT_REDIRECT_FILE_NAME = 'mindspore.log'
|
27 |
+
|
28 |
+
|
29 |
+
class StreamRedirector:
    """Temporarily point one stream's file descriptor at another stream.

    Usable as a decorator (``@StreamRedirector(src, dst)``) or as a context
    manager. Redirection is done with ``os.dup2``.
    """

    def __init__(self, source_stream, target_stream):
        """Redirects the source stream to the target stream.

        Args:
            source_stream: Source stream (must expose ``fileno()``).
            target_stream: Target stream (must expose ``fileno()``).
        """
        super().__init__()

        self.source_stream = source_stream
        self.target_stream = target_stream

        # Duplicate of the original descriptor; stop() uses it to undo the
        # redirection.
        self.save_source_stream_fd = os.dup(self.source_stream.fileno())

    def __call__(self, func):
        """Decorate ``func`` so it runs with the redirection active."""

        @wraps(func)
        def wrapper(*args, **kwargs):
            self.start()
            try:
                # Propagate the wrapped function's result to the caller.
                return func(*args, **kwargs)
            finally:
                # Always restore the descriptor, even when func raises.
                self.stop()

        return wrapper

    def __enter__(self):
        self.start()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.stop()

    def start(self):
        """Flush the source and make its descriptor refer to the target."""
        self.source_stream.flush()
        os.dup2(self.target_stream.fileno(), self.source_stream.fileno())

    def stop(self):
        """Flush pending output and restore the source's original descriptor."""
        self.source_stream.flush()
        os.dup2(self.save_source_stream_fd, self.source_stream.fileno())
        self.target_stream.flush()
|
72 |
+
|
73 |
+
|
74 |
+
def validate_nodes_devices_input(var_name: str, var):
    """Validate a nodes/devices argument.

    ``None`` and dicts are accepted as-is; lists and tuples are accepted only
    when every element is an ``int``.

    Args:
        var_name (str): Name used in the error message.
        var: Value to validate.

    Raises:
        TypeError: ``var`` has an unsupported type or a list/tuple element is
            not an int.
    """
    if var is None:
        return
    if not isinstance(var, (list, tuple, dict)):
        raise TypeError(
            'The value of {} can be None or a value of type tuple, list, or dict.'.format(var_name)
        )
    if isinstance(var, dict):
        return
    if any(not isinstance(element, int) for element in var):
        raise TypeError('The elements of a variable of type list or tuple must be of type int.')
|
90 |
+
|
91 |
+
|
92 |
+
def validate_level(var_name: str, var):
    """Check that ``var`` is one of the level names listed in ``LEVEL``.

    Args:
        var_name (str): Name used in the error message.
        var: Value to validate.

    Raises:
        TypeError: ``var`` is not a string.
        ValueError: ``var`` is a string that is not in ``LEVEL``.
    """
    if isinstance(var, str):
        if var in LEVEL:
            return
        raise ValueError('{}={} needs to be in {}'.format(var_name, var, LEVEL))
    raise TypeError('The format of {} must be of type str.'.format(var_name))
|
106 |
+
|
107 |
+
|
108 |
+
def validate_std_input_format(to_std: bool, stdout_nodes: Union[List, Tuple, None],
                              stdout_devices: Union[List, Tuple, None], stdout_level: str):
    """Validate the stdout-related inputs of ``get_logger``.

    Raises:
        TypeError: ``to_std`` is not a bool, or one of the delegated checks
            rejects the type of its argument.
        ValueError: ``stdout_level`` is not a known level name.
    """
    if not isinstance(to_std, bool):
        raise TypeError('The format of the to_std must be of type bool.')

    checks = (('stdout_nodes', stdout_nodes), ('stdout_devices', stdout_devices))
    for label, value in checks:
        validate_nodes_devices_input(label, value)
    validate_level('stdout_level', stdout_level)
|
118 |
+
|
119 |
+
|
120 |
+
def validate_file_input_format(file_level: Union[List[str], Tuple[str]], file_save_dir: str, append_rank_dir: bool,
                               file_name: Union[List[str], Tuple[str]]):
    """Validate the file-related inputs of ``get_logger``.

    Args:
        file_level: Level names, one per log file.
        file_save_dir: Directory for the log files.
        append_rank_dir: Must be a bool.
        file_name: Log file names, one per entry of ``file_level``.

    Raises:
        TypeError: an argument (or an element of ``file_name``) has the wrong
            type, or a level is not a string.
        ValueError: a level name is unknown, or ``file_level`` and
            ``file_name`` differ in length.
    """
    if not isinstance(file_level, (tuple, list)):
        raise TypeError('The value of file_level should be list or a tuple.')
    for level in file_level:
        validate_level('level in file_level', level)

    # file_name is type-checked before its length is used, so a wrong type is
    # reported as such instead of surfacing from len().
    if not isinstance(file_name, (tuple, list)):
        raise TypeError('The value of file_name should be list or a tuple.')
    for name in file_name:
        if not isinstance(name, str):
            raise TypeError('The value of name in file_name should be a value of type str.')

    if len(file_level) != len(file_name):
        raise ValueError('The length of file_level and file_name should be equal.')

    if not isinstance(file_save_dir, str):
        raise TypeError('The value of file_save_dir should be a value of type str.')

    if not isinstance(append_rank_dir, bool):
        raise TypeError('The value of append_rank_dir should be a value of type bool.')
|
143 |
+
|
144 |
+
|
145 |
+
def _convert_level(level: str) -> int:
    """Translate a level name into the matching ``logging`` constant.

    Args:
        level (str): One of DEBUG, INFO, WARNING, ERROR, CRITICAL.

    Returns:
        int: The numeric logging level; ``logging.INFO`` for any other name.
    """
    names = ('DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL')
    table = {name: getattr(logging, name) for name in names}
    try:
        return table[level]
    except KeyError:
        return logging.INFO
|
164 |
+
|
165 |
+
|
166 |
+
def get_logger(logger_name: str = 'uvr-onnx', **kwargs) -> logging.Logger:
    """Return a logger with a stdout handler and rotating file handlers.

    A logger name is configured only once; later calls with the same name
    return the existing logger unchanged.

    Args:
        logger_name (str): Logger name.
        kwargs (dict): Other input.
            to_std (bool): If set to True, output the log to stdout.
            stdout_nodes / stdout_devices: validated, not otherwise used here.
                ``stdout_devices`` is taken from the STDOUT_DEVICES environment
                variable when that is set.
            stdout_level (str): DEBUG, INFO, WARNING, ERROR or CRITICAL.
            stdout_format (str): Log format for stdout.
            file_level (list[str] or tuple[str]): Level of each log file; must
                have the same length as file_name.
            file_save_dir (str): Folder for the log files. A leading ``~`` is
                expanded to the user's home directory.
            append_rank_dir (bool): validated, not otherwise used here.
            file_name (list[str] or tuple[str]): Output file names.
            max_file_size (int): Maximum size of a single log file. Unit: MB.
            max_num_of_files (int): Number of rotated backups to keep.

    Returns:
        logger (logging.Logger): Logger.
    """
    mf_logger = logging.getLogger(logger_name)
    if logger_name in logger_list:
        # Already configured: adding handlers again would duplicate output.
        return mf_logger

    to_std = kwargs.get('to_std', True)
    stdout_nodes = kwargs.get('stdout_nodes', None)

    env_devices = os.getenv("STDOUT_DEVICES")
    if env_devices:
        # Accept "1,2", "(1,2)" or "[1,2]".
        if env_devices.startswith(("(", "[")) and env_devices.endswith((")", "]")):
            env_devices = env_devices[1:-1]
        stdout_devices = tuple(int(item.strip()) for item in env_devices.split(","))
    else:
        stdout_devices = kwargs.get('stdout_devices', None)

    stdout_level = kwargs.get('stdout_level', 'INFO')
    stdout_format = kwargs.get('stdout_format', '')
    file_level = kwargs.get('file_level', ('INFO', 'ERROR'))
    file_save_dir = kwargs.get('file_save_dir', '')
    append_rank_dir = kwargs.get('append_rank_dir', True)
    file_name = kwargs.get('file_name', ('info.log', 'error.log'))
    max_file_size = kwargs.get('max_file_size', 50)
    max_num_of_files = kwargs.get('max_num_of_files', 5)

    validate_std_input_format(to_std, stdout_nodes, stdout_devices, stdout_level)
    validate_file_input_format(file_level, file_save_dir, append_rank_dir, file_name)

    handler_levels = []

    if to_std:
        stream_level = _convert_level(stdout_level)
        stream_handler = logging.StreamHandler(sys.stdout)
        stream_handler.setLevel(stream_level)
        stream_handler.setFormatter(logging.Formatter(stdout_format or DEFAULT_STDOUT_FORMAT))
        mf_logger.addHandler(stream_handler)
        handler_levels.append(stream_level)

    # Without expanduser, a default such as "~/.cache/..." would create a
    # literal "~" directory under the current working directory.
    file_save_dir = os.path.expanduser(file_save_dir or LOCAL_DEFAULT_LOG_FILE_DIR)

    max_bytes = max_file_size * 1024 * 1024
    file_formatter = logging.Formatter(DEFAULT_FILEHANDLER_FORMAT)
    for level_name, name in zip(file_level, file_name):
        path = os.path.realpath(os.path.join(file_save_dir, name))
        os.makedirs(os.path.dirname(path), exist_ok=True)

        file_handler_level = _convert_level(level_name)
        file_handler = logging.handlers.RotatingFileHandler(filename=path,
                                                            maxBytes=max_bytes,
                                                            backupCount=max_num_of_files)
        file_handler.setLevel(file_handler_level)
        file_handler.setFormatter(file_formatter)
        mf_logger.addHandler(file_handler)
        handler_levels.append(file_handler_level)

    # INFO unless a handler asked for something more verbose; a fixed INFO
    # would silently drop records meant for a DEBUG handler.
    mf_logger.setLevel(min([logging.INFO] + handler_levels))

    mf_logger.propagate = False

    logger_list.append(logger_name)

    return mf_logger
|
263 |
+
|
264 |
+
|
265 |
+
class _LogActionOnce:
    """
    Decorator that lets a function's ``logger.warning`` calls through only once
    per key; on later calls (or when ``no_warning`` is set) ``warning`` is
    replaced by a no-op while the function runs.

    Args:
        m_logger: Logger-like object; used only if it has a ``warning`` attribute.
        key: Identifier shared by all decorated functions that should log once.
        no_warning (bool): Suppress the warning on every call.

    """
    # Keys that have already been allowed to log; shared by all instances.
    is_logged = dict()

    def __init__(self, m_logger, key, no_warning=False):
        self.logger = m_logger
        self.key = key
        self.no_warning = no_warning

    def __call__(self, func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            if not hasattr(self.logger, 'warning'):
                return func(*args, **kwargs)

            old_func = self.logger.warning
            if self.no_warning or self.key in _LogActionOnce.is_logged:
                # Accept any call shape, e.g. warning("x=%s", x, exc_info=True).
                self.logger.warning = lambda *_args, **_kwargs: None
            else:
                _LogActionOnce.is_logged[self.key] = True
            try:
                return func(*args, **kwargs)
            finally:
                # Restore even when func raises, so warnings are not left muted.
                if hasattr(self.logger, 'warning'):
                    self.logger.warning = old_func

        return wrapper
|
297 |
+
|
298 |
+
|
299 |
+
logger = get_logger()
|
uvronnx/src/utils/spec_utils.py
ADDED
@@ -0,0 +1,388 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding:utf-8 -*-
|
2 |
+
# @FileName :spec_utils.py
|
3 |
+
# @Time :2023/8/2 17:16
|
4 |
+
# @Author :lovemefan
|
5 |
+
# @Email :lovemefan@outlook.com
|
6 |
+
import os, librosa
|
7 |
+
import numpy as np
|
8 |
+
import json, math, hashlib
|
9 |
+
|
10 |
+
|
11 |
+
def crop_center(h1, h2):
    """Crop ``h1`` along its 4th axis, centred, to the length of ``h2``'s 4th axis.

    Both arguments must provide ``size()`` and 4-D slicing. ``h1`` is returned
    unchanged when the lengths already match.

    Raises:
        ValueError: ``h1`` is shorter than ``h2`` along that axis.
    """
    width = h1.size()[3]
    target = h2.size()[3]

    if width == target:
        return h1
    if width < target:
        raise ValueError('h1_shape[3] must be greater than h2_shape[3]')

    start = (width - target) // 2
    return h1[:, :, :, start:start + target]
|
25 |
+
|
26 |
+
|
27 |
+
def _stereo_channels(wave, mid_side=False, mid_side_b2=False, reverse=False):
    """Return the (left, right) signals to transform for the selected mode.

    The flags are checked in the order ``reverse``, ``mid_side``,
    ``mid_side_b2``; the first one that is truthy wins.
    """
    if reverse:
        return np.flip(np.asfortranarray(wave[0])), np.flip(np.asfortranarray(wave[1]))
    if mid_side:
        return (np.asfortranarray(np.add(wave[0], wave[1]) / 2),
                np.asfortranarray(np.subtract(wave[0], wave[1])))
    if mid_side_b2:
        return (np.asfortranarray(np.add(wave[1], wave[0] * .5)),
                np.asfortranarray(np.subtract(wave[0], wave[1] * .5)))
    return np.asfortranarray(wave[0]), np.asfortranarray(wave[1])


def wave_to_spectrogram(wave, hop_length, n_fft, mid_side=False, mid_side_b2=False, reverse=False):
    """Compute the STFT of both channels of ``wave`` and stack them.

    Args:
        wave: indexable with ``wave[0]`` and ``wave[1]`` as the two channels.
        hop_length: hop length passed to ``librosa.stft``.
        n_fft: FFT size passed to ``librosa.stft``.
        mid_side, mid_side_b2, reverse: channel pre-processing mode.

    Returns:
        Fortran-ordered array stacking the left and right spectrograms.
    """
    wave_left, wave_right = _stereo_channels(wave, mid_side, mid_side_b2, reverse)

    # n_fft is passed by keyword: it is keyword-only in current librosa.
    spec_left = librosa.stft(wave_left, n_fft=n_fft, hop_length=hop_length)
    spec_right = librosa.stft(wave_right, n_fft=n_fft, hop_length=hop_length)

    return np.asfortranarray([spec_left, spec_right])


def wave_to_spectrogram_mt(wave, hop_length, n_fft, mid_side=False, mid_side_b2=False, reverse=False):
    """Same result as ``wave_to_spectrogram``; the left channel runs in a thread.

    The worker's result is returned through a local container rather than a
    module-level global, so concurrent calls cannot overwrite each other, and
    an exception raised in the worker is re-raised in the caller.
    """
    import threading

    wave_left, wave_right = _stereo_channels(wave, mid_side, mid_side_b2, reverse)

    outcome = {}

    def run_thread(**kwargs):
        try:
            outcome['spec'] = librosa.stft(**kwargs)
        except Exception as exc:  # handed back to the calling thread below
            outcome['error'] = exc

    thread = threading.Thread(target=run_thread, kwargs={'y': wave_left, 'n_fft': n_fft, 'hop_length': hop_length})
    thread.start()
    spec_right = librosa.stft(wave_right, n_fft=n_fft, hop_length=hop_length)
    thread.join()

    if 'error' in outcome:
        raise outcome['error']

    return np.asfortranarray([outcome['spec'], spec_right])
|
77 |
+
|
78 |
+
|
79 |
+
def combine_spectrograms(specs, param):
    """Stack the cropped frequency ranges of every band into one spectrogram.

    Args:
        specs: mapping of band number (1..N) to an array indexed as
            ``[channel, bin, frame]``.
        param: mapping with ``'bins'``, ``'band'`` (per-band ``crop_start`` /
            ``crop_stop``), ``'pre_filter_start'`` and ``'pre_filter_stop'``.

    Returns:
        Fortran-ordered complex64 array of shape ``(2, bins + 1, frames)``,
        where ``frames`` is the shortest frame count among ``specs``.

    Raises:
        ValueError: the cropped ranges add up to more than ``param['bins']``.
    """
    frames = min([specs[key].shape[2] for key in specs])
    combined = np.zeros(shape=(2, param['bins'] + 1, frames), dtype=np.complex64)
    bands = param['band']
    band_count = len(bands)

    row = 0
    for band_no in range(1, band_count + 1):
        lo = bands[band_no]['crop_start']
        hi = bands[band_no]['crop_stop']
        height = hi - lo
        combined[:, row:row + height, :frames] = specs[band_no][:, lo:hi, :frames]
        row += height

    if row > param['bins']:
        raise ValueError('Too much bins')

    # lowpass filter
    filter_start = param['pre_filter_start']
    if filter_start > 0:
        if band_count == 1:
            combined = fft_lp_filter(combined, filter_start, param['pre_filter_stop'])
        else:
            # Each bin's gain depends on the gain applied to the previous bin.
            previous_gain = 1
            for bin_no in range(filter_start + 1, param['pre_filter_stop']):
                gain = math.pow(10, -(bin_no - filter_start) * (3.5 - previous_gain) / 20.0)
                previous_gain = gain
                combined[:, bin_no, :] *= gain

    return np.asfortranarray(combined)
|
106 |
+
|
107 |
+
|
108 |
+
def spectrogram_to_image(spec, mode='magnitude'):
    """Render a spectrogram as a uint8 image array.

    Args:
        spec: real or complex array. It is never modified.
        mode: ``'magnitude'`` (log10 of the squared magnitude) or ``'phase'``
            (angle of complex input; real input is used as-is).

    Returns:
        uint8 array scaled to 0..255. For 3-D input the result is transposed
        to put the first axis last, and the per-pixel maximum over that axis
        is prepended as an extra plane.

    Raises:
        ValueError: ``mode`` is not one of the two supported values.
    """
    if mode == 'magnitude':
        y = np.abs(spec) if np.iscomplexobj(spec) else spec
        y = np.log10(y ** 2 + 1e-8)
    elif mode == 'phase':
        if np.iscomplexobj(spec):
            y = np.angle(spec)
        else:
            # Work on a copy: the scaling below is done in place and must not
            # touch the caller's array.
            y = np.array(spec)
            if not np.issubdtype(y.dtype, np.floating):
                y = y.astype(np.float64)
    else:
        raise ValueError("mode must be 'magnitude' or 'phase', got {!r}".format(mode))

    y -= y.min()
    peak = y.max()
    if peak > 0:
        # A constant input has peak == 0; skipping the scale avoids 255 / 0.
        y *= 255 / peak
    img = np.uint8(y)

    if y.ndim == 3:
        img = img.transpose(1, 2, 0)
        img = np.concatenate([
            np.max(img, axis=2, keepdims=True), img
        ], axis=2)

    return img
|
132 |
+
|
133 |
+
|
134 |
+
def reduce_vocal_aggressively(X, y, softmask):
    """Attenuate ``y`` wherever the residual ``X - y`` has the larger magnitude.

    In those positions the magnitude of ``y`` is reduced by
    ``softmask * |X - y|`` (clipped at zero); elsewhere it is kept. The phase
    of ``y`` is preserved.
    """
    residual_mag = np.abs(X - y)
    kept_mag = np.abs(y)

    residual_wins = residual_mag > kept_mag
    reduced_mag = np.clip(kept_mag - residual_mag * residual_wins * softmask, 0, np.inf)

    return reduced_mag * np.exp(1.j * np.angle(y))
|
143 |
+
|
144 |
+
|
145 |
+
def mask_silence(mag, ref, thres=0.2, min_range=64, fade_size=32):
    """Add ``ref`` into ``mag`` over long runs of frames where ``ref`` is quiet.

    A frame is quiet when the mean of ``ref`` over its first two axes is below
    ``thres``. Runs of consecutive quiet frames longer than ``min_range`` get
    ``ref`` added to ``mag``, with a linear fade of ``fade_size`` frames at
    each end of the run.

    Args:
        mag, ref: arrays indexed as ``[a, b, frame]``; ``mag`` is not modified.
        thres: quietness threshold on the per-frame mean of ``ref``.
        min_range: minimum run length (in frames) to process.
        fade_size: number of frames in each fade.

    Returns:
        A modified copy of ``mag``.

    Raises:
        ValueError: ``min_range`` is smaller than ``fade_size * 2``.
    """
    if min_range < fade_size * 2:
        raise ValueError('min_range must be >= fade_size * 2')

    mag = mag.copy()

    idx = np.where(ref.mean(axis=(0, 1)) < thres)[0]
    if idx.size == 0:
        # No quiet frame at all: nothing to blend (idx[0] below would fail).
        return mag

    # Split the quiet frame indices into runs of consecutive frames.
    breaks = np.where(np.diff(idx) != 1)[0]
    starts = np.insert(idx[breaks + 1], 0, idx[0])
    ends = np.append(idx[breaks], idx[-1])
    uninformative = np.where(ends - starts > min_range)[0]
    if len(uninformative) > 0:
        starts = starts[uninformative]
        ends = ends[uninformative]
        old_e = None
        for s, e in zip(starts, ends):
            if old_e is not None and s - old_e < fade_size:
                s = old_e - fade_size * 2

            if s != 0:
                weight = np.linspace(0, 1, fade_size)
                mag[:, :, s:s + fade_size] += weight * ref[:, :, s:s + fade_size]
            else:
                s -= fade_size

            if e != mag.shape[2]:
                weight = np.linspace(1, 0, fade_size)
                mag[:, :, e - fade_size:e] += weight * ref[:, :, e - fade_size:e]
            else:
                e += fade_size

            mag[:, :, s + fade_size:e - fade_size] += ref[:, :, s + fade_size:e - fade_size]
            old_e = e

    return mag
|
179 |
+
|
180 |
+
|
181 |
+
def align_wave_head_and_tail(a, b):
    """Trim two 2-D waves to the same number of samples.

    Both inputs are indexed as ``[channel, sample]``. Only the sample axis is
    trimmed; every channel is kept (slicing the channel axis by the sample
    count would drop a channel for very short inputs).

    Returns:
        ``(a, b)`` truncated to the shorter of the two sample counts.
    """
    length = min(a[0].size, b[0].size)

    return a[:, :length], b[:, :length]
|
185 |
+
|
186 |
+
|
187 |
+
def cache_or_load(mix_path, inst_path, mp):
    """Return combined spectrograms for a mix/instrumental pair, using a disk cache.

    The cache lives under ``cache/mph<sha1 of mp.param>/`` and is keyed by the
    base name of each input file. When both cached arrays exist they are
    loaded; otherwise both files are decoded, split into the configured bands,
    transformed, combined and saved.

    Args:
        mix_path: path of the mixture audio file.
        inst_path: path of the instrumental audio file.
        mp: object whose ``param`` attribute holds the band configuration.

    Returns:
        ``(X_spec_m, y_spec_m)``: combined spectrograms of mix and instrumental.

    Raises:
        ValueError: the two combined spectrograms differ in shape.
    """
    param = mp.param
    mix_basename = os.path.splitext(os.path.basename(mix_path))[0]
    inst_basename = os.path.splitext(os.path.basename(inst_path))[0]

    cache_dir = 'mph{}'.format(hashlib.sha1(json.dumps(param, sort_keys=True).encode('utf-8')).hexdigest())
    # NOTE(review): mix and inst share one directory, so two inputs with the
    # same base name map to the same cache file - confirm this cannot happen.
    mix_cache_dir = os.path.join('cache', cache_dir)
    inst_cache_dir = os.path.join('cache', cache_dir)

    os.makedirs(mix_cache_dir, exist_ok=True)
    os.makedirs(inst_cache_dir, exist_ok=True)

    mix_cache_path = os.path.join(mix_cache_dir, mix_basename + '.npy')
    inst_cache_path = os.path.join(inst_cache_dir, inst_basename + '.npy')

    if os.path.exists(mix_cache_path) and os.path.exists(inst_cache_path):
        return np.load(mix_cache_path), np.load(inst_cache_path)

    X_wave, y_wave, X_spec_s, y_spec_s = {}, {}, {}, {}
    bands_n = len(param['band'])

    # Highest band first: lower bands are resampled from the band above.
    for d in range(bands_n, 0, -1):
        bp = param['band'][d]

        if d == bands_n:  # high-end band
            # sr/mono are passed by keyword: they are keyword-only in
            # current librosa.
            X_wave[d], _ = librosa.load(
                mix_path, sr=bp['sr'], mono=False, dtype=np.float32, res_type=bp['res_type'])
            y_wave[d], _ = librosa.load(
                inst_path, sr=bp['sr'], mono=False, dtype=np.float32, res_type=bp['res_type'])
        else:  # lower bands
            upper_sr = param['band'][d + 1]['sr']
            X_wave[d] = librosa.resample(X_wave[d + 1], orig_sr=upper_sr, target_sr=bp['sr'],
                                         res_type=bp['res_type'])
            y_wave[d] = librosa.resample(y_wave[d + 1], orig_sr=upper_sr, target_sr=bp['sr'],
                                         res_type=bp['res_type'])

        X_wave[d], y_wave[d] = align_wave_head_and_tail(X_wave[d], y_wave[d])

        X_spec_s[d] = wave_to_spectrogram(X_wave[d], bp['hl'], bp['n_fft'], param['mid_side'],
                                          param['mid_side_b2'], param['reverse'])
        y_spec_s[d] = wave_to_spectrogram(y_wave[d], bp['hl'], bp['n_fft'], param['mid_side'],
                                          param['mid_side_b2'], param['reverse'])

    del X_wave, y_wave

    # combine_spectrograms indexes its second argument as a dict
    # (param['bins'], param['band']), so it needs mp.param rather than mp.
    X_spec_m = combine_spectrograms(X_spec_s, param)
    y_spec_m = combine_spectrograms(y_spec_s, param)

    if X_spec_m.shape != y_spec_m.shape:
        raise ValueError('The combined spectrograms are different: ' + mix_path)

    np.save(mix_cache_path, X_spec_m)
    np.save(inst_cache_path, y_spec_m)

    return X_spec_m, y_spec_m
|
242 |
+
|
243 |
+
|
244 |
+
def spectrogram_to_wave(spec, hop_length, mid_side, mid_side_b2, reverse):
    """Invert a two-channel spectrogram and decode the channel layout.

    Args:
        spec: Indexable whose items 0 and 1 are the two spectrograms to
            invert with ``librosa.istft``.
        hop_length: Hop length forwarded to ``librosa.istft``.
        mid_side: If truthy (and ``reverse`` is falsy), combine the two
            inverted signals as ``first + second / 2`` and
            ``first - second / 2``.
        mid_side_b2: If truthy (and the two flags above are falsy), combine
            them as ``second / 1.25 + 0.4 * first`` and
            ``first / 1.25 - 0.4 * second``.
        reverse: If truthy, return both inverted signals flipped.

    Returns:
        Fortran-ordered array stacking the two resulting signals.
    """
    first, second = (
        librosa.istft(np.asfortranarray(channel), hop_length=hop_length)
        for channel in (spec[0], spec[1])
    )

    # The flags are checked in a fixed priority: reverse, mid_side, mid_side_b2.
    if reverse:
        channels = [np.flip(first), np.flip(second)]
    elif mid_side:
        channels = [np.add(first, second / 2), np.subtract(first, second / 2)]
    elif mid_side_b2:
        channels = [
            np.add(second / 1.25, .4 * first),
            np.subtract(first / 1.25, .4 * second),
        ]
    else:
        channels = [first, second]

    return np.asfortranarray(channels)
|
260 |
+
|
261 |
+
|
262 |
+
def cmb_spectrogram_to_wave(spec_m, param, extra_bins_h=None, extra_bins=None):
    """Rebuild a waveform from a combined multi-band spectrogram.

    The combined spectrogram is split back into one spectrogram per band
    (bands are numbered from 1 and processed in increasing order). Each band
    is filtered, inverted with ``spectrogram_to_wave`` and, except for the
    last band, resampled to the next band's sample rate before the next
    band's signal is added to it.

    Args:
        spec_m: Combined spectrogram; axis 1 holds the cropped bins of every
            band back to back and axis 2 is the frame axis.
        param: Model parameters. Reads ``param['band'][d]`` (keys ``n_fft``,
            ``crop_start``, ``crop_stop``, ``hl``, ``sr`` and the
            ``hpf_*`` / ``lpf_*`` keys used for that band) plus
            ``mid_side``, ``mid_side_b2`` and ``reverse``.
        extra_bins_h: Optional number of bins to copy from ``extra_bins``
            into the bins just below ``n_fft // 2`` of the last band.
        extra_bins: Spectrogram supplying those bins; only read when
            ``extra_bins_h`` is truthy.

    Returns:
        The transposed waveform array (``wave.T``).
    """
    bands_n = len(param['band'])
    offset = 0

    for d in range(1, bands_n + 1):
        bp = param['band'][d]
        # Zero-initialise: np.ndarray(shape=...) returns uninitialised memory,
        # and bins outside the cropped range that no filter overwrites would
        # otherwise feed garbage into the inverse STFT.
        spec_s = np.zeros((2, bp['n_fft'] // 2 + 1, spec_m.shape[2]), dtype=complex)
        h = bp['crop_stop'] - bp['crop_start']
        spec_s[:, bp['crop_start']:bp['crop_stop'], :] = spec_m[:, offset:offset + h, :]
        offset += h

        if d == bands_n:  # highest band
            if extra_bins_h:  # restore the bypassed high-end bins
                max_bin = bp['n_fft'] // 2
                spec_s[:, max_bin - extra_bins_h:max_bin, :] = extra_bins[:, :extra_bins_h, :]
            if bp['hpf_start'] > 0:
                spec_s = fft_hp_filter(spec_s, bp['hpf_start'], bp['hpf_stop'] - 1)
            band_wave = spectrogram_to_wave(spec_s, bp['hl'], param['mid_side'], param['mid_side_b2'],
                                            param['reverse'])
            # With a single band there is no accumulated signal to add to.
            wave = band_wave if bands_n == 1 else np.add(wave, band_wave)
        else:
            sr = param['band'][d + 1]['sr']
            if d == 1:  # lowest band
                spec_s = fft_lp_filter(spec_s, bp['lpf_start'], bp['lpf_stop'])
                wave = librosa.resample(
                    spectrogram_to_wave(spec_s, bp['hl'], param['mid_side'], param['mid_side_b2'],
                                        param['reverse']),
                    orig_sr=bp['sr'], target_sr=sr, res_type="sinc_fastest")
            else:  # middle bands
                spec_s = fft_hp_filter(spec_s, bp['hpf_start'], bp['hpf_stop'] - 1)
                spec_s = fft_lp_filter(spec_s, bp['lpf_start'], bp['lpf_stop'])
                wave2 = np.add(wave,
                               spectrogram_to_wave(spec_s, bp['hl'], param['mid_side'], param['mid_side_b2'],
                                                   param['reverse']))
                wave = librosa.resample(wave2, orig_sr=bp['sr'], target_sr=sr, res_type='scipy')

    return wave.T
|
303 |
+
|
304 |
+
|
305 |
+
def fft_lp_filter(spec, bin_start, bin_stop):
    """Apply a linear low-pass ramp to a spectrogram, in place.

    Bins ``bin_start`` .. ``bin_stop - 1`` on axis 1 are scaled by a gain
    that drops by ``1 / (bin_stop - bin_start)`` per bin, starting from 1.0.
    Every bin from ``bin_stop`` upward is multiplied by zero.

    Args:
        spec: 3-D array indexed as ``spec[:, bin, :]``; modified in place.
        bin_start: First bin of the ramp.
        bin_stop: First bin that is fully suppressed.

    Returns:
        The same ``spec`` object that was passed in.
    """
    ramp_len = bin_stop - bin_start
    gain = 1.0
    for bin_idx in range(bin_start, bin_stop):
        # Accumulate the gain step by step so rounding matches bin for bin.
        gain -= 1 / ramp_len
        spec[:, bin_idx, :] = gain * spec[:, bin_idx, :]

    spec[:, bin_stop:, :] *= 0
    return spec
|
314 |
+
|
315 |
+
|
316 |
+
def fft_hp_filter(spec, bin_start, bin_stop):
    """Apply a linear high-pass ramp to a spectrogram, in place.

    Walking downward from ``bin_start`` to ``bin_stop + 1`` on axis 1, each
    bin is scaled by a gain that drops by ``1 / (bin_start - bin_stop)`` per
    bin, starting from 1.0. Bins ``0`` .. ``bin_stop`` are multiplied by zero.

    Args:
        spec: 3-D array indexed as ``spec[:, bin, :]``; modified in place.
        bin_start: Highest bin of the ramp (the first one attenuated).
        bin_stop: Highest bin that is fully suppressed.

    Returns:
        The same ``spec`` object that was passed in.
    """
    ramp_len = bin_start - bin_stop
    gain = 1.0
    for bin_idx in range(bin_start, bin_stop, -1):
        # Accumulate the gain step by step so rounding matches bin for bin.
        gain -= 1 / ramp_len
        spec[:, bin_idx, :] = gain * spec[:, bin_idx, :]

    spec[:, :bin_stop + 1, :] *= 0
    return spec
|
325 |
+
|
326 |
+
|
327 |
+
def mirroring(a, spec_m, input_high_end, param):
    """Limit the high-end bins using a mirrored slice of ``spec_m``.

    A band of ``spec_m`` ending 10 bins below ``param['pre_filter_start']``
    and as tall as ``input_high_end`` is taken, reduced to magnitudes and
    flipped along axis 1. That mirror is turned into a candidate spectrogram
    and, element by element, whichever of ``input_high_end`` and the
    candidate has the smaller magnitude is kept (ties keep
    ``input_high_end``).

    Args:
        a: ``'mirroring'`` gives the mirror the phase of ``input_high_end``;
            ``'mirroring2'`` multiplies the mirror by
            ``input_high_end * 1.7``.
        spec_m: Spectrogram the mirror is taken from.
        input_high_end: High-end bins to limit.
        param: Mapping providing ``'pre_filter_start'``.

    Returns:
        The limited high-end array, or ``None`` for any other value of ``a``.
    """
    if a not in ('mirroring', 'mirroring2'):
        return None

    band_stop = param['pre_filter_start'] - 10
    band_start = band_stop - input_high_end.shape[1]
    mirror = np.flip(np.abs(spec_m[:, band_start:band_stop, :]), 1)

    if a == 'mirroring':
        candidate = mirror * np.exp(1.j * np.angle(input_high_end))
    else:
        candidate = np.multiply(mirror, input_high_end * 1.7)

    keep_input = np.abs(input_high_end) <= np.abs(candidate)
    return np.where(keep_input, input_high_end, candidate)
|
343 |
+
|
344 |
+
|
345 |
+
def ensembling(a, specs):
    """Merge several spectrograms element-wise by magnitude.

    Starting from ``specs[0]``, every further spectrogram is merged in turn.
    Before each merge both operands are truncated along axis 2 to the shorter
    length.

    Args:
        a: ``'min_mag'`` keeps, per element, the value with the smaller
            magnitude; ``'max_mag'`` keeps the larger one. Ties take the
            value from the newly merged spectrogram. Any other value only
            applies the truncation.
        specs: Spectrograms addressed as ``specs[0] .. specs[len(specs) - 1]``.
            The container is not modified.

    Returns:
        The merged spectrogram. With a single entry, ``specs[0]`` itself.

    Raises:
        ValueError: If ``specs`` is empty.
    """
    n_specs = len(specs)
    if n_specs == 0:
        raise ValueError('ensembling requires at least one spectrogram')

    # Seeding with specs[0] up front also covers the single-entry case, which
    # previously fell through to an unbound ``spec``.
    spec = specs[0]
    for i in range(1, n_specs):
        ln = min(spec.shape[2], specs[i].shape[2])
        spec = spec[:, :, :ln]
        # Truncate a local view instead of writing back into the caller's container.
        other = specs[i][:, :, :ln]

        if a == 'min_mag':
            spec = np.where(np.abs(other) <= np.abs(spec), other, spec)
        elif a == 'max_mag':
            spec = np.where(np.abs(other) >= np.abs(spec), other, spec)

    return spec
|
360 |
+
|
361 |
+
|
362 |
+
def stft(wave, nfft, hl):
    """Compute the STFT of both channels of a two-channel signal.

    Args:
        wave: Indexable whose items 0 and 1 are the two channel signals.
        nfft: FFT size, passed to ``librosa.stft`` as ``n_fft``.
        hl: Hop length, passed to ``librosa.stft`` as ``hop_length``.

    Returns:
        Fortran-ordered array stacking the two channel spectrograms.
    """
    wave_left = np.asfortranarray(wave[0])
    wave_right = np.asfortranarray(wave[1])

    # n_fft is passed by keyword: newer librosa releases accept only the
    # signal positionally, and the keyword form works on older ones too.
    spec_left = librosa.stft(wave_left, n_fft=nfft, hop_length=hl)
    spec_right = librosa.stft(wave_right, n_fft=nfft, hop_length=hl)

    return np.asfortranarray([spec_left, spec_right])
|
370 |
+
|
371 |
+
|
372 |
+
def istft(spec, hl):
    """Invert both channels of a two-channel spectrogram.

    Args:
        spec: Indexable whose items 0 and 1 are the two channel spectrograms.
        hl: Hop length, passed to ``librosa.istft`` as ``hop_length``.

    Returns:
        Fortran-ordered array stacking the two reconstructed channel signals.
    """
    spec_left = np.asfortranarray(spec[0])
    spec_right = np.asfortranarray(spec[1])

    wave_left = librosa.istft(spec_left, hop_length=hl)
    wave_right = librosa.istft(spec_right, hop_length=hl)

    # The result used to be built and then dropped, so callers received None.
    return np.asfortranarray([wave_left, wave_right])
|
379 |
+
|
380 |
+
|
381 |
+
def make_padding(width, cropsize, offset):
    """Compute the padding used to cut ``width`` into overlapping windows.

    Args:
        width: Length of the axis that will be windowed.
        cropsize: Size of one window.
        offset: Margin on each side of a window.

    Returns:
        Tuple ``(left, right, roi_size)``: ``left`` equals ``offset``;
        ``roi_size`` is ``cropsize - 2 * offset`` (or ``cropsize`` when that
        difference is zero); ``right`` is
        ``roi_size - width % roi_size + offset``.
    """
    inner = cropsize - offset * 2
    roi_size = cropsize if inner == 0 else inner
    pad_right = roi_size - width % roi_size + offset
    return offset, pad_right, roi_size
|
388 |
+
|
uvronnx/src/uvr.py
ADDED
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding:utf-8 -*-
|
2 |
+
# @FileName :uvr.py
|
3 |
+
# @Time :2023/8/2 10:47
|
4 |
+
# @Author :lovemefan
|
5 |
+
# @Email :lovemefan@outlook.com
|
6 |
+
import os.path
|
7 |
+
|
8 |
+
import numpy as np
|
9 |
+
import librosa
|
10 |
+
from tqdm import tqdm
|
11 |
+
|
12 |
+
from uvronnx.src.config import UVR_CONFIG
|
13 |
+
from uvronnx.src.ortInferSession import UVROrtInferSession
|
14 |
+
from uvronnx.src.utils import spec_utils
|
15 |
+
from uvronnx.src.utils.AudioHelper import AudioReader
|
16 |
+
from uvronnx.src.utils.spec_utils import make_padding
|
17 |
+
|
18 |
+
|
19 |
+
class UVRModel:
    """Splits a mix into instrument and vocal waveforms with the UVR ONNX model."""

    def __init__(self, model_path=None, use_cuda=False):
        """Create the inference session.

        Args:
            model_path: Path to the ONNX model. Defaults to
                ``onnx/uvr-sim.onnx`` under the directory two levels above
                this file.
            use_cuda: Forwarded to ``UVROrtInferSession`` as its
                ``'use_cuda'`` setting.

        Raises:
            FileNotFoundError: If the model file does not exist.
        """
        project_dir = os.path.dirname(os.path.dirname(__file__))
        model_path = model_path or os.path.join(project_dir, 'onnx/uvr-sim.onnx')
        # Raise explicitly: an ``assert`` would be stripped under ``python -O``.
        if not os.path.exists(model_path):
            raise FileNotFoundError(f"{model_path} is not exist")

        self.model = UVROrtInferSession({
            'model_path': model_path,
            'use_cuda': use_cuda
        })
        # Frames trimmed from each side of every model output window.
        self.offset = 128
        # Number of frames fed to the model per window.
        self.window_size = 512

    @staticmethod
    def preprocess(x_spec):
        """Split a complex spectrogram into ``(magnitude, phase)``."""
        return np.abs(x_spec), np.angle(x_spec)

    def separate_offline(self, mixed_audio, sample_rate=44100):
        """Separate a mix into instrument and vocal waveforms.

        Args:
            mixed_audio: Either a path (``str``), read with
                ``AudioReader.read_wav_file``, or a waveform array. A 1-D
                array is duplicated into two identical channels; a 2-D array
                is used as given (the code resamples and transforms along
                its last axis).
            sample_rate: Sample rate of ``mixed_audio``; replaced by the rate
                read from the file when a path is given. If it differs from
                the highest band's ``sr`` in ``UVR_CONFIG`` the audio is
                resampled to that rate first.

        Returns:
            Tuple ``(wav_instrument, wav_vocals)`` as produced by
            ``spec_utils.cmb_spectrogram_to_wave``.
        """
        if isinstance(mixed_audio, str):
            mixed_audio, sample_rate = AudioReader.read_wav_file(mixed_audio)

        band_params = UVR_CONFIG['band']
        bands_n = len(band_params)
        x_wave, x_spec_s = {}, {}
        for d in range(bands_n, 0, -1):
            bp = band_params[d]
            if d == bands_n:  # high-end band
                x_wave[d] = mixed_audio
                if x_wave[d].ndim == 1:
                    x_wave[d] = np.asfortranarray([x_wave[d], x_wave[d]])
                # The band configuration assumes this band runs at bp['sr'];
                # bring other input rates in line instead of silently
                # treating them as if they already matched.
                if sample_rate is not None and sample_rate != bp['sr']:
                    x_wave[d] = librosa.resample(x_wave[d], orig_sr=sample_rate, target_sr=bp['sr'],
                                                 res_type=bp['res_type'])
            else:  # lower bands are derived from the band above
                x_wave[d] = librosa.resample(x_wave[d + 1], orig_sr=band_params[d + 1]['sr'],
                                             target_sr=bp['sr'], res_type=bp['res_type'])
            # STFT of the band's waveform
            x_spec_s[d] = spec_utils.wave_to_spectrogram_mt(x_wave[d], bp['hl'], bp['n_fft'],
                                                            UVR_CONFIG['mid_side'],
                                                            UVR_CONFIG['mid_side_b2'], UVR_CONFIG['reverse'])
            if d == bands_n:
                # Keep the top bins of the high-end band for the mirroring step below.
                input_high_end_h = (bp['n_fft'] // 2 - bp['crop_stop']) + (
                    UVR_CONFIG['pre_filter_stop'] - UVR_CONFIG['pre_filter_start'])
                input_high_end = x_spec_s[d][:, bp['n_fft'] // 2 - input_high_end_h:bp['n_fft'] // 2, :]

        x_spec_m = spec_utils.combine_spectrograms(x_spec_s, UVR_CONFIG)
        x_mag, x_phase = self.preprocess(x_spec_m)

        coef = x_mag.max()
        if coef == 0:
            # All-silent input: avoid 0/0 turning every value into NaN.
            coef = 1.0
        x_mag_pre = x_mag / coef

        n_frame = x_mag_pre.shape[2]
        pad_l, pad_r, roi_size = make_padding(n_frame, self.window_size, self.offset)
        n_window = int(np.ceil(n_frame / roi_size))

        x_mag_pad = np.pad(
            x_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode='constant')

        preds = []
        for i in tqdm(range(n_window)):
            start = i * roi_size
            x_mag_window = x_mag_pad[None, :, :, start:start + self.window_size]

            h = self.model(x_mag_window)
            # Drop the margin on both sides of the window.
            pred = h[:, :, :, self.offset:-self.offset]
            assert pred.shape[3] > 0

            preds.append(pred[0])

        pred = np.concatenate(preds, axis=2)[:, :, :n_frame]

        # Undo the normalisation and re-attach the phase of the mix.
        y_spec_m = (pred * coef) * np.exp(1.j * x_phase)
        v_spec_m = x_spec_m - y_spec_m

        input_high_end_ = spec_utils.mirroring('mirroring', y_spec_m, input_high_end, UVR_CONFIG)
        wav_instrument = spec_utils.cmb_spectrogram_to_wave(y_spec_m, UVR_CONFIG, input_high_end_h,
                                                            input_high_end_)
        print('instruments done')

        input_high_end_ = spec_utils.mirroring('mirroring', v_spec_m, input_high_end, UVR_CONFIG)
        wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, UVR_CONFIG, input_high_end_h, input_high_end_)

        return wav_instrument, wav_vocals
|
114 |
+
|
115 |
+
|
116 |
+
if __name__ == '__main__':
    import sys

    # Demo entry point: separate the WAV file given as the first command-line
    # argument. Without an argument the original sample path is used.
    wav_path = sys.argv[1] if len(sys.argv) > 1 else '/Users/cenglingfan/Downloads/晴天.wav_-4key_fumin.wav'

    model = UVRModel()
    audio, sample_rate = AudioReader.read_wav_file(wav_path)
    instrument, vocal = model.separate_offline(audio, sample_rate)
    print(instrument)
    print(vocal)
|