Commit 3a010aa by JacobLinCool (parent: 4b56fbf)

feat: infer

Note: this view is limited to 50 files because the commit contains too many changes.

Files changed (50)
  1. .gitignore +1 -0
  2. README.md +4 -2
  3. app.py +114 -27
  4. assets/pretrained_v2/D40k.pth +0 -3
  5. assets/pretrained_v2/G40k.pth +0 -3
  6. config.json +1 -1
  7. configs/config.py +245 -0
  8. infer/lib/audio.py +1 -1
  9. infer/lib/rmvpe.py +2 -12
  10. infer/lib/train/process_ckpt.py +2 -2
  11. infer/lib/uvr5_pack/lib_v5/dataset.py +0 -183
  12. infer/lib/uvr5_pack/lib_v5/layers.py +0 -118
  13. infer/lib/uvr5_pack/lib_v5/layers_123812KB .py +0 -118
  14. infer/lib/uvr5_pack/lib_v5/layers_123821KB.py +0 -118
  15. infer/lib/uvr5_pack/lib_v5/layers_33966KB.py +0 -126
  16. infer/lib/uvr5_pack/lib_v5/layers_537227KB.py +0 -126
  17. infer/lib/uvr5_pack/lib_v5/layers_537238KB.py +0 -126
  18. infer/lib/uvr5_pack/lib_v5/layers_new.py +0 -125
  19. infer/lib/uvr5_pack/lib_v5/model_param_init.py +0 -69
  20. infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json +0 -19
  21. infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json +0 -19
  22. infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json +0 -19
  23. infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json +0 -19
  24. infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl256.json +0 -19
  25. infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json +0 -19
  26. infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512_cut.json +0 -19
  27. infer/lib/uvr5_pack/lib_v5/modelparams/2band_32000.json +0 -30
  28. infer/lib/uvr5_pack/lib_v5/modelparams/2band_44100_lofi.json +0 -30
  29. infer/lib/uvr5_pack/lib_v5/modelparams/2band_48000.json +0 -30
  30. infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100.json +0 -42
  31. infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_mid.json +0 -43
  32. infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json +0 -43
  33. infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json +0 -54
  34. infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_mid.json +0 -55
  35. infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb.json +0 -55
  36. infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb2.json +0 -55
  37. infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_reverse.json +0 -55
  38. infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_sw.json +0 -55
  39. infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2.json +0 -54
  40. infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2_sn.json +0 -55
  41. infer/lib/uvr5_pack/lib_v5/modelparams/4band_v3.json +0 -54
  42. infer/lib/uvr5_pack/lib_v5/modelparams/ensemble.json +0 -43
  43. infer/lib/uvr5_pack/lib_v5/nets.py +0 -123
  44. infer/lib/uvr5_pack/lib_v5/nets_123812KB.py +0 -122
  45. infer/lib/uvr5_pack/lib_v5/nets_123821KB.py +0 -122
  46. infer/lib/uvr5_pack/lib_v5/nets_33966KB.py +0 -122
  47. infer/lib/uvr5_pack/lib_v5/nets_537227KB.py +0 -123
  48. infer/lib/uvr5_pack/lib_v5/nets_537238KB.py +0 -123
  49. infer/lib/uvr5_pack/lib_v5/nets_61968KB.py +0 -122
  50. infer/lib/uvr5_pack/lib_v5/nets_new.py +0 -133
.gitignore CHANGED
@@ -1,2 +1,3 @@
 .DS_Store
 *.pyc
+__pycache__
README.md CHANGED
@@ -1,5 +1,5 @@
 ---
-title: RVC Trainer
+title: ZeroRVC
 emoji: 🦀
 colorFrom: gray
 colorTo: gray
@@ -9,4 +9,6 @@ app_file: app.py
 pinned: false
 ---
 
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+# ZeroRVC
+
+Run Retrieval-based Voice Conversion training and inference on HuggingFace ZeroGPU.
app.py CHANGED
@@ -1,11 +1,12 @@
+from typing import Tuple
+from prelude import prelude
+
+prelude()
+
 import os
 import traceback
-
 import numpy as np
 from sklearn.cluster import MiniBatchKMeans
-
-os.environ["PYTORCH_JIT"] = "0v"
-
 from random import shuffle
 import gradio as gr
 import zipfile
@@ -18,23 +19,12 @@ from infer.modules.train.extract.extract_f0_rmvpe import FeatureInput
 from infer.modules.train.extract_feature_print import HubertFeatureExtractor
 from infer.modules.train.train import train
 from infer.lib.train.process_ckpt import extract_small_model
+from infer.modules.vc.modules import VC
+from configs.config import Config
+import demucs.separate
+import soundfile as sf
 from zero import zero
-
-# patch for jit script
-# if we find `def expand_2d_or_3d_tensor(x,` in /usr/local/lib/python3.10/site-packages/fairseq/models/model_utils.py
-# patch it with `def expand_2d_or_3d_tensor(x: Tensor,`
-FAIRSEQ_CODE = "/usr/local/lib/python3.10/site-packages/fairseq/models/model_utils.py"
-if os.path.exists(FAIRSEQ_CODE):
-    with open(FAIRSEQ_CODE, "r") as f:
-        lines = f.readlines()
-    with open(FAIRSEQ_CODE, "w") as f:
-        for line in lines:
-            if "def expand_2d_or_3d_tensor(x, trg_dim: int, padding_idx: int):" in line:
-                f.write(
-                    "def expand_2d_or_3d_tensor(x: Tensor, trg_dim: int, padding_idx: int) -> Tensor:\n"
-                )
-            else:
-                f.write(line)
+from model import device
 
 
 def extract_audio_files(zip_file: str, target_dir: str) -> list[str]:
@@ -189,13 +179,15 @@ def download_weight(exp_dir: str) -> str:
         raise gr.Error("No model found")
 
     latest_model = max(models, key=os.path.getctime)
+    print(f"Latest model: {latest_model}")
 
    name = os.path.basename(exp_dir)
+    out = os.path.join(exp_dir, f"{name}.pth")
     extract_small_model(
-        latest_model, name, "40k", True, "Model trained by ZeroGPU.", "v2"
+        latest_model, out, "40k", True, "Model trained by ZeroGPU.", "v2"
     )
 
-    return "assets/weights/%s.pth" % name
+    return out
 
 
 def train_index(exp_dir: str) -> str:
@@ -269,9 +261,70 @@ def restore_expdir(zip: str) -> str:
     return exp_dir
 
 
+@zero(duration=120)
+def infer(exp_dir: str, original_audio: str, f0add: int) -> Tuple[int, np.ndarray]:
+    name = os.path.basename(exp_dir)
+    model = os.path.join(exp_dir, f"{name}.pth")
+    if not os.path.exists(model):
+        raise gr.Error("Model not found")
+
+    index = glob(f"{exp_dir}/added_*.index")
+    if not index:
+        raise gr.Error("Index not found")
+
+    base = os.path.basename(original_audio)
+    base = os.path.splitext(base)[0]
+    demucs.separate.main(
+        ["--two-stems", "vocals", "-d", str(device), "-n", "htdemucs", original_audio]
+    )
+    out = os.path.join("separated", "htdemucs", base, "vocals.wav")
+
+    cfg = Config()
+    vc = VC(cfg)
+    vc.get_vc(model)
+    _, wav_opt = vc.vc_single(
+        0,
+        out,
+        f0add,
+        None,
+        "rmvpe",
+        index,
+        None,
+        0.5,
+        3,
+        0,
+        1,
+        0.33,
+    )
+
+    sr = wav_opt[0]
+    data = wav_opt[1]
+
+    return sr, data
+
+
+def merge(exp_dir: str, original_audio: str, vocal: Tuple[int, np.ndarray]) -> str:
+    base = os.path.basename(original_audio)
+    base = os.path.splitext(base)[0]
+    music = os.path.join("separated", "htdemucs", base, "no-vocals.wav")
+
+    tmp = os.path.join(exp_dir, "tmp.wav")
+    sf.write(tmp, vocal[1], vocal[0])
+
+    os.system(
+        f"ffmpeg -i {music} -i {tmp} -filter_complex '[1]volume=2[a];[0][a]amix=inputs=2:duration=first:dropout_transition=2' {tmp}.merged.mp3"
+    )
+
+    return f"{tmp}.merged.mp3"
+
+
 with gr.Blocks() as app:
     # allow user to manually select the experiment directory
-    exp_dir = gr.Textbox(label="Experiment directory (don't touch it unless you know what you are doing)", visible=True, interactive=True)
+    exp_dir = gr.Textbox(
+        label="Experiment directory (don't touch it unless you know what you are doing)",
+        visible=True,
+        interactive=True,
+    )
 
     with gr.Tabs():
         with gr.Tab(label="New / Restore"):
@@ -284,10 +337,10 @@ with gr.Blocks() as app:
                 preprocess_output = gr.Textbox(
                     label="Preprocessing output", lines=5
                 )
-                with gr.Column():
-                    preprocess_btn = gr.Button(
-                        value="Start New Experiment", variant="primary"
-                    )
+
+            preprocess_btn = gr.Button(
+                value="Start New Experiment", variant="primary"
+            )
 
         with gr.Row():
             restore_zip_file = gr.File(
@@ -327,6 +380,26 @@ with gr.Blocks() as app:
             )
             download_expdir_output = gr.File(label="Download experiment directory")
 
+        with gr.Tab(label="Inference"):
+            with gr.Row():
+                original_audio = gr.Audio(
+                    label="Upload original audio",
+                    type="filepath",
+                    show_download_button=True,
+                )
+                f0add = gr.Slider(
+                    label="F0 add",
+                    minimum=-16,
+                    maximum=16,
+                    step=1,
+                    value=0,
+                )
+                infer_btn = gr.Button(value="Infer", variant="primary")
+            with gr.Row():
+                infer_output = gr.Audio(label="Inferred audio")
+            with gr.Row():
+                merge_output = gr.Audio(label="Merged audio")
+
     preprocess_btn.click(
         fn=preprocess,
         inputs=[zip_file],
@@ -343,6 +416,10 @@ with gr.Blocks() as app:
         fn=train_model,
         inputs=[exp_dir],
         outputs=[latest_model],
+    ).success(
+        fn=train_model,
+        inputs=[exp_dir],
+        outputs=[latest_model],
     )
 
     train_index_btn.click(
@@ -369,4 +446,14 @@ with gr.Blocks() as app:
         outputs=[exp_dir],
     )
 
+    infer_btn.click(
+        fn=infer,
+        inputs=[exp_dir, original_audio, f0add],
+        outputs=[infer_output],
+    ).success(
+        fn=merge,
+        inputs=[exp_dir, original_audio, infer_output],
+        outputs=[merge_output],
+    )
+
     app.launch()
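
The inference flow added above has three stages: Demucs splits the song into vocal and accompaniment stems, the trained RVC model converts the vocal stem, and ffmpeg's amix filter overlays the converted vocal back onto the accompaniment. Below is a minimal, self-contained sketch of the separation and mixing stages only, assuming demucs and ffmpeg are installed; the conversion stage is omitted because VC and Config come from this repo's own modules.

import os
import subprocess

import demucs.separate


def separate_vocals(original_audio: str, device: str = "cpu") -> tuple[str, str]:
    # htdemucs writes its stems to separated/htdemucs/<basename>/
    demucs.separate.main(
        ["--two-stems", "vocals", "-d", device, "-n", "htdemucs", original_audio]
    )
    base = os.path.splitext(os.path.basename(original_audio))[0]
    stem_dir = os.path.join("separated", "htdemucs", base)
    return os.path.join(stem_dir, "vocals.wav"), os.path.join(stem_dir, "no-vocals.wav")


def mix(music: str, vocal: str, out: str) -> None:
    # Boost the converted vocal 2x, then mix it over the accompaniment,
    # matching the filter graph used in merge() above.
    subprocess.run(
        [
            "ffmpeg", "-i", music, "-i", vocal,
            "-filter_complex",
            "[1]volume=2[a];[0][a]amix=inputs=2:duration=first:dropout_transition=2",
            out,
        ],
        check=True,
    )

Passing an argument list to subprocess.run, rather than interpolating paths into an os.system string, also sidesteps shell-quoting problems with filenames that contain spaces.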
assets/pretrained_v2/D40k.pth DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:471378e894e7191f89a94eda8288c5947b16bbe0b10c3f1f17efdb7a1d998242
-size 142875703
assets/pretrained_v2/G40k.pth DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:a3843da7fde33db1dab176146c70d6c2df06eafe9457f4e3aa10024e9c6a4b69
-size 72959671
config.json CHANGED
@@ -67,7 +67,7 @@
   "c_mel": 45,
   "epochs": 20000,
   "eps": 1e-09,
-  "fp16_run": false,
+  "fp16_run": true,
   "init_lr_ratio": 1,
   "learning_rate": 0.0001,
   "log_interval": 200,
configs/config.py ADDED
@@ -0,0 +1,245 @@
+import argparse
+import os
+import sys
+import json
+import shutil
+from multiprocessing import cpu_count
+
+import torch
+import logging
+from model import device, fp16
+
+logger = logging.getLogger(__name__)
+
+
+version_config_list = [
+    "v1/32k.json",
+    "v1/40k.json",
+    "v1/48k.json",
+    "v2/48k.json",
+    "v2/32k.json",
+]
+
+
+def singleton_variable(func):
+    def wrapper(*args, **kwargs):
+        if not wrapper.instance:
+            wrapper.instance = func(*args, **kwargs)
+        return wrapper.instance
+
+    wrapper.instance = None
+    return wrapper
+
+
+@singleton_variable
+class Config:
+    def __init__(self):
+        self.device = str(device)
+        self.is_half = fp16
+        self.use_jit = False
+        self.n_cpu = 0
+        self.gpu_name = None
+        self.json_config = self.load_config_json()
+        self.gpu_mem = None
+        (
+            self.python_cmd,
+            self.listen_port,
+            self.iscolab,
+            self.noparallel,
+            self.noautoopen,
+            self.dml,
+        ) = self.arg_parse()
+        self.instead = ""
+        self.preprocess_per = 3.7
+        self.x_pad, self.x_query, self.x_center, self.x_max = self.device_config()
+
+    @staticmethod
+    def load_config_json() -> dict:
+        d = {}
+        # for config_file in version_config_list:
+        #     p = f"configs/inuse/{config_file}"
+        #     if not os.path.exists(p):
+        #         shutil.copy(f"configs/{config_file}", p)
+        #     with open(f"configs/inuse/{config_file}", "r") as f:
+        #         d[config_file] = json.load(f)
+        return d
+
+    @staticmethod
+    def arg_parse() -> tuple:
+        exe = sys.executable or "python"
+        parser = argparse.ArgumentParser()
+        parser.add_argument("--port", type=int, default=7865, help="Listen port")
+        parser.add_argument("--pycmd", type=str, default=exe, help="Python command")
+        parser.add_argument("--colab", action="store_true", help="Launch in colab")
+        parser.add_argument(
+            "--noparallel", action="store_true", help="Disable parallel processing"
+        )
+        parser.add_argument(
+            "--noautoopen",
+            action="store_true",
+            help="Do not open in browser automatically",
+        )
+        parser.add_argument(
+            "--dml",
+            action="store_true",
+            help="torch_dml",
+        )
+        cmd_opts = parser.parse_args()
+
+        cmd_opts.port = cmd_opts.port if 0 <= cmd_opts.port <= 65535 else 7865
+
+        return (
+            cmd_opts.pycmd,
+            cmd_opts.port,
+            cmd_opts.colab,
+            cmd_opts.noparallel,
+            cmd_opts.noautoopen,
+            cmd_opts.dml,
+        )
+
+    # has_mps is only available in nightly pytorch (for now) and macOS 12.3+.
+    # check `getattr` and try it for compatibility
+    @staticmethod
+    def has_mps() -> bool:
+        if not torch.backends.mps.is_available():
+            return False
+        try:
+            torch.zeros(1).to(torch.device("mps"))
+            return True
+        except Exception:
+            return False
+
+    @staticmethod
+    def has_xpu() -> bool:
+        if hasattr(torch, "xpu") and torch.xpu.is_available():
+            return True
+        else:
+            return False
+
+    def use_fp32_config(self):
+        for config_file in version_config_list:
+            self.json_config[config_file]["train"]["fp16_run"] = False
+            with open(f"configs/inuse/{config_file}", "r") as f:
+                strr = f.read().replace("true", "false")
+            with open(f"configs/inuse/{config_file}", "w") as f:
+                f.write(strr)
+            logger.info("overwrite " + config_file)
+        self.preprocess_per = 3.0
+        logger.info("overwrite preprocess_per to %d" % (self.preprocess_per))
+
+    def device_config(self) -> tuple:
+        if torch.cuda.is_available():
+            if self.has_xpu():
+                self.device = self.instead = "xpu:0"
+                self.is_half = True
+            i_device = int(self.device.split(":")[-1])
+            self.gpu_name = torch.cuda.get_device_name(i_device)
+            if (
+                ("16" in self.gpu_name and "V100" not in self.gpu_name.upper())
+                or "P40" in self.gpu_name.upper()
+                or "P10" in self.gpu_name.upper()
+                or "1060" in self.gpu_name
+                or "1070" in self.gpu_name
+                or "1080" in self.gpu_name
+            ):
+                logger.info("Found GPU %s, force to fp32", self.gpu_name)
+                self.is_half = False
+                self.use_fp32_config()
+            else:
+                logger.info("Found GPU %s", self.gpu_name)
+            self.gpu_mem = int(
+                torch.cuda.get_device_properties(i_device).total_memory
+                / 1024
+                / 1024
+                / 1024
+                + 0.4
+            )
+            if self.gpu_mem <= 4:
+                self.preprocess_per = 3.0
+        elif self.has_mps():
+            logger.info("No supported Nvidia GPU found")
+            self.device = self.instead = "mps"
+            self.is_half = False
+            self.use_fp32_config()
+        else:
+            logger.info("No supported Nvidia GPU found")
+            self.device = self.instead = "cpu"
+            self.is_half = False
+            self.use_fp32_config()
+
+        if self.n_cpu == 0:
+            self.n_cpu = cpu_count()
+
+        if self.is_half:
+            # config for 6 GB VRAM
+            x_pad = 3
+            x_query = 10
+            x_center = 60
+            x_max = 65
+        else:
+            # config for 5 GB VRAM
+            x_pad = 1
+            x_query = 6
+            x_center = 38
+            x_max = 41
+
+        if self.gpu_mem is not None and self.gpu_mem <= 4:
+            x_pad = 1
+            x_query = 5
+            x_center = 30
+            x_max = 32
+        if self.dml:
+            logger.info("Use DirectML instead")
+            if (
+                os.path.exists(
+                    "runtime\Lib\site-packages\onnxruntime\capi\DirectML.dll"
+                )
+                == False
+            ):
+                try:
+                    os.rename(
+                        "runtime\Lib\site-packages\onnxruntime",
+                        "runtime\Lib\site-packages\onnxruntime-cuda",
+                    )
+                except:
+                    pass
+                try:
+                    os.rename(
+                        "runtime\Lib\site-packages\onnxruntime-dml",
+                        "runtime\Lib\site-packages\onnxruntime",
+                    )
+                except:
+                    pass
+            # if self.device != "cpu":
+            import torch_directml
+
+            self.device = torch_directml.device(torch_directml.default_device())
+            self.is_half = False
+        else:
+            if self.instead:
+                logger.info(f"Use {self.instead} instead")
+            if (
+                os.path.exists(
+                    "runtime\Lib\site-packages\onnxruntime\capi\onnxruntime_providers_cuda.dll"
+                )
+                == False
+            ):
+                try:
+                    os.rename(
+                        "runtime\Lib\site-packages\onnxruntime",
+                        "runtime\Lib\site-packages\onnxruntime-dml",
+                    )
+                except:
+                    pass
+                try:
+                    os.rename(
+                        "runtime\Lib\site-packages\onnxruntime-cuda",
+                        "runtime\Lib\site-packages\onnxruntime",
+                    )
+                except:
+                    pass
+        logger.info(
+            "Half-precision floating-point: %s, device: %s"
+            % (self.is_half, self.device)
+        )
+        return x_pad, x_query, x_center, x_max
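
The @singleton_variable decorator above stores the first call's result on the wrapper function itself, so Config() builds the configuration once and every later call returns the same object. A small self-contained demo of the same pattern; the AppConfig name is illustrative, not from the repo:

def singleton_variable(func):
    def wrapper(*args, **kwargs):
        if not wrapper.instance:
            wrapper.instance = func(*args, **kwargs)
        return wrapper.instance

    wrapper.instance = None
    return wrapper


@singleton_variable
class AppConfig:  # hypothetical example class, not part of this repo
    def __init__(self):
        self.loaded = True


a = AppConfig()
b = AppConfig()
assert a is b  # __init__ ran once; both names share one cached instance

One caveat of the `if not wrapper.instance` test: a falsy first result (0, "", an empty dict) would be rebuilt on every call, so an `is None` check would be the more defensive variant.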
infer/lib/audio.py CHANGED
@@ -1,8 +1,8 @@
 import platform, os
+import traceback
 import ffmpeg
 import numpy as np
 import av
-from io import BytesIO
 
 
 def wav2(i, o, format):
infer/lib/rmvpe.py CHANGED
@@ -1,24 +1,14 @@
 from io import BytesIO
 import os
-from typing import List, Optional, Tuple
+from typing import List
 import numpy as np
 import torch
 
 from infer.lib import jit
 
-try:
-    # Fix "Torch not compiled with CUDA enabled"
-    import intel_extension_for_pytorch as ipex  # pylint: disable=import-error, unused-import
-
-    if torch.xpu.is_available():
-        from infer.modules.ipex import ipex_init
-
-        ipex_init()
-except Exception:  # pylint: disable=broad-exception-caught
-    pass
 import torch.nn as nn
 import torch.nn.functional as F
-from librosa.util import normalize, pad_center, tiny
+from librosa.util import pad_center
 from scipy.signal import get_window
 
 import logging
infer/lib/train/process_ckpt.py CHANGED
@@ -61,7 +61,7 @@ def show_info(path):
         return traceback.format_exc()
 
 
-def extract_small_model(path, name, sr, if_f0, info, version):
+def extract_small_model(path, out, sr, if_f0, info, version):
     try:
         ckpt = torch.load(path, map_location="cpu")
         if "model" in ckpt:
@@ -185,7 +185,7 @@ def extract_small_model(path, name, sr, if_f0, info, version):
         opt["version"] = version
         opt["sr"] = sr
         opt["f0"] = int(if_f0)
-        torch.save(opt, "assets/weights/%s.pth" % name)
+        torch.save(opt, out)
         return "Success."
     except:
         return traceback.format_exc()
infer/lib/uvr5_pack/lib_v5/dataset.py DELETED
@@ -1,183 +0,0 @@
-import os
-import random
-
-import numpy as np
-import torch
-import torch.utils.data
-from tqdm import tqdm
-
-from . import spec_utils
-
-
-class VocalRemoverValidationSet(torch.utils.data.Dataset):
-    def __init__(self, patch_list):
-        self.patch_list = patch_list
-
-    def __len__(self):
-        return len(self.patch_list)
-
-    def __getitem__(self, idx):
-        path = self.patch_list[idx]
-        data = np.load(path)
-
-        X, y = data["X"], data["y"]
-
-        X_mag = np.abs(X)
-        y_mag = np.abs(y)
-
-        return X_mag, y_mag
-
-
-def make_pair(mix_dir, inst_dir):
-    input_exts = [".wav", ".m4a", ".mp3", ".mp4", ".flac"]
-
-    X_list = sorted(
-        [
-            os.path.join(mix_dir, fname)
-            for fname in os.listdir(mix_dir)
-            if os.path.splitext(fname)[1] in input_exts
-        ]
-    )
-    y_list = sorted(
-        [
-            os.path.join(inst_dir, fname)
-            for fname in os.listdir(inst_dir)
-            if os.path.splitext(fname)[1] in input_exts
-        ]
-    )
-
-    filelist = list(zip(X_list, y_list))
-
-    return filelist
-
-
-def train_val_split(dataset_dir, split_mode, val_rate, val_filelist):
-    if split_mode == "random":
-        filelist = make_pair(
-            os.path.join(dataset_dir, "mixtures"),
-            os.path.join(dataset_dir, "instruments"),
-        )
-
-        random.shuffle(filelist)
-
-        if len(val_filelist) == 0:
-            val_size = int(len(filelist) * val_rate)
-            train_filelist = filelist[:-val_size]
-            val_filelist = filelist[-val_size:]
-        else:
-            train_filelist = [
-                pair for pair in filelist if list(pair) not in val_filelist
-            ]
-    elif split_mode == "subdirs":
-        if len(val_filelist) != 0:
-            raise ValueError(
-                "The `val_filelist` option is not available in `subdirs` mode"
-            )
-
-        train_filelist = make_pair(
-            os.path.join(dataset_dir, "training/mixtures"),
-            os.path.join(dataset_dir, "training/instruments"),
-        )
-
-        val_filelist = make_pair(
-            os.path.join(dataset_dir, "validation/mixtures"),
-            os.path.join(dataset_dir, "validation/instruments"),
-        )
-
-    return train_filelist, val_filelist
-
-
-def augment(X, y, reduction_rate, reduction_mask, mixup_rate, mixup_alpha):
-    perm = np.random.permutation(len(X))
-    for i, idx in enumerate(tqdm(perm)):
-        if np.random.uniform() < reduction_rate:
-            y[idx] = spec_utils.reduce_vocal_aggressively(
-                X[idx], y[idx], reduction_mask
-            )
-
-        if np.random.uniform() < 0.5:
-            # swap channel
-            X[idx] = X[idx, ::-1]
-            y[idx] = y[idx, ::-1]
-        if np.random.uniform() < 0.02:
-            # mono
-            X[idx] = X[idx].mean(axis=0, keepdims=True)
-            y[idx] = y[idx].mean(axis=0, keepdims=True)
-        if np.random.uniform() < 0.02:
-            # inst
-            X[idx] = y[idx]
-
-        if np.random.uniform() < mixup_rate and i < len(perm) - 1:
-            lam = np.random.beta(mixup_alpha, mixup_alpha)
-            X[idx] = lam * X[idx] + (1 - lam) * X[perm[i + 1]]
-            y[idx] = lam * y[idx] + (1 - lam) * y[perm[i + 1]]
-
-    return X, y
-
-
-def make_padding(width, cropsize, offset):
-    left = offset
-    roi_size = cropsize - left * 2
-    if roi_size == 0:
-        roi_size = cropsize
-    right = roi_size - (width % roi_size) + left
-
-    return left, right, roi_size
-
-
-def make_training_set(filelist, cropsize, patches, sr, hop_length, n_fft, offset):
-    len_dataset = patches * len(filelist)
-
-    X_dataset = np.zeros((len_dataset, 2, n_fft // 2 + 1, cropsize), dtype=np.complex64)
-    y_dataset = np.zeros((len_dataset, 2, n_fft // 2 + 1, cropsize), dtype=np.complex64)
-
-    for i, (X_path, y_path) in enumerate(tqdm(filelist)):
-        X, y = spec_utils.cache_or_load(X_path, y_path, sr, hop_length, n_fft)
-        coef = np.max([np.abs(X).max(), np.abs(y).max()])
-        X, y = X / coef, y / coef
-
-        l, r, roi_size = make_padding(X.shape[2], cropsize, offset)
-        X_pad = np.pad(X, ((0, 0), (0, 0), (l, r)), mode="constant")
-        y_pad = np.pad(y, ((0, 0), (0, 0), (l, r)), mode="constant")
-
-        starts = np.random.randint(0, X_pad.shape[2] - cropsize, patches)
-        ends = starts + cropsize
-        for j in range(patches):
-            idx = i * patches + j
-            X_dataset[idx] = X_pad[:, :, starts[j] : ends[j]]
-            y_dataset[idx] = y_pad[:, :, starts[j] : ends[j]]
-
-    return X_dataset, y_dataset
-
-
-def make_validation_set(filelist, cropsize, sr, hop_length, n_fft, offset):
-    patch_list = []
-    patch_dir = "cs{}_sr{}_hl{}_nf{}_of{}".format(
-        cropsize, sr, hop_length, n_fft, offset
-    )
-    os.makedirs(patch_dir, exist_ok=True)
-
-    for i, (X_path, y_path) in enumerate(tqdm(filelist)):
-        basename = os.path.splitext(os.path.basename(X_path))[0]
-
-        X, y = spec_utils.cache_or_load(X_path, y_path, sr, hop_length, n_fft)
-        coef = np.max([np.abs(X).max(), np.abs(y).max()])
-        X, y = X / coef, y / coef
-
-        l, r, roi_size = make_padding(X.shape[2], cropsize, offset)
-        X_pad = np.pad(X, ((0, 0), (0, 0), (l, r)), mode="constant")
-        y_pad = np.pad(y, ((0, 0), (0, 0), (l, r)), mode="constant")
-
-        len_dataset = int(np.ceil(X.shape[2] / roi_size))
-        for j in range(len_dataset):
-            outpath = os.path.join(patch_dir, "{}_p{}.npz".format(basename, j))
-            start = j * roi_size
-            if not os.path.exists(outpath):
-                np.savez(
-                    outpath,
-                    X=X_pad[:, :, start : start + cropsize],
-                    y=y_pad[:, :, start : start + cropsize],
-                )
-            patch_list.append(outpath)
-
-    return VocalRemoverValidationSet(patch_list)
infer/lib/uvr5_pack/lib_v5/layers.py DELETED
@@ -1,118 +0,0 @@
-import torch
-import torch.nn.functional as F
-from torch import nn
-
-from . import spec_utils
-
-
-class Conv2DBNActiv(nn.Module):
-    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
-        super(Conv2DBNActiv, self).__init__()
-        self.conv = nn.Sequential(
-            nn.Conv2d(
-                nin,
-                nout,
-                kernel_size=ksize,
-                stride=stride,
-                padding=pad,
-                dilation=dilation,
-                bias=False,
-            ),
-            nn.BatchNorm2d(nout),
-            activ(),
-        )
-
-    def __call__(self, x):
-        return self.conv(x)
-
-
-class SeperableConv2DBNActiv(nn.Module):
-    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
-        super(SeperableConv2DBNActiv, self).__init__()
-        self.conv = nn.Sequential(
-            nn.Conv2d(
-                nin,
-                nin,
-                kernel_size=ksize,
-                stride=stride,
-                padding=pad,
-                dilation=dilation,
-                groups=nin,
-                bias=False,
-            ),
-            nn.Conv2d(nin, nout, kernel_size=1, bias=False),
-            nn.BatchNorm2d(nout),
-            activ(),
-        )
-
-    def __call__(self, x):
-        return self.conv(x)
-
-
-class Encoder(nn.Module):
-    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
-        super(Encoder, self).__init__()
-        self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
-        self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)
-
-    def __call__(self, x):
-        skip = self.conv1(x)
-        h = self.conv2(skip)
-
-        return h, skip
-
-
-class Decoder(nn.Module):
-    def __init__(
-        self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
-    ):
-        super(Decoder, self).__init__()
-        self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
-        self.dropout = nn.Dropout2d(0.1) if dropout else None
-
-    def __call__(self, x, skip=None):
-        x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
-        if skip is not None:
-            skip = spec_utils.crop_center(skip, x)
-            x = torch.cat([x, skip], dim=1)
-        h = self.conv(x)
-
-        if self.dropout is not None:
-            h = self.dropout(h)
-
-        return h
-
-
-class ASPPModule(nn.Module):
-    def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU):
-        super(ASPPModule, self).__init__()
-        self.conv1 = nn.Sequential(
-            nn.AdaptiveAvgPool2d((1, None)),
-            Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
-        )
-        self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
-        self.conv3 = SeperableConv2DBNActiv(
-            nin, nin, 3, 1, dilations[0], dilations[0], activ=activ
-        )
-        self.conv4 = SeperableConv2DBNActiv(
-            nin, nin, 3, 1, dilations[1], dilations[1], activ=activ
-        )
-        self.conv5 = SeperableConv2DBNActiv(
-            nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
-        )
-        self.bottleneck = nn.Sequential(
-            Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)
-        )
-
-    def forward(self, x):
-        _, _, h, w = x.size()
-        feat1 = F.interpolate(
-            self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
-        )
-        feat2 = self.conv2(x)
-        feat3 = self.conv3(x)
-        feat4 = self.conv4(x)
-        feat5 = self.conv5(x)
-        out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1)
-        bottle = self.bottleneck(out)
-        return bottle
infer/lib/uvr5_pack/lib_v5/layers_123812KB .py DELETED
@@ -1,118 +0,0 @@
[118 deleted lines, identical to infer/lib/uvr5_pack/lib_v5/layers.py above]
infer/lib/uvr5_pack/lib_v5/layers_123821KB.py DELETED
@@ -1,118 +0,0 @@
[118 deleted lines, identical to infer/lib/uvr5_pack/lib_v5/layers.py above]
infer/lib/uvr5_pack/lib_v5/layers_33966KB.py DELETED
@@ -1,126 +0,0 @@
-import torch
-import torch.nn.functional as F
-from torch import nn
-
-from . import spec_utils
-
-
-class Conv2DBNActiv(nn.Module):
-    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
-        super(Conv2DBNActiv, self).__init__()
-        self.conv = nn.Sequential(
-            nn.Conv2d(
-                nin,
-                nout,
-                kernel_size=ksize,
-                stride=stride,
-                padding=pad,
-                dilation=dilation,
-                bias=False,
-            ),
-            nn.BatchNorm2d(nout),
-            activ(),
-        )
-
-    def __call__(self, x):
-        return self.conv(x)
-
-
-class SeperableConv2DBNActiv(nn.Module):
-    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
-        super(SeperableConv2DBNActiv, self).__init__()
-        self.conv = nn.Sequential(
-            nn.Conv2d(
-                nin,
-                nin,
-                kernel_size=ksize,
-                stride=stride,
-                padding=pad,
-                dilation=dilation,
-                groups=nin,
-                bias=False,
-            ),
-            nn.Conv2d(nin, nout, kernel_size=1, bias=False),
-            nn.BatchNorm2d(nout),
-            activ(),
-        )
-
-    def __call__(self, x):
-        return self.conv(x)
-
-
-class Encoder(nn.Module):
-    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
-        super(Encoder, self).__init__()
-        self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
-        self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)
-
-    def __call__(self, x):
-        skip = self.conv1(x)
-        h = self.conv2(skip)
-
-        return h, skip
-
-
-class Decoder(nn.Module):
-    def __init__(
-        self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
-    ):
-        super(Decoder, self).__init__()
-        self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
-        self.dropout = nn.Dropout2d(0.1) if dropout else None
-
-    def __call__(self, x, skip=None):
-        x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
-        if skip is not None:
-            skip = spec_utils.crop_center(skip, x)
-            x = torch.cat([x, skip], dim=1)
-        h = self.conv(x)
-
-        if self.dropout is not None:
-            h = self.dropout(h)
-
-        return h
-
-
-class ASPPModule(nn.Module):
-    def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU):
-        super(ASPPModule, self).__init__()
-        self.conv1 = nn.Sequential(
-            nn.AdaptiveAvgPool2d((1, None)),
-            Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
-        )
-        self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
-        self.conv3 = SeperableConv2DBNActiv(
-            nin, nin, 3, 1, dilations[0], dilations[0], activ=activ
-        )
-        self.conv4 = SeperableConv2DBNActiv(
-            nin, nin, 3, 1, dilations[1], dilations[1], activ=activ
-        )
-        self.conv5 = SeperableConv2DBNActiv(
-            nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
-        )
-        self.conv6 = SeperableConv2DBNActiv(
-            nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
-        )
-        self.conv7 = SeperableConv2DBNActiv(
-            nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
-        )
-        self.bottleneck = nn.Sequential(
-            Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)
-        )
-
-    def forward(self, x):
-        _, _, h, w = x.size()
-        feat1 = F.interpolate(
-            self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
-        )
-        feat2 = self.conv2(x)
-        feat3 = self.conv3(x)
-        feat4 = self.conv4(x)
-        feat5 = self.conv5(x)
-        feat6 = self.conv6(x)
-        feat7 = self.conv7(x)
-        out = torch.cat((feat1, feat2, feat3, feat4, feat5, feat6, feat7), dim=1)
-        bottle = self.bottleneck(out)
-        return bottle
infer/lib/uvr5_pack/lib_v5/layers_537227KB.py DELETED
@@ -1,126 +0,0 @@
[126 deleted lines, identical to infer/lib/uvr5_pack/lib_v5/layers_33966KB.py above]
infer/lib/uvr5_pack/lib_v5/layers_537238KB.py DELETED
@@ -1,126 +0,0 @@
[126 deleted lines, identical to infer/lib/uvr5_pack/lib_v5/layers_33966KB.py above]
infer/lib/uvr5_pack/lib_v5/layers_new.py DELETED
@@ -1,125 +0,0 @@
-import torch
-import torch.nn.functional as F
-from torch import nn
-
-from . import spec_utils
-
-
-class Conv2DBNActiv(nn.Module):
-    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
-        super(Conv2DBNActiv, self).__init__()
-        self.conv = nn.Sequential(
-            nn.Conv2d(
-                nin,
-                nout,
-                kernel_size=ksize,
-                stride=stride,
-                padding=pad,
-                dilation=dilation,
-                bias=False,
-            ),
-            nn.BatchNorm2d(nout),
-            activ(),
-        )
-
-    def __call__(self, x):
-        return self.conv(x)
-
-
-class Encoder(nn.Module):
-    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
-        super(Encoder, self).__init__()
-        self.conv1 = Conv2DBNActiv(nin, nout, ksize, stride, pad, activ=activ)
-        self.conv2 = Conv2DBNActiv(nout, nout, ksize, 1, pad, activ=activ)
-
-    def __call__(self, x):
-        h = self.conv1(x)
-        h = self.conv2(h)
-
-        return h
-
-
-class Decoder(nn.Module):
-    def __init__(
-        self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
-    ):
-        super(Decoder, self).__init__()
-        self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
-        # self.conv2 = Conv2DBNActiv(nout, nout, ksize, 1, pad, activ=activ)
-        self.dropout = nn.Dropout2d(0.1) if dropout else None
-
-    def __call__(self, x, skip=None):
-        x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
-
-        if skip is not None:
-            skip = spec_utils.crop_center(skip, x)
-            x = torch.cat([x, skip], dim=1)
-
-        h = self.conv1(x)
-        # h = self.conv2(h)
-
-        if self.dropout is not None:
-            h = self.dropout(h)
-
-        return h
-
-
-class ASPPModule(nn.Module):
-    def __init__(self, nin, nout, dilations=(4, 8, 12), activ=nn.ReLU, dropout=False):
-        super(ASPPModule, self).__init__()
-        self.conv1 = nn.Sequential(
-            nn.AdaptiveAvgPool2d((1, None)),
-            Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ),
-        )
-        self.conv2 = Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ)
-        self.conv3 = Conv2DBNActiv(
-            nin, nout, 3, 1, dilations[0], dilations[0], activ=activ
-        )
-        self.conv4 = Conv2DBNActiv(
-            nin, nout, 3, 1, dilations[1], dilations[1], activ=activ
-        )
-        self.conv5 = Conv2DBNActiv(
-            nin, nout, 3, 1, dilations[2], dilations[2], activ=activ
-        )
-        self.bottleneck = Conv2DBNActiv(nout * 5, nout, 1, 1, 0, activ=activ)
-        self.dropout = nn.Dropout2d(0.1) if dropout else None
-
-    def forward(self, x):
-        _, _, h, w = x.size()
-        feat1 = F.interpolate(
-            self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
-        )
-        feat2 = self.conv2(x)
-        feat3 = self.conv3(x)
-        feat4 = self.conv4(x)
-        feat5 = self.conv5(x)
-        out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1)
-        out = self.bottleneck(out)
-
-        if self.dropout is not None:
-            out = self.dropout(out)
-
-        return out
-
-
-class LSTMModule(nn.Module):
-    def __init__(self, nin_conv, nin_lstm, nout_lstm):
-        super(LSTMModule, self).__init__()
-        self.conv = Conv2DBNActiv(nin_conv, 1, 1, 1, 0)
-        self.lstm = nn.LSTM(
-            input_size=nin_lstm, hidden_size=nout_lstm // 2, bidirectional=True
-        )
-        self.dense = nn.Sequential(
-            nn.Linear(nout_lstm, nin_lstm), nn.BatchNorm1d(nin_lstm), nn.ReLU()
-        )
-
-    def forward(self, x):
-        N, _, nbins, nframes = x.size()
-        h = self.conv(x)[:, 0]  # N, nbins, nframes
-        h = h.permute(2, 0, 1)  # nframes, N, nbins
-        h, _ = self.lstm(h)
-        h = self.dense(h.reshape(-1, h.size()[-1]))  # nframes * N, nbins
-        h = h.reshape(nframes, N, 1, nbins)
-        h = h.permute(1, 2, 3, 0)
-
-        return h
infer/lib/uvr5_pack/lib_v5/model_param_init.py DELETED
@@ -1,69 +0,0 @@
- import json
- import os
- import pathlib
-
- default_param = {}
- default_param["bins"] = 768
- default_param["unstable_bins"] = 9  # training only
- default_param["reduction_bins"] = 762  # training only
- default_param["sr"] = 44100
- default_param["pre_filter_start"] = 757
- default_param["pre_filter_stop"] = 768
- default_param["band"] = {}
-
-
- default_param["band"][1] = {
-     "sr": 11025,
-     "hl": 128,
-     "n_fft": 960,
-     "crop_start": 0,
-     "crop_stop": 245,
-     "lpf_start": 61,  # inference only
-     "res_type": "polyphase",
- }
-
- default_param["band"][2] = {
-     "sr": 44100,
-     "hl": 512,
-     "n_fft": 1536,
-     "crop_start": 24,
-     "crop_stop": 547,
-     "hpf_start": 81,  # inference only
-     "res_type": "sinc_best",
- }
-
-
- def int_keys(d):
-     r = {}
-     for k, v in d:
-         if k.isdigit():
-             k = int(k)
-         r[k] = v
-     return r
-
-
- class ModelParameters(object):
-     def __init__(self, config_path=""):
-         if ".pth" == pathlib.Path(config_path).suffix:
-             import zipfile
-
-             with zipfile.ZipFile(config_path, "r") as zip:
-                 self.param = json.loads(
-                     zip.read("param.json"), object_pairs_hook=int_keys
-                 )
-         elif ".json" == pathlib.Path(config_path).suffix:
-             with open(config_path, "r") as f:
-                 self.param = json.loads(f.read(), object_pairs_hook=int_keys)
-         else:
-             self.param = default_param
-
-         for k in [
-             "mid_side",
-             "mid_side_b",
-             "mid_side_b2",
-             "stereo_w",
-             "stereo_n",
-             "reverse",
-         ]:
-             if not k in self.param:
-                 self.param[k] = False
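
int_keys exists because band settings are keyed by integer index (see default_param["band"][1] above) while JSON object keys are always strings; the object_pairs_hook converts digit keys back to int on load. A hedged usage sketch (assumes ModelParameters from the removed module is importable; the path points at one of the configs removed below):

# Illustrative only, not part of the commit.
mp = ModelParameters("infer/lib/uvr5_pack/lib_v5/modelparams/2band_32000.json")
for band_idx, band in mp.param["band"].items():
    # band_idx is an int here thanks to int_keys
    print(band_idx, band["sr"], band["hl"], band["n_fft"])
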
infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json DELETED
@@ -1,19 +0,0 @@
- {
-   "bins": 1024,
-   "unstable_bins": 0,
-   "reduction_bins": 0,
-   "band": {
-     "1": {
-       "sr": 16000,
-       "hl": 512,
-       "n_fft": 2048,
-       "crop_start": 0,
-       "crop_stop": 1024,
-       "hpf_start": -1,
-       "res_type": "sinc_best"
-     }
-   },
-   "sr": 16000,
-   "pre_filter_start": 1023,
-   "pre_filter_stop": 1024
- }
infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json DELETED
@@ -1,19 +0,0 @@
- {
-   "bins": 1024,
-   "unstable_bins": 0,
-   "reduction_bins": 0,
-   "band": {
-     "1": {
-       "sr": 32000,
-       "hl": 512,
-       "n_fft": 2048,
-       "crop_start": 0,
-       "crop_stop": 1024,
-       "hpf_start": -1,
-       "res_type": "kaiser_fast"
-     }
-   },
-   "sr": 32000,
-   "pre_filter_start": 1000,
-   "pre_filter_stop": 1021
- }
infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json DELETED
@@ -1,19 +0,0 @@
- {
-   "bins": 1024,
-   "unstable_bins": 0,
-   "reduction_bins": 0,
-   "band": {
-     "1": {
-       "sr": 33075,
-       "hl": 384,
-       "n_fft": 2048,
-       "crop_start": 0,
-       "crop_stop": 1024,
-       "hpf_start": -1,
-       "res_type": "sinc_best"
-     }
-   },
-   "sr": 33075,
-   "pre_filter_start": 1000,
-   "pre_filter_stop": 1021
- }
infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json DELETED
@@ -1,19 +0,0 @@
- {
-   "bins": 1024,
-   "unstable_bins": 0,
-   "reduction_bins": 0,
-   "band": {
-     "1": {
-       "sr": 44100,
-       "hl": 1024,
-       "n_fft": 2048,
-       "crop_start": 0,
-       "crop_stop": 1024,
-       "hpf_start": -1,
-       "res_type": "sinc_best"
-     }
-   },
-   "sr": 44100,
-   "pre_filter_start": 1023,
-   "pre_filter_stop": 1024
- }
infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl256.json DELETED
@@ -1,19 +0,0 @@
- {
-   "bins": 256,
-   "unstable_bins": 0,
-   "reduction_bins": 0,
-   "band": {
-     "1": {
-       "sr": 44100,
-       "hl": 256,
-       "n_fft": 512,
-       "crop_start": 0,
-       "crop_stop": 256,
-       "hpf_start": -1,
-       "res_type": "sinc_best"
-     }
-   },
-   "sr": 44100,
-   "pre_filter_start": 256,
-   "pre_filter_stop": 256
- }
infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json DELETED
@@ -1,19 +0,0 @@
- {
-   "bins": 1024,
-   "unstable_bins": 0,
-   "reduction_bins": 0,
-   "band": {
-     "1": {
-       "sr": 44100,
-       "hl": 512,
-       "n_fft": 2048,
-       "crop_start": 0,
-       "crop_stop": 1024,
-       "hpf_start": -1,
-       "res_type": "sinc_best"
-     }
-   },
-   "sr": 44100,
-   "pre_filter_start": 1023,
-   "pre_filter_stop": 1024
- }
infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512_cut.json DELETED
@@ -1,19 +0,0 @@
- {
-   "bins": 1024,
-   "unstable_bins": 0,
-   "reduction_bins": 0,
-   "band": {
-     "1": {
-       "sr": 44100,
-       "hl": 512,
-       "n_fft": 2048,
-       "crop_start": 0,
-       "crop_stop": 700,
-       "hpf_start": -1,
-       "res_type": "sinc_best"
-     }
-   },
-   "sr": 44100,
-   "pre_filter_start": 1023,
-   "pre_filter_stop": 700
- }
infer/lib/uvr5_pack/lib_v5/modelparams/2band_32000.json DELETED
@@ -1,30 +0,0 @@
- {
-   "bins": 768,
-   "unstable_bins": 7,
-   "reduction_bins": 705,
-   "band": {
-     "1": {
-       "sr": 6000,
-       "hl": 66,
-       "n_fft": 512,
-       "crop_start": 0,
-       "crop_stop": 240,
-       "lpf_start": 60,
-       "lpf_stop": 118,
-       "res_type": "sinc_fastest"
-     },
-     "2": {
-       "sr": 32000,
-       "hl": 352,
-       "n_fft": 1024,
-       "crop_start": 22,
-       "crop_stop": 505,
-       "hpf_start": 44,
-       "hpf_stop": 23,
-       "res_type": "sinc_medium"
-     }
-   },
-   "sr": 32000,
-   "pre_filter_start": 710,
-   "pre_filter_stop": 731
- }
infer/lib/uvr5_pack/lib_v5/modelparams/2band_44100_lofi.json DELETED
@@ -1,30 +0,0 @@
- {
-   "bins": 512,
-   "unstable_bins": 7,
-   "reduction_bins": 510,
-   "band": {
-     "1": {
-       "sr": 11025,
-       "hl": 160,
-       "n_fft": 768,
-       "crop_start": 0,
-       "crop_stop": 192,
-       "lpf_start": 41,
-       "lpf_stop": 139,
-       "res_type": "sinc_fastest"
-     },
-     "2": {
-       "sr": 44100,
-       "hl": 640,
-       "n_fft": 1024,
-       "crop_start": 10,
-       "crop_stop": 320,
-       "hpf_start": 47,
-       "hpf_stop": 15,
-       "res_type": "sinc_medium"
-     }
-   },
-   "sr": 44100,
-   "pre_filter_start": 510,
-   "pre_filter_stop": 512
- }
infer/lib/uvr5_pack/lib_v5/modelparams/2band_48000.json DELETED
@@ -1,30 +0,0 @@
- {
-   "bins": 768,
-   "unstable_bins": 7,
-   "reduction_bins": 705,
-   "band": {
-     "1": {
-       "sr": 6000,
-       "hl": 66,
-       "n_fft": 512,
-       "crop_start": 0,
-       "crop_stop": 240,
-       "lpf_start": 60,
-       "lpf_stop": 240,
-       "res_type": "sinc_fastest"
-     },
-     "2": {
-       "sr": 48000,
-       "hl": 528,
-       "n_fft": 1536,
-       "crop_start": 22,
-       "crop_stop": 505,
-       "hpf_start": 82,
-       "hpf_stop": 22,
-       "res_type": "sinc_medium"
-     }
-   },
-   "sr": 48000,
-   "pre_filter_start": 710,
-   "pre_filter_stop": 731
- }
infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100.json DELETED
@@ -1,42 +0,0 @@
- {
-   "bins": 768,
-   "unstable_bins": 5,
-   "reduction_bins": 733,
-   "band": {
-     "1": {
-       "sr": 11025,
-       "hl": 128,
-       "n_fft": 768,
-       "crop_start": 0,
-       "crop_stop": 278,
-       "lpf_start": 28,
-       "lpf_stop": 140,
-       "res_type": "polyphase"
-     },
-     "2": {
-       "sr": 22050,
-       "hl": 256,
-       "n_fft": 768,
-       "crop_start": 14,
-       "crop_stop": 322,
-       "hpf_start": 70,
-       "hpf_stop": 14,
-       "lpf_start": 283,
-       "lpf_stop": 314,
-       "res_type": "polyphase"
-     },
-     "3": {
-       "sr": 44100,
-       "hl": 512,
-       "n_fft": 768,
-       "crop_start": 131,
-       "crop_stop": 313,
-       "hpf_start": 154,
-       "hpf_stop": 141,
-       "res_type": "sinc_medium"
-     }
-   },
-   "sr": 44100,
-   "pre_filter_start": 757,
-   "pre_filter_stop": 768
- }
infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_mid.json DELETED
@@ -1,43 +0,0 @@
- {
-   "mid_side": true,
-   "bins": 768,
-   "unstable_bins": 5,
-   "reduction_bins": 733,
-   "band": {
-     "1": {
-       "sr": 11025,
-       "hl": 128,
-       "n_fft": 768,
-       "crop_start": 0,
-       "crop_stop": 278,
-       "lpf_start": 28,
-       "lpf_stop": 140,
-       "res_type": "polyphase"
-     },
-     "2": {
-       "sr": 22050,
-       "hl": 256,
-       "n_fft": 768,
-       "crop_start": 14,
-       "crop_stop": 322,
-       "hpf_start": 70,
-       "hpf_stop": 14,
-       "lpf_start": 283,
-       "lpf_stop": 314,
-       "res_type": "polyphase"
-     },
-     "3": {
-       "sr": 44100,
-       "hl": 512,
-       "n_fft": 768,
-       "crop_start": 131,
-       "crop_stop": 313,
-       "hpf_start": 154,
-       "hpf_stop": 141,
-       "res_type": "sinc_medium"
-     }
-   },
-   "sr": 44100,
-   "pre_filter_start": 757,
-   "pre_filter_stop": 768
- }
infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json DELETED
@@ -1,43 +0,0 @@
- {
-   "mid_side_b2": true,
-   "bins": 640,
-   "unstable_bins": 7,
-   "reduction_bins": 565,
-   "band": {
-     "1": {
-       "sr": 11025,
-       "hl": 108,
-       "n_fft": 1024,
-       "crop_start": 0,
-       "crop_stop": 187,
-       "lpf_start": 92,
-       "lpf_stop": 186,
-       "res_type": "polyphase"
-     },
-     "2": {
-       "sr": 22050,
-       "hl": 216,
-       "n_fft": 768,
-       "crop_start": 0,
-       "crop_stop": 212,
-       "hpf_start": 68,
-       "hpf_stop": 34,
-       "lpf_start": 174,
-       "lpf_stop": 209,
-       "res_type": "polyphase"
-     },
-     "3": {
-       "sr": 44100,
-       "hl": 432,
-       "n_fft": 640,
-       "crop_start": 66,
-       "crop_stop": 307,
-       "hpf_start": 86,
-       "hpf_stop": 72,
-       "res_type": "kaiser_fast"
-     }
-   },
-   "sr": 44100,
-   "pre_filter_start": 639,
-   "pre_filter_stop": 640
- }
infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json DELETED
@@ -1,54 +0,0 @@
- {
-   "bins": 768,
-   "unstable_bins": 7,
-   "reduction_bins": 668,
-   "band": {
-     "1": {
-       "sr": 11025,
-       "hl": 128,
-       "n_fft": 1024,
-       "crop_start": 0,
-       "crop_stop": 186,
-       "lpf_start": 37,
-       "lpf_stop": 73,
-       "res_type": "polyphase"
-     },
-     "2": {
-       "sr": 11025,
-       "hl": 128,
-       "n_fft": 512,
-       "crop_start": 4,
-       "crop_stop": 185,
-       "hpf_start": 36,
-       "hpf_stop": 18,
-       "lpf_start": 93,
-       "lpf_stop": 185,
-       "res_type": "polyphase"
-     },
-     "3": {
-       "sr": 22050,
-       "hl": 256,
-       "n_fft": 512,
-       "crop_start": 46,
-       "crop_stop": 186,
-       "hpf_start": 93,
-       "hpf_stop": 46,
-       "lpf_start": 164,
-       "lpf_stop": 186,
-       "res_type": "polyphase"
-     },
-     "4": {
-       "sr": 44100,
-       "hl": 512,
-       "n_fft": 768,
-       "crop_start": 121,
-       "crop_stop": 382,
-       "hpf_start": 138,
-       "hpf_stop": 123,
-       "res_type": "sinc_medium"
-     }
-   },
-   "sr": 44100,
-   "pre_filter_start": 740,
-   "pre_filter_stop": 768
- }
infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_mid.json DELETED
@@ -1,55 +0,0 @@
- {
-   "bins": 768,
-   "unstable_bins": 7,
-   "mid_side": true,
-   "reduction_bins": 668,
-   "band": {
-     "1": {
-       "sr": 11025,
-       "hl": 128,
-       "n_fft": 1024,
-       "crop_start": 0,
-       "crop_stop": 186,
-       "lpf_start": 37,
-       "lpf_stop": 73,
-       "res_type": "polyphase"
-     },
-     "2": {
-       "sr": 11025,
-       "hl": 128,
-       "n_fft": 512,
-       "crop_start": 4,
-       "crop_stop": 185,
-       "hpf_start": 36,
-       "hpf_stop": 18,
-       "lpf_start": 93,
-       "lpf_stop": 185,
-       "res_type": "polyphase"
-     },
-     "3": {
-       "sr": 22050,
-       "hl": 256,
-       "n_fft": 512,
-       "crop_start": 46,
-       "crop_stop": 186,
-       "hpf_start": 93,
-       "hpf_stop": 46,
-       "lpf_start": 164,
-       "lpf_stop": 186,
-       "res_type": "polyphase"
-     },
-     "4": {
-       "sr": 44100,
-       "hl": 512,
-       "n_fft": 768,
-       "crop_start": 121,
-       "crop_stop": 382,
-       "hpf_start": 138,
-       "hpf_stop": 123,
-       "res_type": "sinc_medium"
-     }
-   },
-   "sr": 44100,
-   "pre_filter_start": 740,
-   "pre_filter_stop": 768
- }
infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb.json DELETED
@@ -1,55 +0,0 @@
- {
-   "mid_side_b": true,
-   "bins": 768,
-   "unstable_bins": 7,
-   "reduction_bins": 668,
-   "band": {
-     "1": {
-       "sr": 11025,
-       "hl": 128,
-       "n_fft": 1024,
-       "crop_start": 0,
-       "crop_stop": 186,
-       "lpf_start": 37,
-       "lpf_stop": 73,
-       "res_type": "polyphase"
-     },
-     "2": {
-       "sr": 11025,
-       "hl": 128,
-       "n_fft": 512,
-       "crop_start": 4,
-       "crop_stop": 185,
-       "hpf_start": 36,
-       "hpf_stop": 18,
-       "lpf_start": 93,
-       "lpf_stop": 185,
-       "res_type": "polyphase"
-     },
-     "3": {
-       "sr": 22050,
-       "hl": 256,
-       "n_fft": 512,
-       "crop_start": 46,
-       "crop_stop": 186,
-       "hpf_start": 93,
-       "hpf_stop": 46,
-       "lpf_start": 164,
-       "lpf_stop": 186,
-       "res_type": "polyphase"
-     },
-     "4": {
-       "sr": 44100,
-       "hl": 512,
-       "n_fft": 768,
-       "crop_start": 121,
-       "crop_stop": 382,
-       "hpf_start": 138,
-       "hpf_stop": 123,
-       "res_type": "sinc_medium"
-     }
-   },
-   "sr": 44100,
-   "pre_filter_start": 740,
-   "pre_filter_stop": 768
- }
infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb2.json DELETED
@@ -1,55 +0,0 @@
- {
-   "mid_side_b": true,
-   "bins": 768,
-   "unstable_bins": 7,
-   "reduction_bins": 668,
-   "band": {
-     "1": {
-       "sr": 11025,
-       "hl": 128,
-       "n_fft": 1024,
-       "crop_start": 0,
-       "crop_stop": 186,
-       "lpf_start": 37,
-       "lpf_stop": 73,
-       "res_type": "polyphase"
-     },
-     "2": {
-       "sr": 11025,
-       "hl": 128,
-       "n_fft": 512,
-       "crop_start": 4,
-       "crop_stop": 185,
-       "hpf_start": 36,
-       "hpf_stop": 18,
-       "lpf_start": 93,
-       "lpf_stop": 185,
-       "res_type": "polyphase"
-     },
-     "3": {
-       "sr": 22050,
-       "hl": 256,
-       "n_fft": 512,
-       "crop_start": 46,
-       "crop_stop": 186,
-       "hpf_start": 93,
-       "hpf_stop": 46,
-       "lpf_start": 164,
-       "lpf_stop": 186,
-       "res_type": "polyphase"
-     },
-     "4": {
-       "sr": 44100,
-       "hl": 512,
-       "n_fft": 768,
-       "crop_start": 121,
-       "crop_stop": 382,
-       "hpf_start": 138,
-       "hpf_stop": 123,
-       "res_type": "sinc_medium"
-     }
-   },
-   "sr": 44100,
-   "pre_filter_start": 740,
-   "pre_filter_stop": 768
- }
infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_reverse.json DELETED
@@ -1,55 +0,0 @@
- {
-   "reverse": true,
-   "bins": 768,
-   "unstable_bins": 7,
-   "reduction_bins": 668,
-   "band": {
-     "1": {
-       "sr": 11025,
-       "hl": 128,
-       "n_fft": 1024,
-       "crop_start": 0,
-       "crop_stop": 186,
-       "lpf_start": 37,
-       "lpf_stop": 73,
-       "res_type": "polyphase"
-     },
-     "2": {
-       "sr": 11025,
-       "hl": 128,
-       "n_fft": 512,
-       "crop_start": 4,
-       "crop_stop": 185,
-       "hpf_start": 36,
-       "hpf_stop": 18,
-       "lpf_start": 93,
-       "lpf_stop": 185,
-       "res_type": "polyphase"
-     },
-     "3": {
-       "sr": 22050,
-       "hl": 256,
-       "n_fft": 512,
-       "crop_start": 46,
-       "crop_stop": 186,
-       "hpf_start": 93,
-       "hpf_stop": 46,
-       "lpf_start": 164,
-       "lpf_stop": 186,
-       "res_type": "polyphase"
-     },
-     "4": {
-       "sr": 44100,
-       "hl": 512,
-       "n_fft": 768,
-       "crop_start": 121,
-       "crop_stop": 382,
-       "hpf_start": 138,
-       "hpf_stop": 123,
-       "res_type": "sinc_medium"
-     }
-   },
-   "sr": 44100,
-   "pre_filter_start": 740,
-   "pre_filter_stop": 768
- }
infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_sw.json DELETED
@@ -1,55 +0,0 @@
- {
-   "stereo_w": true,
-   "bins": 768,
-   "unstable_bins": 7,
-   "reduction_bins": 668,
-   "band": {
-     "1": {
-       "sr": 11025,
-       "hl": 128,
-       "n_fft": 1024,
-       "crop_start": 0,
-       "crop_stop": 186,
-       "lpf_start": 37,
-       "lpf_stop": 73,
-       "res_type": "polyphase"
-     },
-     "2": {
-       "sr": 11025,
-       "hl": 128,
-       "n_fft": 512,
-       "crop_start": 4,
-       "crop_stop": 185,
-       "hpf_start": 36,
-       "hpf_stop": 18,
-       "lpf_start": 93,
-       "lpf_stop": 185,
-       "res_type": "polyphase"
-     },
-     "3": {
-       "sr": 22050,
-       "hl": 256,
-       "n_fft": 512,
-       "crop_start": 46,
-       "crop_stop": 186,
-       "hpf_start": 93,
-       "hpf_stop": 46,
-       "lpf_start": 164,
-       "lpf_stop": 186,
-       "res_type": "polyphase"
-     },
-     "4": {
-       "sr": 44100,
-       "hl": 512,
-       "n_fft": 768,
-       "crop_start": 121,
-       "crop_stop": 382,
-       "hpf_start": 138,
-       "hpf_stop": 123,
-       "res_type": "sinc_medium"
-     }
-   },
-   "sr": 44100,
-   "pre_filter_start": 740,
-   "pre_filter_stop": 768
- }
infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2.json DELETED
@@ -1,54 +0,0 @@
- {
-   "bins": 672,
-   "unstable_bins": 8,
-   "reduction_bins": 637,
-   "band": {
-     "1": {
-       "sr": 7350,
-       "hl": 80,
-       "n_fft": 640,
-       "crop_start": 0,
-       "crop_stop": 85,
-       "lpf_start": 25,
-       "lpf_stop": 53,
-       "res_type": "polyphase"
-     },
-     "2": {
-       "sr": 7350,
-       "hl": 80,
-       "n_fft": 320,
-       "crop_start": 4,
-       "crop_stop": 87,
-       "hpf_start": 25,
-       "hpf_stop": 12,
-       "lpf_start": 31,
-       "lpf_stop": 62,
-       "res_type": "polyphase"
-     },
-     "3": {
-       "sr": 14700,
-       "hl": 160,
-       "n_fft": 512,
-       "crop_start": 17,
-       "crop_stop": 216,
-       "hpf_start": 48,
-       "hpf_stop": 24,
-       "lpf_start": 139,
-       "lpf_stop": 210,
-       "res_type": "polyphase"
-     },
-     "4": {
-       "sr": 44100,
-       "hl": 480,
-       "n_fft": 960,
-       "crop_start": 78,
-       "crop_stop": 383,
-       "hpf_start": 130,
-       "hpf_stop": 86,
-       "res_type": "kaiser_fast"
-     }
-   },
-   "sr": 44100,
-   "pre_filter_start": 668,
-   "pre_filter_stop": 672
- }
infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2_sn.json DELETED
@@ -1,55 +0,0 @@
- {
-   "bins": 672,
-   "unstable_bins": 8,
-   "reduction_bins": 637,
-   "band": {
-     "1": {
-       "sr": 7350,
-       "hl": 80,
-       "n_fft": 640,
-       "crop_start": 0,
-       "crop_stop": 85,
-       "lpf_start": 25,
-       "lpf_stop": 53,
-       "res_type": "polyphase"
-     },
-     "2": {
-       "sr": 7350,
-       "hl": 80,
-       "n_fft": 320,
-       "crop_start": 4,
-       "crop_stop": 87,
-       "hpf_start": 25,
-       "hpf_stop": 12,
-       "lpf_start": 31,
-       "lpf_stop": 62,
-       "res_type": "polyphase"
-     },
-     "3": {
-       "sr": 14700,
-       "hl": 160,
-       "n_fft": 512,
-       "crop_start": 17,
-       "crop_stop": 216,
-       "hpf_start": 48,
-       "hpf_stop": 24,
-       "lpf_start": 139,
-       "lpf_stop": 210,
-       "res_type": "polyphase"
-     },
-     "4": {
-       "sr": 44100,
-       "hl": 480,
-       "n_fft": 960,
-       "crop_start": 78,
-       "crop_stop": 383,
-       "hpf_start": 130,
-       "hpf_stop": 86,
-       "convert_channels": "stereo_n",
-       "res_type": "kaiser_fast"
-     }
-   },
-   "sr": 44100,
-   "pre_filter_start": 668,
-   "pre_filter_stop": 672
- }
infer/lib/uvr5_pack/lib_v5/modelparams/4band_v3.json DELETED
@@ -1,54 +0,0 @@
- {
-   "bins": 672,
-   "unstable_bins": 8,
-   "reduction_bins": 530,
-   "band": {
-     "1": {
-       "sr": 7350,
-       "hl": 80,
-       "n_fft": 640,
-       "crop_start": 0,
-       "crop_stop": 85,
-       "lpf_start": 25,
-       "lpf_stop": 53,
-       "res_type": "polyphase"
-     },
-     "2": {
-       "sr": 7350,
-       "hl": 80,
-       "n_fft": 320,
-       "crop_start": 4,
-       "crop_stop": 87,
-       "hpf_start": 25,
-       "hpf_stop": 12,
-       "lpf_start": 31,
-       "lpf_stop": 62,
-       "res_type": "polyphase"
-     },
-     "3": {
-       "sr": 14700,
-       "hl": 160,
-       "n_fft": 512,
-       "crop_start": 17,
-       "crop_stop": 216,
-       "hpf_start": 48,
-       "hpf_stop": 24,
-       "lpf_start": 139,
-       "lpf_stop": 210,
-       "res_type": "polyphase"
-     },
-     "4": {
-       "sr": 44100,
-       "hl": 480,
-       "n_fft": 960,
-       "crop_start": 78,
-       "crop_stop": 383,
-       "hpf_start": 130,
-       "hpf_stop": 86,
-       "res_type": "kaiser_fast"
-     }
-   },
-   "sr": 44100,
-   "pre_filter_start": 668,
-   "pre_filter_stop": 672
- }
infer/lib/uvr5_pack/lib_v5/modelparams/ensemble.json DELETED
@@ -1,43 +0,0 @@
- {
-   "mid_side_b2": true,
-   "bins": 1280,
-   "unstable_bins": 7,
-   "reduction_bins": 565,
-   "band": {
-     "1": {
-       "sr": 11025,
-       "hl": 108,
-       "n_fft": 2048,
-       "crop_start": 0,
-       "crop_stop": 374,
-       "lpf_start": 92,
-       "lpf_stop": 186,
-       "res_type": "polyphase"
-     },
-     "2": {
-       "sr": 22050,
-       "hl": 216,
-       "n_fft": 1536,
-       "crop_start": 0,
-       "crop_stop": 424,
-       "hpf_start": 68,
-       "hpf_stop": 34,
-       "lpf_start": 348,
-       "lpf_stop": 418,
-       "res_type": "polyphase"
-     },
-     "3": {
-       "sr": 44100,
-       "hl": 432,
-       "n_fft": 1280,
-       "crop_start": 132,
-       "crop_stop": 614,
-       "hpf_start": 172,
-       "hpf_stop": 144,
-       "res_type": "polyphase"
-     }
-   },
-   "sr": 44100,
-   "pre_filter_start": 1280,
-   "pre_filter_stop": 1280
- }
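
Each band in the configs above pairs a resampled rate (sr), hop length (hl), and FFT size (n_fft) with crop and low-/high-pass bin ranges, while the top-level pre_filter_start/pre_filter_stop set the final roll-off. An illustrative way to eyeball a config's per-band frequency resolution (the script and path are assumptions, not part of the commit):

import json

# Hypothetical sanity check for one of the removed band configs.
with open("infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2.json") as f:
    cfg = json.load(f)

for name, band in cfg["band"].items():
    hz_per_bin = band["sr"] / band["n_fft"]  # STFT bin spacing for this band
    print(
        f"band {name}: {hz_per_bin:.1f} Hz/bin, "
        f"crop bins {band['crop_start']}-{band['crop_stop']}"
    )
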
infer/lib/uvr5_pack/lib_v5/nets.py DELETED
@@ -1,123 +0,0 @@
- import layers
- import torch
- import torch.nn.functional as F
- from torch import nn
-
- from . import spec_utils
-
-
- class BaseASPPNet(nn.Module):
-     def __init__(self, nin, ch, dilations=(4, 8, 16)):
-         super(BaseASPPNet, self).__init__()
-         self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
-         self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1)
-         self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1)
-         self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1)
-
-         self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations)
-
-         self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1)
-         self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1)
-         self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1)
-         self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1)
-
-     def __call__(self, x):
-         h, e1 = self.enc1(x)
-         h, e2 = self.enc2(h)
-         h, e3 = self.enc3(h)
-         h, e4 = self.enc4(h)
-
-         h = self.aspp(h)
-
-         h = self.dec4(h, e4)
-         h = self.dec3(h, e3)
-         h = self.dec2(h, e2)
-         h = self.dec1(h, e1)
-
-         return h
-
-
- class CascadedASPPNet(nn.Module):
-     def __init__(self, n_fft):
-         super(CascadedASPPNet, self).__init__()
-         self.stg1_low_band_net = BaseASPPNet(2, 16)
-         self.stg1_high_band_net = BaseASPPNet(2, 16)
-
-         self.stg2_bridge = layers.Conv2DBNActiv(18, 8, 1, 1, 0)
-         self.stg2_full_band_net = BaseASPPNet(8, 16)
-
-         self.stg3_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0)
-         self.stg3_full_band_net = BaseASPPNet(16, 32)
-
-         self.out = nn.Conv2d(32, 2, 1, bias=False)
-         self.aux1_out = nn.Conv2d(16, 2, 1, bias=False)
-         self.aux2_out = nn.Conv2d(16, 2, 1, bias=False)
-
-         self.max_bin = n_fft // 2
-         self.output_bin = n_fft // 2 + 1
-
-         self.offset = 128
-
-     def forward(self, x, aggressiveness=None):
-         mix = x.detach()
-         x = x.clone()
-
-         x = x[:, :, : self.max_bin]
-
-         bandw = x.size()[2] // 2
-         aux1 = torch.cat(
-             [
-                 self.stg1_low_band_net(x[:, :, :bandw]),
-                 self.stg1_high_band_net(x[:, :, bandw:]),
-             ],
-             dim=2,
-         )
-
-         h = torch.cat([x, aux1], dim=1)
-         aux2 = self.stg2_full_band_net(self.stg2_bridge(h))
-
-         h = torch.cat([x, aux1, aux2], dim=1)
-         h = self.stg3_full_band_net(self.stg3_bridge(h))
-
-         mask = torch.sigmoid(self.out(h))
-         mask = F.pad(
-             input=mask,
-             pad=(0, 0, 0, self.output_bin - mask.size()[2]),
-             mode="replicate",
-         )
-
-         if self.training:
-             aux1 = torch.sigmoid(self.aux1_out(aux1))
-             aux1 = F.pad(
-                 input=aux1,
-                 pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
-                 mode="replicate",
-             )
-             aux2 = torch.sigmoid(self.aux2_out(aux2))
-             aux2 = F.pad(
-                 input=aux2,
-                 pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
-                 mode="replicate",
-             )
-             return mask * mix, aux1 * mix, aux2 * mix
-         else:
-             if aggressiveness:
-                 mask[:, :, : aggressiveness["split_bin"]] = torch.pow(
-                     mask[:, :, : aggressiveness["split_bin"]],
-                     1 + aggressiveness["value"] / 3,
-                 )
-                 mask[:, :, aggressiveness["split_bin"] :] = torch.pow(
-                     mask[:, :, aggressiveness["split_bin"] :],
-                     1 + aggressiveness["value"],
-                 )
-
-             return mask * mix
-
-     def predict(self, x_mag, aggressiveness=None):
-         h = self.forward(x_mag, aggressiveness)
-
-         if self.offset > 0:
-             h = h[:, :, :, self.offset : -self.offset]
-             assert h.size()[3] > 0
-
-         return h
infer/lib/uvr5_pack/lib_v5/nets_123812KB.py DELETED
@@ -1,122 +0,0 @@
- import torch
- import torch.nn.functional as F
- from torch import nn
-
- from . import layers_123821KB as layers
-
-
- class BaseASPPNet(nn.Module):
-     def __init__(self, nin, ch, dilations=(4, 8, 16)):
-         super(BaseASPPNet, self).__init__()
-         self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
-         self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1)
-         self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1)
-         self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1)
-
-         self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations)
-
-         self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1)
-         self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1)
-         self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1)
-         self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1)
-
-     def __call__(self, x):
-         h, e1 = self.enc1(x)
-         h, e2 = self.enc2(h)
-         h, e3 = self.enc3(h)
-         h, e4 = self.enc4(h)
-
-         h = self.aspp(h)
-
-         h = self.dec4(h, e4)
-         h = self.dec3(h, e3)
-         h = self.dec2(h, e2)
-         h = self.dec1(h, e1)
-
-         return h
-
-
- class CascadedASPPNet(nn.Module):
-     def __init__(self, n_fft):
-         super(CascadedASPPNet, self).__init__()
-         self.stg1_low_band_net = BaseASPPNet(2, 32)
-         self.stg1_high_band_net = BaseASPPNet(2, 32)
-
-         self.stg2_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0)
-         self.stg2_full_band_net = BaseASPPNet(16, 32)
-
-         self.stg3_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0)
-         self.stg3_full_band_net = BaseASPPNet(32, 64)
-
-         self.out = nn.Conv2d(64, 2, 1, bias=False)
-         self.aux1_out = nn.Conv2d(32, 2, 1, bias=False)
-         self.aux2_out = nn.Conv2d(32, 2, 1, bias=False)
-
-         self.max_bin = n_fft // 2
-         self.output_bin = n_fft // 2 + 1
-
-         self.offset = 128
-
-     def forward(self, x, aggressiveness=None):
-         mix = x.detach()
-         x = x.clone()
-
-         x = x[:, :, : self.max_bin]
-
-         bandw = x.size()[2] // 2
-         aux1 = torch.cat(
-             [
-                 self.stg1_low_band_net(x[:, :, :bandw]),
-                 self.stg1_high_band_net(x[:, :, bandw:]),
-             ],
-             dim=2,
-         )
-
-         h = torch.cat([x, aux1], dim=1)
-         aux2 = self.stg2_full_band_net(self.stg2_bridge(h))
-
-         h = torch.cat([x, aux1, aux2], dim=1)
-         h = self.stg3_full_band_net(self.stg3_bridge(h))
-
-         mask = torch.sigmoid(self.out(h))
-         mask = F.pad(
-             input=mask,
-             pad=(0, 0, 0, self.output_bin - mask.size()[2]),
-             mode="replicate",
-         )
-
-         if self.training:
-             aux1 = torch.sigmoid(self.aux1_out(aux1))
-             aux1 = F.pad(
-                 input=aux1,
-                 pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
-                 mode="replicate",
-             )
-             aux2 = torch.sigmoid(self.aux2_out(aux2))
-             aux2 = F.pad(
-                 input=aux2,
-                 pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
-                 mode="replicate",
-             )
-             return mask * mix, aux1 * mix, aux2 * mix
-         else:
-             if aggressiveness:
-                 mask[:, :, : aggressiveness["split_bin"]] = torch.pow(
-                     mask[:, :, : aggressiveness["split_bin"]],
-                     1 + aggressiveness["value"] / 3,
-                 )
-                 mask[:, :, aggressiveness["split_bin"] :] = torch.pow(
-                     mask[:, :, aggressiveness["split_bin"] :],
-                     1 + aggressiveness["value"],
-                 )
-
-             return mask * mix
-
-     def predict(self, x_mag, aggressiveness=None):
-         h = self.forward(x_mag, aggressiveness)
-
-         if self.offset > 0:
-             h = h[:, :, :, self.offset : -self.offset]
-             assert h.size()[3] > 0
-
-         return h
infer/lib/uvr5_pack/lib_v5/nets_123821KB.py DELETED
@@ -1,122 +0,0 @@
- import torch
- import torch.nn.functional as F
- from torch import nn
-
- from . import layers_123821KB as layers
-
-
- class BaseASPPNet(nn.Module):
-     def __init__(self, nin, ch, dilations=(4, 8, 16)):
-         super(BaseASPPNet, self).__init__()
-         self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
-         self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1)
-         self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1)
-         self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1)
-
-         self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations)
-
-         self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1)
-         self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1)
-         self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1)
-         self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1)
-
-     def __call__(self, x):
-         h, e1 = self.enc1(x)
-         h, e2 = self.enc2(h)
-         h, e3 = self.enc3(h)
-         h, e4 = self.enc4(h)
-
-         h = self.aspp(h)
-
-         h = self.dec4(h, e4)
-         h = self.dec3(h, e3)
-         h = self.dec2(h, e2)
-         h = self.dec1(h, e1)
-
-         return h
-
-
- class CascadedASPPNet(nn.Module):
-     def __init__(self, n_fft):
-         super(CascadedASPPNet, self).__init__()
-         self.stg1_low_band_net = BaseASPPNet(2, 32)
-         self.stg1_high_band_net = BaseASPPNet(2, 32)
-
-         self.stg2_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0)
-         self.stg2_full_band_net = BaseASPPNet(16, 32)
-
-         self.stg3_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0)
-         self.stg3_full_band_net = BaseASPPNet(32, 64)
-
-         self.out = nn.Conv2d(64, 2, 1, bias=False)
-         self.aux1_out = nn.Conv2d(32, 2, 1, bias=False)
-         self.aux2_out = nn.Conv2d(32, 2, 1, bias=False)
-
-         self.max_bin = n_fft // 2
-         self.output_bin = n_fft // 2 + 1
-
-         self.offset = 128
-
-     def forward(self, x, aggressiveness=None):
-         mix = x.detach()
-         x = x.clone()
-
-         x = x[:, :, : self.max_bin]
-
-         bandw = x.size()[2] // 2
-         aux1 = torch.cat(
-             [
-                 self.stg1_low_band_net(x[:, :, :bandw]),
-                 self.stg1_high_band_net(x[:, :, bandw:]),
-             ],
-             dim=2,
-         )
-
-         h = torch.cat([x, aux1], dim=1)
-         aux2 = self.stg2_full_band_net(self.stg2_bridge(h))
-
-         h = torch.cat([x, aux1, aux2], dim=1)
-         h = self.stg3_full_band_net(self.stg3_bridge(h))
-
-         mask = torch.sigmoid(self.out(h))
-         mask = F.pad(
-             input=mask,
-             pad=(0, 0, 0, self.output_bin - mask.size()[2]),
-             mode="replicate",
-         )
-
-         if self.training:
-             aux1 = torch.sigmoid(self.aux1_out(aux1))
-             aux1 = F.pad(
-                 input=aux1,
-                 pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
-                 mode="replicate",
-             )
-             aux2 = torch.sigmoid(self.aux2_out(aux2))
-             aux2 = F.pad(
-                 input=aux2,
-                 pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
-                 mode="replicate",
-             )
-             return mask * mix, aux1 * mix, aux2 * mix
-         else:
-             if aggressiveness:
-                 mask[:, :, : aggressiveness["split_bin"]] = torch.pow(
-                     mask[:, :, : aggressiveness["split_bin"]],
-                     1 + aggressiveness["value"] / 3,
-                 )
-                 mask[:, :, aggressiveness["split_bin"] :] = torch.pow(
-                     mask[:, :, aggressiveness["split_bin"] :],
-                     1 + aggressiveness["value"],
-                 )
-
-             return mask * mix
-
-     def predict(self, x_mag, aggressiveness=None):
-         h = self.forward(x_mag, aggressiveness)
-
-         if self.offset > 0:
-             h = h[:, :, :, self.offset : -self.offset]
-             assert h.size()[3] > 0
-
-         return h
infer/lib/uvr5_pack/lib_v5/nets_33966KB.py DELETED
@@ -1,122 +0,0 @@
- import torch
- import torch.nn.functional as F
- from torch import nn
-
- from . import layers_33966KB as layers
-
-
- class BaseASPPNet(nn.Module):
-     def __init__(self, nin, ch, dilations=(4, 8, 16, 32)):
-         super(BaseASPPNet, self).__init__()
-         self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
-         self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1)
-         self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1)
-         self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1)
-
-         self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations)
-
-         self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1)
-         self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1)
-         self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1)
-         self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1)
-
-     def __call__(self, x):
-         h, e1 = self.enc1(x)
-         h, e2 = self.enc2(h)
-         h, e3 = self.enc3(h)
-         h, e4 = self.enc4(h)
-
-         h = self.aspp(h)
-
-         h = self.dec4(h, e4)
-         h = self.dec3(h, e3)
-         h = self.dec2(h, e2)
-         h = self.dec1(h, e1)
-
-         return h
-
-
- class CascadedASPPNet(nn.Module):
-     def __init__(self, n_fft):
-         super(CascadedASPPNet, self).__init__()
-         self.stg1_low_band_net = BaseASPPNet(2, 16)
-         self.stg1_high_band_net = BaseASPPNet(2, 16)
-
-         self.stg2_bridge = layers.Conv2DBNActiv(18, 8, 1, 1, 0)
-         self.stg2_full_band_net = BaseASPPNet(8, 16)
-
-         self.stg3_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0)
-         self.stg3_full_band_net = BaseASPPNet(16, 32)
-
-         self.out = nn.Conv2d(32, 2, 1, bias=False)
-         self.aux1_out = nn.Conv2d(16, 2, 1, bias=False)
-         self.aux2_out = nn.Conv2d(16, 2, 1, bias=False)
-
-         self.max_bin = n_fft // 2
-         self.output_bin = n_fft // 2 + 1
-
-         self.offset = 128
-
-     def forward(self, x, aggressiveness=None):
-         mix = x.detach()
-         x = x.clone()
-
-         x = x[:, :, : self.max_bin]
-
-         bandw = x.size()[2] // 2
-         aux1 = torch.cat(
-             [
-                 self.stg1_low_band_net(x[:, :, :bandw]),
-                 self.stg1_high_band_net(x[:, :, bandw:]),
-             ],
-             dim=2,
-         )
-
-         h = torch.cat([x, aux1], dim=1)
-         aux2 = self.stg2_full_band_net(self.stg2_bridge(h))
-
-         h = torch.cat([x, aux1, aux2], dim=1)
-         h = self.stg3_full_band_net(self.stg3_bridge(h))
-
-         mask = torch.sigmoid(self.out(h))
-         mask = F.pad(
-             input=mask,
-             pad=(0, 0, 0, self.output_bin - mask.size()[2]),
-             mode="replicate",
-         )
-
-         if self.training:
-             aux1 = torch.sigmoid(self.aux1_out(aux1))
-             aux1 = F.pad(
-                 input=aux1,
-                 pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
-                 mode="replicate",
-             )
-             aux2 = torch.sigmoid(self.aux2_out(aux2))
-             aux2 = F.pad(
-                 input=aux2,
-                 pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
-                 mode="replicate",
-             )
-             return mask * mix, aux1 * mix, aux2 * mix
-         else:
-             if aggressiveness:
-                 mask[:, :, : aggressiveness["split_bin"]] = torch.pow(
-                     mask[:, :, : aggressiveness["split_bin"]],
-                     1 + aggressiveness["value"] / 3,
-                 )
-                 mask[:, :, aggressiveness["split_bin"] :] = torch.pow(
-                     mask[:, :, aggressiveness["split_bin"] :],
-                     1 + aggressiveness["value"],
-                 )
-
-             return mask * mix
-
-     def predict(self, x_mag, aggressiveness=None):
-         h = self.forward(x_mag, aggressiveness)
-
-         if self.offset > 0:
-             h = h[:, :, :, self.offset : -self.offset]
-             assert h.size()[3] > 0
-
-         return h
infer/lib/uvr5_pack/lib_v5/nets_537227KB.py DELETED
@@ -1,123 +0,0 @@
- import numpy as np
- import torch
- import torch.nn.functional as F
- from torch import nn
-
- from . import layers_537238KB as layers
-
-
- class BaseASPPNet(nn.Module):
-     def __init__(self, nin, ch, dilations=(4, 8, 16)):
-         super(BaseASPPNet, self).__init__()
-         self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
-         self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1)
-         self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1)
-         self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1)
-
-         self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations)
-
-         self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1)
-         self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1)
-         self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1)
-         self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1)
-
-     def __call__(self, x):
-         h, e1 = self.enc1(x)
-         h, e2 = self.enc2(h)
-         h, e3 = self.enc3(h)
-         h, e4 = self.enc4(h)
-
-         h = self.aspp(h)
-
-         h = self.dec4(h, e4)
-         h = self.dec3(h, e3)
-         h = self.dec2(h, e2)
-         h = self.dec1(h, e1)
-
-         return h
-
-
- class CascadedASPPNet(nn.Module):
-     def __init__(self, n_fft):
-         super(CascadedASPPNet, self).__init__()
-         self.stg1_low_band_net = BaseASPPNet(2, 64)
-         self.stg1_high_band_net = BaseASPPNet(2, 64)
-
-         self.stg2_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0)
-         self.stg2_full_band_net = BaseASPPNet(32, 64)
-
-         self.stg3_bridge = layers.Conv2DBNActiv(130, 64, 1, 1, 0)
-         self.stg3_full_band_net = BaseASPPNet(64, 128)
-
-         self.out = nn.Conv2d(128, 2, 1, bias=False)
-         self.aux1_out = nn.Conv2d(64, 2, 1, bias=False)
-         self.aux2_out = nn.Conv2d(64, 2, 1, bias=False)
-
-         self.max_bin = n_fft // 2
-         self.output_bin = n_fft // 2 + 1
-
-         self.offset = 128
-
-     def forward(self, x, aggressiveness=None):
-         mix = x.detach()
-         x = x.clone()
-
-         x = x[:, :, : self.max_bin]
-
-         bandw = x.size()[2] // 2
-         aux1 = torch.cat(
-             [
-                 self.stg1_low_band_net(x[:, :, :bandw]),
-                 self.stg1_high_band_net(x[:, :, bandw:]),
-             ],
-             dim=2,
-         )
-
-         h = torch.cat([x, aux1], dim=1)
-         aux2 = self.stg2_full_band_net(self.stg2_bridge(h))
-
-         h = torch.cat([x, aux1, aux2], dim=1)
-         h = self.stg3_full_band_net(self.stg3_bridge(h))
-
-         mask = torch.sigmoid(self.out(h))
-         mask = F.pad(
-             input=mask,
-             pad=(0, 0, 0, self.output_bin - mask.size()[2]),
-             mode="replicate",
-         )
-
-         if self.training:
-             aux1 = torch.sigmoid(self.aux1_out(aux1))
-             aux1 = F.pad(
-                 input=aux1,
-                 pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
-                 mode="replicate",
-             )
-             aux2 = torch.sigmoid(self.aux2_out(aux2))
-             aux2 = F.pad(
-                 input=aux2,
-                 pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
-                 mode="replicate",
-             )
-             return mask * mix, aux1 * mix, aux2 * mix
-         else:
-             if aggressiveness:
-                 mask[:, :, : aggressiveness["split_bin"]] = torch.pow(
-                     mask[:, :, : aggressiveness["split_bin"]],
-                     1 + aggressiveness["value"] / 3,
-                 )
-                 mask[:, :, aggressiveness["split_bin"] :] = torch.pow(
-                     mask[:, :, aggressiveness["split_bin"] :],
-                     1 + aggressiveness["value"],
-                 )
-
-             return mask * mix
-
-     def predict(self, x_mag, aggressiveness=None):
-         h = self.forward(x_mag, aggressiveness)
-
-         if self.offset > 0:
-             h = h[:, :, :, self.offset : -self.offset]
-             assert h.size()[3] > 0
-
-         return h
infer/lib/uvr5_pack/lib_v5/nets_537238KB.py DELETED
@@ -1,123 +0,0 @@
- import numpy as np
- import torch
- import torch.nn.functional as F
- from torch import nn
-
- from . import layers_537238KB as layers
-
-
- class BaseASPPNet(nn.Module):
-     def __init__(self, nin, ch, dilations=(4, 8, 16)):
-         super(BaseASPPNet, self).__init__()
-         self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
-         self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1)
-         self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1)
-         self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1)
-
-         self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations)
-
-         self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1)
-         self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1)
-         self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1)
-         self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1)
-
-     def __call__(self, x):
-         h, e1 = self.enc1(x)
-         h, e2 = self.enc2(h)
-         h, e3 = self.enc3(h)
-         h, e4 = self.enc4(h)
-
-         h = self.aspp(h)
-
-         h = self.dec4(h, e4)
-         h = self.dec3(h, e3)
-         h = self.dec2(h, e2)
-         h = self.dec1(h, e1)
-
-         return h
-
-
- class CascadedASPPNet(nn.Module):
-     def __init__(self, n_fft):
-         super(CascadedASPPNet, self).__init__()
-         self.stg1_low_band_net = BaseASPPNet(2, 64)
-         self.stg1_high_band_net = BaseASPPNet(2, 64)
-
-         self.stg2_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0)
-         self.stg2_full_band_net = BaseASPPNet(32, 64)
-
-         self.stg3_bridge = layers.Conv2DBNActiv(130, 64, 1, 1, 0)
-         self.stg3_full_band_net = BaseASPPNet(64, 128)
-
-         self.out = nn.Conv2d(128, 2, 1, bias=False)
-         self.aux1_out = nn.Conv2d(64, 2, 1, bias=False)
-         self.aux2_out = nn.Conv2d(64, 2, 1, bias=False)
-
-         self.max_bin = n_fft // 2
-         self.output_bin = n_fft // 2 + 1
-
-         self.offset = 128
-
-     def forward(self, x, aggressiveness=None):
-         mix = x.detach()
-         x = x.clone()
-
-         x = x[:, :, : self.max_bin]
-
-         bandw = x.size()[2] // 2
-         aux1 = torch.cat(
-             [
-                 self.stg1_low_band_net(x[:, :, :bandw]),
-                 self.stg1_high_band_net(x[:, :, bandw:]),
-             ],
-             dim=2,
-         )
-
-         h = torch.cat([x, aux1], dim=1)
-         aux2 = self.stg2_full_band_net(self.stg2_bridge(h))
-
-         h = torch.cat([x, aux1, aux2], dim=1)
-         h = self.stg3_full_band_net(self.stg3_bridge(h))
-
-         mask = torch.sigmoid(self.out(h))
-         mask = F.pad(
-             input=mask,
-             pad=(0, 0, 0, self.output_bin - mask.size()[2]),
-             mode="replicate",
-         )
-
-         if self.training:
-             aux1 = torch.sigmoid(self.aux1_out(aux1))
-             aux1 = F.pad(
-                 input=aux1,
-                 pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
-                 mode="replicate",
-             )
-             aux2 = torch.sigmoid(self.aux2_out(aux2))
-             aux2 = F.pad(
-                 input=aux2,
-                 pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
-                 mode="replicate",
-             )
-             return mask * mix, aux1 * mix, aux2 * mix
-         else:
-             if aggressiveness:
-                 mask[:, :, : aggressiveness["split_bin"]] = torch.pow(
-                     mask[:, :, : aggressiveness["split_bin"]],
-                     1 + aggressiveness["value"] / 3,
-                 )
-                 mask[:, :, aggressiveness["split_bin"] :] = torch.pow(
-                     mask[:, :, aggressiveness["split_bin"] :],
-                     1 + aggressiveness["value"],
-                 )
-
-             return mask * mix
-
-     def predict(self, x_mag, aggressiveness=None):
-         h = self.forward(x_mag, aggressiveness)
-
-         if self.offset > 0:
-             h = h[:, :, :, self.offset : -self.offset]
-             assert h.size()[3] > 0
-
-         return h
infer/lib/uvr5_pack/lib_v5/nets_61968KB.py DELETED
@@ -1,122 +0,0 @@
- import torch
- import torch.nn.functional as F
- from torch import nn
-
- from . import layers_123821KB as layers
-
-
- class BaseASPPNet(nn.Module):
-     def __init__(self, nin, ch, dilations=(4, 8, 16)):
-         super(BaseASPPNet, self).__init__()
-         self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
-         self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1)
-         self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1)
-         self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1)
-
-         self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations)
-
-         self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1)
-         self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1)
-         self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1)
-         self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1)
-
-     def __call__(self, x):
-         h, e1 = self.enc1(x)
-         h, e2 = self.enc2(h)
-         h, e3 = self.enc3(h)
-         h, e4 = self.enc4(h)
-
-         h = self.aspp(h)
-
-         h = self.dec4(h, e4)
-         h = self.dec3(h, e3)
-         h = self.dec2(h, e2)
-         h = self.dec1(h, e1)
-
-         return h
-
-
- class CascadedASPPNet(nn.Module):
-     def __init__(self, n_fft):
-         super(CascadedASPPNet, self).__init__()
-         self.stg1_low_band_net = BaseASPPNet(2, 32)
-         self.stg1_high_band_net = BaseASPPNet(2, 32)
-
-         self.stg2_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0)
-         self.stg2_full_band_net = BaseASPPNet(16, 32)
-
-         self.stg3_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0)
-         self.stg3_full_band_net = BaseASPPNet(32, 64)
-
-         self.out = nn.Conv2d(64, 2, 1, bias=False)
-         self.aux1_out = nn.Conv2d(32, 2, 1, bias=False)
-         self.aux2_out = nn.Conv2d(32, 2, 1, bias=False)
-
-         self.max_bin = n_fft // 2
-         self.output_bin = n_fft // 2 + 1
-
-         self.offset = 128
-
-     def forward(self, x, aggressiveness=None):
-         mix = x.detach()
-         x = x.clone()
-
-         x = x[:, :, : self.max_bin]
-
-         bandw = x.size()[2] // 2
-         aux1 = torch.cat(
-             [
-                 self.stg1_low_band_net(x[:, :, :bandw]),
-                 self.stg1_high_band_net(x[:, :, bandw:]),
-             ],
-             dim=2,
-         )
-
-         h = torch.cat([x, aux1], dim=1)
-         aux2 = self.stg2_full_band_net(self.stg2_bridge(h))
-
-         h = torch.cat([x, aux1, aux2], dim=1)
-         h = self.stg3_full_band_net(self.stg3_bridge(h))
-
-         mask = torch.sigmoid(self.out(h))
-         mask = F.pad(
-             input=mask,
-             pad=(0, 0, 0, self.output_bin - mask.size()[2]),
-             mode="replicate",
-         )
-
-         if self.training:
-             aux1 = torch.sigmoid(self.aux1_out(aux1))
-             aux1 = F.pad(
-                 input=aux1,
-                 pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
-                 mode="replicate",
-             )
-             aux2 = torch.sigmoid(self.aux2_out(aux2))
-             aux2 = F.pad(
-                 input=aux2,
-                 pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
-                 mode="replicate",
-             )
-             return mask * mix, aux1 * mix, aux2 * mix
-         else:
-             if aggressiveness:
-                 mask[:, :, : aggressiveness["split_bin"]] = torch.pow(
-                     mask[:, :, : aggressiveness["split_bin"]],
-                     1 + aggressiveness["value"] / 3,
-                 )
-                 mask[:, :, aggressiveness["split_bin"] :] = torch.pow(
-                     mask[:, :, aggressiveness["split_bin"] :],
-                     1 + aggressiveness["value"],
-                 )
-
-             return mask * mix
-
-     def predict(self, x_mag, aggressiveness=None):
-         h = self.forward(x_mag, aggressiveness)
-
-         if self.offset > 0:
-             h = h[:, :, :, self.offset : -self.offset]
-             assert h.size()[3] > 0
-
-         return h
infer/lib/uvr5_pack/lib_v5/nets_new.py DELETED
@@ -1,133 +0,0 @@
- import torch
- import torch.nn.functional as F
- from torch import nn
-
- from . import layers_new
-
-
- class BaseNet(nn.Module):
-     def __init__(
-         self, nin, nout, nin_lstm, nout_lstm, dilations=((4, 2), (8, 4), (12, 6))
-     ):
-         super(BaseNet, self).__init__()
-         self.enc1 = layers_new.Conv2DBNActiv(nin, nout, 3, 1, 1)
-         self.enc2 = layers_new.Encoder(nout, nout * 2, 3, 2, 1)
-         self.enc3 = layers_new.Encoder(nout * 2, nout * 4, 3, 2, 1)
-         self.enc4 = layers_new.Encoder(nout * 4, nout * 6, 3, 2, 1)
-         self.enc5 = layers_new.Encoder(nout * 6, nout * 8, 3, 2, 1)
-
-         self.aspp = layers_new.ASPPModule(nout * 8, nout * 8, dilations, dropout=True)
-
-         self.dec4 = layers_new.Decoder(nout * (6 + 8), nout * 6, 3, 1, 1)
-         self.dec3 = layers_new.Decoder(nout * (4 + 6), nout * 4, 3, 1, 1)
-         self.dec2 = layers_new.Decoder(nout * (2 + 4), nout * 2, 3, 1, 1)
-         self.lstm_dec2 = layers_new.LSTMModule(nout * 2, nin_lstm, nout_lstm)
-         self.dec1 = layers_new.Decoder(nout * (1 + 2) + 1, nout * 1, 3, 1, 1)
-
-     def __call__(self, x):
-         e1 = self.enc1(x)
-         e2 = self.enc2(e1)
-         e3 = self.enc3(e2)
-         e4 = self.enc4(e3)
-         e5 = self.enc5(e4)
-
-         h = self.aspp(e5)
-
-         h = self.dec4(h, e4)
-         h = self.dec3(h, e3)
-         h = self.dec2(h, e2)
-         h = torch.cat([h, self.lstm_dec2(h)], dim=1)
-         h = self.dec1(h, e1)
-
-         return h
-
-
- class CascadedNet(nn.Module):
-     def __init__(self, n_fft, nout=32, nout_lstm=128):
-         super(CascadedNet, self).__init__()
-
-         self.max_bin = n_fft // 2
-         self.output_bin = n_fft // 2 + 1
-         self.nin_lstm = self.max_bin // 2
-         self.offset = 64
-
-         self.stg1_low_band_net = nn.Sequential(
-             BaseNet(2, nout // 2, self.nin_lstm // 2, nout_lstm),
-             layers_new.Conv2DBNActiv(nout // 2, nout // 4, 1, 1, 0),
-         )
-
-         self.stg1_high_band_net = BaseNet(
-             2, nout // 4, self.nin_lstm // 2, nout_lstm // 2
-         )
-
-         self.stg2_low_band_net = nn.Sequential(
-             BaseNet(nout // 4 + 2, nout, self.nin_lstm // 2, nout_lstm),
-             layers_new.Conv2DBNActiv(nout, nout // 2, 1, 1, 0),
-         )
-         self.stg2_high_band_net = BaseNet(
-             nout // 4 + 2, nout // 2, self.nin_lstm // 2, nout_lstm // 2
-         )
-
-         self.stg3_full_band_net = BaseNet(
-             3 * nout // 4 + 2, nout, self.nin_lstm, nout_lstm
-         )
-
-         self.out = nn.Conv2d(nout, 2, 1, bias=False)
-         self.aux_out = nn.Conv2d(3 * nout // 4, 2, 1, bias=False)
-
-     def forward(self, x):
-         x = x[:, :, : self.max_bin]
-
-         bandw = x.size()[2] // 2
-         l1_in = x[:, :, :bandw]
-         h1_in = x[:, :, bandw:]
-         l1 = self.stg1_low_band_net(l1_in)
-         h1 = self.stg1_high_band_net(h1_in)
-         aux1 = torch.cat([l1, h1], dim=2)
-
-         l2_in = torch.cat([l1_in, l1], dim=1)
-         h2_in = torch.cat([h1_in, h1], dim=1)
-         l2 = self.stg2_low_band_net(l2_in)
-         h2 = self.stg2_high_band_net(h2_in)
-         aux2 = torch.cat([l2, h2], dim=2)
-
-         f3_in = torch.cat([x, aux1, aux2], dim=1)
-         f3 = self.stg3_full_band_net(f3_in)
-
-         mask = torch.sigmoid(self.out(f3))
-         mask = F.pad(
-             input=mask,
-             pad=(0, 0, 0, self.output_bin - mask.size()[2]),
-             mode="replicate",
-         )
-
-         if self.training:
-             aux = torch.cat([aux1, aux2], dim=1)
-             aux = torch.sigmoid(self.aux_out(aux))
-             aux = F.pad(
-                 input=aux,
-                 pad=(0, 0, 0, self.output_bin - aux.size()[2]),
-                 mode="replicate",
-             )
-             return mask, aux
-         else:
-             return mask
-
-     def predict_mask(self, x):
-         mask = self.forward(x)
-
-         if self.offset > 0:
-             mask = mask[:, :, :, self.offset : -self.offset]
-             assert mask.size()[3] > 0
-
-         return mask
-
-     def predict(self, x, aggressiveness=None):
-         mask = self.forward(x)
-         pred_mag = x * mask
-
-         if self.offset > 0:
-             pred_mag = pred_mag[:, :, :, self.offset : -self.offset]
-             assert pred_mag.size()[3] > 0
-
-         return pred_mag
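
Unlike the older CascadedASPPNet, CascadedNet.forward returns a mask rather than a masked spectrogram, and the frame offset shrinks to 64. A hedged end-of-pipeline sketch, assuming the removed module is importable; the input sizes are illustrative:

import torch

# Illustrative only: mask prediction with the removed CascadedNet.
net = CascadedNet(n_fft=2048).eval()
mag = torch.rand(1, 2, 1025, 256)  # (batch, stereo, bins, frames)
with torch.no_grad():
    mask = net.predict_mask(mag)
print(mask.shape)  # 64 frames trimmed from each side -> (1, 2, 1025, 128)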