Eddycrack864 committed on
Commit
ede719c
1 Parent(s): 240c2c0

Upload separate.py

Files changed (1)
  1. separate.py +355 -872
separate.py CHANGED
@@ -6,15 +6,13 @@ from demucs.model_v2 import auto_load_demucs_model_v2
 from demucs.pretrained import get_model as _gm
 from demucs.utils import apply_model_v1
 from demucs.utils import apply_model_v2
-from lib_v5.tfc_tdf_v3 import TFC_TDF_net, STFT
 from lib_v5 import spec_utils
 from lib_v5.vr_network import nets
 from lib_v5.vr_network import nets_new
-from lib_v5.vr_network.model_param_init import ModelParameters
+#from lib_v5.vr_network.model_param_init import ModelParameters
 from pathlib import Path
 from gui_data.constants import *
 from gui_data.error_handling import *
-from scipy import signal
 import audioread
 import gzip
 import librosa
@@ -26,85 +24,31 @@ import torch
 import warnings
 import pydub
 import soundfile as sf
+import traceback
 import lib_v5.mdxnet as MdxnetSet
-import math
-#import random
-from onnx import load
-from onnx2pytorch import ConvertModel
-import gc
-
+
 if TYPE_CHECKING:
     from UVR import ModelData

-# if not is_macos:
-#     import torch_directml
-
-mps_available = torch.backends.mps.is_available() if is_macos else False
-cuda_available = torch.cuda.is_available()
-
-# def get_gpu_info():
-#     directml_device, directml_available = DIRECTML_DEVICE, False
-
-#     if not is_macos:
-#         directml_available = torch_directml.is_available()
-
-#     if directml_available:
-#         directml_device = str(torch_directml.device()).partition(":")[0]
-
-#     return directml_device, directml_available
-
-# DIRECTML_DEVICE, directml_available = get_gpu_info()
-
-def clear_gpu_cache():
-    gc.collect()
-    if is_macos:
-        torch.mps.empty_cache()
-    else:
-        torch.cuda.empty_cache()
-
 warnings.filterwarnings("ignore")
 cpu = torch.device('cpu')

 class SeperateAttributes:
-    def __init__(self, model_data: ModelData,
-                 process_data: dict,
-                 main_model_primary_stem_4_stem=None,
-                 main_process_method=None,
-                 is_return_dual=True,
-                 main_model_primary=None,
-                 vocal_stem_path=None,
-                 master_inst_source=None,
-                 master_vocal_source=None):
+    def __init__(self, model_data: ModelData, process_data: dict, main_model_primary_stem_4_stem=None, main_process_method=None):

         self.list_all_models: list
         self.process_data = process_data
         self.progress_value = 0
         self.set_progress_bar = process_data['set_progress_bar']
         self.write_to_console = process_data['write_to_console']
-        if vocal_stem_path:
-            self.audio_file, self.audio_file_base = vocal_stem_path
-            self.audio_file_base_voc_split = lambda stem, split:os.path.join(self.export_path, f'{self.audio_file_base.replace("_(Vocals)", "")}_({stem}_{split}).wav')
-        else:
-            self.audio_file = process_data['audio_file']
-            self.audio_file_base = process_data['audio_file_base']
-            self.audio_file_base_voc_split = None
+        self.audio_file = process_data['audio_file']
+        self.audio_file_base = process_data['audio_file_base']
         self.export_path = process_data['export_path']
         self.cached_source_callback = process_data['cached_source_callback']
         self.cached_model_source_holder = process_data['cached_model_source_holder']
         self.is_4_stem_ensemble = process_data['is_4_stem_ensemble']
         self.list_all_models = process_data['list_all_models']
         self.process_iteration = process_data['process_iteration']
-        self.is_return_dual = is_return_dual
-        self.is_pitch_change = model_data.is_pitch_change
-        self.semitone_shift = model_data.semitone_shift
-        self.is_match_frequency_pitch = model_data.is_match_frequency_pitch
-        self.overlap = model_data.overlap
-        self.overlap_mdx = model_data.overlap_mdx
-        self.overlap_mdx23 = model_data.overlap_mdx23
-        self.is_mdx_combine_stems = model_data.is_mdx_combine_stems
-        self.is_mdx_c = model_data.is_mdx_c
-        self.mdx_c_configs = model_data.mdx_c_configs
-        self.mdxnet_stem_select = model_data.mdxnet_stem_select
         self.mixer_path = model_data.mixer_path
         self.model_samplerate = model_data.model_samplerate
         self.model_capacity = model_data.model_capacity
@@ -126,11 +70,9 @@ class SeperateAttributes:
         self.is_ensemble_mode = model_data.is_ensemble_mode
         self.secondary_model = model_data.secondary_model #
         self.primary_model_primary_stem = model_data.primary_model_primary_stem
-        self.primary_stem_native = model_data.primary_stem_native
         self.primary_stem = model_data.primary_stem #
         self.secondary_stem = model_data.secondary_stem #
         self.is_invert_spec = model_data.is_invert_spec #
-        self.is_deverb_vocals = model_data.is_deverb_vocals
         self.is_mixer_mode = model_data.is_mixer_mode #
         self.secondary_model_scale = model_data.secondary_model_scale #
         self.is_demucs_pre_proc_model_inst_mix = model_data.is_demucs_pre_proc_model_inst_mix #
@@ -140,87 +82,49 @@ class SeperateAttributes:
         self.secondary_source = None
         self.secondary_source_primary = None
         self.secondary_source_secondary = None
-        self.main_model_primary_stem_4_stem = main_model_primary_stem_4_stem
-        self.main_model_primary = main_model_primary
-        self.ensemble_primary_stem = model_data.ensemble_primary_stem
-        self.is_multi_stem_ensemble = model_data.is_multi_stem_ensemble
-        self.is_other_gpu = False
-        self.is_deverb = True
-        self.DENOISER_MODEL = model_data.DENOISER_MODEL
-        self.DEVERBER_MODEL = model_data.DEVERBER_MODEL
-        self.is_source_swap = False
-        self.vocal_split_model = model_data.vocal_split_model
-        self.is_vocal_split_model = model_data.is_vocal_split_model
-        self.master_vocal_path = None
-        self.set_master_inst_source = None
-        self.master_inst_source = master_inst_source
-        self.master_vocal_source = master_vocal_source
-        self.is_save_inst_vocal_splitter = isinstance(master_inst_source, np.ndarray) and model_data.is_save_inst_vocal_splitter
-        self.is_inst_only_voc_splitter = model_data.is_inst_only_voc_splitter
-        self.is_karaoke = model_data.is_karaoke
-        self.is_bv_model = model_data.is_bv_model
-        self.is_bv_model_rebalenced = model_data.bv_model_rebalance and self.is_vocal_split_model
-        self.is_sec_bv_rebalance = model_data.is_sec_bv_rebalance
-        self.stem_path_init = os.path.join(self.export_path, f'{self.audio_file_base}_({self.secondary_stem}).wav')
-        self.deverb_vocal_opt = model_data.deverb_vocal_opt
-        self.is_save_vocal_only = model_data.is_save_vocal_only
-        self.device = cpu
-        self.run_type = ['CPUExecutionProvider']
-        self.is_opencl = False
-        self.device_set = model_data.device_set
-        self.is_use_opencl = model_data.is_use_opencl
-
-        if self.is_inst_only_voc_splitter or self.is_sec_bv_rebalance:
-            self.is_primary_stem_only = False
-            self.is_secondary_stem_only = False
-
-        if main_model_primary and self.is_multi_stem_ensemble:
-            self.primary_stem, self.secondary_stem = main_model_primary, secondary_stem(main_model_primary)

-        if self.is_gpu_conversion >= 0:
-            if mps_available:
-                self.device, self.is_other_gpu = 'mps', True
-            else:
-                device_prefix = None
-                if self.device_set != DEFAULT:
-                    device_prefix = CUDA_DEVICE#DIRECTML_DEVICE if self.is_use_opencl and directml_available else CUDA_DEVICE

-                # if directml_available and self.is_use_opencl:
-                #     self.device = torch_directml.device() if not device_prefix else f'{device_prefix}:{self.device_set}'
-                #     self.is_other_gpu = True
-                if cuda_available:# and not self.is_use_opencl:
-                    self.device = CUDA_DEVICE if not device_prefix else f'{device_prefix}:{self.device_set}'
-                    self.run_type = ['CUDAExecutionProvider']

         if model_data.process_method == MDX_ARCH_TYPE:
             self.is_mdx_ckpt = model_data.is_mdx_ckpt
             self.primary_model_name, self.primary_sources = self.cached_source_callback(MDX_ARCH_TYPE, model_name=self.model_basename)
-            self.is_denoise = model_data.is_denoise#
-            self.is_denoise_model = model_data.is_denoise_model#
-            self.is_mdx_c_seg_def = model_data.is_mdx_c_seg_def#
             self.mdx_batch_size = model_data.mdx_batch_size
             self.compensate = model_data.compensate
-            self.mdx_segment_size = model_data.mdx_segment_size
-
-            if self.is_mdx_c:
-                if not self.is_4_stem_ensemble:
-                    self.primary_stem = model_data.ensemble_primary_stem if process_data['is_ensemble_master'] else model_data.primary_stem
-                    self.secondary_stem = model_data.ensemble_secondary_stem if process_data['is_ensemble_master'] else model_data.secondary_stem
-            else:
-                self.dim_f, self.dim_t = model_data.mdx_dim_f_set, 2**model_data.mdx_dim_t_set
-
-            self.check_label_secondary_stem_runs()
             self.n_fft = model_data.mdx_n_fft_scale_set
             self.chunks = model_data.chunks
             self.margin = model_data.margin
             self.adjust = 1
             self.dim_c = 4
             self.hop = 1024

         if model_data.process_method == DEMUCS_ARCH_TYPE:
             self.demucs_stems = model_data.demucs_stems if not main_process_method in [MDX_ARCH_TYPE, VR_ARCH_TYPE] else None
             self.secondary_model_4_stem = model_data.secondary_model_4_stem
             self.secondary_model_4_stem_scale = model_data.secondary_model_4_stem_scale
             self.is_chunk_demucs = model_data.is_chunk_demucs
             self.segment = model_data.segment
             self.demucs_version = model_data.demucs_version
@@ -229,37 +133,28 @@ class SeperateAttributes:
             self.is_demucs_combine_stems = model_data.is_demucs_combine_stems
             self.demucs_stem_count = model_data.demucs_stem_count
             self.pre_proc_model = model_data.pre_proc_model
-            self.device = cpu if self.is_other_gpu and not self.demucs_version in [DEMUCS_V3, DEMUCS_V4] else self.device
-
-            self.primary_stem = model_data.ensemble_primary_stem if process_data['is_ensemble_master'] else model_data.primary_stem
-            self.secondary_stem = model_data.ensemble_secondary_stem if process_data['is_ensemble_master'] else model_data.secondary_stem
-
-            if (self.is_multi_stem_ensemble or self.is_4_stem_ensemble) and not self.is_secondary_model:
-                self.is_return_dual = False

-            if self.is_multi_stem_ensemble and main_model_primary:
-                self.is_4_stem_ensemble = False
-                if main_model_primary in self.demucs_source_map.keys():
-                    self.primary_stem = main_model_primary
-                    self.secondary_stem = secondary_stem(main_model_primary)
-                elif secondary_stem(main_model_primary) in self.demucs_source_map.keys():
-                    self.primary_stem = secondary_stem(main_model_primary)
-                    self.secondary_stem = main_model_primary
-
             if self.is_secondary_model and not process_data['is_ensemble_master']:
                 if not self.demucs_stem_count == 2 and model_data.primary_model_primary_stem == INST_STEM:
                     self.primary_stem = VOCAL_STEM
                     self.secondary_stem = INST_STEM
                 else:
                     self.primary_stem = model_data.primary_model_primary_stem
-                    self.secondary_stem = secondary_stem(self.primary_stem)
-
             self.shifts = model_data.shifts
             self.is_split_mode = model_data.is_split_mode if not self.demucs_version == DEMUCS_V4 else True
             self.primary_model_name, self.primary_sources = self.cached_source_callback(DEMUCS_ARCH_TYPE, model_name=self.model_basename)

         if model_data.process_method == VR_ARCH_TYPE:
-            self.check_label_secondary_stem_runs()
             self.primary_model_name, self.primary_sources = self.cached_source_callback(VR_ARCH_TYPE, model_name=self.model_basename)
             self.mp = model_data.vr_model_param
             self.high_end_process = model_data.is_high_end_process
@@ -269,44 +164,28 @@ class SeperateAttributes:
             self.batch_size = model_data.batch_size
             self.window_size = model_data.window_size
             self.input_high_end_h = None
-            self.input_high_end = None
             self.post_process_threshold = model_data.post_process_threshold
             self.aggressiveness = {'value': model_data.aggression_setting,
                                    'split_bin': self.mp.param['band'][1]['crop_stop'],
                                    'aggr_correction': self.mp.param.get('aggr_correction')}

-    def check_label_secondary_stem_runs(self):
-
-        # For ensemble master that's not a 4-stem ensemble, and not mdx_c
-        if self.process_data['is_ensemble_master'] and not self.is_4_stem_ensemble and not self.is_mdx_c:
-            if self.ensemble_primary_stem != self.primary_stem:
-                self.is_primary_stem_only, self.is_secondary_stem_only = self.is_secondary_stem_only, self.is_primary_stem_only
-
-        # For secondary models
-        if self.is_pre_proc_model or self.is_secondary_model:
-            self.is_primary_stem_only = False
-            self.is_secondary_stem_only = False
-
     def start_inference_console_write(self):
-        if self.is_secondary_model and not self.is_pre_proc_model and not self.is_vocal_split_model:
             self.write_to_console(INFERENCE_STEP_2_SEC(self.process_method, self.model_basename))

         if self.is_pre_proc_model:
             self.write_to_console(INFERENCE_STEP_2_PRE(self.process_method, self.model_basename))
-
-        if self.is_vocal_split_model:
-            self.write_to_console(INFERENCE_STEP_2_VOC_S(self.process_method, self.model_basename))

     def running_inference_console_write(self, is_no_write=False):
         self.write_to_console(DONE, base_text='') if not is_no_write else None
         self.set_progress_bar(0.05) if not is_no_write else None

-        if self.is_secondary_model and not self.is_pre_proc_model and not self.is_vocal_split_model:
             self.write_to_console(INFERENCE_STEP_1_SEC)
         elif self.is_pre_proc_model:
             self.write_to_console(INFERENCE_STEP_1_PRE)
-        elif self.is_vocal_split_model:
-            self.write_to_console(INFERENCE_STEP_1_VOC_S)
         else:
             self.write_to_console(INFERENCE_STEP_1)

@@ -319,14 +198,19 @@ class SeperateAttributes:

         self.set_progress_bar(0.1, (0.8/length*self.progress_value))

-    def load_cached_sources(self):

        if self.is_secondary_model and not self.is_pre_proc_model:
            self.write_to_console(INFERENCE_STEP_2_SEC_CACHED_MODOEL(self.process_method, self.model_basename))
        elif self.is_pre_proc_model:
            self.write_to_console(INFERENCE_STEP_2_PRE_CACHED_MODOEL(self.process_method, self.model_basename))
        else:
-            self.write_to_console(INFERENCE_STEP_2_PRIMARY_CACHED, "")

    def cache_source(self, secondary_sources):

@@ -341,142 +225,49 @@ class SeperateAttributes:

         if self.process_method == DEMUCS_ARCH_TYPE:
             self.cached_model_source_holder(DEMUCS_ARCH_TYPE, secondary_sources, self.model_basename)
-
-    def process_vocal_split_chain(self, sources: dict):
-
-        def is_valid_vocal_split_condition(master_vocal_source):
-            """Checks if conditions for vocal split processing are met."""
-            conditions = [
-                isinstance(master_vocal_source, np.ndarray),
-                self.vocal_split_model,
-                not self.is_ensemble_mode,
-                not self.is_karaoke,
-                not self.is_bv_model
-            ]
-            return all(conditions)
-
-        # Retrieve sources from the dictionary with default fallbacks
-        master_inst_source = sources.get(INST_STEM, None)
-        master_vocal_source = sources.get(VOCAL_STEM, None)
-
-        # Process the vocal split chain if conditions are met
-        if is_valid_vocal_split_condition(master_vocal_source):
-            process_chain_model(
-                self.vocal_split_model,
-                self.process_data,
-                vocal_stem_path=self.master_vocal_path,
-                master_vocal_source=master_vocal_source,
-                master_inst_source=master_inst_source
-            )
-
-    def process_secondary_stem(self, stem_source, secondary_model_source=None, model_scale=None):
        if not self.is_secondary_model:
-            if self.is_secondary_model_activated and isinstance(secondary_model_source, np.ndarray):
-                secondary_model_scale = model_scale if model_scale else self.secondary_model_scale
-                stem_source = spec_utils.average_dual_sources(stem_source, secondary_model_source, secondary_model_scale)
-
-        return stem_source
-
-    def final_process(self, stem_path, source, secondary_source, stem_name, samplerate):
-        source = self.process_secondary_stem(source, secondary_source)
-        self.write_audio(stem_path, source, samplerate, stem_name=stem_name)
-
-        return {stem_name: source}
-
-    def write_audio(self, stem_path: str, stem_source, samplerate, stem_name=None):
-
-        def save_audio_file(path, source):
-            source = spec_utils.normalize(source, self.is_normalization)
-            sf.write(path, source, samplerate, subtype=self.wav_type_set)
-
-            if is_not_ensemble:
-                save_format(path, self.save_format, self.mp3_bit_set)
-
-        def save_voc_split_instrumental(stem_name, stem_source, is_inst_invert=False):
-            inst_stem_name = "Instrumental (With Lead Vocals)" if stem_name == LEAD_VOCAL_STEM else "Instrumental (With Backing Vocals)"
-            inst_stem_path_name = LEAD_VOCAL_STEM_I if stem_name == LEAD_VOCAL_STEM else BV_VOCAL_STEM_I
-            inst_stem_path = self.audio_file_base_voc_split(INST_STEM, inst_stem_path_name)
-            stem_source = -stem_source if is_inst_invert else stem_source
-            inst_stem_source = spec_utils.combine_arrarys([self.master_inst_source, stem_source], is_swap=True)
-            save_with_message(inst_stem_path, inst_stem_name, inst_stem_source)
-
-        def save_voc_split_vocal(stem_name, stem_source):
-            voc_split_stem_name = LEAD_VOCAL_STEM_LABEL if stem_name == LEAD_VOCAL_STEM else BV_VOCAL_STEM_LABEL
-            voc_split_stem_path = self.audio_file_base_voc_split(VOCAL_STEM, stem_name)
-            save_with_message(voc_split_stem_path, voc_split_stem_name, stem_source)
-
-        def save_with_message(stem_path, stem_name, stem_source):
-            is_deverb = self.is_deverb_vocals and (
-                self.deverb_vocal_opt == stem_name or
-                (self.deverb_vocal_opt == 'ALL' and
-                 (stem_name == VOCAL_STEM or stem_name == LEAD_VOCAL_STEM_LABEL or stem_name == BV_VOCAL_STEM_LABEL)))
-
-            self.write_to_console(f'{SAVING_STEM[0]}{stem_name}{SAVING_STEM[1]}')
-
-            if is_deverb and is_not_ensemble:
-                deverb_vocals(stem_path, stem_source)
-
-            save_audio_file(stem_path, stem_source)
            self.write_to_console(DONE, base_text='')
-
-        def deverb_vocals(stem_path:str, stem_source):
-            self.write_to_console(INFERENCE_STEP_DEVERBING, base_text='')
-            stem_source_deverbed, stem_source_2 = vr_denoiser(stem_source, self.device, is_deverber=True, model_path=self.DEVERBER_MODEL)
-            save_audio_file(stem_path.replace(".wav", "_deverbed.wav"), stem_source_deverbed)
-            save_audio_file(stem_path.replace(".wav", "_reverb_only.wav"), stem_source_2)
-
-        is_bv_model_lead = (self.is_bv_model_rebalenced and self.is_vocal_split_model and stem_name == LEAD_VOCAL_STEM)
-        is_bv_rebalance_lead = (self.is_bv_model_rebalenced and self.is_vocal_split_model and stem_name == BV_VOCAL_STEM)
-        is_no_vocal_save = self.is_inst_only_voc_splitter and (stem_name == VOCAL_STEM or stem_name == BV_VOCAL_STEM or stem_name == LEAD_VOCAL_STEM) or is_bv_model_lead
-        is_not_ensemble = (not self.is_ensemble_mode or self.is_vocal_split_model)
-        is_do_not_save_inst = (self.is_save_vocal_only and self.is_sec_bv_rebalance and stem_name == INST_STEM)
-
-        if is_bv_rebalance_lead:
-            master_voc_source = spec_utils.match_array_shapes(self.master_vocal_source, stem_source, is_swap=True)
-            bv_rebalance_lead_source = stem_source-master_voc_source
-
-        if not is_bv_model_lead and not is_do_not_save_inst:
-            if self.is_vocal_split_model or not self.is_secondary_model:
-                if self.is_vocal_split_model and not self.is_inst_only_voc_splitter:
-                    save_voc_split_vocal(stem_name, stem_source)
-                    if is_bv_rebalance_lead:
-                        save_voc_split_vocal(LEAD_VOCAL_STEM, bv_rebalance_lead_source)
-                else:
-                    if not is_no_vocal_save:
-                        save_with_message(stem_path, stem_name, stem_source)
-
-                if self.is_save_inst_vocal_splitter and not self.is_save_vocal_only:
-                    save_voc_split_instrumental(stem_name, stem_source)
-                    if is_bv_rebalance_lead:
-                        save_voc_split_instrumental(LEAD_VOCAL_STEM, bv_rebalance_lead_source, is_inst_invert=True)
-
-            self.set_progress_bar(0.95)

-        if stem_name == VOCAL_STEM:
-            self.master_vocal_path = stem_path
-
-    def pitch_fix(self, source, sr_pitched, org_mix):
-        semitone_shift = self.semitone_shift
-        source = spec_utils.change_pitch_semitones(source, sr_pitched, semitone_shift=semitone_shift)[0]
-        source = spec_utils.match_array_shapes(source, org_mix)
-        return source
-
-    def match_frequency_pitch(self, mix):
-        source = mix
-        if self.is_match_frequency_pitch and self.is_pitch_change:
-            source, sr_pitched = spec_utils.change_pitch_semitones(mix, 44100, semitone_shift=-self.semitone_shift)
-            source = self.pitch_fix(source, sr_pitched, mix)
-
-        return source

 class SeperateMDX(SeperateAttributes):

     def seperate(self):
         samplerate = 44100
-
-        if self.primary_model_name == self.model_basename and isinstance(self.primary_sources, tuple):
-            mix, source = self.primary_sources
-            self.load_cached_sources()
         else:
             self.start_inference_console_write()

@@ -486,145 +277,105 @@ class SeperateMDX(SeperateAttributes):
                 separator = MdxnetSet.ConvTDFNet(**model_params)
                 self.model_run = separator.load_from_checkpoint(self.model_path).to(self.device).eval()
             else:
-                if self.mdx_segment_size == self.dim_t and not self.is_other_gpu:
-                    ort_ = ort.InferenceSession(self.model_path, providers=self.run_type)
-                    self.model_run = lambda spek:ort_.run(None, {'input': spek.cpu().numpy()})[0]
-                else:
-                    self.model_run = ConvertModel(load(self.model_path))
-                    self.model_run.to(self.device).eval()

             self.running_inference_console_write()
-            mix = prepare_mix(self.audio_file)
-
-            source = self.demix(mix)
-
-            if not self.is_vocal_split_model:
-                self.cache_source((mix, source))
             self.write_to_console(DONE, base_text='')

-        mdx_net_cut = True if self.primary_stem in MDX_NET_FREQ_CUT and self.is_match_frequency_pitch else False
-
-        if self.is_secondary_model_activated and self.secondary_model:
-            self.secondary_source_primary, self.secondary_source_secondary = process_secondary_model(self.secondary_model, self.process_data, main_process_method=self.process_method, main_model_primary=self.primary_stem)

         if not self.is_primary_stem_only:
             secondary_stem_path = os.path.join(self.export_path, f'{self.audio_file_base}_({self.secondary_stem}).wav')
             if not isinstance(self.secondary_source, np.ndarray):
-                raw_mix = self.demix(self.match_frequency_pitch(mix), is_match_mix=True) if mdx_net_cut else self.match_frequency_pitch(mix)
-                self.secondary_source = spec_utils.invert_stem(raw_mix, source) if self.is_invert_spec else mix.T-source.T

-            self.secondary_source_map = self.final_process(secondary_stem_path, self.secondary_source, self.secondary_source_secondary, self.secondary_stem, samplerate)
-
-        if not self.is_secondary_stem_only:
-            primary_stem_path = os.path.join(self.export_path, f'{self.audio_file_base}_({self.primary_stem}).wav')

-            if not isinstance(self.primary_source, np.ndarray):
-                self.primary_source = source.T
-
-            self.primary_source_map = self.final_process(primary_stem_path, self.primary_source, self.secondary_source_primary, self.primary_stem, samplerate)
-
-        clear_gpu_cache()

         secondary_sources = {**self.primary_source_map, **self.secondary_source_map}
-
-        self.process_vocal_split_chain(secondary_sources)

-        if self.is_secondary_model or self.is_pre_proc_model:
            return secondary_sources

    def initialize_model_settings(self):
        self.n_bins = self.n_fft//2+1
        self.trim = self.n_fft//2
-        self.chunk_size = self.hop * (self.mdx_segment_size-1)
        self.gen_size = self.chunk_size-2*self.trim
-        self.stft = STFT(self.n_fft, self.hop, self.dim_f, self.device)
-
-    def demix(self, mix, is_match_mix=False):
-        self.initialize_model_settings()
-
-        org_mix = mix
-        tar_waves_ = []

-        if is_match_mix:
-            chunk_size = self.hop * (256-1)
-            overlap = 0.02
        else:
-            chunk_size = self.chunk_size
-            overlap = self.overlap_mdx
-
-        if self.is_pitch_change:
-            mix, sr_pitched = spec_utils.change_pitch_semitones(mix, 44100, semitone_shift=-self.semitone_shift)
-
-        gen_size = chunk_size-2*self.trim
-
-        pad = gen_size + self.trim - ((mix.shape[-1]) % gen_size)
-        mixture = np.concatenate((np.zeros((2, self.trim), dtype='float32'), mix, np.zeros((2, pad), dtype='float32')), 1)
-
-        step = self.chunk_size - self.n_fft if overlap == DEFAULT else int((1 - overlap) * chunk_size)
-        result = np.zeros((1, 2, mixture.shape[-1]), dtype=np.float32)
-        divider = np.zeros((1, 2, mixture.shape[-1]), dtype=np.float32)
-        total = 0
-        total_chunks = (mixture.shape[-1] + step - 1) // step
-
-        for i in range(0, mixture.shape[-1], step):
-            total += 1
-            start = i
-            end = min(i + chunk_size, mixture.shape[-1])
-
-            chunk_size_actual = end - start
-
-            if overlap == 0:
-                window = None
-            else:
-                window = np.hanning(chunk_size_actual)
-                window = np.tile(window[None, None, :], (1, 2, 1))
-
-            mix_part_ = mixture[:, start:end]
-            if end != i + chunk_size:
-                pad_size = (i + chunk_size) - end
-                mix_part_ = np.concatenate((mix_part_, np.zeros((2, pad_size), dtype='float32')), axis=-1)

-            mix_part = torch.tensor([mix_part_], dtype=torch.float32).to(self.device)
-            mix_waves = mix_part.split(self.mdx_batch_size)
-
            with torch.no_grad():
                for mix_wave in mix_waves:
-                    self.running_inference_progress_bar(total_chunks, is_match_mix=is_match_mix)
-
-                    tar_waves = self.run_model(mix_wave, is_match_mix=is_match_mix)
-
-                    if window is not None:
-                        tar_waves[..., :chunk_size_actual] *= window
-                        divider[..., start:end] += window
-                    else:
-                        divider[..., start:end] += 1
-
-                    result[..., start:end] += tar_waves[..., :end-start]
-
-        tar_waves = result / divider
-        tar_waves_.append(tar_waves)
-
-        tar_waves_ = np.vstack(tar_waves_)[:, :, self.trim:-self.trim]
-        tar_waves = np.concatenate(tar_waves_, axis=-1)[:, :mix.shape[-1]]

-        source = tar_waves[:,0:None]
-
-        if self.is_pitch_change and not is_match_mix:
-            source = self.pitch_fix(source, sr_pitched, org_mix)
-
-        source = source if is_match_mix else source*self.compensate
-
-        if self.is_denoise_model and not is_match_mix:
-            if NO_STEM in self.primary_stem_native or self.primary_stem_native == INST_STEM:
-                if org_mix.shape[1] != source.shape[1]:
-                    source = spec_utils.match_array_shapes(source, org_mix)
-                source = org_mix - vr_denoiser(org_mix-source, self.device, model_path=self.DENOISER_MODEL)
-            else:
-                source = vr_denoiser(source, self.device, model_path=self.DENOISER_MODEL)
-
-        return source

-    def run_model(self, mix, is_match_mix=False):

        spek = self.stft(mix.to(self.device))*self.adjust
        spek[:, :, :3, :] *= 0
@@ -634,189 +385,58 @@ class SeperateMDX(SeperateAttributes):
         else:
             spec_pred = -self.model_run(-spek)*0.5+self.model_run(spek)*0.5 if self.is_denoise else self.model_run(spek)

-        return self.stft.inverse(torch.tensor(spec_pred).to(self.device)).cpu().detach().numpy()
-
-class SeperateMDXC(SeperateAttributes):

     def seperate(self):
-        samplerate = 44100
-        sources = None
-
-        if self.primary_model_name == self.model_basename and isinstance(self.primary_sources, tuple):
-            mix, sources = self.primary_sources
-            self.load_cached_sources()
-        else:
-            self.start_inference_console_write()
-            self.running_inference_console_write()
-            mix = prepare_mix(self.audio_file)
-            sources = self.demix(mix)
-            if not self.is_vocal_split_model:
-                self.cache_source((mix, sources))
-            self.write_to_console(DONE, base_text='')

-        stem_list = [self.mdx_c_configs.training.target_instrument] if self.mdx_c_configs.training.target_instrument else [i for i in self.mdx_c_configs.training.instruments]
-
-        if self.is_secondary_model:
-            if self.is_pre_proc_model:
-                self.mdxnet_stem_select = stem_list[0]
-            else:
-                self.mdxnet_stem_select = self.main_model_primary_stem_4_stem if self.main_model_primary_stem_4_stem else self.primary_model_primary_stem
-            self.primary_stem = self.mdxnet_stem_select
-            self.secondary_stem = secondary_stem(self.mdxnet_stem_select)
-            self.is_primary_stem_only, self.is_secondary_stem_only = False, False
-
-        is_all_stems = self.mdxnet_stem_select == ALL_STEMS
-        is_not_ensemble_master = not self.process_data['is_ensemble_master']
-        is_not_single_stem = not len(stem_list) <= 2
-        is_not_secondary_model = not self.is_secondary_model
-        is_ensemble_4_stem = self.is_4_stem_ensemble and is_not_single_stem
-
-        if (is_all_stems and is_not_ensemble_master and is_not_single_stem and is_not_secondary_model) or is_ensemble_4_stem and not self.is_pre_proc_model:
-            for stem in stem_list:
-                primary_stem_path = os.path.join(self.export_path, f'{self.audio_file_base}_({stem}).wav')
-                self.primary_source = sources[stem].T
-                self.write_audio(primary_stem_path, self.primary_source, samplerate, stem_name=stem)
-
-                if stem == VOCAL_STEM and not self.is_sec_bv_rebalance:
-                    self.process_vocal_split_chain({VOCAL_STEM:stem})
-        else:
-            if len(stem_list) == 1:
-                source_primary = sources
-            else:
-                source_primary = sources[stem_list[0]] if self.is_multi_stem_ensemble and len(stem_list) == 2 else sources[self.mdxnet_stem_select]
-            if self.is_secondary_model_activated and self.secondary_model:
-                self.secondary_source_primary, self.secondary_source_secondary = process_secondary_model(self.secondary_model,
-                                                                                                         self.process_data,
-                                                                                                         main_process_method=self.process_method,
-                                                                                                         main_model_primary=self.primary_stem)
-
-            if not self.is_primary_stem_only:
-                secondary_stem_path = os.path.join(self.export_path, f'{self.audio_file_base}_({self.secondary_stem}).wav')
-                if not isinstance(self.secondary_source, np.ndarray):
-
-                    if self.is_mdx_combine_stems and len(stem_list) >= 2:
-                        if len(stem_list) == 2:
-                            secondary_source = sources[self.secondary_stem]
-                        else:
-                            sources.pop(self.primary_stem)
-                            next_stem = next(iter(sources))
-                            secondary_source = np.zeros_like(sources[next_stem])
-                            for v in sources.values():
-                                secondary_source += v
-
-                        self.secondary_source = secondary_source.T
-                    else:
-                        self.secondary_source, raw_mix = source_primary, self.match_frequency_pitch(mix)
-                        self.secondary_source = spec_utils.to_shape(self.secondary_source, raw_mix.shape)
-
-                        if self.is_invert_spec:
-                            self.secondary_source = spec_utils.invert_stem(raw_mix, self.secondary_source)
-                        else:
-                            self.secondary_source = (-self.secondary_source.T+raw_mix.T)
-
-                self.secondary_source_map = self.final_process(secondary_stem_path, self.secondary_source, self.secondary_source_secondary, self.secondary_stem, samplerate)
-
-            if not self.is_secondary_stem_only:
-                primary_stem_path = os.path.join(self.export_path, f'{self.audio_file_base}_({self.primary_stem}).wav')
-                if not isinstance(self.primary_source, np.ndarray):
-                    self.primary_source = source_primary.T
-
-                self.primary_source_map = self.final_process(primary_stem_path, self.primary_source, self.secondary_source_primary, self.primary_stem, samplerate)
-
-        clear_gpu_cache()
-
-        secondary_sources = {**self.primary_source_map, **self.secondary_source_map}
-        self.process_vocal_split_chain(secondary_sources)
-
-        if self.is_secondary_model or self.is_pre_proc_model:
-            return secondary_sources
-
-    def demix(self, mix):
-        sr_pitched = 441000
-        org_mix = mix
-        if self.is_pitch_change:
-            mix, sr_pitched = spec_utils.change_pitch_semitones(mix, 44100, semitone_shift=-self.semitone_shift)
-
-        model = TFC_TDF_net(self.mdx_c_configs, device=self.device)
-        model.load_state_dict(torch.load(self.model_path, map_location=cpu))
-        model.to(self.device).eval()
-        mix = torch.tensor(mix, dtype=torch.float32)
-
-        try:
-            S = model.num_target_instruments
-        except Exception as e:
-            S = model.module.num_target_instruments
-
-        mdx_segment_size = self.mdx_c_configs.inference.dim_t if self.is_mdx_c_seg_def else self.mdx_segment_size
-
-        batch_size = self.mdx_batch_size
-        chunk_size = self.mdx_c_configs.audio.hop_length * (mdx_segment_size - 1)
-        overlap = self.overlap_mdx23
-
-        hop_size = chunk_size // overlap
-        mix_shape = mix.shape[1]
-        pad_size = hop_size - (mix_shape - chunk_size) % hop_size
-        mix = torch.cat([torch.zeros(2, chunk_size - hop_size), mix, torch.zeros(2, pad_size + chunk_size - hop_size)], 1)
-
-        chunks = mix.unfold(1, chunk_size, hop_size).transpose(0, 1)
-        batches = [chunks[i : i + batch_size] for i in range(0, len(chunks), batch_size)]
-
-        X = torch.zeros(S, *mix.shape) if S > 1 else torch.zeros_like(mix)
-        X = X.to(self.device)
-
-        with torch.no_grad():
-            cnt = 0
-            for batch in batches:
-                self.running_inference_progress_bar(len(batches))
-                x = model(batch.to(self.device))
-
-                for w in x:
-                    X[..., cnt * hop_size : cnt * hop_size + chunk_size] += w
-                    cnt += 1
-
-        estimated_sources = X[..., chunk_size - hop_size:-(pad_size + chunk_size - hop_size)] / overlap
-        del X
-        pitch_fix = lambda s:self.pitch_fix(s, sr_pitched, org_mix)
-
-        if S > 1:
-            sources = {k: pitch_fix(v) if self.is_pitch_change else v for k, v in zip(self.mdx_c_configs.training.instruments, estimated_sources.cpu().detach().numpy())}
-            del estimated_sources
-            if self.is_denoise_model:
-                if VOCAL_STEM in sources.keys() and INST_STEM in sources.keys():
-                    sources[VOCAL_STEM] = vr_denoiser(sources[VOCAL_STEM], self.device, model_path=self.DENOISER_MODEL)
-                    if sources[VOCAL_STEM].shape[1] != org_mix.shape[1]:
-                        sources[VOCAL_STEM] = spec_utils.match_array_shapes(sources[VOCAL_STEM], org_mix)
-                    sources[INST_STEM] = org_mix - sources[VOCAL_STEM]
-
-            return sources
-        else:
-            est_s = estimated_sources.cpu().detach().numpy()
-            del estimated_sources
-            return pitch_fix(est_s) if self.is_pitch_change else est_s
-
-class SeperateDemucs(SeperateAttributes):
-    def seperate(self):
         samplerate = 44100
         source = None
         model_scale = None
         stem_source = None
         stem_source_secondary = None
         inst_mix = None
         inst_source = None
         is_no_write = False
         is_no_piano_guitar = False
-        is_no_cache = False
-
-        if self.primary_model_name == self.model_basename and isinstance(self.primary_sources, np.ndarray) and not self.pre_proc_model:
             source = self.primary_sources
-            self.load_cached_sources()
         else:
             self.start_inference_console_write()
-            is_no_cache = True

-        mix = prepare_mix(self.audio_file)
-
-        if is_no_cache:
             if self.demucs_version == DEMUCS_V1:
                 if str(self.model_path).endswith(".gz"):
                     self.model_path = gzip.open(self.model_path, "rb")
@@ -842,23 +462,26 @@ class SeperateDemucs(SeperateAttributes):
                 is_no_write = True
                 self.write_to_console(DONE, base_text='')
                 mix_no_voc = process_secondary_model(self.pre_proc_model, self.process_data, is_pre_proc_model=True)
-                inst_mix = prepare_mix(mix_no_voc[INST_STEM])
                 self.process_iteration()
                 self.running_inference_console_write(is_no_write=is_no_write)
                 inst_source = self.demix_demucs(inst_mix)
                 self.process_iteration()

             self.running_inference_console_write(is_no_write=is_no_write) if not self.pre_proc_model else None

             if self.primary_model_name == self.model_basename and isinstance(self.primary_sources, np.ndarray) and self.pre_proc_model:
                 source = self.primary_sources
             else:
                 source = self.demix_demucs(mix)

             self.write_to_console(DONE, base_text='')

             del self.demucs
-            clear_gpu_cache()

         if isinstance(inst_source, np.ndarray):
             source_reshape = spec_utils.reshape_sources(inst_source[self.demucs_source_map[VOCAL_STEM]], source[self.demucs_source_map[VOCAL_STEM]])
@@ -866,7 +489,6 @@ class SeperateDemucs(SeperateAttributes):
             source = inst_source

         if isinstance(source, np.ndarray):
-
             if len(source) == 2:
                 self.demucs_source_map = DEMUCS_2_SOURCE_MAPPER
             else:
@@ -881,40 +503,46 @@ class SeperateDemucs(SeperateAttributes):
                     other_source += i
                 source_reshape = spec_utils.reshape_sources(source[self.demucs_source_map[OTHER_STEM]], other_source)
                 source[self.demucs_source_map[OTHER_STEM]] = source_reshape
-
-        if not self.is_vocal_split_model:
             self.cache_source(source)
-
-        if (self.demucs_stems == ALL_STEMS and not self.process_data['is_ensemble_master']) or self.is_4_stem_ensemble and not self.is_return_dual:
             for stem_name, stem_value in self.demucs_source_map.items():
                 if self.is_secondary_model_activated and not self.is_secondary_model and not stem_value >= 4:
                     if self.secondary_model_4_stem[stem_value]:
                         model_scale = self.secondary_model_4_stem_scale[stem_value]
-                        stem_source_secondary = process_secondary_model(self.secondary_model_4_stem[stem_value], self.process_data, main_model_primary_stem_4_stem=stem_name, is_source_load=True, is_return_dual=False)
                         if isinstance(stem_source_secondary, np.ndarray):
-                            stem_source_secondary = stem_source_secondary[1 if self.secondary_model_4_stem[stem_value].demucs_stem_count == 2 else stem_value].T
                         elif type(stem_source_secondary) is dict:
                             stem_source_secondary = stem_source_secondary[stem_name]

                 stem_source_secondary = None if stem_value >= 4 else stem_source_secondary
                 stem_path = os.path.join(self.export_path, f'{self.audio_file_base}_({stem_name}).wav')
-                stem_source = source[stem_value].T
-
-                stem_source = self.process_secondary_stem(stem_source, secondary_model_source=stem_source_secondary, model_scale=model_scale)
-                self.write_audio(stem_path, stem_source, samplerate, stem_name=stem_name)
-
-                if stem_name == VOCAL_STEM and not self.is_sec_bv_rebalance:
-                    self.process_vocal_split_chain({VOCAL_STEM:stem_source})
-
             if self.is_secondary_model:
                 return source
         else:
-            if self.is_secondary_model_activated and self.secondary_model:
                 self.secondary_source_primary, self.secondary_source_secondary = process_secondary_model(self.secondary_model, self.process_data, main_process_method=self.process_method)

             if not self.is_primary_stem_only:
                 def secondary_save(sec_stem_name, source, raw_mixture=None, is_inst_mixture=False):
                     secondary_source = self.secondary_source if not is_inst_mixture else None
                     secondary_stem_path = os.path.join(self.export_path, f'{self.audio_file_base}_({sec_stem_name}).wav')
                     secondary_source_secondary = None

@@ -930,12 +558,12 @@ class SeperateDemucs(SeperateAttributes):
                         secondary_source = np.zeros_like(source[0])
                         for i in source:
                             secondary_source += i
-                        secondary_source = secondary_source.T
                     else:
                         if not isinstance(raw_mixture, np.ndarray):
-                            raw_mixture = prepare_mix(self.audio_file)

-                        secondary_source = source[self.demucs_source_map[self.primary_stem]]

                     if self.is_invert_spec:
                         secondary_source = spec_utils.invert_stem(raw_mixture, secondary_source)
@@ -946,90 +574,86 @@ class SeperateDemucs(SeperateAttributes):
                     if not is_inst_mixture:
                         self.secondary_source = secondary_source
                         secondary_source_secondary = self.secondary_source_secondary
-                        self.secondary_source = self.process_secondary_stem(secondary_source, secondary_source_secondary)
                         self.secondary_source_map = {self.secondary_stem: self.secondary_source}

-                    self.write_audio(secondary_stem_path, secondary_source, samplerate, stem_name=sec_stem_name)

-                secondary_save(self.secondary_stem, source, raw_mixture=mix)

                 if self.is_demucs_pre_proc_model_inst_mix and self.pre_proc_model and not self.is_4_stem_ensemble:
-                    secondary_save(f"{self.secondary_stem} {INST_STEM}", source, raw_mixture=inst_mix, is_inst_mixture=True)
-
-            if not self.is_secondary_stem_only:
-                primary_stem_path = os.path.join(self.export_path, f'{self.audio_file_base}_({self.primary_stem}).wav')
-                if not isinstance(self.primary_source, np.ndarray):
-                    self.primary_source = source[self.demucs_source_map[self.primary_stem]].T
-
-                self.primary_source_map = self.final_process(primary_stem_path, self.primary_source, self.secondary_source_primary, self.primary_stem, samplerate)

             secondary_sources = {**self.primary_source_map, **self.secondary_source_map}
-
-            self.process_vocal_split_chain(secondary_sources)

             if self.is_secondary_model:
                 return secondary_sources

     def demix_demucs(self, mix):
-
-        org_mix = mix
-
-        if self.is_pitch_change:
-            mix, sr_pitched = spec_utils.change_pitch_semitones(mix, 44100, semitone_shift=-self.semitone_shift)
-
         processed = {}
-        mix = torch.tensor(mix, dtype=torch.float32)
-        ref = mix.mean(0)
-        mix = (mix - ref.mean()) / ref.std()
-        mix_infer = mix
-
-        with torch.no_grad():
-            if self.demucs_version == DEMUCS_V1:
-                sources = apply_model_v1(self.demucs,
-                                         mix_infer.to(self.device),
-                                         self.shifts,
-                                         self.is_split_mode,
-                                         set_progress_bar=self.set_progress_bar)
-            elif self.demucs_version == DEMUCS_V2:
-                sources = apply_model_v2(self.demucs,
-                                         mix_infer.to(self.device),
                                          self.shifts,
                                          self.is_split_mode,
                                          self.overlap,
-                                         set_progress_bar=self.set_progress_bar)
-            else:
-                sources = apply_model(self.demucs,
-                                      mix_infer[None],
-                                      self.shifts,
-                                      self.is_split_mode,
-                                      self.overlap,
-                                      static_shifts=1 if self.shifts == 0 else self.shifts,
-                                      set_progress_bar=self.set_progress_bar,
-                                      device=self.device)[0]
-
-        sources = (sources * ref.std() + ref.mean()).cpu().numpy()
-        sources[[0,1]] = sources[[1,0]]
-        processed[mix] = sources[:,:,0:None].copy()
-        sources = list(processed.values())
-        sources = [s[:,:,0:None] for s in sources]
-        #sources = [self.pitch_fix(s[:,:,0:None], sr_pitched, org_mix) if self.is_pitch_change else s[:,:,0:None] for s in sources]
         sources = np.concatenate(sources, axis=-1)
-
-        if self.is_pitch_change:
-            sources = np.stack([self.pitch_fix(stem, sr_pitched, org_mix) for stem in sources])

         return sources

 class SeperateVR(SeperateAttributes):

     def seperate(self):
-        if self.primary_model_name == self.model_basename and isinstance(self.primary_sources, tuple):
-            y_spec, v_spec = self.primary_sources
-            self.load_cached_sources()
         else:
             self.start_inference_console_write()
-
-            device = self.device

             nn_arch_sizes = [
                 31191, # default
@@ -1039,11 +663,7 @@ class SeperateVR(SeperateAttributes):
             nn_arch_size = min(nn_arch_sizes, key=lambda x:abs(x-model_size))

             if nn_arch_size in vr_5_1_models or self.is_vr_51_model:
-                self.model_run = nets_new.CascadedNet(self.mp.param['bins'] * 2,
-                                                      nn_arch_size,
-                                                      nout=self.model_capacity[0],
-                                                      nout_lstm=self.model_capacity[1])
-                self.is_vr_51_model = True
             else:
                 self.model_run = nets.determine_model_capacity(self.mp.param['bins'] * 2, nn_arch_size)

@@ -1053,36 +673,41 @@ class SeperateVR(SeperateAttributes):
             self.running_inference_console_write()

             y_spec, v_spec = self.inference_vr(self.loading_mix(), device, self.aggressiveness)
-            if not self.is_vocal_split_model:
-                self.cache_source((y_spec, v_spec))
             self.write_to_console(DONE, base_text='')

-        if self.is_secondary_model_activated and self.secondary_model:
-            self.secondary_source_primary, self.secondary_source_secondary = process_secondary_model(self.secondary_model, self.process_data, main_process_method=self.process_method, main_model_primary=self.primary_stem)

         if not self.is_secondary_stem_only:
             primary_stem_path = os.path.join(self.export_path, f'{self.audio_file_base}_({self.primary_stem}).wav')
             if not isinstance(self.primary_source, np.ndarray):
-                self.primary_source = self.spec_to_wav(y_spec).T
                 if not self.model_samplerate == 44100:
                     self.primary_source = librosa.resample(self.primary_source.T, orig_sr=self.model_samplerate, target_sr=44100).T

-            self.primary_source_map = self.final_process(primary_stem_path, self.primary_source, self.secondary_source_primary, self.primary_stem, 44100)

         if not self.is_primary_stem_only:
             secondary_stem_path = os.path.join(self.export_path, f'{self.audio_file_base}_({self.secondary_stem}).wav')
             if not isinstance(self.secondary_source, np.ndarray):
-                self.secondary_source = self.spec_to_wav(v_spec).T
                 if not self.model_samplerate == 44100:
                     self.secondary_source = librosa.resample(self.secondary_source.T, orig_sr=self.model_samplerate, target_sr=44100).T

-            self.secondary_source_map = self.final_process(secondary_stem_path, self.secondary_source, self.secondary_source_secondary, self.secondary_stem, 44100)

-        clear_gpu_cache()
         secondary_sources = {**self.primary_source_map, **self.secondary_source_map}
-
-        self.process_vocal_split_chain(secondary_sources)
-
         if self.is_secondary_model:
             return secondary_sources

@@ -1092,9 +717,6 @@ class SeperateVR(SeperateAttributes):

         bands_n = len(self.mp.param['band'])

-        audio_file = spec_utils.write_array_to_mem(self.audio_file, subtype=self.wav_type_set)
-        is_mp3 = audio_file.endswith('.mp3') if isinstance(audio_file, str) else False
-
         for d in range(bands_n, 0, -1):
             bp = self.mp.param['band'][d]

 
@@ -1104,25 +726,26 @@ class SeperateVR(SeperateAttributes):
1104
  wav_resolution = bp['res_type']
1105
 
1106
  if d == bands_n: # high-end band
1107
- X_wave[d], _ = librosa.load(audio_file, bp['sr'], False, dtype=np.float32, res_type=wav_resolution)
1108
- X_spec_s[d] = spec_utils.wave_to_spectrogram(X_wave[d], bp['hl'], bp['n_fft'], self.mp, band=d, is_v51_model=self.is_vr_51_model)
1109
 
1110
- if not np.any(X_wave[d]) and is_mp3:
1111
- X_wave[d] = rerun_mp3(audio_file, bp['sr'])
1112
 
1113
  if X_wave[d].ndim == 1:
1114
  X_wave[d] = np.asarray([X_wave[d], X_wave[d]])
1115
  else: # lower bands
1116
  X_wave[d] = librosa.resample(X_wave[d+1], self.mp.param['band'][d+1]['sr'], bp['sr'], res_type=wav_resolution)
1117
- X_spec_s[d] = spec_utils.wave_to_spectrogram(X_wave[d], bp['hl'], bp['n_fft'], self.mp, band=d, is_v51_model=self.is_vr_51_model)
1118
-
 
 
1119
  if d == bands_n and self.high_end_process != 'none':
1120
  self.input_high_end_h = (bp['n_fft']//2 - bp['crop_stop']) + (self.mp.param['pre_filter_stop'] - self.mp.param['pre_filter_start'])
1121
  self.input_high_end = X_spec_s[d][:, bp['n_fft']//2-self.input_high_end_h:bp['n_fft']//2, :]
1122
 
1123
- X_spec = spec_utils.combine_spectrograms(X_spec_s, self.mp, is_v51_model=self.is_vr_51_model)
1124
 
1125
- del X_wave, X_spec_s, audio_file
1126
 
1127
  return X_spec
1128
 
@@ -1160,6 +783,7 @@ class SeperateVR(SeperateAttributes):
             return mask

         def postprocess(mask, X_mag, X_phase):
             is_non_accom_stem = False
             for stem in NON_ACCOM_STEMS:
                 if stem == self.primary_stem:
@@ -1174,7 +798,6 @@ class SeperateVR(SeperateAttributes):
             v_spec = (1 - mask) * X_mag * np.exp(1.j * X_phase)

             return y_spec, v_spec
-
         X_mag, X_phase = spec_utils.preprocess(X_spec)
         n_frame = X_mag.shape[2]
         pad_l, pad_r, roi_size = spec_utils.make_padding(n_frame, self.window_size, self.model_run.offset)
@@ -1198,77 +821,35 @@ class SeperateVR(SeperateAttributes):
         return y_spec, v_spec

     def spec_to_wav(self, spec):
-        if self.high_end_process.startswith('mirroring') and isinstance(self.input_high_end, np.ndarray) and self.input_high_end_h:
             input_high_end_ = spec_utils.mirroring(self.high_end_process, spec, self.input_high_end, self.mp)
-            wav = spec_utils.cmb_spectrogram_to_wave(spec, self.mp, self.input_high_end_h, input_high_end_, is_v51_model=self.is_vr_51_model)
         else:
-            wav = spec_utils.cmb_spectrogram_to_wave(spec, self.mp, is_v51_model=self.is_vr_51_model)

         return wav
-
-def process_secondary_model(secondary_model: ModelData,
-                            process_data,
-                            main_model_primary_stem_4_stem=None,
-                            is_source_load=False,
-                            main_process_method=None,
-                            is_pre_proc_model=False,
-                            is_return_dual=True,
-                            main_model_primary=None):

     if not is_pre_proc_model:
         process_iteration = process_data['process_iteration']
         process_iteration()

     if secondary_model.process_method == VR_ARCH_TYPE:
-        seperator = SeperateVR(secondary_model, process_data, main_model_primary_stem_4_stem=main_model_primary_stem_4_stem, main_process_method=main_process_method, main_model_primary=main_model_primary)
     if secondary_model.process_method == MDX_ARCH_TYPE:
-        if secondary_model.is_mdx_c:
-            seperator = SeperateMDXC(secondary_model, process_data, main_model_primary_stem_4_stem=main_model_primary_stem_4_stem, main_process_method=main_process_method, is_return_dual=is_return_dual, main_model_primary=main_model_primary)
-        else:
-            seperator = SeperateMDX(secondary_model, process_data, main_model_primary_stem_4_stem=main_model_primary_stem_4_stem, main_process_method=main_process_method, main_model_primary=main_model_primary)
     if secondary_model.process_method == DEMUCS_ARCH_TYPE:
-        seperator = SeperateDemucs(secondary_model, process_data, main_model_primary_stem_4_stem=main_model_primary_stem_4_stem, main_process_method=main_process_method, is_return_dual=is_return_dual, main_model_primary=main_model_primary)

     secondary_sources = seperator.seperate()

-    if type(secondary_sources) is dict and not is_source_load and not is_pre_proc_model:
-        return gather_sources(secondary_model.primary_model_primary_stem, secondary_stem(secondary_model.primary_model_primary_stem), secondary_sources)
     else:
         return secondary_sources

-def process_chain_model(secondary_model: ModelData,
-                        process_data,
-                        vocal_stem_path,
-                        master_vocal_source,
-                        master_inst_source=None):
-
-    process_iteration = process_data['process_iteration']
-    process_iteration()
-
-    if secondary_model.bv_model_rebalance:
-        vocal_source = spec_utils.reduce_mix_bv(master_inst_source, master_vocal_source, reduction_rate=secondary_model.bv_model_rebalance)
-    else:
-        vocal_source = master_vocal_source
-
-    vocal_stem_path = [vocal_source, os.path.splitext(os.path.basename(vocal_stem_path))[0]]
-
-    if secondary_model.process_method == VR_ARCH_TYPE:
-        seperator = SeperateVR(secondary_model, process_data, vocal_stem_path=vocal_stem_path, master_inst_source=master_inst_source, master_vocal_source=master_vocal_source)
-    if secondary_model.process_method == MDX_ARCH_TYPE:
-        if secondary_model.is_mdx_c:
-            seperator = SeperateMDXC(secondary_model, process_data, vocal_stem_path=vocal_stem_path, master_inst_source=master_inst_source, master_vocal_source=master_vocal_source)
-        else:
-            seperator = SeperateMDX(secondary_model, process_data, vocal_stem_path=vocal_stem_path, master_inst_source=master_inst_source, master_vocal_source=master_vocal_source)
-    if secondary_model.process_method == DEMUCS_ARCH_TYPE:
-        seperator = SeperateDemucs(secondary_model, process_data, vocal_stem_path=vocal_stem_path, master_inst_source=master_inst_source, master_vocal_source=master_vocal_source)
-
-    secondary_sources = seperator.seperate()
-
-    if type(secondary_sources) is dict:
-        return secondary_sources
-    else:
-        return None
-
 def gather_sources(primary_stem_name, secondary_stem_name, secondary_sources: dict):

     source_primary = False
@@ -1282,23 +863,53 @@ def gather_sources(primary_stem_name, secondary_stem_name, secondary_sources: dict):

     return source_primary, source_secondary

-def prepare_mix(mix):
-
     audio_path = mix

     if not isinstance(mix, np.ndarray):
-        mix, sr = librosa.load(mix, mono=False, sr=44100)
     else:
         mix = mix.T

-    if isinstance(audio_path, str):
-        if not np.any(mix) and audio_path.endswith('.mp3'):
-            mix = rerun_mp3(audio_path)

     if mix.ndim == 1:
         mix = np.asfortranarray([mix,mix])

-    return mix

def rerun_mp3(audio_file, sample_rate=44100):

@@ -1323,137 +934,9 @@ def save_format(audio_path, save_format, mp3_bit_set):
1323
 
1324
  if save_format == MP3:
1325
  audio_path_mp3 = audio_path.replace(".wav", ".mp3")
1326
- try:
1327
- musfile.export(audio_path_mp3, format="mp3", bitrate=mp3_bit_set, codec="libmp3lame")
1328
- except Exception as e:
1329
- print(e)
1330
- musfile.export(audio_path_mp3, format="mp3", bitrate=mp3_bit_set)
1331
 
1332
  try:
1333
  os.remove(audio_path)
1334
  except Exception as e:
1335
  print(e)
1336
-
1337
- def pitch_shift(mix):
-     new_sr = 31183
-
-     # Resample audio file
-     resampled_audio = signal.resample_poly(mix, new_sr, 44100)
-
-     return resampled_audio
-
- def list_to_dictionary(lst):
-     dictionary = {item: index for index, item in enumerate(lst)}
-     return dictionary
-
- def vr_denoiser(X, device, hop_length=1024, n_fft=2048, cropsize=256, is_deverber=False, model_path=None):
-     batchsize = 4
-
-     if is_deverber:
-         nout, nout_lstm = 64, 128
-         mp = ModelParameters(os.path.join('lib_v5', 'vr_network', 'modelparams', '4band_v3.json'))
-         n_fft = mp.param['bins'] * 2
-     else:
-         mp = None
-         hop_length = 1024
-         nout, nout_lstm = 16, 128
-
-     model = nets_new.CascadedNet(n_fft, nout=nout, nout_lstm=nout_lstm)
-     model.load_state_dict(torch.load(model_path, map_location=cpu))
-     model.to(device)
-
-     if mp is None:
-         X_spec = spec_utils.wave_to_spectrogram_old(X, hop_length, n_fft)
-     else:
-         X_spec = loading_mix(X.T, mp)
-
-     # PreProcess
-     X_mag = np.abs(X_spec)
-     X_phase = np.angle(X_spec)
-
-     # Sep
-     n_frame = X_mag.shape[2]
-     pad_l, pad_r, roi_size = spec_utils.make_padding(n_frame, cropsize, model.offset)
-     X_mag_pad = np.pad(X_mag, ((0, 0), (0, 0), (pad_l, pad_r)), mode='constant')
-     X_mag_pad /= X_mag_pad.max()
-
-     X_dataset = []
-     patches = (X_mag_pad.shape[2] - 2 * model.offset) // roi_size
-     for i in range(patches):
-         start = i * roi_size
-         X_mag_crop = X_mag_pad[:, :, start:start + cropsize]
-         X_dataset.append(X_mag_crop)
-
-     X_dataset = np.asarray(X_dataset)
-
-     model.eval()
-
-     with torch.no_grad():
-         mask = []
-         # To reduce the overhead, dataloader is not used.
-         for i in range(0, patches, batchsize):
-             X_batch = X_dataset[i: i + batchsize]
-             X_batch = torch.from_numpy(X_batch).to(device)
-
-             pred = model.predict_mask(X_batch)
-
-             pred = pred.detach().cpu().numpy()
-             pred = np.concatenate(pred, axis=2)
-             mask.append(pred)
-
-         mask = np.concatenate(mask, axis=2)
-
-     mask = mask[:, :, :n_frame]
-
-     # Post Proc
-     if is_deverber:
-         v_spec = mask * X_mag * np.exp(1.j * X_phase)
-         y_spec = (1 - mask) * X_mag * np.exp(1.j * X_phase)
-     else:
-         v_spec = (1 - mask) * X_mag * np.exp(1.j * X_phase)
-
-     if mp is None:
-         wave = spec_utils.spectrogram_to_wave_old(v_spec, hop_length=1024)
-     else:
-         wave = spec_utils.cmb_spectrogram_to_wave(v_spec, mp, is_v51_model=True).T
-
-     wave = spec_utils.match_array_shapes(wave, X)
-
-     if is_deverber:
-         wave_2 = spec_utils.cmb_spectrogram_to_wave(y_spec, mp, is_v51_model=True).T
-         wave_2 = spec_utils.match_array_shapes(wave_2, X)
-         return wave, wave_2
-     else:
-         return wave
-
- def loading_mix(X, mp):
-
-     X_wave, X_spec_s = {}, {}
-
-     bands_n = len(mp.param['band'])
-
-     for d in range(bands_n, 0, -1):
-         bp = mp.param['band'][d]
-
-         if OPERATING_SYSTEM == 'Darwin':
-             wav_resolution = 'polyphase' if SYSTEM_PROC == ARM or ARM in SYSTEM_ARCH else bp['res_type']
-         else:
-             wav_resolution = 'polyphase'  # bp['res_type']
-
-         if d == bands_n:  # high-end band
-             X_wave[d] = X
-         else:  # lower bands
-             X_wave[d] = librosa.resample(X_wave[d+1], mp.param['band'][d+1]['sr'], bp['sr'], res_type=wav_resolution)
-
-         X_spec_s[d] = spec_utils.wave_to_spectrogram(X_wave[d], bp['hl'], bp['n_fft'], mp, band=d, is_v51_model=True)
-
-         # if d == bands_n and is_high_end_process:
-         #     input_high_end_h = (bp['n_fft']//2 - bp['crop_stop']) + (mp.param['pre_filter_stop'] - mp.param['pre_filter_start'])
-         #     input_high_end = X_spec_s[d][:, bp['n_fft']//2-input_high_end_h:bp['n_fft']//2, :]
-
-     X_spec = spec_utils.combine_spectrograms(X_spec_s, mp)
-
-     del X_wave, X_spec_s
-
-     return X_spec
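The removed vr_denoiser predicts a soft magnitude mask and reapplies the mixture phase; the kept and discarded components always sum back to the input. The masking step in isolation, with a synthetic spectrogram and a random mask standing in for model.predict_mask:

import numpy as np

X_spec = np.random.randn(2, 1025, 256) + 1j * np.random.randn(2, 1025, 256)  # (channels, bins, frames)
X_mag, X_phase = np.abs(X_spec), np.angle(X_spec)
mask = np.random.rand(*X_mag.shape)                     # stand-in for model.predict_mask, values in [0, 1]

kept_spec = mask * X_mag * np.exp(1.j * X_phase)        # e.g. the reverb estimate in deverb mode
residual_spec = (1 - mask) * X_mag * np.exp(1.j * X_phase)

assert np.allclose(kept_spec + residual_spec, X_spec)   # masking is a lossless split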
 
  from demucs.pretrained import get_model as _gm
  from demucs.utils import apply_model_v1
  from demucs.utils import apply_model_v2
  from lib_v5 import spec_utils
  from lib_v5.vr_network import nets
  from lib_v5.vr_network import nets_new
+ #from lib_v5.vr_network.model_param_init import ModelParameters
  from pathlib import Path
  from gui_data.constants import *
  from gui_data.error_handling import *
  import audioread
  import gzip
  import librosa

  import warnings
  import pydub
  import soundfile as sf
+ import traceback
  import lib_v5.mdxnet as MdxnetSet
+
  if TYPE_CHECKING:
      from UVR import ModelData
  warnings.filterwarnings("ignore")
  cpu = torch.device('cpu')

  class SeperateAttributes:
+     def __init__(self, model_data: ModelData, process_data: dict, main_model_primary_stem_4_stem=None, main_process_method=None):

          self.list_all_models: list
          self.process_data = process_data
          self.progress_value = 0
          self.set_progress_bar = process_data['set_progress_bar']
          self.write_to_console = process_data['write_to_console']
+         self.audio_file = process_data['audio_file']
+         self.audio_file_base = process_data['audio_file_base']
          self.export_path = process_data['export_path']
          self.cached_source_callback = process_data['cached_source_callback']
          self.cached_model_source_holder = process_data['cached_model_source_holder']
          self.is_4_stem_ensemble = process_data['is_4_stem_ensemble']
          self.list_all_models = process_data['list_all_models']
          self.process_iteration = process_data['process_iteration']
          self.mixer_path = model_data.mixer_path
          self.model_samplerate = model_data.model_samplerate
          self.model_capacity = model_data.model_capacity
          self.is_ensemble_mode = model_data.is_ensemble_mode
          self.secondary_model = model_data.secondary_model #
          self.primary_model_primary_stem = model_data.primary_model_primary_stem
          self.primary_stem = model_data.primary_stem #
          self.secondary_stem = model_data.secondary_stem #
          self.is_invert_spec = model_data.is_invert_spec #
          self.is_mixer_mode = model_data.is_mixer_mode #
          self.secondary_model_scale = model_data.secondary_model_scale #
          self.is_demucs_pre_proc_model_inst_mix = model_data.is_demucs_pre_proc_model_inst_mix #
          self.secondary_source = None
          self.secondary_source_primary = None
          self.secondary_source_secondary = None

+         if not model_data.process_method == DEMUCS_ARCH_TYPE:
+             if process_data['is_ensemble_master'] and not self.is_4_stem_ensemble:
+                 if not model_data.ensemble_primary_stem == self.primary_stem:
+                     self.is_primary_stem_only, self.is_secondary_stem_only = self.is_secondary_stem_only, self.is_primary_stem_only
+
+             if self.is_secondary_model and not process_data['is_ensemble_master']:
+                 if not self.primary_model_primary_stem == self.primary_stem and not main_model_primary_stem_4_stem:
+                     self.is_primary_stem_only, self.is_secondary_stem_only = self.is_secondary_stem_only, self.is_primary_stem_only
+
+             if main_model_primary_stem_4_stem:
+                 self.is_primary_stem_only = True if main_model_primary_stem_4_stem == self.primary_stem else False
+                 self.is_secondary_stem_only = True if not main_model_primary_stem_4_stem == self.primary_stem else False
+
+             if self.is_pre_proc_model:
+                 self.is_primary_stem_only = True if self.primary_stem == INST_STEM else False
+                 self.is_secondary_stem_only = True if self.secondary_stem == INST_STEM else False
          if model_data.process_method == MDX_ARCH_TYPE:
              self.is_mdx_ckpt = model_data.is_mdx_ckpt
              self.primary_model_name, self.primary_sources = self.cached_source_callback(MDX_ARCH_TYPE, model_name=self.model_basename)
+             self.is_denoise = model_data.is_denoise
              self.mdx_batch_size = model_data.mdx_batch_size
              self.compensate = model_data.compensate
+             self.dim_f, self.dim_t = model_data.mdx_dim_f_set, 2**model_data.mdx_dim_t_set
              self.n_fft = model_data.mdx_n_fft_scale_set
              self.chunks = model_data.chunks
              self.margin = model_data.margin
              self.adjust = 1
              self.dim_c = 4
              self.hop = 1024
+
+             if self.is_gpu_conversion >= 0 and torch.cuda.is_available():
+                 self.device, self.run_type = torch.device('cuda:0'), ['CUDAExecutionProvider']
+             else:
+                 self.device, self.run_type = torch.device('cpu'), ['CPUExecutionProvider']
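The MDX path keys both the torch device and the ONNX Runtime provider list off the same GPU check, so tensors and the inference session always land on matching hardware. That pairing in isolation (a sketch, assuming onnxruntime is installed):

import torch

def pick_execution(is_gpu_conversion: int):
    # GPU requested and actually present: CUDA for torch, CUDA EP for ONNX Runtime.
    if is_gpu_conversion >= 0 and torch.cuda.is_available():
        return torch.device('cuda:0'), ['CUDAExecutionProvider']
    return torch.device('cpu'), ['CPUExecutionProvider']

device, run_type = pick_execution(0)
print(device, run_type)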
          if model_data.process_method == DEMUCS_ARCH_TYPE:
              self.demucs_stems = model_data.demucs_stems if not main_process_method in [MDX_ARCH_TYPE, VR_ARCH_TYPE] else None
              self.secondary_model_4_stem = model_data.secondary_model_4_stem
              self.secondary_model_4_stem_scale = model_data.secondary_model_4_stem_scale
+             self.primary_stem = model_data.ensemble_primary_stem if process_data['is_ensemble_master'] else model_data.primary_stem
+             self.secondary_stem = model_data.ensemble_secondary_stem if process_data['is_ensemble_master'] else model_data.secondary_stem
              self.is_chunk_demucs = model_data.is_chunk_demucs
              self.segment = model_data.segment
              self.demucs_version = model_data.demucs_version

              self.is_demucs_combine_stems = model_data.is_demucs_combine_stems
              self.demucs_stem_count = model_data.demucs_stem_count
              self.pre_proc_model = model_data.pre_proc_model

              if self.is_secondary_model and not process_data['is_ensemble_master']:
                  if not self.demucs_stem_count == 2 and model_data.primary_model_primary_stem == INST_STEM:
                      self.primary_stem = VOCAL_STEM
                      self.secondary_stem = INST_STEM
                  else:
                      self.primary_stem = model_data.primary_model_primary_stem
+                     self.secondary_stem = STEM_PAIR_MAPPER[self.primary_stem]
+
+             if self.is_chunk_demucs:
+                 self.chunks_demucs = model_data.chunks_demucs
+                 self.margin_demucs = model_data.margin_demucs
+             else:
+                 self.chunks_demucs = 0
+                 self.margin_demucs = 44100
+
              self.shifts = model_data.shifts
              self.is_split_mode = model_data.is_split_mode if not self.demucs_version == DEMUCS_V4 else True
+             self.overlap = model_data.overlap
              self.primary_model_name, self.primary_sources = self.cached_source_callback(DEMUCS_ARCH_TYPE, model_name=self.model_basename)

          if model_data.process_method == VR_ARCH_TYPE:
              self.primary_model_name, self.primary_sources = self.cached_source_callback(VR_ARCH_TYPE, model_name=self.model_basename)
              self.mp = model_data.vr_model_param
              self.high_end_process = model_data.is_high_end_process

              self.batch_size = model_data.batch_size
              self.window_size = model_data.window_size
              self.input_high_end_h = None
              self.post_process_threshold = model_data.post_process_threshold
              self.aggressiveness = {'value': model_data.aggression_setting,
                                     'split_bin': self.mp.param['band'][1]['crop_stop'],
                                     'aggr_correction': self.mp.param.get('aggr_correction')}

      def start_inference_console_write(self):

+         if self.is_secondary_model and not self.is_pre_proc_model:
              self.write_to_console(INFERENCE_STEP_2_SEC(self.process_method, self.model_basename))

          if self.is_pre_proc_model:
              self.write_to_console(INFERENCE_STEP_2_PRE(self.process_method, self.model_basename))

      def running_inference_console_write(self, is_no_write=False):

          self.write_to_console(DONE, base_text='') if not is_no_write else None
          self.set_progress_bar(0.05) if not is_no_write else None

+         if self.is_secondary_model and not self.is_pre_proc_model:
              self.write_to_console(INFERENCE_STEP_1_SEC)
          elif self.is_pre_proc_model:
              self.write_to_console(INFERENCE_STEP_1_PRE)
          else:
              self.write_to_console(INFERENCE_STEP_1)

          self.set_progress_bar(0.1, (0.8/length*self.progress_value))

+     def load_cached_sources(self, is_4_stem_demucs=False):

          if self.is_secondary_model and not self.is_pre_proc_model:
              self.write_to_console(INFERENCE_STEP_2_SEC_CACHED_MODOEL(self.process_method, self.model_basename))
          elif self.is_pre_proc_model:
              self.write_to_console(INFERENCE_STEP_2_PRE_CACHED_MODOEL(self.process_method, self.model_basename))
          else:
+             self.write_to_console(INFERENCE_STEP_2_PRIMARY_CACHED)
+
+         if not is_4_stem_demucs:
+             primary_stem, secondary_stem = gather_sources(self.primary_stem, self.secondary_stem, self.primary_sources)
+
+             return primary_stem, secondary_stem

      def cache_source(self, secondary_sources):

          if self.process_method == DEMUCS_ARCH_TYPE:
              self.cached_model_source_holder(DEMUCS_ARCH_TYPE, secondary_sources, self.model_basename)
+
+     def write_audio(self, stem_path, stem_source, samplerate, secondary_model_source=None, model_scale=None):
+
          if not self.is_secondary_model:
+             if self.is_secondary_model_activated:
+                 if isinstance(secondary_model_source, np.ndarray):
+                     secondary_model_scale = model_scale if model_scale else self.secondary_model_scale
+                     stem_source = spec_utils.average_dual_sources(stem_source, secondary_model_source, secondary_model_scale)

+             sf.write(stem_path, stem_source, samplerate, subtype=self.wav_type_set)
+             save_format(stem_path, self.save_format, self.mp3_bit_set) if not self.is_ensemble_mode else None

              self.write_to_console(DONE, base_text='')
+             self.set_progress_bar(0.95)

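write_audio blends a secondary model's stem into the primary one through spec_utils.average_dual_sources before writing. A minimal equivalent of such a weighted average, assuming two stems shaped (samples, channels); the exact weighting convention of the real helper may differ:

import numpy as np

def blend_stems(primary: np.ndarray, secondary: np.ndarray, scale: float) -> np.ndarray:
    # Trim to the shorter stem, then mix; scale weights the primary model's output.
    n = min(primary.shape[0], secondary.shape[0])
    return primary[:n] * scale + secondary[:n] * (1 - scale)

a = np.random.randn(44100, 2).astype(np.float32)
b = np.random.randn(44100, 2).astype(np.float32)
print(blend_stems(a, b, 0.9).shape)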
+     def run_mixer(self, mix, sources):
+         try:
+             if self.is_mixer_mode and len(sources) == 4:
+                 mixer = MdxnetSet.Mixer(self.device, self.mixer_path).eval()
+                 with torch.no_grad():
+                     mix = torch.tensor(mix, dtype=torch.float32)
+                     sources_ = torch.tensor(sources).detach()
+                     x = torch.cat([sources_, mix.unsqueeze(0)], 0)
+                     sources_ = mixer(x)
+                     final_source = np.array(sources_)
+             else:
+                 final_source = sources
+         except Exception as e:
+             error_name = f'{type(e).__name__}'
+             traceback_text = ''.join(traceback.format_tb(e.__traceback__))
+             message = f'{error_name}: "{e}"\n{traceback_text}'
+             print('Mixer Failed: ', message)
+             final_source = sources
+
+         return final_source

  class SeperateMDX(SeperateAttributes):

      def seperate(self):
          samplerate = 44100
+
+         if self.primary_model_name == self.model_basename and self.primary_sources:
+             self.primary_source, self.secondary_source = self.load_cached_sources()
          else:
              self.start_inference_console_write()

                  separator = MdxnetSet.ConvTDFNet(**model_params)
                  self.model_run = separator.load_from_checkpoint(self.model_path).to(self.device).eval()
              else:
+                 ort_ = ort.InferenceSession(self.model_path, providers=self.run_type)
+                 self.model_run = lambda spek: ort_.run(None, {'input': spek.cpu().numpy()})[0]

+             self.initialize_model_settings()
              self.running_inference_console_write()
+             mdx_net_cut = True if self.primary_stem in MDX_NET_FREQ_CUT else False
+             mix, raw_mix, samplerate = prepare_mix(self.audio_file, self.chunks, self.margin, mdx_net_cut=mdx_net_cut)
+             source = self.demix_base(mix, is_ckpt=self.is_mdx_ckpt)[0]

              self.write_to_console(DONE, base_text='')

+         if self.is_secondary_model_activated:
+             if self.secondary_model:
+                 self.secondary_source_primary, self.secondary_source_secondary = process_secondary_model(self.secondary_model, self.process_data, main_process_method=self.process_method)

+         if not self.is_secondary_stem_only:
+             self.write_to_console(f'{SAVING_STEM[0]}{self.primary_stem}{SAVING_STEM[1]}') if not self.is_secondary_model else None
+             primary_stem_path = os.path.join(self.export_path, f'{self.audio_file_base}_({self.primary_stem}).wav')
+             if not isinstance(self.primary_source, np.ndarray):
+                 self.primary_source = spec_utils.normalize(source, self.is_normalization).T
+             self.primary_source_map = {self.primary_stem: self.primary_source}
+             self.write_audio(primary_stem_path, self.primary_source, samplerate, self.secondary_source_primary)
+
          if not self.is_primary_stem_only:
+             self.write_to_console(f'{SAVING_STEM[0]}{self.secondary_stem}{SAVING_STEM[1]}') if not self.is_secondary_model else None
              secondary_stem_path = os.path.join(self.export_path, f'{self.audio_file_base}_({self.secondary_stem}).wav')
              if not isinstance(self.secondary_source, np.ndarray):
+                 raw_mix = self.demix_base(raw_mix, is_match_mix=True)[0] if mdx_net_cut else raw_mix
+                 self.secondary_source, raw_mix = spec_utils.normalize_two_stem(source*self.compensate, raw_mix, self.is_normalization)

+                 if self.is_invert_spec:
+                     self.secondary_source = spec_utils.invert_stem(raw_mix, self.secondary_source)
+                 else:
+                     self.secondary_source = (-self.secondary_source.T+raw_mix.T)

+             self.secondary_source_map = {self.secondary_stem: self.secondary_source}
+             self.write_audio(secondary_stem_path, self.secondary_source, samplerate, self.secondary_source_secondary)

+         torch.cuda.empty_cache()
          secondary_sources = {**self.primary_source_map, **self.secondary_source_map}

+         self.cache_source(secondary_sources)
+
+         if self.is_secondary_model:
              return secondary_sources

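The ONNX branch above wraps an InferenceSession in a plain callable so checkpoint and ONNX models share one code path. A sketch of that wrapper, assuming the model's input tensor is named 'input' as in the lambda above (the file name and shape below are placeholders):

import numpy as np
import onnxruntime as ort

def make_model_run(model_path: str, providers=('CPUExecutionProvider',)):
    session = ort.InferenceSession(model_path, providers=list(providers))
    # run(None, feeds) returns every output; the separator only uses the first.
    return lambda spek: session.run(None, {'input': spek})[0]

# model_run = make_model_run('some_mdx_model.onnx')
# spec_pred = model_run(np.zeros((1, 4, 2048, 256), dtype=np.float32))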
      def initialize_model_settings(self):
          self.n_bins = self.n_fft//2+1
          self.trim = self.n_fft//2
+         self.chunk_size = self.hop * (self.dim_t-1)
+         self.window = torch.hann_window(window_length=self.n_fft, periodic=False).to(self.device)
+         self.freq_pad = torch.zeros([1, self.dim_c, self.n_bins-self.dim_f, self.dim_t]).to(self.device)
          self.gen_size = self.chunk_size-2*self.trim

+     def initialize_mix(self, mix, is_ckpt=False):
+         if is_ckpt:
+             pad = self.gen_size + self.trim - ((mix.shape[-1]) % self.gen_size)
+             mixture = np.concatenate((np.zeros((2, self.trim), dtype='float32'), mix, np.zeros((2, pad), dtype='float32')), 1)
+             num_chunks = mixture.shape[-1] // self.gen_size
+             mix_waves = [mixture[:, i * self.gen_size: i * self.gen_size + self.chunk_size] for i in range(num_chunks)]
          else:
+             mix_waves = []
+             n_sample = mix.shape[1]
+             pad = self.gen_size - n_sample%self.gen_size
+             mix_p = np.concatenate((np.zeros((2,self.trim)), mix, np.zeros((2,pad)), np.zeros((2,self.trim))), 1)
+             i = 0
+             while i < n_sample + pad:
+                 waves = np.array(mix_p[:, i:i+self.chunk_size])
+                 mix_waves.append(waves)
+                 i += self.gen_size
+
+             mix_waves = torch.tensor(mix_waves, dtype=torch.float32).to(self.device)

+         return mix_waves, pad
+
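initialize_mix pads the mix so hop-aligned windows tile it exactly, with trim samples of silence on each end; consecutive windows overlap by 2*trim. The non-checkpoint branch in isolation:

import numpy as np

def window_mix(mix: np.ndarray, gen_size: int, trim: int) -> list:
    chunk_size = gen_size + 2 * trim
    n_sample = mix.shape[1]
    pad = gen_size - n_sample % gen_size            # fill out the last partial window
    mix_p = np.concatenate((np.zeros((2, trim)), mix, np.zeros((2, pad)), np.zeros((2, trim))), axis=1)
    return [mix_p[:, i:i + chunk_size] for i in range(0, n_sample + pad, gen_size)]

waves = window_mix(np.random.randn(2, 100000), gen_size=254976, trim=3072)
print(len(waves), waves[0].shape)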
+     def demix_base(self, mix, is_ckpt=False, is_match_mix=False):
+         chunked_sources = []
+         for slice in mix:
+             sources = []
+             tar_waves_ = []
+             mix_p = mix[slice]
+             mix_waves, pad = self.initialize_mix(mix_p, is_ckpt=is_ckpt)
+             mix_waves = mix_waves.split(self.mdx_batch_size)
+             pad = mix_p.shape[-1] if is_ckpt else -pad
              with torch.no_grad():
                  for mix_wave in mix_waves:
+                     self.running_inference_progress_bar(len(mix)*len(mix_waves), is_match_mix=is_match_mix)
+                     tar_waves = self.run_model(mix_wave, is_ckpt=is_ckpt, is_match_mix=is_match_mix)
+                     tar_waves_.append(tar_waves)
+             tar_waves_ = np.vstack(tar_waves_)[:, :, self.trim:-self.trim] if is_ckpt else tar_waves_
+             tar_waves = np.concatenate(tar_waves_, axis=-1)[:, :pad]
+             start = 0 if slice == 0 else self.margin
+             end = None if slice == list(mix.keys())[::-1][0] or self.margin == 0 else -self.margin
+             sources.append(tar_waves[:,start:end]*(1/self.adjust))
+             chunked_sources.append(sources)
+         sources = np.concatenate(chunked_sources, axis=-1)

+         return sources

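demix_base runs each overlapping chunk through the model, then trims the margin from interior chunk edges so the stitched result has no seams. The stitching rule in isolation, assuming outputs keyed by chunk start sample (as prepare_mix below produces):

import numpy as np

def stitch(outputs: dict, margin: int) -> np.ndarray:
    keys = list(outputs.keys())
    pieces = []
    for key, wave in outputs.items():
        start = 0 if key == keys[0] else margin          # keep the leading edge only once
        end = None if key == keys[-1] or margin == 0 else -margin
        pieces.append(wave[:, start:end])
    return np.concatenate(pieces, axis=-1)

chunks = {0: np.random.randn(2, 2000), 1500: np.random.randn(2, 3000), 4500: np.random.randn(2, 1500)}
print(stitch(chunks, margin=500).shape)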
+     def run_model(self, mix, is_ckpt=False, is_match_mix=False):

          spek = self.stft(mix.to(self.device))*self.adjust
          spek[:, :, :3, :] *= 0

          else:
              spec_pred = -self.model_run(-spek)*0.5+self.model_run(spek)*0.5 if self.is_denoise else self.model_run(spek)

+         if is_ckpt:
+             return self.istft(spec_pred).cpu().detach().numpy()
+         else:
+             return self.istft(torch.tensor(spec_pred).to(self.device)).to(cpu)[:,:,self.trim:-self.trim].transpose(0,1).reshape(2, -1).numpy()

+     def stft(self, x):
+         x = x.reshape([-1, self.chunk_size])
+         x = torch.stft(x, n_fft=self.n_fft, hop_length=self.hop, window=self.window, center=True, return_complex=True)
+         x = torch.view_as_real(x)
+         x = x.permute([0,3,1,2])
+         x = x.reshape([-1,2,2,self.n_bins,self.dim_t]).reshape([-1,self.dim_c,self.n_bins,self.dim_t])
+         return x[:,:,:self.dim_f]

+     def istft(self, x, freq_pad=None):
+         freq_pad = self.freq_pad.repeat([x.shape[0],1,1,1]) if freq_pad is None else freq_pad
+         x = torch.cat([x, freq_pad], -2)
+         x = x.reshape([-1,2,2,self.n_bins,self.dim_t]).reshape([-1,2,self.n_bins,self.dim_t])
+         x = x.permute([0,2,3,1])
+         x = x.contiguous()
+         x = torch.view_as_complex(x)
+         x = torch.istft(x, n_fft=self.n_fft, hop_length=self.hop, window=self.window, center=True)
+         return x.reshape([-1,2,self.chunk_size])
+
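The stft/istft pair above packs stereo real and imaginary planes into dim_c=4 channels around an ordinary torch STFT, and run_model's -model_run(-spek) averaging suppresses artifacts that do not flip sign with the input. The underlying transform round-trips cleanly; a standalone check with illustrative sizes in the range MDX models use (not taken from any specific model):

import torch

n_fft, hop, dim_t = 6144, 1024, 256
chunk_size = hop * (dim_t - 1)
window = torch.hann_window(window_length=n_fft, periodic=False)

x = torch.randn(2, chunk_size)                      # one stereo chunk
spec = torch.stft(x, n_fft=n_fft, hop_length=hop, window=window,
                  center=True, return_complex=True)
y = torch.istft(spec, n_fft=n_fft, hop_length=hop, window=window,
                center=True, length=chunk_size)
print(torch.allclose(x, y, atol=1e-4))              # near-lossless round trip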
+ class SeperateDemucs(SeperateAttributes):

      def seperate(self):

          samplerate = 44100
          source = None
          model_scale = None
          stem_source = None
          stem_source_secondary = None
          inst_mix = None
+         inst_raw_mix = None
+         raw_mix = None
          inst_source = None
          is_no_write = False
          is_no_piano_guitar = False
+
+         if self.primary_model_name == self.model_basename and type(self.primary_sources) is dict and not self.pre_proc_model:
+             self.primary_source, self.secondary_source = self.load_cached_sources()
+         elif self.primary_model_name == self.model_basename and isinstance(self.primary_sources, np.ndarray) and not self.pre_proc_model:
              source = self.primary_sources
+             self.load_cached_sources(is_4_stem_demucs=True)
          else:
              self.start_inference_console_write()

+             if self.is_gpu_conversion >= 0:
+                 self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
+             else:
+                 self.device = torch.device('cpu')
+
              if self.demucs_version == DEMUCS_V1:
                  if str(self.model_path).endswith(".gz"):
                      self.model_path = gzip.open(self.model_path, "rb")

                  is_no_write = True
                  self.write_to_console(DONE, base_text='')
                  mix_no_voc = process_secondary_model(self.pre_proc_model, self.process_data, is_pre_proc_model=True)
+                 inst_mix, inst_raw_mix, inst_samplerate = prepare_mix(mix_no_voc[INST_STEM], self.chunks_demucs, self.margin_demucs)
                  self.process_iteration()
                  self.running_inference_console_write(is_no_write=is_no_write)
                  inst_source = self.demix_demucs(inst_mix)
+                 inst_source = self.run_mixer(inst_raw_mix, inst_source)
                  self.process_iteration()

              self.running_inference_console_write(is_no_write=is_no_write) if not self.pre_proc_model else None
+             mix, raw_mix, samplerate = prepare_mix(self.audio_file, self.chunks_demucs, self.margin_demucs)

              if self.primary_model_name == self.model_basename and isinstance(self.primary_sources, np.ndarray) and self.pre_proc_model:
                  source = self.primary_sources
              else:
                  source = self.demix_demucs(mix)
+                 source = self.run_mixer(raw_mix, source)

              self.write_to_console(DONE, base_text='')

              del self.demucs
+             torch.cuda.empty_cache()

          if isinstance(inst_source, np.ndarray):
              source_reshape = spec_utils.reshape_sources(inst_source[self.demucs_source_map[VOCAL_STEM]], source[self.demucs_source_map[VOCAL_STEM]])

              source = inst_source

          if isinstance(source, np.ndarray):
              if len(source) == 2:
                  self.demucs_source_map = DEMUCS_2_SOURCE_MAPPER
              else:

                      other_source += i
                  source_reshape = spec_utils.reshape_sources(source[self.demucs_source_map[OTHER_STEM]], other_source)
                  source[self.demucs_source_map[OTHER_STEM]] = source_reshape
+
+         if (self.demucs_stems == ALL_STEMS and not self.process_data['is_ensemble_master']) or self.is_4_stem_ensemble:
+             self.cache_source(source)
+
          for stem_name, stem_value in self.demucs_source_map.items():
              if self.is_secondary_model_activated and not self.is_secondary_model and not stem_value >= 4:
                  if self.secondary_model_4_stem[stem_value]:
                      model_scale = self.secondary_model_4_stem_scale[stem_value]
+                     stem_source_secondary = process_secondary_model(self.secondary_model_4_stem[stem_value], self.process_data, main_model_primary_stem_4_stem=stem_name, is_4_stem_demucs=True)
                      if isinstance(stem_source_secondary, np.ndarray):
+                         stem_source_secondary = stem_source_secondary[1 if self.secondary_model_4_stem[stem_value].demucs_stem_count == 2 else stem_value]
+                         stem_source_secondary = spec_utils.normalize(stem_source_secondary, self.is_normalization).T
                      elif type(stem_source_secondary) is dict:
                          stem_source_secondary = stem_source_secondary[stem_name]

              stem_source_secondary = None if stem_value >= 4 else stem_source_secondary
+             self.write_to_console(f'{SAVING_STEM[0]}{stem_name}{SAVING_STEM[1]}') if not self.is_secondary_model else None
              stem_path = os.path.join(self.export_path, f'{self.audio_file_base}_({stem_name}).wav')
+             stem_source = spec_utils.normalize(source[stem_value], self.is_normalization).T
+             self.write_audio(stem_path, stem_source, samplerate, secondary_model_source=stem_source_secondary, model_scale=model_scale)
+
          if self.is_secondary_model:
              return source
          else:
+             if self.is_secondary_model_activated:
+                 if self.secondary_model:
                      self.secondary_source_primary, self.secondary_source_secondary = process_secondary_model(self.secondary_model, self.process_data, main_process_method=self.process_method)
+
+             if not self.is_secondary_stem_only:
+                 self.write_to_console(f'{SAVING_STEM[0]}{self.primary_stem}{SAVING_STEM[1]}') if not self.is_secondary_model else None
+                 primary_stem_path = os.path.join(self.export_path, f'{self.audio_file_base}_({self.primary_stem}).wav')
+                 if not isinstance(self.primary_source, np.ndarray):
+                     self.primary_source = spec_utils.normalize(source[self.demucs_source_map[self.primary_stem]], self.is_normalization).T
+                 self.primary_source_map = {self.primary_stem: self.primary_source}
+                 self.write_audio(primary_stem_path, self.primary_source, samplerate, self.secondary_source_primary)
+
              if not self.is_primary_stem_only:
                  def secondary_save(sec_stem_name, source, raw_mixture=None, is_inst_mixture=False):
                      secondary_source = self.secondary_source if not is_inst_mixture else None
+                     self.write_to_console(f'{SAVING_STEM[0]}{sec_stem_name}{SAVING_STEM[1]}') if not self.is_secondary_model else None
                      secondary_stem_path = os.path.join(self.export_path, f'{self.audio_file_base}_({sec_stem_name}).wav')
                      secondary_source_secondary = None

                          secondary_source = np.zeros_like(source[0])
                          for i in source:
                              secondary_source += i
+                         secondary_source = spec_utils.normalize(secondary_source, self.is_normalization).T
                      else:
                          if not isinstance(raw_mixture, np.ndarray):
+                             raw_mixture = prepare_mix(self.audio_file, self.chunks_demucs, self.margin_demucs, is_missing_mix=True)

+                         secondary_source, raw_mixture = spec_utils.normalize_two_stem(source[self.demucs_source_map[self.primary_stem]], raw_mixture, self.is_normalization)

                          if self.is_invert_spec:
                              secondary_source = spec_utils.invert_stem(raw_mixture, secondary_source)

                      if not is_inst_mixture:
                          self.secondary_source = secondary_source
                          secondary_source_secondary = self.secondary_source_secondary
                          self.secondary_source_map = {self.secondary_stem: self.secondary_source}

+                     self.write_audio(secondary_stem_path, secondary_source, samplerate, secondary_source_secondary)

+                 secondary_save(self.secondary_stem, source, raw_mixture=raw_mix)

                  if self.is_demucs_pre_proc_model_inst_mix and self.pre_proc_model and not self.is_4_stem_ensemble:
+                     secondary_save(f"{self.secondary_stem} {INST_STEM}", source, raw_mixture=inst_raw_mix, is_inst_mixture=True)

              secondary_sources = {**self.primary_source_map, **self.secondary_source_map}
+
+             self.cache_source(secondary_sources)

          if self.is_secondary_model:
              return secondary_sources

      def demix_demucs(self, mix):
          processed = {}
+
+         set_progress_bar = None if self.is_chunk_demucs else self.set_progress_bar
+
+         for nmix in mix:
+             self.progress_value += 1
+             self.set_progress_bar(0.1, (0.8/len(mix)*self.progress_value)) if self.is_chunk_demucs else None
+             cmix = mix[nmix]
+             cmix = torch.tensor(cmix, dtype=torch.float32)
+             ref = cmix.mean(0)
+             cmix = (cmix - ref.mean()) / ref.std()
+             mix_infer = cmix
+
+             with torch.no_grad():
+                 if self.demucs_version == DEMUCS_V1:
+                     sources = apply_model_v1(self.demucs,
+                                              mix_infer.to(self.device),
+                                              self.shifts,
+                                              self.is_split_mode,
+                                              set_progress_bar=set_progress_bar)
+                 elif self.demucs_version == DEMUCS_V2:
+                     sources = apply_model_v2(self.demucs,
+                                              mix_infer.to(self.device),
+                                              self.shifts,
+                                              self.is_split_mode,
+                                              self.overlap,
+                                              set_progress_bar=set_progress_bar)
+                 else:
+                     sources = apply_model(self.demucs,
+                                           mix_infer[None],
                                            self.shifts,
                                            self.is_split_mode,
                                            self.overlap,
+                                           static_shifts=1 if self.shifts == 0 else self.shifts,
+                                           set_progress_bar=set_progress_bar,
+                                           device=self.device)[0]
+
+             sources = (sources * ref.std() + ref.mean()).cpu().numpy()
+             sources[[0,1]] = sources[[1,0]]
+             start = 0 if nmix == 0 else self.margin_demucs
+             end = None if nmix == list(mix.keys())[::-1][0] else -self.margin_demucs
+             if self.margin_demucs == 0:
+                 end = None
+             processed[nmix] = sources[:,:,start:end].copy()
+         sources = list(processed.values())
          sources = np.concatenate(sources, axis=-1)

          return sources

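demix_demucs standardizes each chunk against its mono reference before inference and inverts the transform afterwards, so the model always sees zero-mean, unit-variance input. The pair in isolation:

import torch

cmix = torch.randn(2, 44100)               # one stereo chunk
ref = cmix.mean(0)                         # mono reference
normed = (cmix - ref.mean()) / ref.std()

# ... run the separation model on `normed` ...

restored = normed * ref.std() + ref.mean()
print(torch.allclose(cmix, restored, atol=1e-5))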
  class SeperateVR(SeperateAttributes):

      def seperate(self):
+         if self.primary_model_name == self.model_basename and self.primary_sources:
+             self.primary_source, self.secondary_source = self.load_cached_sources()
          else:
              self.start_inference_console_write()
+             if self.is_gpu_conversion >= 0:
+                 if OPERATING_SYSTEM == 'Darwin':
+                     device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')
+                 else:
+                     device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
+             else:
+                 device = torch.device('cpu')

          nn_arch_sizes = [
              31191, # default

          nn_arch_size = min(nn_arch_sizes, key=lambda x:abs(x-model_size))

          if nn_arch_size in vr_5_1_models or self.is_vr_51_model:
+             self.model_run = nets_new.CascadedNet(self.mp.param['bins'] * 2, nn_arch_size, nout=self.model_capacity[0], nout_lstm=self.model_capacity[1])
          else:
              self.model_run = nets.determine_model_capacity(self.mp.param['bins'] * 2, nn_arch_size)

          self.running_inference_console_write()

          y_spec, v_spec = self.inference_vr(self.loading_mix(), device, self.aggressiveness)
          self.write_to_console(DONE, base_text='')

+         if self.is_secondary_model_activated:
+             if self.secondary_model:
+                 self.secondary_source_primary, self.secondary_source_secondary = process_secondary_model(self.secondary_model, self.process_data, main_process_method=self.process_method)

          if not self.is_secondary_stem_only:
+             self.write_to_console(f'{SAVING_STEM[0]}{self.primary_stem}{SAVING_STEM[1]}') if not self.is_secondary_model else None
              primary_stem_path = os.path.join(self.export_path, f'{self.audio_file_base}_({self.primary_stem}).wav')
              if not isinstance(self.primary_source, np.ndarray):
+                 self.primary_source = spec_utils.normalize(self.spec_to_wav(y_spec), self.is_normalization).T
                  if not self.model_samplerate == 44100:
                      self.primary_source = librosa.resample(self.primary_source.T, orig_sr=self.model_samplerate, target_sr=44100).T

+             self.primary_source_map = {self.primary_stem: self.primary_source}
+
+             self.write_audio(primary_stem_path, self.primary_source, 44100, self.secondary_source_primary)

          if not self.is_primary_stem_only:
+             self.write_to_console(f'{SAVING_STEM[0]}{self.secondary_stem}{SAVING_STEM[1]}') if not self.is_secondary_model else None
              secondary_stem_path = os.path.join(self.export_path, f'{self.audio_file_base}_({self.secondary_stem}).wav')
              if not isinstance(self.secondary_source, np.ndarray):
+                 self.secondary_source = spec_utils.normalize(self.spec_to_wav(v_spec), self.is_normalization).T
                  if not self.model_samplerate == 44100:
                      self.secondary_source = librosa.resample(self.secondary_source.T, orig_sr=self.model_samplerate, target_sr=44100).T

+             self.secondary_source_map = {self.secondary_stem: self.secondary_source}
+
+             self.write_audio(secondary_stem_path, self.secondary_source, 44100, self.secondary_source_secondary)

+         torch.cuda.empty_cache()
          secondary_sources = {**self.primary_source_map, **self.secondary_source_map}
+         self.cache_source(secondary_sources)

          if self.is_secondary_model:
              return secondary_sources

          bands_n = len(self.mp.param['band'])

          for d in range(bands_n, 0, -1):
              bp = self.mp.param['band'][d]

                  wav_resolution = bp['res_type']

              if d == bands_n: # high-end band
+                 X_wave[d], _ = librosa.load(self.audio_file, bp['sr'], False, dtype=np.float32, res_type=wav_resolution)

+                 if not np.any(X_wave[d]) and self.audio_file.endswith('.mp3'):
+                     X_wave[d] = rerun_mp3(self.audio_file, bp['sr'])

                  if X_wave[d].ndim == 1:
                      X_wave[d] = np.asarray([X_wave[d], X_wave[d]])
              else: # lower bands
                  X_wave[d] = librosa.resample(X_wave[d+1], self.mp.param['band'][d+1]['sr'], bp['sr'], res_type=wav_resolution)
+
+             X_spec_s[d] = spec_utils.wave_to_spectrogram_mt(X_wave[d], bp['hl'], bp['n_fft'], self.mp.param['mid_side'],
+                                                             self.mp.param['mid_side_b2'], self.mp.param['reverse'])
+
              if d == bands_n and self.high_end_process != 'none':
                  self.input_high_end_h = (bp['n_fft']//2 - bp['crop_stop']) + (self.mp.param['pre_filter_stop'] - self.mp.param['pre_filter_start'])
                  self.input_high_end = X_spec_s[d][:, bp['n_fft']//2-self.input_high_end_h:bp['n_fft']//2, :]

+         X_spec = spec_utils.combine_spectrograms(X_spec_s, self.mp)

+         del X_wave, X_spec_s

          return X_spec

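loading_mix derives each lower band by resampling the band above it rather than the 44.1 kHz source, so every band only steps down once. The cascade in isolation, with made-up per-band rates (real rates come from the model's modelparams JSON):

import numpy as np
import librosa

band_sr = {4: 44100, 3: 22050, 2: 11025, 1: 7350}   # illustrative band layout
X_wave = {4: np.random.randn(2, 44100).astype(np.float32)}

for d in range(3, 0, -1):
    X_wave[d] = librosa.resample(X_wave[d + 1], orig_sr=band_sr[d + 1],
                                 target_sr=band_sr[d], res_type='polyphase')

print({d: w.shape for d, w in X_wave.items()})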
              return mask

          def postprocess(mask, X_mag, X_phase):
+
              is_non_accom_stem = False
              for stem in NON_ACCOM_STEMS:
                  if stem == self.primary_stem:

              v_spec = (1 - mask) * X_mag * np.exp(1.j * X_phase)

              return y_spec, v_spec

          X_mag, X_phase = spec_utils.preprocess(X_spec)
          n_frame = X_mag.shape[2]
          pad_l, pad_r, roi_size = spec_utils.make_padding(n_frame, self.window_size, self.model_run.offset)

          return y_spec, v_spec

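inference_vr pads the spectrogram so windows of window_size tile the frames while accounting for the model's context offset. A common formulation of such a helper; the in-repo spec_utils.make_padding may differ in detail:

def make_padding(n_frame: int, cropsize: int, offset: int):
    left = offset
    roi_size = cropsize - offset * 2      # effective stride after trimming model context
    if roi_size == 0:
        roi_size = cropsize
    right = roi_size - (n_frame % roi_size) + left
    return left, right, roi_size

print(make_padding(n_frame=430, cropsize=512, offset=128))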
      def spec_to_wav(self, spec):

+         if self.high_end_process.startswith('mirroring'):
              input_high_end_ = spec_utils.mirroring(self.high_end_process, spec, self.input_high_end, self.mp)
+             wav = spec_utils.cmb_spectrogram_to_wave(spec, self.mp, self.input_high_end_h, input_high_end_)
          else:
+             wav = spec_utils.cmb_spectrogram_to_wave(spec, self.mp)

          return wav
+
+ def process_secondary_model(secondary_model: ModelData, process_data, main_model_primary_stem_4_stem=None, is_4_stem_demucs=False, main_process_method=None, is_pre_proc_model=False):

      if not is_pre_proc_model:
          process_iteration = process_data['process_iteration']
          process_iteration()

      if secondary_model.process_method == VR_ARCH_TYPE:
+         seperator = SeperateVR(secondary_model, process_data, main_model_primary_stem_4_stem=main_model_primary_stem_4_stem, main_process_method=main_process_method)
      if secondary_model.process_method == MDX_ARCH_TYPE:
+         seperator = SeperateMDX(secondary_model, process_data, main_model_primary_stem_4_stem=main_model_primary_stem_4_stem, main_process_method=main_process_method)
      if secondary_model.process_method == DEMUCS_ARCH_TYPE:
+         seperator = SeperateDemucs(secondary_model, process_data, main_model_primary_stem_4_stem=main_model_primary_stem_4_stem, main_process_method=main_process_method)

      secondary_sources = seperator.seperate()

+     if type(secondary_sources) is dict and not is_4_stem_demucs and not is_pre_proc_model:
+         return gather_sources(secondary_model.primary_model_primary_stem, STEM_PAIR_MAPPER[secondary_model.primary_model_primary_stem], secondary_sources)
      else:
          return secondary_sources

  def gather_sources(primary_stem_name, secondary_stem_name, secondary_sources: dict):

      source_primary = False

      return source_primary, source_secondary

+ def prepare_mix(mix, chunk_set, margin_set, mdx_net_cut=False, is_missing_mix=False):
+
      audio_path = mix
+     samplerate = 44100

      if not isinstance(mix, np.ndarray):
+         mix, samplerate = librosa.load(mix, mono=False, sr=44100)
      else:
          mix = mix.T

+     if not np.any(mix) and audio_path.endswith('.mp3'):
+         mix = rerun_mp3(audio_path)

      if mix.ndim == 1:
          mix = np.asfortranarray([mix,mix])

+     def get_segmented_mix(chunk_set=chunk_set):
+         segmented_mix = {}
+
+         samples = mix.shape[-1]
+         margin = margin_set
+         chunk_size = chunk_set*44100
+         assert not margin == 0, 'margin cannot be zero!'
+
+         if margin > chunk_size:
+             margin = chunk_size
+         if chunk_set == 0 or samples < chunk_size:
+             chunk_size = samples
+
+         counter = -1
+         for skip in range(0, samples, chunk_size):
+             counter += 1
+             s_margin = 0 if counter == 0 else margin
+             end = min(skip+chunk_size+margin, samples)
+             start = skip-s_margin
+             segmented_mix[skip] = mix[:,start:end].copy()
+             if end == samples:
+                 break
+
+         return segmented_mix
+
+     if is_missing_mix:
+         return mix
+     else:
+         segmented_mix = get_segmented_mix()
+         raw_mix = get_segmented_mix(chunk_set=0) if mdx_net_cut else mix
+         return segmented_mix, raw_mix, samplerate
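A usage sketch of the chunking in prepare_mix above: chunks are keyed by their start sample and extended by margin samples on each side, which demix_base and demix_demucs later trim back off. Standalone, with a hypothetical 60-second stereo mix:

import numpy as np

def segment(mix: np.ndarray, chunk_seconds: int, margin: int, sr: int = 44100) -> dict:
    samples = mix.shape[-1]
    chunk_size = min(chunk_seconds * sr or samples, samples)
    margin = min(margin, chunk_size)
    out, counter = {}, -1
    for skip in range(0, samples, chunk_size):
        counter += 1
        start = skip - (0 if counter == 0 else margin)
        end = min(skip + chunk_size + margin, samples)
        out[skip] = mix[:, start:end].copy()
        if end == samples:
            break
    return out

mix = np.random.randn(2, 60 * 44100).astype(np.float32)
print([(k, v.shape[-1]) for k, v in segment(mix, chunk_seconds=15, margin=44100).items()])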
  def rerun_mp3(audio_file, sample_rate=44100):

      if save_format == MP3:
          audio_path_mp3 = audio_path.replace(".wav", ".mp3")
+         musfile.export(audio_path_mp3, format="mp3", bitrate=mp3_bit_set)

          try:
              os.remove(audio_path)
          except Exception as e:
              print(e)