|
""" |
|
Implementation of the 'audio effects chain normalization' |
|
""" |
|
import numpy as np |
|
import scipy |
|
|
|
import os |
|
import sys |
|
currentdir = os.path.dirname(os.path.realpath(__file__)) |
|
sys.path.append(currentdir) |
|
from utils_data_normalization import * |
|
from normalization_imager import * |
|
|
|
|
|
''' |
|
Audio Effects Chain Normalization |
|
process: normalizes input stems according to given precomputed features |
|
''' |
|
class Audio_Effects_Normalizer:
    """Audio effects chain normalization.

    Normalizes input stems according to precomputed average features
    (loaded from ``precomputed_feature_path``), applying a chain of
    effects: eq -> compression -> imager -> loudness (by default).

    Parameters
    ----------
    precomputed_feature_path : str
        Path to a ``.npy`` file holding a pickled dict of per-effect,
        per-stem target features (a 0-d object array).
    STEMS : sequence of str
        Stem names this normalizer handles.
    EFFECTS : sequence of str
        Effects applied, in order, by :meth:`normalize_audio`.
    """

    def __init__(self, precomputed_feature_path,
                 STEMS=('drums', 'bass', 'other', 'vocals'),
                 EFFECTS=('eq', 'compression', 'imager', 'loudness')):
        # Tuples as defaults avoid the shared-mutable-default pitfall;
        # convert to lists so the attribute type matches prior behavior.
        self.STEMS = list(STEMS)      # stems to be normalized
        self.EFFECTS = list(EFFECTS)  # effects chain, applied in order

        # Audio I/O settings.
        self.SR = 44100
        self.SUBTYPE = 'PCM_16'

        # STFT analysis settings.
        self.FFT_SIZE = 2 ** 16
        self.HOP_LENGTH = self.FFT_SIZE // 4

        # EQ-matching / loudness settings.
        self.NTAPS = 1001   # FIR filter taps for EQ matching
        self.LUFS = -30     # loudness target used during analysis
        self.MIN_DB = -40   # silence threshold: below this, skip processing

        # Compressor settings shared by all stems.
        self.COMP_USE_EXPANDER = False
        self.COMP_PEAK_NORM = -10.0
        self.COMP_TRUE_PEAK = False
        self.COMP_PERCENTILE = 75
        self.COMP_MIN_TH = -40
        self.COMP_MAX_RATIO = 20

        # Per-stem compressor settings (attack/release presumably in ms —
        # confirm against get_comp_matching). Stems not listed here keep an
        # empty settings dict, matching the original if/elif behavior.
        per_stem = {
            'vocals': {'attack': 7.5, 'release': 400.0, 'ratio': 4, 'n_mels': 128},
            'drums':  {'attack': 10.0, 'release': 180.0, 'ratio': 6, 'n_mels': 128},
            'bass':   {'attack': 10.0, 'release': 500.0, 'ratio': 5, 'n_mels': 16},
            'other':  {'attack': 15.0, 'release': 666.0, 'ratio': 4, 'n_mels': 128},
        }
        self.comp_settings = {key: dict(per_stem.get(key, {})) for key in self.STEMS}

        # Load the precomputed target features and smooth them.
        # allow_pickle=True (boolean, not the string 'TRUE' which only
        # worked by truthiness): the file stores a pickled dict wrapped in
        # a 0-d object array; indexing with [()] unwraps it.
        features_mean = np.load(precomputed_feature_path, allow_pickle=True)[()]
        self.features_mean = self.smooth_feature(features_mean)

    def normalize_audio(self, audio, src):
        """Apply the full effects chain to ``audio`` for stem ``src``.

        Parameters
        ----------
        audio : np.ndarray
            Audio samples shaped (samples, channels).
        src : str
            Stem name; must be one of ``self.STEMS``.

        Returns
        -------
        np.ndarray
            The normalized audio after every effect in ``self.EFFECTS``.
        """
        assert src in self.STEMS, f'unknown stem: {src!r}'

        normalized_audio = audio
        for cur_effect in self.EFFECTS:
            normalized_audio = self.normalize_audio_per_effect(normalized_audio, src=src, effect=cur_effect)

        return normalized_audio

    def normalize_audio_per_effect(self, audio, src, effect):
        """Apply a single normalization ``effect`` to ``audio`` for stem ``src``.

        The input is zero-padded by FFT_SIZE samples on both ends (so the
        matching filters can warm up), processed, then cropped back to the
        original length. Mono input is duplicated to stereo; note the
        returned audio stays stereo in that case (preserves prior behavior).
        """
        audio = audio.astype(dtype=np.float32)
        # Pad along time axis only; shape must be (samples, channels).
        audio_track = np.pad(audio, ((self.FFT_SIZE, self.FFT_SIZE), (0, 0)), mode='constant')

        assert len(audio_track.shape) == 2

        if audio_track.shape[1] == 1:
            # Mono -> duplicate channel so stereo processing applies.
            audio_track = np.repeat(audio_track, 2, axis=-1)

        output_audio = audio_track.copy()

        # Skip processing entirely for (near-)silent input.
        max_db = amp_to_db(np.max(np.abs(output_audio)))
        if max_db > self.MIN_DB:

            if effect == 'eq':
                # Match each channel's spectrum to the precomputed target.
                for ch in range(audio_track.shape[1]):
                    audio_eq_matched = get_eq_matching(output_audio[:, ch],
                                                       self.features_mean[effect][src],
                                                       sr=self.SR,
                                                       n_fft=self.FFT_SIZE,
                                                       hop_length=self.HOP_LENGTH,
                                                       min_db=self.MIN_DB,
                                                       ntaps=self.NTAPS,
                                                       lufs=self.LUFS)
                    np.copyto(output_audio[:, ch], audio_eq_matched)

            elif effect == 'compression':
                # Expects two precomputed values (e.g. threshold pair) —
                # TODO confirm semantics against get_comp_matching.
                assert (len(self.features_mean[effect][src]) == 2)

                for ch in range(audio_track.shape[1]):
                    try:
                        audio_comp_matched = get_comp_matching(output_audio[:, ch],
                                                               self.features_mean[effect][src][0],
                                                               self.features_mean[effect][src][1],
                                                               self.comp_settings[src]['ratio'],
                                                               self.comp_settings[src]['attack'],
                                                               self.comp_settings[src]['release'],
                                                               sr=self.SR,
                                                               min_db=self.MIN_DB,
                                                               min_th=self.COMP_MIN_TH,
                                                               comp_peak_norm=self.COMP_PEAK_NORM,
                                                               max_ratio=self.COMP_MAX_RATIO,
                                                               n_mels=self.comp_settings[src]['n_mels'],
                                                               true_peak=self.COMP_TRUE_PEAK,
                                                               percentile=self.COMP_PERCENTILE,
                                                               expander=self.COMP_USE_EXPANDER)

                        np.copyto(output_audio[:, ch], audio_comp_matched[:, 0])
                    except Exception:
                        # Best-effort: if compression matching fails, leave the
                        # remaining channels untouched (was a bare `except:`,
                        # which also swallowed KeyboardInterrupt/SystemExit).
                        break

            elif effect == 'loudness':
                output_audio = fx_utils.lufs_normalize(output_audio, self.SR, self.features_mean[effect][src], log=False)

            elif effect == 'imager':
                # Bass gets a stricter mono threshold to keep low end centered.
                mono_threshold = 0.99 if src == 'bass' else 0.975
                audio_imager_matched = normalize_imager(output_audio,
                                                        target_side_mid_bal=self.features_mean[effect][src],
                                                        mono_threshold=mono_threshold,
                                                        sr=self.SR)

                np.copyto(output_audio, audio_imager_matched)

        # Crop the padding back off: keep exactly the original sample count.
        output_audio = output_audio[self.FFT_SIZE:self.FFT_SIZE + audio.shape[0]]
        return output_audio

    def smooth_feature(self, feature_dict_):
        """Smooth the precomputed feature curves in place with a Savitzky-Golay
        filter (degree 1), using wider windows for spectrally busier stems.

        Returns the (mutated) ``feature_dict_``.
        """
        for effect in self.EFFECTS:
            for key in self.STEMS:
                if effect == 'eq':
                    # 'other'/'vocals' targets get heavier smoothing.
                    f = 401 if key in ['other', 'vocals'] else 151
                    feature_dict_[effect][key] = scipy.signal.savgol_filter(feature_dict_[effect][key],
                                                                            f, 1, mode='mirror')
                elif effect == 'panning':
                    feature_dict_[effect][key] = scipy.signal.savgol_filter(feature_dict_[effect][key],
                                                                            501, 1, mode='mirror')
        return feature_dict_
|
|
|
|