import torch import torchaudio import numpy as np def gain(xs, min_dB=-12, max_dB=12): gain_dB = (torch.rand(1) * (max_dB - min_dB)) + min_dB gain_ln = 10 ** (gain_dB / 20) for idx, x in enumerate(xs): xs[idx] = x * gain_ln return xs def peaking_filter(xs, sr=44100, frequency=1000, width_q=0.707, gain_db=12): # gain_db = ((torch.rand(1) * 6) + 6).numpy().squeeze() # width_q = (torch.rand(1) * 4).numpy().squeeze() # frequency = ((torch.rand(1) * 9960) + 40).numpy().squeeze() # if torch.rand(1) > 0.5: # gain_db = -gain_db effects = [["equalizer", f"{frequency}", f"{width_q}", f"{gain_db}"]] for idx, x in enumerate(xs): y, sr = torchaudio.sox_effects.apply_effects_tensor( x, sr, effects, channels_first=True ) xs[idx] = y return xs def pitch_shift(xs, min_shift=-200, max_shift=200, sr=44100): shift = min_shift + (torch.rand(1)).numpy().squeeze() * (max_shift - min_shift) effects = [["pitch", f"{shift}"]] for idx, x in enumerate(xs): y, sr = torchaudio.sox_effects.apply_effects_tensor( x, sr, effects, channels_first=True ) xs[idx] = y return xs def time_stretch(xs, min_stretch=0.8, max_stretch=1.2, sr=44100): stretch = min_stretch + (torch.rand(1)).numpy().squeeze() * ( max_stretch - min_stretch ) effects = [["tempo", f"{stretch}"]] for idx, x in enumerate(xs): y, sr = torchaudio.sox_effects.apply_effects_tensor( x, sr, effects, channels_first=True ) xs[idx] = y return xs def frequency_corruption(xs, sr=44100): effects = [] # apply a random number of peaking bands from 0 to 4s bands = [[200, 2000], [800, 4000], [2000, 8000], [4000, int((sr // 2) * 0.9)]] total_gain_db = 0.0 for band in bands: if torch.rand(1).sum() > 0.2: frequency = (torch.randint(band[0], band[1], [1])).numpy().squeeze() width_q = ((torch.rand(1) * 10) + 0.1).numpy().squeeze() gain_db = ((torch.rand(1) * 48)).numpy().squeeze() if torch.rand(1).sum() > 0.5: gain_db = -gain_db total_gain_db += gain_db if np.abs(total_gain_db) >= 24: continue cmd = ["equalizer", f"{frequency}", f"{width_q}", f"{gain_db}"] effects.append(cmd) # low shelf (bass) if torch.rand(1).sum() > 0.2: gain_db = ((torch.rand(1) * 24)).numpy().squeeze() frequency = (torch.randint(20, 200, [1])).numpy().squeeze() if torch.rand(1).sum() > 0.5: gain_db = -gain_db effects.append(["bass", f"{gain_db}", f"{frequency}"]) # high shelf (treble) if torch.rand(1).sum() > 0.2: gain_db = ((torch.rand(1) * 24)).numpy().squeeze() frequency = (torch.randint(4000, int((sr // 2) * 0.9), [1])).numpy().squeeze() if torch.rand(1).sum() > 0.5: gain_db = -gain_db effects.append(["treble", f"{gain_db}", f"{frequency}"]) for idx, x in enumerate(xs): y, sr = torchaudio.sox_effects.apply_effects_tensor( x.view(1, -1) * 10 ** (-48 / 20), sr, effects, channels_first=True ) # apply gain back y *= 10 ** (48 / 20) xs[idx] = y return xs def dynamic_range_corruption(xs, sr=44100): """Apply an expander.""" attack = (torch.rand([1]).numpy()[0] * 0.05) + 0.001 release = (torch.rand([1]).numpy()[0] * 0.2) + attack knee = (torch.rand([1]).numpy()[0] * 12) + 0.0 # design the compressor transfer function start = -100.0 threshold = -( (torch.rand([1]).numpy()[0] * 20) + 10 ) # threshold from -30 to -10 dB ratio = (torch.rand([1]).numpy()[0] * 4.0) + 1 # ratio from 1:1 to 5:1 # compute the transfer curve point = -((-threshold / -ratio) + (-start / ratio) + -threshold) # apply some makeup gain makeup = torch.rand([1]).numpy()[0] * 6 effects = [ [ "compand", f"{attack},{release}", f"{knee}:{point},{start},{threshold},{threshold}", f"{makeup}", f"{start}", ] ] for idx, x in enumerate(xs): # if the input is clipping normalize it if x.abs().max() >= 1.0: x /= x.abs().max() gain_db = -((torch.rand(1) * 24)).numpy().squeeze() x *= 10 ** (gain_db / 20.0) y, sr = torchaudio.sox_effects.apply_effects_tensor( x.view(1, -1), sr, effects, channels_first=True ) xs[idx] = y return xs def dynamic_range_compression(xs, sr=44100): """Apply a compressor.""" attack = (torch.rand([1]).numpy()[0] * 0.05) + 0.0005 release = (torch.rand([1]).numpy()[0] * 0.2) + attack knee = (torch.rand([1]).numpy()[0] * 12) + 0.0 # design the compressor transfer function start = -100.0 threshold = -((torch.rand([1]).numpy()[0] * 52) + 12) # threshold from -64 to -12 dB ratio = (torch.rand([1]).numpy()[0] * 10.0) + 1 # ratio from 1:1 to 10:1 # compute the transfer curve point = threshold * (1 - (1 / ratio)) # apply some makeup gain makeup = torch.rand([1]).numpy()[0] * 6 effects = [ [ "compand", f"{attack},{release}", f"{knee}:{start},{threshold},{threshold},0,{point}", f"{makeup}", f"{start}", f"{attack}", ] ] for idx, x in enumerate(xs): y, sr = torchaudio.sox_effects.apply_effects_tensor( x.view(1, -1), sr, effects, channels_first=True ) xs[idx] = y return xs def lowpass_filter(xs, sr=44100, frequency=4000): effects = [["lowpass", f"{frequency}"]] for idx, x in enumerate(xs): y, sr = torchaudio.sox_effects.apply_effects_tensor( x, sr, effects, channels_first=True ) xs[idx] = y return xs def apply(xs, sr, augmentations): # iterate over augmentation dict for aug, params in augmentations.items(): if aug == "gain": xs = gain(xs, **params) elif aug == "peak": xs = peaking_filter(xs, **params) elif aug == "lowpass": xs = lowpass_filter(xs, **params) elif aug == "pitch": xs = pitch_shift(xs, **params) elif aug == "tempo": xs = time_stretch(xs, **params) elif aug == "freq_corrupt": xs = frequency_corruption(xs, **params) else: raise RuntimeError("Invalid augmentation: {aug}") return xs