|
import cv2 |
|
import librosa |
|
import numpy as np |
|
import albumentations |
|
from albumentations import (Compose, ImageCompression, GaussNoise, HorizontalFlip, |
|
PadIfNeeded, OneOf,ToGray, ShiftScaleRotate, GaussianBlur, |
|
RandomBrightnessContrast, FancyPCA, HueSaturationValue, BasicTransform) |
|
|
|
|
|
class AudioTransform(BasicTransform):
    """Base class for audio transforms.

    Overrides ``targets`` and ``update_params`` so that subclasses only have
    to implement ``apply`` for the ``data`` target.
    """

    @property
    def targets(self):
        """Return the target mapping: the ``data`` key is handled by ``apply``."""
        return dict(data=self.apply)

    def update_params(self, params, **kwargs):
        """Copy optional instance settings into *params* and return it.

        ``interpolation`` and ``fill_value`` are forwarded only when the
        instance defines them; ``kwargs`` is accepted but not used here.
        """
        for attr_name in ("interpolation", "fill_value"):
            if hasattr(self, attr_name):
                params[attr_name] = getattr(self, attr_name)
        return params
|
|
|
class TimeShifting(AudioTransform):
    """Shift an audio signal in time by a random number of samples.

    The samples vacated by the shift are filled with low-amplitude uniform
    noise drawn from [-0.001, 0.001).

    Args:
        always_apply: forwarded to the base transform.
        p: forwarded to the base transform.
        max_shift: largest shift, in samples, in either direction. Defaults
            to 80000, the value that used to be hard-coded.
    """

    def __init__(self, always_apply=False, p=0.5, max_shift=80000):
        # Keyword arguments keep this call independent of the positional
        # order of the base-class constructor.
        super(TimeShifting, self).__init__(always_apply=always_apply, p=p)
        self.max_shift = max_shift

    def apply(self, data, **params):
        """Return *data* shifted by a random offset, with unchanged length.

        data : ndarray of audio timeseries

        A non-negative offset drops samples from the start and pads the end;
        a negative offset pads the start and drops samples from the end.
        """
        n_samples = len(data)
        shift = int(np.random.uniform(-self.max_shift, self.max_shift))
        # Clamp the offset to the clip length: without this, a shift larger
        # than the clip yields an empty slice plus |shift| padding samples,
        # i.e. an output longer than the input.
        shift = max(-n_samples, min(n_samples, shift))

        if shift >= 0:
            padding = np.random.uniform(-0.001, 0.001, shift)
            return np.r_[data[shift:], padding]

        padding = np.random.uniform(-0.001, 0.001, -shift)
        return np.r_[padding, data[:shift]]
|
|
|
class PitchShift(AudioTransform):
    """Shift the pitch of an audio signal by a fixed number of semitones.

    Args:
        always_apply: forwarded to the base transform.
        p: forwarded to the base transform.
        n_steps: number of semitones to shift by. Must be set (not ``None``)
            before the transform is applied.
        sr: sample rate handed to ``librosa.effects.pitch_shift``. Defaults
            to 16000, the value that used to be hard-coded.
    """

    def __init__(self, always_apply=False, p=0.5, n_steps=None, sr=16000):
        # Keyword arguments keep this call independent of the positional
        # order of the base-class constructor.
        super(PitchShift, self).__init__(always_apply=always_apply, p=p)
        self.n_steps = n_steps
        self.sr = sr

    def apply(self, data, **params):
        """Return *data* pitch-shifted by ``self.n_steps`` semitones.

        data : ndarray of audio timeseries

        Raises:
            TypeError: if ``n_steps`` was never set.
        """
        if self.n_steps is None:
            # Fail with a clear message instead of an opaque error from
            # inside librosa.
            raise TypeError("PitchShift requires n_steps to be a number, got None")
        return librosa.effects.pitch_shift(data, sr=self.sr, n_steps=self.n_steps)
|
|
|
|
|
class AddGaussianNoise(AudioTransform):
    """Add scaled standard-normal (Gaussian) noise to an audio signal.

    Args:
        always_apply: forwarded to the base transform.
        p: forwarded to the base transform.
        noise_level: factor the standard-normal noise is multiplied by before
            it is added. Defaults to 0.005, the value that used to be
            hard-coded.
    """

    def __init__(self, always_apply=False, p=0.5, noise_level=0.005):
        # Keyword arguments keep this call independent of the positional
        # order of the base-class constructor.
        super(AddGaussianNoise, self).__init__(always_apply=always_apply, p=p)
        self.noise_level = noise_level

    def apply(self, data, **params):
        """Return ``data + noise_level * noise`` with one noise draw per sample.

        data : ndarray of audio timeseries
        """
        # Size the noise from the full shape rather than len(data) so that
        # inputs with more than one dimension get one draw per element; for
        # 1-D input this is the same as before.
        noise = np.random.standard_normal(np.shape(data))
        return data + self.noise_level * noise
|
|
|
|
|
# Image augmentation pipeline; the transforms are composed in the order listed.
create_frame_transforms = albumentations.Compose(
    [
        ImageCompression(p=0.5, quality_lower=60, quality_upper=100),
        GaussNoise(p=0.1),
        GaussianBlur(p=0.05, blur_limit=3),
        HorizontalFlip(),
        PadIfNeeded(
            border_mode=cv2.BORDER_CONSTANT,
            min_height=256,
            min_width=256,
        ),
        # Colour transforms grouped under OneOf with p=0.7.
        OneOf(
            [
                RandomBrightnessContrast(),
                FancyPCA(),
                HueSaturationValue(),
            ],
            p=0.7,
        ),
        ToGray(p=0.2),
        ShiftScaleRotate(
            p=0.5,
            border_mode=cv2.BORDER_CONSTANT,
            shift_limit=0.1,
            scale_limit=0.2,
            rotate_limit=10,
        ),
    ]
)
|
|
|
|
|
|
|
# Audio augmentation pipeline: time shift, additive Gaussian noise, then a
# pitch shift with n_steps=4.
create_spec_transforms = Compose(
    [
        TimeShifting(p=0.9),
        AddGaussianNoise(p=0.8),
        PitchShift(n_steps=4, p=0.5),
    ]
)
|
|