Spaces:
Runtime error
Runtime error
| from typing import Dict | |
| import librosa | |
| import numpy as np | |
| from bytesep.utils import db_to_magnitude, get_pitch_shift_factor, magnitude_to_db | |
class Augmentor:
    r"""Apply a configurable chain of random augmentations to a waveform.

    The augmentations named in the configuration dict are applied in a
    fixed order: pitch shift, magnitude scale, channel swap, polarity flip.
    All randomness is drawn from one seeded ``np.random.RandomState`` so a
    given seed yields a reproducible sequence of augmentations.
    """

    # Order in which the augmentations are applied by __call__. Every name
    # is both a key of the configuration dict and a method of this class.
    _AUGMENTATION_ORDER = (
        'pitch_shift',
        'magnitude_scale',
        'swap_channel',
        'flip_axis',
    )

    def __init__(self, augmentations: Dict, random_seed: int = 1234):
        r"""Augmentor for data augmentation of a waveform.

        Args:
            augmentations: Dict, e.g., {
                'mixaudio': {'vocals': 2, 'accompaniment': 2},
                'pitch_shift': {'vocals': 4, 'accompaniment': 4},
                'magnitude_scale': {
                    'vocals': {'lower_db': -20, 'higher_db': 20},
                    ...,
                },
                ...,
            }
                Only the keys 'pitch_shift', 'magnitude_scale',
                'swap_channel' and 'flip_axis' are read by this class;
                other keys are ignored here.
            random_seed: int, seed of the internal random state.
        """
        self.augmentations = augmentations
        self.random_state = np.random.RandomState(random_seed)

    def __call__(self, waveform: np.ndarray, source_type: str) -> np.ndarray:
        r"""Augment a waveform.

        Args:
            waveform: (channels_num, audio_samples)
            source_type: str, key used to look up per-source settings.

        Returns:
            new_waveform: (channels_num, new_audio_samples)
        """
        for name in self._AUGMENTATION_ORDER:
            if name in self.augmentations:
                waveform = getattr(self, name)(waveform, source_type)

        return waveform

    def pitch_shift(self, waveform: np.ndarray, source_type: str) -> np.ndarray:
        r"""Shift the pitch of a waveform. We use resampling for fast pitch
        shifting, so the speed will also be changed. The length of the
        returned waveform will be changed.

        Args:
            waveform: (channels_num, audio_samples)
            source_type: str

        Returns:
            new_waveform: (channels_num, new_audio_samples)
        """
        # Maximum pitch shift in semitones.
        max_pitch_shift = self.augmentations['pitch_shift'][source_type]

        if max_pitch_shift == 0:  # No pitch shift augmentations.
            return waveform

        # Random pitch shift, drawn from [-max_pitch_shift, max_pitch_shift).
        rand_pitch = self.random_state.uniform(
            low=-max_pitch_shift, high=max_pitch_shift
        )

        # We use librosa.resample instead of librosa.effects.pitch_shift
        # because it is about 10x faster.
        pitch_shift_factor = get_pitch_shift_factor(rand_pitch)

        # Dummy constant: it is only used as orig_sr and, divided by the
        # pitch shift factor, as target_sr.
        dummy_sample_rate = 10000

        is_mono = waveform.shape[0] == 1

        # Index the single channel instead of np.squeeze: squeeze would also
        # collapse the time axis of a (1, 1) input into a 0-d array.
        samples = waveform[0] if is_mono else waveform

        new_waveform = librosa.resample(
            y=samples,
            orig_sr=dummy_sample_rate,
            target_sr=dummy_sample_rate / pitch_shift_factor,
            res_type='linear',
            axis=-1,
        )

        if is_mono:
            # Restore the leading channel axis.
            new_waveform = new_waveform[None, :]

        return new_waveform

    def magnitude_scale(
        self, waveform: np.ndarray, source_type: str
    ) -> np.ndarray:
        r"""Scale the magnitude of a waveform.

        The peak level (in dB) of the waveform is moved to a random level
        drawn between ``peak + lower_db`` and ``peak + higher_db``, where the
        upper end is capped at 0 dB.

        Args:
            waveform: (channels_num, audio_samples)
            source_type: str

        Returns:
            new_waveform: (channels_num, audio_samples)
        """
        settings = self.augmentations['magnitude_scale'][source_type]
        lower_db = settings['lower_db']
        higher_db = settings['higher_db']

        if lower_db == 0 and higher_db == 0:  # No magnitude scale augmentation.
            return waveform

        # The magnitude (in dB) of the sample with the maximum value.
        waveform_db = magnitude_to_db(np.max(np.abs(waveform)))

        # Upper end of the target range, capped at 0 dB.
        high_db = min(waveform_db + higher_db, 0)

        # Keep low <= high. Without the clamp, a peak that is already above
        # 0 dB can give low > high, for which RandomState.uniform is
        # documented as undefined and the 0 dB cap is not honored.
        low_db = min(waveform_db + lower_db, high_db)

        new_waveform_db = self.random_state.uniform(low_db, high_db)

        relative_db = new_waveform_db - waveform_db
        relative_scale = db_to_magnitude(relative_db)

        return waveform * relative_scale

    def swap_channel(self, waveform: np.ndarray, source_type: str) -> np.ndarray:
        r"""Randomly swap channels.

        Args:
            waveform: (channels_num, audio_samples)
            source_type: str, unused; kept for a uniform method signature.

        Returns:
            new_waveform: (channels_num, audio_samples)
        """
        channels_num = waveform.shape[0]

        if channels_num == 1:
            # Nothing to swap; no random number is drawn in this case.
            return waveform

        random_order = self.random_state.permutation(channels_num)
        return waveform[random_order, :]

    def flip_axis(self, waveform: np.ndarray, source_type: str) -> np.ndarray:
        r"""Randomly flip the waveform along x-axis, i.e., independently
        multiply each channel by either -1 or 1.

        Args:
            waveform: (channels_num, audio_samples)
            source_type: str, unused; kept for a uniform method signature.

        Returns:
            new_waveform: (channels_num, audio_samples)
        """
        channels_num = waveform.shape[0]
        random_signs = self.random_state.choice([-1, 1], size=channels_num)

        # Broadcast one sign per channel over the sample axis.
        return waveform * random_signs[:, None]