# Spaces: akhaliq / Music_Source_Separation
# Runtime error
# File size: 4,951 Bytes
from typing import Dict

import librosa
import numpy as np

from bytesep.utils import db_to_magnitude, get_pitch_shift_factor, magnitude_to_db


class Augmentor:
    r"""Apply random, per-source-type augmentations to a waveform.

    Which augmentations run is decided by the keys present in the
    ``augmentations`` dict. All randomness is drawn from a single
    ``np.random.RandomState`` seeded at construction time.
    """

    def __init__(self, augmentations: Dict, random_seed=1234):
        r"""Augmentor for data augmentation of a waveform.

        Args:
            augmentations: Dict, e.g, {
                'mixaudio': {'vocals': 2, 'accompaniment': 2},
                'pitch_shift': {'vocals': 4, 'accompaniment': 4},
                ...,
            }
            random_seed: int
        """
        self.augmentations = augmentations
        self.random_state = np.random.RandomState(random_seed)

    def __call__(self, waveform: np.ndarray, source_type: str) -> np.ndarray:
        r"""Augment a waveform.

        The configured augmentations are applied in a fixed order:
        pitch_shift, magnitude_scale, swap_channel, flip_axis. Keys that are
        absent from ``self.augmentations`` are skipped.

        Args:
            waveform: (channels_num, audio_samples)
            source_type: str

        Returns:
            new_waveform: (channels_num, new_audio_samples)
        """
        if 'pitch_shift' in self.augmentations:
            waveform = self.pitch_shift(waveform, source_type)

        if 'magnitude_scale' in self.augmentations:
            waveform = self.magnitude_scale(waveform, source_type)

        if 'swap_channel' in self.augmentations:
            waveform = self.swap_channel(waveform, source_type)

        if 'flip_axis' in self.augmentations:
            waveform = self.flip_axis(waveform, source_type)

        return waveform

    def _is_enabled(self, name: str, source_type: str) -> bool:
        r"""Tell whether an on/off style augmentation should run.

        An augmentation is disabled only when ``self.augmentations[name]`` is
        a dict holding an explicit falsy entry (e.g. False or 0) for
        ``source_type``. Every other configuration (missing key, missing
        source type, non-dict value) leaves it enabled.

        Args:
            name: str, augmentation key, e.g., 'swap_channel'
            source_type: str

        Returns:
            enabled: bool
        """
        setting = self.augmentations.get(name)

        if isinstance(setting, dict) and source_type in setting:
            return bool(setting[source_type])

        return True

    def pitch_shift(self, waveform: np.ndarray, source_type: str) -> np.ndarray:
        r"""Shift the pitch of a waveform. We use resampling for fast pitch
        shifting, so the speed will also be changed. The length of the returned
        waveform will be changed.

        Args:
            waveform: (channels_num, audio_samples)
            source_type: str

        Returns:
            new_waveform: (channels_num, new_audio_samples)
        """

        # maximum pitch shift in semitones
        max_pitch_shift = self.augmentations['pitch_shift'][source_type]

        if max_pitch_shift == 0:  # No pitch shift augmentations.
            return waveform

        # random pitch shift, drawn symmetrically around zero
        rand_pitch = self.random_state.uniform(
            low=-max_pitch_shift, high=max_pitch_shift
        )

        # We use librosa.resample instead of librosa.effects.pitch_shift
        # because it is 10x times faster.
        pitch_shift_factor = get_pitch_shift_factor(rand_pitch)

        # Only the ratio between the two rates matters for resampling, so the
        # absolute value is arbitrary.
        dummy_sample_rate = 10000  # Dummy constant.

        is_mono = waveform.shape[0] == 1

        if is_mono:
            # Resample mono audio as a 1-D signal. Index the channel axis
            # instead of squeezing so that no other length-1 axis is dropped.
            waveform = waveform[0]

        new_waveform = librosa.resample(
            y=waveform,
            orig_sr=dummy_sample_rate,
            target_sr=dummy_sample_rate / pitch_shift_factor,
            res_type='linear',
            axis=-1,
        )

        if is_mono:
            # Restore the channel axis removed above.
            new_waveform = new_waveform[None, :]

        return new_waveform

    def magnitude_scale(self, waveform: np.ndarray, source_type: str) -> np.ndarray:
        r"""Scale the magnitude of a waveform.

        A target peak level (in dB) is drawn uniformly between
        ``peak + lower_db`` and ``min(peak + higher_db, 0)``, and the whole
        waveform is multiplied by the gain that moves its peak there.

        Args:
            waveform: (channels_num, audio_samples)
            source_type: str

        Returns:
            new_waveform: (channels_num, audio_samples)
        """
        db_range = self.augmentations['magnitude_scale'][source_type]
        lower_db = db_range['lower_db']
        higher_db = db_range['higher_db']

        if lower_db == 0 and higher_db == 0:  # No magnitude scale augmentation.
            return waveform

        # The magnitude (in dB) of the sample with the maximum value.
        waveform_db = magnitude_to_db(np.max(np.abs(waveform)))

        # The upper bound is capped at 0 dB.
        # NOTE(review): when waveform_db + lower_db > 0 the lower bound exceeds
        # the upper bound; numpy documents uniform(low > high) as undefined.
        # Confirm the intended behaviour for such inputs.
        new_waveform_db = self.random_state.uniform(
            waveform_db + lower_db, min(waveform_db + higher_db, 0)
        )

        # Gain (in dB, then linear) that moves the peak to the drawn level.
        relative_db = new_waveform_db - waveform_db
        relative_scale = db_to_magnitude(relative_db)

        return waveform * relative_scale

    def swap_channel(self, waveform: np.ndarray, source_type: str) -> np.ndarray:
        r"""Randomly swap channels.

        The waveform is returned unchanged when it has a single channel, or
        when the 'swap_channel' config holds an explicit falsy entry for
        ``source_type``.

        Args:
            waveform: (channels_num, audio_samples)
            source_type: str

        Returns:
            new_waveform: (channels_num, audio_samples)
        """
        if not self._is_enabled('swap_channel', source_type):
            return waveform

        channels_num = waveform.shape[0]

        if channels_num == 1:
            return waveform

        random_axes = self.random_state.permutation(channels_num)
        return waveform[random_axes, :]

    def flip_axis(self, waveform: np.ndarray, source_type: str) -> np.ndarray:
        r"""Randomly flip the sign (polarity) of each channel.

        Every channel is multiplied by a value drawn independently from
        {-1, 1}. The waveform is returned unchanged when the 'flip_axis'
        config holds an explicit falsy entry for ``source_type``.

        Args:
            waveform: (channels_num, audio_samples)
            source_type: str

        Returns:
            new_waveform: (channels_num, audio_samples)
        """
        if not self._is_enabled('flip_axis', source_type):
            return waveform

        channels_num = waveform.shape[0]
        random_values = self.random_state.choice([-1, 1], size=channels_num)

        # Broadcast one sign per channel across all samples.
        return waveform * random_values[:, None]