File size: 4,951 Bytes
5019931
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
from typing import Dict

import librosa
import numpy as np

from bytesep.utils import db_to_magnitude, get_pitch_shift_factor, magnitude_to_db


class Augmentor:
    r"""Apply random augmentations to a waveform of a given source type.

    The augmentations to run are selected by the keys present in the
    ``augmentations`` dict. All randomness is drawn from a private
    ``np.random.RandomState`` so results are reproducible for a given seed.
    """

    def __init__(self, augmentations: Dict, random_seed: int = 1234):
        r"""Augmentor for data augmentation of a waveform.

        Args:
            augmentations: Dict, e.g., {
                'mixaudio': {'vocals': 2, 'accompaniment': 2},
                'pitch_shift': {'vocals': 4, 'accompaniment': 4},
                ...,
            }
            random_seed: int, seed of the internal random number generator.
        """
        self.augmentations = augmentations
        self.random_state = np.random.RandomState(random_seed)

    def __call__(self, waveform: np.ndarray, source_type: str) -> np.ndarray:
        r"""Augment a waveform.

        The enabled augmentations are applied in a fixed order: pitch shift,
        magnitude scale, channel swap, then polarity flip. An augmentation is
        enabled when its name is a key of ``self.augmentations``.

        Args:
            waveform: (channels_num, audio_samples)
            source_type: str

        Returns:
            new_waveform: (channels_num, new_audio_samples)
        """
        if 'pitch_shift' in self.augmentations:
            waveform = self.pitch_shift(waveform, source_type)

        if 'magnitude_scale' in self.augmentations:
            waveform = self.magnitude_scale(waveform, source_type)

        if 'swap_channel' in self.augmentations:
            waveform = self.swap_channel(waveform, source_type)

        if 'flip_axis' in self.augmentations:
            waveform = self.flip_axis(waveform, source_type)

        return waveform

    def pitch_shift(self, waveform: np.ndarray, source_type: str) -> np.ndarray:
        r"""Shift the pitch of a waveform. We use resampling for fast pitch
        shifting, so the speed will also be changed. The length of the returned
        waveform will be changed.

        Args:
            waveform: (channels_num, audio_samples)
            source_type: str

        Returns:
            new_waveform: (channels_num, new_audio_samples). The input is
                returned unchanged when the configured maximum shift is 0.
        """

        # Maximum pitch shift in semitones.
        max_pitch_shift = self.augmentations['pitch_shift'][source_type]

        if max_pitch_shift == 0:  # No pitch shift augmentations.
            return waveform

        # Random pitch shift drawn from [-max_pitch_shift, max_pitch_shift).
        rand_pitch = self.random_state.uniform(
            low=-max_pitch_shift, high=max_pitch_shift
        )

        # We use librosa.resample instead of librosa.effects.pitch_shift
        # because it is about 10x faster.
        # NOTE(review): get_pitch_shift_factor presumably maps semitones to a
        # frequency ratio -- confirm in bytesep.utils.
        pitch_shift_factor = get_pitch_shift_factor(rand_pitch)

        # Only the ratio between the two rates matters for resampling, so any
        # constant works as the nominal original rate.
        dummy_sample_rate = 10000

        channels_num = waveform.shape[0]

        if channels_num == 1:
            # Resample mono audio as a 1-D signal. Index the channel
            # explicitly: np.squeeze would also drop the time axis of a
            # single-sample waveform and yield a 0-d array.
            waveform = waveform[0]

        new_waveform = librosa.resample(
            y=waveform,
            orig_sr=dummy_sample_rate,
            target_sr=dummy_sample_rate / pitch_shift_factor,
            res_type='linear',
            axis=-1,
        )

        if channels_num == 1:
            # Restore the leading channel axis.
            new_waveform = new_waveform[None, :]

        return new_waveform

    def magnitude_scale(self, waveform: np.ndarray, source_type: str) -> np.ndarray:
        r"""Scale the magnitude of a waveform.

        The peak level (in dB) of the waveform is moved to a random target
        between ``peak + lower_db`` and ``peak + higher_db``, with the upper
        end capped at 0 dB.

        Args:
            waveform: (channels_num, audio_samples)
            source_type: str

        Returns:
            new_waveform: (channels_num, audio_samples). The input is returned
                unchanged when both bounds are 0 or the waveform is empty.
        """
        settings = self.augmentations['magnitude_scale'][source_type]
        lower_db = settings['lower_db']
        higher_db = settings['higher_db']

        if lower_db == 0 and higher_db == 0:  # No magnitude scale augmentation.
            return waveform

        if waveform.size == 0:
            # Nothing to scale; np.max would raise on a zero-size array.
            return waveform

        # The magnitude (in dB) of the sample with the maximum value.
        waveform_db = magnitude_to_db(np.max(np.abs(waveform)))

        # Never let the scaled peak exceed 0 dB.
        highest_db = min(waveform_db + higher_db, 0)

        # If the peak is already far above 0 dB the lower bound can end up
        # above the capped upper bound. RandomState.uniform is undefined for
        # high < low, so collapse the range onto the cap in that case.
        lowest_db = min(waveform_db + lower_db, highest_db)

        new_waveform_db = self.random_state.uniform(lowest_db, highest_db)

        relative_db = new_waveform_db - waveform_db

        relative_scale = db_to_magnitude(relative_db)

        new_waveform = waveform * relative_scale

        return new_waveform

    def swap_channel(self, waveform: np.ndarray, source_type: str) -> np.ndarray:
        r"""Randomly swap channels.

        NOTE(review): ``source_type`` and the per-source settings stored under
        ``self.augmentations['swap_channel']`` are not consulted here; the
        permutation is applied whenever this method is called. Confirm that
        this is intended.

        Args:
            waveform: (channels_num, audio_samples)
            source_type: str, unused.

        Returns:
            new_waveform: (channels_num, audio_samples)
        """
        channels_num = waveform.shape[0]

        if channels_num == 1:
            # A single channel has nothing to swap with; no random draw.
            return waveform

        random_axes = self.random_state.permutation(channels_num)
        return waveform[random_axes, :]

    def flip_axis(self, waveform: np.ndarray, source_type: str) -> np.ndarray:
        r"""Randomly flip the waveform along x-axis, i.e., each channel is
        independently multiplied by either -1 or 1.

        NOTE(review): ``source_type`` and the per-source settings stored under
        ``self.augmentations['flip_axis']`` are not consulted here. Confirm
        that this is intended.

        Args:
            waveform: (channels_num, audio_samples)
            source_type: str, unused.

        Returns:
            new_waveform: (channels_num, audio_samples)
        """
        channels_num = waveform.shape[0]

        # One random sign per channel, broadcast over the time axis.
        random_signs = self.random_state.choice([-1, 1], size=channels_num)

        return waveform * random_signs[:, None]