"""Audio transforms."""
# import torchaudio
# import torchvision
from torchvision.transforms import Compose, ToTensor
import torchaudio.transforms as T
# import imgaug.augmenters as iaa
import numpy as np
import torch


class AddNoise(object):
    """Add Gaussian noise to the waveform."""

    def __init__(self, noise_level=0.1):
        self.noise_level = noise_level

    def __call__(self, waveform):
        # Scale unit-variance Gaussian noise by noise_level and add it in.
        noise = torch.randn_like(waveform)
        return waveform + self.noise_level * noise

    def __repr__(self):
        return self.__class__.__name__ + f"(noise_level={self.noise_level})"


class ChangeVolume(object):
    """Change the volume of the waveform by a random gain factor."""

    def __init__(self, volume_factor=(0.6, 1.2)):
        self.volume_factor = volume_factor

    def __call__(self, waveform):
        # Draw a gain uniformly from [low, high] and scale the whole clip.
        return waveform * np.random.uniform(*self.volume_factor)

    def __repr__(self):
        return self.__class__.__name__ + f"(volume_factor={self.volume_factor})"


def configure_transforms(cfg):
    """
    Given a transform config (List[dict], each with "name" and "args" keys),
    return a Compose object that applies the transforms in order.
    """
    transform = []
    for a in cfg:
        # Resolve the transform class by name in this module's scope
        # (e.g. "AddNoise" or "T.FrequencyMasking") and instantiate it.
        transform.append(eval(a["name"])(**a["args"]))
    return Compose(transform)
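
# Example (illustrative): a config list maps directly onto transform classes, so
#     configure_transforms([{"name": "AddNoise", "args": {"noise_level": 0.1}}])
# builds Compose([AddNoise(noise_level=0.1)]); see the self-test below for more.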


class AudioClipsTransform:
    """Applies an audio transform to each clip in a list of audio clips."""

    def __init__(self, audio_transform):
        self.audio_transform = audio_transform

    def __call__(self, audio_clips):
        """
        Args:
            audio_clips (list): list of audio clips, each a tensor [1, M],
                where M is the number of samples in each clip.
        """
        return [self.audio_transform(x) for x in audio_clips]

    def __repr__(self):
        return self.audio_transform.__repr__()
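
# Example (illustrative): wrap a per-clip transform so it maps over a list.
#     clip_transform = AudioClipsTransform(Compose([AddNoise(0.05)]))
#     clips = [torch.randn(1, 16000) for _ in range(4)]
#     out = clip_transform(clips)  # list of 4 tensors, each [1, 16000]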


class NumpyToTensor:
    """Converts a numpy array to a float32 torch tensor."""

    def __call__(self, x):
        return torch.from_numpy(x).float()

    def __repr__(self):
        return self.__class__.__name__ + "()"


# TODO: Might have to introduce normalisation
# to have a consistent pipeline.
class Wav2Vec2WaveformProcessor:
    """Runs a pretrained Wav2Vec2 processor on a raw waveform."""

    def __init__(self, model_name="facebook/wav2vec2-base-960h", sr=16000):
        # Imported lazily so transformers is only needed when this is used.
        from transformers import Wav2Vec2Processor

        self.processor = Wav2Vec2Processor.from_pretrained(model_name)
        self.sr = sr

    def __call__(self, x):
        x = self.processor(
            x, sampling_rate=self.sr, return_tensors="pt",
        ).input_values
        return x
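
# Usage note: for raw audio, Wav2Vec2Processor delegates to its feature
# extractor; with the Wav2Vec2FeatureExtractor default of do_normalize=True the
# waveform is zero-meaned and unit-variance scaled, which may already cover the
# normalisation TODO above. The returned .input_values is a float tensor of
# shape [batch, num_samples].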


def define_audio_transforms(cfg_transform, augment=False):
    """Builds waveform transforms from config, picking augmentations only if
    augment=True."""
    wave_transforms = cfg_transform["audio"]["wave"]
    wave_transforms_new = []
    for t in wave_transforms:
        if "augmentation" not in t:
            # Deterministic preprocessing: always applied.
            wave_transforms_new.append(t)
        elif augment and t["augmentation"]:
            # Random augmentations: applied only when augment=True.
            wave_transforms_new.append(t)
    wave_transform = configure_transforms(wave_transforms_new)
    wave_transform = AudioClipsTransform(wave_transform)
    audio_transform = dict(
        wave=wave_transform,
        # spec=spec_transform,  # spectrogram transforms are not wired up yet
    )
    return audio_transform
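
# NOTE: cfg_transform is expected to look like {"audio": {"wave": [<transform
# dicts>]}}, each entry with an optional boolean "augmentation" key; a runnable
# sketch of this (assumed) schema is at the end of the self-test below.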


if __name__ == "__main__":
    # Testing it out

    # Raw waveform transform
    cfg = [
        {
            "name": "AddNoise",
            "args": {"noise_level": 0.1},
        },
        {
            "name": "ChangeVolume",
            "args": {"volume_factor": [0.6, 1.2]},
        },
    ]
    transform = configure_transforms(cfg)
    x = torch.randn([1, 16000])
    z = transform(x)
    print(x.shape, z.shape)

    import matplotlib.pyplot as plt

    fig, ax = plt.subplots(2, 1, figsize=(8, 4))
    ax[0].plot(x[0].numpy())
    ax[1].plot(z[0].numpy())
    plt.savefig("waveform_transform.png")

    # Wav2Vec2 transform
    cfg = [
        {
            "name": "Wav2Vec2WaveformProcessor",
            "args": {"model_name": "facebook/wav2vec2-base-960h", "sr": 16000},
        },
    ]
    transform = configure_transforms(cfg)
    x = torch.randn([4, 16000])
    z = transform(x)
    print(x.shape, z.shape)

    # Spectrogram transform
    cfg = [
        {
            "name": "T.FrequencyMasking",
            "args": {"freq_mask_param": 8},
        },
        {
            "name": "T.TimeMasking",
            "args": {"time_mask_param": 16},
        },
    ]
    transform = configure_transforms(cfg)
    x = torch.randn([1, 64, 251])
    z = transform(x)
    print(x.shape, z.shape)

    fig, ax = plt.subplots(2, 1, figsize=(8, 4))
    ax[0].imshow(x[0].numpy())
    ax[1].imshow(z[0].numpy())
    plt.savefig("spectrogram_transform.png")