Spaces:
Running
Running
File size: 6,784 Bytes
b3fa29f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 |
import librosa
import librosa.core as lb
import librosa.display as lbd
import matplotlib.pyplot as plt
import numpy
import numpy as np
import pyloudnorm as pyln
import torch
from torchaudio.transforms import Resample
class AudioPreprocessor:
    """
    Cleans raw audio and turns it into waveform tensors or log-Mel
    spectrograms.

    The cleaning pipeline (see normalize_audio) is: mono mixdown,
    loudness + peak normalization, optional resampling and optional
    silence removal with the Silero VAD.
    """

    def __init__(self, input_sr, output_sr=None, melspec_buckets=80, hop_length=256, n_fft=1024, cut_silence=False, device="cpu"):
        """
        The parameters are by default set up to do well
        on a 16kHz signal. A different sampling rate may
        require different hop_length and n_fft (e.g.
        doubling frequency --> doubling hop_length and
        doubling n_fft)

        input_sr: sampling rate of the audio that will be passed in;
            used for the loudness meter and as the origin rate of
            the resampler.
        output_sr: sampling rate to resample to. If None or equal
            to input_sr, no resampling happens.
        melspec_buckets: number of Mel filters in the filterbank.
        hop_length: hop length of the STFT.
        n_fft: FFT size of the STFT and of the Mel filterbank.
        cut_silence: if True, the Silero VAD is loaded through
            torch.hub and used to remove non-speech parts.
        device: device the resampler and the VAD model are moved to
            and on which the audio is processed during normalization.
        """
        self.cut_silence = cut_silence
        self.device = device
        self.sr = input_sr
        self.new_sr = output_sr
        self.hop_length = hop_length
        self.n_fft = n_fft
        self.mel_buckets = melspec_buckets
        self.meter = pyln.Meter(input_sr)
        # sampling rate of the audio once normalize_audio is done with it
        self.final_sr = input_sr
        if cut_silence:
            torch.hub._validate_not_a_forked_repo = lambda a, b, c: True  # torch 1.9 has a bug in the hub loading, this is a workaround
            # careful: assumes 16kHz or 8kHz audio
            self.silero_model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
                                                      model='silero_vad',
                                                      force_reload=False,
                                                      onnx=False,
                                                      verbose=False)
            (self.get_speech_timestamps,
             self.save_audio,
             self.read_audio,
             self.VADIterator,
             self.collect_chunks) = utils
            self.silero_model = self.silero_model.to(self.device)
        if output_sr is not None and output_sr != input_sr:
            self.resample = Resample(orig_freq=input_sr, new_freq=output_sr).to(self.device)
            self.final_sr = output_sr
        else:
            # nothing to resample, keep the call site uniform with an identity function
            self.resample = lambda x: x

    def cut_silence_from_audio(self, audio):
        """
        https://github.com/snakers4/silero-vad

        Keeps only the parts of the audio that the VAD marks as
        speech. The audio is expected at self.final_sr. If no speech
        is detected at all, the audio is returned unchanged, because
        concatenating zero chunks would raise an error.
        """
        speech_timestamps = self.get_speech_timestamps(audio, self.silero_model, sampling_rate=self.final_sr)
        if not speech_timestamps:
            return audio
        return self.collect_chunks(speech_timestamps, audio)

    def to_mono(self, x):
        """
        make sure we deal with a 1D array

        2D input is transposed before the mixdown, so it is expected
        to come as (samples, channels). Anything that is not 2D is
        returned untouched.
        """
        if len(x.shape) == 2:
            return lb.to_mono(np.transpose(x))
        return x

    def normalize_loudness(self, audio):
        """
        normalize the amplitudes according to
        their decibels, so this should turn any
        signal with different magnitudes into
        the same magnitude by analysing loudness

        The audio is first brought to -30 LUFS and then scaled so
        that its largest absolute sample is 1. If the meter reports
        a non-finite loudness (e.g. for digital silence), there is
        nothing to scale and the audio is returned unchanged instead
        of being turned into NaNs.
        """
        loudness = self.meter.integrated_loudness(audio)
        if not np.isfinite(loudness):
            return audio
        loud_normed = pyln.normalize.loudness(audio, loudness, -30.0)
        return self._peak_normalize(loud_normed)

    @staticmethod
    def _peak_normalize(signal):
        """
        Scale the signal so its largest absolute value becomes 1.
        A signal whose peak is zero or not finite is returned as is,
        since dividing by such a peak would only produce NaNs.
        """
        peak = np.amax(np.abs(signal))
        if peak == 0 or not np.isfinite(peak):
            return signal
        return np.divide(signal, peak)

    def logmelfilterbank(self, audio, sampling_rate, fmin=40, fmax=8000, eps=1e-10):
        """
        Compute log-Mel filterbank

        one day this could be replaced by torchaudio's internal log10(melspec(audio)), but
        for some reason it gives slightly different results, so in order not to break backwards
        compatibility, this is kept for now. If there is ever a reason to completely re-train
        all models, this would be a good opportunity to make the switch.

        audio: waveform as numpy array or torch tensor.
        sampling_rate: sampling rate of the given audio.
        fmin: lowest filterbank frequency, None means 0.
        fmax: highest filterbank frequency, None means sampling_rate / 2.
        eps: floor applied before the logarithm to avoid log10(0).

        Returns a torch tensor of shape (mel_buckets, frames).
        """
        if isinstance(audio, torch.Tensor):
            # .numpy() alone fails for tensors that live on another device or track gradients
            audio = audio.detach().cpu().numpy()
        # get amplitude spectrogram
        x_stft = librosa.stft(audio, n_fft=self.n_fft, hop_length=self.hop_length, win_length=None, window="hann", pad_mode="reflect")
        spc = np.abs(x_stft).T
        # get mel basis
        fmin = 0 if fmin is None else fmin
        fmax = sampling_rate / 2 if fmax is None else fmax
        # keyword arguments, because newer librosa versions no longer accept these positionally
        mel_basis = librosa.filters.mel(sr=sampling_rate, n_fft=self.n_fft, n_mels=self.mel_buckets, fmin=fmin, fmax=fmax)
        # apply log and return
        return torch.Tensor(np.log10(np.maximum(eps, np.dot(spc, mel_basis.T)))).transpose(0, 1)

    def normalize_audio(self, audio):
        """
        one function to apply them all in an
        order that makes sense.

        Order: mono mixdown, loudness normalization, conversion to a
        tensor on self.device, resampling, optional silence removal.
        The result is moved back to the CPU and has the sampling
        rate self.final_sr.
        """
        audio = self.to_mono(audio)
        audio = self.normalize_loudness(audio)
        audio = torch.Tensor(audio).to(self.device)
        audio = self.resample(audio)
        if self.cut_silence:
            audio = self.cut_silence_from_audio(audio)
        return audio.to("cpu")

    def visualize_cleaning(self, unclean_audio):
        """
        displays Mel Spectrogram of unclean audio
        and then displays Mel Spectrogram of the
        cleaned version.
        """
        fig, ax = plt.subplots(nrows=2, ncols=1)
        unclean_audio_mono = self.to_mono(unclean_audio)
        unclean_spec = self.audio_to_mel_spec_tensor(unclean_audio_mono, normalize=False).numpy()
        clean_spec = self.audio_to_mel_spec_tensor(unclean_audio_mono, normalize=True).numpy()
        # hop_length has to be passed along, otherwise the time axis is computed with librosa's default hop
        lbd.specshow(unclean_spec, sr=self.sr, hop_length=self.hop_length, cmap='GnBu', y_axis='mel', ax=ax[0], x_axis='time')
        ax[0].set(title='Uncleaned Audio')
        ax[0].label_outer()
        # final_sr is the output rate if one was requested and the input rate otherwise
        lbd.specshow(clean_spec, sr=self.final_sr, hop_length=self.hop_length, cmap='GnBu', y_axis='mel', ax=ax[1], x_axis='time')
        ax[1].set(title='Cleaned Audio')
        ax[1].label_outer()
        plt.show()

    def audio_to_wave_tensor(self, audio, normalize=True):
        """
        Returns the audio as a torch tensor, run through
        normalize_audio first if normalize is True. Without
        normalization a tensor is passed through as is and
        anything else is wrapped into a new tensor.
        """
        if normalize:
            return self.normalize_audio(audio)
        if isinstance(audio, torch.Tensor):
            return audio
        return torch.Tensor(audio)

    def audio_to_mel_spec_tensor(self, audio, normalize=True, explicit_sampling_rate=None):
        """
        explicit_sampling_rate is for when
        normalization has already been applied
        and that included resampling. No way
        to detect the current sr of the incoming
        audio

        Without explicit_sampling_rate, the filterbank uses
        self.final_sr for normalized audio and self.sr for audio
        that is taken as is.
        """
        if normalize:
            audio = self.normalize_audio(audio)
        if explicit_sampling_rate is not None:
            sampling_rate = explicit_sampling_rate
        elif normalize:
            sampling_rate = self.final_sr
        else:
            sampling_rate = self.sr
        return self.logmelfilterbank(audio=audio, sampling_rate=sampling_rate)
if __name__ == '__main__':
    # Manual smoke test: load a sample file and show its spectrogram
    # before and after cleaning, resampled to 16kHz.
    import soundfile

    demo_wave, demo_sr = soundfile.read("../audios/test.wav")
    preprocessor = AudioPreprocessor(input_sr=demo_sr, output_sr=16000)
    preprocessor.visualize_cleaning(demo_wave)
|