import librosa
import librosa.core as lb
import librosa.display as lbd
import matplotlib.pyplot as plt
import numpy
import numpy as np
import pyloudnorm as pyln
import torch
from torchaudio.transforms import Resample


class AudioPreprocessor:
    """
    Turns raw audio into normalized waveforms and log-Mel spectrograms.

    The normalization pipeline is: downmix to mono --> loudness
    normalization --> optional resampling --> optional silence removal
    using the silero voice activity detector.
    """

    def __init__(self, input_sr, output_sr=None, melspec_buckets=80, hop_length=256, n_fft=1024, cut_silence=False, device="cpu"):
        """
        The parameters are by default set up to do well on a 16kHz signal.
        A different sampling rate may require different hop_length and
        n_fft (e.g. doubling frequency --> doubling hop_length and
        doubling n_fft)

        Args:
            input_sr: sampling rate of the audio that will be passed in.
            output_sr: sampling rate to resample to. None (or a value equal
                to input_sr) disables resampling.
            melspec_buckets: number of Mel bands in the spectrogram.
            hop_length: hop length of the STFT in samples.
            n_fft: FFT size of the STFT.
            cut_silence: whether normalize_audio should remove non-speech
                segments. Enabling this loads the silero VAD via torch.hub.
            device: torch device used for resampling and the VAD model.
        """
        self.cut_silence = cut_silence
        self.device = device
        self.sr = input_sr
        self.new_sr = output_sr
        self.hop_length = hop_length
        self.n_fft = n_fft
        self.mel_buckets = melspec_buckets
        # the loudness meter operates on the audio before resampling
        self.meter = pyln.Meter(input_sr)
        # sampling rate of the audio after normalize_audio has run
        self.final_sr = input_sr
        if cut_silence:
            # torch 1.9 has a bug in the hub loading, this is a workaround
            torch.hub._validate_not_a_forked_repo = lambda a, b, c: True
            # careful: assumes 16kHz or 8kHz audio
            self.silero_model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
                                                      model='silero_vad',
                                                      force_reload=False,
                                                      onnx=False,
                                                      verbose=False)
            (self.get_speech_timestamps,
             self.save_audio,
             self.read_audio,
             self.VADIterator,
             self.collect_chunks) = utils
            self.silero_model = self.silero_model.to(self.device)
        if output_sr is not None and output_sr != input_sr:
            self.resample = Resample(orig_freq=input_sr, new_freq=output_sr).to(self.device)
            self.final_sr = output_sr
        else:
            # no resampling needed, pass the audio through untouched
            self.resample = lambda x: x

    def cut_silence_from_audio(self, audio):
        """
        Remove the segments that the silero VAD does not classify as speech.

        https://github.com/snakers4/silero-vad

        If no speech is detected at all, the audio is returned unchanged,
        because concatenating zero chunks would raise an error.
        """
        speech_timestamps = self.get_speech_timestamps(audio, self.silero_model, sampling_rate=self.final_sr)
        if not speech_timestamps:
            return audio
        return self.collect_chunks(speech_timestamps, audio)

    def to_mono(self, x):
        """
        make sure we deal with a 1D array

        Two-dimensional input is transposed before it is handed to
        librosa's to_mono (presumably the input is laid out as
        (samples, channels), as returned by soundfile -- confirm against
        callers). Anything else is returned unchanged.
        """
        if len(x.shape) == 2:
            return lb.to_mono(numpy.transpose(x))
        return x

    def normalize_loudness(self, audio):
        """
        normalize the amplitudes according to their decibels, so this
        should turn any signal with different magnitudes into the same
        magnitude by analysing loudness

        The signal is first brought to -30 dB integrated loudness and then
        scaled so that its absolute peak is 1. If the peak is zero or not
        finite (e.g. completely silent input), the input is returned
        unchanged instead of dividing by it and producing NaNs.
        """
        loudness = self.meter.integrated_loudness(audio)
        loud_normed = pyln.normalize.loudness(audio, loudness, -30.0)
        peak = numpy.amax(numpy.abs(loud_normed))
        if not numpy.isfinite(peak) or peak == 0:
            return audio
        return numpy.divide(loud_normed, peak)

    def logmelfilterbank(self, audio, sampling_rate, fmin=40, fmax=8000, eps=1e-10):
        """
        Compute log-Mel filterbank

        one day this could be replaced by torchaudio's internal
        log10(melspec(audio)), but for some reason it gives slightly
        different results, so in order not to break backwards
        compatibility, this is kept for now. If there is ever a reason to
        completely re-train all models, this would be a good opportunity
        to make the switch.

        Args:
            audio: waveform as a numpy array or a torch tensor.
            sampling_rate: sampling rate of the given audio.
            fmin: lowest frequency of the Mel basis, None means 0.
            fmax: highest frequency of the Mel basis, None means
                sampling_rate / 2.
            eps: floor applied before taking the logarithm.

        Returns:
            torch tensor holding the log10 Mel spectrogram, transposed so
            that the Mel bands come first and the frames second.
        """
        if isinstance(audio, torch.Tensor):
            # detach and move to the CPU first, .numpy() fails otherwise
            audio = audio.detach().cpu().numpy()
        # get amplitude spectrogram
        x_stft = librosa.stft(audio,
                              n_fft=self.n_fft,
                              hop_length=self.hop_length,
                              win_length=None,
                              window="hann",
                              pad_mode="reflect")
        spc = np.abs(x_stft).T
        # get mel basis
        fmin = 0 if fmin is None else fmin
        fmax = sampling_rate / 2 if fmax is None else fmax
        # keyword arguments: recent librosa versions no longer accept
        # these positionally
        mel_basis = librosa.filters.mel(sr=sampling_rate,
                                        n_fft=self.n_fft,
                                        n_mels=self.mel_buckets,
                                        fmin=fmin,
                                        fmax=fmax)
        # apply log and return
        return torch.Tensor(np.log10(np.maximum(eps, np.dot(spc, mel_basis.T)))).transpose(0, 1)

    def normalize_audio(self, audio):
        """
        one function to apply them all in an order that makes sense.

        Returns the processed waveform as a torch tensor on the CPU, at
        the sampling rate stored in self.final_sr.
        """
        audio = self.to_mono(audio)
        audio = self.normalize_loudness(audio)
        audio = torch.Tensor(audio).to(self.device)
        audio = self.resample(audio)
        if self.cut_silence:
            audio = self.cut_silence_from_audio(audio)
        return audio.to("cpu")

    def visualize_cleaning(self, unclean_audio):
        """
        displays Mel Spectrogram of unclean audio and then displays Mel
        Spectrogram of the cleaned version.
        """
        fig, ax = plt.subplots(nrows=2, ncols=1)
        unclean_audio_mono = self.to_mono(unclean_audio)
        unclean_spec = self.audio_to_mel_spec_tensor(unclean_audio_mono, normalize=False).numpy()
        clean_spec = self.audio_to_mel_spec_tensor(unclean_audio_mono, normalize=True).numpy()
        # hop_length is passed on so that the time axis matches the STFT
        # that produced the spectrograms
        lbd.specshow(unclean_spec, sr=self.sr, hop_length=self.hop_length, cmap='GnBu', y_axis='mel', ax=ax[0], x_axis='time')
        ax[0].set(title='Uncleaned Audio')
        ax[0].label_outer()
        # final_sr is the output rate if resampling happens, else the input rate
        lbd.specshow(clean_spec, sr=self.final_sr, hop_length=self.hop_length, cmap='GnBu', y_axis='mel', ax=ax[1], x_axis='time')
        ax[1].set(title='Cleaned Audio')
        ax[1].label_outer()
        plt.show()

    def audio_to_wave_tensor(self, audio, normalize=True):
        """
        Return the audio as a torch tensor, run through normalize_audio
        first if normalize is set.
        """
        if normalize:
            return self.normalize_audio(audio)
        if isinstance(audio, torch.Tensor):
            return audio
        return torch.Tensor(audio)

    def audio_to_mel_spec_tensor(self, audio, normalize=True, explicit_sampling_rate=None):
        """
        Return the log-Mel spectrogram of the audio, optionally
        normalizing the audio first.

        explicit_sampling_rate is for when normalization has already been
        applied and that included resampling. No way to detect the current
        sr of the incoming audio

        Without an explicit sampling rate, normalized audio is assumed to
        be at self.final_sr and unnormalized audio at self.sr.
        """
        if normalize:
            audio = self.normalize_audio(audio)
        if explicit_sampling_rate is not None:
            sampling_rate = explicit_sampling_rate
        elif normalize:
            sampling_rate = self.final_sr
        else:
            sampling_rate = self.sr
        return self.logmelfilterbank(audio=audio, sampling_rate=sampling_rate)


if __name__ == '__main__':
    import soundfile

    wav, sr = soundfile.read("../audios/test.wav")
    ap = AudioPreprocessor(input_sr=sr, output_sr=16000)
    ap.visualize_cleaning(wav)