| from torchaudio import transforms as T |
| import torch |
| import torch.nn as nn |
|
|
| MEAN, STD = 0.5347, 0.0772 |
| SR = 16000 |
| NFFT = 1024 |
| HOPLEN = 320 |
| NMELS = 128 |
| FMIN = 50 |
| FMAX = 8000 |
|
|
| class Normalization(torch.nn.Module): |
| def __init__(self): |
| super().__init__() |
| |
| def forward(self, x): |
| return (x - x.min()) / (x.max() - x.min()) |
| |
| class Standardization(torch.nn.Module): |
| def __init__(self, mean, std): |
| super().__init__() |
|
|
| self.mean = mean |
| self.std = std |
| |
| def forward(self, x): |
| return (x - self.mean) / self.std |
|
|
| class MelSpectrogramProcessor: |
| def __init__(self, sample_rate=SR, n_mels=NMELS, n_fft=NFFT, hop_length=HOPLEN, f_min=FMIN, f_max=FMAX, device='cpu'): |
| self.transform = nn.Sequential( |
| T.MelSpectrogram(sample_rate=sample_rate, n_mels=n_mels, n_fft=n_fft, hop_length=hop_length, f_min=f_min, f_max=f_max), |
| T.AmplitudeToDB(), |
| Normalization(), |
| Standardization(mean=MEAN, std=STD), |
| ).to(device) |
|
|
| def process(self, waveform): |
| return self.transform(waveform) |