import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio


class PreEmphasis(torch.nn.Module):
    """Applies a first-order pre-emphasis filter, y[t] = x[t] - coef * x[t-1], to a batch of waveforms."""

    def __init__(self, coef: float = 0.97):
        super().__init__()
        self.coef = coef
        # Kernel [-coef, 1] shaped (out_channels=1, in_channels=1, kernel_size=2) for conv1d.
        self.register_buffer(
            'flipped_filter', torch.FloatTensor([-self.coef, 1.]).unsqueeze(0).unsqueeze(0)
        )

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        # (batch, samples) -> (batch, 1, samples): conv1d expects an explicit channel dimension.
        input = input.unsqueeze(1)
        # Reflect-pad one sample on the left so the output keeps the input length.
        input = F.pad(input, (1, 0), 'reflect')
        return F.conv1d(input, self.flipped_filter).squeeze(1)


class FbankFeatureExtractor(nn.Module):
    """Turns raw 16 kHz waveforms into log mel-filterbank features with pre-emphasis and mean normalization."""

    def __init__(self, feat_dim=80, f_max=7600, **kwargs):
        super().__init__()

        self.torchfbank = torch.nn.Sequential(
            PreEmphasis(),
            torchaudio.transforms.MelSpectrogram(
                sample_rate=16000, n_fft=512, win_length=400, hop_length=160,
                f_min=20, f_max=f_max, window_fn=torch.hamming_window, n_mels=feat_dim,
            ),
        )

        # Instance normalization over the feature dimension (not applied in forward below).
        self.instance_norm = nn.InstanceNorm1d(feat_dim)

    def forward(self, x):
        # Feature extraction is treated as fixed preprocessing, so no gradients are tracked.
        with torch.no_grad():
            x = self.torchfbank(x) + 1e-6  # small offset avoids log(0)
            x = x.log()
            # Cepstral mean normalization: subtract the per-feature mean over time.
            x = x - torch.mean(x, dim=-1, keepdim=True)
        return x
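

# A minimal usage sketch (an assumption, not part of the original file): the modules expect a
# batch of 16 kHz waveforms shaped (batch, samples) and return features shaped (batch, feat_dim, frames).
if __name__ == '__main__':
    wav = torch.randn(2, 16000)            # two 1-second dummy waveforms

    pre = PreEmphasis(coef=0.97)
    print(pre(wav).shape)                  # torch.Size([2, 16000]); y[t] = x[t] - 0.97 * x[t-1]

    extractor = FbankFeatureExtractor(feat_dim=80)
    print(extractor(wav).shape)            # torch.Size([2, 80, 101]) with torchaudio's default centering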