import librosa
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from transformers import Wav2Vec2Processor
from transformers.models.wav2vec2.modeling_wav2vec2 import (
    Wav2Vec2Model,
    Wav2Vec2PreTrainedModel,
)

from config import config


class RegressionHead(nn.Module):
    r"""Regression head on top of the pooled wav2vec 2.0 features."""

    def __init__(self, config):
        super().__init__()

        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.final_dropout)
        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, features, **kwargs):
        x = features
        x = self.dropout(x)
        x = self.dense(x)
        x = torch.tanh(x)
        x = self.dropout(x)
        x = self.out_proj(x)

        return x


class EmotionModel(Wav2Vec2PreTrainedModel):
    r"""Speech emotion model: a wav2vec 2.0 encoder with a regression head."""

    def __init__(self, config):
        super().__init__(config)

        self.config = config
        self.wav2vec2 = Wav2Vec2Model(config)
        self.classifier = RegressionHead(config)
        self.init_weights()

    def forward(
        self,
        input_values,
    ):
        outputs = self.wav2vec2(input_values)
        hidden_states = outputs[0]
        # Mean-pool the frame-level features over the time dimension.
        hidden_states = torch.mean(hidden_states, dim=1)
        logits = self.classifier(hidden_states)

        return hidden_states, logits


class AudioDataset(Dataset):
    """Loads wav files and converts them to wav2vec 2.0 input values."""

    def __init__(self, list_of_wav_files, sr, processor):
        self.list_of_wav_files = list_of_wav_files
        self.processor = processor
        self.sr = sr

    def __len__(self):
        return len(self.list_of_wav_files)

    def __getitem__(self, idx):
        wav_file = self.list_of_wav_files[idx]
        audio_data, _ = librosa.load(wav_file, sr=self.sr)
        processed_data = self.processor(audio_data, sampling_rate=self.sr)[
            "input_values"
        ][0]
        return torch.from_numpy(processed_data)


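# A usage sketch for AudioDataset (assumed example; the file names are placeholders
# and it relies on the `processor`, `model`, and `device` objects created below):
#
#     from torch.utils.data import DataLoader
#
#     dataset = AudioDataset(["a.wav", "b.wav"], sr=16000, processor=processor)
#     loader = DataLoader(dataset, batch_size=1)  # batch_size=1 avoids padding
#     with torch.no_grad():
#         for batch in loader:
#             embeddings, logits = model(batch.to(device))

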
device = config.emo_gen_config.device
model_name = "./emotional/wav2vec2-large-robust-12-ft-emotion-msp-dim"
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = EmotionModel.from_pretrained(model_name).to(device)


def process_func(
    x: np.ndarray,
    sampling_rate: int,
    model: EmotionModel,
    processor: Wav2Vec2Processor,
    device: str,
    embeddings: bool = False,
) -> np.ndarray:
    r"""Predict emotions or extract embeddings from a raw audio signal."""
    model = model.to(device)
    y = processor(x, sampling_rate=sampling_rate)
    y = y["input_values"][0]
    y = torch.from_numpy(y).unsqueeze(0).to(device)

    # The model returns (pooled hidden states, logits); index 0 gives the
    # embeddings, index 1 the emotion predictions.
    with torch.no_grad():
        y = model(y)[0 if embeddings else 1]

    y = y.detach().cpu().numpy()

    return y


def get_emo(path):
    # librosa >= 0.10 requires the sampling rate to be passed as a keyword argument.
    wav, sr = librosa.load(path, sr=16000)
    return process_func(
        np.expand_dims(wav, 0).astype(np.float64),
        sr,
        model,
        processor,
        device,
        embeddings=True,
    ).squeeze(0)
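

if __name__ == "__main__":
    # Hedged usage sketch (assumed example): "example.wav" is a placeholder path.
    # get_emo returns a 1-D numpy embedding whose length equals the model's
    # hidden_size.
    emb = get_emo("example.wav")
    print(emb.shape)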