|
|
import io
import os
from typing import Any, Dict, List

import torch
import torchaudio
from transformers import AutoConfig, AutoProcessor

from modeling_upstream_finetune import UpstreamFinetune
|
|
|
|
|
class EndpointHandler():
    """Hugging Face Inference Endpoints handler for speech emotion recognition.

    Loads an ``UpstreamFinetune`` checkpoint once at startup and, per request,
    decodes the raw audio bytes, runs the model, and returns categorical
    emotion probabilities plus arousal/valence regression scores.
    """

    def __init__(self, model_dir: str, **kwargs: Any) -> None:
        """Load the fine-tuned model from *model_dir*.

        Args:
            model_dir: Directory containing the pretrained checkpoint.
            **kwargs: Ignored; accepted for handler-interface compatibility.
        """
        # Fall back to CPU so the handler also works on CPU-only instances;
        # the previous hard-coded 'cuda' crashed when no GPU was present.
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'

        # Label order must match the model's categorical output head.
        self.emotions = ['neutral','happy','sad','angry','surprise','contempt']

        self.model = UpstreamFinetune.from_pretrained(
            model_dir,
            device=self.device,
        )
        # Inference mode: disables dropout / batch-norm updates.
        self.model.eval()

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Run inference on a single request payload.

        Args:
            data: Request dict with:
                - ``"inputs"``: raw encoded audio bytes (any format
                  torchaudio can decode, e.g. WAV/FLAC).
                - ``"sampling_rate"`` (optional): target rate the model
                  expects; defaults to 16 kHz.

        Returns:
            A list of ``{"label": ..., "score": ...}`` dicts — one softmax
            probability per emotion class, followed by the "arousal" and
            "valence" regression outputs for the first batch element.
        """
        audio = data["inputs"]
        sampling_rate = data.get("sampling_rate", 16000)

        # Decode the in-memory audio; resample only when the file's native
        # rate differs from the model's expected rate.
        waveform, sr = torchaudio.load(io.BytesIO(audio))
        if sr != sampling_rate:
            waveform = torchaudio.functional.resample(waveform, sr, sampling_rate)
        # NOTE(review): multi-channel audio is passed through as-is — confirm
        # whether the model expects mono (if so, average channels here).

        # Keep the input on the same device as the model weights to avoid a
        # CPU/GPU device-mismatch error on GPU instances (no-op on CPU).
        waveform = waveform.to(self.device)

        with torch.no_grad():
            cat_logits, reg_outputs = self.model(
                waveform,
                sampling_rate
            )

        # Convert categorical logits to a probability distribution over the
        # emotion classes.
        emotion_probs = torch.nn.functional.softmax(cat_logits, dim=1)

        emotion_predictions = [
            {"label": emotion, "score": float(emotion_probs[0, i])}
            for i, emotion in enumerate(self.emotions)
        ]

        # Append the two regression heads (first batch element).
        return emotion_predictions + [
            {"label": "arousal", "score": float(reg_outputs[0, 0])},
            {"label": "valence", "score": float(reg_outputs[0, 1])}
        ]