|
|
import io
import os
from typing import Any, Dict, List

import torch
import torchaudio
from transformers import AutoConfig, AutoProcessor

from modeling_upstream_finetune import UpstreamFinetune
|
|
|
|
|
class EndpointHandler():
    """Hugging Face Inference Endpoints handler for speech emotion recognition.

    Loads an ``UpstreamFinetune`` checkpoint once at startup and, per request,
    decodes the raw audio bytes, runs the model, and returns categorical
    emotion probabilities plus arousal/valence regression scores.
    """

    def __init__(self, model_dir: str, **kwargs: Any) -> None:
        """Load the fine-tuned model from *model_dir*.

        Args:
            model_dir: Directory containing the pretrained checkpoint.
            **kwargs: Ignored; accepted for handler-interface compatibility.
        """
        # Fall back to CPU so the handler also works on CPU-only instances;
        # the previous hard-coded 'cuda' crashed when no GPU was present.
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'

        # Label order must match the model's categorical output head.
        self.emotions = ['neutral','happy','sad','angry','surprise','contempt']

        self.model = UpstreamFinetune.from_pretrained(
            model_dir,
            device=self.device,
        )
        # Inference mode: disables dropout / batch-norm updates.
        self.model.eval()

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Run inference on a single request payload.

        Args:
            data: Request dict with:
                - ``"inputs"``: raw encoded audio bytes (any format
                  torchaudio can decode, e.g. WAV/FLAC).
                - ``"sampling_rate"`` (optional): target rate the model
                  expects; defaults to 16 kHz.

        Returns:
            A list of ``{"label": ..., "score": ...}`` dicts — one softmax
            probability per emotion class, followed by the "arousal" and
            "valence" regression outputs for the first batch element.
        """
        audio = data["inputs"]
        sampling_rate = data.get("sampling_rate", 16000)

        # Decode the in-memory audio; resample only when the file's native
        # rate differs from the model's expected rate.
        waveform, sr = torchaudio.load(io.BytesIO(audio))
        if sr != sampling_rate:
            waveform = torchaudio.functional.resample(waveform, sr, sampling_rate)
        # NOTE(review): multi-channel audio is passed through as-is — confirm
        # whether the model expects mono (if so, average channels here).

        # Keep the input on the same device as the model weights to avoid a
        # CPU/GPU device-mismatch error on GPU instances (no-op on CPU).
        waveform = waveform.to(self.device)

        with torch.no_grad():
            cat_logits, reg_outputs = self.model(
                waveform,
                sampling_rate
            )

        # Convert categorical logits to a probability distribution over the
        # emotion classes.
        emotion_probs = torch.nn.functional.softmax(cat_logits, dim=1)

        emotion_predictions = [
            {"label": emotion, "score": float(emotion_probs[0, i])}
            for i, emotion in enumerate(self.emotions)
        ]

        # Append the two regression heads (first batch element).
        return emotion_predictions + [
            {"label": "arousal", "score": float(reg_outputs[0, 0])},
            {"label": "valence", "score": float(reg_outputs[0, 1])}
        ]