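A minimal VoiceHandler class: it records microphone audio with sounddevice, applies basic enhancement with librosa, and classifies the speaker's emotion with a Hugging Face wav2vec2 audio-classification pipeline.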
import sounddevice as sd
import numpy as np
import librosa
from transformers import pipeline


class VoiceHandler:
    def __init__(self):
        self.sample_rate = 16000
        # Speech-emotion classifier (wav2vec2 fine-tuned for emotion recognition)
        self.emotion_classifier = pipeline(
            "audio-classification",
            model="ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition")

    def record_audio(self, duration=5):
        """Record mono audio from the default microphone for `duration` seconds."""
        recording = sd.rec(int(duration * self.sample_rate),
                           samplerate=self.sample_rate,
                           channels=1)
        sd.wait()  # block until the recording is finished
        return recording

    def process_audio(self, audio_data):
        """Normalize audio and detect the dominant emotion."""
        # Convert to mono if needed
        if audio_data.ndim > 1:
            audio_data = np.mean(audio_data, axis=1)
        # Peak-normalize the waveform
        audio_data = librosa.util.normalize(audio_data)
        # Classify emotion; pass the sampling rate so the pipeline can resample if needed
        predictions = self.emotion_classifier(
            {"raw": audio_data.astype(np.float32), "sampling_rate": self.sample_rate})
        return audio_data, predictions[0]["label"]

    def enhance_audio(self, audio_data):
        """Apply simple enhancement: pre-emphasis followed by peak normalization."""
        y = librosa.effects.preemphasis(audio_data)
        y = librosa.util.normalize(y)
        return y
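A minimal usage sketch, assuming a working default microphone and network access to download the model on first run; the emotion label names come from the model's own configuration:

# Usage sketch (assumptions: default microphone available, model can be downloaded)
if __name__ == "__main__":
    handler = VoiceHandler()
    raw = handler.record_audio(duration=5)        # (samples, 1) float array
    clean = handler.enhance_audio(raw.flatten())  # pre-emphasis + normalization
    _, emotion = handler.process_audio(clean)     # top predicted emotion label
    print(f"Detected emotion: {emotion}")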