"""Audio recording, speech-emotion detection, and enhancement utilities."""

import sounddevice as sd
import numpy as np
import librosa
import torch
from transformers import pipeline

# Hugging Face model used for speech-emotion recognition.
_EMOTION_MODEL = "ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition"


class VoiceHandler:
    """Record microphone audio and classify its emotional content."""

    def __init__(self):
        # wav2vec2 models expect 16 kHz audio, so record at that rate.
        self.sample_rate = 16000
        # NOTE: loads the model eagerly — construction is slow on first use
        # and needs network access for the initial model download.
        self.emotion_classifier = pipeline(
            "audio-classification",
            model=_EMOTION_MODEL,
        )

    def record_audio(self, duration=5):
        """Record mono audio from the default input device.

        Args:
            duration: Recording length in seconds.

        Returns:
            np.ndarray of shape (samples, 1), dtype float32.
        """
        # Pin dtype to float32 so downstream librosa/transformers processing
        # does not depend on the sounddevice default dtype.
        recording = sd.rec(
            int(duration * self.sample_rate),
            samplerate=self.sample_rate,
            channels=1,
            dtype="float32",
        )
        sd.wait()  # block until the recording finishes
        return recording

    def process_audio(self, audio_data):
        """Normalize audio and detect its dominant emotion.

        Args:
            audio_data: 1-D waveform or (samples, channels) array.

        Returns:
            Tuple of (normalized mono float32 audio, top emotion label).

        Raises:
            ValueError: If the audio is empty.
        """
        audio_data = np.asarray(audio_data, dtype=np.float32)
        # Down-mix to mono; the classifier expects a 1-D waveform.
        if audio_data.ndim > 1:
            audio_data = np.mean(audio_data, axis=1)
        # Fail with a clear message instead of an opaque error deep
        # inside librosa.util.normalize.
        if audio_data.size == 0:
            raise ValueError("audio_data is empty; nothing to classify")
        # Peak-normalize to [-1, 1].
        audio_data = librosa.util.normalize(audio_data)
        # Pass the sampling rate explicitly rather than relying on the
        # pipeline assuming its feature extractor's default rate.
        predictions = self.emotion_classifier(
            {"raw": audio_data, "sampling_rate": self.sample_rate}
        )
        return audio_data, predictions[0]["label"]

    def enhance_audio(self, audio_data):
        """Lightly enhance audio: pre-emphasis followed by peak normalization.

        Pre-emphasis boosts high frequencies, which often helps speech
        models; it is NOT full noise reduction (the original comment
        overstated this).
        """
        emphasized = librosa.effects.preemphasis(audio_data)
        return librosa.util.normalize(emphasized)