import sounddevice as sd
import numpy as np
import librosa
import torch
from transformers import pipeline
class VoiceHandler:
    def __init__(self):
        self.sample_rate = 16000
        self.emotion_classifier = pipeline(
            "audio-classification",
            model="ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition",
        )

    def record_audio(self, duration=5):
        """Record audio for the specified duration."""
        recording = sd.rec(int(duration * self.sample_rate),
                           samplerate=self.sample_rate,
                           channels=1,
                           dtype="float32")
        sd.wait()  # block until the recording is finished
        return recording

    def process_audio(self, audio_data):
        """Process audio and detect emotion."""
        # Convert to mono if needed
        if audio_data.ndim > 1:
            audio_data = np.mean(audio_data, axis=1)
        # Normalize audio
        audio_data = librosa.util.normalize(audio_data)
        # Classify emotion; pass the sampling rate so the pipeline can resample if needed
        emotion = self.emotion_classifier(
            {"raw": audio_data, "sampling_rate": self.sample_rate}
        )
        return audio_data, emotion[0]["label"]

    def enhance_audio(self, audio_data):
        """Enhance audio quality."""
        # Pre-emphasis boosts high frequencies (simple noise/low-rumble mitigation)
        y = librosa.effects.preemphasis(audio_data)
        # Normalize
        y = librosa.util.normalize(y)
        return y
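

# Minimal usage sketch (illustrative addition, not part of the original file):
# records a short clip, enhances it, and prints the predicted emotion label.
# Assumes a working default input device and that the model weights can be
# downloaded from the Hugging Face Hub.
if __name__ == "__main__":
    handler = VoiceHandler()
    audio = handler.record_audio(duration=3)
    # Squeeze the (samples, 1) recording to mono before enhancement
    cleaned = handler.enhance_audio(np.squeeze(audio))
    _, label = handler.process_audio(cleaned)
    print(f"Detected emotion: {label}")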