# MentalHealthVocalBiomarkers / models / audio_processor.py
# Uploaded by invincible-jha ("Upload 4 files", commit d894230, verified)
import librosa
import numpy as np
from typing import Dict, Tuple
class AudioProcessor:
    """Extract vocal features (MFCC, pitch, energy) from an audio file.

    Intended as the feature-extraction front end for vocal biomarker
    models: audio is resampled to a fixed rate on load, then summarized
    into fixed-size statistics suitable for downstream classifiers.
    """

    def __init__(self, sample_rate: int = 16000, n_mfcc: int = 13,
                 n_mels: int = 128):
        """Configure extraction parameters.

        Args:
            sample_rate: Target sample rate (Hz) audio is resampled to.
            n_mfcc: Number of MFCC coefficients to extract.
            n_mels: Number of mel bands (reserved for mel-spectrogram use;
                not referenced by the current extractors).
        """
        self.sample_rate = sample_rate
        self.n_mfcc = n_mfcc
        self.n_mels = n_mels

    def process_audio(self, audio_path: str) -> Tuple[np.ndarray, Dict]:
        """Load an audio file and extract all features.

        Args:
            audio_path: Path to an audio file readable by librosa.

        Returns:
            Tuple of (waveform resampled to ``self.sample_rate``,
            dict with keys 'mfcc', 'pitch', 'energy').
        """
        # librosa resamples to the target rate and mixes down to mono.
        waveform, sr = librosa.load(audio_path, sr=self.sample_rate)
        features = {
            'mfcc': self._extract_mfcc(waveform),
            'pitch': self._extract_pitch(waveform),
            'energy': self._extract_energy(waveform),
        }
        return waveform, features

    def _extract_mfcc(self, waveform: np.ndarray) -> np.ndarray:
        """Return the per-coefficient time-averaged MFCC vector (n_mfcc,)."""
        mfccs = librosa.feature.mfcc(
            y=waveform,
            sr=self.sample_rate,
            n_mfcc=self.n_mfcc,
        )
        # Collapse the time axis to a fixed-size summary vector.
        return mfccs.mean(axis=1)

    def _extract_pitch(self, waveform: np.ndarray) -> Dict:
        """Return summary statistics of the fundamental frequency (f0).

        Uses pYIN over the C2–C7 range; unvoiced frames are NaN in the
        f0 track, so statistics are computed over voiced frames only.

        Returns:
            Dict with 'mean', 'std', 'max', 'min' in Hz. All zeros when
            no voiced frames are detected (e.g. silence), instead of the
            NaNs that nan-reductions would otherwise propagate.
        """
        f0, voiced_flag, voiced_probs = librosa.pyin(
            waveform,
            fmin=librosa.note_to_hz('C2'),
            fmax=librosa.note_to_hz('C7'),
            sr=self.sample_rate,
        )
        # Guard: pyin yields an all-NaN f0 track for fully unvoiced audio;
        # nanmax/nanmin would warn and return NaN, poisoning downstream use.
        voiced = f0[~np.isnan(f0)]
        if voiced.size == 0:
            return {'mean': 0.0, 'std': 0.0, 'max': 0.0, 'min': 0.0}
        return {
            'mean': float(np.mean(voiced)),
            'std': float(np.std(voiced)),
            'max': float(np.max(voiced)),
            'min': float(np.min(voiced)),
        }

    def _extract_energy(self, waveform: np.ndarray) -> Dict:
        """Return summary statistics of frame-wise RMS energy.

        Returns:
            Dict with 'mean', 'std', 'max', 'min' of the RMS envelope.
            All zeros for empty input (np.max would raise on an empty
            array otherwise).
        """
        rms = librosa.feature.rms(y=waveform)[0]
        if rms.size == 0:
            return {'mean': 0.0, 'std': 0.0, 'max': 0.0, 'min': 0.0}
        return {
            'mean': float(np.mean(rms)),
            'std': float(np.std(rms)),
            'max': float(np.max(rms)),
            'min': float(np.min(rms)),
        }