Spaces:
Sleeping
Sleeping
| """Shared audio preprocessing utilities.""" | |
| import numpy as np | |
| import librosa | |
# ── Constants ────────────────────────────────────────────────────────────────
SAMPLE_RATE = 16_000  # all models normalized to 16 kHz
WINDOW_SECONDS = 1.0  # inference window size
SILENCE_RMS_THRESHOLD = 0.001  # skip silent frames (low for phone speaker playback)
HOP_LENGTH = 512  # STFT hop length, in samples
N_FFT = 1024  # STFT window size, in samples
N_MELS = 128  # mel bands used for spectrogram features
N_MFCC = 40  # number of MFCC coefficients extracted
def resample(audio_np: np.ndarray, from_sr: int, to_sr: int) -> np.ndarray:
    """Resample *audio_np* from sample rate *from_sr* to *to_sr*.

    Returns the input array unchanged when the rates already match,
    skipping the (comparatively expensive) librosa call.
    """
    if from_sr != to_sr:
        return librosa.resample(audio_np, orig_sr=from_sr, target_sr=to_sr)
    return audio_np
def extract_mfcc_features(
    audio_np: np.ndarray,
    sr: int,
    n_mels: int = N_MELS,
) -> np.ndarray:
    """Return a feature vector (MFCCs + chroma + mel + contrast + tonnetz mean).

    Concatenation order matches foduucom/baby-cry-classification training code.
    ``n_mels`` can be overridden when the SVC model was trained with a different
    mel band count.
    """
    # The magnitude STFT is shared by the chroma and spectral-contrast features.
    magnitude = np.abs(librosa.stft(audio_np, n_fft=N_FFT, hop_length=HOP_LENGTH))

    # 40 MFCC coefficients, averaged over time -> (40,)
    mfcc_vec = librosa.feature.mfcc(y=audio_np, sr=sr, n_mfcc=N_MFCC).mean(axis=1)

    # 12 chroma bins -> (12,)
    chroma_vec = librosa.feature.chroma_stft(S=magnitude, sr=sr).mean(axis=1)

    # Mel spectrogram summary -> (n_mels,)
    mel_vec = librosa.feature.melspectrogram(
        y=audio_np, sr=sr, n_fft=N_FFT, hop_length=HOP_LENGTH, n_mels=n_mels,
    ).mean(axis=1)

    # Spectral contrast: 6 bands + DC band -> (7,)
    contrast_vec = librosa.feature.spectral_contrast(
        S=magnitude, sr=sr, n_bands=6, fmin=200.0,
    ).mean(axis=1)

    # Tonnetz on the harmonic component only -> (6,)
    tonnetz_vec = librosa.feature.tonnetz(
        y=librosa.effects.harmonic(audio_np), sr=sr,
    ).mean(axis=1)

    # Order: mfcc, chroma, mel, contrast, tonnetz (matches foduucom training).
    return np.concatenate([mfcc_vec, chroma_vec, mel_vec, contrast_vec, tonnetz_vec])
def extract_mel_spectrogram(audio_np: np.ndarray, sr: int) -> np.ndarray:
    """Return a log-mel spectrogram of shape (128, T) as float32.

    Power values are converted to decibels relative to the spectrogram's
    own peak (``ref=np.max``).
    """
    power_spec = librosa.feature.melspectrogram(
        y=audio_np,
        sr=sr,
        n_fft=N_FFT,
        hop_length=HOP_LENGTH,
        n_mels=N_MELS,
    )
    db_spec = librosa.power_to_db(power_spec, ref=np.max)
    return db_spec.astype(np.float32)
def is_silent(audio_np: np.ndarray) -> bool:
    """Return True when audio RMS is below the silence threshold.

    An empty window is treated as silent.  The explicit size guard matters:
    ``np.mean`` of an empty array is NaN (with a RuntimeWarning), and
    ``NaN < threshold`` is False, so without the guard an empty window would
    be mis-classified as non-silent.
    """
    if audio_np.size == 0:
        return True
    rms = np.sqrt(np.mean(audio_np ** 2))
    return rms < SILENCE_RMS_THRESHOLD
def compute_rms(audio_np: np.ndarray) -> float:
    """Return the RMS energy of the audio window as a Python float.

    Returns 0.0 for an empty window: without the guard, ``np.mean`` of an
    empty array yields NaN and emits a RuntimeWarning, which would propagate
    NaN into downstream energy comparisons.
    """
    if audio_np.size == 0:
        return 0.0
    return float(np.sqrt(np.mean(audio_np ** 2)))
def normalize_audio(audio_np: np.ndarray) -> np.ndarray:
    """Peak-normalize audio to [-1, 1].

    Crucial when playing cry samples through a phone speaker -> laptop mic,
    since the captured signal can be very quiet and models perform poorly
    on low-amplitude inputs.
    """
    peak = np.abs(audio_np).max()
    # Effectively-silent input: return it untouched rather than amplifying
    # noise (and risking a divide-by-near-zero blow-up).
    if peak < 1e-6:
        return audio_np
    return audio_np / peak