# tot-talk/audio/preprocess.py
# Initial commit: real-time multi-model baby cry classifier (ea2601f)
"""Shared audio preprocessing utilities."""
import numpy as np
import librosa
# ── Constants ──────────────────────────────────────────────────────────────────
SAMPLE_RATE = 16_000 # all models normalized to 16 kHz
WINDOW_SECONDS = 1.0 # inference window size
SILENCE_RMS_THRESHOLD = 0.001 # skip silent frames (low for phone speaker playback)
HOP_LENGTH = 512 # STFT hop length (samples between frames)
N_FFT = 1024 # STFT / mel-spectrogram FFT window size
N_MELS = 128 # mel bands used by the spectrogram features
N_MFCC = 40 # MFCC coefficients in extract_mfcc_features
def resample(audio_np: np.ndarray, from_sr: int, to_sr: int) -> np.ndarray:
    """Resample *audio_np* from *from_sr* Hz to *to_sr* Hz via librosa.

    Returns the input array unchanged when the two rates already match.
    """
    if from_sr != to_sr:
        return librosa.resample(audio_np, orig_sr=from_sr, target_sr=to_sr)
    return audio_np
def extract_mfcc_features(
    audio_np: np.ndarray,
    sr: int,
    n_mels: int = N_MELS,
) -> np.ndarray:
    """Return a feature vector (MFCCs + chroma + mel + contrast + tonnetz mean).
    Concatenation order matches foduucom/baby-cry-classification training code.
    ``n_mels`` can be overridden when the SVC model was trained with a different
    mel band count.
    """
    # Magnitude STFT is shared by the chroma and spectral-contrast features.
    magnitude = np.abs(librosa.stft(audio_np, n_fft=N_FFT, hop_length=HOP_LENGTH))

    # Order: mfcc, chroma, mel, contrast, tonnetz (matches foduucom training)
    feature_means = [
        # MFCCs — 40 coefficients, averaged over time → (40,)
        np.mean(librosa.feature.mfcc(y=audio_np, sr=sr, n_mfcc=N_MFCC), axis=1),
        # Chroma — 12 pitch-class bins → (12,)
        np.mean(librosa.feature.chroma_stft(S=magnitude, sr=sr), axis=1),
        # Mel spectrogram summary → (n_mels,)
        np.mean(
            librosa.feature.melspectrogram(
                y=audio_np, sr=sr, n_fft=N_FFT, hop_length=HOP_LENGTH, n_mels=n_mels,
            ),
            axis=1,
        ),
        # Spectral contrast — n_bands + 1 = 7 values → (7,)
        np.mean(
            librosa.feature.spectral_contrast(S=magnitude, sr=sr, n_bands=6, fmin=200.0),
            axis=1,
        ),
        # Tonnetz — 6 tonal-centroid dims on the harmonic component → (6,)
        np.mean(
            librosa.feature.tonnetz(y=librosa.effects.harmonic(audio_np), sr=sr),
            axis=1,
        ),
    ]
    return np.concatenate(feature_means)
def extract_mel_spectrogram(audio_np: np.ndarray, sr: int) -> np.ndarray:
    """Return a mel spectrogram of shape (128, T) as float32, in dB."""
    power_mel = librosa.feature.melspectrogram(
        y=audio_np,
        sr=sr,
        n_fft=N_FFT,
        hop_length=HOP_LENGTH,
        n_mels=N_MELS,
    )
    # Convert power to decibels relative to the loudest bin.
    db_mel = librosa.power_to_db(power_mel, ref=np.max)
    return db_mel.astype(np.float32)
def is_silent(audio_np: np.ndarray) -> bool:
    """Return True when audio RMS is below SILENCE_RMS_THRESHOLD.

    An empty window is treated as silent: without the guard, ``np.mean`` of a
    zero-size array returns NaN (with a RuntimeWarning), and ``NaN < threshold``
    is False, so empty input would incorrectly be reported as non-silent.
    """
    if audio_np.size == 0:
        return True
    rms = np.sqrt(np.mean(audio_np ** 2))
    return rms < SILENCE_RMS_THRESHOLD
def compute_rms(audio_np: np.ndarray) -> float:
    """Return the RMS energy of the audio window (0.0 for an empty window).

    The empty guard avoids ``np.mean`` of a zero-size array, which returns NaN
    and emits a RuntimeWarning; an empty window carries no energy.
    """
    if audio_np.size == 0:
        return 0.0
    return float(np.sqrt(np.mean(audio_np ** 2)))
def normalize_audio(audio_np: np.ndarray) -> np.ndarray:
    """Peak-normalize audio to [-1, 1].

    Crucial when playing cry samples through a phone speaker → laptop mic,
    since the captured signal can be very quiet and models perform poorly
    on low-amplitude inputs.

    Empty input is returned unchanged — ``np.max`` on a zero-size array raises
    ValueError. Near-silent input (peak < 1e-6) is also returned unchanged to
    avoid amplifying pure noise (and dividing by ~0).
    """
    if audio_np.size == 0:
        return audio_np
    peak = np.max(np.abs(audio_np))
    if peak < 1e-6:
        return audio_np
    return audio_np / peak