akhanriz
/

whisper-vm-finetune

Audio Classification

voicemail detection

answering machine detection

Model card Files Files and versions

whisper-vm-finetune / scripts /utils.py

akhanriz's picture

initial commit

e112632 verified 13 days ago

history blame contribute delete

1.65 kB

	\
	import re, io, os, numpy as np, soundfile as sf
	from pydub import AudioSegment

	DEFAULT_SAMPLE_RATE = int(os.getenv("SAMPLE_RATE", "16000"))
	MAX_SEC = 2.0
	N_SAMPLES = int(DEFAULT_SAMPLE_RATE*MAX_SEC)

	MACHINE_KEYWORDS = [
	"pound key","press pound","record","please leave","leave a message",
	"can't come","can't get","forwarded","sorry i miss","sorry i missed",
	"we are not","not available","we're unable","we are unable",
	"can't take","can't answer","voice mail","voicemail","leave your name",
	"after the tone","at the tone","beep"
	]
	HUMAN_KEYWORDS = [
	"speaking","this is","how can i help","yeah","yes","okay","just a sec"
	]
	IGNORE_WORDS = ["hi","hello"] # ignored globally

	def normalize_text(t: str) -> str:
	if not t:
	return ""
	t = t.lower().strip()
	# remove ignore words as isolated tokens
	for w in IGNORE_WORDS:
	t = re.sub(rf"\b{re.escape(w)}\b", " ", t)
	t = re.sub(r"\s+", " ", t)
	return t.strip()

	def keyword_hits(text: str, vocab) -> list[str]:
	hits = []
	for kw in vocab:
	if kw in text:
	hits.append(kw)
	return hits

	def wav_bytes_to_float32(samples_bytes: bytes, sample_rate: int = DEFAULT_SAMPLE_RATE) -> np.ndarray:
	"""Decode linear PCM16 bytes to float32 [-1,1]."""
	arr = np.frombuffer(samples_bytes, dtype=np.int16).astype(np.float32) / 32768.0
	return arr

	def ensure_2s(buf: np.ndarray, sample_rate: int = DEFAULT_SAMPLE_RATE) -> np.ndarray:
	need = int(sample_rate*MAX_SEC)
	if buf.shape[0] >= need:
	return buf[:need]
	out = np.zeros((need,), dtype=np.float32)
	out[:buf.shape[0]] = buf
	return out