akhanriz's picture
initial commit
e112632 verified
\
import re, io, os, numpy as np, soundfile as sf
from pydub import AudioSegment
# Sample rate in Hz; overridable through the SAMPLE_RATE environment variable.
DEFAULT_SAMPLE_RATE = int(os.environ.get("SAMPLE_RATE", "16000"))

# Length, in seconds, of the fixed analysis window.
MAX_SEC = 2.0

# Number of samples in one MAX_SEC window at the default sample rate.
N_SAMPLES = int(DEFAULT_SAMPLE_RATE * MAX_SEC)
# Phrases taken as a sign of an answering machine / voicemail greeting.
# Some entries are deliberate prefixes of others (e.g. "sorry i miss").
MACHINE_KEYWORDS = [
    "pound key",
    "press pound",
    "record",
    "please leave",
    "leave a message",
    "can't come",
    "can't get",
    "forwarded",
    "sorry i miss",
    "sorry i missed",
    "we are not",
    "not available",
    "we're unable",
    "we are unable",
    "can't take",
    "can't answer",
    "voice mail",
    "voicemail",
    "leave your name",
    "after the tone",
    "at the tone",
    "beep",
]

# Phrases taken as a sign that a live person answered.
HUMAN_KEYWORDS = [
    "speaking",
    "this is",
    "how can i help",
    "yeah",
    "yes",
    "okay",
    "just a sec",
]

# Greeting tokens that normalize_text() strips before any keyword matching.
IGNORE_WORDS = [
    "hi",
    "hello",
]  # ignored globally
def normalize_text(t: str) -> str:
    """Lower-case *t*, drop the ignore words, and collapse whitespace.

    Each entry of ``IGNORE_WORDS`` is removed only where it appears as a
    whole word (regex ``\\b`` boundaries), so e.g. "hi" inside "this" is kept.
    A falsy input (``None`` or ``""``) yields the empty string.
    """
    if not t:
        return ""

    cleaned = t.lower().strip()

    # Replace each ignore word with a space so its neighbours do not fuse.
    for word in IGNORE_WORDS:
        pattern = rf"\b{re.escape(word)}\b"
        cleaned = re.sub(pattern, " ", cleaned)

    # Squeeze the whitespace runs left behind by the removals.
    collapsed = re.sub(r"\s+", " ", cleaned)
    return collapsed.strip()
def keyword_hits(text: str, vocab) -> list[str]:
    """Return every entry of *vocab* that occurs in *text* as a substring.

    Matching is a plain ``in`` test (no word boundaries), so "yes" also
    matches inside "yesterday". Hits are returned in *vocab* order.
    """
    return [phrase for phrase in vocab if phrase in text]
def wav_bytes_to_float32(samples_bytes: bytes, sample_rate: int = DEFAULT_SAMPLE_RATE) -> np.ndarray:
    """Decode raw linear PCM16 bytes to float32 samples in [-1, 1).

    The input is interpreted as headerless 16-bit signed integers in the
    platform's native byte order; no WAV container parsing is done.

    Args:
        samples_bytes: Raw PCM16 payload. If its length is odd (e.g. a stream
            chunk cut in the middle of a sample), the trailing partial sample
            is dropped instead of raising ``ValueError``.
        sample_rate: Accepted for signature compatibility; it does not affect
            the decoding.

    Returns:
        A 1-D float32 array with one value per complete 16-bit sample
        (empty when fewer than two bytes are supplied).
    """
    # np.frombuffer requires the buffer size to be a multiple of the element
    # size, so only hand it the bytes that form whole 16-bit samples.
    usable = len(samples_bytes) - (len(samples_bytes) % 2)
    if usable != len(samples_bytes):
        samples_bytes = memoryview(samples_bytes)[:usable]

    pcm = np.frombuffer(samples_bytes, dtype=np.int16)
    # Divide by 2**15 so the int16 range maps onto [-1.0, 1.0).
    return pcm.astype(np.float32) / 32768.0
def ensure_2s(buf: np.ndarray, sample_rate: int = DEFAULT_SAMPLE_RATE) -> np.ndarray:
    """Force *buf* to exactly ``int(sample_rate * MAX_SEC)`` samples.

    A buffer that is already long enough is cut to its leading samples and
    returned as a slice of the input (dtype unchanged). A shorter buffer is
    copied to the start of a new float32 array and right-padded with zeros.
    """
    target_len = int(sample_rate * MAX_SEC)
    available = buf.shape[0]

    if available < target_len:
        # Too short: zero-pad on the right up to the target length.
        padded = np.zeros((target_len,), dtype=np.float32)
        padded[:available] = buf
        return padded

    # Long enough: keep only the leading window.
    return buf[:target_len]