import math

import librosa
import numpy as np
import pyworld

from filler_count.filler_score import analyze_fillers
# Your file where calc_srs, calculate_pas, calculate_rcs, calculate_vps live
from .vps import calculate_vps


def _semitone_spread(f0: np.ndarray) -> float:
    """Return the standard deviation of voiced pitch, in semitones.

    Frames with ``f0 <= 0`` are treated as unvoiced and dropped. The remaining
    values are trimmed to those strictly between their 5th and 95th
    percentiles, then expressed in semitones relative to the median.

    Returns 0.0 when no voiced frames exist or when trimming leaves nothing.
    """
    voiced = f0[f0 > 0]
    # Guard before np.percentile: it is not defined on an empty array, and the
    # intended fallback for "no usable pitch" is 0.0.
    if voiced.size == 0:
        return 0.0

    low, high = np.percentile(voiced, [5, 95])
    trimmed = voiced[(voiced > low) & (voiced < high)]
    if trimmed.size == 0:
        return 0.0

    # Floor the reference so the ratio below can never divide by zero.
    reference = max(np.median(trimmed), 1e-6)
    semitones = 12 * np.log2(trimmed / reference)
    return float(np.std(semitones))


def _pitch_variation(y: np.ndarray, sr: int) -> float:
    """Estimate pitch variation (semitones) of a waveform using pyworld."""
    # pyworld is fed float64 samples; convert once and reuse for both calls.
    samples = y.astype(np.float64)
    # Frame period is given in milliseconds; this equals a 256-sample hop.
    frame_period_ms = 1000 * 256 / sr
    coarse_f0, times = pyworld.harvest(
        samples,
        sr,
        f0_floor=80.0,
        f0_ceil=400.0,
        frame_period=frame_period_ms,
    )
    refined_f0 = pyworld.stonemask(samples, coarse_f0, times, sr)
    return _semitone_spread(refined_f0)


def _count_long_pauses(segments, duration: float, threshold: float = 1.0) -> int:
    """Count silences longer than ``threshold`` seconds.

    Considers the gap between each pair of consecutive segments, the lead-in
    before the first segment, and the tail between the last segment's end and
    ``duration``. Each segment is read through its ``"start"`` and ``"end"``
    keys.
    """
    if not segments:
        return 0

    count = sum(
        1
        for current, following in zip(segments, segments[1:])
        if following["start"] - current["end"] > threshold
    )
    # Silence at the beginning and at the end of the recording.
    if segments[0]["start"] > threshold:
        count += 1
    if duration - segments[-1]["end"] > threshold:
        count += 1
    return count


def compute_vps_score(file_path: str, whisper_model) -> dict:
    """
    Compute VPS (Voice Pacing Score) and its components from a speech sample.

    The audio is transcribed, filler words are counted, and pitch variation,
    long pauses and words-per-minute are measured; all of these are then
    handed to ``calculate_vps``, whose result is returned unchanged.

    Args:
        file_path (str): Path to the audio file.
        whisper_model: Transcription model (e.g., OpenAI Whisper or
            faster-whisper). Its ``transcribe`` result must provide ``"text"``
            and ``"segments"`` entries via ``.get``.

    Returns:
        dict: A dictionary containing VPS, SRS, PAS, RCS, and component scores
        (as produced by ``calculate_vps``).

    Raises:
        ValueError: If the transcript or segment list is empty, or if the
            loaded audio has a non-positive duration.
    """
    # Transcribe
    transcription = whisper_model.transcribe(
        file_path, word_timestamps=False, fp16=False
    )
    transcript = transcription.get("text", "").strip()
    segments = transcription.get("segments", [])

    # Validate early
    if not transcript or not segments:
        raise ValueError("Empty transcript or segments from Whisper.")

    # Filler count (kept in its own name so the transcription is not shadowed)
    filler_report = analyze_fillers(file_path, 'base', transcript)
    filler_count = filler_report.get("filler_count", 0)

    # Load audio at its native sample rate
    y, sr = librosa.load(file_path, sr=None)
    duration = len(y) / sr if sr else 0.0
    if duration <= 0:
        raise ValueError("Audio duration invalid or zero.")

    pitch_variation = _pitch_variation(y, sr)
    long_pause_count = _count_long_pauses(segments, duration)

    # WPM; duration is known to be positive after the check above.
    word_count = len(transcript.split())
    words_per_min = (word_count / duration) * 60.0

    # Calculate VPS and components
    return calculate_vps(
        transcript=transcript,
        segments=segments,
        filler_count=filler_count,
        duration=duration,
        wpm=words_per_min,
        long_pause_count=long_pause_count,
        pitch_variation=pitch_variation,
        y=y,
        sr=sr,
    )