Fast_api / vps /vps.py
mulasagg's picture
Add application file
8ad2ab3
from typing import List, Dict
import librosa
import numpy as np
import spacy
import math
from .filler_analyzer import detect_fillers
def calc_srs(wpm: float, filler_count: int, long_pause_count: int, pitch_variation: float) -> float:
    """
    Score Speech Rate Stability (SRS) on a 0-100 scale.

    The score blends two parts:
    - a pace part (45%): 100 minus 1.67 points for every word-per-minute the
      speaker is away from 150 wpm, with the deviation capped at 30 wpm;
    - a steadiness part (55%): 100 scaled down by the mean of three penalties,
      each saturating at 1.0 (fillers / 10, long pauses / 5, pitch variation / 3.0).

    Args:
        wpm (float): Words per minute.
        filler_count (int): Number of filler words.
        long_pause_count (int): Number of long pauses.
        pitch_variation (float): Pitch variation measure.
    Returns:
        float: The blended score, clamped to the range 0..100.
    """
    # Pace part: the deviation from the target rate is capped, so this term
    # never drops below 100 - 30 * 1.67.
    target_wpm = 150
    capped_gap = min(30, abs(wpm - target_wpm))
    pace_score = max(0, 100 - (capped_gap * 1.67))

    # Steadiness part: every penalty is limited to 1.0 before averaging.
    fillers = min(filler_count / 10, 1.0)
    pauses = min(long_pause_count / 5, 1.0)
    pitch = min(pitch_variation / 3.0, 1.0)
    mean_penalty = (fillers + pauses + pitch) / 3
    steadiness = (1 - mean_penalty) * 100

    blended = (0.45 * pace_score) + (0.55 * steadiness)
    return min(100, max(0, blended))
def calculate_pas(transcript: str, segments: List[Dict], filler_count: int, duration: float) -> Dict[str, float]:
    """
    Calculate the Pause Appropriateness Score (PAS) and its components.

    PAS = (0.4 * NPP) + (0.6 * AFW), where
    - NPP is the percentage of pauses (silences longer than 0.5 s) that
      directly follow a segment ending in punctuation; it is 100 when no
      pause is found.
    - AFW is a filler-word score that falls linearly from 100 at a filler
      rate of 0 to 0 at a filler rate of 10% (fillers per transcript word).

    Args:
        transcript (str): Transcribed text; words are counted by whitespace.
        segments (List[Dict]): Segments with 'start', 'end' and 'text' keys.
        filler_count (int): Number of filler words.
        duration (float): Audio duration, compared against segment end times.
    Returns:
        Dict[str, float]: The keys "NPP", "AFW" and "PAS".
    Raises:
        ValueError: If transcript or segments are empty, duration is not
            positive, or the transcript contains no words.
    """
    if not transcript or not segments or duration <= 0:
        raise ValueError("Transcript, segments, and duration must be valid")

    # Word counting only needs whitespace splitting; no NLP model is required
    # for any value computed here, so none is loaded.
    total_words = len(transcript.split())
    if total_words == 0:
        raise ValueError("No words found in transcript")

    # Linear ramp 100 -> 0 over filler rates 0 -> 0.10, clamped at both ends.
    filler_rate = filler_count / total_words
    afw = max(0.0, min(100.0, 100.0 - (filler_rate * 1000)))

    min_pause = 0.5
    texts = [seg["text"].strip() for seg in segments]
    starts = [seg["start"] for seg in segments]
    ends = [seg["end"] for seg in segments]

    total_pauses = 0
    natural_pauses = 0

    # Gaps between consecutive segments: natural when the segment before the
    # gap ends with sentence punctuation or a comma.
    for text, end, next_start in zip(texts, ends, starts[1:]):
        if next_start - end > min_pause:
            total_pauses += 1
            if text.endswith((".", "!", "?", ",")):
                natural_pauses += 1

    # Leading silence counts as a pause but is never treated as natural.
    if starts[0] > min_pause:
        total_pauses += 1

    # Trailing silence is natural only after sentence-final punctuation
    # (a comma does not qualify here, unlike between segments).
    if duration - ends[-1] > min_pause:
        total_pauses += 1
        if texts[-1].endswith((".", "!", "?")):
            natural_pauses += 1

    npp = 100.0 if total_pauses == 0 else (natural_pauses / total_pauses) * 100.0
    pas = (0.4 * npp) + (0.6 * afw)
    return {
        "NPP": npp,
        "AFW": afw,
        "PAS": pas
    }
def calculate_rcs(y: np.ndarray, sr: int, segments: List[Dict], duration: float) -> Dict[str, float]:
    """
    Calculate the Rhythm Consistency Score (RCS) and its components.

    RCS = (0.5 * STR) + (0.5 * STW), where
    - STR is derived from the standard deviation of the intervals between
      detected onsets: the deviation is clamped to [0.1, 0.5] and mapped
      linearly so that 0.1 gives 100 and 0.5 gives 0. With fewer than two
      onsets STR is 100.
    - STW is the percentage of transitions counted as smooth. A boundary
      between two segments is smooth when the gap is at most 0.3; every
      word-to-word step inside a segment is counted as smooth.

    Args:
        y (np.ndarray): Audio signal.
        sr (int): Sampling rate.
        segments (List[Dict]): Segments with 'start', 'end' and 'text' keys.
        duration (float): Audio duration; only validated, not used in the score.
    Returns:
        Dict[str, float]: The keys "STR", "STW" and "RCS".
    Raises:
        ValueError: If the signal or segments are empty, or sr/duration are
            not positive.
    """
    if y.size == 0 or sr <= 0 or duration <= 0 or not segments:
        raise ValueError("Audio signal, sampling rate, duration, and segments must be valid")

    # Onset times from the onset-strength envelope (same hop for both calls).
    hop = 256
    envelope = librosa.onset.onset_strength(y=y, sr=sr, hop_length=hop)
    onset_times = librosa.onset.onset_detect(onset_envelope=envelope, sr=sr, units='time', hop_length=hop)

    # A single onset (or none) gives no intervals to measure, so the score
    # stays at its maximum.
    str_score = 100.0
    if len(onset_times) > 1:
        spread = np.std(np.diff(onset_times))
        spread = min(max(spread, 0.1), 0.5)
        str_score = 100.0 * (0.5 - spread) / (0.5 - 0.1)
        str_score = max(0.0, min(100.0, str_score))

    pause_threshold = 0.3
    transition_count = 0
    smooth_count = 0

    # Boundaries between neighbouring segments.
    for current, following in zip(segments, segments[1:]):
        transition_count += 1
        if following["start"] - current["end"] <= pause_threshold:
            smooth_count += 1

    # Word-to-word steps inside each segment always count as smooth.
    for seg in segments:
        inner_steps = len(seg["text"].strip().split()) - 1
        if inner_steps > 0:
            smooth_count += inner_steps
            transition_count += inner_steps

    if transition_count == 0:
        stw = 100.0
    else:
        stw = (smooth_count / transition_count) * 100.0

    rcs = (0.5 * str_score) + (0.5 * stw)
    return {
        "STR": str_score,
        "STW": stw,
        "RCS": rcs
    }
def calculate_vps(
    transcript: str,
    segments: List[Dict],
    filler_count: int,
    duration: float,
    wpm: float,
    long_pause_count: int,
    pitch_variation: float,
    y: np.ndarray,
    sr: int
) -> Dict[str, float]:
    """
    Compute the Voice Pacing Score (VPS) together with every sub-score.

    The three component scores are produced by the sibling functions and
    combined as VPS = (0.5 * SRS) + (0.3 * PAS) + (0.2 * RCS), then clamped
    to the range 0..100:
    - SRS: Speech Rate Stability Score (from calc_srs)
    - PAS: Pause Appropriateness Score, with parts NPP and AFW (from calculate_pas)
    - RCS: Rhythm Consistency Score, with parts STR and STW (from calculate_rcs)

    Args:
        transcript (str): Transcribed text.
        segments (List[Dict]): Whisper model segments with 'start', 'end', 'text'.
        filler_count (int): Number of filler words.
        duration (float): Audio duration (seconds).
        wpm (float): Words per minute.
        long_pause_count (int): Number of long pauses (>1.0s).
        pitch_variation (float): Pitch variation in semitones.
        y (np.ndarray): Audio signal.
        sr (int): Sampling rate.
    Returns:
        Dict[str, float]: The keys "SRS", "PAS", "NPP", "AFW", "RCS", "STR",
        "STW" and "VPS".
    Raises:
        ValueError: If transcript, segments or the audio signal are empty, or
            duration/sr are not positive.
    """
    # Reject unusable input before any scoring work starts.
    if not transcript or not segments:
        raise ValueError("Invalid inputs")
    if duration <= 0 or y.size == 0 or sr <= 0:
        raise ValueError("Invalid inputs")

    # Speech rate stability.
    srs = calc_srs(wpm, filler_count, long_pause_count, pitch_variation)

    # Pause appropriateness and its two parts.
    pause_scores = calculate_pas(transcript, segments, filler_count, duration)
    pas, npp, afw = pause_scores["PAS"], pause_scores["NPP"], pause_scores["AFW"]

    # Rhythm consistency and its two parts.
    rhythm_scores = calculate_rcs(y, sr, segments, duration)
    rcs, str_score, stw = rhythm_scores["RCS"], rhythm_scores["STR"], rhythm_scores["STW"]

    # Weighted blend, kept inside 0..100.
    weighted = (0.5 * srs) + (0.3 * pas) + (0.2 * rcs)
    vps = max(0.0, min(100.0, weighted))

    return {
        "SRS": srs,
        "PAS": pas,
        "NPP": npp,
        "AFW": afw,
        "RCS": rcs,
        "STR": str_score,
        "STW": stw,
        "VPS": vps
    }