# File size: 6,031 Bytes
# 8ad2ab3
from typing import List, Dict
import librosa
import numpy as np
import spacy
import math
from .filler_analyzer import detect_fillers
def calc_srs(wpm: float, filler_count: int, long_pause_count: int, pitch_variation: float) -> float:
    """
    Speech Rate Stability (SRS): rate how steady the speaker's pace and rhythm are.

    The score blends two parts:
    - pace consistency (45%): starts at 100 and loses 1.67 points per WPM of
      distance from 150 WPM, with the distance capped at 30 WPM;
    - stability (55%): 100 minus the average of three penalties, each capped
      at 1.0 (fillers / 10, long pauses / 5, pitch variation / 3.0).

    Args:
        wpm (float): Words per minute.
        filler_count (int): Number of filler words.
        long_pause_count (int): Number of long pauses.
        pitch_variation (float): Pitch variation measure.

    Returns:
        float: SRS clamped to the range 0-100.
    """
    target_wpm = 150
    max_counted_deviation = 30

    # Pace component: only the first 30 WPM of deviation are penalised.
    distance_from_target = abs(wpm - target_wpm)
    counted_deviation = min(max_counted_deviation, distance_from_target)
    pace_score = max(0, 100 - (counted_deviation * 1.67))

    # Stability component: each penalty saturates at 1.0.
    fillers = min(filler_count / 10, 1.0)
    pauses = min(long_pause_count / 5, 1.0)
    pitch = min(pitch_variation / 3.0, 1.0)
    mean_penalty = (fillers + pauses + pitch) / 3
    stability_score = (1 - mean_penalty) * 100

    blended = (0.45 * pace_score) + (0.55 * stability_score)
    floored = max(0, blended)
    return min(100, floored)
def calculate_pas(transcript: str, segments: List[Dict], filler_count: int, duration: float) -> Dict[str, float]:
    """
    Calculate the Pause Appropriateness Score (PAS) and its components.

    PAS = (0.4 * NPP) + (0.6 * AFW), where:
    - NPP (natural pause percentage) is the share of pauses longer than 0.5 s
      that follow a segment ending in punctuation. It is 100 when no pause
      is found.
    - AFW (filler-word component) falls linearly from 100 at a filler rate of
      0 to 0 at a filler rate of 10% or more of the transcript's words.

    Args:
        transcript (str): Transcribed text; must contain at least one word.
        segments (List[Dict]): Segments with 'start', 'end' and 'text' keys.
        filler_count (int): Number of filler words.
        duration (float): Audio duration, in the same time unit as the
            segment 'start'/'end' values.

    Returns:
        Dict[str, float]: The keys "NPP", "AFW" and "PAS".

    Raises:
        ValueError: If transcript or segments are empty, duration is not
            positive, or the transcript contains no words.
    """
    if not transcript or not segments or duration <= 0:
        raise ValueError("Transcript, segments, and duration must be valid")

    # Only the word count is needed here, so whitespace splitting is enough;
    # no NLP model has to be loaded for this score.
    total_words = len(transcript.split())
    if total_words == 0:
        raise ValueError("No words found in transcript")

    # AFW: 10 points are lost per percentage point of filler words.
    filler_rate = filler_count / total_words
    if filler_rate >= 0.10:
        afw = 0.0
    elif filler_rate <= 0.0:
        afw = 100.0
    else:
        afw = max(0.0, min(100.0, 100.0 - (filler_rate * 1000)))

    pause_threshold = 0.5
    texts = [seg["text"].strip() for seg in segments]
    starts = [seg["start"] for seg in segments]
    ends = [seg["end"] for seg in segments]

    total_pauses = 0
    natural_pauses = 0

    # Pauses between consecutive segments: natural when the segment before
    # the gap ends with sentence or clause punctuation.
    for text, end, next_start in zip(texts, ends, starts[1:]):
        if next_start - end > pause_threshold:
            total_pauses += 1
            if text and text[-1] in ".!?,":
                natural_pauses += 1

    # Silence before the first segment counts as a pause but is never natural.
    if starts[0] > pause_threshold:
        total_pauses += 1

    # Silence after the last segment is natural only after a sentence end
    # (a trailing comma does not qualify here).
    if duration - ends[-1] > pause_threshold:
        total_pauses += 1
        if texts[-1] and texts[-1][-1] in ".!?":
            natural_pauses += 1

    npp = 100.0 if total_pauses == 0 else (natural_pauses / total_pauses) * 100.0
    pas = (0.4 * npp) + (0.6 * afw)

    return {
        "NPP": npp,
        "AFW": afw,
        "PAS": pas
    }
def calculate_rcs(y: np.ndarray, sr: int, segments: List[Dict], duration: float) -> Dict[str, float]:
    """
    Calculate the Rhythm Consistency Score (RCS) and its components.

    RCS = (0.5 * STR) + (0.5 * STW), where:
    - STR is derived from the standard deviation of the intervals between
      detected onsets: the deviation is clamped to [0.1, 0.5] and mapped
      linearly so that 0.1 gives 100 and 0.5 gives 0. With fewer than two
      onsets, STR is 100.
    - STW is the percentage of smooth transitions. A transition between
      segments is smooth when the gap is at most 0.3; every word-to-word
      transition inside a segment is counted as smooth.

    Args:
        y (np.ndarray): Audio signal.
        sr (int): Sampling rate.
        segments (List[Dict]): Segments with 'start', 'end' and 'text' keys.
        duration (float): Audio duration; only checked to be positive.

    Returns:
        Dict[str, float]: The keys "STR", "STW" and "RCS".

    Raises:
        ValueError: If the signal is empty, sr or duration is not positive,
            or segments is empty.
    """
    if y.size == 0 or sr <= 0 or duration <= 0 or not segments:
        raise ValueError("Audio signal, sampling rate, duration, and segments must be valid")

    hop = 256
    envelope = librosa.onset.onset_strength(y=y, sr=sr, hop_length=hop)
    onset_times = librosa.onset.onset_detect(onset_envelope=envelope, sr=sr, units='time', hop_length=hop)

    # STR: default to a perfect score when there are too few onsets to
    # measure any interval spread.
    str_score = 100.0
    if len(onset_times) > 1:
        interval_spread = np.std(np.diff(onset_times))
        interval_spread = min(max(interval_spread, 0.1), 0.5)
        str_score = 100.0 * (0.5 - interval_spread) / (0.5 - 0.1)
        str_score = max(0.0, min(100.0, str_score))

    # STW, part 1: transitions between consecutive segments.
    pause_threshold = 0.3
    gaps = [following["start"] - current["end"] for current, following in zip(segments, segments[1:])]
    smooth_between = sum(1 for gap in gaps if gap <= pause_threshold)

    # STW, part 2: word-to-word transitions inside a segment all count as smooth.
    word_counts = [len(segment["text"].strip().split()) for segment in segments]
    within_segments = sum(count - 1 for count in word_counts if count > 1)

    total_transitions = len(gaps) + within_segments
    smooth_transitions = smooth_between + within_segments

    if total_transitions == 0:
        stw = 100.0
    else:
        stw = (smooth_transitions / total_transitions) * 100.0

    rcs = (0.5 * str_score) + (0.5 * stw)
    return {
        "STR": str_score,
        "STW": stw,
        "RCS": rcs
    }
def calculate_vps(
    transcript: str,
    segments: List[Dict],
    filler_count: int,
    duration: float,
    wpm: float,
    long_pause_count: int,
    pitch_variation: float,
    y: np.ndarray,
    sr: int
) -> Dict[str, float]:
    """
    Calculate the Voice Pacing Score (VPS) together with its sub-scores.

    VPS is a weighted blend of three scores, clamped to 0-100:
        VPS = (0.5 * SRS) + (0.3 * PAS) + (0.2 * RCS)
    - SRS: Speech Rate Stability Score (from calc_srs)
    - PAS: Pause Appropriateness Score (from calculate_pas)
    - RCS: Rhythm Consistency Score (from calculate_rcs)

    Args:
        transcript (str): Transcribed text.
        segments (List[Dict]): Whisper model segments with 'start', 'end', 'text'.
        filler_count (int): Number of filler words.
        duration (float): Audio duration (seconds).
        wpm (float): Words per minute.
        long_pause_count (int): Number of long pauses (>1.0s).
        pitch_variation (float): Pitch variation in semitones.
        y (np.ndarray): Audio signal.
        sr (int): Sampling rate.

    Returns:
        Dict[str, float]: Scores for SRS, PAS, RCS, VPS, and the
        intermediates NPP, AFW, STR and STW.

    Raises:
        ValueError: If transcript or segments are empty, duration or sr is
            not positive, or the audio signal is empty.
    """
    # Reject unusable text/timing inputs first, then unusable audio inputs.
    if not transcript or not segments or duration <= 0:
        raise ValueError("Invalid inputs")
    if y.size == 0 or sr <= 0:
        raise ValueError("Invalid inputs")

    # Sub-scores are computed in the order SRS, PAS, RCS.
    srs = calc_srs(wpm, filler_count, long_pause_count, pitch_variation)
    pause_scores = calculate_pas(transcript, segments, filler_count, duration)
    rhythm_scores = calculate_rcs(y, sr, segments, duration)

    pas = pause_scores["PAS"]
    rcs = rhythm_scores["RCS"]

    # Weighted blend, clamped to the 0-100 range.
    blended = (0.5 * srs) + (0.3 * pas) + (0.2 * rcs)
    vps = max(0.0, min(100.0, blended))

    return {
        "SRS": srs,
        "PAS": pas,
        "NPP": pause_scores["NPP"],
        "AFW": pause_scores["AFW"],
        "RCS": rcs,
        "STR": rhythm_scores["STR"],
        "STW": rhythm_scores["STW"],
        "VPS": vps
    }