File size: 3,687 Bytes
8ad2ab3 aef3b1e 8ad2ab3 aef3b1e 8ad2ab3 aef3b1e 8ad2ab3 aef3b1e 8ad2ab3 aef3b1e 8ad2ab3 aef3b1e 8ad2ab3 aef3b1e 8ad2ab3 aef3b1e 8ad2ab3 aef3b1e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 |
"""
Compute fluency score from audio file using SRS and PAS calculations
"""
import librosa
import numpy as np
from typing import Dict, Any, Union
from .fluency import calc_srs, calculate_pas, calculate_fluency, get_fluency_insight
from filler_count.filler_score import analyze_fillers
from typing import Dict, Any
import numpy as np
import librosa
import pyworld
def _pitch_variation_semitones(f0: np.ndarray) -> float:
    """Return the spread of voiced pitch, in semitones, around its median.

    Frames with ``f0 <= 0`` are dropped as unvoiced. Of the remaining frames,
    only values strictly between the 5th and 95th percentiles are kept, and
    the standard deviation of their semitone offsets from the median is
    returned.

    Args:
        f0: Per-frame fundamental-frequency estimates.

    Returns:
        The standard deviation in semitones, or 0.0 when no frame survives
        the filtering.
    """
    voiced = f0[f0 > 0]
    # Guard *before* the percentile call: an empty array gives no usable
    # threshold (NaN or an error, depending on the NumPy version), which is
    # what happens for silent or fully unvoiced recordings.
    if voiced.size == 0:
        return 0.0

    low, high = np.percentile(voiced, [5, 95])
    trimmed = voiced[(voiced > low) & (voiced < high)]
    # The strict inequalities can discard everything (e.g. constant pitch).
    if trimmed.size == 0:
        return 0.0

    # Clamp the reference so the ratio below can never divide by zero.
    reference = max(float(np.median(trimmed)), 1e-6)
    return float(np.std(12 * np.log2(trimmed / reference)))


def _count_long_pauses(segments, duration: float, threshold: float = 1.0) -> int:
    """Count silences longer than ``threshold`` seconds.

    The gaps considered are: before the first segment, between each pair of
    consecutive segments, and between the last segment and ``duration``.

    Args:
        segments: Sequence of mappings with ``"start"`` and ``"end"`` keys.
        duration: Total audio length, in the same unit as the segment times.
        threshold: A gap must be strictly greater than this to be counted.

    Returns:
        Number of gaps exceeding ``threshold``; 0 for an empty sequence.
    """
    if not segments:
        return 0

    gaps = [segments[0]["start"]]
    gaps.extend(
        following["start"] - current["end"]
        for current, following in zip(segments, segments[1:])
    )
    gaps.append(duration - segments[-1]["end"])
    return sum(1 for gap in gaps if gap > threshold)


def compute_fluency_score(
    file_path: str,
    whisper_model,
    filler_count: Union[int, None] = None,
) -> Dict[str, Any]:
    """
    Compute fluency score and its components from a speech sample.

    Args:
        file_path (str): Path to the audio file.
        whisper_model: Transcription model. Its ``transcribe`` method is
            called with ``word_timestamps=False, fp16=False`` and must return
            a mapping with ``"text"`` and ``"segments"`` entries, where each
            segment has ``"start"`` and ``"end"`` values.
        filler_count: Number of filler words in the sample. When ``None``,
            it is obtained from ``analyze_fillers``.

    Returns:
        dict: A dictionary with the keys ``fluency_score``, ``SRS``, ``PAS``,
        ``pitch_variation``, ``filler_count``, ``long_pause_count``, ``WPM``
        and ``transcript``.

    Raises:
        ValueError: If the transcript or segment list is empty, or if the
            loaded audio has a non-positive duration.
    """
    # Transcribe audio
    transcription = whisper_model.transcribe(
        file_path, word_timestamps=False, fp16=False
    )
    transcript = transcription.get("text", "").strip()
    segments = transcription.get("segments", [])

    # Validate early, before any of the more expensive audio processing.
    if not transcript or not segments:
        raise ValueError("Empty transcript or segments from Whisper.")

    if filler_count is None:
        # Detect filler words only when the caller did not supply a count.
        filler_report = analyze_fillers(file_path, "base", transcript)
        filler_count = filler_report.get("total_fillers", 0)

    # Load audio at its native sampling rate.
    y, sr = librosa.load(file_path, sr=None)
    duration = len(y) / sr if sr else 0.0
    if duration <= 0:
        raise ValueError("Audio duration invalid or zero.")

    # Calculate pitch variation (in semitones) using pyworld. The samples are
    # converted to float64 once and shared by both pyworld calls.
    samples = y.astype(np.float64)
    coarse_f0, frame_times = pyworld.harvest(
        samples,
        sr,
        f0_floor=80.0,
        f0_ceil=400.0,
        frame_period=1000 * 256 / sr,  # 256 samples per frame, in milliseconds
    )
    f0 = pyworld.stonemask(samples, coarse_f0, frame_times, sr)
    pitch_variation = _pitch_variation_semitones(f0)

    # Analyze pauses (leading, trailing and between segments).
    long_pause_count = _count_long_pauses(segments, duration)

    # Calculate WPM; duration is known to be positive at this point.
    word_count = len(transcript.split())
    words_per_min = (word_count / duration) * 60.0

    # Calculate SRS - Speech Rate Stability
    srs_score = calc_srs(
        wpm=words_per_min,
        filler_count=filler_count,
        long_pause_count=long_pause_count,
        pitch_variation=pitch_variation,
    )

    # Calculate PAS - Pause Appropriateness Score
    pas_result = calculate_pas(
        transcript=transcript,
        segments=segments,
        filler_count=filler_count,
        duration=duration,
    )
    pas_score = pas_result["PAS"]

    # Calculate final fluency score
    fluency_result = calculate_fluency(srs=srs_score, pas=pas_score)
    fluency_score = fluency_result["score"]

    return {
        "fluency_score": fluency_score,
        "SRS": srs_score,
        "PAS": pas_score,
        "pitch_variation": pitch_variation,
        "filler_count": filler_count,
        "long_pause_count": long_pause_count,
        "WPM": words_per_min,
        "transcript": transcript,
    }
|