"""
Voice Clarity Score calculation module
"""
|
|
|
import librosa |
|
import numpy as np |
|
from typing import Dict, Any, List |
|
import soundfile as sf |
|
|
|
def calculate_articulation(y: np.ndarray, sr: int) -> float:
    """
    Calculate articulation quality based on spectral contrast.

    Articulation refers to how clearly individual phonemes are produced.
    The score is the mean of the spectral-contrast feature over every band
    and frame, rescaled linearly from the fixed range [10, 50] onto
    [0, 100] and clipped to that interval.

    Args:
        y (np.ndarray): Audio signal
        sr (int): Sample rate

    Returns:
        float: Articulation score (0-100), always a built-in ``float``
    """
    # Magnitude spectrogram is the input to the spectral-contrast feature.
    magnitude = np.abs(librosa.stft(y))
    contrast = librosa.feature.spectral_contrast(S=magnitude, sr=sr)

    mean_contrast = np.mean(contrast)

    # Fixed bounds used to map the mean contrast onto the 0-100 scale.
    min_contrast = 10
    max_contrast = 50
    scaled = (mean_contrast - min_contrast) / (max_contrast - min_contrast) * 100

    # The clamp yields an int (0 or 100) when it clips and a NumPy scalar
    # otherwise; convert so callers always receive the annotated type.
    return float(min(100, max(0, scaled)))
|
|
|
def calculate_enunciation(y: np.ndarray, sr: int) -> float:
    """
    Calculate enunciation quality based on spectral flatness and spectral centroid.

    Enunciation is the precision in pronouncing vowels and consonants.
    Two sub-scores are blended:

    * 70% from the mean spectral flatness: lower flatness scores higher,
      and a mean flatness of 0.5 or more scores 0.
    * 30% from the mean spectral centroid: a centroid of 2500 scores 100,
      and a deviation of 2000 or more from that value scores 0.

    Args:
        y (np.ndarray): Audio signal
        sr (int): Sample rate

    Returns:
        float: Enunciation score (0-100), always a built-in ``float``
    """
    flatness = np.mean(librosa.feature.spectral_flatness(y=y))
    centroid = np.mean(librosa.feature.spectral_centroid(y=y, sr=sr))

    # Invert flatness so that a less noise-like spectrum scores higher.
    norm_flatness = max(0, min(100, (0.5 - flatness) / 0.5 * 100))

    # Score the centroid by its distance from the target value.
    ideal_centroid = 2500
    centroid_deviation = abs(centroid - ideal_centroid) / 2000
    norm_centroid = max(0, min(100, (1 - centroid_deviation) * 100))

    # The weighted sum may be a NumPy scalar; convert so callers always
    # receive the annotated built-in float.
    return float((0.7 * norm_flatness) + (0.3 * norm_centroid))
|
|
|
def calculate_speech_pause_control(segments: List[Dict]) -> float:
    """
    Calculate how effectively pauses are integrated in speech.

    Speech pause control refers to the natural vs. abrupt pauses in speech.
    A pause is the gap between one segment's ``"end"`` and the next
    segment's ``"start"``; gaps of 0.05 or less (including overlaps) are
    ignored. The score starts from the spread (standard deviation) of the
    pause durations and is reduced by the share of pauses longer than 2.0:

    * std below 0.1 -> base score 70
    * std from 0.1 up to 0.5 -> base score falls linearly from 100 to 70
    * std of 0.5 or more -> base score falls linearly from 70 to 0
      (reached at std 2.5)
    * penalty = (fraction of pauses longer than 2.0) * 50

    Args:
        segments (List[Dict]): List of transcript segments with timing
            information; each must provide ``"start"`` and ``"end"``
            (presumably in seconds - confirm against the caller)

    Returns:
        float: Speech pause control score (0-100), always a built-in
        ``float``. Returns 100.0 when there are fewer than two segments
        or no gap exceeds 0.05.
    """
    if len(segments) < 2:
        return 100.0

    # Pair every segment with its successor to measure the gap between them.
    gaps = (
        following["start"] - current["end"]
        for current, following in zip(segments, segments[1:])
    )
    pause_durations = [gap for gap in gaps if gap > 0.05]

    if not pause_durations:
        return 100.0

    pause_std = np.std(pause_durations)

    # pause_durations is non-empty here, so the division is always safe.
    long_pauses = sum(1 for duration in pause_durations if duration > 2.0)
    long_pause_ratio = long_pauses / len(pause_durations)

    if pause_std < 0.1:
        # Near-uniform pauses get a flat 70 rather than the maximum
        # (presumably to avoid rewarding mechanically even pausing).
        std_score = 70
    elif pause_std < 0.5:
        std_score = 100 - ((pause_std - 0.1) / 0.4 * 30)
    else:
        std_score = max(0, 70 - ((pause_std - 0.5) / 2.0 * 70))

    long_pause_penalty = long_pause_ratio * 50

    # The clamp can yield an int or a NumPy scalar; normalise to float.
    return float(max(0, min(100, std_score - long_pause_penalty)))
|
|
|
def calculate_voice_clarity_score(y: np.ndarray, sr: int, segments: List[Dict]) -> Dict[str, Any]:
    """
    Calculate the Voice Clarity Score (VCS) and its components.

    VCS reflects the clarity and intelligibility of speech. It is the
    weighted sum of three component scores: articulation (45%),
    enunciation (35%) and speech pause control (20%).

    Args:
        y (np.ndarray): Audio signal
        sr (int): Sample rate
        segments (List[Dict]): List of transcript segments with timing information

    Returns:
        Dict[str, Any]: Dictionary with keys ``"VCS"`` (overall score),
        ``"components"`` (the three component scores) and ``"insight"``
        (explanatory text for the overall score)
    """
    # Dict entries are evaluated top to bottom, so the component
    # calculations run in the order listed.
    components = {
        "articulation": calculate_articulation(y, sr),
        "enunciation": calculate_enunciation(y, sr),
        "speech_pause_control": calculate_speech_pause_control(segments),
    }

    vcs = (
        (0.45 * components["articulation"])
        + (0.35 * components["enunciation"])
        + (0.2 * components["speech_pause_control"])
    )

    return {
        "VCS": vcs,
        "components": components,
        "insight": get_clarity_insight(vcs),
    }
|
|
|
def get_clarity_insight(vcs: float) -> str:
    """
    Generate insight text based on the Voice Clarity Score.

    The score is matched against descending lower bounds (85, 70, 50, 30);
    the first bound the score reaches selects the message. Anything below
    30 receives the final fallback message.

    Args:
        vcs (float): Voice Clarity Score (0-100)

    Returns:
        str: Insight text explaining the score
    """
    # (inclusive lower bound, message) pairs, highest bound first.
    tiers = (
        (85, "Excellent voice clarity with precise articulation and well-controlled pauses. Speech is highly intelligible and pleasant to listen to."),
        (70, "Good voice clarity with clear pronunciation and generally appropriate pauses. Minor improvements could enhance overall clarity."),
        (50, "Moderate voice clarity with some articulation issues or irregular pauses. Focus on clearer pronunciation and more natural pausing."),
        (30, "Below average clarity with noticeable articulation problems or awkward pausing patterns. Consider speech exercises to improve clarity."),
    )

    for lower_bound, message in tiers:
        if vcs >= lower_bound:
            return message

    return "Speech clarity needs significant improvement. Articulation is unclear and pausing patterns disrupt intelligibility. Speech therapy exercises may be beneficial."