# File size: 6,643 Bytes | 8ad2ab3
"""
Voice Clarity Score calculation module
"""
import librosa
import numpy as np
from typing import Dict, Any, List
import soundfile as sf
def calculate_articulation(y: np.ndarray, sr: int) -> float:
    """
    Score articulation from the mean spectral contrast of the signal.

    Articulation describes how distinctly individual phonemes are produced.
    A larger gap between spectral peaks and valleys is used here as a proxy
    for clearer articulation.

    Args:
        y (np.ndarray): Audio signal
        sr (int): Sample rate

    Returns:
        float: Articulation score (0-100)
    """
    # Empirically chosen dB window that is mapped linearly onto 0-100.
    contrast_floor = 10
    contrast_ceiling = 50

    # Magnitude spectrogram -> spectral contrast -> one average over all
    # frequency bands and frames.
    magnitude = np.abs(librosa.stft(y))
    band_contrast = librosa.feature.spectral_contrast(S=magnitude, sr=sr)
    average_contrast = np.mean(band_contrast)

    # Linear rescale, then clamp: lower bound first, upper bound second.
    scaled = (average_contrast - contrast_floor) / (contrast_ceiling - contrast_floor) * 100
    lower_bounded = max(0, scaled)
    return min(100, lower_bounded)
def calculate_enunciation(y: np.ndarray, sr: int) -> float:
    """
    Calculate enunciation quality based on spectral flatness and spectral centroid.

    Enunciation is the precision in pronouncing vowels and consonants. Two
    frame-averaged features are combined:

    * spectral flatness - lower values are treated as clearer formants;
    * spectral centroid - values near 2500 Hz are treated as ideal.

    Args:
        y (np.ndarray): Audio signal
        sr (int): Sample rate

    Returns:
        float: Enunciation score (0-100), always a built-in ``float``
    """
    # Reduce each feature to a built-in float straight away. np.mean returns a
    # NumPy scalar whose dtype follows the feature array (presumably float32
    # when the audio was loaded as float32 - confirm against the caller), and
    # such scalars can leak into the result and are not accepted by the
    # standard json encoder.
    flatness = float(np.mean(librosa.feature.spectral_flatness(y=y)))
    centroid = float(np.mean(librosa.feature.spectral_centroid(y=y, sr=sr)))

    # Flatness: lower is better for speech; 0.5 or above maps to 0, 0 maps to 100.
    norm_flatness = max(0.0, min(100.0, (0.5 - flatness) / 0.5 * 100))

    # Centroid: score falls off linearly with distance from the ideal value and
    # reaches 0 once the centroid is 2000 Hz or more away from it.
    ideal_centroid = 2500  # Hz
    centroid_deviation = abs(centroid - ideal_centroid) / 2000
    norm_centroid = max(0.0, min(100.0, (1 - centroid_deviation) * 100))

    # Weighted blend, with more weight on flatness.
    return (0.7 * norm_flatness) + (0.3 * norm_centroid)
def calculate_speech_pause_control(segments: List[Dict]) -> float:
    """
    Calculate how effectively pauses are integrated in speech.

    A pause is the gap between one segment's ``"end"`` and the next segment's
    ``"start"``; only gaps longer than 0.05 are counted. The score combines
    the spread (standard deviation) of the pause durations with a penalty for
    the share of pauses longer than 2.0.

    Args:
        segments (List[Dict]): Transcript segments in order; each must provide
            numeric ``"start"`` and ``"end"`` entries.

    Returns:
        float: Speech pause control score (0-100), always a built-in ``float``.
            100.0 is returned when there are fewer than two segments or no
            gap exceeds the 0.05 threshold.

    Raises:
        KeyError: If a segment lacks ``"start"`` or ``"end"``.
    """
    if len(segments) < 2:
        return 100.0  # Not enough segments to evaluate pauses

    # Pair each segment with its successor and measure the gap between them.
    gaps = [following["start"] - current["end"] for current, following in zip(segments, segments[1:])]
    pause_durations = [gap for gap in gaps if gap > 0.05]  # Only consider actual pauses
    if not pause_durations:
        return 100.0  # No significant pauses detected

    # Convert to a built-in float so every later value (and the return) is one.
    pause_std = float(np.std(pause_durations))

    # Share of very long pauses (potentially awkward).
    long_pause_ratio = sum(1 for duration in pause_durations if duration > 2.0) / len(pause_durations)

    if pause_std < 0.1:
        std_score = 70.0  # Too consistent might sound robotic
    elif pause_std < 0.5:
        std_score = 100.0 - ((pause_std - 0.1) / 0.4 * 30)  # 100 at 0.1 down to 70 at 0.5
    else:
        std_score = max(0.0, 70.0 - ((pause_std - 0.5) / 2.0 * 70))  # Scale down from 70

    # Up to 50 points are removed when every pause is a long one.
    long_pause_penalty = long_pause_ratio * 50

    # Float bounds keep the clamped result a float even when a bound is hit.
    return max(0.0, min(100.0, std_score - long_pause_penalty))
def calculate_voice_clarity_score(y: np.ndarray, sr: int, segments: List[Dict]) -> Dict[str, Any]:
    """
    Compute the Voice Clarity Score (VCS) together with its component scores.

    The VCS is a weighted sum of three components: articulation (0.45),
    enunciation (0.35) and speech pause control (0.2).

    Args:
        y (np.ndarray): Audio signal
        sr (int): Sample rate
        segments (List[Dict]): List of transcript segments with timing information

    Returns:
        Dict[str, Any]: Dictionary with the keys "VCS" (overall score),
            "components" (the three component scores) and "insight"
            (a text interpretation of the overall score)
    """
    # Component scores, evaluated in the order they are listed.
    components = {
        "articulation": calculate_articulation(y, sr),
        "enunciation": calculate_enunciation(y, sr),
        "speech_pause_control": calculate_speech_pause_control(segments),
    }

    # Weighted combination of the three components.
    vcs = (
        (0.45 * components["articulation"])
        + (0.35 * components["enunciation"])
        + (0.2 * components["speech_pause_control"])
    )

    return {
        "VCS": vcs,
        "components": components,
        "insight": get_clarity_insight(vcs),
    }
def get_clarity_insight(vcs: float) -> str:
    """
    Generate insight text based on the Voice Clarity Score.

    The score is matched against descending lower bounds (85, 70, 50, 30);
    the message of the first bound that the score reaches is returned, and a
    score below every bound gets the lowest-tier message.

    Args:
        vcs (float): Voice Clarity Score (0-100)

    Returns:
        str: Insight text explaining the score
    """
    # (lower bound, message) pairs, highest tier first so the first match wins.
    tiers = (
        (85, "Excellent voice clarity with precise articulation and well-controlled pauses. Speech is highly intelligible and pleasant to listen to."),
        (70, "Good voice clarity with clear pronunciation and generally appropriate pauses. Minor improvements could enhance overall clarity."),
        (50, "Moderate voice clarity with some articulation issues or irregular pauses. Focus on clearer pronunciation and more natural pausing."),
        (30, "Below average clarity with noticeable articulation problems or awkward pausing patterns. Consider speech exercises to improve clarity."),
    )
    for lower_bound, message in tiers:
        if vcs >= lower_bound:
            return message
    return "Speech clarity needs significant improvement. Articulation is unclear and pausing patterns disrupt intelligibility. Speech therapy exercises may be beneficial."