# Fast_api/vcs/vcs.py
# (HuggingFace upload metadata: uploaded by mulasagg, "Add application file", commit 8ad2ab3)
"""
Voice Clarity Score calculation module
"""
import librosa
import numpy as np
from typing import Dict, Any, List
import soundfile as sf
def calculate_articulation(y: np.ndarray, sr: int) -> float:
    """
    Score articulation quality (0-100) from spectral contrast.

    Articulation refers to how clearly individual phonemes are produced; a
    larger peak-to-valley contrast in the spectrum generally correlates with
    crisper articulation.

    Args:
        y (np.ndarray): Audio signal
        sr (int): Sample rate

    Returns:
        float: Articulation score (0-100)
    """
    # Spectral contrast computed on the STFT magnitude spectrogram.
    magnitude = np.abs(librosa.stft(y))
    contrast = librosa.feature.spectral_contrast(S=magnitude, sr=sr)

    # Collapse across both frequency bands and time frames.
    avg_contrast = np.mean(contrast)

    # Map the empirically observed 10-50 dB range linearly onto 0-100,
    # clamping anything outside it.
    low_db, high_db = 10, 50
    return min(100, max(0, (avg_contrast - low_db) / (high_db - low_db) * 100))
def calculate_enunciation(y: np.ndarray, sr: int) -> float:
    """
    Score enunciation quality (0-100) from formant clarity and spectral flatness.

    Enunciation is the precision in pronouncing vowels and consonants.

    Args:
        y (np.ndarray): Audio signal
        sr (int): Sample rate

    Returns:
        float: Enunciation score (0-100)
    """
    # Spectral flatness: lower values indicate clearer formant structure
    # and therefore better enunciation.
    flatness = np.mean(librosa.feature.spectral_flatness(y=y))

    # Spectral centroid: a proxy for "brightness" / articulation clarity.
    centroid = np.mean(librosa.feature.spectral_centroid(y=y, sr=sr))

    # Flatness for speech typically falls in 0.01-0.5; invert so that lower
    # flatness scores higher, then clamp to [0, 100].
    flatness_score = max(0, min(100, (0.5 - flatness) / 0.5 * 100))

    # Penalize distance from an ideal centroid (~2500 Hz for clear speech),
    # normalized by an expected deviation of 2000 Hz, clamped to [0, 100].
    ideal_hz = 2500
    deviation = abs(centroid - ideal_hz) / 2000
    centroid_score = max(0, min(100, (1 - deviation) * 100))

    # Weighted blend, emphasizing flatness.
    return (0.7 * flatness_score) + (0.3 * centroid_score)
def calculate_speech_pause_control(segments: List[Dict]) -> float:
    """
    Calculate how effectively pauses are integrated in speech.

    Speech pause control refers to the natural vs. abrupt pauses in speech:
    consistent, moderately varied gaps score well, while highly variable gaps
    or many very long pauses are penalized.

    Args:
        segments (List[Dict]): Transcript segments, each with "start" and
            "end" times in seconds, assumed ordered chronologically.

    Returns:
        float: Speech pause control score (0-100).
    """
    if len(segments) < 2:
        return 100.0  # Not enough segments to evaluate pauses

    # Gaps between consecutive segments; ignore sub-50 ms gaps, which are
    # timing jitter rather than real pauses.
    pause_durations = [
        nxt["start"] - cur["end"]
        for cur, nxt in zip(segments, segments[1:])
        if nxt["start"] - cur["end"] > 0.05
    ]
    if not pause_durations:
        return 100.0  # No significant pauses detected

    # Consistency of pause lengths: more uniform pauses indicate better control.
    pause_std = np.std(pause_durations)

    # Proportion of very long (> 2 s) pauses, which tend to sound awkward.
    # (pause_durations is guaranteed non-empty here, so plain division is safe —
    # the original's trailing "if pause_durations else 0" guard was dead code.)
    long_pause_ratio = sum(1 for d in pause_durations if d > 2.0) / len(pause_durations)

    # Map std dev to a score. The ideal spread is roughly 0.1-0.5 s: perfectly
    # uniform pauses sound robotic, while a large spread sounds erratic.
    if pause_std < 0.1:
        std_score = 70  # Too consistent might sound robotic
    elif pause_std < 0.5:
        std_score = 100 - ((pause_std - 0.1) / 0.4 * 30)  # Scale 70-100
    else:
        std_score = max(0, 70 - ((pause_std - 0.5) / 2.0 * 70))  # Scale down from 70

    # Each long pause costs up to 50 points in aggregate.
    long_pause_penalty = long_pause_ratio * 50

    # Clamp and normalize to a plain Python float for a consistent return type.
    return float(max(0, min(100, std_score - long_pause_penalty)))
def calculate_voice_clarity_score(y: np.ndarray, sr: int, segments: List[Dict]) -> Dict[str, Any]:
    """
    Compute the Voice Clarity Score (VCS) and its component metrics.

    VCS reflects the clarity and intelligibility of speech, blending
    articulation, enunciation, and speech pause control.

    Args:
        y (np.ndarray): Audio signal
        sr (int): Sample rate
        segments (List[Dict]): Transcript segments with timing information

    Returns:
        Dict[str, Any]: Dictionary with the overall "VCS", its "components"
        sub-scores, and a human-readable "insight" string.
    """
    articulation = calculate_articulation(y, sr)
    enunciation = calculate_enunciation(y, sr)
    pause_control = calculate_speech_pause_control(segments)

    # Weighted combination — weights taken from the reference paper.
    vcs = (0.45 * articulation) + (0.35 * enunciation) + (0.2 * pause_control)

    return {
        "VCS": vcs,
        "components": {
            "articulation": articulation,
            "enunciation": enunciation,
            "speech_pause_control": pause_control,
        },
        "insight": get_clarity_insight(vcs),
    }
def get_clarity_insight(vcs: float) -> str:
    """
    Generate insight text based on the Voice Clarity Score.

    Args:
        vcs (float): Voice Clarity Score (0-100)

    Returns:
        str: Insight text explaining the score
    """
    # Tiers are checked from highest threshold down; the first one met wins.
    tiers = (
        (85, "Excellent voice clarity with precise articulation and well-controlled pauses. Speech is highly intelligible and pleasant to listen to."),
        (70, "Good voice clarity with clear pronunciation and generally appropriate pauses. Minor improvements could enhance overall clarity."),
        (50, "Moderate voice clarity with some articulation issues or irregular pauses. Focus on clearer pronunciation and more natural pausing."),
        (30, "Below average clarity with noticeable articulation problems or awkward pausing patterns. Consider speech exercises to improve clarity."),
    )
    for threshold, message in tiers:
        if vcs >= threshold:
            return message
    return "Speech clarity needs significant improvement. Articulation is unclear and pausing patterns disrupt intelligibility. Speech therapy exercises may be beneficial."