"""Voice Clarity Score calculation module."""

from typing import Any, Dict, List

import librosa
import numpy as np
import soundfile as sf  # noqa: F401  (retained; not referenced by the functions below)

# Range used to map the mean spectral contrast onto the 0-100 scale.
_MIN_CONTRAST_DB = 10
_MAX_CONTRAST_DB = 50

# Spectral flatness at or above this value scores 0 for the flatness component.
_MAX_FLATNESS = 0.5

# Centroid scoring: full marks at the ideal value, zero once the deviation
# reaches the tolerance.
_IDEAL_CENTROID_HZ = 2500
_CENTROID_TOLERANCE_HZ = 2000

# Gaps between segments shorter than this are not counted as pauses.
_MIN_PAUSE_SECONDS = 0.05
# Pauses longer than this count towards the long-pause penalty.
_LONG_PAUSE_SECONDS = 2.0

# Weights of the component scores in the overall Voice Clarity Score.
_ARTICULATION_WEIGHT = 0.45
_ENUNCIATION_WEIGHT = 0.35
_PAUSE_CONTROL_WEIGHT = 0.2


def _clamp_score(value: float) -> float:
    """Clamp ``value`` to the 0-100 range and return a built-in ``float``.

    NumPy scalars are converted so callers can serialise the scores without
    special handling. NaN is mapped to 0.0 so an undefined measurement can
    never be reported as a high score.
    """
    value = float(value)
    if np.isnan(value):
        return 0.0
    return min(100.0, max(0.0, value))


def calculate_articulation(y: np.ndarray, sr: int) -> float:
    """Calculate articulation quality based on spectral contrast.

    Articulation refers to how clearly individual phonemes are produced.
    The score is the mean spectral contrast (over all bands and frames) of
    the STFT magnitude, mapped linearly from the 10-50 range onto 0-100 and
    clamped.

    Args:
        y (np.ndarray): Audio signal
        sr (int): Sample rate

    Returns:
        float: Articulation score (0-100)
    """
    magnitude = np.abs(librosa.stft(y))
    contrast = librosa.feature.spectral_contrast(S=magnitude, sr=sr)
    mean_contrast = float(np.mean(contrast))

    contrast_span = _MAX_CONTRAST_DB - _MIN_CONTRAST_DB
    return _clamp_score((mean_contrast - _MIN_CONTRAST_DB) / contrast_span * 100)


def calculate_enunciation(y: np.ndarray, sr: int) -> float:
    """Calculate enunciation quality from spectral flatness and centroid.

    Enunciation is the precision in pronouncing vowels and consonants. Two
    sub-scores are combined with weights 0.7 and 0.3:

    * flatness: mean spectral flatness, where 0 scores 100 and 0.5 or more
      scores 0 (lower flatness is treated as better);
    * centroid: mean spectral centroid, where 2500 Hz scores 100 and a
      deviation of 2000 Hz or more scores 0.

    Args:
        y (np.ndarray): Audio signal
        sr (int): Sample rate

    Returns:
        float: Enunciation score (0-100)
    """
    flatness = float(np.mean(librosa.feature.spectral_flatness(y=y)))
    centroid = float(np.mean(librosa.feature.spectral_centroid(y=y, sr=sr)))

    norm_flatness = _clamp_score((_MAX_FLATNESS - flatness) / _MAX_FLATNESS * 100)

    centroid_deviation = abs(centroid - _IDEAL_CENTROID_HZ) / _CENTROID_TOLERANCE_HZ
    norm_centroid = _clamp_score((1 - centroid_deviation) * 100)

    # Flatness carries more weight than the centroid.
    return (0.7 * norm_flatness) + (0.3 * norm_centroid)


def calculate_speech_pause_control(segments: List[Dict[str, Any]]) -> float:
    """Calculate how effectively pauses are integrated in speech.

    A pause is the gap between one segment's ``"end"`` and the next
    segment's ``"start"``; only gaps longer than 0.05 s are counted. The
    score starts from the spread (standard deviation) of the pause lengths
    and is reduced by up to 50 points according to the share of pauses
    longer than 2 s.

    Args:
        segments (List[Dict]): Transcript segments, each with numeric
            ``"start"`` and ``"end"`` entries, in playback order.

    Returns:
        float: Speech pause control score (0-100). Returns 100.0 when there
        are fewer than two segments or no gap exceeds the pause threshold.
    """
    if len(segments) < 2:
        return 100.0  # Not enough segments to evaluate pauses

    gaps = (
        following["start"] - current["end"]
        for current, following in zip(segments, segments[1:])
    )
    pause_durations = [gap for gap in gaps if gap > _MIN_PAUSE_SECONDS]
    if not pause_durations:
        return 100.0  # No significant pauses detected

    pause_std = float(np.std(pause_durations))
    long_pause_count = sum(1 for d in pause_durations if d > _LONG_PAUSE_SECONDS)
    long_pause_ratio = long_pause_count / len(pause_durations)

    if pause_std < 0.1:
        # Near-identical pause lengths are capped: too consistent might
        # sound robotic.
        std_score = 70.0
    elif pause_std < 0.5:
        # Linear from 100 (std = 0.1) down to 70 (std = 0.5).
        std_score = 100 - ((pause_std - 0.1) / 0.4 * 30)
    else:
        # Linear from 70 (std = 0.5) down to 0 (std = 2.5 and beyond).
        std_score = max(0.0, 70 - ((pause_std - 0.5) / 2.0 * 70))

    long_pause_penalty = long_pause_ratio * 50
    return _clamp_score(std_score - long_pause_penalty)


def calculate_voice_clarity_score(
    y: np.ndarray, sr: int, segments: List[Dict[str, Any]]
) -> Dict[str, Any]:
    """Calculate the Voice Clarity Score (VCS) and its components.

    VCS is the weighted sum ``0.45 * articulation + 0.35 * enunciation +
    0.2 * speech_pause_control``. All scores in the result are built-in
    floats.

    Args:
        y (np.ndarray): Audio signal
        sr (int): Sample rate
        segments (List[Dict]): List of transcript segments with timing
            information

    Returns:
        Dict[str, Any]: Dictionary with keys ``"VCS"`` (overall score),
        ``"components"`` (the three component scores) and ``"insight"``
        (text interpretation of the overall score).
    """
    articulation_score = calculate_articulation(y, sr)
    enunciation_score = calculate_enunciation(y, sr)
    speech_pause_control_score = calculate_speech_pause_control(segments)

    vcs = float(
        (_ARTICULATION_WEIGHT * articulation_score)
        + (_ENUNCIATION_WEIGHT * enunciation_score)
        + (_PAUSE_CONTROL_WEIGHT * speech_pause_control_score)
    )

    return {
        "VCS": vcs,
        "components": {
            "articulation": articulation_score,
            "enunciation": enunciation_score,
            "speech_pause_control": speech_pause_control_score,
        },
        "insight": get_clarity_insight(vcs),
    }


def get_clarity_insight(vcs: float) -> str:
    """Generate insight text based on the Voice Clarity Score.

    Args:
        vcs (float): Voice Clarity Score (0-100)

    Returns:
        str: Insight text for the band the score falls in (thresholds at
        85, 70, 50 and 30; anything lower gets the final message).
    """
    if vcs >= 85:
        return (
            "Excellent voice clarity with precise articulation and well-controlled pauses. "
            "Speech is highly intelligible and pleasant to listen to."
        )
    if vcs >= 70:
        return (
            "Good voice clarity with clear pronunciation and generally appropriate pauses. "
            "Minor improvements could enhance overall clarity."
        )
    if vcs >= 50:
        return (
            "Moderate voice clarity with some articulation issues or irregular pauses. "
            "Focus on clearer pronunciation and more natural pausing."
        )
    if vcs >= 30:
        return (
            "Below average clarity with noticeable articulation problems or awkward pausing patterns. "
            "Consider speech exercises to improve clarity."
        )
    return (
        "Speech clarity needs significant improvement. "
        "Articulation is unclear and pausing patterns disrupt intelligibility. "
        "Speech therapy exercises may be beneficial."
    )