"""
Voice Clarity Score calculation module
"""

import librosa
import numpy as np
from typing import Dict, Any, List

def calculate_articulation(y: np.ndarray, sr: int) -> float:
    """
    Calculate articulation quality based on spectral contrast.
    
    Articulation refers to how clearly individual phonemes are produced.
    
    Args:
        y (np.ndarray): Audio signal
        sr (int): Sample rate
        
    Returns:
        float: Articulation score (0-100)
    """
    # Extract spectral contrast
    # Higher contrast between peaks and valleys in the spectrum generally correlates with clearer articulation
    S = np.abs(librosa.stft(y))
    contrast = librosa.feature.spectral_contrast(S=S, sr=sr)
    
    # Average across frequency bands and frames
    mean_contrast = np.mean(contrast)
    
    # Normalize to 0-100 scale (empirically determined range)
    # Typical values range from 10-50 dB
    min_contrast = 10
    max_contrast = 50
    normalized_contrast = min(100, max(0, (mean_contrast - min_contrast) / (max_contrast - min_contrast) * 100))
    
    return normalized_contrast

def calculate_enunciation(y: np.ndarray, sr: int) -> float:
    """
    Calculate enunciation quality based on formant clarity and spectral flatness.
    
    Enunciation is the precision in pronouncing vowels and consonants.
    
    Args:
        y (np.ndarray): Audio signal
        sr (int): Sample rate
        
    Returns:
        float: Enunciation score (0-100)
    """
    # Compute spectral flatness - lower values indicate clearer formants and better enunciation
    flatness = np.mean(librosa.feature.spectral_flatness(y=y))
    
    # Compute spectral centroid - related to "brightness" or articulation clarity
    centroid = np.mean(librosa.feature.spectral_centroid(y=y, sr=sr))
    
    # Normalize flatness (lower is better for speech) - range typically 0.01-0.5
    norm_flatness = max(0, min(100, (0.5 - flatness) / 0.5 * 100))
    
    # Normalize centroid (mid-range is better for clear speech) - typically 1000-4000 Hz for clear speech
    ideal_centroid = 2500  # Hz
    centroid_deviation = abs(centroid - ideal_centroid) / 2000  # Normalized by expected deviation
    norm_centroid = max(0, min(100, (1 - centroid_deviation) * 100))
    
    # Combine the two metrics (with more weight on flatness)
    enunciation_score = (0.7 * norm_flatness) + (0.3 * norm_centroid)
    
    return enunciation_score

def calculate_speech_pause_control(segments: List[Dict]) -> float:
    """
    Calculate how effectively pauses are integrated in speech.
    
    Speech pause control measures whether pauses occur naturally rather than abruptly.
    
    Args:
        segments (List[Dict]): List of transcript segments with timing information
        
    Returns:
        float: Speech pause control score (0-100)
    """
    if len(segments) < 2:
        return 100.0  # Not enough segments to evaluate pauses
    
    pause_durations = []
    for i in range(len(segments) - 1):
        pause_dur = segments[i + 1]["start"] - segments[i]["end"]
        if pause_dur > 0.05:  # Only consider actual pauses
            pause_durations.append(pause_dur)
    
    if not pause_durations:
        return 100.0  # No significant pauses detected
    
    # Calculate the standard deviation of pause durations
    # More consistent pauses indicate better control
    pause_std = np.std(pause_durations)
    
    # Calculate proportion of very long pauses (potentially awkward)
    long_pauses = sum(1 for d in pause_durations if d > 2.0)
    long_pause_ratio = long_pauses / len(pause_durations)
    
    # Normalize std dev (lower is better, but not too low)
    # Standard deviations of roughly 0.1-0.5 seconds score highest; very uniform pauses may sound robotic
    if pause_std < 0.1:
        std_score = 70  # Too consistent might sound robotic
    elif pause_std < 0.5:
        std_score = 100 - ((pause_std - 0.1) / 0.4 * 30)  # Scale 70-100
    else:
        std_score = max(0, 70 - ((pause_std - 0.5) / 2.0 * 70))  # Scale down from 70
    
    # Penalize for too many long pauses
    long_pause_penalty = long_pause_ratio * 50
    
    # Final score
    pause_control_score = max(0, min(100, std_score - long_pause_penalty))
    
    return pause_control_score

def calculate_voice_clarity_score(y: np.ndarray, sr: int, segments: List[Dict]) -> Dict[str, Any]:
    """
    Calculate the Voice Clarity Score (VCS) and its components.
    
    VCS reflects the clarity and intelligibility of speech.
    
    Args:
        y (np.ndarray): Audio signal
        sr (int): Sample rate
        segments (List[Dict]): List of transcript segments with timing information
        
    Returns:
        Dict[str, Any]: Dictionary with VCS and component scores
    """
    # Calculate component scores
    articulation_score = calculate_articulation(y, sr)
    enunciation_score = calculate_enunciation(y, sr)
    speech_pause_control_score = calculate_speech_pause_control(segments)
    
    # Calculate Voice Clarity Score using the formula from the paper
    vcs = (0.45 * articulation_score) + (0.35 * enunciation_score) + (0.2 * speech_pause_control_score)
    
    # Create result dictionary
    result = {
        "VCS": vcs,
        "components": {
            "articulation": articulation_score,
            "enunciation": enunciation_score,
            "speech_pause_control": speech_pause_control_score
        }
    }
    
    # Add interpretation
    result["insight"] = get_clarity_insight(vcs)
    
    return result

def get_clarity_insight(vcs: float) -> str:
    """
    Generate insight text based on the Voice Clarity Score.
    
    Args:
        vcs (float): Voice Clarity Score (0-100)
        
    Returns:
        str: Insight text explaining the score
    """
    if vcs >= 85:
        return "Excellent voice clarity with precise articulation and well-controlled pauses. Speech is highly intelligible and pleasant to listen to."
    elif vcs >= 70:
        return "Good voice clarity with clear pronunciation and generally appropriate pauses. Minor improvements could enhance overall clarity."
    elif vcs >= 50:
        return "Moderate voice clarity with some articulation issues or irregular pauses. Focus on clearer pronunciation and more natural pausing."
    elif vcs >= 30:
        return "Below average clarity with noticeable articulation problems or awkward pausing patterns. Consider speech exercises to improve clarity."
    else:
        return "Speech clarity needs significant improvement. Articulation is unclear and pausing patterns disrupt intelligibility. Speech therapy exercises may be beneficial."