"""
Voice Clarity Score calculation module
"""

import librosa
import numpy as np
from typing import Dict, Any, List

def calculate_articulation(y: np.ndarray, sr: int) -> float:
    """
    Calculate articulation quality based on spectral contrast.
    
    Articulation refers to how clearly individual phonemes are produced.
    
    Args:
        y (np.ndarray): Audio signal
        sr (int): Sample rate
        
    Returns:
        float: Articulation score (0-100)
    """
    # Extract spectral contrast
    # Higher contrast between peaks and valleys in the spectrum generally correlates with clearer articulation
    S = np.abs(librosa.stft(y))
    contrast = librosa.feature.spectral_contrast(S=S, sr=sr)
    
    # Average across frequency bands and frames
    mean_contrast = np.mean(contrast)
    
    # Normalize to 0-100 scale (empirically determined range)
    # Typical values range from 10-50 dB
    min_contrast = 10
    max_contrast = 50
    normalized_contrast = min(100, max(0, (mean_contrast - min_contrast) / (max_contrast - min_contrast) * 100))
    
    return normalized_contrast

def calculate_enunciation(y: np.ndarray, sr: int) -> float:
    """
    Calculate enunciation quality based on formant clarity and spectral flatness.
    
    Enunciation is the precision in pronouncing vowels and consonants.
    
    Args:
        y (np.ndarray): Audio signal
        sr (int): Sample rate
        
    Returns:
        float: Enunciation score (0-100)
    """
    # Compute spectral flatness - lower values indicate clearer formants and better enunciation
    flatness = np.mean(librosa.feature.spectral_flatness(y=y))
    
    # Compute spectral centroid - related to "brightness" or articulation clarity
    centroid = np.mean(librosa.feature.spectral_centroid(y=y, sr=sr))
    
    # Normalize flatness (lower is better for speech) - range typically 0.01-0.5
    norm_flatness = max(0, min(100, (0.5 - flatness) / 0.5 * 100))
    
    # Normalize centroid (mid-range is better for clear speech) - typically 1000-4000 Hz for clear speech
    ideal_centroid = 2500  # Hz
    centroid_deviation = abs(centroid - ideal_centroid) / 2000  # Normalized by expected deviation
    norm_centroid = max(0, min(100, (1 - centroid_deviation) * 100))
    
    # Combine the two metrics (with more weight on flatness)
    enunciation_score = (0.7 * norm_flatness) + (0.3 * norm_centroid)
    
    return enunciation_score

def calculate_speech_pause_control(segments: List[Dict]) -> float:
    """
    Calculate how effectively pauses are integrated in speech.
    
    Speech pause control measures whether pauses occur naturally rather than abruptly.
    
    Args:
        segments (List[Dict]): List of transcript segments with timing information
        
    Returns:
        float: Speech pause control score (0-100)
    """
    if len(segments) < 2:
        return 100.0  # Not enough segments to evaluate pauses
    
    pause_durations = []
    for i in range(len(segments) - 1):
        pause_dur = segments[i + 1]["start"] - segments[i]["end"]
        if pause_dur > 0.05:  # Only consider actual pauses
            pause_durations.append(pause_dur)
    
    if not pause_durations:
        return 100.0  # No significant pauses detected
    
    # Calculate the standard deviation of pause durations
    # More consistent pauses indicate better control
    pause_std = np.std(pause_durations)
    
    # Calculate proportion of very long pauses (potentially awkward)
    long_pauses = sum(1 for d in pause_durations if d > 2.0)
    long_pause_ratio = long_pauses / len(pause_durations)
    
    # Normalize std dev (lower is better, but not too low)
    # Standard deviations of roughly 0.1-0.5 seconds score highest; very uniform pauses may sound robotic
    if pause_std < 0.1:
        std_score = 70  # Too consistent might sound robotic
    elif pause_std < 0.5:
        std_score = 100 - ((pause_std - 0.1) / 0.4 * 30)  # Scale 70-100
    else:
        std_score = max(0, 70 - ((pause_std - 0.5) / 2.0 * 70))  # Scale down from 70
    
    # Penalize for too many long pauses
    long_pause_penalty = long_pause_ratio * 50
    
    # Final score
    pause_control_score = max(0, min(100, std_score - long_pause_penalty))
    
    return pause_control_score

def calculate_voice_clarity_score(y: np.ndarray, sr: int, segments: List[Dict]) -> Dict[str, Any]:
    """
    Calculate the Voice Clarity Score (VCS) and its components.
    
    VCS reflects the clarity and intelligibility of speech.
    
    Args:
        y (np.ndarray): Audio signal
        sr (int): Sample rate
        segments (List[Dict]): List of transcript segments with timing information
        
    Returns:
        Dict[str, Any]: Dictionary with VCS and component scores
    """
    # Calculate component scores
    articulation_score = calculate_articulation(y, sr)
    enunciation_score = calculate_enunciation(y, sr)
    speech_pause_control_score = calculate_speech_pause_control(segments)
    
    # Calculate Voice Clarity Score using the formula from the paper
    vcs = (0.45 * articulation_score) + (0.35 * enunciation_score) + (0.2 * speech_pause_control_score)
    
    # Create result dictionary
    result = {
        "VCS": vcs,
        "components": {
            "articulation": articulation_score,
            "enunciation": enunciation_score,
            "speech_pause_control": speech_pause_control_score
        }
    }
    
    # Add interpretation
    result["insight"] = get_clarity_insight(vcs)
    
    return result

def get_clarity_insight(vcs: float) -> str:
    """
    Generate insight text based on the Voice Clarity Score.
    
    Args:
        vcs (float): Voice Clarity Score (0-100)
        
    Returns:
        str: Insight text explaining the score
    """
    if vcs >= 85:
        return "Excellent voice clarity with precise articulation and well-controlled pauses. Speech is highly intelligible and pleasant to listen to."
    elif vcs >= 70:
        return "Good voice clarity with clear pronunciation and generally appropriate pauses. Minor improvements could enhance overall clarity."
    elif vcs >= 50:
        return "Moderate voice clarity with some articulation issues or irregular pauses. Focus on clearer pronunciation and more natural pausing."
    elif vcs >= 30:
        return "Below average clarity with noticeable articulation problems or awkward pausing patterns. Consider speech exercises to improve clarity."
    else:
        return "Speech clarity needs significant improvement. Articulation is unclear and pausing patterns disrupt intelligibility. Speech therapy exercises may be beneficial."