Spaces:

cruvss
/

Fast_api

Running

File size: 6,031 Bytes

8ad2ab3

from typing import List, Dict
import librosa
import numpy as np
import spacy
import math
from .filler_analyzer import detect_fillers

def calc_srs(wpm: float, filler_count: int, long_pause_count: int, pitch_variation: float) -> float:
    """
    Speech Rate Stability (SRS): a 0-100 score for how steady the delivery is.

    The result blends two parts:
    - pace consistency (45%): how close ``wpm`` is to a 150 WPM target, with
      the distance capped at 30 before it is converted to a score;
    - stability (55%): the inverse of the mean of three penalties (fillers,
      long pauses, pitch variation), each of which saturates at 1.0.

    Args:
        wpm: Speaking rate in words per minute.
        filler_count: Number of filler words; 10 or more gives the full penalty.
        long_pause_count: Number of long pauses; 5 or more gives the full penalty.
        pitch_variation: Pitch variation; 3.0 or more gives the full penalty.

    Returns:
        The blended score, clamped to the range 0-100.
    """
    # Stability part: each penalty is a fraction capped at 1.0.
    filler_part = min(filler_count / 10, 1.0)
    pause_part = min(long_pause_count / 5, 1.0)
    pitch_part = min(pitch_variation / 3.0, 1.0)
    mean_penalty = (filler_part + pause_part + pitch_part) / 3
    steadiness = (1 - mean_penalty) * 100

    # Pace part: distance from the target rate, saturating at 30 WPM.
    target_wpm = 150
    capped_gap = min(30, abs(wpm - target_wpm))
    pace_score = max(0, 100 - (capped_gap * 1.67))

    blended = (0.45 * pace_score) + (0.55 * steadiness)
    return min(100, max(0, blended))

def calculate_pas(transcript: str, segments: List[Dict], filler_count: int, duration: float) -> Dict[str, float]:
    """
    Calculate the Pause Appropriateness Score (PAS) and its components.

    Two components are computed and blended as ``PAS = 0.4 * NPP + 0.6 * AFW``:

    - NPP: percentage of pauses (silences longer than 0.5 s) that count as
      natural. A pause between two segments is natural when the preceding
      segment's text ends in one of ``. ! ? ,``. Trailing silence after the
      last segment is natural when that segment ends in ``. ! ?``. Leading
      silence before the first segment is counted as a pause but is never
      counted as natural. With no pauses at all, NPP is 100.
    - AFW: filler-word score. It is 100 when the filler rate (fillers per
      word) is zero or negative, 0 when the rate is 10% or more, and falls
      linearly in between.

    Args:
        transcript: Transcribed text; words are counted by whitespace splitting.
        segments: Segment dicts providing 'start', 'end' and 'text' keys.
        filler_count: Number of filler words in the transcript.
        duration: Total audio duration, in the same unit as the segment times.

    Returns:
        Dict with the keys "NPP", "AFW" and "PAS".

    Raises:
        ValueError: If the transcript or segments are empty, the duration is
            not positive, or the transcript contains no words.
    """
    if not transcript or not segments or duration <= 0:
        raise ValueError("Transcript, segments, and duration must be valid")

    # A spaCy pipeline used to be loaded and run over the transcript here on
    # every call, but its output was never read; that dead (and expensive)
    # work has been removed.
    total_words = len(transcript.split())
    if total_words == 0:
        raise ValueError("No words found in transcript")

    # total_words is guaranteed non-zero at this point.
    filler_rate = filler_count / total_words
    if filler_rate >= 0.10:
        afw = 0.0
    elif filler_rate <= 0.0:
        afw = 100.0
    else:
        afw = 100.0 - (filler_rate * 1000)
    afw = max(0.0, min(100.0, afw))

    min_pause = 0.5
    between_marks = (".", "!", "?", ",")
    final_marks = (".", "!", "?")

    # Read every field up front so a malformed segment fails immediately.
    texts = [seg["text"].strip() for seg in segments]
    starts = [seg["start"] for seg in segments]
    ends = [seg["end"] for seg in segments]

    total_pauses = 0
    natural_pauses = 0

    # Silences between consecutive segments.
    for text, seg_end, next_start in zip(texts, ends, starts[1:]):
        if next_start - seg_end > min_pause:
            total_pauses += 1
            if text.endswith(between_marks):
                natural_pauses += 1

    # Leading silence counts as a pause but never as a natural one.
    if starts[0] > min_pause:
        total_pauses += 1

    # Trailing silence is natural only after sentence-final punctuation.
    if duration - ends[-1] > min_pause:
        total_pauses += 1
        if texts[-1].endswith(final_marks):
            natural_pauses += 1

    npp = 100.0 if total_pauses == 0 else (natural_pauses / total_pauses) * 100.0
    pas = (0.4 * npp) + (0.6 * afw)

    return {
        "NPP": npp,
        "AFW": afw,
        "PAS": pas
    }

def calculate_rcs(y: np.ndarray, sr: int, segments: List[Dict], duration: float) -> Dict[str, float]:
    """
    Calculate the Rhythm Consistency Score (RCS) and its components.

    - STR: derived from the standard deviation of the intervals between
      detected onsets. The deviation is clamped to [0.1, 0.5] and mapped
      linearly so 0.1 gives 100 and 0.5 gives 0. With fewer than two onsets
      the score is 100.
    - STW: percentage of transitions that are smooth. A boundary between two
      consecutive segments is smooth when the gap is at most 0.3; every
      word-to-word step inside a segment is counted as smooth.
    - RCS: the plain average of STR and STW.

    Args:
        y: Audio signal.
        sr: Sampling rate.
        segments: Segment dicts providing 'start', 'end' and 'text' keys.
        duration: Audio duration; only used for input validation here.

    Returns:
        Dict with the keys "STR", "STW" and "RCS".

    Raises:
        ValueError: If the signal is empty, the sampling rate or duration is
            not positive, or there are no segments.
    """
    signal_is_empty = y.size == 0
    if signal_is_empty or sr <= 0 or duration <= 0 or not segments:
        raise ValueError("Audio signal, sampling rate, duration, and segments must be valid")

    hop = 256
    envelope = librosa.onset.onset_strength(y=y, sr=sr, hop_length=hop)
    onset_times = librosa.onset.onset_detect(
        onset_envelope=envelope, sr=sr, units='time', hop_length=hop
    )

    # Fewer than two onsets means there is no interval to measure.
    str_score = 100.0
    if len(onset_times) > 1:
        spread = np.std(np.diff(onset_times))
        spread = min(max(spread, 0.1), 0.5)
        str_score = 100.0 * (0.5 - spread) / (0.5 - 0.1)
        str_score = max(0.0, min(100.0, str_score))

    pause_threshold = 0.3

    # Segment-to-segment boundaries: smooth when the gap is short enough.
    boundary_gaps = [
        following["start"] - current["end"]
        for current, following in zip(segments, segments[1:])
    ]
    total_transitions = len(boundary_gaps)
    smooth_transitions = sum(1 for gap in boundary_gaps if gap <= pause_threshold)

    # Word-to-word steps inside a segment always count as smooth.
    inner_steps = sum(
        max(len(seg["text"].strip().split()) - 1, 0) for seg in segments
    )
    total_transitions += inner_steps
    smooth_transitions += inner_steps

    if total_transitions == 0:
        stw = 100.0
    else:
        stw = (smooth_transitions / total_transitions) * 100.0

    rcs = (0.5 * str_score) + (0.5 * stw)

    return {"STR": str_score, "STW": stw, "RCS": rcs}

def calculate_vps(
    transcript: str,
    segments: List[Dict],
    filler_count: int,
    duration: float,
    wpm: float,
    long_pause_count: int,
    pitch_variation: float,
    y: np.ndarray,
    sr: int
) -> Dict[str, float]:
    """
    Calculate the Voice Pacing Score (VPS) from its three sub-scores.

    VPS = (0.5 * SRS) + (0.3 * PAS) + (0.2 * RCS), clamped to 0-100, where:
    - SRS is the Speech Rate Stability Score,
    - PAS is the Pause Appropriateness Score,
    - RCS is the Rhythm Consistency Score.

    Args:
        transcript (str): Transcribed text.
        segments (List[Dict]): Whisper model segments with 'start', 'end', 'text'.
        filler_count (int): Number of filler words.
        duration (float): Audio duration (seconds).
        wpm (float): Words per minute.
        long_pause_count (int): Number of long pauses (>1.0s).
        pitch_variation (float): Pitch variation in semitones.
        y (np.ndarray): Audio signal.
        sr (int): Sampling rate.

    Returns:
        Dict[str, float]: The keys "SRS", "PAS", "NPP", "AFW", "RCS", "STR",
        "STW" and "VPS".

    Raises:
        ValueError: If the transcript or segments are empty, the duration or
            sampling rate is not positive, or the audio signal is empty.
    """
    # Reject unusable inputs before any scoring work starts.
    has_text = bool(transcript) and bool(segments)
    if not has_text or duration <= 0 or y.size == 0 or sr <= 0:
        raise ValueError("Invalid inputs")

    # Speech rate stability.
    srs = calc_srs(
        wpm=wpm,
        filler_count=filler_count,
        long_pause_count=long_pause_count,
        pitch_variation=pitch_variation,
    )

    # Pause appropriateness and its intermediates.
    pas_parts = calculate_pas(
        transcript=transcript,
        segments=segments,
        filler_count=filler_count,
        duration=duration,
    )
    pas, npp, afw = pas_parts["PAS"], pas_parts["NPP"], pas_parts["AFW"]

    # Rhythm consistency and its intermediates.
    rcs_parts = calculate_rcs(y=y, sr=sr, segments=segments, duration=duration)
    rcs, str_score, stw = rcs_parts["RCS"], rcs_parts["STR"], rcs_parts["STW"]

    # Weighted blend, clamped to the 0-100 scale.
    weighted = (0.5 * srs) + (0.3 * pas) + (0.2 * rcs)
    vps = max(0.0, min(100.0, weighted))

    scores = {"SRS": srs}
    scores.update({"PAS": pas, "NPP": npp, "AFW": afw})
    scores.update({"RCS": rcs, "STR": str_score, "STW": stw})
    scores["VPS"] = vps
    return scores