File size: 2,824 Bytes
8ad2ab3
 
 
 
aef3b1e
 
 
8ad2ab3
 
 
 
 
 
 
 
 
 
 
 
 
aef3b1e
8ad2ab3
 
 
 
 
 
 
 
aef3b1e
 
8ad2ab3
 
 
 
 
 
 
aef3b1e
 
 
 
 
 
 
 
8ad2ab3
 
aef3b1e
8ad2ab3
 
aef3b1e
8ad2ab3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
from .vps import calculate_vps  # Your file where calc_srs, calculate_pas, calculate_rcs, calculate_vps live
import librosa
import numpy as np
import math

import pyworld
from filler_count.filler_score import analyze_fillers

def _pitch_variation_semitones(y, sr) -> float:
    """Return the standard deviation of pitch around its median, in semitones.

    F0 is estimated with ``pyworld.harvest`` (80-400 Hz search range, one frame
    every 256 samples) and refined with ``pyworld.stonemask``. Only frames with
    ``f0 > 0`` are kept, and values at or outside the 5th/95th percentiles are
    dropped before measuring the spread.

    Returns 0.0 when no frame survives the filtering.
    """
    # pyworld is fed float64 samples; convert once and reuse for both calls.
    samples = y.astype(np.float64)
    raw_f0, frame_times = pyworld.harvest(
        samples, sr, f0_floor=80.0, f0_ceil=400.0, frame_period=1000 * 256 / sr
    )
    f0 = pyworld.stonemask(samples, raw_f0, frame_times, sr)

    voiced_f0 = f0[f0 > 0]
    # Guard BEFORE taking percentiles: np.percentile on an empty array either
    # fails or yields NaN depending on the NumPy version.
    if voiced_f0.size == 0:
        return 0.0

    low, high = np.percentile(voiced_f0, [5, 95])
    trimmed_f0 = voiced_f0[(voiced_f0 > low) & (voiced_f0 < high)]
    # The strict comparisons can remove everything (e.g. a constant pitch).
    if trimmed_f0.size == 0:
        return 0.0

    # Floor the reference pitch so the ratio below can never divide by zero.
    median_f0 = max(np.median(trimmed_f0), 1e-6)
    semitone_offsets = 12 * np.log2(trimmed_f0 / median_f0)
    return float(np.std(semitone_offsets))


def _count_long_pauses(segments, duration: float, min_pause: float = 1.0) -> int:
    """Count silences strictly longer than ``min_pause`` seconds.

    Considers the gap between each pair of consecutive segments, the lead-in
    before the first segment, and the tail between the last segment's end and
    ``duration``.
    """
    if not segments:
        return 0
    gaps = [
        following["start"] - current["end"]
        for current, following in zip(segments, segments[1:])
    ]
    gaps.append(segments[0]["start"])
    gaps.append(duration - segments[-1]["end"])
    return sum(1 for gap in gaps if gap > min_pause)


def compute_vps_score(file_path: str, whisper_model) -> dict:
    """
    Compute VPS (Voice Pacing Score) and its components from a speech sample.

    The audio is transcribed, filler words are counted, and pitch variation,
    long pauses and words-per-minute are measured; everything is then handed to
    ``calculate_vps``, whose result is returned unchanged.

    Args:
        file_path (str): Path to the audio file.
        whisper_model: Transcription model. Its ``transcribe(file_path,
            word_timestamps=False, fp16=False)`` must return a dict with a
            ``"text"`` string and a ``"segments"`` list whose items carry
            ``"start"`` and ``"end"`` times.
            NOTE(review): this matches the openai-whisper result shape; confirm
            before passing a faster-whisper model.

    Returns:
        dict: The result of ``calculate_vps`` (per the original documentation:
        VPS, SRS, PAS, RCS, and component scores).

    Raises:
        ValueError: If the transcript or segment list is empty, or if the
            loaded audio has a zero or invalid duration.
    """
    # Transcribe
    transcription = whisper_model.transcribe(file_path, word_timestamps=False, fp16=False)
    transcript = transcription.get("text", "").strip()
    segments = transcription.get("segments", [])

    # Validate early, before the more expensive audio analysis.
    if not transcript or not segments:
        raise ValueError("Empty transcript or segments from Whisper.")

    # Filler count
    filler_result = analyze_fillers(file_path, 'base', transcript)
    filler_count = filler_result.get("filler_count", 0)

    # Load audio at its native sampling rate (sr=None disables resampling).
    y, sr = librosa.load(file_path, sr=None)
    duration = len(y) / sr if sr else 0.0
    if duration <= 0:
        raise ValueError("Audio duration invalid or zero.")

    # Pitch variation (in semitones) using pyworld
    pitch_variation = _pitch_variation_semitones(y, sr)

    # Pause analysis: gaps longer than one second, including lead-in and tail.
    long_pause_count = _count_long_pauses(segments, duration)

    # WPM (duration is guaranteed positive by the check above)
    word_count = len(transcript.split())
    words_per_min = (word_count / duration) * 60.0

    # Calculate VPS and components
    return calculate_vps(
        transcript=transcript,
        segments=segments,
        filler_count=filler_count,
        duration=duration,
        wpm=words_per_min,
        long_pause_count=long_pause_count,
        pitch_variation=pitch_variation,
        y=y,
        sr=sr,
    )