# Spaces: cruvss / Fast_api (Running)
# File size: 12,046 Bytes
# 8ad2ab3


import scipy.signal
import numpy as np
import librosa
import pyworld as pw

# def compute_pitch_variation(file_path):
#     # Step 1: Load audio
#     y, sr = librosa.load(file_path, sr=None)
#     y = y.astype(np.float64)  # pyworld expects float64

#     # Step 2: Extract pitch (F0)
#     _f0, t = pw.dio(y, sr)              # Fast initial pitch estimation
#     f0 = pw.stonemask(y, _f0, t, sr)    # Refinement step

#     # Step 3: Filter voiced frames
#     voiced_f0 = f0[f0 > 0]

#     # Handle empty case
#     if voiced_f0.size == 0:
#         return {
#             "pitch_mean": 0.0,
#             "pitch_std": 0.0,
#             "pitch_range": 0.0,
#             "semitone_std": 0.0,
#             "pitch_variation_score": 0.0
#         }

#     # Step 4: Basic statistics
#     pitch_mean = np.mean(voiced_f0)
#     pitch_std = np.std(voiced_f0)
#     pitch_range = np.max(voiced_f0) - np.min(voiced_f0)

#     print(pitch_mean)
#     print(f'voiced_f0: {voiced_f0}')
#     # Step 5: Compute semitone-based variation (better for human perception)
#     median_f0 = np.median(voiced_f0)
#     if median_f0 <= 0:
#         median_f0 = 1e-6  # Avoid division by zero
        
#     semitone_diffs = 12 * np.log2(voiced_f0 / median_f0)
#     semitone_std = np.std(semitone_diffs)
#     print(semitone_std)

#     # Step 6: Scale semitone_std to a 0–100 score (tunable)
#     # For example: semitone_std of 0 → 0 score, ≥6 semitones → 100 score
#     pitch_variation_score = np.clip((semitone_std / 6.0) * 100, 0, 100)

#     return {
#         "pitch_mean": pitch_mean,
#         "pitch_std": pitch_std,
#         "pitch_range": pitch_range,
#         "semitone_std": semitone_std,
#         "pitch_variation_score": pitch_variation_score
#     }
# def compute_intonation_range(file_path):
#     # Step 1: Load and prepare audio
#     y, sr = librosa.load(file_path, sr=None)
#     y = y.astype(np.float64)

#     # Step 2: Extract F0
#     _f0, t = pw.dio(y, sr)
#     f0 = pw.stonemask(y, _f0, t, sr)
    
   

#     # Step 3: Filter voiced frames
#     voiced_f0 = f0[f0 > 0]
#     if voiced_f0.size == 0:
#         return 0.0
    
#     voiced_f0 = voiced_f0[(voiced_f0 > np.percentile(voiced_f0, 5)) & 
#                       (voiced_f0 < np.percentile(voiced_f0, 95))]

#     # Step 4: Compute intonation range (in semitones)
#     f0_min = np.min(voiced_f0)
#     f0_max = np.max(voiced_f0)
#     if f0_min <= 0:
#         f0_min = 1e-6  # to avoid log error
#     intonation_range = 12 * np.log2(f0_max / f0_min)
    
#     # range into scores:
    
#     max_range = 12.0
#     normalized = min(intonation_range, max_range) / max_range
#     score = normalized * 100
#     return round(score, 2), intonation_range



# def compute_pitch_variation(file_path):
#     # Step 1: Load audio
#     y, sr = librosa.load(file_path, sr=None)

#     # Step 2: Extract pitch using librosa.pyin (YIN-based)
#     f0, voiced_flags, voiced_probs = librosa.pyin(
#         y,
#         sr=sr,
#         fmin=80,
#         fmax=400,
#         frame_length=1105,
#         hop_length=256,
#         fill_na=np.nan
#     )

#     # Step 3: Filter voiced frames
#     voiced_f0 = f0[~np.isnan(f0)]
 
 
#     voiced_f0 = voiced_f0[
#         (voiced_f0 > np.percentile(voiced_f0, 5)) &
#         (voiced_f0 < np.percentile(voiced_f0, 95))
#     ]

#     # Handle empty case
#     if voiced_f0.size == 0:
#         return {
#             "pitch_mean": 0.0,
#             "pitch_std": 0.0,
#             "pitch_range": 0.0,
#             "semitone_std": 0.0,
#             "pitch_variation_score": 0.0
#         }

#     # Step 4: Basic statistics
#     pitch_mean = float(np.mean(voiced_f0))
#     pitch_std = float(np.std(voiced_f0))
#     pitch_range = float(np.max(voiced_f0) - np.min(voiced_f0))


#     # Step 5: Compute semitone-based variation
#     median_f0 = np.median(voiced_f0)
#     if median_f0 <= 0:
#         median_f0 = 1e-6

#     semitone_diffs = 12 * np.log2(voiced_f0 / median_f0)
#     semitone_std = float(np.std(semitone_diffs))
  

#     # Step 6: Scale to 0–100 score
#     pitch_variation_score = float(np.clip((semitone_std / 6.0) * 100, 0, 100))
#     return {
#         "pitch_mean": pitch_mean,
#         "pitch_std": pitch_std,
#         "pitch_range": pitch_range,
#         "semitone_std": semitone_std,
#         "pitch_variation_score": pitch_variation_score
#     }

# def compute_intonation_range(file_path):
#     # Step 1: Load and prepare audio
#     y, sr = librosa.load(file_path, sr=None)

#     # Step 2: Extract F0 using librosa.pyin
#     f0, voiced_flags, voiced_probs = librosa.pyin(
#         y,
#         sr=sr,
#         fmin=80,
#         fmax=400,
#         frame_length=1105,  # ensures two periods of fmin fit
#         hop_length=256,
#         fill_na=np.nan
#     )

#     # Step 3: Filter voiced frames
#     voiced_f0 = f0[~np.isnan(f0)]
#     if voiced_f0.size == 0:
#         return 0.0, 0.0

#     # Optional: remove outliers (5th to 95th percentile)
#     voiced_f0 = voiced_f0[
#         (voiced_f0 > np.percentile(voiced_f0, 5)) &
#         (voiced_f0 < np.percentile(voiced_f0, 95))
#     ]

#     # Step 4: Compute intonation range in semitones
#     f0_min = np.min(voiced_f0)
#     f0_max = np.max(voiced_f0)
#     if f0_min <= 0:
#         f0_min = 1e-6

#     intonation_range = 12 * np.log2(f0_max / f0_min)

#     # Step 5: Normalize and convert to score out of 100
#     max_range = 12.0  # ~1 octave
#     normalized = min(intonation_range, max_range) / max_range
#     score = normalized * 100

#     return round(score, 2), float(intonation_range)



# def compute_speech_rhythm_variability(file_path):
#     """
#     Computes the speech rhythm variability score from an audio file.
#     The method estimates tempo consistency across time using onset intervals.

#     Returns:
#         score (float): Normalized rhythm variability score out of 100.
#         raw_std (float): Raw standard deviation of inter-onset intervals.
#     """
#     # Step 1: Load audio
#     y, sr = librosa.load(file_path, sr=None)

#     # Step 2: Onset detection 
#     onset_env = librosa.onset.onset_strength(y=y, sr=sr)
#     onsets = librosa.onset.onset_detect(onset_envelope=onset_env, sr=sr, units='time')

#     if len(onsets) < 2:
#         return 0.0, 0.0  # Not enough onsets to compute rhythm

#     # Step 3: Compute inter-onset intervals (IOIs) as rhythm proxy
#     iois = np.diff(onsets)

#     # Optional: Remove outliers (5th–95th percentile)
#     ioi_clean = iois[(iois > np.percentile(iois, 5)) & (iois < np.percentile(iois, 95))]
#     if len(ioi_clean) < 2:
#         return 0.0, 0.0

#     # Step 4: Compute variability — standard deviation of IOIs
#     raw_std = np.std(ioi_clean)

#     # Step 5: Normalize raw_std to 0–100 score
#     # Lower std = more consistent rhythm → higher score
#     min_std = 0.05   # near-perfect rhythm (tight pacing)
#     max_std = 0.6    # highly irregular rhythm

#     # Clamp and reverse-score
#     clamped_std = np.clip(raw_std, min_std, max_std)
#     normalized = 1 - (clamped_std - min_std) / (max_std - min_std)
#     score = normalized * 100

#     return round(score, 2), round(float(raw_std), 4)


# def calc_sds(file_path):
    
#     # sds = 0.35 * pitch_variation + 0.35 * intonation_range + 0.3 * speech_rhythm_variability
    
#     pitch_variation = compute_pitch_variation(file_path)
#     intonation_range = compute_intonation_range(file_path)
#     speech_rhythm_variability = compute_speech_rhythm_variability(file_path)
#     # print(f"Speech Rhythm Variability Score: {speech_rhythm_variability}")
#     # print(f"Speech Rhythm Variability Score: {speech_rhythm_variability}")
#     # print(f"Speech Rhythm Variability Score: {speech_rhythm_variability}")
    
#     sds = 0.35 * pitch_variation['pitch_variation_score'] + 0.35 * intonation_range[0] + 0.3 * speech_rhythm_variability[0]
#     return round(sds, 2)

# path = r'D:\Intern\shankh\audio_samples\anga.wav'

# result = calc_sds(path)
# print(f"SDS: {result}")

import numpy as np
import librosa
import pyworld

def compute_pitch_variation(file_path):
    """Measure how much the pitch (F0) varies across an audio file.

    The F0 track is estimated with ``pyworld.harvest`` and refined with
    ``pyworld.stonemask``. Only voiced frames (F0 > 0) are kept, and of those
    only the values strictly between their 5th and 95th percentiles are used
    for the statistics.

    Args:
        file_path: Path of the audio file; passed to ``librosa.load`` with
            the file's native sample rate (``sr=None``).

    Returns:
        dict of Python floats with the keys:
            "pitch_mean": mean of the retained F0 values.
            "pitch_std": standard deviation of the retained F0 values.
            "pitch_range": maximum minus minimum of the retained F0 values.
            "semitone_std": standard deviation of the retained F0 values
                expressed in semitones relative to their median.
            "pitch_variation_score": ``semitone_std`` scaled so that 6
                semitones maps to 100, clipped to the range [0, 100].
        All values are 0.0 when no voiced frame is found or when the
        percentile trimming leaves nothing.
    """
    no_pitch_result = {
        "pitch_mean": 0.0,
        "pitch_std": 0.0,
        "pitch_range": 0.0,
        "semitone_std": 0.0,
        "pitch_variation_score": 0.0
    }

    # Step 1: Load audio
    y, sr = librosa.load(file_path, sr=None)
    # Convert once and reuse for both pyworld calls.
    signal = y.astype(np.float64)

    # Step 2: Extract pitch using pyworld
    # Frame period corresponding to 256 samples, expressed in milliseconds.
    frame_period = 1000 * 256 / sr
    _f0, t = pyworld.harvest(
        signal, sr, f0_floor=80.0, f0_ceil=400.0, frame_period=frame_period
    )
    f0 = pyworld.stonemask(signal, _f0, t, sr)

    # Step 3: Filter voiced frames
    voiced_f0 = f0[f0 > 0]

    # Guard BEFORE the percentile trimming: np.percentile is not meaningful on
    # an empty array (depending on the NumPy version it raises or yields NaN),
    # so fully unvoiced/silent input must short-circuit here.
    if voiced_f0.size == 0:
        return no_pitch_result

    # Remove outliers (5th to 95th percentile, bounds excluded)
    lower_bound = np.percentile(voiced_f0, 5)
    upper_bound = np.percentile(voiced_f0, 95)
    voiced_f0 = voiced_f0[(voiced_f0 > lower_bound) & (voiced_f0 < upper_bound)]

    # The strict bounds can discard everything (e.g. very few or identical values).
    if voiced_f0.size == 0:
        return no_pitch_result

    # Step 4: Basic statistics
    pitch_mean = float(np.mean(voiced_f0))
    pitch_std = float(np.std(voiced_f0))
    pitch_range = float(np.max(voiced_f0) - np.min(voiced_f0))

    # Step 5: Semitone-based variation
    # Every retained value is > 0, so the median is > 0 and the ratio is safe.
    median_f0 = np.median(voiced_f0)
    semitone_diffs = 12 * np.log2(voiced_f0 / median_f0)
    semitone_std = float(np.std(semitone_diffs))

    # Step 6: Scaled variation score
    pitch_variation_score = float(np.clip((semitone_std / 6.0) * 100, 0, 100))

    return {
        "pitch_mean": pitch_mean,
        "pitch_std": pitch_std,
        "pitch_range": pitch_range,
        "semitone_std": semitone_std,
        "pitch_variation_score": pitch_variation_score
    }


def compute_intonation_range(file_path):
    """Estimate the intonation range of an audio file in semitones.

    F0 is estimated with ``pyworld.harvest`` and refined with
    ``pyworld.stonemask``. Voiced frames (F0 > 0) are trimmed to the values
    strictly between their 5th and 95th percentiles, and the range is
    ``12 * log2(max / min)`` of what remains.

    Args:
        file_path: Path of the audio file; passed to ``librosa.load`` with
            ``sr=None``.

    Returns:
        tuple: ``(score, intonation_range)`` where ``intonation_range`` is the
        range in semitones and ``score`` is that range capped at 12 semitones,
        rescaled to 0-100 and rounded to two decimals. ``(0.0, 0.0)`` is
        returned when there are no voiced frames or trimming removes them all.
    """
    audio, rate = librosa.load(file_path, sr=None)

    # Coarse F0 estimate, then refinement on the same time axis.
    coarse_f0, frame_times = pyworld.harvest(
        audio.astype(np.float64),
        rate,
        f0_floor=80.0,
        f0_ceil=400.0,
        frame_period=1000 * 256 / rate,
    )
    refined_f0 = pyworld.stonemask(
        audio.astype(np.float64), coarse_f0, frame_times, rate
    )

    voiced = refined_f0[refined_f0 > 0]
    if voiced.size == 0:
        return 0.0, 0.0

    # Keep only values strictly inside the 5th-95th percentile band.
    low_cut = np.percentile(voiced, 5)
    high_cut = np.percentile(voiced, 95)
    trimmed = voiced[(voiced > low_cut) & (voiced < high_cut)]
    if trimmed.size == 0:
        return 0.0, 0.0

    lowest = np.min(trimmed)
    highest = np.max(trimmed)
    if lowest <= 0:
        lowest = 1e-6  # keep the ratio below finite
    semitone_span = 12 * np.log2(highest / lowest)

    # Cap at 12 semitones and express as a percentage of that cap.
    cap = 12.0
    score = min(semitone_span, cap) / cap * 100

    return round(score, 2), float(semitone_span)


def compute_speech_rhythm_variability(file_path):
    """Score rhythm regularity from the spacing between detected onsets.

    Onsets are detected with librosa (``units='time'``); the differences
    between consecutive onset times (inter-onset intervals) are trimmed to
    the values strictly between their 5th and 95th percentiles, and the
    standard deviation of the remaining intervals is the raw measure.

    Note that the score is inverted relative to the raw measure: a standard
    deviation of 0.05 or less maps to 100, 0.6 or more maps to 0, so a
    higher score means a more regular rhythm.

    Args:
        file_path: Path of the audio file; passed to ``librosa.load`` with
            ``sr=None``.

    Returns:
        tuple: ``(score, raw_std)`` with ``score`` rounded to two decimals
        and ``raw_std`` rounded to four. ``(0.0, 0.0)`` is returned when
        fewer than two onsets are detected or fewer than two intervals
        survive trimming.
    """
    signal, rate = librosa.load(file_path, sr=None)

    # Onset detection
    strength = librosa.onset.onset_strength(y=signal, sr=rate)
    onset_times = librosa.onset.onset_detect(
        onset_envelope=strength, sr=rate, units='time'
    )
    if len(onset_times) < 2:
        return 0.0, 0.0

    intervals = np.diff(onset_times)

    # Drop extreme intervals (bounds themselves are excluded).
    low_cut = np.percentile(intervals, 5)
    high_cut = np.percentile(intervals, 95)
    kept = intervals[(intervals > low_cut) & (intervals < high_cut)]
    if len(kept) < 2:
        return 0.0, 0.0

    spread = np.std(kept)

    # Clamp the spread to [floor, ceiling], then reverse-scale it to 0-100.
    floor, ceiling = 0.05, 0.6
    bounded = np.clip(spread, floor, ceiling)
    score = (1 - (bounded - floor) / (ceiling - floor)) * 100

    return round(score, 2), round(float(spread), 4)


def calc_sds(file_path):
    """Combine the three speech metrics of an audio file into one SDS value.

    The result is the weighted sum
    ``0.35 * pitch_variation_score + 0.35 * intonation score
    + 0.3 * rhythm score``, rounded to two decimals.

    Args:
        file_path: Path of the audio file; forwarded unchanged to
            ``compute_pitch_variation``, ``compute_intonation_range`` and
            ``compute_speech_rhythm_variability``.

    Returns:
        The weighted score rounded to two decimal places.
    """
    pitch_score = compute_pitch_variation(file_path)['pitch_variation_score']
    # Both helpers return (score, raw_value); only the score is weighted.
    intonation_score, _ = compute_intonation_range(file_path)
    rhythm_score, _ = compute_speech_rhythm_variability(file_path)

    weighted_total = (
        0.35 * pitch_score
        + 0.35 * intonation_score
        + 0.3 * rhythm_score
    )
    return round(weighted_total, 2)