import scipy.signal | |
import numpy as np | |
import librosa | |
import pyworld as pw | |
# def compute_pitch_variation(file_path): | |
# # Step 1: Load audio | |
# y, sr = librosa.load(file_path, sr=None) | |
# y = y.astype(np.float64) # pyworld expects float64 | |
# # Step 2: Extract pitch (F0) | |
# _f0, t = pw.dio(y, sr) # Fast initial pitch estimation | |
# f0 = pw.stonemask(y, _f0, t, sr) # Refinement step | |
# # Step 3: Filter voiced frames | |
# voiced_f0 = f0[f0 > 0] | |
# # Handle empty case | |
# if voiced_f0.size == 0: | |
# return { | |
# "pitch_mean": 0.0, | |
# "pitch_std": 0.0, | |
# "pitch_range": 0.0, | |
# "semitone_std": 0.0, | |
# "pitch_variation_score": 0.0 | |
# } | |
# # Step 4: Basic statistics | |
# pitch_mean = np.mean(voiced_f0) | |
# pitch_std = np.std(voiced_f0) | |
# pitch_range = np.max(voiced_f0) - np.min(voiced_f0) | |
# print(pitch_mean) | |
# print(f'voiced_f0: {voiced_f0}') | |
# # Step 5: Compute semitone-based variation (better for human perception) | |
# median_f0 = np.median(voiced_f0) | |
# if median_f0 <= 0: | |
# median_f0 = 1e-6 # Avoid division by zero | |
# semitone_diffs = 12 * np.log2(voiced_f0 / median_f0) | |
# semitone_std = np.std(semitone_diffs) | |
# print(semitone_std) | |
# # Step 6: Scale semitone_std to a 0–100 score (tunable)
# # For example: semitone_std of 0 → 0 score, ≥6 semitones → 100 score
# pitch_variation_score = np.clip((semitone_std / 6.0) * 100, 0, 100) | |
# return { | |
# "pitch_mean": pitch_mean, | |
# "pitch_std": pitch_std, | |
# "pitch_range": pitch_range, | |
# "semitone_std": semitone_std, | |
# "pitch_variation_score": pitch_variation_score | |
# } | |
# def compute_intonation_range(file_path): | |
# # Step 1: Load and prepare audio | |
# y, sr = librosa.load(file_path, sr=None) | |
# y = y.astype(np.float64) | |
# # Step 2: Extract F0 | |
# _f0, t = pw.dio(y, sr) | |
# f0 = pw.stonemask(y, _f0, t, sr) | |
# # Step 3: Filter voiced frames | |
# voiced_f0 = f0[f0 > 0] | |
# if voiced_f0.size == 0: | |
# return 0.0 | |
# voiced_f0 = voiced_f0[(voiced_f0 > np.percentile(voiced_f0, 5)) & | |
# (voiced_f0 < np.percentile(voiced_f0, 95))] | |
# # Step 4: Compute intonation range (in semitones) | |
# f0_min = np.min(voiced_f0) | |
# f0_max = np.max(voiced_f0) | |
# if f0_min <= 0: | |
# f0_min = 1e-6 # to avoid log error | |
# intonation_range = 12 * np.log2(f0_max / f0_min) | |
# # range into scores: | |
# max_range = 12.0 | |
# normalized = min(intonation_range, max_range) / max_range | |
# score = normalized * 100 | |
# return round(score, 2), intonation_range | |
# def compute_pitch_variation(file_path): | |
# # Step 1: Load audio | |
# y, sr = librosa.load(file_path, sr=None) | |
# # Step 2: Extract pitch using librosa.pyin (YIN-based) | |
# f0, voiced_flags, voiced_probs = librosa.pyin( | |
# y, | |
# sr=sr, | |
# fmin=80, | |
# fmax=400, | |
# frame_length=1105, | |
# hop_length=256, | |
# fill_na=np.nan | |
# ) | |
# # Step 3: Filter voiced frames | |
# voiced_f0 = f0[~np.isnan(f0)] | |
# voiced_f0 = voiced_f0[ | |
# (voiced_f0 > np.percentile(voiced_f0, 5)) & | |
# (voiced_f0 < np.percentile(voiced_f0, 95)) | |
# ] | |
# # Handle empty case | |
# if voiced_f0.size == 0: | |
# return { | |
# "pitch_mean": 0.0, | |
# "pitch_std": 0.0, | |
# "pitch_range": 0.0, | |
# "semitone_std": 0.0, | |
# "pitch_variation_score": 0.0 | |
# } | |
# # Step 4: Basic statistics | |
# pitch_mean = float(np.mean(voiced_f0)) | |
# pitch_std = float(np.std(voiced_f0)) | |
# pitch_range = float(np.max(voiced_f0) - np.min(voiced_f0)) | |
# # Step 5: Compute semitone-based variation | |
# median_f0 = np.median(voiced_f0) | |
# if median_f0 <= 0: | |
# median_f0 = 1e-6 | |
# semitone_diffs = 12 * np.log2(voiced_f0 / median_f0) | |
# semitone_std = float(np.std(semitone_diffs)) | |
# # Step 6: Scale to 0–100 score
# pitch_variation_score = float(np.clip((semitone_std / 6.0) * 100, 0, 100)) | |
# return { | |
# "pitch_mean": pitch_mean, | |
# "pitch_std": pitch_std, | |
# "pitch_range": pitch_range, | |
# "semitone_std": semitone_std, | |
# "pitch_variation_score": pitch_variation_score | |
# } | |
# def compute_intonation_range(file_path): | |
# # Step 1: Load and prepare audio | |
# y, sr = librosa.load(file_path, sr=None) | |
# # Step 2: Extract F0 using librosa.pyin | |
# f0, voiced_flags, voiced_probs = librosa.pyin( | |
# y, | |
# sr=sr, | |
# fmin=80, | |
# fmax=400, | |
# frame_length=1105, # ensures two periods of fmin fit | |
# hop_length=256, | |
# fill_na=np.nan | |
# ) | |
# # Step 3: Filter voiced frames | |
# voiced_f0 = f0[~np.isnan(f0)] | |
# if voiced_f0.size == 0: | |
# return 0.0, 0.0 | |
# # Optional: remove outliers (5th to 95th percentile) | |
# voiced_f0 = voiced_f0[ | |
# (voiced_f0 > np.percentile(voiced_f0, 5)) & | |
# (voiced_f0 < np.percentile(voiced_f0, 95)) | |
# ] | |
# # Step 4: Compute intonation range in semitones | |
# f0_min = np.min(voiced_f0) | |
# f0_max = np.max(voiced_f0) | |
# if f0_min <= 0: | |
# f0_min = 1e-6 | |
# intonation_range = 12 * np.log2(f0_max / f0_min) | |
# # Step 5: Normalize and convert to score out of 100 | |
# max_range = 12.0 # ~1 octave | |
# normalized = min(intonation_range, max_range) / max_range | |
# score = normalized * 100 | |
# return round(score, 2), float(intonation_range) | |
# def compute_speech_rhythm_variability(file_path): | |
# """ | |
# Computes the speech rhythm variability score from an audio file. | |
# The method estimates tempo consistency across time using onset intervals. | |
# Returns: | |
# score (float): Normalized rhythm variability score out of 100. | |
# raw_std (float): Raw standard deviation of inter-onset intervals. | |
# """ | |
# # Step 1: Load audio | |
# y, sr = librosa.load(file_path, sr=None) | |
# # Step 2: Onset detection | |
# onset_env = librosa.onset.onset_strength(y=y, sr=sr) | |
# onsets = librosa.onset.onset_detect(onset_envelope=onset_env, sr=sr, units='time') | |
# if len(onsets) < 2: | |
# return 0.0, 0.0 # Not enough onsets to compute rhythm | |
# # Step 3: Compute inter-onset intervals (IOIs) as rhythm proxy | |
# iois = np.diff(onsets) | |
# # Optional: Remove outliers (5th–95th percentile)
# ioi_clean = iois[(iois > np.percentile(iois, 5)) & (iois < np.percentile(iois, 95))] | |
# if len(ioi_clean) < 2: | |
# return 0.0, 0.0 | |
# # Step 4: Compute variability β standard deviation of IOIs | |
# raw_std = np.std(ioi_clean) | |
# # Step 5: Normalize raw_std to 0–100 score
# # Lower std = more consistent rhythm → higher score
# min_std = 0.05 # near-perfect rhythm (tight pacing) | |
# max_std = 0.6 # highly irregular rhythm | |
# # Clamp and reverse-score | |
# clamped_std = np.clip(raw_std, min_std, max_std) | |
# normalized = 1 - (clamped_std - min_std) / (max_std - min_std) | |
# score = normalized * 100 | |
# return round(score, 2), round(float(raw_std), 4) | |
# def calc_sds(file_path): | |
# # sds = 0.35 * pitch_variation + 0.35 * intonation_range + 0.3 * speech_rhythm_variability | |
# pitch_variation = compute_pitch_variation(file_path) | |
# intonation_range = compute_intonation_range(file_path) | |
# speech_rhythm_variability = compute_speech_rhythm_variability(file_path) | |
# # print(f"Speech Rhythm Variability Score: {speech_rhythm_variability}") | |
# # print(f"Speech Rhythm Variability Score: {speech_rhythm_variability}") | |
# # print(f"Speech Rhythm Variability Score: {speech_rhythm_variability}") | |
# sds = 0.35 * pitch_variation['pitch_variation_score'] + 0.35 * intonation_range[0] + 0.3 * speech_rhythm_variability[0] | |
# return round(sds, 2) | |
# path = r'D:\Intern\shankh\audio_samples\anga.wav' | |
# result = calc_sds(path) | |
# print(f"SDS: {result}") | |
import numpy as np | |
import librosa | |
import pyworld | |
def compute_pitch_variation(file_path):
    """Compute pitch (F0) variation statistics for an audio file.

    Estimates F0 with pyworld (harvest + stonemask refinement), keeps
    voiced frames, trims outliers, and summarizes how much the pitch
    varies, both in Hz and in perceptually-motivated semitones.

    Args:
        file_path: Path to an audio file readable by librosa.

    Returns:
        dict with float values for keys "pitch_mean", "pitch_std",
        "pitch_range", "semitone_std", "pitch_variation_score"
        (score is clipped to 0-100). All zeros when no voiced frames
        survive filtering.
    """
    # Step 1: Load audio at its native sample rate.
    y, sr = librosa.load(file_path, sr=None)

    # Step 2: Extract pitch with pyworld (expects float64 samples).
    # frame_period expresses a 256-sample hop in milliseconds.
    y64 = y.astype(np.float64)
    _f0, t = pyworld.harvest(
        y64, sr, f0_floor=80.0, f0_ceil=400.0,
        frame_period=1000 * 256 / sr,
    )
    f0 = pyworld.stonemask(y64, _f0, t, sr)

    # Step 3: Keep voiced frames only (pyworld marks unvoiced frames as 0).
    voiced_f0 = f0[f0 > 0]

    _empty_result = {
        "pitch_mean": 0.0,
        "pitch_std": 0.0,
        "pitch_range": 0.0,
        "semitone_std": 0.0,
        "pitch_variation_score": 0.0,
    }

    # BUG FIX: the empty check must happen BEFORE the percentile filter —
    # np.percentile raises IndexError on an empty array, so a fully
    # unvoiced clip used to crash instead of returning zeros.
    if voiced_f0.size == 0:
        return dict(_empty_result)

    # Remove outliers, keeping values strictly inside the 5th-95th
    # percentile band.
    voiced_f0 = voiced_f0[
        (voiced_f0 > np.percentile(voiced_f0, 5)) &
        (voiced_f0 < np.percentile(voiced_f0, 95))
    ]
    # Strict bounds can empty the array (e.g. constant F0), so re-check.
    if voiced_f0.size == 0:
        return dict(_empty_result)

    # Step 4: Basic statistics in Hz.
    pitch_mean = float(np.mean(voiced_f0))
    pitch_std = float(np.std(voiced_f0))
    pitch_range = float(np.max(voiced_f0) - np.min(voiced_f0))

    # Step 5: Semitone-based variation relative to the median
    # (log-scale spread matches human pitch perception better than Hz).
    median_f0 = np.median(voiced_f0)
    if median_f0 <= 0:
        median_f0 = 1e-6  # guard against log2 of non-positive values
    semitone_diffs = 12 * np.log2(voiced_f0 / median_f0)
    semitone_std = float(np.std(semitone_diffs))

    # Step 6: Map semitone spread to a 0-100 score; 6 semitones of
    # std-dev (or more) saturates the score.
    pitch_variation_score = float(np.clip((semitone_std / 6.0) * 100, 0, 100))

    return {
        "pitch_mean": pitch_mean,
        "pitch_std": pitch_std,
        "pitch_range": pitch_range,
        "semitone_std": semitone_std,
        "pitch_variation_score": pitch_variation_score,
    }
def compute_intonation_range(file_path):
    """Measure the intonation range of an audio file in semitones.

    Extracts an F0 track with pyworld, trims outliers, and reports how
    wide the pitch excursion is, normalized against one octave.

    Args:
        file_path: Path to an audio file readable by librosa.

    Returns:
        (score, intonation_range): score is 0-100 rounded to 2 decimals
        (12 semitones caps the scale); intonation_range is the raw
        semitone span. Both are 0.0 when no voiced frames survive.
    """
    # Load at native sample rate; pyworld needs float64 input.
    audio, rate = librosa.load(file_path, sr=None)
    samples = audio.astype(np.float64)

    # F0 estimation: harvest for the contour, stonemask to refine it.
    # frame_period is a 256-sample hop expressed in milliseconds.
    coarse_f0, frame_times = pyworld.harvest(
        samples, rate, f0_floor=80.0, f0_ceil=400.0,
        frame_period=1000 * 256 / rate,
    )
    refined_f0 = pyworld.stonemask(samples, coarse_f0, frame_times, rate)

    # Unvoiced frames come back as 0; drop them.
    voiced = refined_f0[refined_f0 > 0]
    if voiced.size == 0:
        return 0.0, 0.0

    # Trim to values strictly inside the 5th-95th percentile band.
    lower_cut = np.percentile(voiced, 5)
    upper_cut = np.percentile(voiced, 95)
    trimmed = voiced[(voiced > lower_cut) & (voiced < upper_cut)]
    if trimmed.size == 0:
        return 0.0, 0.0

    # Semitone span between the lowest and highest retained F0.
    floor_hz = np.min(trimmed)
    ceil_hz = np.max(trimmed)
    if floor_hz <= 0:
        floor_hz = 1e-6  # avoid log2 of a non-positive value
    intonation_range = 12 * np.log2(ceil_hz / floor_hz)

    # Score: one octave (12 semitones) or more maps to 100.
    octave = 12.0
    score = (min(intonation_range, octave) / octave) * 100
    return round(score, 2), float(intonation_range)
def compute_speech_rhythm_variability(file_path):
    """
    Computes the speech rhythm variability score from an audio file.
    The method estimates tempo consistency across time using onset intervals.

    Returns:
        (score, raw_std): score is 0-100 rounded to 2 decimals, where a
        steadier rhythm (smaller interval spread) scores higher; raw_std
        is the standard deviation of the trimmed inter-onset intervals
        in seconds, rounded to 4 decimals. (0.0, 0.0) when too few
        onsets are detected.
    """
    signal, rate = librosa.load(file_path, sr=None)

    # Detect onsets and convert them directly to timestamps (seconds).
    envelope = librosa.onset.onset_strength(y=signal, sr=rate)
    onset_times = librosa.onset.onset_detect(
        onset_envelope=envelope, sr=rate, units='time'
    )
    if len(onset_times) < 2:
        return 0.0, 0.0

    # Inter-onset intervals are the rhythm proxy; trim outliers to the
    # open 5th-95th percentile band.
    intervals = np.diff(onset_times)
    low_cut = np.percentile(intervals, 5)
    high_cut = np.percentile(intervals, 95)
    kept = intervals[(intervals > low_cut) & (intervals < high_cut)]
    if len(kept) < 2:
        return 0.0, 0.0

    spread = np.std(kept)

    # Reverse-score the spread: 0.05 s std ~ near-perfect pacing,
    # 0.6 s std ~ highly irregular rhythm.
    tight, loose = 0.05, 0.6
    bounded = np.clip(spread, tight, loose)
    steadiness = 1 - (bounded - tight) / (loose - tight)
    return round(steadiness * 100, 2), round(float(spread), 4)
def calc_sds(file_path):
    """Compute the overall speech dynamism score (SDS) for an audio file.

    Weighted blend of three 0-100 sub-scores:
    35% pitch variation + 35% intonation range + 30% rhythm consistency.

    Args:
        file_path: Path to an audio file readable by librosa.

    Returns:
        float: SDS rounded to 2 decimal places.
    """
    variation_score = compute_pitch_variation(file_path)["pitch_variation_score"]
    intonation_score, _ = compute_intonation_range(file_path)
    rhythm_score, _ = compute_speech_rhythm_variability(file_path)

    sds = (
        0.35 * variation_score
        + 0.35 * intonation_score
        + 0.3 * rhythm_score
    )
    return round(sds, 2)