# Source: Hugging Face upload by mulasagg — commit 8ad2ab3, "Add application file".
import scipy.signal
import numpy as np
import librosa
import pyworld as pw
# NOTE: Everything commented out below (through the example `calc_sds` call)
# is an earlier draft of the same functions, kept for reference only; the
# active implementations start after the second import block further down.
# def compute_pitch_variation(file_path):
# # Step 1: Load audio
# y, sr = librosa.load(file_path, sr=None)
# y = y.astype(np.float64) # pyworld expects float64
# # Step 2: Extract pitch (F0)
# _f0, t = pw.dio(y, sr) # Fast initial pitch estimation
# f0 = pw.stonemask(y, _f0, t, sr) # Refinement step
# # Step 3: Filter voiced frames
# voiced_f0 = f0[f0 > 0]
# # Handle empty case
# if voiced_f0.size == 0:
# return {
# "pitch_mean": 0.0,
# "pitch_std": 0.0,
# "pitch_range": 0.0,
# "semitone_std": 0.0,
# "pitch_variation_score": 0.0
# }
# # Step 4: Basic statistics
# pitch_mean = np.mean(voiced_f0)
# pitch_std = np.std(voiced_f0)
# pitch_range = np.max(voiced_f0) - np.min(voiced_f0)
# print(pitch_mean)
# print(f'voiced_f0: {voiced_f0}')
# # Step 5: Compute semitone-based variation (better for human perception)
# median_f0 = np.median(voiced_f0)
# if median_f0 <= 0:
# median_f0 = 1e-6 # Avoid division by zero
# semitone_diffs = 12 * np.log2(voiced_f0 / median_f0)
# semitone_std = np.std(semitone_diffs)
# print(semitone_std)
# # Step 6: Scale semitone_std to a 0–100 score (tunable)
# # For example: semitone_std of 0 β†’ 0 score, β‰₯6 semitones β†’ 100 score
# pitch_variation_score = np.clip((semitone_std / 6.0) * 100, 0, 100)
# return {
# "pitch_mean": pitch_mean,
# "pitch_std": pitch_std,
# "pitch_range": pitch_range,
# "semitone_std": semitone_std,
# "pitch_variation_score": pitch_variation_score
# }
# def compute_intonation_range(file_path):
# # Step 1: Load and prepare audio
# y, sr = librosa.load(file_path, sr=None)
# y = y.astype(np.float64)
# # Step 2: Extract F0
# _f0, t = pw.dio(y, sr)
# f0 = pw.stonemask(y, _f0, t, sr)
# # Step 3: Filter voiced frames
# voiced_f0 = f0[f0 > 0]
# if voiced_f0.size == 0:
# return 0.0
# voiced_f0 = voiced_f0[(voiced_f0 > np.percentile(voiced_f0, 5)) &
# (voiced_f0 < np.percentile(voiced_f0, 95))]
# # Step 4: Compute intonation range (in semitones)
# f0_min = np.min(voiced_f0)
# f0_max = np.max(voiced_f0)
# if f0_min <= 0:
# f0_min = 1e-6 # to avoid log error
# intonation_range = 12 * np.log2(f0_max / f0_min)
# # range into scores:
# max_range = 12.0
# normalized = min(intonation_range, max_range) / max_range
# score = normalized * 100
# return round(score, 2), intonation_range
# def compute_pitch_variation(file_path):
# # Step 1: Load audio
# y, sr = librosa.load(file_path, sr=None)
# # Step 2: Extract pitch using librosa.pyin (YIN-based)
# f0, voiced_flags, voiced_probs = librosa.pyin(
# y,
# sr=sr,
# fmin=80,
# fmax=400,
# frame_length=1105,
# hop_length=256,
# fill_na=np.nan
# )
# # Step 3: Filter voiced frames
# voiced_f0 = f0[~np.isnan(f0)]
# voiced_f0 = voiced_f0[
# (voiced_f0 > np.percentile(voiced_f0, 5)) &
# (voiced_f0 < np.percentile(voiced_f0, 95))
# ]
# # Handle empty case
# if voiced_f0.size == 0:
# return {
# "pitch_mean": 0.0,
# "pitch_std": 0.0,
# "pitch_range": 0.0,
# "semitone_std": 0.0,
# "pitch_variation_score": 0.0
# }
# # Step 4: Basic statistics
# pitch_mean = float(np.mean(voiced_f0))
# pitch_std = float(np.std(voiced_f0))
# pitch_range = float(np.max(voiced_f0) - np.min(voiced_f0))
# # Step 5: Compute semitone-based variation
# median_f0 = np.median(voiced_f0)
# if median_f0 <= 0:
# median_f0 = 1e-6
# semitone_diffs = 12 * np.log2(voiced_f0 / median_f0)
# semitone_std = float(np.std(semitone_diffs))
# # Step 6: Scale to 0–100 score
# pitch_variation_score = float(np.clip((semitone_std / 6.0) * 100, 0, 100))
# return {
# "pitch_mean": pitch_mean,
# "pitch_std": pitch_std,
# "pitch_range": pitch_range,
# "semitone_std": semitone_std,
# "pitch_variation_score": pitch_variation_score
# }
# def compute_intonation_range(file_path):
# # Step 1: Load and prepare audio
# y, sr = librosa.load(file_path, sr=None)
# # Step 2: Extract F0 using librosa.pyin
# f0, voiced_flags, voiced_probs = librosa.pyin(
# y,
# sr=sr,
# fmin=80,
# fmax=400,
# frame_length=1105, # ensures two periods of fmin fit
# hop_length=256,
# fill_na=np.nan
# )
# # Step 3: Filter voiced frames
# voiced_f0 = f0[~np.isnan(f0)]
# if voiced_f0.size == 0:
# return 0.0, 0.0
# # Optional: remove outliers (5th to 95th percentile)
# voiced_f0 = voiced_f0[
# (voiced_f0 > np.percentile(voiced_f0, 5)) &
# (voiced_f0 < np.percentile(voiced_f0, 95))
# ]
# # Step 4: Compute intonation range in semitones
# f0_min = np.min(voiced_f0)
# f0_max = np.max(voiced_f0)
# if f0_min <= 0:
# f0_min = 1e-6
# intonation_range = 12 * np.log2(f0_max / f0_min)
# # Step 5: Normalize and convert to score out of 100
# max_range = 12.0 # ~1 octave
# normalized = min(intonation_range, max_range) / max_range
# score = normalized * 100
# return round(score, 2), float(intonation_range)
# def compute_speech_rhythm_variability(file_path):
# """
# Computes the speech rhythm variability score from an audio file.
# The method estimates tempo consistency across time using onset intervals.
# Returns:
# score (float): Normalized rhythm variability score out of 100.
# raw_std (float): Raw standard deviation of inter-onset intervals.
# """
# # Step 1: Load audio
# y, sr = librosa.load(file_path, sr=None)
# # Step 2: Onset detection
# onset_env = librosa.onset.onset_strength(y=y, sr=sr)
# onsets = librosa.onset.onset_detect(onset_envelope=onset_env, sr=sr, units='time')
# if len(onsets) < 2:
# return 0.0, 0.0 # Not enough onsets to compute rhythm
# # Step 3: Compute inter-onset intervals (IOIs) as rhythm proxy
# iois = np.diff(onsets)
# # Optional: Remove outliers (5th–95th percentile)
# ioi_clean = iois[(iois > np.percentile(iois, 5)) & (iois < np.percentile(iois, 95))]
# if len(ioi_clean) < 2:
# return 0.0, 0.0
# # Step 4: Compute variability β€” standard deviation of IOIs
# raw_std = np.std(ioi_clean)
# # Step 5: Normalize raw_std to 0–100 score
# # Lower std = more consistent rhythm β†’ higher score
# min_std = 0.05 # near-perfect rhythm (tight pacing)
# max_std = 0.6 # highly irregular rhythm
# # Clamp and reverse-score
# clamped_std = np.clip(raw_std, min_std, max_std)
# normalized = 1 - (clamped_std - min_std) / (max_std - min_std)
# score = normalized * 100
# return round(score, 2), round(float(raw_std), 4)
# def calc_sds(file_path):
# # sds = 0.35 * pitch_variation + 0.35 * intonation_range + 0.3 * speech_rhythm_variability
# pitch_variation = compute_pitch_variation(file_path)
# intonation_range = compute_intonation_range(file_path)
# speech_rhythm_variability = compute_speech_rhythm_variability(file_path)
# # print(f"Speech Rhythm Variability Score: {speech_rhythm_variability}")
# # print(f"Speech Rhythm Variability Score: {speech_rhythm_variability}")
# # print(f"Speech Rhythm Variability Score: {speech_rhythm_variability}")
# sds = 0.35 * pitch_variation['pitch_variation_score'] + 0.35 * intonation_range[0] + 0.3 * speech_rhythm_variability[0]
# return round(sds, 2)
# path = r'D:\Intern\shankh\audio_samples\anga.wav'
# result = calc_sds(path)
# print(f"SDS: {result}")
import numpy as np
import librosa
import pyworld
def compute_pitch_variation(file_path):
    """Compute pitch (F0) variation statistics for an audio file.

    Extracts an F0 contour with pyworld (harvest + stonemask refinement),
    keeps only voiced frames, trims outliers to the 5th-95th percentile
    band, and derives a 0-100 variation score from the standard deviation
    of the contour expressed in semitones relative to the median pitch.

    Parameters
    ----------
    file_path : str
        Path to an audio file readable by librosa.

    Returns
    -------
    dict
        Keys: "pitch_mean", "pitch_std", "pitch_range" (all in Hz),
        "semitone_std", "pitch_variation_score" (0-100). All values are
        0.0 when no usable voiced frames are found.
    """
    # Step 1: Load audio at its native sample rate.
    y, sr = librosa.load(file_path, sr=None)
    # Step 2: Extract pitch using pyworld (expects float64 samples).
    # frame_period mirrors a 256-sample hop at this sample rate.
    _f0, t = pyworld.harvest(y.astype(np.float64), sr, f0_floor=80.0, f0_ceil=400.0, frame_period=1000 * 256 / sr)
    f0 = pyworld.stonemask(y.astype(np.float64), _f0, t, sr)
    # Step 3: Keep voiced frames only (pyworld reports unvoiced frames as 0).
    voiced_f0 = f0[f0 > 0]
    empty_result = {
        "pitch_mean": 0.0,
        "pitch_std": 0.0,
        "pitch_range": 0.0,
        "semitone_std": 0.0,
        "pitch_variation_score": 0.0
    }
    # Guard BEFORE the percentile filter: np.percentile raises on an empty
    # array, so a fully unvoiced clip would otherwise crash here.
    if voiced_f0.size == 0:
        return empty_result
    # Remove outliers (keep the open 5th-95th percentile band).
    voiced_f0 = voiced_f0[
        (voiced_f0 > np.percentile(voiced_f0, 5)) &
        (voiced_f0 < np.percentile(voiced_f0, 95))
    ]
    # The strict inequalities can empty the array (e.g. perfectly flat pitch).
    if voiced_f0.size == 0:
        return empty_result
    # Step 4: Basic statistics in Hz.
    pitch_mean = float(np.mean(voiced_f0))
    pitch_std = float(np.std(voiced_f0))
    pitch_range = float(np.max(voiced_f0) - np.min(voiced_f0))
    # Step 5: Semitone-based variation (perceptually motivated log scale).
    median_f0 = np.median(voiced_f0)
    if median_f0 <= 0:
        median_f0 = 1e-6  # avoid division by zero / log of non-positive
    semitone_diffs = 12 * np.log2(voiced_f0 / median_f0)
    semitone_std = float(np.std(semitone_diffs))
    # Step 6: Scale so a semitone std of >= 6 saturates at the max score.
    pitch_variation_score = float(np.clip((semitone_std / 6.0) * 100, 0, 100))
    return {
        "pitch_mean": pitch_mean,
        "pitch_std": pitch_std,
        "pitch_range": pitch_range,
        "semitone_std": semitone_std,
        "pitch_variation_score": pitch_variation_score
    }
def compute_intonation_range(file_path):
    """Estimate the speaker's intonation range for an audio file.

    Extracts an F0 contour with pyworld, discards unvoiced frames and the
    outer 5%/95% percentile outliers, then measures the spread between the
    lowest and highest remaining pitch in semitones.

    Parameters
    ----------
    file_path : str
        Path to an audio file readable by librosa.

    Returns
    -------
    tuple[float, float]
        (score out of 100, rounded to 2 decimals; raw range in semitones).
        Returns (0.0, 0.0) when no usable voiced frames are found.
    """
    samples, rate = librosa.load(file_path, sr=None)
    # pyworld needs float64 input; frame_period mirrors a 256-sample hop.
    audio64 = samples.astype(np.float64)
    coarse_f0, frame_times = pyworld.harvest(
        audio64, rate, f0_floor=80.0, f0_ceil=400.0,
        frame_period=1000 * 256 / rate)
    refined_f0 = pyworld.stonemask(audio64, coarse_f0, frame_times, rate)
    # Unvoiced frames are reported as 0 by pyworld.
    voiced = refined_f0[refined_f0 > 0]
    if voiced.size == 0:
        return 0.0, 0.0
    # Trim outliers outside the open (5th, 95th) percentile band.
    lower_cut = np.percentile(voiced, 5)
    upper_cut = np.percentile(voiced, 95)
    voiced = voiced[(voiced > lower_cut) & (voiced < upper_cut)]
    if voiced.size == 0:
        return 0.0, 0.0
    # Range of the surviving contour, expressed in semitones.
    lowest = np.min(voiced)
    highest = np.max(voiced)
    if lowest <= 0:
        lowest = 1e-6  # guard against log of a non-positive value
    semitone_span = 12 * np.log2(highest / lowest)
    # Map onto 0-100, saturating at one octave (12 semitones).
    full_scale = 12.0
    score = min(semitone_span, full_scale) / full_scale * 100
    return round(score, 2), float(semitone_span)
def compute_speech_rhythm_variability(file_path):
    """Score the regularity of speech rhythm in an audio file.

    Detects onsets, measures the spread of inter-onset intervals (IOIs),
    and maps a lower spread (steadier pacing) to a higher 0-100 score.

    Returns
    -------
    tuple[float, float]
        (score out of 100, rounded to 2 decimals; raw IOI standard
        deviation in seconds, rounded to 4 decimals). Returns (0.0, 0.0)
        when too few onsets survive to measure rhythm.
    """
    audio, rate = librosa.load(file_path, sr=None)
    # Onset times in seconds, derived from the onset-strength envelope.
    strength = librosa.onset.onset_strength(y=audio, sr=rate)
    onset_times = librosa.onset.onset_detect(
        onset_envelope=strength, sr=rate, units='time')
    if len(onset_times) < 2:
        return 0.0, 0.0
    # Inter-onset intervals serve as the rhythm proxy.
    gaps = np.diff(onset_times)
    # Drop extreme gaps outside the open (5th, 95th) percentile band.
    keep = (gaps > np.percentile(gaps, 5)) & (gaps < np.percentile(gaps, 95))
    gaps = gaps[keep]
    if len(gaps) < 2:
        return 0.0, 0.0
    spread = np.std(gaps)
    # Reverse-score the spread: std of 0.05 s (tight pacing) -> 100,
    # std of 0.6 s (highly irregular) -> 0.
    floor, ceiling = 0.05, 0.6
    bounded = np.clip(spread, floor, ceiling)
    score = (1 - (bounded - floor) / (ceiling - floor)) * 100
    return round(score, 2), round(float(spread), 4)
def calc_sds(file_path):
    """Compute the overall speech-dynamism score (SDS) for an audio file.

    Weighted blend of three prosody measures, each scored out of 100:
    pitch variation (0.35), intonation range (0.35), rhythm (0.3).

    Returns
    -------
    float
        SDS out of 100, rounded to 2 decimals.
    """
    pitch_stats = compute_pitch_variation(file_path)
    intonation_score, _range_semitones = compute_intonation_range(file_path)
    rhythm_score, _ioi_std = compute_speech_rhythm_variability(file_path)
    weighted_total = (
        0.35 * pitch_stats['pitch_variation_score']
        + 0.35 * intonation_score
        + 0.3 * rhythm_score
    )
    return round(weighted_total, 2)