import scipy.signal | |
import numpy as np | |
import librosa | |
import pyworld as pw | |
# def compute_pitch_variation(file_path): | |
# # Step 1: Load audio | |
# y, sr = librosa.load(file_path, sr=None) | |
# y = y.astype(np.float64) # pyworld expects float64 | |
# # Step 2: Extract pitch (F0) | |
# _f0, t = pw.dio(y, sr) # Fast initial pitch estimation | |
# f0 = pw.stonemask(y, _f0, t, sr) # Refinement step | |
# # Step 3: Filter voiced frames | |
# voiced_f0 = f0[f0 > 0] | |
# # Handle empty case | |
# if voiced_f0.size == 0: | |
# return { | |
# "pitch_mean": 0.0, | |
# "pitch_std": 0.0, | |
# "pitch_range": 0.0, | |
# "semitone_std": 0.0, | |
# "pitch_variation_score": 0.0 | |
# } | |
# # Step 4: Basic statistics | |
# pitch_mean = np.mean(voiced_f0) | |
# pitch_std = np.std(voiced_f0) | |
# pitch_range = np.max(voiced_f0) - np.min(voiced_f0) | |
# print(pitch_mean) | |
# print(f'voiced_f0: {voiced_f0}') | |
# # Step 5: Compute semitone-based variation (better for human perception) | |
# median_f0 = np.median(voiced_f0) | |
# if median_f0 <= 0: | |
# median_f0 = 1e-6 # Avoid division by zero | |
# semitone_diffs = 12 * np.log2(voiced_f0 / median_f0) | |
# semitone_std = np.std(semitone_diffs) | |
# print(semitone_std) | |
# # Step 6: Scale semitone_std to a 0–100 score (tunable)
# # For example: semitone_std of 0 → 0 score, ≥6 semitones → 100 score
# pitch_variation_score = np.clip((semitone_std / 6.0) * 100, 0, 100) | |
# return { | |
# "pitch_mean": pitch_mean, | |
# "pitch_std": pitch_std, | |
# "pitch_range": pitch_range, | |
# "semitone_std": semitone_std, | |
# "pitch_variation_score": pitch_variation_score | |
# } | |
# def compute_intonation_range(file_path): | |
# # Step 1: Load and prepare audio | |
# y, sr = librosa.load(file_path, sr=None) | |
# y = y.astype(np.float64) | |
# # Step 2: Extract F0 | |
# _f0, t = pw.dio(y, sr) | |
# f0 = pw.stonemask(y, _f0, t, sr) | |
# # Step 3: Filter voiced frames | |
# voiced_f0 = f0[f0 > 0] | |
# if voiced_f0.size == 0: | |
# return 0.0 | |
# voiced_f0 = voiced_f0[(voiced_f0 > np.percentile(voiced_f0, 5)) & | |
# (voiced_f0 < np.percentile(voiced_f0, 95))] | |
# # Step 4: Compute intonation range (in semitones) | |
# f0_min = np.min(voiced_f0) | |
# f0_max = np.max(voiced_f0) | |
# if f0_min <= 0: | |
# f0_min = 1e-6 # to avoid log error | |
# intonation_range = 12 * np.log2(f0_max / f0_min) | |
# # range into scores: | |
# max_range = 12.0 | |
# normalized = min(intonation_range, max_range) / max_range | |
# score = normalized * 100 | |
# return round(score, 2), intonation_range | |
# def compute_pitch_variation(file_path): | |
# # Step 1: Load audio | |
# y, sr = librosa.load(file_path, sr=None) | |
# # Step 2: Extract pitch using librosa.pyin (YIN-based) | |
# f0, voiced_flags, voiced_probs = librosa.pyin( | |
# y, | |
# sr=sr, | |
# fmin=80, | |
# fmax=400, | |
# frame_length=1105, | |
# hop_length=256, | |
# fill_na=np.nan | |
# ) | |
# # Step 3: Filter voiced frames | |
# voiced_f0 = f0[~np.isnan(f0)] | |
# voiced_f0 = voiced_f0[ | |
# (voiced_f0 > np.percentile(voiced_f0, 5)) & | |
# (voiced_f0 < np.percentile(voiced_f0, 95)) | |
# ] | |
# # Handle empty case | |
# if voiced_f0.size == 0: | |
# return { | |
# "pitch_mean": 0.0, | |
# "pitch_std": 0.0, | |
# "pitch_range": 0.0, | |
# "semitone_std": 0.0, | |
# "pitch_variation_score": 0.0 | |
# } | |
# # Step 4: Basic statistics | |
# pitch_mean = float(np.mean(voiced_f0)) | |
# pitch_std = float(np.std(voiced_f0)) | |
# pitch_range = float(np.max(voiced_f0) - np.min(voiced_f0)) | |
# # Step 5: Compute semitone-based variation | |
# median_f0 = np.median(voiced_f0) | |
# if median_f0 <= 0: | |
# median_f0 = 1e-6 | |
# semitone_diffs = 12 * np.log2(voiced_f0 / median_f0) | |
# semitone_std = float(np.std(semitone_diffs)) | |
# # Step 6: Scale to 0–100 score
# pitch_variation_score = float(np.clip((semitone_std / 6.0) * 100, 0, 100)) | |
# return { | |
# "pitch_mean": pitch_mean, | |
# "pitch_std": pitch_std, | |
# "pitch_range": pitch_range, | |
# "semitone_std": semitone_std, | |
# "pitch_variation_score": pitch_variation_score | |
# } | |
# def compute_intonation_range(file_path): | |
# # Step 1: Load and prepare audio | |
# y, sr = librosa.load(file_path, sr=None) | |
# # Step 2: Extract F0 using librosa.pyin | |
# f0, voiced_flags, voiced_probs = librosa.pyin( | |
# y, | |
# sr=sr, | |
# fmin=80, | |
# fmax=400, | |
# frame_length=1105, # ensures two periods of fmin fit | |
# hop_length=256, | |
# fill_na=np.nan | |
# ) | |
# # Step 3: Filter voiced frames | |
# voiced_f0 = f0[~np.isnan(f0)] | |
# if voiced_f0.size == 0: | |
# return 0.0, 0.0 | |
# # Optional: remove outliers (5th to 95th percentile) | |
# voiced_f0 = voiced_f0[ | |
# (voiced_f0 > np.percentile(voiced_f0, 5)) & | |
# (voiced_f0 < np.percentile(voiced_f0, 95)) | |
# ] | |
# # Step 4: Compute intonation range in semitones | |
# f0_min = np.min(voiced_f0) | |
# f0_max = np.max(voiced_f0) | |
# if f0_min <= 0: | |
# f0_min = 1e-6 | |
# intonation_range = 12 * np.log2(f0_max / f0_min) | |
# # Step 5: Normalize and convert to score out of 100 | |
# max_range = 12.0 # ~1 octave | |
# normalized = min(intonation_range, max_range) / max_range | |
# score = normalized * 100 | |
# return round(score, 2), float(intonation_range) | |
# def compute_speech_rhythm_variability(file_path): | |
# """ | |
# Computes the speech rhythm variability score from an audio file. | |
# The method estimates tempo consistency across time using onset intervals. | |
# Returns: | |
# score (float): Normalized rhythm variability score out of 100. | |
# raw_std (float): Raw standard deviation of inter-onset intervals. | |
# """ | |
# # Step 1: Load audio | |
# y, sr = librosa.load(file_path, sr=None) | |
# # Step 2: Onset detection | |
# onset_env = librosa.onset.onset_strength(y=y, sr=sr) | |
# onsets = librosa.onset.onset_detect(onset_envelope=onset_env, sr=sr, units='time') | |
# if len(onsets) < 2: | |
# return 0.0, 0.0 # Not enough onsets to compute rhythm | |
# # Step 3: Compute inter-onset intervals (IOIs) as rhythm proxy | |
# iois = np.diff(onsets) | |
# # Optional: Remove outliers (5th–95th percentile)
# ioi_clean = iois[(iois > np.percentile(iois, 5)) & (iois < np.percentile(iois, 95))] | |
# if len(ioi_clean) < 2: | |
# return 0.0, 0.0 | |
# # Step 4: Compute variability β standard deviation of IOIs | |
# raw_std = np.std(ioi_clean) | |
# # Step 5: Normalize raw_std to 0–100 score
# # Lower std = more consistent rhythm → higher score
# min_std = 0.05 # near-perfect rhythm (tight pacing) | |
# max_std = 0.6 # highly irregular rhythm | |
# # Clamp and reverse-score | |
# clamped_std = np.clip(raw_std, min_std, max_std) | |
# normalized = 1 - (clamped_std - min_std) / (max_std - min_std) | |
# score = normalized * 100 | |
# return round(score, 2), round(float(raw_std), 4) | |
# def calc_sds(file_path): | |
# # sds = 0.35 * pitch_variation + 0.35 * intonation_range + 0.3 * speech_rhythm_variability | |
# pitch_variation = compute_pitch_variation(file_path) | |
# intonation_range = compute_intonation_range(file_path) | |
# speech_rhythm_variability = compute_speech_rhythm_variability(file_path) | |
# # print(f"Speech Rhythm Variability Score: {speech_rhythm_variability}") | |
# # print(f"Speech Rhythm Variability Score: {speech_rhythm_variability}") | |
# # print(f"Speech Rhythm Variability Score: {speech_rhythm_variability}") | |
# sds = 0.35 * pitch_variation['pitch_variation_score'] + 0.35 * intonation_range[0] + 0.3 * speech_rhythm_variability[0] | |
# return round(sds, 2) | |
# path = r'D:\Intern\shankh\audio_samples\anga.wav' | |
# result = calc_sds(path) | |
# print(f"SDS: {result}") | |
import numpy as np | |
import librosa | |
import pyworld | |
def compute_pitch_variation(file_path):
    """Compute pitch (F0) variation statistics for an audio file.

    Estimates F0 with pyworld (harvest + stonemask refinement), keeps
    voiced frames, trims outliers, and summarizes how much the pitch
    varies, both in Hz and in perceptually-motivated semitones.

    Args:
        file_path: Path to an audio file readable by librosa.

    Returns:
        dict with float values for keys "pitch_mean", "pitch_std",
        "pitch_range", "semitone_std", "pitch_variation_score"
        (score is clipped to 0-100). All zeros when no voiced frames
        survive filtering.
    """
    # Step 1: Load audio at its native sample rate.
    y, sr = librosa.load(file_path, sr=None)

    # Step 2: Extract pitch with pyworld (expects float64 samples).
    # frame_period expresses a 256-sample hop in milliseconds.
    y64 = y.astype(np.float64)
    _f0, t = pyworld.harvest(
        y64, sr, f0_floor=80.0, f0_ceil=400.0,
        frame_period=1000 * 256 / sr,
    )
    f0 = pyworld.stonemask(y64, _f0, t, sr)

    # Step 3: Keep voiced frames only (pyworld marks unvoiced frames as 0).
    voiced_f0 = f0[f0 > 0]

    _empty_result = {
        "pitch_mean": 0.0,
        "pitch_std": 0.0,
        "pitch_range": 0.0,
        "semitone_std": 0.0,
        "pitch_variation_score": 0.0,
    }

    # BUG FIX: the empty check must happen BEFORE the percentile filter —
    # np.percentile raises IndexError on an empty array, so a fully
    # unvoiced clip used to crash instead of returning zeros.
    if voiced_f0.size == 0:
        return dict(_empty_result)

    # Remove outliers, keeping values strictly inside the 5th-95th
    # percentile band.
    voiced_f0 = voiced_f0[
        (voiced_f0 > np.percentile(voiced_f0, 5)) &
        (voiced_f0 < np.percentile(voiced_f0, 95))
    ]
    # Strict bounds can empty the array (e.g. constant F0), so re-check.
    if voiced_f0.size == 0:
        return dict(_empty_result)

    # Step 4: Basic statistics in Hz.
    pitch_mean = float(np.mean(voiced_f0))
    pitch_std = float(np.std(voiced_f0))
    pitch_range = float(np.max(voiced_f0) - np.min(voiced_f0))

    # Step 5: Semitone-based variation relative to the median
    # (log-scale spread matches human pitch perception better than Hz).
    median_f0 = np.median(voiced_f0)
    if median_f0 <= 0:
        median_f0 = 1e-6  # guard against log2 of non-positive values
    semitone_diffs = 12 * np.log2(voiced_f0 / median_f0)
    semitone_std = float(np.std(semitone_diffs))

    # Step 6: Map semitone spread to a 0-100 score; 6 semitones of
    # std-dev (or more) saturates the score.
    pitch_variation_score = float(np.clip((semitone_std / 6.0) * 100, 0, 100))

    return {
        "pitch_mean": pitch_mean,
        "pitch_std": pitch_std,
        "pitch_range": pitch_range,
        "semitone_std": semitone_std,
        "pitch_variation_score": pitch_variation_score,
    }
def compute_intonation_range(file_path):
    """Measure the intonation range of an audio file in semitones.

    Extracts an F0 track with pyworld, trims outliers, and reports how
    wide the pitch excursion is, normalized against one octave.

    Args:
        file_path: Path to an audio file readable by librosa.

    Returns:
        (score, intonation_range): score is 0-100 rounded to 2 decimals
        (12 semitones caps the scale); intonation_range is the raw
        semitone span. Both are 0.0 when no voiced frames survive.
    """
    # Load at native sample rate; pyworld needs float64 input.
    audio, rate = librosa.load(file_path, sr=None)
    samples = audio.astype(np.float64)

    # F0 estimation: harvest for the contour, stonemask to refine it.
    # frame_period is a 256-sample hop expressed in milliseconds.
    coarse_f0, frame_times = pyworld.harvest(
        samples, rate, f0_floor=80.0, f0_ceil=400.0,
        frame_period=1000 * 256 / rate,
    )
    refined_f0 = pyworld.stonemask(samples, coarse_f0, frame_times, rate)

    # Unvoiced frames come back as 0; drop them.
    voiced = refined_f0[refined_f0 > 0]
    if voiced.size == 0:
        return 0.0, 0.0

    # Trim to values strictly inside the 5th-95th percentile band.
    lower_cut = np.percentile(voiced, 5)
    upper_cut = np.percentile(voiced, 95)
    trimmed = voiced[(voiced > lower_cut) & (voiced < upper_cut)]
    if trimmed.size == 0:
        return 0.0, 0.0

    # Semitone span between the lowest and highest retained F0.
    floor_hz = np.min(trimmed)
    ceil_hz = np.max(trimmed)
    if floor_hz <= 0:
        floor_hz = 1e-6  # avoid log2 of a non-positive value
    intonation_range = 12 * np.log2(ceil_hz / floor_hz)

    # Score: one octave (12 semitones) or more maps to 100.
    octave = 12.0
    score = (min(intonation_range, octave) / octave) * 100
    return round(score, 2), float(intonation_range)
def compute_speech_rhythm_variability(file_path):
    """
    Computes the speech rhythm variability score from an audio file.
    The method estimates tempo consistency across time using onset intervals.

    Returns:
        (score, raw_std): score is 0-100 rounded to 2 decimals, where a
        steadier rhythm (smaller interval spread) scores higher; raw_std
        is the standard deviation of the trimmed inter-onset intervals
        in seconds, rounded to 4 decimals. (0.0, 0.0) when too few
        onsets are detected.
    """
    signal, rate = librosa.load(file_path, sr=None)

    # Detect onsets and convert them directly to timestamps (seconds).
    envelope = librosa.onset.onset_strength(y=signal, sr=rate)
    onset_times = librosa.onset.onset_detect(
        onset_envelope=envelope, sr=rate, units='time'
    )
    if len(onset_times) < 2:
        return 0.0, 0.0

    # Inter-onset intervals are the rhythm proxy; trim outliers to the
    # open 5th-95th percentile band.
    intervals = np.diff(onset_times)
    low_cut = np.percentile(intervals, 5)
    high_cut = np.percentile(intervals, 95)
    kept = intervals[(intervals > low_cut) & (intervals < high_cut)]
    if len(kept) < 2:
        return 0.0, 0.0

    spread = np.std(kept)

    # Reverse-score the spread: 0.05 s std ~ near-perfect pacing,
    # 0.6 s std ~ highly irregular rhythm.
    tight, loose = 0.05, 0.6
    bounded = np.clip(spread, tight, loose)
    steadiness = 1 - (bounded - tight) / (loose - tight)
    return round(steadiness * 100, 2), round(float(spread), 4)
def calc_sds(file_path):
    """Compute the overall speech dynamism score (SDS) for an audio file.

    Weighted blend of three 0-100 sub-scores:
    35% pitch variation + 35% intonation range + 30% rhythm consistency.

    Args:
        file_path: Path to an audio file readable by librosa.

    Returns:
        float: SDS rounded to 2 decimal places.
    """
    variation_score = compute_pitch_variation(file_path)["pitch_variation_score"]
    intonation_score, _ = compute_intonation_range(file_path)
    rhythm_score, _ = compute_speech_rhythm_variability(file_path)

    sds = (
        0.35 * variation_score
        + 0.35 * intonation_score
        + 0.3 * rhythm_score
    )
    return round(sds, 2)