# Fast_api/vers/compute_vers_score.py

import math

import librosa
import numpy as np
import pyworld

from filler_count.filler_score import analyze_fillers

from .filler_analyzer import detect_fillers
from .find_valence import get_valence_score
from .vers import calc_vers


def compute_vers_score(file_path: str, whisper_model, filler_count=None) -> dict:
    """
    Compute VERS (Vocal Emotional Regulation Score) and its components from a speech sample.

    If filler_count is None, it is derived from the transcript via analyze_fillers.
    Returns the dict produced by calc_vers.
    """
result = whisper_model.transcribe(file_path, word_timestamps=False, fp16=False)
transcript = result.get("text", "").strip()
segments = result.get("segments", [])
    if filler_count is None:
        # No pre-computed count was supplied: run the filler analyzer on the transcript
        filler_result = analyze_fillers(file_path, 'base', transcript)
        filler_count = filler_result.get("filler_count", 0)
# Load audio
y, sr = librosa.load(file_path, sr=None)
duration = len(y) / sr if sr else 0.0
    # Volume: mean RMS level converted to dB (floored at -80 dB for silence)
    rms = librosa.feature.rms(y=y)[0]
    mean_rms = float(np.mean(rms))
    mean_volume_db = 20 * math.log10(mean_rms + 1e-6) if mean_rms > 0 else -80.0
    volume_std = float(np.std(20 * np.log10(rms + 1e-6)))  # variability of the dB level
    # Peak volume: dB of the maximum absolute sample amplitude
    vol_max = np.max(np.abs(y)) if y.size > 0 else 0.0
    vol_max_db = 20 * math.log10(vol_max + 1e-6) if vol_max > 0 else -80.0
    # Pitch variation (in semitones) using pyworld: Harvest extracts the raw F0
    # track, StoneMask refines it
    y64 = y.astype(np.float64)
    _f0, t = pyworld.harvest(y64, sr, f0_floor=80.0, f0_ceil=400.0, frame_period=1000 * 256 / sr)
    f0 = pyworld.stonemask(y64, _f0, t, sr)
    voiced_f0 = f0[f0 > 0]
    pitch_variation = 0.0
    if voiced_f0.size > 0:
        # Trim the outer 5% on each side to discard octave errors and outliers
        voiced_f0 = voiced_f0[
            (voiced_f0 > np.percentile(voiced_f0, 5)) &
            (voiced_f0 < np.percentile(voiced_f0, 95))
        ]
    if voiced_f0.size > 0:
        median_f0 = max(np.median(voiced_f0), 1e-6)
        semitone_diffs = 12 * np.log2(voiced_f0 / median_f0)
        pitch_variation = float(np.std(semitone_diffs))
# Pause analysis
total_speaking_time = 0.0
long_pause_count = 0
if segments:
for seg in segments:
total_speaking_time += (seg["end"] - seg["start"])
for i in range(len(segments) - 1):
pause_dur = segments[i+1]["start"] - segments[i]["end"]
if pause_dur > 1.0:
long_pause_count += 1
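        # Leading silence before the first segment and trailing silence after
        # the last one also count as long pauses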
first_start = segments[0]["start"]
last_end = segments[-1]["end"]
if first_start > 1.0:
long_pause_count += 1
if duration - last_end > 1.0:
long_pause_count += 1
# WPM
words = transcript.split()
word_count = len(words)
words_per_min = (word_count / duration) * 60.0 if duration > 0 else 0.0
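    # Valence (emotional tone) scores fed into the VERS calculation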
valence_scores = get_valence_score(file_path)
# Calculate VERS
vers_result = calc_vers(
filler_count=filler_count,
long_pause_count=long_pause_count,
pitch_variation=pitch_variation,
mean_volume_db=mean_volume_db,
vol_max_db=vol_max_db,
wpm=words_per_min,
volume_std=volume_std,
valence_scores=valence_scores
)
return vers_result
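

# Minimal usage sketch (illustrative only): it assumes the model comes from the
# `openai-whisper` package and "sample.wav" is a placeholder path. Because this
# module uses relative imports, run it as part of its package
# (e.g. with `python -m ...`) rather than as a standalone script.
if __name__ == "__main__":
    import whisper  # assumed dependency for loading the transcription model

    model = whisper.load_model("base")
    scores = compute_vers_score("sample.wav", model)
    print(scores)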