Spaces:

cruvss
/

Fast_api

Sleeping

App Files Files Community

Fast_api / vps /vps.py

mulasagg

Add application file

8ad2ab3 about 2 months ago

raw

history blame contribute delete

6.03 kB

	from typing import List, Dict
	import librosa
	import numpy as np
	import spacy
	import math
	from .filler_analyzer import detect_fillers

	def calc_srs(wpm: float, filler_count: int, long_pause_count: int, pitch_variation: float) -> float:
	    """
	    Speech Rate Stability (SRS): blend a pace score with a stability score.

	    The pace score starts at 100 and loses 1.67 points per WPM of distance
	    from 150 WPM; the distance is capped at 30, so this component never
	    drops below roughly 49.9. The stability score averages three penalties,
	    each capped at 1.0 (fillers / 10, long pauses / 5, pitch variation / 3.0),
	    and converts the remainder to a 0-100 scale.

	    Args:
	        wpm: Measured words per minute.
	        filler_count: Number of filler words.
	        long_pause_count: Number of long pauses.
	        pitch_variation: Pitch variation value; 3.0 or more is the full penalty.

	    Returns:
	        0.45 * pace score + 0.55 * stability score, clamped to [0, 100].
	    """
	    target_wpm = 150
	    capped_gap = min(30, abs(wpm - target_wpm))
	    pace_score = max(0, 100 - (capped_gap * 1.67))

	    # Each penalty saturates at 1.0 once its count/variation reaches the cap.
	    fillers, pauses, pitch = (
	        min(filler_count / 10, 1.0),
	        min(long_pause_count / 5, 1.0),
	        min(pitch_variation / 3.0, 1.0),
	    )
	    stability_score = (1 - ((fillers + pauses + pitch) / 3)) * 100

	    blended = (0.45 * pace_score) + (0.55 * stability_score)
	    return min(100, max(0, blended))

	def calculate_pas(transcript: str, segments: List[Dict], filler_count: int, duration: float) -> Dict[str, float]:
	    """
	    Calculate the Pause Appropriateness Score (PAS) and its components.

	    Components:
	    - AFW: 100 minus 1000 * (filler_count / word count), clamped to [0, 100],
	      so a filler rate of 10% or more scores 0.
	    - NPP: percentage of pauses (silences longer than 0.5) that are "natural".
	      A gap between two segments is natural when the earlier segment's text
	      ends in one of ". ! ? ,"; trailing silence after the last segment is
	      natural when that segment ends in one of ". ! ?". Silence before the
	      first segment counts as a pause but is never counted as natural.
	      With no pauses at all, NPP is 100.
	    - PAS = 0.4 * NPP + 0.6 * AFW.

	    Args:
	        transcript: Full transcribed text; words are counted by whitespace split.
	        segments: Dicts providing "text", "start" and "end" entries.
	        filler_count: Number of filler words in the transcript.
	        duration: Total audio duration, on the same time scale as the
	            segment "start"/"end" values.

	    Returns:
	        Dict with the keys "NPP", "AFW" and "PAS".

	    Raises:
	        ValueError: If transcript or segments is empty, duration is not
	            positive, or the transcript contains no words.
	    """
	    if not transcript or not segments or duration <= 0:
	        raise ValueError("Transcript, segments, and duration must be valid")

	    # No language model is needed here: the score only depends on word
	    # counts and segment timing, so nothing is loaded or parsed per call.
	    total_words = len(transcript.split())
	    if total_words == 0:
	        raise ValueError("No words found in transcript")

	    # Clamping covers both ends: rate <= 0 gives 100, rate >= 0.10 gives 0.
	    filler_rate = filler_count / total_words
	    afw = max(0.0, min(100.0, 100.0 - (filler_rate * 1000)))

	    pause_threshold = 0.5
	    texts = [seg["text"].strip() for seg in segments]
	    starts = [seg["start"] for seg in segments]
	    ends = [seg["end"] for seg in segments]

	    total_pauses = 0
	    natural_pauses = 0

	    # Gaps between consecutive segments.
	    for text, end, next_start in zip(texts, ends, starts[1:]):
	        if next_start - end > pause_threshold:
	            total_pauses += 1
	            if text.endswith((".", "!", "?", ",")):
	                natural_pauses += 1

	    # Silence before the first segment.
	    if starts[0] > pause_threshold:
	        total_pauses += 1

	    # Silence after the last segment; a comma does not count as natural here.
	    if duration - ends[-1] > pause_threshold:
	        total_pauses += 1
	        if texts[-1].endswith((".", "!", "?")):
	            natural_pauses += 1

	    npp = 100.0 if total_pauses == 0 else (natural_pauses / total_pauses) * 100.0
	    pas = (0.4 * npp) + (0.6 * afw)

	    return {
	        "NPP": npp,
	        "AFW": afw,
	        "PAS": pas
	    }

	def calculate_rcs(y: np.ndarray, sr: int, segments: List[Dict], duration: float) -> Dict[str, float]:
	    """
	    Calculate the Rhythm Consistency Score (RCS) and its components.

	    Components:
	    - STR: derived from the standard deviation of inter-onset intervals
	      (onsets detected by librosa with hop_length=256). The deviation is
	      clamped to [0.1, 0.5] and mapped linearly so 0.1 gives 100 and 0.5
	      gives 0. With fewer than two onsets, STR is 100.
	    - STW: percentage of transitions considered smooth. A transition between
	      consecutive segments is smooth when the gap is at most 0.3; every
	      word-to-word transition inside a segment is counted as smooth.
	    - RCS = 0.5 * STR + 0.5 * STW.

	    Args:
	        y: Audio signal.
	        sr: Sampling rate.
	        segments: Dicts providing "text", "start" and "end" entries.
	        duration: Audio duration; only validated, not otherwise used.

	    Returns:
	        Dict with the keys "STR", "STW" and "RCS".

	    Raises:
	        ValueError: If the signal is empty, sr or duration is not positive,
	            or segments is empty.
	    """
	    if y.size == 0 or sr <= 0 or duration <= 0 or not segments:
	        raise ValueError("Audio signal, sampling rate, duration, and segments must be valid")

	    hop = 256
	    envelope = librosa.onset.onset_strength(y=y, sr=sr, hop_length=hop)
	    onset_times = librosa.onset.onset_detect(onset_envelope=envelope, sr=sr, units='time', hop_length=hop)

	    # Default applies when there are too few onsets to form an interval.
	    str_score = 100.0
	    if len(onset_times) > 1:
	        spread = np.std(np.diff(onset_times))
	        spread = min(max(spread, 0.1), 0.5)
	        raw_score = 100.0 * (0.5 - spread) / (0.5 - 0.1)
	        str_score = max(0.0, min(100.0, raw_score))

	    max_smooth_gap = 0.3
	    segment_gaps = [
	        following["start"] - current["end"]
	        for current, following in zip(segments, segments[1:])
	    ]
	    # Word-to-word transitions inside a segment: one fewer than its word count.
	    within_segment = sum(
	        max(len(seg["text"].strip().split()) - 1, 0) for seg in segments
	    )

	    smooth_transitions = sum(1 for gap in segment_gaps if gap <= max_smooth_gap) + within_segment
	    total_transitions = len(segment_gaps) + within_segment

	    if total_transitions == 0:
	        stw = 100.0
	    else:
	        stw = (smooth_transitions / total_transitions) * 100.0
	    rcs = (0.5 * str_score) + (0.5 * stw)

	    return {
	        "STR": str_score,
	        "STW": stw,
	        "RCS": rcs
	    }

	def calculate_vps(
	    transcript: str,
	    segments: List[Dict],
	    filler_count: int,
	    duration: float,
	    wpm: float,
	    long_pause_count: int,
	    pitch_variation: float,
	    y: np.ndarray,
	    sr: int
	) -> Dict[str, float]:
	    """
	    Calculate the Voice Pacing Score (VPS) together with its sub-scores.

	    The overall score is a weighted blend, clamped to [0, 100]:
	        VPS = (0.5 * SRS) + (0.3 * PAS) + (0.2 * RCS)
	    where SRS comes from calc_srs, PAS from calculate_pas and RCS from
	    calculate_rcs.

	    Args:
	        transcript (str): Transcribed text.
	        segments (List[Dict]): Segments with 'start', 'end', 'text'.
	        filler_count (int): Number of filler words.
	        duration (float): Audio duration (seconds).
	        wpm (float): Words per minute.
	        long_pause_count (int): Number of long pauses.
	        pitch_variation (float): Pitch variation value passed to calc_srs.
	        y (np.ndarray): Audio signal.
	        sr (int): Sampling rate.

	    Returns:
	        Dict[str, float]: The keys "SRS", "PAS", "NPP", "AFW", "RCS",
	        "STR", "STW" and "VPS".

	    Raises:
	        ValueError: If transcript or segments is empty, duration or sr is
	            not positive, or the audio signal is empty.
	    """
	    # Reject unusable inputs before any scoring work is done.
	    missing_text = not transcript or not segments
	    if missing_text or duration <= 0 or y.size == 0 or sr <= 0:
	        raise ValueError("Invalid inputs")

	    # Speech-rate stability.
	    srs = calc_srs(wpm, filler_count, long_pause_count, pitch_variation)

	    # Pause appropriateness and its intermediates.
	    pause_scores = calculate_pas(transcript, segments, filler_count, duration)
	    pas, npp, afw = (pause_scores[key] for key in ("PAS", "NPP", "AFW"))

	    # Rhythm consistency and its intermediates.
	    rhythm_scores = calculate_rcs(y, sr, segments, duration)
	    rcs, str_score, stw = (rhythm_scores[key] for key in ("RCS", "STR", "STW"))

	    weighted = (0.5 * srs) + (0.3 * pas) + (0.2 * rcs)

	    return {
	        "SRS": srs,
	        "PAS": pas,
	        "NPP": npp,
	        "AFW": afw,
	        "RCS": rcs,
	        "STR": str_score,
	        "STW": stw,
	        "VPS": max(0.0, min(100.0, weighted))
	    }