Spaces:
Running
Running
/**
 * Keet - Audio Segment Processor
 * Ported from legacy UI project/AudioSegmentProcessor.js
 *
 * Sophisticated VAD-based segment processor with:
 * - Speech onset detection with lookback
 * - Rising energy trend analysis
 * - Adaptive noise floor tracking
 * - SNR-based speech detection
 * - Proactive segment splitting for long utterances
 */
| import { defaultAudioParams, windowDuration as DEFAULT_WINDOW_DURATION } from './audioParams'; | |
/** Per-chunk metadata retained in `recentChunks` for speech-onset lookback. */
interface ChunkInfo {
  /** Chunk timestamp, in seconds. */
  time: number;
  /** Caller-supplied energy value for the chunk. */
  energy: number;
  /** True when `energy` exceeded the configured energy threshold. */
  isSpeech: boolean;
  /** Signal-to-noise ratio in dB relative to the adaptive noise floor. */
  snr: number;
}
/** Summary of one completed speech/silence span, kept for rolling statistics. */
interface SegmentStats {
  /** Span start, seconds. */
  startTime: number;
  /** Span end, seconds. */
  endTime: number;
  /** `endTime - startTime`, seconds. */
  duration: number;
  /** Mean of the per-chunk energies accumulated over the span. */
  avgEnergy: number;
  /** `avgEnergy * duration` — a rough total-energy proxy for the span. */
  energyIntegral: number;
}
/** Averages computed over the retained `SegmentStats` history. */
interface StatsSummary {
  /** Mean span duration, seconds. */
  avgDuration: number;
  /** Mean of span average energies. */
  avgEnergy: number;
  /** Mean of span energy integrals. */
  avgEnergyIntegral: number;
}
/** Snapshot of detector statistics, refreshed after every processed chunk. */
interface CurrentStats {
  // NOTE(review): in this file `silenceStats` is never populated, so these
  // silence averages always remain zero — confirm whether that is intended.
  silence: StatsSummary;
  /** Averages over recently completed speech segments. */
  speech: StatsSummary;
  /** Current adaptive noise-floor estimate. */
  noiseFloor: number;
  /** SNR (dB) of the most recently processed chunk, 0 when no chunks yet. */
  snr: number;
  /** Mirror of the configured SNR threshold. */
  snrThreshold: number;
  /** Mirror of the configured minimum SNR threshold. */
  minSnrThreshold: number;
  /** Mirror of the configured energy-rise threshold. */
  energyRiseThreshold: number;
}
/** Mutable state of the VAD state machine (see `reset()` for initial values). */
interface ProcessorState {
  /** True while inside a speech segment. */
  inSpeech: boolean;
  /** Start time of the current speech segment (seconds), null when in silence. */
  speechStartTime: number | null;
  /** Start time of the current silence span (seconds), null while in speech. */
  silenceStartTime: number | null;
  /** Consecutive sub-threshold chunks seen while still in the speech state. */
  silenceCounter: number;
  /** Ring of recent chunk metadata used for onset lookback. */
  recentChunks: ChunkInfo[];
  /** Energies accumulated during the current speech segment. */
  speechEnergies: number[];
  /** Energies accumulated during the current silence span. */
  silenceEnergies: number[];
  /** History of completed speech segments (bounded by maxHistoryLength). */
  speechStats: SegmentStats[];
  // NOTE(review): silenceStats is declared and read by updateStats() but never
  // appended to anywhere in this file — its averages stay zero.
  silenceStats: SegmentStats[];
  /** Latest statistics snapshot, rebuilt by updateStats(). */
  currentStats: CurrentStats;
  // NOTE(review): segmentCounter is initialized to 0 in reset() and never
  // incremented in this file — apparently vestigial.
  segmentCounter: number;
  /** Adaptive noise-floor estimate (exponential moving average of silence energy). */
  noiseFloor: number;
  /** Last ~50 chunk energies, retained for analysis. */
  recentEnergies: number[];
  /** Continuous silence duration (seconds), drives noise-floor adaptation blending. */
  silenceDuration: number;
}
/** A detected speech segment — time boundaries only, no audio data. */
export interface ProcessedSegment {
  /** Segment start, seconds. */
  startTime: number;
  /** Segment end, seconds. */
  endTime: number;
  /** `endTime - startTime`, seconds (always > 0). */
  duration: number;
}
/** Processor configuration. Defaults come from `defaultAudioParams`. */
export interface AudioSegmentProcessorConfig {
  /** Audio sample rate in Hz (default 16000). */
  sampleRate: number;
  /** Analysis window size in samples (~80 ms at the sample rate). */
  windowSize: number;
  // NOTE(review): minSpeechDuration is settable but never read by the
  // detection logic in this file — segments are not length-filtered here.
  minSpeechDuration: number;
  /** Silence duration (seconds) required to confirm end of speech. */
  silenceThreshold: number;
  /** Energy level above which a chunk counts as speech. */
  energyThreshold: number;
  // NOTE(review): smaLength, lookbackChunks, overlapDuration and
  // lookbackDuration are stored/logged but not read by the detection logic
  // in this file (findSpeechStart uses hard-coded lookback limits).
  smaLength: number;
  lookbackChunks: number;
  overlapDuration: number;
  lookbackDuration: number;
  /** Bound on speechStats length; recentChunks is bounded by 10x this value. */
  maxHistoryLength: number;
  /** Steady-state EMA rate for noise-floor tracking. */
  noiseFloorAdaptationRate: number;
  /** Faster EMA rate used just after speech ends, blended toward the normal rate. */
  fastAdaptationRate: number;
  /** Target SNR threshold (dB); exposed in stats. */
  snrThreshold: number;
  /** Silence duration (seconds) after which adaptation reaches the steady rate. */
  minBackgroundDuration: number;
  /** Minimum SNR (dB) used by onset lookback to bound the search. */
  minSnrThreshold: number;
  /** Relative energy increase treated as a rising trend during onset lookback. */
  energyRiseThreshold: number;
  /** Maximum segment length (seconds) before a proactive split. */
  maxSegmentDuration: number;
  /** Pauses shorter than this (seconds) are kept inside the speech segment. */
  maxSilenceWithinSpeech: number;
  // NOTE(review): endingSpeechTolerance is settable but never read by the
  // detection logic in this file.
  endingSpeechTolerance: number;
  /** Optional log sink; defaults to console.log. */
  logger?: (message: string, data?: unknown) => void;
}
| /** | |
| * AudioSegmentProcessor - Sophisticated VAD with speech onset detection | |
| */ | |
| export class AudioSegmentProcessor { | |
| private options: AudioSegmentProcessorConfig; | |
| private state!: ProcessorState; | |
| constructor(options: Partial<AudioSegmentProcessorConfig> = {}) { | |
| const sampleRate = options.sampleRate ?? defaultAudioParams.sampleRate ?? 16000; | |
| // Calculate window size based on sample rate (80ms window) | |
| const windowSize = Math.round(DEFAULT_WINDOW_DURATION * sampleRate); | |
| this.options = { | |
| sampleRate, | |
| minSpeechDuration: defaultAudioParams.minSpeechDuration, | |
| silenceThreshold: defaultAudioParams.silenceLength, | |
| energyThreshold: defaultAudioParams.audioThreshold, | |
| smaLength: defaultAudioParams.smaLength, | |
| lookbackChunks: defaultAudioParams.lookbackChunks, | |
| overlapDuration: defaultAudioParams.overlapDuration, | |
| lookbackDuration: defaultAudioParams.lookbackDuration, | |
| maxHistoryLength: defaultAudioParams.maxHistoryLength, | |
| noiseFloorAdaptationRate: defaultAudioParams.noiseFloorAdaptationRate, | |
| fastAdaptationRate: defaultAudioParams.fastAdaptationRate, | |
| snrThreshold: defaultAudioParams.snrThreshold, | |
| minBackgroundDuration: defaultAudioParams.minBackgroundDuration, | |
| minSnrThreshold: defaultAudioParams.minSnrThreshold, | |
| energyRiseThreshold: defaultAudioParams.energyRiseThreshold, | |
| maxSegmentDuration: defaultAudioParams.maxSegmentDuration, | |
| maxSilenceWithinSpeech: defaultAudioParams.maxSilenceWithinSpeech, | |
| endingSpeechTolerance: defaultAudioParams.endingSpeechTolerance, | |
| logger: console.log, | |
| ...options, | |
| // Ensure windowSize is recalculated if sampleRate was overridden | |
| windowSize: Math.round(DEFAULT_WINDOW_DURATION * (options.sampleRate ?? sampleRate)) | |
| }; | |
| this.log('Initialized AudioSegmentProcessor', { | |
| sampleRate: this.options.sampleRate, | |
| windowSize: this.options.windowSize, | |
| lookbackDuration: this.options.lookbackDuration, | |
| overlapDuration: this.options.overlapDuration, | |
| snrThreshold: this.options.snrThreshold, | |
| minSnrThreshold: this.options.minSnrThreshold | |
| }); | |
| this.reset(); | |
| } | |
| private log(message: string, data?: unknown): void { | |
| if (typeof this.options.logger === 'function') { | |
| this.options.logger(`[AudioSegmentProcessor] ${message}`, data); | |
| } | |
| } | |
| /** | |
| * Process an audio chunk and return any detected segments. | |
| */ | |
| processAudioData( | |
| chunk: Float32Array, | |
| currentTime: number, | |
| energy: number | |
| ): ProcessedSegment[] { | |
| if (!chunk || !chunk.length) return []; | |
| const segments: ProcessedSegment[] = []; | |
| const isSpeech = energy > this.options.energyThreshold; | |
| // Update silence duration tracking | |
| if (!isSpeech) { | |
| const chunkDurationSec = chunk.length / this.options.sampleRate; | |
| this.state.silenceDuration += chunkDurationSec; | |
| } else { | |
| this.state.silenceDuration = 0; | |
| } | |
| // Update noise floor and calculate SNR | |
| this.updateNoiseFloor(energy, isSpeech); | |
| const snr = this.calculateSNR(energy); | |
| // Track recent chunks for lookback | |
| this.state.recentChunks.push({ | |
| time: currentTime, | |
| energy, | |
| isSpeech, | |
| snr | |
| }); | |
| if (this.state.recentChunks.length > this.options.maxHistoryLength * 10) { | |
| this.state.recentChunks.shift(); | |
| } | |
| // --- Proactive Segment Splitting --- | |
| if (this.state.inSpeech && this.state.speechStartTime !== null) { | |
| const currentSpeechDuration = currentTime - this.state.speechStartTime; | |
| if (currentSpeechDuration > this.options.maxSegmentDuration) { | |
| this.log('Splitting long segment', { | |
| startTime: this.state.speechStartTime.toFixed(2), | |
| splitTime: currentTime.toFixed(2), | |
| duration: currentSpeechDuration.toFixed(2) | |
| }); | |
| const segment = this.createSegment(this.state.speechStartTime, currentTime); | |
| if (segment) { | |
| segments.push(segment); | |
| } | |
| // Start new segment immediately | |
| this.startSpeech(currentTime, energy); | |
| } | |
| } | |
| // --- Speech State Machine --- | |
| if (!this.state.inSpeech && isSpeech) { | |
| // Transition: Silence -> Speech | |
| const realStartIndex = this.findSpeechStart(); | |
| const realStartTime = realStartIndex !== -1 | |
| ? this.state.recentChunks[realStartIndex].time | |
| : currentTime; | |
| this.startSpeech(realStartTime, energy); | |
| this.log('Speech start detected', { | |
| detectedAt: currentTime.toFixed(2), | |
| actualStart: realStartTime.toFixed(2), | |
| lookbackDiff: (currentTime - realStartTime).toFixed(2), | |
| snr: snr.toFixed(2), | |
| noiseFloor: this.state.noiseFloor.toFixed(6) | |
| }); | |
| } else if (this.state.inSpeech && !isSpeech) { | |
| // Transition: Speech -> potentially Silence | |
| this.state.silenceCounter++; | |
| const chunksNeeded = Math.ceil(this.options.silenceThreshold / (this.options.windowSize / this.options.sampleRate)); | |
| if (this.state.silenceCounter % 5 === 0) { | |
| this.log('Silence progressing', { | |
| counter: this.state.silenceCounter, | |
| needed: chunksNeeded, | |
| energy: energy.toFixed(6), | |
| snr: snr.toFixed(2) | |
| }); | |
| } | |
| // Implement ending speech tolerance and max silence within speech | |
| const silenceDuration = this.state.silenceCounter * (this.options.windowSize / this.options.sampleRate); | |
| const isConfirmedSilence = this.state.silenceCounter >= chunksNeeded; | |
| // Check if we should allow some silence within speech | |
| if (silenceDuration < this.options.maxSilenceWithinSpeech) { | |
| // Not yet enough silence to consider it a break | |
| this.state.speechEnergies.push(energy); | |
| } else if (isConfirmedSilence) { | |
| // Confirmed silence - end speech segment | |
| if (this.state.speechStartTime !== null) { | |
| const speechDuration = currentTime - this.state.speechStartTime; | |
| const avgEnergy = this.state.speechEnergies.length > 0 | |
| ? this.state.speechEnergies.reduce((a, b) => a + b, 0) / this.state.speechEnergies.length | |
| : 0; | |
| this.state.speechStats.push({ | |
| startTime: this.state.speechStartTime, | |
| endTime: currentTime, | |
| duration: speechDuration, | |
| avgEnergy, | |
| energyIntegral: avgEnergy * speechDuration | |
| }); | |
| if (this.state.speechStats.length > this.options.maxHistoryLength) { | |
| this.state.speechStats.shift(); | |
| } | |
| } | |
| const segment = this.createSegment(this.state.speechStartTime!, currentTime); | |
| if (segment) { | |
| segments.push(segment); | |
| } | |
| this.startSilence(currentTime); | |
| } else { | |
| // Accumulate silence energies while deciding | |
| this.state.silenceEnergies.push(energy); | |
| } | |
| } else { | |
| // Continue in current state | |
| if (this.state.inSpeech) { | |
| this.state.speechEnergies.push(energy); | |
| } else { | |
| this.state.silenceEnergies.push(energy); | |
| } | |
| } | |
| this.updateStats(); | |
| return segments; | |
| } | |
| /** | |
| * Update noise floor using adaptive exponential moving average. | |
| */ | |
| private updateNoiseFloor(energy: number, isSpeech: boolean): void { | |
| if (!isSpeech) { | |
| // Blend between fast and normal adaptation rates based on silence duration | |
| let adaptationRate = this.options.noiseFloorAdaptationRate; | |
| if (this.state.silenceDuration < this.options.minBackgroundDuration) { | |
| const blendFactor = Math.min(1, this.state.silenceDuration / this.options.minBackgroundDuration); | |
| adaptationRate = this.options.fastAdaptationRate * (1 - blendFactor) + | |
| this.options.noiseFloorAdaptationRate * blendFactor; | |
| } | |
| // Exponential moving average for noise floor tracking | |
| this.state.noiseFloor = this.state.noiseFloor * (1 - adaptationRate) + energy * adaptationRate; | |
| this.state.noiseFloor = Math.max(0.00001, this.state.noiseFloor); | |
| } | |
| // Track recent energies for analysis | |
| this.state.recentEnergies.push(energy); | |
| if (this.state.recentEnergies.length > 50) { | |
| this.state.recentEnergies.shift(); | |
| } | |
| } | |
| /** | |
| * Calculate Signal-to-Noise Ratio in dB. | |
| */ | |
| private calculateSNR(energy: number): number { | |
| const noiseFloor = Math.max(0.0001, this.state.noiseFloor); | |
| return 10 * Math.log10(energy / noiseFloor); | |
| } | |
| /** | |
| * Start tracking a new speech segment. | |
| */ | |
| private startSpeech(time: number, energy: number): void { | |
| this.state.inSpeech = true; | |
| this.state.speechStartTime = time; | |
| this.state.silenceCounter = 0; | |
| this.state.speechEnergies = [energy]; | |
| this.state.silenceStartTime = null; | |
| this.state.silenceDuration = 0; | |
| const snr = this.calculateSNR(energy); | |
| this.log('Speech state started', { | |
| time: time.toFixed(2), | |
| energy: energy.toFixed(6), | |
| snr: snr.toFixed(2), | |
| noiseFloor: this.state.noiseFloor.toFixed(6) | |
| }); | |
| } | |
| /** | |
| * Transition to silence state. | |
| */ | |
| private startSilence(time: number): void { | |
| this.state.inSpeech = false; | |
| this.state.silenceStartTime = time; | |
| this.state.speechStartTime = null; | |
| this.state.silenceCounter = 0; | |
| this.state.silenceEnergies = []; | |
| this.state.silenceDuration = 0.001; // Avoid division by zero | |
| this.log('Silence state started', { | |
| time: time.toFixed(2), | |
| noiseFloor: this.state.noiseFloor.toFixed(6) | |
| }); | |
| } | |
| /** | |
| * Find the actual speech start using lookback and energy trend analysis. | |
| */ | |
| private findSpeechStart(): number { | |
| const chunks = this.state.recentChunks; | |
| const minSnrThreshold = this.options.minSnrThreshold; | |
| // Find the most recent speech chunk | |
| let firstSpeechIndex = 0; | |
| for (let i = chunks.length - 1; i >= 0; i--) { | |
| if (chunks[i].isSpeech) { | |
| firstSpeechIndex = i; | |
| break; | |
| } | |
| } | |
| // Look for the earliest point where energy starts rising towards speech | |
| let earliestRisingIndex = firstSpeechIndex; | |
| let foundRisingTrend = false; | |
| for (let i = firstSpeechIndex - 1; i >= 0; i--) { | |
| // Check for rising energy trend | |
| if (i < chunks.length - 1 && | |
| chunks[i + 1].energy > chunks[i].energy * (1 + this.options.energyRiseThreshold)) { | |
| earliestRisingIndex = i; | |
| foundRisingTrend = true; | |
| } | |
| // Stop if SNR drops significantly below threshold | |
| if (chunks[i].snr < minSnrThreshold / 2) { | |
| break; | |
| } | |
| // Limit lookback to ~500ms (assuming 80ms chunks) | |
| if (firstSpeechIndex - i > 6) { | |
| break; | |
| } | |
| } | |
| if (foundRisingTrend) { | |
| this.log('Found rising energy trend for speech onset', { | |
| index: earliestRisingIndex, | |
| time: chunks[earliestRisingIndex].time.toFixed(3), | |
| energy: chunks[earliestRisingIndex].energy.toFixed(6), | |
| snr: chunks[earliestRisingIndex].snr.toFixed(2) | |
| }); | |
| return earliestRisingIndex; | |
| } | |
| // Check for SNR crossing | |
| for (let i = firstSpeechIndex; i >= 0; i--) { | |
| if (chunks[i].snr < minSnrThreshold) { | |
| return Math.min(chunks.length - 1, i + 1); | |
| } | |
| } | |
| // Default lookback | |
| return Math.max(0, firstSpeechIndex - 4); | |
| } | |
| /** | |
| * Create a segment object from start/end times. | |
| */ | |
| private createSegment(startTime: number, endTime: number): ProcessedSegment | null { | |
| const duration = endTime - startTime; | |
| if (duration <= 0) { | |
| this.log('Skipping segment with zero/negative duration'); | |
| return null; | |
| } | |
| return { | |
| startTime, | |
| endTime, | |
| duration | |
| }; | |
| } | |
| /** | |
| * Update internal statistics. | |
| */ | |
| private updateStats(): void { | |
| const stats: CurrentStats = { | |
| silence: { avgDuration: 0, avgEnergy: 0, avgEnergyIntegral: 0 }, | |
| speech: { avgDuration: 0, avgEnergy: 0, avgEnergyIntegral: 0 }, | |
| noiseFloor: this.state.noiseFloor, | |
| snr: this.state.recentChunks.length > 0 | |
| ? this.state.recentChunks[this.state.recentChunks.length - 1].snr | |
| : 0, | |
| snrThreshold: this.options.snrThreshold, | |
| minSnrThreshold: this.options.minSnrThreshold, | |
| energyRiseThreshold: this.options.energyRiseThreshold | |
| }; | |
| if (this.state.silenceStats.length > 0) { | |
| stats.silence = { | |
| avgDuration: this.average(this.state.silenceStats.map(s => s.duration)), | |
| avgEnergy: this.average(this.state.silenceStats.map(s => s.avgEnergy)), | |
| avgEnergyIntegral: this.average(this.state.silenceStats.map(s => s.energyIntegral)) | |
| }; | |
| } | |
| if (this.state.speechStats.length > 0) { | |
| stats.speech = { | |
| avgDuration: this.average(this.state.speechStats.map(s => s.duration)), | |
| avgEnergy: this.average(this.state.speechStats.map(s => s.avgEnergy)), | |
| avgEnergyIntegral: this.average(this.state.speechStats.map(s => s.energyIntegral)) | |
| }; | |
| } | |
| this.state.currentStats = stats; | |
| } | |
| private average(arr: number[]): number { | |
| if (arr.length === 0) return 0; | |
| return arr.reduce((a, b) => a + b, 0) / arr.length; | |
| } | |
| /** | |
| * Get current statistics. | |
| */ | |
| getStats(): CurrentStats { | |
| return this.state.currentStats; | |
| } | |
| /** | |
| * Get current state info for debugging. | |
| */ | |
| getStateInfo(): { inSpeech: boolean; noiseFloor: number; snr: number; speechStartTime: number | null } { | |
| return { | |
| inSpeech: this.state.inSpeech, | |
| noiseFloor: this.state.noiseFloor, | |
| snr: this.state.currentStats.snr, | |
| speechStartTime: this.state.speechStartTime | |
| }; | |
| } | |
| /** | |
| * Reset all state. | |
| */ | |
| reset(): void { | |
| this.state = { | |
| inSpeech: false, | |
| speechStartTime: null, | |
| silenceStartTime: null, | |
| silenceCounter: 0, | |
| recentChunks: [], | |
| speechEnergies: [], | |
| silenceEnergies: [], | |
| speechStats: [], | |
| silenceStats: [], | |
| currentStats: { | |
| silence: { avgDuration: 0, avgEnergy: 0, avgEnergyIntegral: 0 }, | |
| speech: { avgDuration: 0, avgEnergy: 0, avgEnergyIntegral: 0 }, | |
| noiseFloor: 0.005, | |
| snr: 0, | |
| snrThreshold: this.options.snrThreshold, | |
| minSnrThreshold: this.options.minSnrThreshold, | |
| energyRiseThreshold: this.options.energyRiseThreshold | |
| }, | |
| segmentCounter: 0, | |
| noiseFloor: 0.005, | |
| recentEnergies: [], | |
| silenceDuration: 0 | |
| }; | |
| } | |
| // ======================================================================== | |
| // Configuration Setters | |
| // ======================================================================== | |
| setThreshold(threshold: number): void { | |
| this.options.energyThreshold = threshold; | |
| this.log('Updated energy threshold', threshold); | |
| } | |
| setSilenceLength(length: number): void { | |
| this.options.silenceThreshold = length; | |
| this.log('Updated silence threshold', length); | |
| } | |
| setLookbackDuration(duration: number): void { | |
| this.options.lookbackDuration = duration; | |
| this.log('Updated lookback duration', duration); | |
| } | |
| setOverlapDuration(duration: number): void { | |
| this.options.overlapDuration = duration; | |
| this.log('Updated overlap duration', duration); | |
| } | |
| setSnrThreshold(threshold: number): void { | |
| this.options.snrThreshold = threshold; | |
| this.log('Updated SNR threshold', threshold); | |
| } | |
| setMinSnrThreshold(threshold: number): void { | |
| this.options.minSnrThreshold = threshold; | |
| this.log('Updated minimum SNR threshold', threshold); | |
| } | |
| setNoiseFloorAdaptationRate(rate: number): void { | |
| this.options.noiseFloorAdaptationRate = rate; | |
| this.log('Updated noise floor adaptation rate', rate); | |
| } | |
| setFastAdaptationRate(rate: number): void { | |
| this.options.fastAdaptationRate = rate; | |
| this.log('Updated fast adaptation rate', rate); | |
| } | |
| setEnergyRiseThreshold(threshold: number): void { | |
| this.options.energyRiseThreshold = threshold; | |
| this.log('Updated energy rise threshold', threshold); | |
| } | |
| setMinBackgroundDuration(duration: number): void { | |
| this.options.minBackgroundDuration = duration; | |
| this.log('Updated minimum background duration', duration); | |
| } | |
| setMaxSegmentDuration(duration: number): void { | |
| this.options.maxSegmentDuration = duration; | |
| this.log('Updated maximum segment duration', duration); | |
| } | |
| setMinSpeechDuration(duration: number): void { | |
| this.options.minSpeechDuration = duration; | |
| this.log('Updated minimum speech duration', duration); | |
| } | |
| setMaxSilenceWithinSpeech(duration: number): void { | |
| this.options.maxSilenceWithinSpeech = duration; | |
| this.log('Updated max silence within speech', duration); | |
| } | |
| setEndingSpeechTolerance(duration: number): void { | |
| this.options.endingSpeechTolerance = duration; | |
| this.log('Updated ending speech tolerance', duration); | |
| } | |
| } | |