Spaces:
Running
Running
/**
 * Keet - Audio Segment Processor
 * Ported from legacy UI project/AudioSegmentProcessor.js
 *
 * Sophisticated VAD-based segment processor with:
 * - Speech onset detection with lookback
 * - Rising energy trend analysis
 * - Adaptive noise floor tracking
 * - SNR-based speech detection
 * - Proactive segment splitting for long utterances
 */
| import { defaultAudioParams, windowDuration as DEFAULT_WINDOW_DURATION } from './audioParams'; | |
/** Per-chunk metadata retained in `recentChunks` for speech-onset lookback. */
interface ChunkInfo {
  /** Chunk timestamp, in seconds. */
  time: number;
  /** Caller-supplied energy value for the chunk. */
  energy: number;
  /** True when `energy` exceeded the configured energy threshold. */
  isSpeech: boolean;
  /** Signal-to-noise ratio in dB relative to the adaptive noise floor. */
  snr: number;
}
/** Summary of one completed speech/silence span, kept for rolling statistics. */
interface SegmentStats {
  /** Span start, seconds. */
  startTime: number;
  /** Span end, seconds. */
  endTime: number;
  /** `endTime - startTime`, seconds. */
  duration: number;
  /** Mean of the per-chunk energies accumulated over the span. */
  avgEnergy: number;
  /** `avgEnergy * duration` — a rough total-energy proxy for the span. */
  energyIntegral: number;
}
/** Averages computed over the retained `SegmentStats` history. */
interface StatsSummary {
  /** Mean span duration, seconds. */
  avgDuration: number;
  /** Mean of span average energies. */
  avgEnergy: number;
  /** Mean of span energy integrals. */
  avgEnergyIntegral: number;
}
/** Snapshot of detector statistics, refreshed after every processed chunk. */
interface CurrentStats {
  // NOTE(review): in this file `silenceStats` is never populated, so these
  // silence averages always remain zero — confirm whether that is intended.
  silence: StatsSummary;
  /** Averages over recently completed speech segments. */
  speech: StatsSummary;
  /** Current adaptive noise-floor estimate. */
  noiseFloor: number;
  /** SNR (dB) of the most recently processed chunk, 0 when no chunks yet. */
  snr: number;
  /** Mirror of the configured SNR threshold. */
  snrThreshold: number;
  /** Mirror of the configured minimum SNR threshold. */
  minSnrThreshold: number;
  /** Mirror of the configured energy-rise threshold. */
  energyRiseThreshold: number;
}
/** Mutable state of the VAD state machine (see `reset()` for initial values). */
interface ProcessorState {
  /** True while inside a speech segment. */
  inSpeech: boolean;
  /** Start time of the current speech segment (seconds), null when in silence. */
  speechStartTime: number | null;
  /** Start time of the current silence span (seconds), null while in speech. */
  silenceStartTime: number | null;
  /** Consecutive sub-threshold chunks seen while still in the speech state. */
  silenceCounter: number;
  /** Ring of recent chunk metadata used for onset lookback. */
  recentChunks: ChunkInfo[];
  /** Energies accumulated during the current speech segment. */
  speechEnergies: number[];
  /** Energies accumulated during the current silence span. */
  silenceEnergies: number[];
  /** History of completed speech segments (bounded by maxHistoryLength). */
  speechStats: SegmentStats[];
  // NOTE(review): silenceStats is declared and read by updateStats() but never
  // appended to anywhere in this file — its averages stay zero.
  silenceStats: SegmentStats[];
  /** Latest statistics snapshot, rebuilt by updateStats(). */
  currentStats: CurrentStats;
  // NOTE(review): segmentCounter is initialized to 0 in reset() and never
  // incremented in this file — apparently vestigial.
  segmentCounter: number;
  /** Adaptive noise-floor estimate (exponential moving average of silence energy). */
  noiseFloor: number;
  /** Last ~50 chunk energies, retained for analysis. */
  recentEnergies: number[];
  /** Continuous silence duration (seconds), drives noise-floor adaptation blending. */
  silenceDuration: number;
}
/** A detected speech segment — time boundaries only, no audio data. */
export interface ProcessedSegment {
  /** Segment start, seconds. */
  startTime: number;
  /** Segment end, seconds. */
  endTime: number;
  /** `endTime - startTime`, seconds (always > 0). */
  duration: number;
}
/** Processor configuration. Defaults come from `defaultAudioParams`. */
export interface AudioSegmentProcessorConfig {
  /** Audio sample rate in Hz (default 16000). */
  sampleRate: number;
  /** Analysis window size in samples (~80 ms at the sample rate). */
  windowSize: number;
  // NOTE(review): minSpeechDuration is settable but never read by the
  // detection logic in this file — segments are not length-filtered here.
  minSpeechDuration: number;
  /** Silence duration (seconds) required to confirm end of speech. */
  silenceThreshold: number;
  /** Energy level above which a chunk counts as speech. */
  energyThreshold: number;
  // NOTE(review): smaLength, lookbackChunks, overlapDuration and
  // lookbackDuration are stored/logged but not read by the detection logic
  // in this file (findSpeechStart uses hard-coded lookback limits).
  smaLength: number;
  lookbackChunks: number;
  overlapDuration: number;
  lookbackDuration: number;
  /** Bound on speechStats length; recentChunks is bounded by 10x this value. */
  maxHistoryLength: number;
  /** Steady-state EMA rate for noise-floor tracking. */
  noiseFloorAdaptationRate: number;
  /** Faster EMA rate used just after speech ends, blended toward the normal rate. */
  fastAdaptationRate: number;
  /** Target SNR threshold (dB); exposed in stats. */
  snrThreshold: number;
  /** Silence duration (seconds) after which adaptation reaches the steady rate. */
  minBackgroundDuration: number;
  /** Minimum SNR (dB) used by onset lookback to bound the search. */
  minSnrThreshold: number;
  /** Relative energy increase treated as a rising trend during onset lookback. */
  energyRiseThreshold: number;
  /** Maximum segment length (seconds) before a proactive split. */
  maxSegmentDuration: number;
  /** Pauses shorter than this (seconds) are kept inside the speech segment. */
  maxSilenceWithinSpeech: number;
  // NOTE(review): endingSpeechTolerance is settable but never read by the
  // detection logic in this file.
  endingSpeechTolerance: number;
  /** Optional log sink; defaults to console.log. */
  logger?: (message: string, data?: unknown) => void;
}
| /** | |
| * AudioSegmentProcessor - Sophisticated VAD with speech onset detection | |
| */ | |
| export class AudioSegmentProcessor { | |
| private options: AudioSegmentProcessorConfig; | |
| private state!: ProcessorState; | |
| constructor(options: Partial<AudioSegmentProcessorConfig> = {}) { | |
| const sampleRate = options.sampleRate ?? defaultAudioParams.sampleRate ?? 16000; | |
| // Calculate window size based on sample rate (80ms window) | |
| const windowSize = Math.round(DEFAULT_WINDOW_DURATION * sampleRate); | |
| this.options = { | |
| sampleRate, | |
| minSpeechDuration: defaultAudioParams.minSpeechDuration, | |
| silenceThreshold: defaultAudioParams.silenceLength, | |
| energyThreshold: defaultAudioParams.audioThreshold, | |
| smaLength: defaultAudioParams.smaLength, | |
| lookbackChunks: defaultAudioParams.lookbackChunks, | |
| overlapDuration: defaultAudioParams.overlapDuration, | |
| lookbackDuration: defaultAudioParams.lookbackDuration, | |
| maxHistoryLength: defaultAudioParams.maxHistoryLength, | |
| noiseFloorAdaptationRate: defaultAudioParams.noiseFloorAdaptationRate, | |
| fastAdaptationRate: defaultAudioParams.fastAdaptationRate, | |
| snrThreshold: defaultAudioParams.snrThreshold, | |
| minBackgroundDuration: defaultAudioParams.minBackgroundDuration, | |
| minSnrThreshold: defaultAudioParams.minSnrThreshold, | |
| energyRiseThreshold: defaultAudioParams.energyRiseThreshold, | |
| maxSegmentDuration: defaultAudioParams.maxSegmentDuration, | |
| maxSilenceWithinSpeech: defaultAudioParams.maxSilenceWithinSpeech, | |
| endingSpeechTolerance: defaultAudioParams.endingSpeechTolerance, | |
| logger: console.log, | |
| ...options, | |
| // Ensure windowSize is recalculated if sampleRate was overridden | |
| windowSize: Math.round(DEFAULT_WINDOW_DURATION * (options.sampleRate ?? sampleRate)) | |
| }; | |
| this.log('Initialized AudioSegmentProcessor', { | |
| sampleRate: this.options.sampleRate, | |
| windowSize: this.options.windowSize, | |
| lookbackDuration: this.options.lookbackDuration, | |
| overlapDuration: this.options.overlapDuration, | |
| snrThreshold: this.options.snrThreshold, | |
| minSnrThreshold: this.options.minSnrThreshold | |
| }); | |
| this.reset(); | |
| } | |
| private log(message: string, data?: unknown): void { | |
| if (typeof this.options.logger === 'function') { | |
| this.options.logger(`[AudioSegmentProcessor] ${message}`, data); | |
| } | |
| } | |
| /** | |
| * Process an audio chunk and return any detected segments. | |
| */ | |
| processAudioData( | |
| chunk: Float32Array, | |
| currentTime: number, | |
| energy: number | |
| ): ProcessedSegment[] { | |
| if (!chunk || !chunk.length) return []; | |
| const segments: ProcessedSegment[] = []; | |
| const isSpeech = energy > this.options.energyThreshold; | |
| // Update silence duration tracking | |
| if (!isSpeech) { | |
| const chunkDurationSec = chunk.length / this.options.sampleRate; | |
| this.state.silenceDuration += chunkDurationSec; | |
| } else { | |
| this.state.silenceDuration = 0; | |
| } | |
| // Update noise floor and calculate SNR | |
| this.updateNoiseFloor(energy, isSpeech); | |
| const snr = this.calculateSNR(energy); | |
| // Track recent chunks for lookback | |
| this.state.recentChunks.push({ | |
| time: currentTime, | |
| energy, | |
| isSpeech, | |
| snr | |
| }); | |
| if (this.state.recentChunks.length > this.options.maxHistoryLength * 10) { | |
| this.state.recentChunks.shift(); | |
| } | |
| // --- Proactive Segment Splitting --- | |
| if (this.state.inSpeech && this.state.speechStartTime !== null) { | |
| const currentSpeechDuration = currentTime - this.state.speechStartTime; | |
| if (currentSpeechDuration > this.options.maxSegmentDuration) { | |
| this.log('Splitting long segment', { | |
| startTime: this.state.speechStartTime.toFixed(2), | |
| splitTime: currentTime.toFixed(2), | |
| duration: currentSpeechDuration.toFixed(2) | |
| }); | |
| const segment = this.createSegment(this.state.speechStartTime, currentTime); | |
| if (segment) { | |
| segments.push(segment); | |
| } | |
| // Start new segment immediately | |
| this.startSpeech(currentTime, energy); | |
| } | |
| } | |
| // --- Speech State Machine --- | |
| if (!this.state.inSpeech && isSpeech) { | |
| // Transition: Silence -> Speech | |
| const realStartIndex = this.findSpeechStart(); | |
| const realStartTime = realStartIndex !== -1 | |
| ? this.state.recentChunks[realStartIndex].time | |
| : currentTime; | |
| this.startSpeech(realStartTime, energy); | |
| this.log('Speech start detected', { | |
| detectedAt: currentTime.toFixed(2), | |
| actualStart: realStartTime.toFixed(2), | |
| lookbackDiff: (currentTime - realStartTime).toFixed(2), | |
| snr: snr.toFixed(2), | |
| noiseFloor: this.state.noiseFloor.toFixed(6) | |
| }); | |
| } else if (this.state.inSpeech && !isSpeech) { | |
| // Transition: Speech -> potentially Silence | |
| this.state.silenceCounter++; | |
| const chunksNeeded = Math.ceil(this.options.silenceThreshold / (this.options.windowSize / this.options.sampleRate)); | |
| if (this.state.silenceCounter % 5 === 0) { | |
| this.log('Silence progressing', { | |
| counter: this.state.silenceCounter, | |
| needed: chunksNeeded, | |
| energy: energy.toFixed(6), | |
| snr: snr.toFixed(2) | |
| }); | |
| } | |
| // Implement ending speech tolerance and max silence within speech | |
| const silenceDuration = this.state.silenceCounter * (this.options.windowSize / this.options.sampleRate); | |
| const isConfirmedSilence = this.state.silenceCounter >= chunksNeeded; | |
| // Check if we should allow some silence within speech | |
| if (silenceDuration < this.options.maxSilenceWithinSpeech) { | |
| // Not yet enough silence to consider it a break | |
| this.state.speechEnergies.push(energy); | |
| } else if (isConfirmedSilence) { | |
| // Confirmed silence - end speech segment | |
| if (this.state.speechStartTime !== null) { | |
| const speechDuration = currentTime - this.state.speechStartTime; | |
| const avgEnergy = this.state.speechEnergies.length > 0 | |
| ? this.state.speechEnergies.reduce((a, b) => a + b, 0) / this.state.speechEnergies.length | |
| : 0; | |
| this.state.speechStats.push({ | |
| startTime: this.state.speechStartTime, | |
| endTime: currentTime, | |
| duration: speechDuration, | |
| avgEnergy, | |
| energyIntegral: avgEnergy * speechDuration | |
| }); | |
| if (this.state.speechStats.length > this.options.maxHistoryLength) { | |
| this.state.speechStats.shift(); | |
| } | |
| } | |
| const segment = this.createSegment(this.state.speechStartTime!, currentTime); | |
| if (segment) { | |
| segments.push(segment); | |
| } | |
| this.startSilence(currentTime); | |
| } else { | |
| // Accumulate silence energies while deciding | |
| this.state.silenceEnergies.push(energy); | |
| } | |
| } else { | |
| // Continue in current state | |
| if (this.state.inSpeech) { | |
| this.state.speechEnergies.push(energy); | |
| } else { | |
| this.state.silenceEnergies.push(energy); | |
| } | |
| } | |
| this.updateStats(); | |
| return segments; | |
| } | |
| /** | |
| * Update noise floor using adaptive exponential moving average. | |
| */ | |
| private updateNoiseFloor(energy: number, isSpeech: boolean): void { | |
| if (!isSpeech) { | |
| // Blend between fast and normal adaptation rates based on silence duration | |
| let adaptationRate = this.options.noiseFloorAdaptationRate; | |
| if (this.state.silenceDuration < this.options.minBackgroundDuration) { | |
| const blendFactor = Math.min(1, this.state.silenceDuration / this.options.minBackgroundDuration); | |
| adaptationRate = this.options.fastAdaptationRate * (1 - blendFactor) + | |
| this.options.noiseFloorAdaptationRate * blendFactor; | |
| } | |
| // Exponential moving average for noise floor tracking | |
| this.state.noiseFloor = this.state.noiseFloor * (1 - adaptationRate) + energy * adaptationRate; | |
| this.state.noiseFloor = Math.max(0.00001, this.state.noiseFloor); | |
| } | |
| // Track recent energies for analysis | |
| this.state.recentEnergies.push(energy); | |
| if (this.state.recentEnergies.length > 50) { | |
| this.state.recentEnergies.shift(); | |
| } | |
| } | |
| /** | |
| * Calculate Signal-to-Noise Ratio in dB. | |
| */ | |
| private calculateSNR(energy: number): number { | |
| const noiseFloor = Math.max(0.0001, this.state.noiseFloor); | |
| return 10 * Math.log10(energy / noiseFloor); | |
| } | |
| /** | |
| * Start tracking a new speech segment. | |
| */ | |
| private startSpeech(time: number, energy: number): void { | |
| this.state.inSpeech = true; | |
| this.state.speechStartTime = time; | |
| this.state.silenceCounter = 0; | |
| this.state.speechEnergies = [energy]; | |
| this.state.silenceStartTime = null; | |
| this.state.silenceDuration = 0; | |
| const snr = this.calculateSNR(energy); | |
| this.log('Speech state started', { | |
| time: time.toFixed(2), | |
| energy: energy.toFixed(6), | |
| snr: snr.toFixed(2), | |
| noiseFloor: this.state.noiseFloor.toFixed(6) | |
| }); | |
| } | |
| /** | |
| * Transition to silence state. | |
| */ | |
| private startSilence(time: number): void { | |
| this.state.inSpeech = false; | |
| this.state.silenceStartTime = time; | |
| this.state.speechStartTime = null; | |
| this.state.silenceCounter = 0; | |
| this.state.silenceEnergies = []; | |
| this.state.silenceDuration = 0.001; // Avoid division by zero | |
| this.log('Silence state started', { | |
| time: time.toFixed(2), | |
| noiseFloor: this.state.noiseFloor.toFixed(6) | |
| }); | |
| } | |
| /** | |
| * Find the actual speech start using lookback and energy trend analysis. | |
| */ | |
| private findSpeechStart(): number { | |
| const chunks = this.state.recentChunks; | |
| const minSnrThreshold = this.options.minSnrThreshold; | |
| // Find the most recent speech chunk | |
| let firstSpeechIndex = 0; | |
| for (let i = chunks.length - 1; i >= 0; i--) { | |
| if (chunks[i].isSpeech) { | |
| firstSpeechIndex = i; | |
| break; | |
| } | |
| } | |
| // Look for the earliest point where energy starts rising towards speech | |
| let earliestRisingIndex = firstSpeechIndex; | |
| let foundRisingTrend = false; | |
| for (let i = firstSpeechIndex - 1; i >= 0; i--) { | |
| // Check for rising energy trend | |
| if (i < chunks.length - 1 && | |
| chunks[i + 1].energy > chunks[i].energy * (1 + this.options.energyRiseThreshold)) { | |
| earliestRisingIndex = i; | |
| foundRisingTrend = true; | |
| } | |
| // Stop if SNR drops significantly below threshold | |
| if (chunks[i].snr < minSnrThreshold / 2) { | |
| break; | |
| } | |
| // Limit lookback to ~500ms (assuming 80ms chunks) | |
| if (firstSpeechIndex - i > 6) { | |
| break; | |
| } | |
| } | |
| if (foundRisingTrend) { | |
| this.log('Found rising energy trend for speech onset', { | |
| index: earliestRisingIndex, | |
| time: chunks[earliestRisingIndex].time.toFixed(3), | |
| energy: chunks[earliestRisingIndex].energy.toFixed(6), | |
| snr: chunks[earliestRisingIndex].snr.toFixed(2) | |
| }); | |
| return earliestRisingIndex; | |
| } | |
| // Check for SNR crossing | |
| for (let i = firstSpeechIndex; i >= 0; i--) { | |
| if (chunks[i].snr < minSnrThreshold) { | |
| return Math.min(chunks.length - 1, i + 1); | |
| } | |
| } | |
| // Default lookback | |
| return Math.max(0, firstSpeechIndex - 4); | |
| } | |
| /** | |
| * Create a segment object from start/end times. | |
| */ | |
| private createSegment(startTime: number, endTime: number): ProcessedSegment | null { | |
| const duration = endTime - startTime; | |
| if (duration <= 0) { | |
| this.log('Skipping segment with zero/negative duration'); | |
| return null; | |
| } | |
| return { | |
| startTime, | |
| endTime, | |
| duration | |
| }; | |
| } | |
| /** | |
| * Update internal statistics. | |
| */ | |
| private updateStats(): void { | |
| const stats: CurrentStats = { | |
| silence: { avgDuration: 0, avgEnergy: 0, avgEnergyIntegral: 0 }, | |
| speech: { avgDuration: 0, avgEnergy: 0, avgEnergyIntegral: 0 }, | |
| noiseFloor: this.state.noiseFloor, | |
| snr: this.state.recentChunks.length > 0 | |
| ? this.state.recentChunks[this.state.recentChunks.length - 1].snr | |
| : 0, | |
| snrThreshold: this.options.snrThreshold, | |
| minSnrThreshold: this.options.minSnrThreshold, | |
| energyRiseThreshold: this.options.energyRiseThreshold | |
| }; | |
| if (this.state.silenceStats.length > 0) { | |
| stats.silence = { | |
| avgDuration: this.average(this.state.silenceStats.map(s => s.duration)), | |
| avgEnergy: this.average(this.state.silenceStats.map(s => s.avgEnergy)), | |
| avgEnergyIntegral: this.average(this.state.silenceStats.map(s => s.energyIntegral)) | |
| }; | |
| } | |
| if (this.state.speechStats.length > 0) { | |
| stats.speech = { | |
| avgDuration: this.average(this.state.speechStats.map(s => s.duration)), | |
| avgEnergy: this.average(this.state.speechStats.map(s => s.avgEnergy)), | |
| avgEnergyIntegral: this.average(this.state.speechStats.map(s => s.energyIntegral)) | |
| }; | |
| } | |
| this.state.currentStats = stats; | |
| } | |
| private average(arr: number[]): number { | |
| if (arr.length === 0) return 0; | |
| return arr.reduce((a, b) => a + b, 0) / arr.length; | |
| } | |
| /** | |
| * Get current statistics. | |
| */ | |
| getStats(): CurrentStats { | |
| return this.state.currentStats; | |
| } | |
| /** | |
| * Get current state info for debugging. | |
| */ | |
| getStateInfo(): { inSpeech: boolean; noiseFloor: number; snr: number; speechStartTime: number | null } { | |
| return { | |
| inSpeech: this.state.inSpeech, | |
| noiseFloor: this.state.noiseFloor, | |
| snr: this.state.currentStats.snr, | |
| speechStartTime: this.state.speechStartTime | |
| }; | |
| } | |
| /** | |
| * Reset all state. | |
| */ | |
| reset(): void { | |
| this.state = { | |
| inSpeech: false, | |
| speechStartTime: null, | |
| silenceStartTime: null, | |
| silenceCounter: 0, | |
| recentChunks: [], | |
| speechEnergies: [], | |
| silenceEnergies: [], | |
| speechStats: [], | |
| silenceStats: [], | |
| currentStats: { | |
| silence: { avgDuration: 0, avgEnergy: 0, avgEnergyIntegral: 0 }, | |
| speech: { avgDuration: 0, avgEnergy: 0, avgEnergyIntegral: 0 }, | |
| noiseFloor: 0.005, | |
| snr: 0, | |
| snrThreshold: this.options.snrThreshold, | |
| minSnrThreshold: this.options.minSnrThreshold, | |
| energyRiseThreshold: this.options.energyRiseThreshold | |
| }, | |
| segmentCounter: 0, | |
| noiseFloor: 0.005, | |
| recentEnergies: [], | |
| silenceDuration: 0 | |
| }; | |
| } | |
| // ======================================================================== | |
| // Configuration Setters | |
| // ======================================================================== | |
| setThreshold(threshold: number): void { | |
| this.options.energyThreshold = threshold; | |
| this.log('Updated energy threshold', threshold); | |
| } | |
| setSilenceLength(length: number): void { | |
| this.options.silenceThreshold = length; | |
| this.log('Updated silence threshold', length); | |
| } | |
| setLookbackDuration(duration: number): void { | |
| this.options.lookbackDuration = duration; | |
| this.log('Updated lookback duration', duration); | |
| } | |
| setOverlapDuration(duration: number): void { | |
| this.options.overlapDuration = duration; | |
| this.log('Updated overlap duration', duration); | |
| } | |
| setSnrThreshold(threshold: number): void { | |
| this.options.snrThreshold = threshold; | |
| this.log('Updated SNR threshold', threshold); | |
| } | |
| setMinSnrThreshold(threshold: number): void { | |
| this.options.minSnrThreshold = threshold; | |
| this.log('Updated minimum SNR threshold', threshold); | |
| } | |
| setNoiseFloorAdaptationRate(rate: number): void { | |
| this.options.noiseFloorAdaptationRate = rate; | |
| this.log('Updated noise floor adaptation rate', rate); | |
| } | |
| setFastAdaptationRate(rate: number): void { | |
| this.options.fastAdaptationRate = rate; | |
| this.log('Updated fast adaptation rate', rate); | |
| } | |
| setEnergyRiseThreshold(threshold: number): void { | |
| this.options.energyRiseThreshold = threshold; | |
| this.log('Updated energy rise threshold', threshold); | |
| } | |
| setMinBackgroundDuration(duration: number): void { | |
| this.options.minBackgroundDuration = duration; | |
| this.log('Updated minimum background duration', duration); | |
| } | |
| setMaxSegmentDuration(duration: number): void { | |
| this.options.maxSegmentDuration = duration; | |
| this.log('Updated maximum segment duration', duration); | |
| } | |
| setMinSpeechDuration(duration: number): void { | |
| this.options.minSpeechDuration = duration; | |
| this.log('Updated minimum speech duration', duration); | |
| } | |
| setMaxSilenceWithinSpeech(duration: number): void { | |
| this.options.maxSilenceWithinSpeech = duration; | |
| this.log('Updated max silence within speech', duration); | |
| } | |
| setEndingSpeechTolerance(duration: number): void { | |
| this.options.endingSpeechTolerance = duration; | |
| this.log('Updated ending speech tolerance', duration); | |
| } | |
| } | |