Spaces:

Duplicated from andito/parakeet-v3-streaming

RobinsAIWorld
/

parakeet-v3-streaming

Running

App Files Files Community

parakeet-v3-streaming / source /src /utils /progressive-streaming.js

andito's picture

andito HF Staff

Use sentence-based window sliding instead of word-based

c9d9124 about 1 month ago

history blame contribute delete

9.98 kB

	/**
	 * Smart Progressive Streaming Handler
	 *
	 * JavaScript port of STT/smart_progressive_streaming.py
	 *
	 * Provides frequent partial transcriptions (every 500 ms by default,
	 * configurable through the `emissionInterval` option) with:
	 * - A growing window of up to 15 s for accuracy
	 * - Sentence-boundary-aware window sliding for audio longer than 15 s
	 * - Fixed (locked) sentences plus an active, still-changing transcription
	 */

	/**
	 * Snapshot of a streaming transcription at one point in the audio.
	 *
	 * Fields (assigned in this order):
	 * - fixedText:  sentences that will no longer change
	 * - activeText: the current, still-revisable partial transcription
	 * - timestamp:  current position in the audio
	 * - isFinal:    true when this is the last update of the stream
	 */
	export class PartialTranscription {
		constructor(fixedText, activeText, timestamp, isFinal) {
			// Shorthand keys keep the same insertion order as four separate assignments.
			Object.assign(this, { fixedText, activeText, timestamp, isFinal });
		}
	}

	/**
	 * Smart progressive streaming with sentence-aware window management.
	 *
	 * Strategy:
	 * 1. Emit partial transcriptions every `emissionInterval` seconds (default 0.5).
	 * 2. Transcribe a growing window (up to `maxWindowSize` seconds) for accuracy.
	 * 3. Once the window reaches `maxWindowSize`, slide it on sentence boundaries:
	 *    - sentences ending more than `sentenceBuffer` seconds before the window
	 *      end are locked as "fixed";
	 *    - only the remaining "active" portion is transcribed again.
	 *
	 * The model is only required to expose `transcribe(Float32Array)`; its awaited
	 * result is read for `text` and, optionally, `sentences` (objects with `text`
	 * and `end`, where `end` is treated as seconds from the start of the window
	 * that was passed in).
	 */
	export class SmartProgressiveStreamingHandler {
		/**
		 * @param {object} model - Object with a `transcribe(audio)` method.
		 * @param {object} [options]
		 * @param {number} [options.emissionInterval=0.5] - Seconds between partial emissions.
		 * @param {number} [options.maxWindowSize=15.0] - Window length (seconds) that triggers sliding.
		 * @param {number} [options.sentenceBuffer=2.0] - Trailing seconds of a window never locked while sliding.
		 * @param {number} [options.sampleRate=16000] - Samples per second of the audio buffers.
		 */
		constructor(model, options = {}) {
			this.model = model;
			// `||` is deliberate: any falsy value (including 0) falls back to the
			// default, which keeps a zero interval from stalling transcribeProgressive.
			this.emissionInterval = options.emissionInterval || 0.5;
			this.maxWindowSize = options.maxWindowSize || 15.0;
			this.sentenceBuffer = options.sentenceBuffer || 2.0;
			this.sampleRate = options.sampleRate || 16000;

			// State for incremental streaming
			this.reset();
		}

		/**
		 * Reset state for a new streaming session.
		 */
		reset() {
			this.fixedSentences = [];
			this.fixedEndTime = 0.0;
			this.lastTranscribedLength = 0;
			// Active text of the most recent transcription; replayed when the same
			// buffer is submitted again so the unfixed tail is not reported as empty.
			this.lastActiveText = "";
		}

		/**
		 * Transcribe audio incrementally (for live streaming).
		 *
		 * Call this repeatedly with a growing audio buffer. Buffers shorter than
		 * 500 ms are not transcribed. A buffer of the same length as the previous
		 * call is not transcribed again; the previous active text is returned.
		 *
		 * @param {Float32Array} audio - Growing audio buffer
		 * @returns {Promise<PartialTranscription>} Current state (`isFinal` is always false)
		 */
		async transcribeIncremental(audio) {
			const currentLength = audio.length;
			const timestamp = currentLength / this.sampleRate;

			// Need at least 500 ms of audio before the first transcription.
			if (currentLength < this.sampleRate * 0.5) {
				return new PartialTranscription(this.fixedSentences.join(" "), "", timestamp, false);
			}

			// No new audio since the last transcription: replay the cached result
			// instead of blanking the active text.
			if (currentLength === this.lastTranscribedLength) {
				return new PartialTranscription(
					this.fixedSentences.join(" "),
					this.lastActiveText,
					timestamp,
					false
				);
			}

			this.lastTranscribedLength = currentLength;

			// Window runs from the end of the last fixed sentence to the end of the buffer.
			const windowStartSamples = Math.floor(this.fixedEndTime * this.sampleRate);
			const audioWindow = audio.slice(windowStartSamples);
			const windowDuration = audioWindow.length / this.sampleRate;

			let result = await this.model.transcribe(audioWindow);

			if (windowDuration >= this.maxWindowSize && result.sentences && result.sentences.length > 1) {
				// Window is too large: lock the leading sentences that end before the cutoff.
				const cutoffTime = windowDuration - this.sentenceBuffer;
				const newFixedSentences = [];
				let newFixedEndTime = this.fixedEndTime;

				for (const sentence of result.sentences) {
					if (sentence.end >= cutoffTime) {
						break;
					}
					newFixedSentences.push(sentence.text.trim());
					// `sentence.end` is relative to the window start.
					newFixedEndTime = this.fixedEndTime + sentence.end;
				}

				if (newFixedSentences.length > 0) {
					this.fixedSentences.push(...newFixedSentences);
					this.fixedEndTime = newFixedEndTime;

					// Re-transcribe from the new fixed point so the active text
					// no longer contains the sentences that were just locked.
					const newWindowStartSamples = Math.floor(this.fixedEndTime * this.sampleRate);
					result = await this.model.transcribe(audio.slice(newWindowStartSamples));
				}
			}

			const activeText = result.text ? result.text.trim() : "";
			this.lastActiveText = activeText;

			return new PartialTranscription(this.fixedSentences.join(" "), activeText, timestamp, false);
		}

		/**
		 * Transcribe a complete buffer with progressive emissions.
		 *
		 * Resets the handler, then feeds ever-longer prefixes of `audio` (growing
		 * by `emissionInterval` seconds) to `transcribeIncremental`, waiting
		 * `emissionInterval` seconds of wall time between chunks. The stream ends
		 * with one extra emission flagged `isFinal`.
		 *
		 * @param {Float32Array} audio - Complete audio buffer
		 * @yields {PartialTranscription}
		 */
		async *transcribeProgressive(audio) {
			const totalDuration = audio.length / this.sampleRate;
			let currentTime = 0;

			this.reset();

			while (currentTime < totalDuration) {
				currentTime += this.emissionInterval;
				const currentSamples = Math.min(
					Math.floor(currentTime * this.sampleRate),
					audio.length
				);

				const result = await this.transcribeIncremental(audio.slice(0, currentSamples));
				yield result;

				// Pace emissions between chunks; nothing is left to pace after the last one.
				if (currentTime < totalDuration) {
					await new Promise((resolve) => setTimeout(resolve, this.emissionInterval * 1000));
				}
			}

			// Final emission. The last loop pass normally covered the whole buffer
			// already, in which case this replays the cached active text.
			const finalResult = await this.transcribeIncremental(audio);
			yield new PartialTranscription(
				finalResult.fixedText,
				finalResult.activeText,
				finalResult.timestamp,
				true
			);
		}

		/**
		 * Transcribe audio in batch mode (for uploaded files).
		 *
		 * Resets the handler and walks the buffer in windows of up to
		 * `maxWindowSize` seconds with no artificial delay:
		 * - A full window whose result has more than one sentence is slid to the
		 *   end of the last sentence finishing before `windowDuration - sentenceBuffer`.
		 * - Otherwise (short window, no usable sentence timestamps, or no sentence
		 *   before the cutoff) the whole window text is locked and processing
		 *   resumes at the window end, so no audio is transcribed into the fixed
		 *   text twice.
		 * The emission whose window reaches the end of the audio is flagged `isFinal`.
		 *
		 * @param {Float32Array} audio - Complete audio buffer
		 * @yields {PartialTranscription}
		 */
		async *transcribeBatch(audio) {
			const totalDuration = audio.length / this.sampleRate;
			this.reset();

			let processedUpTo = 0; // Seconds of audio already locked into fixedSentences

			while (processedUpTo < totalDuration) {
				const windowStart = processedUpTo;
				const windowEnd = Math.min(processedUpTo + this.maxWindowSize, totalDuration);
				const windowDuration = windowEnd - windowStart;

				const audioWindow = audio.slice(
					Math.floor(windowStart * this.sampleRate),
					Math.floor(windowEnd * this.sampleRate)
				);
				const result = await this.model.transcribe(audioWindow);

				if (windowDuration >= this.maxWindowSize) {
					// Full window: try to slide on a sentence boundary.
					const cutoffTime = windowDuration - this.sentenceBuffer;
					const sentences =
						result.sentences && result.sentences.length > 1 ? result.sentences : [];
					const sentencesToFix = sentences.filter((s) => s.end < cutoffTime);

					if (sentencesToFix.length > 0) {
						const nextStart = windowStart + sentencesToFix[sentencesToFix.length - 1].end;

						// Only slide when it moves forward; otherwise the loop would never end.
						if (nextStart > windowStart) {
							this.fixedSentences.push(...sentencesToFix.map((s) => s.text.trim()));
							processedUpTo = nextStart;

							const activeText = sentences
								.filter((s) => s.end >= cutoffTime)
								.map((s) => s.text)
								.join(" ")
								.trim();

							yield new PartialTranscription(
								this.fixedSentences.join(" "),
								activeText,
								windowEnd,
								false
							);
							continue;
						}
					}
				}

				// Lock the whole window. Skipping empty text avoids stray spaces
				// in the joined fixed text.
				const windowText = result.text ? result.text.trim() : "";
				if (windowText) {
					this.fixedSentences.push(windowText);
				}
				processedUpTo = windowEnd;

				yield new PartialTranscription(
					this.fixedSentences.join(" "),
					"",
					windowEnd,
					windowEnd >= totalDuration
				);
			}
		}

		/**
		 * Get the final transcription by combining fixed and active text.
		 *
		 * @param {Float32Array} audio - Complete audio buffer
		 * @returns {Promise<string>} Non-empty parts joined by a single space
		 */
		async finalize(audio) {
			const result = await this.transcribeIncremental(audio);
			return [result.fixedText, result.activeText].filter(Boolean).join(" ");
		}
	}