| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
/**
 * Immutable-by-convention snapshot of a transcription in progress.
 *
 * The four constructor arguments are stored verbatim as own properties,
 * in declaration order:
 *   - fixedText:  text that has been committed and will not be revised
 *   - activeText: text from the window still being re-transcribed
 *   - timestamp:  position in the audio this snapshot corresponds to
 *   - isFinal:    true when no further snapshots will follow
 */
export class PartialTranscription {
  /**
   * @param {string} fixedText - committed text.
   * @param {string} activeText - provisional text.
   * @param {number} timestamp - audio position of this snapshot.
   * @param {boolean} isFinal - whether this is the last snapshot.
   */
  constructor(fixedText, activeText, timestamp, isFinal) {
    // Shorthand object keeps the same property names and insertion order.
    Object.assign(this, { fixedText, activeText, timestamp, isFinal });
  }
}
| |
|
/**
 * Drives a speech-to-text model over growing audio and splits the output into
 * "fixed" text (committed sentences that are never re-transcribed) and
 * "active" text (the tail that is still being re-transcribed).
 *
 * Once the un-fixed window reaches `maxWindowSize` seconds, every sentence
 * that ends more than `sentenceBuffer` seconds before the end of the window
 * is committed and the window start moves past it.
 *
 * The model must expose `transcribe(audio)` returning (a promise of) an object
 * with an optional `text` string and an optional `sentences` array whose
 * items carry `text` and `end` (seconds, relative to the start of the audio
 * passed to that call). `audio` only needs `.length` and `.slice()`.
 */
export class SmartProgressiveStreamingHandler {
  /**
   * @param {{transcribe: function(*): *}} model - transcription backend.
   * @param {Object} [options]
   * @param {number} [options.emissionInterval=0.5] - seconds between
   *   emissions in `transcribeProgressive`.
   * @param {number} [options.maxWindowSize=15.0] - seconds of un-fixed audio
   *   after which sentences start being committed.
   * @param {number} [options.sentenceBuffer=2.0] - seconds at the end of a
   *   window in which sentences are never committed. `0` is honoured.
   * @param {number} [options.sampleRate=16000] - samples per second.
   */
  constructor(model, options = {}) {
    this.model = model;
    // `||` is deliberate for these: 0 is unusable (a zero interval never
    // advances, a zero sample rate divides by zero), so it falls back too.
    this.emissionInterval = options.emissionInterval || 0.5;
    this.maxWindowSize = options.maxWindowSize || 15.0;
    // 0 is a meaningful buffer ("commit right up to the window end"), so it
    // must not be replaced by the default.
    this.sentenceBuffer =
      options.sentenceBuffer === 0 ? 0 : options.sentenceBuffer || 2.0;
    this.sampleRate = options.sampleRate || 16000;

    this.reset();
  }

  /**
   * Clears all streaming state so the handler can be reused for new audio.
   */
  reset() {
    this.fixedSentences = [];
    this.fixedEndTime = 0.0;
    this.lastTranscribedLength = 0;
    // Active text produced by the last successful transcription; returned
    // again when the same audio length is submitted twice.
    this.lastActiveText = "";
  }

  /**
   * Transcribes the un-fixed tail of `audio` (the full audio received so far).
   *
   * Audio shorter than half a second is not sent to the model. If `audio` has
   * the same length as on the previous successful call, the model is not
   * invoked and the previous active text is returned.
   *
   * @param {*} audio - samples received so far (needs `.length` / `.slice`).
   * @returns {Promise<PartialTranscription>} snapshot with `isFinal` false.
   */
  async transcribeIncremental(audio) {
    const currentLength = audio.length;
    const timestamp = currentLength / this.sampleRate;

    // Too little audio to be worth transcribing.
    if (currentLength < this.sampleRate * 0.5) {
      return new PartialTranscription(
        this.fixedSentences.join(" "),
        "",
        timestamp,
        false
      );
    }

    // Nothing new since the last call: reuse the cached active text instead
    // of dropping it, otherwise the final snapshot would lose its tail.
    if (currentLength === this.lastTranscribedLength) {
      return new PartialTranscription(
        this.fixedSentences.join(" "),
        this.lastActiveText,
        timestamp,
        false
      );
    }

    // Only the audio after the last committed sentence is transcribed.
    const windowStartSamples = Math.floor(this.fixedEndTime * this.sampleRate);
    const audioWindow = audio.slice(windowStartSamples);
    const windowDuration = audioWindow.length / this.sampleRate;

    let result = await this.model.transcribe(audioWindow);

    if (
      windowDuration >= this.maxWindowSize &&
      result.sentences &&
      result.sentences.length > 1
    ) {
      const cutoffTime = windowDuration - this.sentenceBuffer;

      // Commit the leading run of sentences that end before the cutoff.
      const newFixedSentences = [];
      let newFixedEndTime = this.fixedEndTime;
      for (const sentence of result.sentences) {
        if (!(sentence.end < cutoffTime)) {
          break;
        }
        newFixedSentences.push(sentence.text.trim());
        // `sentence.end` is relative to the window, so offset it.
        newFixedEndTime = this.fixedEndTime + sentence.end;
      }

      if (newFixedSentences.length > 0) {
        this.fixedSentences.push(...newFixedSentences);
        this.fixedEndTime = newFixedEndTime;

        // Re-transcribe the shortened window so the active text no longer
        // contains the sentences that were just committed.
        const newWindowStartSamples = Math.floor(
          this.fixedEndTime * this.sampleRate
        );
        result = await this.model.transcribe(audio.slice(newWindowStartSamples));
      }
    }

    const activeText = result.text ? result.text.trim() : "";

    // Recorded only after the model succeeded, so a failed call can be
    // retried with the same audio.
    this.lastTranscribedLength = currentLength;
    this.lastActiveText = activeText;

    return new PartialTranscription(
      this.fixedSentences.join(" "),
      activeText,
      timestamp,
      false
    );
  }

  /**
   * Replays complete `audio` as if it were arriving live: every
   * `emissionInterval` seconds a longer prefix is transcribed and yielded,
   * with a real delay of the same length between emissions. State is reset
   * first. A last snapshot with `isFinal` true is yielded at the end.
   *
   * @param {*} audio - the complete audio.
   * @yields {PartialTranscription}
   */
  async *transcribeProgressive(audio) {
    const totalDuration = audio.length / this.sampleRate;
    let currentTime = 0;

    this.reset();

    while (currentTime < totalDuration) {
      currentTime += this.emissionInterval;
      const currentSamples = Math.min(
        Math.floor(currentTime * this.sampleRate),
        audio.length
      );

      const currentAudio = audio.slice(0, currentSamples);
      yield await this.transcribeIncremental(currentAudio);

      // Pace emissions like a live stream.
      await new Promise(resolve =>
        setTimeout(resolve, this.emissionInterval * 1000)
      );
    }

    // The loop normally ends having already processed the full audio; this
    // call then reuses the cached result instead of invoking the model again.
    const finalResult = await this.transcribeIncremental(audio);
    yield new PartialTranscription(
      finalResult.fixedText,
      finalResult.activeText,
      finalResult.timestamp,
      true
    );
  }

  /**
   * Transcribes complete `audio` window by window without real-time pacing.
   * State is reset first. Each full window (`maxWindowSize` seconds) yields a
   * non-final snapshot; the trailing shorter window yields the final one.
   *
   * @param {*} audio - the complete audio.
   * @yields {PartialTranscription}
   */
  async *transcribeBatch(audio) {
    const totalDuration = audio.length / this.sampleRate;
    this.reset();

    let processedUpTo = 0;

    while (processedUpTo < totalDuration) {
      const windowStart = processedUpTo;
      // Decided by addition rather than `(start + max) - start >= max`:
      // rounding in that subtraction can make a genuine full window look
      // short and be emitted as final while audio remains.
      const isFullWindow = windowStart + this.maxWindowSize <= totalDuration;
      const windowEnd = isFullWindow
        ? windowStart + this.maxWindowSize
        : totalDuration;
      const windowDuration = windowEnd - windowStart;

      const audioWindow = audio.slice(
        Math.floor(windowStart * this.sampleRate),
        Math.floor(windowEnd * this.sampleRate)
      );

      const result = await this.model.transcribe(audioWindow);
      const windowText = result.text ? result.text.trim() : "";

      if (!isFullWindow) {
        // Trailing window: everything it contains is committed.
        this.fixedSentences.push(windowText);
        processedUpTo = windowEnd;

        yield new PartialTranscription(
          this.fixedSentences.join(" "),
          "",
          windowEnd,
          true
        );
        continue;
      }

      const cutoffTime = windowDuration - this.sentenceBuffer;
      const sentences =
        result.sentences && result.sentences.length > 1 ? result.sentences : [];
      const sentencesToFix = sentences.filter(s => s.end < cutoffTime);
      const lastFixedEnd =
        sentencesToFix.length > 0
          ? sentencesToFix[sentencesToFix.length - 1].end
          : 0;

      let activeText = "";
      if (lastFixedEnd > 0) {
        // Commit the sentences before the cutoff and restart right after
        // the last of them.
        this.fixedSentences.push(...sentencesToFix.map(s => s.text.trim()));
        processedUpTo = windowStart + lastFixedEnd;
        activeText = sentences
          .filter(s => s.end >= cutoffTime)
          .map(s => s.text)
          .join(" ")
          .trim();
      } else {
        // No usable sentence boundary (none reported, none before the
        // cutoff, or one that would not move the window forward): commit
        // the window text and advance by half a window so the loop always
        // makes progress.
        // NOTE(review): the whole window's text is committed while only half
        // of it is skipped, so the second half may be transcribed and
        // committed again by the next window — confirm this is intended.
        this.fixedSentences.push(windowText);
        processedUpTo = windowStart + windowDuration / 2;
      }

      yield new PartialTranscription(
        this.fixedSentences.join(" "),
        activeText,
        windowEnd,
        false
      );
    }
  }

  /**
   * Returns the complete transcript for `audio`: fixed text followed by the
   * active text, separated by a space, with empty parts omitted.
   *
   * @param {*} audio - all audio received.
   * @returns {Promise<string>}
   */
  async finalize(audio) {
    const { fixedText, activeText } = await this.transcribeIncremental(audio);
    return [fixedText, activeText].filter(Boolean).join(" ");
  }
}
| |
|