// (removed: hosting-platform upload artifact lines — uploader name, upload note, commit hash)
/**
* Playback Engine - Unified state machine for lecture playback and live discussion
*
* Consumes Scene.actions[] directly via ActionEngine.
* No intermediate compile step β€” actions are executed as-is.
*
* State machine:
*
* start() pause()
* idle ──────────────────→ playing ──────────────→ paused
* β–² β–² β”‚
* β”‚ β”‚ resume() β”‚
* β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜
* β”‚
* β”‚ handleEndDiscussion()
* β”‚ confirmDiscussion()
* β”‚ / handleUserInterrupt()
* β”‚ β”‚
* β”‚ β–Ό pause()
* └──────────────────────── live ──────────────→ paused
* β–² β”‚
* β”‚ resume / user msg β”‚
* β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜
*/
import type { Scene } from '@/lib/types/stage';
import type { Action, SpeechAction, DiscussionAction } from '@/lib/types/action';
import type {
EngineMode,
TopicState,
PlaybackEngineCallbacks,
PlaybackSnapshot,
TriggerEvent,
Effect,
} from './types';
import type { AudioPlayer } from '@/lib/utils/audio-player';
import { ActionEngine } from '@/lib/action/engine';
import { useCanvasStore } from '@/lib/store/canvas';
import { useSettingsStore } from '@/lib/store/settings';
import { createLogger } from '@/lib/logger';
// Module-scoped logger; all engine log lines are tagged 'PlaybackEngine'.
const log = createLogger('PlaybackEngine');
/**
 * If more than 30% of characters are CJK, treat the text as Chinese.
 * Intentionally low: mixed Chinese text often contains punctuation,
 * numbers, and short Latin fragments (e.g. "AIθ―Ύε ‚").
 */
const CJK_LANG_THRESHOLD = 0.3;
/**
 * Unified state machine driving lecture playback and live discussion.
 * See the file header for the full state diagram.
 */
export class PlaybackEngine {
  private scenes: Scene[] = [];
  private sceneIndex: number = 0;
  private actionIndex: number = 0;
  private mode: EngineMode = 'idle';
  // Discussion triggers already joined/skipped — never re-fired on replay.
  private consumedDiscussions: Set<string> = new Set();
  // Discussion state save
  private savedSceneIndex: number | null = null;
  private savedActionIndex: number | null = null;
  // Discussion topic state
  private currentTopicState: TopicState | null = null;
  // Dependencies
  private audioPlayer: AudioPlayer;
  private actionEngine: ActionEngine;
  private callbacks: PlaybackEngineCallbacks;
  // Scene identity (for snapshot validation)
  private sceneId: string | undefined;
  // Internal state
  private currentTrigger: TriggerEvent | null = null;
  private triggerDelayTimer: ReturnType<typeof setTimeout> | null = null;
  // Reading-time timer for speech actions without pre-generated audio (TTS disabled)
  private speechTimer: ReturnType<typeof setTimeout> | null = null;
  private speechTimerStart: number = 0; // Date.now() when timer was scheduled
  // Browser-native TTS state (Web Speech API)
  private browserTTSActive: boolean = false;
  private browserTTSChunks: string[] = []; // sentence-level chunks for sequential playback
  private browserTTSChunkIndex: number = 0; // current chunk being spoken
  private browserTTSPausedChunks: string[] = []; // remaining chunks saved on pause (for cancel+re-speak)
  private speechTimerRemaining: number = 0; // remaining ms (set on pause)

  constructor(
    scenes: Scene[],
    actionEngine: ActionEngine,
    audioPlayer: AudioPlayer,
    callbacks: PlaybackEngineCallbacks = {},
  ) {
    this.scenes = scenes;
    this.sceneId = scenes[0]?.id;
    this.actionEngine = actionEngine;
    this.audioPlayer = audioPlayer;
    this.callbacks = callbacks;
  }

  // ==================== Public API ====================

  /** Get the current engine mode */
  getMode(): EngineMode {
    return this.mode;
  }

  /** Export a serializable playback snapshot */
  getSnapshot(): PlaybackSnapshot {
    return {
      sceneIndex: this.sceneIndex,
      actionIndex: this.actionIndex,
      consumedDiscussions: [...this.consumedDiscussions],
      sceneId: this.sceneId,
    };
  }

  /**
   * Restore playback position from a snapshot.
   *
   * Validates scene identity when both sides carry a sceneId: a snapshot
   * taken against a different scene set would leave sceneIndex/actionIndex
   * pointing at unrelated actions, so a mismatched snapshot is ignored.
   */
  restoreFromSnapshot(snapshot: PlaybackSnapshot): void {
    if (
      snapshot.sceneId !== undefined &&
      this.sceneId !== undefined &&
      snapshot.sceneId !== this.sceneId
    ) {
      log.warn('Snapshot sceneId mismatch, ignoring restore:', snapshot.sceneId, '!=', this.sceneId);
      return;
    }
    this.sceneIndex = snapshot.sceneIndex;
    this.actionIndex = snapshot.actionIndex;
    this.consumedDiscussions = new Set(snapshot.consumedDiscussions);
  }

  /** idle β†’ playing (from beginning) */
  start(): void {
    if (this.mode !== 'idle') {
      log.warn('Cannot start: not idle, current mode:', this.mode);
      return;
    }
    this.sceneIndex = 0;
    this.actionIndex = 0;
    this.setMode('playing');
    this.processNext();
  }

  /** idle β†’ playing (continue from current position, e.g. after discussion end) */
  continuePlayback(): void {
    if (this.mode !== 'idle') {
      log.warn('Cannot continue: not idle, current mode:', this.mode);
      return;
    }
    this.setMode('playing');
    this.processNext();
  }

  /** playing β†’ paused | live β†’ paused (abort SSE, truncate, topic pending) */
  pause(): void {
    if (this.mode === 'playing') {
      // Cancel pending timers
      if (this.triggerDelayTimer) {
        clearTimeout(this.triggerDelayTimer);
        this.triggerDelayTimer = null;
      }
      if (this.speechTimer) {
        // Save remaining time so resume() can reschedule
        this.speechTimerRemaining = Math.max(
          0,
          this.speechTimerRemaining - (Date.now() - this.speechTimerStart),
        );
        clearTimeout(this.speechTimer);
        this.speechTimer = null;
      }
      this.setMode('paused');
      // Freeze TTS β€” but skip if waiting on ProactiveCard (no active speech)
      if (!this.currentTrigger) {
        if (this.browserTTSActive) {
          // Cancel+re-speak pattern: save remaining chunks for resume.
          // speechSynthesis.pause()/resume() is broken on Firefox, so we
          // cancel now and re-speak from current chunk onward on resume.
          this.browserTTSPausedChunks = this.browserTTSChunks.slice(this.browserTTSChunkIndex);
          window.speechSynthesis?.cancel();
          // Note: cancel fires onerror('canceled'), which we ignore (see playBrowserTTSChunk)
        } else if (this.audioPlayer.isPlaying()) {
          this.audioPlayer.pause();
        }
      }
    } else if (this.mode === 'live') {
      this.setMode('paused');
      this.currentTopicState = 'pending';
      // Caller is responsible for aborting SSE
    } else {
      log.warn('Cannot pause: mode is', this.mode);
    }
  }

  /** paused β†’ playing (TTS resume) | paused (in discussion) β†’ live */
  resume(): void {
    if (this.mode !== 'paused') {
      log.warn('Cannot resume: not paused, mode is', this.mode);
      return;
    }
    if (this.currentTopicState === 'pending') {
      // Resume discussion β†’ live
      this.currentTopicState = 'active';
      this.setMode('live');
    } else if (this.currentTrigger) {
      // Waiting on ProactiveCard β€” just resume mode, don't touch audio
      this.setMode('playing');
    } else {
      // Resume lecture
      this.setMode('playing');
      if (this.browserTTSPausedChunks.length > 0) {
        // Browser TTS was paused via cancel β€” re-speak remaining chunks
        this.browserTTSActive = true;
        this.browserTTSChunks = this.browserTTSPausedChunks;
        this.browserTTSChunkIndex = 0;
        this.browserTTSPausedChunks = [];
        this.playBrowserTTSChunk();
      } else if (this.audioPlayer.hasActiveAudio()) {
        // Audio is paused β€” resume it; TTS onend will call processNext
        this.audioPlayer.resume();
      } else if (this.speechTimerRemaining > 0) {
        // Reading timer was paused β€” reschedule with remaining time
        this.speechTimerStart = Date.now();
        this.speechTimer = setTimeout(() => {
          this.speechTimer = null;
          this.speechTimerRemaining = 0;
          this.callbacks.onSpeechEnd?.();
          if (this.mode === 'playing') this.processNext();
        }, this.speechTimerRemaining);
      } else {
        // TTS finished while paused, continue to next event
        this.processNext();
      }
    }
  }

  /** β†’ idle */
  stop(): void {
    // Set mode BEFORE stopping audio to prevent spurious processNext from
    // synchronous onend callbacks (see handleUserInterrupt for details).
    this.setMode('idle');
    this.audioPlayer.stop();
    this.cancelBrowserTTS();
    this.actionEngine.clearEffects();
    if (this.triggerDelayTimer) {
      clearTimeout(this.triggerDelayTimer);
      this.triggerDelayTimer = null;
    }
    if (this.speechTimer) {
      clearTimeout(this.speechTimer);
      this.speechTimer = null;
    }
    this.speechTimerRemaining = 0;
    this.sceneIndex = 0;
    this.actionIndex = 0;
    this.savedSceneIndex = null;
    this.savedActionIndex = null;
    this.currentTopicState = null;
    this.currentTrigger = null;
  }

  /** User clicks "Join" on ProactiveCard β†’ save cursor β†’ live */
  confirmDiscussion(): void {
    if (!this.currentTrigger) {
      log.warn('confirmDiscussion called but no trigger');
      return;
    }
    // Mark consumed so it won't re-trigger on replay
    this.consumedDiscussions.add(this.currentTrigger.id);
    // Save lecture state β€” keep actionIndex as-is (past the discussion).
    // Discussions are placed after all speech actions, so the preceding
    // speech was already fully played; no need to replay it.
    this.savedSceneIndex = this.sceneIndex;
    this.savedActionIndex = this.actionIndex;
    // Enter live mode
    this.currentTopicState = 'active';
    this.setMode('live');
    // Notify callbacks
    this.callbacks.onProactiveHide?.();
    this.callbacks.onDiscussionConfirmed?.(
      this.currentTrigger.question,
      this.currentTrigger.prompt,
      this.currentTrigger.agentId,
    );
    this.currentTrigger = null;
  }

  /** User clicks "Skip" on ProactiveCard β†’ consumed β†’ processNext */
  skipDiscussion(): void {
    if (this.currentTrigger) {
      this.consumedDiscussions.add(this.currentTrigger.id);
      this.currentTrigger = null;
    }
    this.callbacks.onProactiveHide?.();
    if (this.mode === 'playing') {
      this.processNext();
    }
  }

  /** End discussion β†’ restore lecture β†’ idle (user clicks "start" to continue) */
  handleEndDiscussion(): void {
    this.actionEngine.clearEffects();
    this.currentTopicState = 'closed';
    // Close whiteboard if it was open during the discussion
    useCanvasStore.getState().setWhiteboardOpen(false);
    this.callbacks.onDiscussionEnd?.();
    // Restore lecture state
    if (this.savedSceneIndex !== null && this.savedActionIndex !== null) {
      this.sceneIndex = this.savedSceneIndex;
      this.actionIndex = this.savedActionIndex;
      this.savedSceneIndex = null;
      this.savedActionIndex = null;
    }
    this.setMode('idle');
  }

  /** User sends a message during playback β†’ interrupt β†’ live mode */
  handleUserInterrupt(text: string): void {
    if (this.mode === 'playing' || this.mode === 'paused') {
      // Save lecture state BEFORE stopping audio β€” actionIndex was already
      // incremented by processNext, so subtract 1 to replay the interrupted
      // sentence when resuming. Guard against overwriting a previously saved
      // position (e.g. live β†’ paused β†’ new message).
      if (this.savedSceneIndex === null) {
        this.savedSceneIndex = this.sceneIndex;
        this.savedActionIndex = Math.max(0, this.actionIndex - 1);
      }
      // Cancel pending trigger delay
      if (this.triggerDelayTimer) {
        clearTimeout(this.triggerDelayTimer);
        this.triggerDelayTimer = null;
      }
    }
    // Set mode BEFORE stopping audio β€” speechSynthesis.cancel() may fire the
    // onend callback synchronously, and the processNext guard checks
    // `this.mode === 'playing'`. Setting mode first prevents a spurious
    // processNext that would advance actionIndex past the interrupted speech.
    this.currentTopicState = 'active';
    this.setMode('live');
    this.audioPlayer.stop();
    this.cancelBrowserTTS();
    this.callbacks.onUserInterrupt?.(text);
  }

  /** Whether all remaining actions have been consumed (no speech left to play) */
  isExhausted(): boolean {
    let si = this.sceneIndex;
    let ai = this.actionIndex;
    while (si < this.scenes.length) {
      const actions = this.scenes[si].actions || [];
      while (ai < actions.length) {
        const action = actions[ai];
        // Consumed discussions don't count as remaining work
        if (action.type === 'discussion' && this.consumedDiscussions.has(action.id)) {
          ai++;
          continue;
        }
        return false;
      }
      si++;
      ai = 0;
    }
    return true;
  }

  // ==================== Private ====================

  /** Transition modes and notify the callback (no-op if unchanged). */
  private setMode(mode: EngineMode): void {
    if (this.mode === mode) return;
    this.mode = mode;
    this.callbacks.onModeChange?.(mode);
  }

  /**
   * Get the current action, or null if playback is complete.
   * Advances sceneIndex automatically when a scene's actions are exhausted.
   */
  private getCurrentAction(): { action: Action; sceneId: string } | null {
    while (this.sceneIndex < this.scenes.length) {
      const scene = this.scenes[this.sceneIndex];
      const actions = scene.actions || [];
      if (this.actionIndex < actions.length) {
        return { action: actions[this.actionIndex], sceneId: scene.id };
      }
      // Move to next scene
      this.sceneIndex++;
      this.actionIndex = 0;
    }
    return null;
  }

  /**
   * Core processing loop: consume the next action.
   */
  private async processNext(): Promise<void> {
    if (this.mode !== 'playing') return;
    // Check for scene boundary (fire scene change callback at start of each new scene)
    if (this.actionIndex === 0 && this.sceneIndex < this.scenes.length) {
      const scene = this.scenes[this.sceneIndex];
      this.actionEngine.clearEffects();
      this.callbacks.onSceneChange?.(scene.id);
      this.callbacks.onSpeakerChange?.('teacher');
    }
    const current = this.getCurrentAction();
    if (!current) {
      // All scenes complete
      this.actionEngine.clearEffects();
      this.setMode('idle');
      this.callbacks.onComplete?.();
      return;
    }
    const { action } = current;
    // Notify progress BEFORE advancing the cursor so the snapshot points at
    // the current action. On restore the same action will be replayed β€” this
    // is the desired behaviour for speech (user may have only heard half).
    this.callbacks.onProgress?.(this.getSnapshot());
    this.actionIndex++;
    switch (action.type) {
      case 'speech': {
        const speechAction = action as SpeechAction;
        this.callbacks.onSpeechStart?.(speechAction.text);
        // onEnded β†’ processNext; if paused, resume() will call processNext
        this.audioPlayer.onEnded(() => {
          this.callbacks.onSpeechEnd?.();
          if (this.mode === 'playing') {
            this.processNext();
          }
        });
        // Estimated reading time when no pre-generated audio (TTS disabled).
        // CJK text: ~150ms/char (one char β‰ˆ one word).
        // Non-CJK text: ~240ms/word (β‰ˆ250 WPM).
        // Min 2s. Cancelled on pause; resume() calls processNext directly.
        // NOTE(review): if play() resolves after a stop()/interrupt, this
        // timer is still scheduled while not 'playing'; the mode guard in
        // the timeout prevents advancing, but onSpeechEnd may fire spuriously.
        const scheduleReadingTimer = () => {
          const text = speechAction.text;
          const cjkCount = (
            text.match(/[\u4e00-\u9fff\u3400-\u4dbf\u3040-\u309f\u30a0-\u30ff\uac00-\ud7af]/g) || []
          ).length;
          // Share the module-level threshold (was a hard-coded 0.3 duplicate).
          const isCJK = cjkCount > text.length * CJK_LANG_THRESHOLD;
          const speed = this.callbacks.getPlaybackSpeed?.() ?? 1;
          const rawMs = isCJK
            ? Math.max(2000, text.length * 150)
            : Math.max(2000, text.split(/\s+/).filter(Boolean).length * 240);
          const readingMs = rawMs / speed;
          this.speechTimerStart = Date.now();
          this.speechTimerRemaining = readingMs;
          this.speechTimer = setTimeout(() => {
            this.speechTimer = null;
            this.speechTimerRemaining = 0;
            this.callbacks.onSpeechEnd?.();
            if (this.mode === 'playing') this.processNext();
          }, readingMs);
        };
        this.audioPlayer
          .play(speechAction.audioId || '', speechAction.audioUrl)
          .then((audioStarted) => {
            if (!audioStarted) {
              // No pre-generated audio β€” try browser-native TTS if selected
              const settings = useSettingsStore.getState();
              if (
                settings.ttsEnabled &&
                settings.ttsProviderId === 'browser-native-tts' &&
                typeof window !== 'undefined' &&
                window.speechSynthesis
              ) {
                this.playBrowserTTS(speechAction);
              } else {
                scheduleReadingTimer();
              }
            }
          })
          .catch((err) => {
            log.error('TTS error:', err);
            scheduleReadingTimer();
          });
        break;
      }
      case 'spotlight':
      case 'laser': {
        // Fire-and-forget visual effects via ActionEngine
        this.actionEngine.execute(action);
        this.callbacks.onEffectFire?.({
          kind: action.type,
          targetId: action.elementId,
          ...(action.type === 'spotlight'
            ? { dimOpacity: action.dimOpacity }
            : { color: action.color }),
        } as Effect);
        // Don't block β€” continue immediately
        this.processNext();
        break;
      }
      case 'discussion': {
        const discussionAction = action as DiscussionAction;
        // Check if already consumed
        if (this.consumedDiscussions.has(discussionAction.id)) {
          this.processNext();
          return;
        }
        // Skip if the discussion's agent isn't in the user's selected list
        if (
          discussionAction.agentId &&
          this.callbacks.isAgentSelected &&
          !this.callbacks.isAgentSelected(discussionAction.agentId)
        ) {
          this.consumedDiscussions.add(discussionAction.id);
          this.processNext();
          return;
        }
        // 3s delay before showing ProactiveCard (allows previous speech to finish naturally)
        const trigger: TriggerEvent = {
          id: discussionAction.id,
          question: discussionAction.topic,
          prompt: discussionAction.prompt,
          agentId: discussionAction.agentId,
        };
        this.triggerDelayTimer = setTimeout(() => {
          this.triggerDelayTimer = null;
          if (this.mode !== 'playing') return; // Cancelled if user paused/stopped
          this.currentTrigger = trigger;
          this.callbacks.onProactiveShow?.(trigger);
          // Engine pauses here β€” user calls confirmDiscussion() or skipDiscussion()
        }, 3000);
        break;
      }
      case 'play_video':
      case 'wb_open':
      case 'wb_draw_text':
      case 'wb_draw_shape':
      case 'wb_draw_chart':
      case 'wb_draw_latex':
      case 'wb_draw_table':
      case 'wb_clear':
      case 'wb_delete':
      case 'wb_close': {
        // Synchronous whiteboard actions β€” await completion, then continue
        await this.actionEngine.execute(action);
        if (this.mode === 'playing') {
          this.processNext();
        }
        break;
      }
      default:
        // Unknown action, skip
        this.processNext();
        break;
    }
  }

  // ==================== Browser Native TTS ====================

  /**
   * Split text into sentence-level chunks for sequential playback.
   * Chrome has a bug where utterances >~15s are silently cut off and onend
   * never fires, causing the engine to hang. Chunking avoids this.
   */
  private splitIntoChunks(text: string): string[] {
    // Split on sentence-ending punctuation (Latin + CJK) and newlines
    const chunks = text
      .split(/(?<=[.!?γ€‚οΌοΌŸ\n])\s*/)
      .map((s) => s.trim())
      .filter((s) => s.length > 0);
    // If splitting produced nothing (no punctuation), return the original text
    return chunks.length > 0 ? chunks : [text];
  }

  /**
   * Play text using the Web Speech API (browser-native TTS).
   * Splits text into sentence-level chunks to avoid Chrome's ~15s cutoff.
   * Uses cancel+re-speak for pause/resume (Firefox compatibility).
   */
  private playBrowserTTS(speechAction: SpeechAction): void {
    this.browserTTSChunks = this.splitIntoChunks(speechAction.text);
    this.browserTTSChunkIndex = 0;
    this.browserTTSPausedChunks = [];
    this.browserTTSActive = true;
    this.playBrowserTTSChunk();
  }

  /** Speak the current chunk; on completion, advance to next or finish. */
  private async playBrowserTTSChunk(): Promise<void> {
    if (this.browserTTSChunkIndex >= this.browserTTSChunks.length) {
      // All chunks done
      this.browserTTSActive = false;
      this.browserTTSChunks = [];
      this.callbacks.onSpeechEnd?.();
      if (this.mode === 'playing') this.processNext();
      return;
    }
    const settings = useSettingsStore.getState();
    const chunkText = this.browserTTSChunks[this.browserTTSChunkIndex];
    const utterance = new SpeechSynthesisUtterance(chunkText);
    // Apply settings
    const speed = this.callbacks.getPlaybackSpeed?.() ?? 1;
    utterance.rate = (settings.ttsSpeed ?? 1) * speed;
    utterance.volume = settings.ttsMuted ? 0 : (settings.ttsVolume ?? 1);
    // Ensure voices are loaded (Chrome loads them asynchronously)
    const voices = await this.ensureVoicesLoaded();
    // Set voice: try user's configured voice, fall back to auto-detect language
    let voiceFound = false;
    if (settings.ttsVoice && settings.ttsVoice !== 'default') {
      const voice = voices.find((v) => v.voiceURI === settings.ttsVoice);
      if (voice) {
        utterance.voice = voice;
        utterance.lang = voice.lang;
        voiceFound = true;
      }
    }
    if (!voiceFound) {
      // No usable voice configured β€” detect text language so the browser
      // auto-selects an appropriate voice.
      const cjkRatio =
        (chunkText.match(/[\u4e00-\u9fff\u3400-\u4dbf]/g) || []).length / chunkText.length;
      utterance.lang = cjkRatio > CJK_LANG_THRESHOLD ? 'zh-CN' : 'en-US';
    }
    utterance.onend = () => {
      this.browserTTSChunkIndex++;
      if (this.mode === 'playing') {
        this.playBrowserTTSChunk(); // next chunk
      }
    };
    utterance.onerror = (event) => {
      // 'canceled' is expected when stop/pause is called β€” not a real error
      if (event.error !== 'canceled') {
        log.warn('Browser TTS chunk error:', event.error);
        // Skip failed chunk, try next
        this.browserTTSChunkIndex++;
        if (this.mode === 'playing') {
          this.playBrowserTTSChunk();
        }
      }
      // On 'canceled': do nothing β€” pause handler already saved state
    };
    window.speechSynthesis.speak(utterance);
  }

  /**
   * Wait for speechSynthesis voices to load (Chrome loads them asynchronously).
   * Caches result so subsequent calls return immediately.
   */
  private cachedVoices: SpeechSynthesisVoice[] | null = null;
  private async ensureVoicesLoaded(): Promise<SpeechSynthesisVoice[]> {
    if (this.cachedVoices && this.cachedVoices.length > 0) {
      return this.cachedVoices;
    }
    let voices = window.speechSynthesis.getVoices();
    if (voices.length > 0) {
      this.cachedVoices = voices;
      return voices;
    }
    // Chrome: voices load asynchronously β€” wait for the voiceschanged event
    await new Promise<void>((resolve) => {
      const onVoicesChanged = () => {
        window.speechSynthesis.removeEventListener('voiceschanged', onVoicesChanged);
        resolve();
      };
      window.speechSynthesis.addEventListener('voiceschanged', onVoicesChanged);
      // Timeout after 2s to avoid hanging
      setTimeout(() => {
        window.speechSynthesis.removeEventListener('voiceschanged', onVoicesChanged);
        resolve();
      }, 2000);
    });
    voices = window.speechSynthesis.getVoices();
    this.cachedVoices = voices;
    return voices;
  }

  /** Cancel any active browser-native TTS */
  private cancelBrowserTTS(): void {
    if (this.browserTTSActive) {
      this.browserTTSActive = false;
      this.browserTTSChunks = [];
      this.browserTTSChunkIndex = 0;
      this.browserTTSPausedChunks = [];
      window.speechSynthesis?.cancel();
    }
  }
}