// (removed: hosting-platform upload artifact lines — uploader name, upload note, commit hash)
/**
* Playback Engine - Unified state machine for lecture playback and live discussion
*
* Consumes Scene.actions[] directly via ActionEngine.
* No intermediate compile step β€” actions are executed as-is.
*
* State machine:
*
* start() pause()
* idle ──────────────────→ playing ──────────────→ paused
* β–² β–² β”‚
* β”‚ β”‚ resume() β”‚
* β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜
* β”‚
* β”‚ handleEndDiscussion()
* β”‚ confirmDiscussion()
* β”‚ / handleUserInterrupt()
* β”‚ β”‚
* β”‚ β–Ό pause()
* └──────────────────────── live ──────────────→ paused
* β–² β”‚
* β”‚ resume / user msg β”‚
* β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜
*/
import type { Scene } from '@/lib/types/stage';
import type { Action, SpeechAction, DiscussionAction } from '@/lib/types/action';
import type {
EngineMode,
TopicState,
PlaybackEngineCallbacks,
PlaybackSnapshot,
TriggerEvent,
Effect,
} from './types';
import type { AudioPlayer } from '@/lib/utils/audio-player';
import { ActionEngine } from '@/lib/action/engine';
import { useCanvasStore } from '@/lib/store/canvas';
import { useSettingsStore } from '@/lib/store/settings';
import { createLogger } from '@/lib/logger';
// Module-scoped logger; all engine log lines are tagged 'PlaybackEngine'.
const log = createLogger('PlaybackEngine');
/**
 * If more than 30% of characters are CJK, treat the text as Chinese.
 * Intentionally low: mixed Chinese text often contains punctuation,
 * numbers, and short Latin fragments (e.g. "AIθ―Ύε ‚").
 */
const CJK_LANG_THRESHOLD = 0.3;
/**
 * Unified state machine driving lecture playback and live discussion.
 * See the file header for the full state diagram.
 */
export class PlaybackEngine {
  private scenes: Scene[] = [];
  private sceneIndex: number = 0;
  private actionIndex: number = 0;
  private mode: EngineMode = 'idle';
  // Discussion triggers already joined/skipped — never re-fired on replay.
  private consumedDiscussions: Set<string> = new Set();
  // Discussion state save
  private savedSceneIndex: number | null = null;
  private savedActionIndex: number | null = null;
  // Discussion topic state
  private currentTopicState: TopicState | null = null;
  // Dependencies
  private audioPlayer: AudioPlayer;
  private actionEngine: ActionEngine;
  private callbacks: PlaybackEngineCallbacks;
  // Scene identity (for snapshot validation)
  private sceneId: string | undefined;
  // Internal state
  private currentTrigger: TriggerEvent | null = null;
  private triggerDelayTimer: ReturnType<typeof setTimeout> | null = null;
  // Reading-time timer for speech actions without pre-generated audio (TTS disabled)
  private speechTimer: ReturnType<typeof setTimeout> | null = null;
  private speechTimerStart: number = 0; // Date.now() when timer was scheduled
  // Browser-native TTS state (Web Speech API)
  private browserTTSActive: boolean = false;
  private browserTTSChunks: string[] = []; // sentence-level chunks for sequential playback
  private browserTTSChunkIndex: number = 0; // current chunk being spoken
  private browserTTSPausedChunks: string[] = []; // remaining chunks saved on pause (for cancel+re-speak)
  private speechTimerRemaining: number = 0; // remaining ms (set on pause)

  constructor(
    scenes: Scene[],
    actionEngine: ActionEngine,
    audioPlayer: AudioPlayer,
    callbacks: PlaybackEngineCallbacks = {},
  ) {
    this.scenes = scenes;
    this.sceneId = scenes[0]?.id;
    this.actionEngine = actionEngine;
    this.audioPlayer = audioPlayer;
    this.callbacks = callbacks;
  }

  // ==================== Public API ====================

  /** Get the current engine mode */
  getMode(): EngineMode {
    return this.mode;
  }

  /** Export a serializable playback snapshot */
  getSnapshot(): PlaybackSnapshot {
    return {
      sceneIndex: this.sceneIndex,
      actionIndex: this.actionIndex,
      consumedDiscussions: [...this.consumedDiscussions],
      sceneId: this.sceneId,
    };
  }

  /**
   * Restore playback position from a snapshot.
   *
   * Validates scene identity when both sides carry a sceneId: a snapshot
   * taken against a different scene set would leave sceneIndex/actionIndex
   * pointing at unrelated actions, so a mismatched snapshot is ignored.
   */
  restoreFromSnapshot(snapshot: PlaybackSnapshot): void {
    if (
      snapshot.sceneId !== undefined &&
      this.sceneId !== undefined &&
      snapshot.sceneId !== this.sceneId
    ) {
      log.warn('Snapshot sceneId mismatch, ignoring restore:', snapshot.sceneId, '!=', this.sceneId);
      return;
    }
    this.sceneIndex = snapshot.sceneIndex;
    this.actionIndex = snapshot.actionIndex;
    this.consumedDiscussions = new Set(snapshot.consumedDiscussions);
  }

  /** idle β†’ playing (from beginning) */
  start(): void {
    if (this.mode !== 'idle') {
      log.warn('Cannot start: not idle, current mode:', this.mode);
      return;
    }
    this.sceneIndex = 0;
    this.actionIndex = 0;
    this.setMode('playing');
    this.processNext();
  }

  /** idle β†’ playing (continue from current position, e.g. after discussion end) */
  continuePlayback(): void {
    if (this.mode !== 'idle') {
      log.warn('Cannot continue: not idle, current mode:', this.mode);
      return;
    }
    this.setMode('playing');
    this.processNext();
  }

  /** playing β†’ paused | live β†’ paused (abort SSE, truncate, topic pending) */
  pause(): void {
    if (this.mode === 'playing') {
      // Cancel pending timers
      if (this.triggerDelayTimer) {
        clearTimeout(this.triggerDelayTimer);
        this.triggerDelayTimer = null;
      }
      if (this.speechTimer) {
        // Save remaining time so resume() can reschedule
        this.speechTimerRemaining = Math.max(
          0,
          this.speechTimerRemaining - (Date.now() - this.speechTimerStart),
        );
        clearTimeout(this.speechTimer);
        this.speechTimer = null;
      }
      this.setMode('paused');
      // Freeze TTS β€” but skip if waiting on ProactiveCard (no active speech)
      if (!this.currentTrigger) {
        if (this.browserTTSActive) {
          // Cancel+re-speak pattern: save remaining chunks for resume.
          // speechSynthesis.pause()/resume() is broken on Firefox, so we
          // cancel now and re-speak from current chunk onward on resume.
          this.browserTTSPausedChunks = this.browserTTSChunks.slice(this.browserTTSChunkIndex);
          window.speechSynthesis?.cancel();
          // Note: cancel fires onerror('canceled'), which we ignore (see playBrowserTTSChunk)
        } else if (this.audioPlayer.isPlaying()) {
          this.audioPlayer.pause();
        }
      }
    } else if (this.mode === 'live') {
      this.setMode('paused');
      this.currentTopicState = 'pending';
      // Caller is responsible for aborting SSE
    } else {
      log.warn('Cannot pause: mode is', this.mode);
    }
  }

  /** paused β†’ playing (TTS resume) | paused (in discussion) β†’ live */
  resume(): void {
    if (this.mode !== 'paused') {
      log.warn('Cannot resume: not paused, mode is', this.mode);
      return;
    }
    if (this.currentTopicState === 'pending') {
      // Resume discussion β†’ live
      this.currentTopicState = 'active';
      this.setMode('live');
    } else if (this.currentTrigger) {
      // Waiting on ProactiveCard β€” just resume mode, don't touch audio
      this.setMode('playing');
    } else {
      // Resume lecture
      this.setMode('playing');
      if (this.browserTTSPausedChunks.length > 0) {
        // Browser TTS was paused via cancel β€” re-speak remaining chunks
        this.browserTTSActive = true;
        this.browserTTSChunks = this.browserTTSPausedChunks;
        this.browserTTSChunkIndex = 0;
        this.browserTTSPausedChunks = [];
        this.playBrowserTTSChunk();
      } else if (this.audioPlayer.hasActiveAudio()) {
        // Audio is paused β€” resume it; TTS onend will call processNext
        this.audioPlayer.resume();
      } else if (this.speechTimerRemaining > 0) {
        // Reading timer was paused β€” reschedule with remaining time
        this.speechTimerStart = Date.now();
        this.speechTimer = setTimeout(() => {
          this.speechTimer = null;
          this.speechTimerRemaining = 0;
          this.callbacks.onSpeechEnd?.();
          if (this.mode === 'playing') this.processNext();
        }, this.speechTimerRemaining);
      } else {
        // TTS finished while paused, continue to next event
        this.processNext();
      }
    }
  }

  /** β†’ idle */
  stop(): void {
    // Set mode BEFORE stopping audio to prevent spurious processNext from
    // synchronous onend callbacks (see handleUserInterrupt for details).
    this.setMode('idle');
    this.audioPlayer.stop();
    this.cancelBrowserTTS();
    this.actionEngine.clearEffects();
    if (this.triggerDelayTimer) {
      clearTimeout(this.triggerDelayTimer);
      this.triggerDelayTimer = null;
    }
    if (this.speechTimer) {
      clearTimeout(this.speechTimer);
      this.speechTimer = null;
    }
    this.speechTimerRemaining = 0;
    this.sceneIndex = 0;
    this.actionIndex = 0;
    this.savedSceneIndex = null;
    this.savedActionIndex = null;
    this.currentTopicState = null;
    this.currentTrigger = null;
  }

  /** User clicks "Join" on ProactiveCard β†’ save cursor β†’ live */
  confirmDiscussion(): void {
    if (!this.currentTrigger) {
      log.warn('confirmDiscussion called but no trigger');
      return;
    }
    // Mark consumed so it won't re-trigger on replay
    this.consumedDiscussions.add(this.currentTrigger.id);
    // Save lecture state β€” keep actionIndex as-is (past the discussion).
    // Discussions are placed after all speech actions, so the preceding
    // speech was already fully played; no need to replay it.
    this.savedSceneIndex = this.sceneIndex;
    this.savedActionIndex = this.actionIndex;
    // Enter live mode
    this.currentTopicState = 'active';
    this.setMode('live');
    // Notify callbacks
    this.callbacks.onProactiveHide?.();
    this.callbacks.onDiscussionConfirmed?.(
      this.currentTrigger.question,
      this.currentTrigger.prompt,
      this.currentTrigger.agentId,
    );
    this.currentTrigger = null;
  }

  /** User clicks "Skip" on ProactiveCard β†’ consumed β†’ processNext */
  skipDiscussion(): void {
    if (this.currentTrigger) {
      this.consumedDiscussions.add(this.currentTrigger.id);
      this.currentTrigger = null;
    }
    this.callbacks.onProactiveHide?.();
    if (this.mode === 'playing') {
      this.processNext();
    }
  }

  /** End discussion β†’ restore lecture β†’ idle (user clicks "start" to continue) */
  handleEndDiscussion(): void {
    this.actionEngine.clearEffects();
    this.currentTopicState = 'closed';
    // Close whiteboard if it was open during the discussion
    useCanvasStore.getState().setWhiteboardOpen(false);
    this.callbacks.onDiscussionEnd?.();
    // Restore lecture state
    if (this.savedSceneIndex !== null && this.savedActionIndex !== null) {
      this.sceneIndex = this.savedSceneIndex;
      this.actionIndex = this.savedActionIndex;
      this.savedSceneIndex = null;
      this.savedActionIndex = null;
    }
    this.setMode('idle');
  }

  /** User sends a message during playback β†’ interrupt β†’ live mode */
  handleUserInterrupt(text: string): void {
    if (this.mode === 'playing' || this.mode === 'paused') {
      // Save lecture state BEFORE stopping audio β€” actionIndex was already
      // incremented by processNext, so subtract 1 to replay the interrupted
      // sentence when resuming. Guard against overwriting a previously saved
      // position (e.g. live β†’ paused β†’ new message).
      if (this.savedSceneIndex === null) {
        this.savedSceneIndex = this.sceneIndex;
        this.savedActionIndex = Math.max(0, this.actionIndex - 1);
      }
      // Cancel pending trigger delay
      if (this.triggerDelayTimer) {
        clearTimeout(this.triggerDelayTimer);
        this.triggerDelayTimer = null;
      }
    }
    // Set mode BEFORE stopping audio β€” speechSynthesis.cancel() may fire the
    // onend callback synchronously, and the processNext guard checks
    // `this.mode === 'playing'`. Setting mode first prevents a spurious
    // processNext that would advance actionIndex past the interrupted speech.
    this.currentTopicState = 'active';
    this.setMode('live');
    this.audioPlayer.stop();
    this.cancelBrowserTTS();
    this.callbacks.onUserInterrupt?.(text);
  }

  /** Whether all remaining actions have been consumed (no speech left to play) */
  isExhausted(): boolean {
    let si = this.sceneIndex;
    let ai = this.actionIndex;
    while (si < this.scenes.length) {
      const actions = this.scenes[si].actions || [];
      while (ai < actions.length) {
        const action = actions[ai];
        // Consumed discussions don't count as remaining work
        if (action.type === 'discussion' && this.consumedDiscussions.has(action.id)) {
          ai++;
          continue;
        }
        return false;
      }
      si++;
      ai = 0;
    }
    return true;
  }

  // ==================== Private ====================

  /** Transition modes and notify the callback (no-op if unchanged). */
  private setMode(mode: EngineMode): void {
    if (this.mode === mode) return;
    this.mode = mode;
    this.callbacks.onModeChange?.(mode);
  }

  /**
   * Get the current action, or null if playback is complete.
   * Advances sceneIndex automatically when a scene's actions are exhausted.
   */
  private getCurrentAction(): { action: Action; sceneId: string } | null {
    while (this.sceneIndex < this.scenes.length) {
      const scene = this.scenes[this.sceneIndex];
      const actions = scene.actions || [];
      if (this.actionIndex < actions.length) {
        return { action: actions[this.actionIndex], sceneId: scene.id };
      }
      // Move to next scene
      this.sceneIndex++;
      this.actionIndex = 0;
    }
    return null;
  }

  /**
   * Core processing loop: consume the next action.
   */
  private async processNext(): Promise<void> {
    if (this.mode !== 'playing') return;
    // Check for scene boundary (fire scene change callback at start of each new scene)
    if (this.actionIndex === 0 && this.sceneIndex < this.scenes.length) {
      const scene = this.scenes[this.sceneIndex];
      this.actionEngine.clearEffects();
      this.callbacks.onSceneChange?.(scene.id);
      this.callbacks.onSpeakerChange?.('teacher');
    }
    const current = this.getCurrentAction();
    if (!current) {
      // All scenes complete
      this.actionEngine.clearEffects();
      this.setMode('idle');
      this.callbacks.onComplete?.();
      return;
    }
    const { action } = current;
    // Notify progress BEFORE advancing the cursor so the snapshot points at
    // the current action. On restore the same action will be replayed β€” this
    // is the desired behaviour for speech (user may have only heard half).
    this.callbacks.onProgress?.(this.getSnapshot());
    this.actionIndex++;
    switch (action.type) {
      case 'speech': {
        const speechAction = action as SpeechAction;
        this.callbacks.onSpeechStart?.(speechAction.text);
        // onEnded β†’ processNext; if paused, resume() will call processNext
        this.audioPlayer.onEnded(() => {
          this.callbacks.onSpeechEnd?.();
          if (this.mode === 'playing') {
            this.processNext();
          }
        });
        // Estimated reading time when no pre-generated audio (TTS disabled).
        // CJK text: ~150ms/char (one char β‰ˆ one word).
        // Non-CJK text: ~240ms/word (β‰ˆ250 WPM).
        // Min 2s. Cancelled on pause; resume() calls processNext directly.
        // NOTE(review): if play() resolves after a stop()/interrupt, this
        // timer is still scheduled while not 'playing'; the mode guard in
        // the timeout prevents advancing, but onSpeechEnd may fire spuriously.
        const scheduleReadingTimer = () => {
          const text = speechAction.text;
          const cjkCount = (
            text.match(/[\u4e00-\u9fff\u3400-\u4dbf\u3040-\u309f\u30a0-\u30ff\uac00-\ud7af]/g) || []
          ).length;
          // Share the module-level threshold (was a hard-coded 0.3 duplicate).
          const isCJK = cjkCount > text.length * CJK_LANG_THRESHOLD;
          const speed = this.callbacks.getPlaybackSpeed?.() ?? 1;
          const rawMs = isCJK
            ? Math.max(2000, text.length * 150)
            : Math.max(2000, text.split(/\s+/).filter(Boolean).length * 240);
          const readingMs = rawMs / speed;
          this.speechTimerStart = Date.now();
          this.speechTimerRemaining = readingMs;
          this.speechTimer = setTimeout(() => {
            this.speechTimer = null;
            this.speechTimerRemaining = 0;
            this.callbacks.onSpeechEnd?.();
            if (this.mode === 'playing') this.processNext();
          }, readingMs);
        };
        this.audioPlayer
          .play(speechAction.audioId || '', speechAction.audioUrl)
          .then((audioStarted) => {
            if (!audioStarted) {
              // No pre-generated audio β€” try browser-native TTS if selected
              const settings = useSettingsStore.getState();
              if (
                settings.ttsEnabled &&
                settings.ttsProviderId === 'browser-native-tts' &&
                typeof window !== 'undefined' &&
                window.speechSynthesis
              ) {
                this.playBrowserTTS(speechAction);
              } else {
                scheduleReadingTimer();
              }
            }
          })
          .catch((err) => {
            log.error('TTS error:', err);
            scheduleReadingTimer();
          });
        break;
      }
      case 'spotlight':
      case 'laser': {
        // Fire-and-forget visual effects via ActionEngine
        this.actionEngine.execute(action);
        this.callbacks.onEffectFire?.({
          kind: action.type,
          targetId: action.elementId,
          ...(action.type === 'spotlight'
            ? { dimOpacity: action.dimOpacity }
            : { color: action.color }),
        } as Effect);
        // Don't block β€” continue immediately
        this.processNext();
        break;
      }
      case 'discussion': {
        const discussionAction = action as DiscussionAction;
        // Check if already consumed
        if (this.consumedDiscussions.has(discussionAction.id)) {
          this.processNext();
          return;
        }
        // Skip if the discussion's agent isn't in the user's selected list
        if (
          discussionAction.agentId &&
          this.callbacks.isAgentSelected &&
          !this.callbacks.isAgentSelected(discussionAction.agentId)
        ) {
          this.consumedDiscussions.add(discussionAction.id);
          this.processNext();
          return;
        }
        // 3s delay before showing ProactiveCard (allows previous speech to finish naturally)
        const trigger: TriggerEvent = {
          id: discussionAction.id,
          question: discussionAction.topic,
          prompt: discussionAction.prompt,
          agentId: discussionAction.agentId,
        };
        this.triggerDelayTimer = setTimeout(() => {
          this.triggerDelayTimer = null;
          if (this.mode !== 'playing') return; // Cancelled if user paused/stopped
          this.currentTrigger = trigger;
          this.callbacks.onProactiveShow?.(trigger);
          // Engine pauses here β€” user calls confirmDiscussion() or skipDiscussion()
        }, 3000);
        break;
      }
      case 'play_video':
      case 'wb_open':
      case 'wb_draw_text':
      case 'wb_draw_shape':
      case 'wb_draw_chart':
      case 'wb_draw_latex':
      case 'wb_draw_table':
      case 'wb_clear':
      case 'wb_delete':
      case 'wb_close': {
        // Synchronous whiteboard actions β€” await completion, then continue
        await this.actionEngine.execute(action);
        if (this.mode === 'playing') {
          this.processNext();
        }
        break;
      }
      default:
        // Unknown action, skip
        this.processNext();
        break;
    }
  }

  // ==================== Browser Native TTS ====================

  /**
   * Split text into sentence-level chunks for sequential playback.
   * Chrome has a bug where utterances >~15s are silently cut off and onend
   * never fires, causing the engine to hang. Chunking avoids this.
   */
  private splitIntoChunks(text: string): string[] {
    // Split on sentence-ending punctuation (Latin + CJK) and newlines
    const chunks = text
      .split(/(?<=[.!?γ€‚οΌοΌŸ\n])\s*/)
      .map((s) => s.trim())
      .filter((s) => s.length > 0);
    // If splitting produced nothing (no punctuation), return the original text
    return chunks.length > 0 ? chunks : [text];
  }

  /**
   * Play text using the Web Speech API (browser-native TTS).
   * Splits text into sentence-level chunks to avoid Chrome's ~15s cutoff.
   * Uses cancel+re-speak for pause/resume (Firefox compatibility).
   */
  private playBrowserTTS(speechAction: SpeechAction): void {
    this.browserTTSChunks = this.splitIntoChunks(speechAction.text);
    this.browserTTSChunkIndex = 0;
    this.browserTTSPausedChunks = [];
    this.browserTTSActive = true;
    this.playBrowserTTSChunk();
  }

  /** Speak the current chunk; on completion, advance to next or finish. */
  private async playBrowserTTSChunk(): Promise<void> {
    if (this.browserTTSChunkIndex >= this.browserTTSChunks.length) {
      // All chunks done
      this.browserTTSActive = false;
      this.browserTTSChunks = [];
      this.callbacks.onSpeechEnd?.();
      if (this.mode === 'playing') this.processNext();
      return;
    }
    const settings = useSettingsStore.getState();
    const chunkText = this.browserTTSChunks[this.browserTTSChunkIndex];
    const utterance = new SpeechSynthesisUtterance(chunkText);
    // Apply settings
    const speed = this.callbacks.getPlaybackSpeed?.() ?? 1;
    utterance.rate = (settings.ttsSpeed ?? 1) * speed;
    utterance.volume = settings.ttsMuted ? 0 : (settings.ttsVolume ?? 1);
    // Ensure voices are loaded (Chrome loads them asynchronously)
    const voices = await this.ensureVoicesLoaded();
    // Set voice: try user's configured voice, fall back to auto-detect language
    let voiceFound = false;
    if (settings.ttsVoice && settings.ttsVoice !== 'default') {
      const voice = voices.find((v) => v.voiceURI === settings.ttsVoice);
      if (voice) {
        utterance.voice = voice;
        utterance.lang = voice.lang;
        voiceFound = true;
      }
    }
    if (!voiceFound) {
      // No usable voice configured β€” detect text language so the browser
      // auto-selects an appropriate voice.
      const cjkRatio =
        (chunkText.match(/[\u4e00-\u9fff\u3400-\u4dbf]/g) || []).length / chunkText.length;
      utterance.lang = cjkRatio > CJK_LANG_THRESHOLD ? 'zh-CN' : 'en-US';
    }
    utterance.onend = () => {
      this.browserTTSChunkIndex++;
      if (this.mode === 'playing') {
        this.playBrowserTTSChunk(); // next chunk
      }
    };
    utterance.onerror = (event) => {
      // 'canceled' is expected when stop/pause is called β€” not a real error
      if (event.error !== 'canceled') {
        log.warn('Browser TTS chunk error:', event.error);
        // Skip failed chunk, try next
        this.browserTTSChunkIndex++;
        if (this.mode === 'playing') {
          this.playBrowserTTSChunk();
        }
      }
      // On 'canceled': do nothing β€” pause handler already saved state
    };
    window.speechSynthesis.speak(utterance);
  }

  /**
   * Wait for speechSynthesis voices to load (Chrome loads them asynchronously).
   * Caches result so subsequent calls return immediately.
   */
  private cachedVoices: SpeechSynthesisVoice[] | null = null;
  private async ensureVoicesLoaded(): Promise<SpeechSynthesisVoice[]> {
    if (this.cachedVoices && this.cachedVoices.length > 0) {
      return this.cachedVoices;
    }
    let voices = window.speechSynthesis.getVoices();
    if (voices.length > 0) {
      this.cachedVoices = voices;
      return voices;
    }
    // Chrome: voices load asynchronously β€” wait for the voiceschanged event
    await new Promise<void>((resolve) => {
      const onVoicesChanged = () => {
        window.speechSynthesis.removeEventListener('voiceschanged', onVoicesChanged);
        resolve();
      };
      window.speechSynthesis.addEventListener('voiceschanged', onVoicesChanged);
      // Timeout after 2s to avoid hanging
      setTimeout(() => {
        window.speechSynthesis.removeEventListener('voiceschanged', onVoicesChanged);
        resolve();
      }, 2000);
    });
    voices = window.speechSynthesis.getVoices();
    this.cachedVoices = voices;
    return voices;
  }

  /** Cancel any active browser-native TTS */
  private cancelBrowserTTS(): void {
    if (this.browserTTSActive) {
      this.browserTTSActive = false;
      this.browserTTSChunks = [];
      this.browserTTSChunkIndex = 0;
      this.browserTTSPausedChunks = [];
      window.speechSynthesis?.cancel();
    }
  }
}