import {useCallback, useEffect, useLayoutEffect, useRef, useState} from 'react'; import Button from '@mui/material/Button'; import Typography from '@mui/material/Typography'; import InputLabel from '@mui/material/InputLabel'; import FormControl from '@mui/material/FormControl'; import Select, {SelectChangeEvent} from '@mui/material/Select'; import MenuItem from '@mui/material/MenuItem'; import Stack from '@mui/material/Stack'; import seamlessLogoUrl from './assets/seamless.svg'; import { AgentCapabilities, BaseResponse, BrowserAudioStreamConfig, DynamicConfig, PartialDynamicConfig, SUPPORTED_INPUT_SOURCES, SUPPORTED_OUTPUT_MODES, ServerExceptionData, ServerSpeechData, ServerState, ServerTextData, StartStreamEventConfig, StreamingStatus, SupportedInputSource, SupportedOutputMode, TranslationSentences, } from './types/StreamingTypes'; import FormLabel from '@mui/material/FormLabel'; import RadioGroup from '@mui/material/RadioGroup'; import FormControlLabel from '@mui/material/FormControlLabel'; import Radio from '@mui/material/Radio'; import './StreamingInterface.css'; import RoomConfig from './RoomConfig'; import Divider from '@mui/material/Divider'; import {useSocket} from './useSocket'; import {RoomState} from './types/RoomState'; import useStable from './useStable'; import float32To16BitPCM from './float32To16BitPCM'; import createBufferedSpeechPlayer from './createBufferedSpeechPlayer'; import Checkbox from '@mui/material/Checkbox'; import Alert from '@mui/material/Alert'; import ISO6391 from 'iso-639-1'; import isScrolledToDocumentBottom from './isScrolledToDocumentBottom'; import Box from '@mui/material/Box'; import Slider from '@mui/material/Slider'; import VolumeDown from '@mui/icons-material/VolumeDown'; import VolumeUp from '@mui/icons-material/VolumeUp'; import Mic from '@mui/icons-material/Mic'; import MicOff from '@mui/icons-material/MicOff'; import XRDialog from './react-xr/XRDialog'; import getTranslationSentencesFromReceivedData from 
'./getTranslationSentencesFromReceivedData';
import {
  sliceTranslationSentencesUpToIndex,
  getTotalSentencesLength,
} from './sliceTranslationSentencesUtils';
import Blink from './Blink';
import {CURSOR_BLINK_INTERVAL_MS} from './cursorBlinkInterval';
import {getURLParams} from './URLParams';
import debug from './debug';
import DebugSection from './DebugSection';
import Switch from '@mui/material/Switch';

/**
 * Browser audio-processing constraints used for each supported input source
 * when the caller does not provide its own configuration.
 */
const AUDIO_STREAM_DEFAULTS: {
  [key in SupportedInputSource]: BrowserAudioStreamConfig;
} = {
  userMedia: {
    noiseSuppression: true,
    echoCancellation: false,
  },
  displayMedia: {
    noiseSuppression: false,
    echoCancellation: false,
  },
};

/**
 * Requests a mono audio stream via `getUserMedia`.
 *
 * @param config Audio constraints merged into the request; defaults to
 *   `AUDIO_STREAM_DEFAULTS.userMedia`.
 * @returns The stream resolved by `getUserMedia`.
 * @throws Whatever `getUserMedia` rejects with (e.g. permission denied).
 */
async function requestUserMediaAudioStream(
  config: BrowserAudioStreamConfig = AUDIO_STREAM_DEFAULTS.userMedia,
): Promise<MediaStream> {
  const stream = await navigator.mediaDevices.getUserMedia({
    audio: {...config, channelCount: 1},
  });
  console.debug(
    '[requestUserMediaAudioStream] stream created with settings:',
    stream.getAudioTracks()[0]?.getSettings(),
  );
  return stream;
}

/**
 * Requests a mono audio stream via `getDisplayMedia`.
 *
 * @param config Audio constraints merged into the request; defaults to
 *   `AUDIO_STREAM_DEFAULTS.displayMedia`.
 * @returns The stream resolved by `getDisplayMedia`.
 * @throws Whatever `getDisplayMedia` rejects with, or an `Error` when the
 *   captured surface carries no audio track.
 */
async function requestDisplayMediaAudioStream(
  config: BrowserAudioStreamConfig = AUDIO_STREAM_DEFAULTS.displayMedia,
): Promise<MediaStream> {
  const stream = await navigator.mediaDevices.getDisplayMedia({
    audio: {...config, channelCount: 1},
    // selfBrowserSurface: false, // don't allow the user to select the current tab as the source
  });

  // A display capture can be granted without audio. Creating a
  // MediaStreamAudioSourceNode from such a stream throws, so fail here with a
  // clear message and release the capture instead of returning an unusable stream.
  if (stream.getAudioTracks().length === 0) {
    stream.getTracks().forEach((track) => track.stop());
    throw new Error(
      '[requestDisplayMediaAudioStream] the captured stream has no audio track',
    );
  }

  console.debug(
    '[requestDisplayMediaAudioStream] stream created with settings:',
    stream.getAudioTracks()[0]?.getSettings(),
  );
  return stream;
}

/** Label of the start/stop button for each streaming status. */
const buttonLabelMap: {[key in StreamingStatus]: string} = {
  stopped: 'Start Streaming',
  running: 'Stop Streaming',
  starting: 'Starting...',
};

// Sent to the server as `buffer_limit` when configuring a stream.
const BUFFER_LIMIT = 1;

// Distance from the bottom of the document (in px) that still counts as "scrolled to bottom".
const SCROLLED_TO_BOTTOM_THRESHOLD_PX = 36;

// Slider positions above 1 are stretched by this factor before being used as gain.
const GAIN_MULTIPLIER_OVER_1 = 3;

/**
 * Maps a volume-slider position to a gain value: positions up to 1 are used
 * as-is, and the part above 1 is multiplied by GAIN_MULTIPLIER_OVER_1.
 */
const getGainScaledValue = (value: number) =>
  value > 1 ?
(value - 1) * GAIN_MULTIPLIER_OVER_1 + 1 : value;

// A warning is shown once the server reports at least this many active transcoders.
const TOTAL_ACTIVE_TRANSCODER_WARNING_THRESHOLD = 2;

// Only the most recent exceptions up to this count are kept in state.
const MAX_SERVER_EXCEPTIONS_TRACKED = 500;

// Delay between two steps of the transcript typing animation.
export const TYPING_ANIMATION_DELAY_MS = 6;

/**
 * Streaming translation UI.
 *
 * Captures audio from the microphone or a display surface, sends it to the
 * server over the socket as 16-bit PCM (`incoming_audio`), and renders the
 * `translation_text` / `translation_speech` events it receives back.
 *
 * NOTE(review): this copy of the file lost everything between `<` and `>`.
 * The generic type arguments below were reconstructed from how each value is
 * used in this file; confirm them against the upstream source.
 */
export default function StreamingInterface() {
  const urlParams = getURLParams();
  const debugParam = urlParams.debug;
  const animateTextDisplay = urlParams.animateTextDisplay;

  const socketObject = useSocket();
  const {socket, clientID} = socketObject;

  const [serverState, setServerState] = useState<ServerState | null>(null);
  const [agent, setAgent] = useState<AgentCapabilities | null>(null);
  const model = agent?.name ?? null;
  const agentsCapabilities: Array<AgentCapabilities> =
    serverState?.agentsCapabilities ?? [];
  const currentAgent: AgentCapabilities | null =
    agentsCapabilities.find((candidate) => candidate.name === model) ?? null;

  const [serverExceptions, setServerExceptions] = useState<
    Array<ServerExceptionData>
  >([]);
  // NOTE(review): no setter call is visible in this file; `string | null` is an assumption.
  const [connectionError, setConnectionError] = useState<string | null>(null);
  const [roomState, setRoomState] = useState<RoomState | null>(null);
  const roomID = roomState?.room_id ?? null;
  const isSpeaker =
    (clientID != null && roomState?.speakers.includes(clientID)) ?? false;
  const isListener =
    (clientID != null && roomState?.listeners.includes(clientID)) ?? false;

  const [streamingStatus, setStreamingStatus] =
    useState<StreamingStatus>('stopped');

  // True only while the server has acknowledged `configure_stream` for the
  // current session; audio is not emitted while this is false.
  const isStreamConfiguredRef = useRef<boolean>(false);

  const [outputMode, setOutputMode] = useState<SupportedOutputMode>('s2s&t');
  const [inputSource, setInputSource] =
    useState<SupportedInputSource>('userMedia');
  const [enableNoiseSuppression, setEnableNoiseSuppression] = useState<
    boolean | null
  >(null);

  // Dynamic Params:
  // NOTE(review): assumes target languages are strings — confirm against AgentCapabilities.
  const [targetLang, setTargetLang] = useState<string | null>(null);
  const [enableExpressive, setEnableExpressive] = useState<boolean | null>(
    null,
  );

  const [serverDebugFlag, setServerDebugFlag] = useState<boolean>(
    debugParam ?? false,
  );

  const [receivedData, setReceivedData] = useState<Array<ServerTextData>>([]);
  // const [translationSentencesAnimated, setTranslationSentencesAnimated] =
  //   useState([]);
  const [
    translationSentencesAnimatedIndex,
    setTranslationSentencesAnimatedIndex,
  ] = useState(0);

  const lastTranslationResultRef = useRef<HTMLDivElement | null>(null);

  const [inputStream, setInputStream] = useState<MediaStream | null>(null);
  const [inputStreamSource, setInputStreamSource] =
    useState<MediaStreamAudioSourceNode | null>(null);
  const audioContext = useStable(() => new AudioContext());
  const [scriptNodeProcessor, setScriptNodeProcessor] =
    useState<ScriptProcessorNode | null>(null);

  const [muted, setMuted] = useState(false);
  // The onaudioprocess script needs an up-to-date reference to the muted state, so
  // we use a ref here and keep it in sync via useEffect
  const mutedRef = useRef(muted);
  useEffect(() => {
    mutedRef.current = muted;
  }, [muted]);

  const [gain, setGain] = useState(1);

  const isScrolledToBottomRef = useRef(isScrolledToDocumentBottom());

  // Some config options must be set when starting streaming and cannot be changed dynamically.
  // This controls whether they are disabled or not
  const streamFixedConfigOptionsDisabled =
    streamingStatus !== 'stopped' || roomID == null;

  const bufferedSpeechPlayer = useStable(() => {
    const player = createBufferedSpeechPlayer({
      onStarted: () => {
        console.debug('📢 PLAYBACK STARTED 📢');
      },
      onEnded: () => {
        console.debug('🛑 PLAYBACK ENDED 🛑');
      },
    });

    // Start the player now so it eagerly plays audio when it arrives
    player.start();
    return player;
  });

  const translationSentencesBase: TranslationSentences =
    getTranslationSentencesFromReceivedData(receivedData);

  const translationSentencesBaseTotalLength = getTotalSentencesLength(
    translationSentencesBase,
  );

  const translationSentences: TranslationSentences = animateTextDisplay
    ? sliceTranslationSentencesUpToIndex(
        translationSentencesBase,
        translationSentencesAnimatedIndex,
      )
    : translationSentencesBase;

  // We want the blinking cursor to show before any text has arrived, so let's add an empty string so that the cursor shows up
  const translationSentencesWithEmptyStartingString =
    streamingStatus === 'running' && translationSentences.length === 0
      ? ['']
      : translationSentences;

  /******************************************
   * Event Handlers
   ******************************************/

  // Selects a new agent; when the agent name changes, the target language is
  // reset to the new agent's first target language and the expressive flag is cleared.
  const setAgentAndUpdateParams = useCallback(
    (newAgent: AgentCapabilities | null) => {
      setAgent((prevAgent) => {
        if (prevAgent?.name !== newAgent?.name) {
          setTargetLang(newAgent?.targetLangs[0] ?? null);
          setEnableExpressive(null);
          // setOutputMode(newAgent.modalities[0]);
        }
        return newAgent;
      });
    },
    [],
  );

  // Emits `set_dynamic_config` and settles once the server acknowledges it.
  // Rejects with an Error when the socket is missing or the status is not 'ok'.
  const onSetDynamicConfig = useCallback(
    (partialConfig: PartialDynamicConfig): Promise<void> => {
      return new Promise<void>((resolve, reject) => {
        if (socket == null) {
          reject(new Error('[onSetDynamicConfig] socket is null '));
          return;
        }

        socket.emit(
          'set_dynamic_config',
          partialConfig,
          (result: BaseResponse) => {
            console.log('[emit result: set_dynamic_config]', result);
            if (result.status === 'ok') {
              resolve();
            } else {
              // Always reject with an Error so callers get a message and a stack.
              reject(
                new Error(
                  `[onSetDynamicConfig] set_dynamic_config returned status: ${result.status}`,
                ),
              );
            }
          },
        );
      });
    },
    [socket],
  );

  // Emits `configure_stream` for the selected agent and resolves once the
  // server answers with status 'ok'; isStreamConfiguredRef mirrors the outcome.
  const configureStreamAsync = ({sampleRate}: {sampleRate: number}) => {
    return new Promise<void>((resolve, reject) => {
      if (socket == null) {
        reject(new Error('[configureStreamAsync] socket is null '));
        return;
      }
      const modelName = agent?.name ?? null;
      if (modelName == null) {
        reject(new Error('[configureStreamAsync] modelName is null '));
        return;
      }

      const config: StartStreamEventConfig = {
        event: 'config',
        rate: sampleRate,
        model_name: modelName,
        // source_language: inputLang,
        debug: serverDebugFlag,
        // synchronous processing isn't implemented on the v2 pubsub server, so hardcode this to true
        async_processing: true,
        buffer_limit: BUFFER_LIMIT,
        model_type: outputMode,
      };

      console.log('[configureStreamAsync] sending config', config);

      socket.emit('configure_stream', config, (statusObject) => {
        if (statusObject.status === 'ok') {
          isStreamConfiguredRef.current = true;
          console.debug(
            '[configureStreamAsync] stream configured!',
            statusObject,
          );
          resolve();
        } else {
          isStreamConfiguredRef.current = false;
          reject(
            new Error(
              `[configureStreamAsync] configure_stream returned status: ${statusObject.status}`,
            ),
          );
          return;
        }
      });
    });
  };

  // Acquires the input stream, wires it through a ScriptProcessor that emits
  // `incoming_audio`, then pushes the dynamic config and configures the stream.
  // On any failure the status returns to 'stopped' and acquired resources are released.
  const startStreaming = async () => {
    if (streamingStatus !== 'stopped') {
      console.warn(
        `Attempting to start stream when status is ${streamingStatus}`,
      );
      return;
    }

    setStreamingStatus('starting');

    // A previous session may have left this set; no audio may be emitted
    // until the server acknowledges the configuration of this session.
    isStreamConfiguredRef.current = false;

    if (audioContext.state === 'suspended') {
      console.warn('audioContext was suspended! resuming...');
      await audioContext.resume();
    }

    let stream: MediaStream | null = null;

    try {
      if (inputSource === 'userMedia') {
        stream = await requestUserMediaAudioStream({
          noiseSuppression:
            enableNoiseSuppression ??
            AUDIO_STREAM_DEFAULTS['userMedia'].noiseSuppression,
          echoCancellation: false,
        });
      } else if (inputSource === 'displayMedia') {
        stream = await requestDisplayMediaAudioStream({
          noiseSuppression:
            enableNoiseSuppression ??
            AUDIO_STREAM_DEFAULTS['displayMedia'].noiseSuppression,
          echoCancellation: false,
        });
      } else {
        throw new Error(`Unsupported input source requested: ${inputSource}`);
      }
      setInputStream(stream);
    } catch (e) {
      console.error('[startStreaming] media stream request failed:', e);
      setStreamingStatus('stopped');
      return;
    }

    const activeStream: MediaStream = stream;
    const releaseStream = () => {
      // Stop every track so the browser drops its recording/capture indicator.
      activeStream.getTracks().forEach((track) => track.stop());
      setInputStream(null);
    };

    let mediaStreamSource: MediaStreamAudioSourceNode;
    try {
      // Throws when the stream has no audio track (e.g. a display capture
      // that was shared without audio).
      mediaStreamSource = audioContext.createMediaStreamSource(activeStream);
    } catch (e) {
      console.error(
        '[startStreaming] unable to create an audio source from the stream:',
        e,
      );
      releaseStream();
      setStreamingStatus('stopped');
      return;
    }
    setInputStreamSource(mediaStreamSource);

    /**
     * NOTE: This currently uses a deprecated way of processing the audio (createScriptProcessor).
     *
     * Documentation for the deprecated way of doing it is here: https://developer.mozilla.org/en-US/docs/Web/API/BaseAudioContext/createScriptProcessor
     *
     * This should be migrated to something like this SO answer: https://stackoverflow.com/a/65448287
     */
    const scriptProcessor = audioContext.createScriptProcessor(16384, 1, 1);
    setScriptNodeProcessor(scriptProcessor);

    scriptProcessor.onaudioprocess = (event) => {
      if (isStreamConfiguredRef.current === false) {
        console.debug('[onaudioprocess] stream is not configured yet!');
        return;
      }
      if (socket == null) {
        console.warn('[onaudioprocess] socket is null in onaudioprocess');
        return;
      }

      // console.debug('[onaudioprocess] event', event);

      if (mutedRef.current) {
        // We still want to send audio to the server when we're muted to ensure we
        // get any remaining audio back from the server, so let's pass an array length 1 with a value of 0
        const mostlyEmptyInt16Array = new Int16Array(1);
        socket.emit('incoming_audio', mostlyEmptyInt16Array);
      } else {
        const float32Audio = event.inputBuffer.getChannelData(0);
        const pcm16Audio = float32To16BitPCM(float32Audio);
        socket.emit('incoming_audio', pcm16Audio);
      }

      debug()?.sentAudio(event);
    };

    mediaStreamSource.connect(scriptProcessor);
    scriptProcessor.connect(audioContext.destination);

    bufferedSpeechPlayer.start();

    try {
      if (targetLang == null) {
        throw new Error('[startStreaming] targetLang cannot be nullish');
      }

      // When we are starting the stream we want to pass all the dynamic config values
      // available before actually configuring and starting the stream
      const fullDynamicConfig: DynamicConfig = {
        targetLanguage: targetLang,
        expressive: enableExpressive,
      };

      await onSetDynamicConfig(fullDynamicConfig);

      // NOTE: this needs to be the *audioContext* sample rate, not the sample rate of the input stream. Not entirely sure why.
      await configureStreamAsync({
        sampleRate: audioContext.sampleRate,
      });
    } catch (e) {
      console.error('configureStreamAsync failed', e);

      // The status goes back to 'stopped', so stopStreaming() will refuse to
      // run. Tear down the audio graph and release the input here, otherwise
      // the capture stays open and the processor keeps firing.
      isStreamConfiguredRef.current = false;
      scriptProcessor.onaudioprocess = null;
      mediaStreamSource.disconnect(scriptProcessor);
      scriptProcessor.disconnect(audioContext.destination);
      releaseStream();
      setInputStreamSource(null);
      setScriptNodeProcessor(null);

      setStreamingStatus('stopped');
      return;
    }

    setStreamingStatus('running');
  };

  // Stops playback, tears down the audio graph, releases the input stream and
  // tells the server to stop the stream.
  const stopStreaming = useCallback(async () => {
    if (streamingStatus === 'stopped') {
      console.warn(
        `Attempting to stop stream when status is ${streamingStatus}`,
      );
      return;
    }

    // The next session must be configured again before any audio is emitted.
    isStreamConfiguredRef.current = false;

    // Stop the speech playback right away
    bufferedSpeechPlayer.stop();

    if (inputStreamSource == null || scriptNodeProcessor == null) {
      console.error(
        'inputStreamSource || scriptNodeProcessor is null in stopStreaming',
      );
    } else {
      inputStreamSource.disconnect(scriptNodeProcessor);
      scriptNodeProcessor.disconnect(audioContext.destination);

      // From: https://stackoverflow.com/questions/65447236/scriptnode-onaudioprocess-is-deprecated-any-alternative
      // do we also need this??
      // recorder?.stop();
    }

    // Release the mic input so we stop showing the red recording icon in the browser.
    // Done unconditionally so the input is released even if the nodes are missing.
    inputStream?.getTracks().forEach((track) => track.stop());

    if (socket == null) {
      console.warn('Unable to emit stop_stream because socket is null');
    } else {
      socket.emit('stop_stream', (result) => {
        console.debug('[emit result: stop_stream]', result);
      });
    }

    setStreamingStatus('stopped');
  }, [
    audioContext.destination,
    bufferedSpeechPlayer,
    inputStream,
    inputStreamSource,
    scriptNodeProcessor,
    socket,
    streamingStatus,
  ]);

  const onClearTranscriptForAll = useCallback(() => {
    if (socket != null) {
      socket.emit('clear_transcript_for_all');
    }
  }, [socket]);

  /******************************************
   * Effects
   ******************************************/

  // Mirror `room_state_update` events into state.
  useEffect(() => {
    if (socket == null) {
      return;
    }

    const onRoomStateUpdate = (roomState: RoomState) => {
      // console.log('[event: room_state_update]', roomState);
      setRoomState(roomState);
    };

    socket.on('room_state_update', onRoomStateUpdate);

    return () => {
      socket.off('room_state_update', onRoomStateUpdate);
    };
  }, [socket]);

  // Collect translated text and hand translated speech to the player.
  useEffect(() => {
    if (socket != null) {
      const onTranslationText = (data: ServerTextData) => {
        setReceivedData((prev) => [...prev, data]);
        debug()?.receivedText(data.payload);
      };

      const onTranslationSpeech = (data: ServerSpeechData) => {
        bufferedSpeechPlayer.addAudioToBuffer(data.payload, data.sample_rate);
      };

      socket.on('translation_text', onTranslationText);
      socket.on('translation_speech', onTranslationSpeech);

      return () => {
        socket.off('translation_text', onTranslationText);
        socket.off('translation_speech', onTranslationSpeech);
      };
    }
  }, [bufferedSpeechPlayer, socket]);

  // Track server state; yield to a server lock held by another client and
  // pick the first advertised agent when none is selected yet.
  useEffect(() => {
    if (socket != null) {
      const onServerStateUpdate = (newServerState: ServerState) => {
        setServerState(newServerState);

        // If a client creates a server lock, we want to stop streaming if we're not them
        if (
          newServerState.serverLock?.isActive === true &&
          newServerState.serverLock?.clientID !== clientID &&
          streamingStatus === 'running'
        ) {
          stopStreaming();
        }

        const firstAgentNullable = newServerState.agentsCapabilities[0];
        if (agent == null && firstAgentNullable != null) {
          setAgentAndUpdateParams(firstAgentNullable);
        }
      };

      socket.on('server_state_update', onServerStateUpdate);

      return () => {
        socket.off('server_state_update', onServerStateUpdate);
      };
    }
  }, [
    agent,
    clientID,
    setAgentAndUpdateParams,
    socket,
    stopStreaming,
    streamingStatus,
  ]);

  // Record server exceptions (newest first, capped) and log them.
  useEffect(() => {
    if (socket != null) {
      const onServerException = (
        exceptionDataWithoutClientTime: ServerExceptionData,
      ) => {
        const exceptionData = {
          ...exceptionDataWithoutClientTime,
          timeStringClient: new Date(
            exceptionDataWithoutClientTime['timeEpochMs'],
          ).toLocaleString(),
        };

        setServerExceptions((prev) =>
          [exceptionData, ...prev].slice(0, MAX_SERVER_EXCEPTIONS_TRACKED),
        );
        console.error(
          `[server_exception] The server encountered an exception: ${exceptionData['message']}`,
          exceptionData,
        );
      };

      socket.on('server_exception', onServerException);

      return () => {
        socket.off('server_exception', onServerException);
      };
    }
  }, [socket]);

  // Reset the transcript (and the typing animation) when the server asks for it.
  useEffect(() => {
    if (socket != null) {
      const onClearTranscript = () => {
        setReceivedData([]);
        setTranslationSentencesAnimatedIndex(0);
      };

      socket.on('clear_transcript', onClearTranscript);

      return () => {
        socket.off('clear_transcript', onClearTranscript);
      };
    }
  }, [socket]);

  // Remember whether the user is currently scrolled to the bottom of the document.
  useEffect(() => {
    const onScroll = () => {
      if (isScrolledToDocumentBottom(SCROLLED_TO_BOTTOM_THRESHOLD_PX)) {
        // console.debug('scrolled to bottom!');
        isScrolledToBottomRef.current = true;
        return;
      }
      // console.debug('NOT scrolled to bottom!');
      isScrolledToBottomRef.current = false;
      return;
    };

    document.addEventListener('scroll', onScroll);

    return () => {
      document.removeEventListener('scroll', onScroll);
    };
  }, []);

  useLayoutEffect(() => {
    if (
      lastTranslationResultRef.current != null &&
      isScrolledToBottomRef.current
    ) {
      // Scroll the div to the most recent entry
      lastTranslationResultRef.current.scrollIntoView();
    }
    // Run the effect every time data is received, so that
    // we scroll to the bottom even if we're just adding text to
    // a pre-existing chunk
  }, [receivedData]);

  // Typing animation: advance the visible index one step at a time until it
  // reaches the total length of the received sentences.
  useEffect(() => {
    if (!animateTextDisplay) {
      return;
    }

    if (
      translationSentencesAnimatedIndex < translationSentencesBaseTotalLength
    ) {
      const timeout = setTimeout(() => {
        setTranslationSentencesAnimatedIndex((prev) => prev + 1);
        debug()?.startRenderText();
      }, TYPING_ANIMATION_DELAY_MS);

      return () => clearTimeout(timeout);
    } else {
      debug()?.endRenderText();
    }
  }, [
    animateTextDisplay,
    translationSentencesAnimatedIndex,
    translationSentencesBaseTotalLength,
  ]);

  /******************************************
   * Sub-components
   ******************************************/

  // NOTE(review): the JSX element markup of `volumeSliderNode`, of
  // `xrDialogComponent`, and of the returned tree is missing from this copy of
  // the file (only attribute tails and text survived). The fragments are kept
  // verbatim below; restore the markup from the original source.
  const volumeSliderNode = (
    `${(value * 100).toFixed(0)}%`}
    valueLabelDisplay="auto"
    value={gain}
    onChange={(_event: Event, newValue: number | number[]) => {
      // console.log({event, newValue});
      if (typeof newValue === 'number') {
        const scaledGain = getGainScaledValue(newValue);
        // We want the actual gain node to use the scaled value
        bufferedSpeechPlayer.setGain(scaledGain);
        // But we want react state to keep track of the non-scaled value
        setGain(newValue);
      } else {
        console.error(
          `[volume slider] Unexpected non-number value: ${newValue}`,
        );
      }
    }}
    />
  );

  const xrDialogComponent = ( );

  return (
Seamless Translation Logo
Seamless Translation
{ // If the user has switched from speaker to listener we need to tell the // player to play eagerly, since currently the listener doesn't have any stop/start controls bufferedSpeechPlayer.start(); }} /> {isListener && !isSpeaker && ( {volumeSliderNode} )} {isSpeaker && ( <> Model Model {`Supported Source Languages: ${ currentAgent?.sourceLangs.join(', ') ?? 'None' }`} Output Target Language setOutputMode(e.target.value as SupportedOutputMode) } name="output-modes-radio-buttons-group"> { // TODO: Use supported modalities from agentCapabilities SUPPORTED_OUTPUT_MODES.map(({value, label}) => ( } label={label} /> )) } {currentAgent?.dynamicParams?.includes( 'expressive', ) && ( , ) => { const newValue = event.target.checked; setEnableExpressive(newValue); onSetDynamicConfig({expressive: newValue}); }} /> } label="Expressive" /> )} {isListener && ( {volumeSliderNode} )}
Input Source ) => setInputSource( e.target.value as SupportedInputSource, ) } name="input-source-radio-buttons-group"> {SUPPORTED_INPUT_SOURCES.map(({label, value}) => ( } label={label} /> ))}
Options , ) => setEnableNoiseSuppression(event.target.checked) } /> } label="Noise Suppression (Browser)" /> , ) => setServerDebugFlag(event.target.checked)} /> } label="Server Debug Flag" />
{streamingStatus === 'stopped' ? ( ) : ( )} {roomID == null ? null : ( {xrDialogComponent} )} {serverExceptions.length > 0 && (
{`The server encountered an exception. See the browser console for details. You may need to refresh the page to continue using the app.`}
)} {serverState != null && serverState.totalActiveTranscoders >= TOTAL_ACTIVE_TRANSCODER_WARNING_THRESHOLD && (
{`The server currently has ${serverState?.totalActiveTranscoders} active streaming sessions. Performance may be degraded.`}
)} {serverState?.serverLock != null && serverState.serverLock.clientID !== clientID && (
{`The server is currently locked by "${serverState.serverLock.name}". Priority will be given to that client when they are streaming, and your streaming session may be halted abruptly.`}
)} )}
{isListener && !isSpeaker && ( {xrDialogComponent} )}
{debugParam && roomID != null && }
Transcript {isSpeaker && ( )}
{translationSentencesWithEmptyStartingString.map( (sentence, index, arr) => { const isLast = index === arr.length - 1; const maybeRef = isLast ? {ref: lastTranslationResultRef} : {}; return (
{sentence} {animateTextDisplay && isLast && ( 0 }> {'|'} )}
); }, )}
); }