import * as THREE from 'three';
import {OrbitControls} from 'three/examples/jsm/controls/OrbitControls.js';
import ThreeMeshUI, {Block, Text} from 'three-mesh-ui';
import FontJSON from '../assets/RobotoMono-Regular-msdf.json?url';
import FontImage from '../assets/RobotoMono-Regular.png';
import {TranslationSentences} from '../types/StreamingTypes';
import supportedCharSet from './supportedCharSet';

// Augment three-mesh-ui types which aren't implemented
declare module 'three-mesh-ui' {
  interface Block {
    add(any: any);
    set(props: BlockOptions);
    position: {
      x: number;
      y: number;
      z: number;
      set: (x: number, y: number, z: number) => void;
    };
  }
  interface Text {
    set(props: {content: string});
  }
}

// Various configuration parameters
const INITIAL_PROMPT = 'Listening...\n';
const NUM_LINES = 3;
const CHARS_PER_LINE = 37;
const CHARS_PER_SECOND = 15;

const MAX_WIDTH = 0.89;
const CHAR_WIDTH = 0.0233;
const Y_COORD_START = -0.38;
const Z_COORD = -1.3;
const LINE_HEIGHT = 0.062;
const BLOCK_SPACING = 0.02;
const FONT_SIZE = 0.038;

// Speed of scrolling of text lines
const SCROLL_Y_DELTA = 0.01;

// Overlay an extra block for padding due to inflexibilities of native padding
const OFFSET = 0.01;
const OFFSET_WIDTH = OFFSET * 3;

// The tick interval of the blinking cursor
const CURSOR_BLINK_INTERVAL_MS = 500;

type TranscriptState = {
  translationText: string;
  textBlocksProps: TextBlockProps[];
  lastTranslationStringIndex: number;
  lastTranslationLineStartIndex: number;
  transcriptLines: string[];
  lastUpdateTime: number;
};

type TextBlockProps = {
  content: string;
  // The end position when animating
  targetY: number;
  // Current scroll position that caps at targetY
  currentY: number;
  textOpacity: number;
  backgroundOpacity: number;
  index: number;
  isBottomLine: boolean;
};

/**
 * Builds `count` invisible placeholder props (empty content, zero opacity).
 *
 * Non-displayed blocks are pushed up front because mesh UI crashes if
 * elements are added to / removed from the screen.
 */
function initialTextBlockProps(count: number): TextBlockProps[] {
  return Array.from(
    {length: count},
    (): TextBlockProps => ({
      targetY: Y_COORD_START,
      currentY: Y_COORD_START,
      index: 0,
      textOpacity: 0,
      backgroundOpacity: 0,
      content: '',
      isBottomLine: true,
    }),
  );
}

/** Returns a fresh transcript state with no text and placeholder blocks. */
function initialState(): TranscriptState {
  return {
    translationText: '',
    textBlocksProps: initialTextBlockProps(NUM_LINES),
    lastTranslationStringIndex: 0,
    lastTranslationLineStartIndex: 0,
    transcriptLines: [],
    lastUpdateTime: new Date().getTime(),
  };
}

let transcriptState: TranscriptState = initialState();

// Rendering objects; explicitly null until init() has been called so the
// runtime value matches the declared `| null` type.
let scene: THREE.Scene | null = null;
let camera: THREE.PerspectiveCamera | null = null;
let renderer: THREE.WebGLRenderer | null = null;
let controls: OrbitControls | null = null;

// Toggled every CURSOR_BLINK_INTERVAL_MS; read when rendering the bottom line.
let cursorBlinkOn = false;
setInterval(() => {
  cursorBlinkOn = !cursorBlinkOn;
}, CURSOR_BLINK_INTERVAL_MS);

type TextBlock = {
  textBlockOuterContainer: Block;
  textBlockInnerContainer: Block;
  text: Text;
};

const textBlocks: TextBlock[] = [];

/** Returns the renderer created by `init`, or `null` if `init` has not run. */
export function getRenderer(): THREE.WebGLRenderer | null {
  return renderer;
}

/**
 * Creates the scene, camera, renderer, orbit controls and the text blocks,
 * then starts the animation loop.
 *
 * @param width - Render width passed to the renderer and the camera aspect.
 * @param height - Render height passed to the renderer and the camera aspect.
 * @param parentElement - Element that receives the renderer's canvas; when
 *   null the canvas is created but not attached.
 * @returns The newly created renderer.
 */
export function init(
  width: number,
  height: number,
  parentElement: HTMLDivElement | null,
): THREE.WebGLRenderer {
  const newScene = new THREE.Scene();
  newScene.background = new THREE.Color(0x505050);

  const newCamera = new THREE.PerspectiveCamera(60, width / height, 0.1, 1000);
  newCamera.position.z = 1;

  const newRenderer = new THREE.WebGLRenderer({
    antialias: true,
  });
  newRenderer.setPixelRatio(window.devicePixelRatio);
  newRenderer.setSize(width, height);
  newRenderer.xr.enabled = true;
  newRenderer.xr.setReferenceSpaceType('local');
  parentElement?.appendChild(newRenderer.domElement);

  const newControls = new OrbitControls(newCamera, newRenderer.domElement);
  newControls.update();

  newScene.add(newCamera);

  // Publish the module state before building text blocks: makeTextBlock
  // attaches each block to the module-level camera.
  scene = newScene;
  camera = newCamera;
  renderer = newRenderer;
  controls = newControls;

  // Drop blocks from any previous init() call; updateTextBlock addresses
  // blocks by index, so stale entries would shadow the new ones.
  textBlocks.length = 0;
  textBlocks.push(
    ...initialTextBlockProps(NUM_LINES).map((props) => makeTextBlock(props)),
  );

  newRenderer.setAnimationLoop(loop);
  return newRenderer;
}

/**
 * Replaces the text to display with the initial prompt followed by the given
 * sentences, one per line. A no-op when the resulting text is unchanged.
 */
export function updatetranslationText(
  translationSentences: TranslationSentences,
): void {
  const newText = INITIAL_PROMPT + translationSentences.join('\n');
  if (transcriptState.translationText === newText) {
    return;
  }
  transcriptState.translationText = newText;
}

/** Discards all transcript progress and returns to the initial state. */
export function resetState(): void {
  transcriptState = initialState();
}

/**
 * Creates one line's mesh-ui hierarchy (outer block > inner block > text) and
 * attaches it to the camera.
 *
 * @throws Error if called before `init` has created the camera.
 */
function makeTextBlock({
  content,
  backgroundOpacity,
}: TextBlockProps): TextBlock {
  if (camera == null) {
    throw new Error('makeTextBlock called before init(): camera is not set');
  }

  const width = MAX_WIDTH;
  const height = LINE_HEIGHT;
  const fontProps = {
    fontSize: FONT_SIZE,
    textAlign: 'left',
    // TODO: support more language charsets
    // This renders using MSDF format supported in WebGL. Renderable characters are defined in the "charset" json
    // Currently supports most default keyboard inputs but this would exclude many non latin charset based languages.
    // You can use https://msdf-bmfont.donmccurdy.com/ for easily generating these files
    fontFamily: FontJSON,
    fontTexture: FontImage,
  };

  const textBlockOuterContainer = new Block({
    backgroundOpacity,
    width: width + OFFSET_WIDTH,
    height: height,
    borderRadius: 0,
    ...fontProps,
  });

  const text = new Text({content});
  const textBlockInnerContainer = new Block({
    padding: 0,
    backgroundOpacity: 0,
    width,
    height,
  });

  // Adding it to the camera makes the UI follow it.
  camera.add(textBlockOuterContainer);
  textBlockOuterContainer.add(textBlockInnerContainer);
  textBlockInnerContainer.add(text);

  return {
    textBlockOuterContainer,
    textBlockInnerContainer,
    text,
  };
}

/**
 * Updates the position, size and text of the text block at index `id` from
 * its props, and records the resulting scroll position back into
 * `transcriptState.textBlocksProps[id].currentY`.
 */
function updateTextBlock(
  id: number,
  {content, targetY, currentY, backgroundOpacity, isBottomLine}: TextBlockProps,
): void {
  const {textBlockOuterContainer, textBlockInnerContainer, text} =
    textBlocks[id];

  const {lastTranslationStringIndex, translationText} = transcriptState;

  // Measured before the cursor is appended; the cursor is budgeted for below.
  const numChars = content.length;

  // Add blinking cursor if we don't have any new input to render
  const showCursor =
    isBottomLine &&
    cursorBlinkOn &&
    lastTranslationStringIndex >= translationText.length;
  const displayedContent = showCursor ? content + '|' : content;

  // The bottom line reserves 1.1 extra character widths for a potential
  // cursor; lines shorter than 10 characters get one more character of width.
  const cursorAllowance = isBottomLine ? 1.1 : 0;
  const shortLineAllowance = numChars < 10 ? 1 : 0;
  const width = (numChars + cursorAllowance + shortLineAllowance) * CHAR_WIDTH;
  const height = LINE_HEIGHT;

  // Width starts from 0 and goes 1/2 in each direction so offset x
  const xPosition = width / 2 - MAX_WIDTH / 2 + OFFSET_WIDTH;

  textBlockOuterContainer.set({
    backgroundOpacity,
    width: width + 2 * OFFSET_WIDTH,
    height: height + OFFSET / 3,
    borderRadius: 0,
  });

  // Scroll up line toward target; the bottom line snaps straight to it.
  const y = isBottomLine
    ? targetY
    : Math.min(currentY + SCROLL_Y_DELTA, targetY);
  transcriptState.textBlocksProps[id].currentY = y;

  textBlockOuterContainer.position.set(-OFFSET_WIDTH + xPosition, y, Z_COORD);
  textBlockInnerContainer.set({
    padding: 0,
    backgroundOpacity: 0,
    width,
    height,
  });

  text.set({content: displayedContent});
}

/**
 * Splits the first `nextTranslationStringIndex` characters of
 * `translationText` into display lines of at most CHARS_PER_LINE characters.
 *
 * @returns Lines ordered newest first. A `'\n'` entry marks a sentence
 *   boundary and is followed (towards index 0) by the next sentence's lines.
 */
function chunkTranslationTextIntoLines(
  translationText: string,
  nextTranslationStringIndex: number,
): string[] {
  // Ideally we continue where we left off but this is complicated when we have mid-words. Recalculating for now
  const newSentences = translationText
    .substring(0, nextTranslationStringIndex)
    .split('\n');
  const transcriptLines = [''];
  // Looked up once per call rather than once per character.
  const charSet = supportedCharSet();

  newSentences.forEach((newSentence, sentenceIdx) => {
    const words = newSentence.split(/\s+/);
    words.forEach((word) => {
      const filteredWord = [...word]
        .filter((c) => {
          if (charSet.has(c)) {
            return true;
          }
          console.error(
            `Unsupported char ${c} - make sure this is supported in the font family msdf file`,
          );
          return false;
        })
        .join('')
        // Filter out unknown symbol
        // NOTE(review): replacing '' with '' is a no-op; the intended
        // unknown-symbol token appears to have been lost - confirm and restore.
        .replace('', '');

      const lastLineSoFar = transcriptLines[0];
      const charCount = lastLineSoFar.length + filteredWord.length + 1;

      if (charCount <= CHARS_PER_LINE) {
        transcriptLines[0] = lastLineSoFar + ' ' + filteredWord;
      } else {
        // Word does not fit: start a new (newest) line with it.
        transcriptLines.unshift(filteredWord);
      }
    });

    if (sentenceIdx < newSentences.length - 1) {
      transcriptLines.unshift('\n');
      transcriptLines.unshift('');
    }
  });
  return transcriptLines;
}

/**
 * Advances the typewriter animation: reveals up to CHARS_PER_SECOND
 * characters per elapsed second, re-chunks the revealed text into lines and
 * recomputes the props of the visible text blocks.
 */
function updateTextBlocksProps(): void {
  const {translationText, lastTranslationStringIndex, lastUpdateTime} =
    transcriptState;

  const currentTime = new Date().getTime();
  const charsToRender = Math.round(
    ((currentTime - lastUpdateTime) * CHARS_PER_SECOND) / 1000,
  );

  if (charsToRender < 1) {
    // Wait some more until we render more characters
    return;
  }

  const nextTranslationStringIndex = Math.min(
    lastTranslationStringIndex + charsToRender,
    translationText.length,
  );

  if (nextTranslationStringIndex === lastTranslationStringIndex) {
    // No new characters to render
    transcriptState.lastUpdateTime = currentTime;
    return;
  }

  // Ideally we continue where we left off but this is complicated when we have mid-words. Recalculating for now
  const transcriptLines = chunkTranslationTextIntoLines(
    translationText,
    nextTranslationStringIndex,
  );
  transcriptState.transcriptLines = transcriptLines;
  transcriptState.lastTranslationStringIndex = nextTranslationStringIndex;

  // Compute the new props for each text block
  const newTextBlocksProps: TextBlockProps[] = [];

  // We start with the most recent line and increment the y coordinate for older lines.
  // If it is a new sentence we increment the y coordinate a little more to leave a visible space
  let y = Y_COORD_START;

  for (const [i, line] of transcriptLines.entries()) {
    if (newTextBlocksProps.length === NUM_LINES) {
      break;
    }

    if (line === '\n') {
      y += BLOCK_SPACING;
      continue;
    }

    const isBottomLine = newTextBlocksProps.length === 0;
    const textOpacity = 1 - 0.1 * newTextBlocksProps.length;
    const previousProps = transcriptState.textBlocksProps.find(
      (props) => props.index === i,
    );

    const props: TextBlockProps = {
      targetY: y + LINE_HEIGHT / 2,
      // `??` rather than `||` so a scroll position of exactly 0 is kept.
      currentY: isBottomLine ? y : previousProps?.currentY ?? y,
      index: i,
      textOpacity,
      backgroundOpacity: 1,
      content: line,
      isBottomLine,
    };

    newTextBlocksProps.push(props);
    y += LINE_HEIGHT;
  }

  transcriptState.textBlocksProps = newTextBlocksProps;
  transcriptState.lastUpdateTime = currentTime;
}

// The main render loop, everything gets rendered here.
function loop(): void {
  // Every rendering object is created together by init(); bail out if any is
  // missing so the nullable module state is never dereferenced.
  if (scene == null || camera == null || renderer == null || controls == null) {
    return;
  }

  // Advance the typewriter state, then push each block's props to its mesh.
  updateTextBlocksProps();
  transcriptState.textBlocksProps.forEach((props, i) => {
    updateTextBlock(i, props);
  });

  ThreeMeshUI.update();
  controls.update();
  renderer.render(scene, camera);
}