import { create } from "zustand"
import { ClapProject, ClapSegment, ClapSegmentCategory, newClap, parseClap } from "@aitube/clap"
import { getVideoPrompt } from "@aitube/engine"

import { LatentEngineStore } from "./types"
import { resolveSegments } from "../resolvers/resolveSegments"
import { fetchLatentClap } from "./generators/fetchLatentClap"
import { InteractiveSegmenterResult } from "@mediapipe/tasks-vision"
import { segmentFrame } from "@/lib/on-device-ai/segmentFrameOnClick"
import { drawSegmentation } from "../utils/canvas/drawSegmentation"
import { getZIndexDepth } from "../utils/data/getZIndexDepth"
import { getSegmentStartAt } from "../utils/data/getSegmentStartAt"
import { getElementsSortedByStartAt } from "../utils/data/getElementsSortedByStartAt"
import { getSegmentEndAt } from "../utils/data/getSegmentEndAt"
import { setZIndexDepthId } from "../utils/data/setZIndexDepth"
import { setSegmentStartAt } from "../utils/data/setSegmentStartAt"
import { setSegmentEndAt } from "../utils/data/setSegmentEndAt"
import { setSegmentId } from "../utils/data/setSegmentId"

export const useLatentEngine = create<LatentEngineStore>((set, get) => ({
  jwtToken: "",

  width: 512,
  height: 288,

  clap: newClap(),
  debug: true,

  headless: false, // false by default
  isLoop: false,
  isStatic: false,
  isLive: false,
  isInteractive: false,

  isLoading: false, // true when a .clap is being downloaded and/or generated
  isLoaded: false, // true if a clap is loaded
  isPlaying: false,
  isPaused: true,

  // our "this is AI.. gasp!" disclaimer
  hasDisclaimer: true,
  hasPresentedDisclaimer: false,

  videoSimulationPromise: undefined,
  videoSimulationPending: false,
  videoSimulationStartedAt: performance.now(),
  videoSimulationEndedAt: performance.now(),
  videoSimulationDurationInMs: 0,
  videoSimulationVideoPlaybackFPS: 0,
  videoSimulationRenderingTimeFPS: 0,

  interfaceSimulationPromise: undefined,
  interfaceSimulationPending: false,
  interfaceSimulationStartedAt: performance.now(),
  interfaceSimulationEndedAt: performance.now(),
  interfaceSimulationDurationInMs: 0,

  entitySimulationPromise: undefined,
  entitySimulationPending: false,
  entitySimulationStartedAt: performance.now(),
  entitySimulationEndedAt: performance.now(),
  entitySimulationDurationInMs: 0,

  renderingIntervalId: undefined,
  renderingIntervalDelayInMs: 150, // 0.15s
  renderingLastRenderAt: performance.now(),

  // for our calculations to be correct, those need to match the actual
  // output from the API: don't trust the parameters you send to the API,
  // instead check the *actual* values with VLC!!
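  // sanity check on the values below: 60 frames at 24 fps is a nominal
  // 60 / 24 = 2.5s of video, while the duration actually measured on the
  // API output is 2.584s - which is exactly why the comment above says to
  // verify with VLC rather than trust the request parameters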
  videoModelFPS: 24,
  videoModelNumOfFrames: 60, // 80,
  videoModelDurationInSec: 2.584,

  playbackSpeed: 1,

  positionInMs: 0,
  durationInMs: 0,

  // this is the "buffer size"
  videoLayers: [
    {
      id: "video-buffer-0",
      element: null as unknown as JSX.Element,
    },
    {
      id: "video-buffer-1",
      element: null as unknown as JSX.Element,
    },
    /*
    {
      id: "video-buffer-2",
      element: null as unknown as JSX.Element,
    },
    {
      id: "video-buffer-3",
      element: null as unknown as JSX.Element,
    },
    */
  ],
  videoElements: [],

  interfaceLayers: [],

  setJwtToken: (jwtToken: string) => {
    set({ jwtToken })
  },

  setContainerDimension: ({ width, height }: { width: number; height: number }) => {
    set({ width, height })
  },

  imagine: async (prompt: string): Promise<void> => {
    set({
      isLoaded: false,
      isLoading: true,
    })

    let clap: ClapProject | undefined = undefined
    try {
      clap = await fetchLatentClap(prompt)
    } catch (err) {
      console.error(`generateAndLoad failed (${err})`)
      set({ isLoading: false })
    }

    if (!clap) { return }

    get().open(clap)
  },

  open: async (src?: string | ClapProject | Blob) => {
    const { debug } = get()
    set({
      isLoaded: false,
      isLoading: true,
    })

    let clap: ClapProject | undefined = undefined
    try {
      clap = await parseClap(src, debug)
    } catch (err) {
      console.error(`failed to open the Clap: ${err}`)
      set({ isLoading: false })
    }

    if (!clap) { return }

    set({
      clap,
      isLoading: false,
      isLoaded: true,
      isLoop: clap.meta.isLoop,
      isStatic: !clap.meta.isInteractive,
      isLive: false,
      isInteractive: clap.meta.isInteractive,
    })
  },

  setVideoElements: (videoElements: HTMLVideoElement[] = []) => {
    set({ videoElements })
  },

  processClickOnSegment: (result: InteractiveSegmenterResult) => {
    console.log(`processClickOnSegment: user clicked on something:`, result)
    const { videoElements, debug } = get()

    if (!result?.categoryMask) {
      if (debug) {
        console.log(`processClickOnSegment: no categoryMask, so we skip the click`)
      }
      return
    }

    try {
      if (debug) {
        console.log(`processClickOnSegment: calling drawSegmentation`)
      }

      const firstVisibleVideo = videoElements.find(element =>
        getZIndexDepth(element) > 0
      )

      const segmentationElements = Array.from(
        document.querySelectorAll('.segmentation-canvas')
      ) as HTMLCanvasElement[]
      const segmentationElement = segmentationElements.at(0)

      const canvasMask: HTMLCanvasElement = drawSegmentation({
        mask: result.categoryMask,
        canvas: segmentationElement,
        backgroundImage: firstVisibleVideo,
        fillStyle: "rgba(255, 255, 255, 1.0)"
      })

      // TODO: read the canvas to determine what the user clicked on
      if (debug) {
        console.log(`processClickOnSegment: filtering the original image`)
      }
      // filterImage(imageElement, canvasMask)

      if (debug) {
        console.log("processClickOnSegment: TODO call data.close() to free the memory!")
      }
      result.close()
    } catch (err) {
      console.error(`processClickOnSegment: something failed ${err}`)
    }
  },

  onClickOnSegmentationLayer: (event) => {
    const { videoElements, debug } = get()
    if (debug) {
      console.log("onClickOnSegmentationLayer")
    }

    const firstVisibleVideo = videoElements.find(element =>
      getZIndexDepth(element) > 0
    )
    if (!firstVisibleVideo) { return }

    const box = event.currentTarget.getBoundingClientRect()
    const px = event.clientX
    const py = event.clientY
    const x = px / box.width
    const y = py / box.height

    console.log(`onClickOnSegmentationLayer: user clicked on `, { x, y, px, py, box, videoElements })

    const fn = async () => {
      // todo julian: this should use the visible element instead
      const results: InteractiveSegmenterResult = await segmentFrame(firstVisibleVideo, x, y)
      get().processClickOnSegment(results)
    }
    fn()
  },
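  // note: the visibility of the double-buffered <video> elements is encoded
  // in their z-index (see setZIndexDepthId / getZIndexDepth): a depth of 10
  // means "currently visible", while 0 means "hidden, preloading in the back"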
  togglePlayPause: (): boolean => {
    const { isLoaded, isPlaying, playbackSpeed, renderingIntervalId, videoElements } = get()
    if (!isLoaded) { return false }

    const newValue = !isPlaying

    clearInterval(renderingIntervalId)

    const firstVisibleVideo = videoElements.find(element =>
      getZIndexDepth(element) > 0
    )

    // Note Julian: we could also let the background scheduler
    // (runRenderingLoop) do its work of advancing the cursor here
    if (newValue) {
      if (firstVisibleVideo) {
        try {
          firstVisibleVideo.playbackRate = playbackSpeed
          firstVisibleVideo.play()
        } catch (err) {
          console.error(`togglePlayPause: failed to start the video (${err})`)
        }
      }
      set({
        isPlaying: true,
        renderingIntervalId: setTimeout(() => { get().runRenderingLoop() }, 0)
      })
    } else {
      if (firstVisibleVideo) {
        try {
          firstVisibleVideo.playbackRate = playbackSpeed
          firstVisibleVideo.pause()
        } catch (err) {
          console.error(`togglePlayPause: failed to pause the video (${err})`)
        }
      }
      set({ isPlaying: false })
    }

    return newValue
  },

  play: (): boolean => {
    const { isLoaded, isPlaying, renderingIntervalId, renderingIntervalDelayInMs } = get()
    if (!isLoaded) { return false }
    if (isPlaying) { return true }

    clearInterval(renderingIntervalId)
    set({
      isPlaying: true,
      renderingIntervalId: setTimeout(() => { get().runRenderingLoop() }, 0)
    })
    return true
  },

  pause: (): boolean => {
    const { isLoaded, renderingIntervalId } = get()
    if (!isLoaded) { return false }

    clearInterval(renderingIntervalId)
    set({ isPlaying: false })
    return false
  },

  // a slow rendering function (async - might call a third party LLM)
  runVideoSimulationLoop: async () => {
    const {
      isLoaded,
      isPlaying,
      clap,
      playbackSpeed,
      positionInMs,
      videoModelFPS,
      videoModelNumOfFrames,
      videoModelDurationInSec,
      videoElements,
      jwtToken,
    } = get()

    if (!isLoaded || !isPlaying) {
      set({ videoSimulationPending: false })
      return
    }

    set({
      videoSimulationPending: true,
      videoSimulationStartedAt: performance.now(),
    })

    const videosSortedByStartAt = getElementsSortedByStartAt(videoElements)

    // videos whose timestamp is behind the current cursor
    let toRecycle: HTMLVideoElement[] = []
    let toPlay: HTMLVideoElement[] = []
    let toPreload: HTMLVideoElement[] = []

    for (let i = 0; i < videosSortedByStartAt.length; i++) {
      const video = videosSortedByStartAt[i]
      const segmentStartAt = getSegmentStartAt(video)
      const segmentEndAt = getSegmentEndAt(video)

      // this segment has been spent, it should be discarded
      if (segmentEndAt < positionInMs) {
        toRecycle.push(video)
      } else if (segmentStartAt < positionInMs) {
        toPlay.push(video)
        video.play()
        setZIndexDepthId(video, 10)
      } else {
        toPreload.push(video)
        video.pause()
        setZIndexDepthId(video, 0)
      }
    }

    const videoDurationInMs = videoModelDurationInSec * 1000

    // TODO julian: this is an approximation to grab the max number of segments
    const maxBufferDurationInMs = positionInMs + (videoDurationInMs * 4)

    console.log(`DEBUG: `, {
      positionInMs,
      videoModelDurationInSec,
      videoDurationInMs,
      "(videoDurationInMs * 4)": (videoDurationInMs * 4),
      maxBufferDurationInMs,
      segments: clap.segments
    })

    const prefilterSegmentsForPerformanceReasons: ClapSegment[] = clap.segments.filter(s =>
      s.startTimeInMs >= positionInMs &&
      s.startTimeInMs < maxBufferDurationInMs
    )

    console.log(`prefilterSegmentsForPerformanceReasons: `, prefilterSegmentsForPerformanceReasons)
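    // to make the approximation above concrete: with videoModelDurationInSec
    // at 2.584s we have videoDurationInMs ≈ 2584, so at a hypothetical
    // positionInMs of 10000 the prefilter window spans
    // [10000, 10000 + 4 * 2584) = [10000, 20336) ms of the timeline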
    // this tells us how much time is left
    let remainingTimeInMs = Math.max(0, clap.meta.durationInMs - positionInMs)

    // to avoid interruptions, we should jump to the beginning of the project
    // as soon as we start playing back the "last" video segment

    // now, we need to recycle spent videos, by discarding their content
    // and replacing it with fresh content
    //
    // yes: I know the code is complex and not intuitive - sorry about that
    const extraPositivePrompt: string[] = ["high quality", "crisp", "detailed"]

    let bufferAheadOfCurrentPositionInMs = positionInMs

    for (let i = 0; i < toRecycle.length; i++) {
      console.log(`got a spent video to recycle`)

      // we select the segments in the current shot
      const shotSegmentsToPreload: ClapSegment[] = prefilterSegmentsForPerformanceReasons.filter(s =>
        s.startTimeInMs >= bufferAheadOfCurrentPositionInMs &&
        s.startTimeInMs < (bufferAheadOfCurrentPositionInMs + videoDurationInMs)
      )

      bufferAheadOfCurrentPositionInMs += videoDurationInMs

      const prompt = getVideoPrompt(shotSegmentsToPreload, clap.entityIndex, extraPositivePrompt)
      console.log(`video prompt: ${prompt}`)

      // could also be the camera: after all, we don't necessarily
      // have a shot, this could also be a gaussian splat
      const shotData = shotSegmentsToPreload.find(s => s.category === ClapSegmentCategory.VIDEO)
      console.log(`shotData:`, shotData)

      if (!prompt || !shotData) { continue }

      const recycled = toRecycle[i]

      recycled.pause()
      setSegmentId(recycled, shotData.id)
      setSegmentStartAt(recycled, shotData.startTimeInMs)
      setSegmentEndAt(recycled, shotData.endTimeInMs)
      setZIndexDepthId(recycled, 0)

      // this is the best compromise for now in terms of speed
      const width = 512
      const height = 288

      // this is our magic trick: we let the browser do the token-secured,
      // asynchronous and parallel video generation call for us
      //
      // one issue with this approach is that it hopes the video
      // will be downloaded in time, but it's not an exact science:
      // first, generation time varies between 4s and 7s,
      // then some people will get 300ms of latency due to their ISP,
      // and finally the video itself is a 150~200 KB payload
      recycled.src = `/api/resolvers/video?t=${
        // to prevent funny people from using this as a free, open-bar video API
        // we have this system of tokens with a 24h expiration date
        // (we might even make it tighter in the future)
        jwtToken
      }&w=${
        width
      }&h=${
        height
      }&p=${
        // let's re-use the best ideas from the Latent Browser:
        // a text URI equals a latent resource
        encodeURIComponent(prompt)
      }`

      toPreload.push(recycled)
    }

    const videoSimulationEndedAt = performance.now()
    const videoSimulationDurationInMs = videoSimulationEndedAt - get().videoSimulationStartedAt
    const videoSimulationDurationInSec = videoSimulationDurationInMs / 1000
    const videoSimulationVideoPlaybackFPS = videoModelFPS * playbackSpeed
    const videoSimulationRenderingTimeFPS = videoModelNumOfFrames / videoSimulationDurationInSec

    set({
      videoSimulationPending: false,
      videoSimulationEndedAt,
      videoSimulationDurationInMs,
      videoSimulationVideoPlaybackFPS,
      videoSimulationRenderingTimeFPS,
    })
  },
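  // for illustration, with a hypothetical prompt "a cat walking" the
  // runVideoSimulationLoop above points a recycled <video> element at
  // something like (token elided):
  //
  //   /api/resolvers/video?t=<jwt>&w=512&h=288&p=a%20cat%20walking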
  // a slow rendering function (async - might call a third party LLM)
  runInterfaceSimulationLoop: async () => {
    const {
      isLoaded,
      isPlaying,
      clap,
    } = get()

    if (!isLoaded || !isPlaying) {
      set({ interfaceSimulationPending: false })
      return
    }

    set({
      interfaceSimulationPending: true,
      interfaceSimulationStartedAt: performance.now(),
    })

    try {
      if (get().isPlaying) {
        // console.log(`runSimulationLoop: rendering UI layer..`)

        // note: for now we only display one panel at a time,
        // later we can try to see if we should handle more,
        // for a nice gradient transition
        const interfaceLayers = await resolveSegments(clap, ClapSegmentCategory.INTERFACE, 1)
        if (get().isPlaying) {
          set({ interfaceLayers })
          // console.log(`runSimulationLoop: rendered UI layer`)
        }
      }
    } catch (err) {
      console.error(`runInterfaceSimulationLoop failed to render the UI layer (${err})`)
    }

    const interfaceSimulationEndedAt = performance.now()
    const interfaceSimulationDurationInMs = interfaceSimulationEndedAt - get().interfaceSimulationStartedAt

    set({
      interfaceSimulationPending: false,
      interfaceSimulationEndedAt,
      interfaceSimulationDurationInMs,
    })
  },

  // a slow rendering function (async - might call a third party LLM)
  runEntitySimulationLoop: async () => {
    const {
      isLoaded,
      isPlaying,
      clap,
    } = get()

    if (!isLoaded || !isPlaying) {
      set({ entitySimulationPending: false })
      return
    }

    set({
      entitySimulationPending: true,
      entitySimulationStartedAt: performance.now(),
    })

    const entitySimulationEndedAt = performance.now()
    const entitySimulationDurationInMs = entitySimulationEndedAt - get().entitySimulationStartedAt

    set({
      entitySimulationPending: false,
      entitySimulationEndedAt,
      entitySimulationDurationInMs,
    })
  },

  // a fast, sync rendering function whose sole role is to filter the
  // component list, to put into the buffer the ones that should be displayed
  runRenderingLoop: () => {
    const {
      isLoaded,
      isPlaying,
      renderingIntervalId,
      renderingIntervalDelayInMs,
      renderingLastRenderAt,
      positionInMs,
      videoSimulationPending,
      runVideoSimulationLoop,
      interfaceSimulationPending,
      runInterfaceSimulationLoop,
      entitySimulationPending,
      runEntitySimulationLoop,
    } = get()

    if (!isLoaded || !isPlaying) { return }

    // TODO julian: don't do this here, this is inefficient
    const videoElements = Array.from(
      document.querySelectorAll('.video-buffer')
    ) as HTMLVideoElement[]

    const newRenderingLastRenderAt = performance.now()
    const elapsedInMs = newRenderingLastRenderAt - renderingLastRenderAt

    // let's move inside the Clap file timeline
    const newPositionInMs = positionInMs + elapsedInMs

    clearInterval(renderingIntervalId)
    set({
      isPlaying: true,
      renderingLastRenderAt: newRenderingLastRenderAt,
      positionInMs: newPositionInMs,
      videoElements,

      // TODO: use requestAnimationFrame somehow
      // https://developers.google.com/mediapipe/solutions/vision/image_segmenter/web_js
      renderingIntervalId: setTimeout(() => { get().runRenderingLoop() }, renderingIntervalDelayInMs)
    })

    // note that having this second set() also helps us make sure the previous
    // values are properly stored in the state when the simulation loops run
    if (!videoSimulationPending) {
      set({ videoSimulationPromise: runVideoSimulationLoop() }) // <-- note: this is a fire-and-forget operation!
    }
    if (!interfaceSimulationPending) {
      set({ interfaceSimulationPromise: runInterfaceSimulationLoop() }) // <-- note: this is a fire-and-forget operation!
    }
    if (!entitySimulationPending) {
      set({ entitySimulationPromise: runEntitySimulationLoop() }) // <-- note: this is a fire-and-forget operation!
    }
  },

  jumpTo: (positionInMs: number) => {
    set({ positionInMs })
  },

  jumpToStart: () => {
    set({ positionInMs: 0 })
  },
}))
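
// minimal usage sketch (hypothetical, not part of this module): since zustand
// stores are React hooks, a player component could drive the engine like this
// (the prompt string is just an example):
//
//   function LatentPlayer() {
//     const imagine = useLatentEngine(s => s.imagine)
//     const isLoaded = useLatentEngine(s => s.isLoaded)
//     const togglePlayPause = useLatentEngine(s => s.togglePlayPause)
//
//     return (
//       <div>
//         <button onClick={() => imagine("a cat walking")}>imagine</button>
//         <button disabled={!isLoaded} onClick={togglePlayPause}>play/pause</button>
//       </div>
//     )
//   }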