"use server"
import { ClapProject, getValidNumber, newClap, newSegment, ClapSegmentCategory, ClapOutputType, ClapMediaOrientation } from "@aitube/clap"
import { sleep } from "@/lib/utils/sleep"
import { predict } from "@/app/api/providers/huggingface/predictWithHuggingFace"
import { parseRawStringToYAML } from "@/app/api/parsers/parseRawStringToYAML"
import { LatentStory } from "@/app/api/v1/types"
import { systemPrompt } from "./systemPrompt"
import { generateMusicPrompts } from "../edit/music/generateMusicPrompt"
import { clapToLatentStory } from "../edit/entities/clapToLatentStory"
// a helper to generate Clap stories from a few sentences
// this is mostly used by external apps such as the Stories Factory
export async function create(request: {
  prompt?: string
  width?: number
  height?: number
  turbo?: boolean
} = {
  prompt: "",
  width: 1024,
  height: 576,
  turbo: false,
}): Promise<ClapProject> {
  // we limit the prompt to 512 characters
  const prompt = `${request?.prompt || ""}`.trim().slice(0, 512)

  console.log("api/v1/create(): request:", request)

  if (!prompt.length) { throw new Error(`please provide a prompt`) }
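
  // validate the dimensions (256 to 8192 pixels, defaults: 1024x576)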
  const width = getValidNumber(request?.width, 256, 8192, 1024)
  const height = getValidNumber(request?.height, 256, 8192, 576)

  const turbo = !!request?.turbo
  const userPrompt = `Movie story to generate: ${prompt}
Output: `
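
  // the prefix (an opening YAML code fence) nudges the model
  // to answer with a YAML document right away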
  const prefix = "```yaml\n"

  const nbMaxNewTokens = 1400

  // TODO use streaming for the Hugging Face prediction
  //
  // note that a Clap file is actually a YAML stream of documents,
  // so technically we could stream everything end to end
  // (but I haven't coded the helpers to do this yet)
  let rawString = await predict({
    systemPrompt,
    userPrompt,
    nbMaxNewTokens,
    prefix,
    turbo,
  })
  // console.log("api/v1/create(): rawString: ", rawString)
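
  // the model should return a YAML list of shots, e.g. (illustrative values):
  //
  //   - comment: "A cat sets out on an adventure"
  //     image: "a cat walking on a dirt road, golden hour, cinematic"
  //     voice: "Once upon a time, there was a cat..."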
  let shots: LatentStory[] = []

  let maybeShots = parseRawStringToYAML<LatentStory[]>(rawString, [])
  if (!Array.isArray(maybeShots) || maybeShots.length === 0) {
    console.log(`api/v1/create(): failed to generate shots, trying again`)
    await sleep(2000)

    rawString = await predict({
      systemPrompt,
      userPrompt: userPrompt + ".", // we append a dot to trick the Hugging Face cache
      nbMaxNewTokens,
      prefix,
      turbo,
    })
    // console.log("api/v1/create(): rawString: ", rawString)

    maybeShots = parseRawStringToYAML<LatentStory[]>(rawString, [])
    if (!Array.isArray(maybeShots) || maybeShots.length === 0) {
      console.log(`api/v1/create(): failed to generate shots for the second time, which indicates an issue with the Hugging Face API`)
    }
  }

  if (maybeShots.length) {
    shots = maybeShots
  } else {
    throw new Error(`Hugging Face Inference API failure (the model failed to generate the shots)`)
  }

  console.log(`api/v1/create(): generated ${shots.length} shots`)
  // this is approximate - TTS generation will determine the final duration of each shot
  const defaultSegmentDurationInMs = 7000

  let currentElapsedTimeInMs = 0

  const clap: ClapProject = newClap({
    meta: {
      title: prompt.split(",").shift() || "",
      description: prompt,
      synopsis: "",
      licence: "",
      orientation:
        width > height ? ClapMediaOrientation.LANDSCAPE :
        height > width ? ClapMediaOrientation.PORTRAIT :
        ClapMediaOrientation.SQUARE,
      width,
      height,
      isInteractive: false,
      isLoop: false,
      durationInMs: shots.length * defaultSegmentDurationInMs,
      defaultVideoModel: "AnimateDiff-Lightning",
    }
  })
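
  // each shot gets one segment per track:
  // track 0: video, track 1: storyboard, track 2: interface (the comment),
  // track 3: dialogue, track 4: camera
  // (a music segment is added later on track 5, spanning the whole project)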
  for (const { comment, image, voice } of shots) {
    console.log(`api/v1/create(): - ${comment}`)

    // note: it would be nice to have a convention saying that
    // track 0 is for videos and track 1 for storyboards
    //
    // however, that's a bit constraining, as people will generate .clap files
    // using all kinds of tools and development environments,
    // and they may not wish to learn the Clap protocol format completely
    //
    // TL;DR:
    // we should fix the Clap file editor so that it can read videos
    // from any track number
    clap.segments.push(newSegment({
      track: 0,
      startTimeInMs: currentElapsedTimeInMs,
      endTimeInMs: currentElapsedTimeInMs + defaultSegmentDurationInMs,
      assetDurationInMs: defaultSegmentDurationInMs,
      category: ClapSegmentCategory.VIDEO,
      prompt: image,
      outputType: ClapOutputType.VIDEO,
      status: "to_generate",
    }))

    clap.segments.push(newSegment({
      track: 1,
      startTimeInMs: currentElapsedTimeInMs,
      endTimeInMs: currentElapsedTimeInMs + defaultSegmentDurationInMs,
      assetDurationInMs: defaultSegmentDurationInMs,
      category: ClapSegmentCategory.STORYBOARD,
      prompt: image,
      outputType: ClapOutputType.IMAGE,
      status: "to_generate",
    }))

    clap.segments.push(newSegment({
      track: 2,
      startTimeInMs: currentElapsedTimeInMs,
      endTimeInMs: currentElapsedTimeInMs + defaultSegmentDurationInMs,
      assetDurationInMs: defaultSegmentDurationInMs,
      category: ClapSegmentCategory.INTERFACE,
      prompt: comment,
      // assetUrl: `data:text/plain;base64,${btoa(comment)}`,
      assetUrl: comment,
      outputType: ClapOutputType.TEXT,
      status: "to_generate",
    }))

    clap.segments.push(newSegment({
      track: 3,
      startTimeInMs: currentElapsedTimeInMs,
      endTimeInMs: currentElapsedTimeInMs + defaultSegmentDurationInMs,
      assetDurationInMs: defaultSegmentDurationInMs,
      category: ClapSegmentCategory.DIALOGUE,
      prompt: voice,
      outputType: ClapOutputType.AUDIO,
      status: "to_generate",
    }))
    // the presence of a camera is mandatory
    clap.segments.push(newSegment({
      track: 4,
      startTimeInMs: currentElapsedTimeInMs,
      endTimeInMs: currentElapsedTimeInMs + defaultSegmentDurationInMs,
      assetDurationInMs: defaultSegmentDurationInMs,
      category: ClapSegmentCategory.CAMERA,
      prompt: "video",
      outputType: ClapOutputType.TEXT,
      status: "to_generate",
    }))

    currentElapsedTimeInMs += defaultSegmentDurationInMs
  }
  // one more thing: music!
  let musicPrompt = ""
  try {
    const musicPrompts = await generateMusicPrompts({
      prompt,
      latentStory: await clapToLatentStory(clap)
    })
    musicPrompt = musicPrompts.at(0) || ""
    if (!musicPrompt) { throw new Error(`not enough music prompts`) }
  } catch (err) {
    // fall back to a default music prompt, so the project still gets a soundtrack
    console.error(`[api/v1/create] failed to generate music prompts, using a default one`)
    musicPrompt = "lofi hiphop loop"
  }
  // console.log("musicPrompt:", musicPrompt)

  clap.segments.push(newSegment({
    track: 5,
    startTimeInMs: 0,
    endTimeInMs: currentElapsedTimeInMs,
    assetDurationInMs: currentElapsedTimeInMs,
    category: ClapSegmentCategory.MUSIC,
    prompt: musicPrompt,
    outputType: ClapOutputType.AUDIO,
    status: "to_generate",
  }))

  return clap
}
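
// example usage (illustrative values):
//
//   const clap = await create({
//     prompt: "a young fox discovers a hidden city in the forest",
//     width: 1024,
//     height: 576,
//   })
//   console.log(`generated ${clap.segments.length} segments`)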