"use server" | |
import { ClapProject, getValidNumber, newClap, newSegment, ClapSegmentCategory, ClapOutputType, ClapMediaOrientation } from "@aitube/clap" | |
import { sleep } from "@/lib/utils/sleep" | |
import { predict } from "@/app/api/providers/huggingface/predictWithHuggingFace" | |
import { parseRawStringToYAML } from "@/app/api/parsers/parseRawStringToYAML" | |
import { LatentStory } from "@/app/api/v1/types" | |
import { systemPrompt } from "./systemPrompt" | |
import { generateMusicPrompts } from "../edit/music/generateMusicPrompt" | |
import { clapToLatentStory } from "../edit/entities/clapToLatentStory" | |
// a helper to generate Clap stories from a few sentences | |
// this is mostly used by external apps such as the Stories Factory | |
export async function create(request: { | |
prompt?: string | |
width?: number | |
height?: number | |
turbo?: boolean | |
}= { | |
prompt: "", | |
width: 1024, | |
height: 576, | |
turbo: false, | |
}): Promise<ClapProject> { | |
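  // note: the defaults above only apply when the caller omits the request object
  // entirely; individual fields are validated again right below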
  // we limit to 512 characters
  const prompt = `${request?.prompt || ""}`.trim().slice(0, 512)

  console.log("api/v1/create(): request:", request)

  if (!prompt.length) { throw new Error(`please provide a prompt`) }

  const width = getValidNumber(request?.width, 256, 8192, 1024)
  const height = getValidNumber(request?.height, 256, 8192, 576)
  const turbo = request?.turbo ? true : false

  const userPrompt = `Movie story to generate: ${prompt}
Output: `
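  // the prefix primes the model to answer directly with a YAML document,
  // and nbMaxNewTokens caps the size of the generated story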
  const prefix = "```yaml\n"
  const nbMaxNewTokens = 1400

  // TODO use streaming for the Hugging Face prediction
  //
  // note that a Clap file is actually a YAML stream of documents
  // so technically we could stream everything from end-to-end
  // (but I haven't coded the helpers to do this yet)
  let rawString = await predict({
    systemPrompt,
    userPrompt,
    nbMaxNewTokens,
    prefix,
    turbo,
  })
  // console.log("api/v1/create(): rawString: ", rawString)

  let shots: LatentStory[] = []

  let maybeShots = parseRawStringToYAML<LatentStory[]>(rawString, [])

  if (!Array.isArray(maybeShots) || maybeShots.length === 0) {
    console.log(`api/v1/create(): failed to generate shots, trying again`)
    await sleep(2000)
    rawString = await predict({
      systemPrompt,
      userPrompt: userPrompt + ".", // we append a dot to bypass the Hugging Face cache
      nbMaxNewTokens,
      prefix,
      turbo,
    })
    // console.log("api/v1/create(): rawString: ", rawString)
    maybeShots = parseRawStringToYAML<LatentStory[]>(rawString, [])
    if (!Array.isArray(maybeShots) || maybeShots.length === 0) {
      console.log(`api/v1/create(): failed to generate shots for the second time, which indicates an issue with the Hugging Face API`)
    }
  }

  if (maybeShots.length) {
    shots = maybeShots
  } else {
    throw new Error(`Hugging Face Inference API failure (the model failed to generate the shots)`)
  }
  console.log(`api/v1/create(): generated ${shots.length} shots`)

  // this is approximate - TTS generation will determine the final duration of each shot
  const defaultSegmentDurationInMs = 7000

  let currentElapsedTimeInMs = 0

  const clap: ClapProject = newClap({
    meta: {
      title: prompt.split(",").shift() || "",
      description: prompt,
      synopsis: "",
      licence: "",
      orientation:
        width > height ? ClapMediaOrientation.LANDSCAPE :
        height > width ? ClapMediaOrientation.PORTRAIT :
        ClapMediaOrientation.SQUARE,
      width,
      height,
      isInteractive: false,
      isLoop: false,
      durationInMs: shots.length * defaultSegmentDurationInMs,
      defaultVideoModel: "AnimateDiff-Lightning",
    }
  })
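  // each shot is expanded into one segment per track:
  // 0 = video, 1 = storyboard, 2 = interface (comment), 3 = dialogue, 4 = camera
  // (a single music segment is also added on track 5, after this loop)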
  for (const { comment, image, voice } of shots) {
    console.log(`api/v1/create(): - ${comment}`)

    // note: it would be nice if we could have a convention saying that
    // track 0 is for videos and track 1 is for storyboards
    //
    // however, that's a bit constraining as people will generate .clap
    // using all kinds of tools and development experiences,
    // and they may not wish to learn the Clap protocol format completely
    //
    // TL;DR:
    // we should fix the Clap file editor to make it able to read videos
    // from any track number
    clap.segments.push(newSegment({
      track: 0,
      startTimeInMs: currentElapsedTimeInMs,
      endTimeInMs: currentElapsedTimeInMs + defaultSegmentDurationInMs,
      assetDurationInMs: defaultSegmentDurationInMs,
      category: ClapSegmentCategory.VIDEO,
      prompt: image,
      outputType: ClapOutputType.VIDEO,
      status: "to_generate",
    }))
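    // the storyboard reuses the same image prompt as the video segment above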
    clap.segments.push(newSegment({
      track: 1,
      startTimeInMs: currentElapsedTimeInMs,
      endTimeInMs: currentElapsedTimeInMs + defaultSegmentDurationInMs,
      assetDurationInMs: defaultSegmentDurationInMs,
      category: ClapSegmentCategory.STORYBOARD,
      prompt: image,
      outputType: ClapOutputType.IMAGE,
      status: "to_generate",
    }))

    clap.segments.push(newSegment({
      track: 2,
      startTimeInMs: currentElapsedTimeInMs,
      endTimeInMs: currentElapsedTimeInMs + defaultSegmentDurationInMs,
      assetDurationInMs: defaultSegmentDurationInMs,
      category: ClapSegmentCategory.INTERFACE,
      prompt: comment,
      // assetUrl: `data:text/plain;base64,${btoa(comment)}`,
      assetUrl: comment,
      outputType: ClapOutputType.TEXT,
      status: "to_generate",
    }))

    clap.segments.push(newSegment({
      track: 3,
      startTimeInMs: currentElapsedTimeInMs,
      endTimeInMs: currentElapsedTimeInMs + defaultSegmentDurationInMs,
      assetDurationInMs: defaultSegmentDurationInMs,
      category: ClapSegmentCategory.DIALOGUE,
      prompt: voice,
      outputType: ClapOutputType.AUDIO,
      status: "to_generate",
    }))

    // the presence of a camera is mandatory
    clap.segments.push(newSegment({
      track: 4,
      startTimeInMs: currentElapsedTimeInMs,
      endTimeInMs: currentElapsedTimeInMs + defaultSegmentDurationInMs,
      assetDurationInMs: defaultSegmentDurationInMs,
      category: ClapSegmentCategory.CAMERA,
      prompt: "video",
      outputType: ClapOutputType.TEXT,
      status: "to_generate",
    }))

    currentElapsedTimeInMs += defaultSegmentDurationInMs
  }
  // one more thing: music!
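  // the music segment (track 5) spans the whole timeline,
  // from 0 to the total elapsed time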
  let musicPrompt = ""
  try {
    const musicPrompts = await generateMusicPrompts({
      prompt,
      latentStory: await clapToLatentStory(clap)
    })
    musicPrompt = musicPrompts.at(0) || ""
    if (!musicPrompt) { throw new Error(`not enough music prompts`) }
  } catch (err) {
    console.error(`[api/v1/create] failed to generate music prompts, using a fallback`)
    musicPrompt = "lofi hiphop loop"
  }
  // console.log("musicPrompt:", musicPrompt)

  clap.segments.push(newSegment({
    track: 5,
    startTimeInMs: 0,
    endTimeInMs: currentElapsedTimeInMs,
    assetDurationInMs: currentElapsedTimeInMs,
    category: ClapSegmentCategory.MUSIC,
    prompt: musicPrompt,
    outputType: ClapOutputType.AUDIO,
    status: "to_generate",
  }))
  return clap
}
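
// example usage (an illustrative sketch; the prompt and values are hypothetical):
//
//   const clap = await create({
//     prompt: "a robot walking on a beach at sunset",
//     width: 1024,
//     height: 576,
//     turbo: false,
//   })
//   console.log(`generated ${clap.segments.length} segments`)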