jbilcke-hf HF staff commited on
Commit
ce559ed
1 Parent(s): 8993f70

first version of the AI Stories Factory endpoint

Browse files
src/app/api/generate/story/route.ts CHANGED
@@ -2,25 +2,140 @@ import { NextResponse, NextRequest } from "next/server"
2
 
3
  import { generateClapFromSimpleStory } from "@/lib/clap/generateClapFromSimpleStory"
4
  import { serializeClap } from "@/lib/clap/serializeClap"
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
  // a helper to generate Clap stories from a few sentences
7
  // this is mostly used by external apps such as the Stories Factory
8
  export async function POST(req: NextRequest) {
9
 
10
  const request = await req.json() as {
11
- story: string[]
 
 
12
  // can add more stuff for the V2 of Stories Factory
13
  }
14
 
15
- const story = Array.isArray(request?.story) ? request.story : []
16
 
17
- if (!story.length) { throw new Error(`please provide at least one sentence for the story`) }
18
 
19
- const clap = generateClapFromSimpleStory({
20
- story,
21
- // can add more stuff for the V2 of Stories Factory
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  })
23
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  return new NextResponse(await serializeClap(clap), {
25
  status: 200,
26
  headers: new Headers({ "content-type": "application/x-gzip" }),
 
2
 
3
  import { generateClapFromSimpleStory } from "@/lib/clap/generateClapFromSimpleStory"
4
  import { serializeClap } from "@/lib/clap/serializeClap"
5
+ import { getValidNumber } from "@/lib/utils/getValidNumber"
6
+ import { newClap } from "@/lib/clap/newClap"
7
+ import { predict } from "../../providers/huggingface/predictWithHuggingFace"
8
+ import { systemPrompt } from "./systemPrompt"
9
+ import { parseRawStringToYAML } from "../../utils/parseRawStringToYAML"
10
+ import { newSegment } from "@/lib/clap/newSegment"
11
+
12
+ export type LatentStory = {
13
+ title: string
14
+ image: string
15
+ voice: string
16
+ }
17
 
18
  // a helper to generate Clap stories from a few sentences
19
  // this is mostly used by external apps such as the Stories Factory
20
  export async function POST(req: NextRequest) {
21
 
22
  const request = await req.json() as {
23
+ prompt: string
24
+ width: number
25
+ height: number
26
  // can add more stuff for the V2 of Stories Factory
27
  }
28
 
29
+ const prompt = `${request?.prompt || ""}`.trim()
30
 
31
+ console.log("[api/generate/story] request:", request)
32
 
33
+ if (!prompt.length) { throw new Error(`please provide a prompt`) }
34
+
35
+ const width = getValidNumber(request?.width, 256, 8192, 1024)
36
+ const height = getValidNumber(request?.height, 256, 8192, 576)
37
+
38
+ const userPrompt = `Video story to generate: ${prompt}`
39
+
40
+ // TODO use streaming for the Hugging Face prediction
41
+ //
42
+ // note that a Clap file is actually a YAML stream of documents
43
+ // so technically we could stream everything from end-to-end
44
+ // (but I haven't coded the helpers to do this yet)
45
+ const rawString = await predict({
46
+ systemPrompt,
47
+ userPrompt,
48
+ nbMaxNewTokens: 1200,
49
+ prefix: "```yaml\n",
50
  })
51
 
52
+ console.log("[api/generate/story] rawString: ", rawString)
53
+
54
+ const shots = parseRawStringToYAML<LatentStory[]>(rawString, [])
55
+
56
+ console.log("[api/generate/story] generated shots: ", shots)
57
+
58
+ // this is approximate - TTS generation will determine the final duration of each shot
59
+ const defaultSegmentDurationInMs = 5000
60
+
61
+ let currentElapsedTimeInMs = 0
62
+ let currentSegmentDurationInMs = defaultSegmentDurationInMs
63
+
64
+ const clap = newClap({
65
+ meta: {
66
+ title: "Not needed", // we don't need a title actually
67
+ description: "This video has been generated using AI",
68
+ synopsis: "",
69
+ licence: "Non Commercial",
70
+ orientation: "vertical",
71
+ width,
72
+ height,
73
+ isInteractive: false,
74
+ isLoop: false,
75
+ durationInMs: shots.length * defaultSegmentDurationInMs,
76
+ defaultVideoModel: "AnimateDiff-Lightning",
77
+ }
78
+ })
79
+
80
+ for (const { title, image, voice } of shots) {
81
+
82
+ console.log(`[api/generate/story] - ${title}`)
83
+
84
+ // note: it would be nice if we could have a convention saying that
85
+ // track 0 is for videos and track 1 storyboards
86
+ //
87
+ // however, that's a bit constraining as people will generate .clap
88
+ // using all kind of tools and development experience,
89
+ // and they may not wish to learn the Clap protocol format completely
90
+ //
91
+ // TL;DR:
92
+ // we should fix the Clap file editor to make it able to render videos
93
+ // from any track number
94
+
95
+
96
+ /*
97
+ we disable it, because we don't generate animated videos yet
98
+ clap.segments.push(newSegment({
99
+ track: 0,
100
+ category: "video",
101
+ prompt: image,
102
+ outputType: "video"
103
+ }))
104
+ */
105
+
106
+ clap.segments.push(newSegment({
107
+ track: 1,
108
+ startTimeInMs: currentSegmentDurationInMs,
109
+ assetDurationInMs: defaultSegmentDurationInMs,
110
+ category: "storyboard",
111
+ prompt: image,
112
+ outputType: "image"
113
+ }))
114
+
115
+ clap.segments.push(newSegment({
116
+ track: 2,
117
+ startTimeInMs: currentSegmentDurationInMs,
118
+ assetDurationInMs: defaultSegmentDurationInMs,
119
+ category: "interface",
120
+ prompt: title,
121
+ // assetUrl: `data:text/plain;base64,${btoa(title)}`,
122
+ assetUrl: title,
123
+ outputType: "text"
124
+ }))
125
+
126
+ clap.segments.push(newSegment({
127
+ track: 3,
128
+ startTimeInMs: currentSegmentDurationInMs,
129
+ assetDurationInMs: defaultSegmentDurationInMs,
130
+ category: "dialogue",
131
+ prompt: voice,
132
+ outputType: "audio"
133
+ }))
134
+
135
+ currentSegmentDurationInMs += defaultSegmentDurationInMs
136
+ }
137
+
138
+ // TODO replace by Clap file streaming
139
  return new NextResponse(await serializeClap(clap), {
140
  status: 200,
141
  headers: new Headers({ "content-type": "application/x-gzip" }),
src/app/api/generate/story/systemPrompt.ts ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ export const systemPrompt: string =
2
+ `# Context
3
+ You are a server-side function generating stories from a single synopsis/brief (a "prompt").
4
+ The videos are vertical, so they can be displayed on mobile.
5
+ They are meant to be shared on social media platform (Instagram, TikTok, Snapchat, Twitter, YouTube Shorts etc).
6
+ Each video is composed of a sequence of static panels (a dozen on average),
7
+ with a voice over and text.
8
+
9
+ # Task
10
+ Your mission is to generate a sequence of panels that will form the final video.
11
+
12
+ You will be provided a "prompt" (for the story) and a max number of images.
13
+ Each panel is composed of:
14
+ - one title (which will be displayed as an overlay over the video, so keep it short eg. max 10/12 words),
15
+ - one image (you must describe it using a Stable Diffusion prompt - about ~300 characters - using simple descriptive words and adjectives. Describe facts about characters, location, lights, texture, camera orientation, colors, clothes, movements etc. But don't give your opinion, don't talk about the emotions it evokes etc.)
16
+ - one voice over (should be short too, about 10 to 15 words)
17
+
18
+ # Examples
19
+
20
+ You must reply by writing/completing a YAML list of objects.
21
+ Here is a short example, the prompt was "a cute puppy who misbehaves in the kitchen, in 3 parts 🐶"
22
+ Note how we asked for "3 parts". Sometimes the user will talk about steps, slides etc instead (that's fine, it means the same thing),
23
+ or the user might omit to give the number (that's fine too, you can use 5 by default),
24
+ but if the user asks for large numbers, it should be ignored (our limit is 32).
25
+
26
+ \`\`\`
27
+ - title: "my puppy is so cute when he sleeps 🐶"
28
+ image: "close-up shot of a puppy sleeping in a bed, cute, instagram, award winning, vertical photo"
29
+ voice: "look at my puppy, how cute he is. He is the cutest puppy in the world"
30
+ - title: "wait.. noo not the milk 😭"
31
+ image: "medium-shot of a puppy spilling over milk on the kitchen floor, nice kitchen, spilled milk, guilty dog face, cute, dramatic, instagram, vertical photo"
32
+ voice: "wait.. what are you doing.. nooo my milk"
33
+ - title: "😭 please send help"
34
+ image: "medium-shot of a puppy eating a cake, on the kitchen table, birthday cake, eating, cute, instagram, funny, messy, vertical photo"
35
+ voice: "Now my dog is eating my birthday cake. Please send help."
36
+ \`\`\`
37
+
38
+ # You turn!
39
+ `
src/app/api/generate/story/userPrompt.ts ADDED
File without changes
src/app/api/resolvers/interface/route.ts CHANGED
@@ -1,5 +1,6 @@
1
  import { NextResponse, NextRequest } from "next/server"
2
  import queryString from "query-string"
 
3
  import { predict } from "../../providers/huggingface/predictWithHuggingFace"
4
  import { systemPrompt } from "./systemPrompt"
5
 
 
1
  import { NextResponse, NextRequest } from "next/server"
2
  import queryString from "query-string"
3
+
4
  import { predict } from "../../providers/huggingface/predictWithHuggingFace"
5
  import { systemPrompt } from "./systemPrompt"
6
 
src/app/api/stories/README.md ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ TODO put the API to POST and GET stories
2
+
3
+ They will be saved to the DB
src/lib/clap/clapToDataUri.ts CHANGED
@@ -1,4 +1,5 @@
1
  import { blobToDataUri } from "@/app/api/utils/blobToDataUri"
 
2
  import { serializeClap } from "./serializeClap"
3
  import { ClapProject } from "./types"
4
 
 
1
  import { blobToDataUri } from "@/app/api/utils/blobToDataUri"
2
+
3
  import { serializeClap } from "./serializeClap"
4
  import { ClapProject } from "./types"
5