jbilcke-hf HF staff commited on
Commit
ce559ed
1 Parent(s): 8993f70

first version of the AI Stories Factory endpoint

Browse files
src/app/api/generate/story/route.ts CHANGED
@@ -2,25 +2,140 @@ import { NextResponse, NextRequest } from "next/server"
2
 
3
  import { generateClapFromSimpleStory } from "@/lib/clap/generateClapFromSimpleStory"
4
  import { serializeClap } from "@/lib/clap/serializeClap"
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
  // a helper to generate Clap stories from a few sentences
7
  // this is mostly used by external apps such as the Stories Factory
8
  export async function POST(req: NextRequest) {
9
 
10
  const request = await req.json() as {
11
- story: string[]
 
 
12
  // can add more stuff for the V2 of Stories Factory
13
  }
14
 
15
- const story = Array.isArray(request?.story) ? request.story : []
16
 
17
- if (!story.length) { throw new Error(`please provide at least one sentence for the story`) }
18
 
19
- const clap = generateClapFromSimpleStory({
20
- story,
21
- // can add more stuff for the V2 of Stories Factory
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  })
23
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  return new NextResponse(await serializeClap(clap), {
25
  status: 200,
26
  headers: new Headers({ "content-type": "application/x-gzip" }),
 
2
 
3
  import { generateClapFromSimpleStory } from "@/lib/clap/generateClapFromSimpleStory"
4
  import { serializeClap } from "@/lib/clap/serializeClap"
5
+ import { getValidNumber } from "@/lib/utils/getValidNumber"
6
+ import { newClap } from "@/lib/clap/newClap"
7
+ import { predict } from "../../providers/huggingface/predictWithHuggingFace"
8
+ import { systemPrompt } from "./systemPrompt"
9
+ import { parseRawStringToYAML } from "../../utils/parseRawStringToYAML"
10
+ import { newSegment } from "@/lib/clap/newSegment"
11
+
12
+ export type LatentStory = {
13
+ title: string
14
+ image: string
15
+ voice: string
16
+ }
17
 
18
  // a helper to generate Clap stories from a few sentences
19
  // this is mostly used by external apps such as the Stories Factory
20
  export async function POST(req: NextRequest) {
21
 
22
  const request = await req.json() as {
23
+ prompt: string
24
+ width: number
25
+ height: number
26
  // can add more stuff for the V2 of Stories Factory
27
  }
28
 
29
+ const prompt = `${request?.prompt || ""}`.trim()
30
 
31
+ console.log("[api/generate/story] request:", request)
32
 
33
+ if (!prompt.length) { throw new Error(`please provide a prompt`) }
34
+
35
+ const width = getValidNumber(request?.width, 256, 8192, 1024)
36
+ const height = getValidNumber(request?.height, 256, 8192, 576)
37
+
38
+ const userPrompt = `Video story to generate: ${prompt}`
39
+
40
+ // TODO use streaming for the Hugging Face prediction
41
+ //
42
+ // note that a Clap file is actually a YAML stream of documents
43
+ // so technically we could stream everything from end-to-end
44
+ // (but I haven't coded the helpers to do this yet)
45
+ const rawString = await predict({
46
+ systemPrompt,
47
+ userPrompt,
48
+ nbMaxNewTokens: 1200,
49
+ prefix: "```yaml\n",
50
  })
51
 
52
+ console.log("[api/generate/story] rawString: ", rawString)
53
+
54
+ const shots = parseRawStringToYAML<LatentStory[]>(rawString, [])
55
+
56
+ console.log("[api/generate/story] generated shots: ", shots)
57
+
58
+ // this is approximate - TTS generation will determine the final duration of each shot
59
+ const defaultSegmentDurationInMs = 5000
60
+
61
+ let currentElapsedTimeInMs = 0
62
+ let currentSegmentDurationInMs = defaultSegmentDurationInMs
63
+
64
+ const clap = newClap({
65
+ meta: {
66
+ title: "Not needed", // we don't need a title actually
67
+ description: "This video has been generated using AI",
68
+ synopsis: "",
69
+ licence: "Non Commercial",
70
+ orientation: "vertical",
71
+ width,
72
+ height,
73
+ isInteractive: false,
74
+ isLoop: false,
75
+ durationInMs: shots.length * defaultSegmentDurationInMs,
76
+ defaultVideoModel: "AnimateDiff-Lightning",
77
+ }
78
+ })
79
+
80
+ for (const { title, image, voice } of shots) {
81
+
82
+ console.log(`[api/generate/story] - ${title}`)
83
+
84
+ // note: it would be nice if we could have a convention saying that
85
+ // track 0 is for videos and track 1 storyboards
86
+ //
87
+ // however, that's a bit constraining as people will generate .clap
88
+ // using all kind of tools and development experience,
89
+ // and they may not wish to learn the Clap protocol format completely
90
+ //
91
+ // TL;DR:
92
+ // we should fix the Clap file editor to make it able to render videos
93
+ // from any track number
94
+
95
+
96
+ /*
97
+ we disable it, because we don't generate animated videos yet
98
+ clap.segments.push(newSegment({
99
+ track: 0,
100
+ category: "video",
101
+ prompt: image,
102
+ outputType: "video"
103
+ }))
104
+ */
105
+
106
+ clap.segments.push(newSegment({
107
+ track: 1,
108
+ startTimeInMs: currentSegmentDurationInMs,
109
+ assetDurationInMs: defaultSegmentDurationInMs,
110
+ category: "storyboard",
111
+ prompt: image,
112
+ outputType: "image"
113
+ }))
114
+
115
+ clap.segments.push(newSegment({
116
+ track: 2,
117
+ startTimeInMs: currentSegmentDurationInMs,
118
+ assetDurationInMs: defaultSegmentDurationInMs,
119
+ category: "interface",
120
+ prompt: title,
121
+ // assetUrl: `data:text/plain;base64,${btoa(title)}`,
122
+ assetUrl: title,
123
+ outputType: "text"
124
+ }))
125
+
126
+ clap.segments.push(newSegment({
127
+ track: 3,
128
+ startTimeInMs: currentSegmentDurationInMs,
129
+ assetDurationInMs: defaultSegmentDurationInMs,
130
+ category: "dialogue",
131
+ prompt: voice,
132
+ outputType: "audio"
133
+ }))
134
+
135
+ currentSegmentDurationInMs += defaultSegmentDurationInMs
136
+ }
137
+
138
+ // TODO replace by Clap file streaming
139
  return new NextResponse(await serializeClap(clap), {
140
  status: 200,
141
  headers: new Headers({ "content-type": "application/x-gzip" }),
src/app/api/generate/story/systemPrompt.ts ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ export const systemPrompt: string =
2
+ `# Context
3
+ You are a server-side function generating stories from a single synopsis/brief (a "prompt").
4
+ The videos are vertical, so they can be displayed on mobile.
5
+ They are meant to be shared on social media platform (Instagram, TikTok, Snapchat, Twitter, YouTube Shorts etc).
6
+ Each video is composed of a sequence of static panels (a dozen on average),
7
+ with a voice over and text.
8
+
9
+ # Task
10
+ Your mission is to generate a sequence of panels that will form the final video.
11
+
12
+ You will be provided a "prompt" (for the story) and a max number of images.
13
+ Each panel is composed of:
14
+ - one title (which will be displayed as an overlay over the video, so keep it short eg. max 10/12 words),
15
+ - one image (you must describe it using a Stable Diffusion prompt - about ~300 characters - using simple descriptive words and adjectives. Describe facts about characters, location, lights, texture, camera orientation, colors, clothes, movements etc. But don't give your opinion, don't talk about the emotions it evokes etc.)
16
+ - one voice over (should be short too, about 10 to 15 words)
17
+
18
+ # Examples
19
+
20
+ You must reply by writing/completing a YAML list of objects.
21
+ Here is a short example, the prompt was "a cute puppy who misbehaves in the kitchen, in 3 parts 🐶"
22
+ Note how we asked for "3 parts". Sometimes the user will talk about steps, slides etc instead (that's fine, it means the same thing),
23
+ or the user might omit to give the number (that's fine too, you can use 5 by default),
24
+ but if the user asks for large numbers, it should be ignored (our limit is 32).
25
+
26
+ \`\`\`
27
+ - title: "my puppy is so cute when he sleeps 🐶"
28
+ image: "close-up shot of a puppy sleeping in a bed, cute, instagram, award winning, vertical photo"
29
+ voice: "look at my puppy, how cute he is. He is the cutest puppy in the world"
30
+ - title: "wait.. noo not the milk 😭"
31
+ image: "medium-shot of a puppy spilling over milk on the kitchen floor, nice kitchen, spilled milk, guilty dog face, cute, dramatic, instagram, vertical photo"
32
+ voice: "wait.. what are you doing.. nooo my milk"
33
+ - title: "😭 please send help"
34
+ image: "medium-shot of a puppy eating a cake, on the kitchen table, birthday cake, eating, cute, instagram, funny, messy, vertical photo"
35
+ voice: "Now my dog is eating my birthday cake. Please send help."
36
+ \`\`\`
37
+
38
+ # You turn!
39
+ `
src/app/api/generate/story/userPrompt.ts ADDED
File without changes
src/app/api/resolvers/interface/route.ts CHANGED
@@ -1,5 +1,6 @@
1
  import { NextResponse, NextRequest } from "next/server"
2
  import queryString from "query-string"
 
3
  import { predict } from "../../providers/huggingface/predictWithHuggingFace"
4
  import { systemPrompt } from "./systemPrompt"
5
 
 
1
  import { NextResponse, NextRequest } from "next/server"
2
  import queryString from "query-string"
3
+
4
  import { predict } from "../../providers/huggingface/predictWithHuggingFace"
5
  import { systemPrompt } from "./systemPrompt"
6
 
src/app/api/stories/README.md ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ TODO put the API to POST and GET stories
2
+
3
+ They will be saved to the DB
src/lib/clap/clapToDataUri.ts CHANGED
@@ -1,4 +1,5 @@
1
  import { blobToDataUri } from "@/app/api/utils/blobToDataUri"
 
2
  import { serializeClap } from "./serializeClap"
3
  import { ClapProject } from "./types"
4
 
 
1
  import { blobToDataUri } from "@/app/api/utils/blobToDataUri"
2
+
3
  import { serializeClap } from "./serializeClap"
4
  import { ClapProject } from "./types"
5