Commit 58b1ffb by jbilcke-hf
Parent: 6d66622

improve prompts

package-lock.json CHANGED
@@ -9,7 +9,7 @@
   "version": "0.0.0",
   "dependencies": {
     "@aitube/clap": "0.0.16",
-    "@aitube/client": "0.0.21",
+    "@aitube/client": "0.0.23",
     "@aitube/engine": "0.0.6",
     "@huggingface/hub": "0.12.3-oauth",
     "@huggingface/inference": "^2.6.7",
@@ -130,9 +130,9 @@
       }
     },
     "node_modules/@aitube/client": {
-      "version": "0.0.21",
-      "resolved": "https://registry.npmjs.org/@aitube/client/-/client-0.0.21.tgz",
-      "integrity": "sha512-Nw/K4j024ffYiw1WipLDU7M29L+4cM4cQwrjvSxWW8zF4/1NLrPSKTlF6Gak5Qd/ZFWq/D8GbMiGQ5W3lQb8mA==",
+      "version": "0.0.23",
+      "resolved": "https://registry.npmjs.org/@aitube/client/-/client-0.0.23.tgz",
+      "integrity": "sha512-zZeGacE2WWSIO1h+HOQu6ExwWfJ01mzW1SreP3bN67vOmrau+bWRzZmX6Wg7DAHePnjvTkeR01TAiZVXskJOkw==",
       "dependencies": {
        "query-string": "^9.0.0"
      },
@@ -3744,9 +3744,9 @@
      }
    },
    "node_modules/caniuse-lite": {
-      "version": "1.0.30001616",
-      "resolved": "https://registry.npmjs.org/caniuse-lite/-/caniuse-lite-1.0.30001616.tgz",
-      "integrity": "sha512-RHVYKov7IcdNjVHJFNY/78RdG4oGVjbayxv8u5IO74Wv7Hlq4PnJE6mo/OjFijjVFNy5ijnCt6H3IIo4t+wfEw==",
+      "version": "1.0.30001617",
+      "resolved": "https://registry.npmjs.org/caniuse-lite/-/caniuse-lite-1.0.30001617.tgz",
+      "integrity": "sha512-mLyjzNI9I+Pix8zwcrpxEbGlfqOkF9kM3ptzmKNw5tizSyYwMe+nGLTqMK9cO+0E+Bh6TsBxNAaHWEM8xwSsmA==",
      "funding": [
        {
          "type": "opencollective",
@@ -4323,9 +4323,9 @@
      "integrity": "sha512-I88TYZWc9XiYHRQ4/3c5rjjfgkjhLyW2luGIheGERbNQ6OY7yTybanSpDXZa8y7VUP9YmDcYa+eyq4ca7iLqWA=="
    },
    "node_modules/electron-to-chromium": {
-      "version": "1.4.759",
-      "resolved": "https://registry.npmjs.org/electron-to-chromium/-/electron-to-chromium-1.4.759.tgz",
-      "integrity": "sha512-qZJc+zsuI+/5UjOSFnpkJBwwLMH1AZgyKqJ7LUNnRsB7v/cDjMu9DvXgp9kH6PTTZxjnPXGp2Uhurw+2Ll4Hjg=="
+      "version": "1.4.761",
+      "resolved": "https://registry.npmjs.org/electron-to-chromium/-/electron-to-chromium-1.4.761.tgz",
+      "integrity": "sha512-PIbxpiJGx6Bb8dQaonNc6CGTRlVntdLg/2nMa1YhnrwYOORY9a3ZgGN0UQYE6lAcj/lkyduJN7BPt/JiY+jAQQ=="
    },
    "node_modules/elliptic": {
      "version": "6.5.4",
@@ -5361,9 +5361,9 @@
      }
    },
    "node_modules/get-tsconfig": {
-      "version": "4.7.4",
-      "resolved": "https://registry.npmjs.org/get-tsconfig/-/get-tsconfig-4.7.4.tgz",
-      "integrity": "sha512-ofbkKj+0pjXjhejr007J/fLf+sW+8H7K5GCm+msC8q3IpvgjobpyPqSRFemNyIMxklC0zeJpi7VDFna19FacvQ==",
+      "version": "4.7.5",
+      "resolved": "https://registry.npmjs.org/get-tsconfig/-/get-tsconfig-4.7.5.tgz",
+      "integrity": "sha512-ZCuZCnlqNzjb4QprAzXKdpp/gh6KTxSJuw3IBsPnV/7fV4NxC9ckB+vPTt8w7fJA0TaSD7c55BR47JD6MEDyDw==",
      "dependencies": {
        "resolve-pkg-maps": "^1.0.0"
      },
@@ -6360,9 +6360,9 @@
      }
    },
    "node_modules/minipass": {
-      "version": "7.1.0",
-      "resolved": "https://registry.npmjs.org/minipass/-/minipass-7.1.0.tgz",
-      "integrity": "sha512-oGZRv2OT1lO2UF1zUcwdTb3wqUwI0kBGTgt/T7OdSj6M6N5m3o5uPf0AIW6lVxGGoiWUR7e2AwTE+xiwK8WQig==",
+      "version": "7.1.1",
+      "resolved": "https://registry.npmjs.org/minipass/-/minipass-7.1.1.tgz",
+      "integrity": "sha512-UZ7eQ+h8ywIRAW1hIEl2AqdwzJucU/Kp59+8kkZeSvafXhZjul247BvIJjEVFVeON6d7lM46XX1HXCduKAS8VA==",
      "engines": {
        "node": ">=16 || 14 >=14.17"
      }
@@ -6806,9 +6806,9 @@
      "integrity": "sha512-LDJzPVEEEPR+y48z93A0Ed0yXb8pAByGWo/k5YYdYgpY2/2EsOsksJrq7lOHxryrVOn1ejG6oAp8ahvOIQD8sw=="
    },
    "node_modules/path-scurry": {
-      "version": "1.10.2",
-      "resolved": "https://registry.npmjs.org/path-scurry/-/path-scurry-1.10.2.tgz",
-      "integrity": "sha512-7xTavNy5RQXnsjANvVvMkEjvloOinkAjv/Z6Ildz9v2RinZ4SBKTWFOVRbaF8p0vpHnyjV/UwNDdKuUv6M5qcA==",
+      "version": "1.11.0",
+      "resolved": "https://registry.npmjs.org/path-scurry/-/path-scurry-1.11.0.tgz",
+      "integrity": "sha512-LNHTaVkzaYaLGlO+0u3rQTz7QrHTFOuKyba9JMTQutkmtNew8dw8wOD7mTU/5fCPZzCWpfW0XnQKzY61P0aTaw==",
      "dependencies": {
        "lru-cache": "^10.2.0",
        "minipass": "^5.0.0 || ^6.0.2 || ^7.0.0"
@@ -7547,9 +7547,9 @@
      "integrity": "sha512-cdwTTnqPu0Hyvf5in5asVdZocVDTNRmR7XEcJuIzMjJeSHybHl7vpB66AzwTaIg6CLSbtjcxc8fqcySfnTkccA=="
    },
    "node_modules/semver": {
-      "version": "7.6.1",
-      "resolved": "https://registry.npmjs.org/semver/-/semver-7.6.1.tgz",
-      "integrity": "sha512-f/vbBsu+fOiYt+lmwZV0rVwJScl46HppnOA1ZvIuBWKOTlllpyJ3bfVax76/OrhCH38dyxoDIA8K7uB963IYgA==",
+      "version": "7.6.2",
+      "resolved": "https://registry.npmjs.org/semver/-/semver-7.6.2.tgz",
+      "integrity": "sha512-FNAIBWCx9qcRhoHcgcJ0gvU7SN1lYU2ZXuSfl04bSC5OpvDHFyJCjdNHomPXxjQlCBU67YW64PzY7/VIEH7F2w==",
      "bin": {
        "semver": "bin/semver.js"
      },
@@ -8005,15 +8005,15 @@
      }
    },
    "node_modules/sucrase/node_modules/glob": {
-      "version": "10.3.12",
-      "resolved": "https://registry.npmjs.org/glob/-/glob-10.3.12.tgz",
-      "integrity": "sha512-TCNv8vJ+xz4QiqTpfOJA7HvYv+tNIRHKfUWw/q+v2jdgN4ebz+KY9tGx5J4rHP0o84mNP+ApH66HRX8us3Khqg==",
+      "version": "10.3.14",
+      "resolved": "https://registry.npmjs.org/glob/-/glob-10.3.14.tgz",
+      "integrity": "sha512-4fkAqu93xe9Mk7le9v0y3VrPDqLKHarNi2s4Pv7f2yOvfhWfhc7hRPHC/JyqMqb8B/Dt/eGS4n7ykwf3fOsl8g==",
      "dependencies": {
        "foreground-child": "^3.1.0",
        "jackspeak": "^2.3.6",
        "minimatch": "^9.0.1",
        "minipass": "^7.0.4",
-        "path-scurry": "^1.10.2"
+        "path-scurry": "^1.11.0"
      },
      "bin": {
        "glob": "dist/esm/bin.mjs"
package.json CHANGED
@@ -11,7 +11,7 @@
   },
   "dependencies": {
     "@aitube/clap": "0.0.16",
-    "@aitube/client": "0.0.21",
+    "@aitube/client": "0.0.23",
     "@aitube/engine": "0.0.6",
     "@huggingface/hub": "0.12.3-oauth",
     "@huggingface/inference": "^2.6.7",
src/app/api/parsers/parseTurbo.ts CHANGED
@@ -5,11 +5,14 @@ export function parseTurbo(
   let value = defaultValue
 
   try {
-    let maybeTurbo = decodeURIComponent(`${input || value}`).trim()
+    let maybeTurbo = decodeURIComponent(`${input || value}`).trim().toLowerCase()
 
-    value = !!maybeTurbo
+    if (maybeTurbo === "true" || maybeTurbo === "1") { return false }
 
+    if (maybeTurbo === "false") { return false }
+
+    return false
   } catch (err) {}
 
-  return value
+  return false
 }
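
For context, a minimal sketch of how a route would feed its query string through this parser (the second argument follows the `defaultValue` parameter visible above; as committed the function returns `false` on every path, so turbo mode is effectively switched off for now):

```ts
import queryString from "query-string"

import { parseTurbo } from "@/app/api/parsers/parseTurbo"

// hypothetical call site: normalize the raw `?turbo=...` query parameter
function getTurboFlag(url: string): boolean {
  const { query } = queryString.parseUrl(url || "")
  return parseTurbo(query?.turbo, false)
}
```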
src/app/api/providers/huggingface/predictWithHuggingFace.ts CHANGED
@@ -37,6 +37,8 @@ export async function predict({
   instructions += output.token.text
   process.stdout.write(output.token.text)
   if (
+    instructions.includes("# Final") ||
+    instructions.includes("# Guidelines") ||
     instructions.includes("</s>") ||
     instructions.includes("<s>") ||
     instructions.includes("/s>") ||
@@ -66,6 +68,8 @@
   // need to do some cleanup of the garbage the LLM might have given us
   let result =
     instructions
+      .replaceAll("# Final", "")
+      .replaceAll("# Guidelines", "")
      .replaceAll("<|end|>", "")
      .replaceAll("<s>", "")
      .replaceAll("</s>", "")
src/app/api/v1/README.md CHANGED
@@ -26,3 +26,11 @@ Example:
 
 `POST <some_clap> /api/v1/export?f=webm`
 
+## /render
+
+To render frames as fast as possible
+
+## /search
+
+To hallucinate search results
+
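
A sketch of how a client could call the new `/render` endpoint (the request body shape follows `src/app/api/v1/render/route.ts` added below; the base URL and the token variable are placeholders):

```ts
// POST a prompt to /api/v1/render and get the video bytes back
async function renderClip(prompt: string): Promise<Blob> {
  const res = await fetch("https://example.com/api/v1/render", {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      Authorization: `Bearer ${process.env.API_TOKEN || ""}`,
    },
    body: JSON.stringify({ prompt, width: 1024, height: 576, turbo: false }),
  })
  if (!res.ok) { throw new Error(`render failed: ${res.status}`) }
  // the route replies with raw media bytes plus a Content-Type header
  return res.blob()
}
```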
src/app/api/v1/auth/throwIfInvalidToken.ts CHANGED
@@ -15,8 +15,8 @@ export async function throwIfInvalidToken(input?: any): Promise<boolean> {
   })
 
   // log values to console
-  console.log(payload)
-  console.log(protectedHeader)
+  // console.log(payload)
+  // console.log(protectedHeader)
 
   return true
 }
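
The `payload` / `protectedHeader` pair suggests this function wraps `jose`'s `jwtVerify`; a hedged sketch of what it likely does (the secret's environment variable name is an assumption):

```ts
import { jwtVerify } from "jose"

// sketch: verify a "Bearer <jwt>" header and throw if it is invalid
async function verifyBearerToken(input?: string | null): Promise<boolean> {
  const token = `${input || ""}`.replace(/^Bearer\s+/i, "")
  const secret = new TextEncoder().encode(`${process.env.API_SECRET_JWT_KEY || ""}`)
  const { payload, protectedHeader } = await jwtVerify(token, secret)

  // log values to console
  // console.log(payload)
  // console.log(protectedHeader)

  return true
}
```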
src/app/api/v1/create/index.ts CHANGED
@@ -1,6 +1,6 @@
 "use server"
 
-import { ClapProject, getValidNumber, newClap, newSegment, ClapSegmentCategory, ClapOutputType } from "@aitube/clap"
+import { ClapProject, getValidNumber, newClap, newSegment, ClapSegmentCategory, ClapOutputType, ClapMediaOrientation } from "@aitube/clap"
 
 import { sleep } from "@/lib/utils/sleep"
 import { predict } from "@/app/api/providers/huggingface/predictWithHuggingFace"
@@ -52,7 +52,7 @@ Output: `
     turbo,
   })
 
-  console.log("api/v1/create(): rawString: ", rawString)
+  // console.log("api/v1/create(): rawString: ", rawString)
 
   let shots: LatentStory[] = []
 
@@ -71,7 +71,7 @@ Output: `
       turbo,
     })
 
-    console.log("api/v1/create(): rawString: ", rawString)
+    // console.log("api/v1/create(): rawString: ", rawString)
 
     maybeShots = parseRawStringToYAML<LatentStory[]>(rawString, [])
     if (!Array.isArray(maybeShots) || maybeShots.length === 0) {
@@ -94,11 +94,14 @@ Output: `
 
   const clap: ClapProject = newClap({
     meta: {
-      title: "Not needed", // we don't need a title actually
-      description: "This video has been generated using AI",
+      title: prompt.split(",").shift() || "",
+      description: prompt,
       synopsis: "",
       licence: "",
-      orientation: width > height ? "landscape" : height > width ? "portrait" : "square",
+      orientation:
+        width > height ? ClapMediaOrientation.LANDSCAPE :
+        height > width ? ClapMediaOrientation.PORTRAIT :
+        ClapMediaOrientation.SQUARE,
       width,
       height,
       isInteractive: false,
@@ -108,9 +111,9 @@ Output: `
     }
   })
 
-  for (const { title, image, voice } of shots) {
+  for (const { comment, image, voice } of shots) {
 
-    console.log(`api/v1/create(): - ${title}`)
+    console.log(`api/v1/create(): - ${comment}`)
 
     // note: it would be nice if we could have a convention saying that
     // track 0 is for videos and track 1 storyboards
@@ -123,16 +126,14 @@ Output: `
     // we should fix the Clap file editor to make it able to react videos
     // from any track number
 
-
-    /*
-    we disable it, because we don't generate animated videos yet
     clap.segments.push(newSegment({
       track: 0,
-      category: "video",
+      startTimeInMs: currentElapsedTimeInMs,
+      assetDurationInMs: defaultSegmentDurationInMs,
+      category: ClapSegmentCategory.VIDEO,
       prompt: image,
-      outputType: "video"
+      outputType: ClapOutputType.VIDEO,
     }))
-    */
 
     clap.segments.push(newSegment({
       track: 1,
@@ -148,9 +149,9 @@ Output: `
       startTimeInMs: currentElapsedTimeInMs,
       assetDurationInMs: defaultSegmentDurationInMs,
      category: ClapSegmentCategory.INTERFACE,
-      prompt: title,
-      // assetUrl: `data:text/plain;base64,${btoa(title)}`,
-      assetUrl: title,
+      prompt: comment,
+      // assetUrl: `data:text/plain;base64,${btoa(comment)}`,
+      assetUrl: comment,
      outputType: ClapOutputType.TEXT,
    }))
 
src/app/api/v1/create/route.ts CHANGED
@@ -18,7 +18,7 @@ export async function POST(req: NextRequest) {
     // can add more stuff for the V2 of Stories Factory
   }
 
-  console.log("[api/v1/create] request:", request)
+  // console.log("[api/v1/create] request:", request)
 
   const clap = await create({
     prompt: `${request?.prompt || ""}`.trim(),
src/app/api/v1/create/systemPrompt.ts CHANGED
@@ -13,14 +13,16 @@ You will be provided a "prompt" (for the story) and max number of images
 
 Each shot is composed of:
 
-- one title (which will be displayed as an overlay over the video, so keep it short eg. max 10/12 words),
+- one comment (which will be displayed as an overlay over the video, so keep it short eg. max 10/12 words),
 - one image (you must describe it using a Stable Diffusion prompt - about ~300 chars - using simple descriptive words and adjectives. Describe facts about characters, location, lights, texture, camera orientation, colors, clothes, movements etc. But don't give your opinion, don't talk about the emotions it evokes etc.)
 - one voice over (should be short too, about 10 to 15 words)
 
 # Important
 
-You MUST reply by writing/completing a YAML list of objects.
-Copy the structure of the examples, but not their content: come up with your own original ideal, you should be creativeç
+- You MUST reply by writing/completing a YAML list of objects.
+- Never use Markdown, and don't write anything after the end of the YAML.
+- In the image description, never give your interpretation of the meaning.
+- Copy the structure of the examples, but not their content: come up with your own original ideas, you should be creative!
 
 # Examples
 
@@ -30,14 +32,19 @@ or the user might omit to give the number (that's fine too, you can use 5 by default)
 but if the user asks for large numbers, it should be ignored (our limit is 32).
 
 \`\`\`
-- title: "my puppy is so cute when he sleeps 🐶"
+- comment: "my puppy is so cute when he sleeps 🐶"
   image: "close-up shot of a puppy sleeping in a bed, cute, instagram, award winning, vertical photo"
   voice: "look at my puppy, how cute he is. He is the cutest puppy in the world"
-- title: "wait.. noo not the milk 😭"
+- comment: "wait.. noo not the milk 😭"
   image: "medium-shot of a puppy spilling over milk on the kitchen floor, nice kitchen, spilled milk, guilty dog face, cute, dramatic, instagram, vertical photo"
   voice: "wait.. what are you doing.. nooo my milk"
-- title: "😭 please send help"
+- comment: "😭 please send help"
   image: "medium-shot of a puppy eating a cake, on the kitchen table, birthday cake, eating, cute, instagram, funny, messy, vertical photo"
   voice: "Now my dog is eating my birtday cake. Please send help."
 \`\`\`
+
+# Final guidelines
+
+- don't add generic comments like "intense action scene" etc. In this context, the comments MUST be funny and from the point of view of a young person (eg. a millennial, tired of adult life)
+- In the image text, don't say things like "giving a sense of.."
 `
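
The reply this prompt elicits is parsed straight back into typed objects; a minimal sketch of that round trip using the `parseRawStringToYAML` helper and `LatentStory` type from this commit (the helper's import path is assumed):

```ts
import { parseRawStringToYAML } from "@/app/api/parsers/parseRawStringToYAML"
import { LatentStory } from "@/app/api/v1/types"

// a reply shaped like the examples above
const rawString = `
- comment: "my puppy is so cute when he sleeps 🐶"
  image: "close-up shot of a puppy sleeping in a bed, cute, instagram"
  voice: "look at my puppy, how cute he is"
`

// falls back to an empty array when the model produced invalid YAML
const shots: LatentStory[] = parseRawStringToYAML<LatentStory[]>(rawString, [])
```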
src/app/api/v1/edit/dialogues/processShot.ts CHANGED
@@ -4,7 +4,8 @@ import {
   ClapSegment,
   getClapAssetSourceType,
   filterSegments,
-  ClapSegmentFilteringMode
+  ClapSegmentFilteringMode,
+  ClapSegmentCategory
 } from "@aitube/clap"
 import { ClapCompletionMode } from "@aitube/client"
 import { getSpeechBackgroundAudioPrompt } from "@aitube/engine"
@@ -27,13 +28,13 @@
 }): Promise<void> {
 
   const shotSegments: ClapSegment[] = filterSegments(
-    ClapSegmentFilteringMode.START,
+    ClapSegmentFilteringMode.BOTH,
     shotSegment,
     existingClap.segments
   )
 
   const shotDialogueSegments: ClapSegment[] = shotSegments.filter(s =>
-    s.category === "dialogue"
+    s.category === ClapSegmentCategory.DIALOGUE
   )
 
   let shotDialogueSegment: ClapSegment | undefined = shotDialogueSegments.at(0)
@@ -50,6 +51,7 @@
     audioId: getSpeechBackgroundAudioPrompt(
       shotSegments,
       existingClap.entityIndex,
+      // TODO: use the entity description if it exists
       ["high quality", "crisp", "detailed"]
     ),
     debug: true,
src/app/api/v1/edit/dialogues/route.ts CHANGED
@@ -26,10 +26,10 @@ export async function POST(req: NextRequest) {
 
   if (!existingClap?.segments) { throw new Error(`no segment found in the provided clap!`) }
 
-  console.log(`[api/edit/dialogues] detected ${existingClap.segments.length} segments`)
+  // console.log(`[api/edit/dialogues] detected ${existingClap.segments.length} segments`)
 
   const shotsSegments: ClapSegment[] = existingClap.segments.filter(s => s.category === ClapSegmentCategory.CAMERA)
-  console.log(`[api/edit/dialogues] detected ${shotsSegments.length} shots`)
+  // console.log(`[api/edit/dialogues] detected ${shotsSegments.length} shots`)
 
   if (shotsSegments.length > 32) {
     throw new Error(`Error, this endpoint being synchronous, it is designed for short stories only (max 32 shots).`)
src/app/api/v1/edit/entities/clapToLatentStory.ts CHANGED
@@ -23,7 +23,7 @@ export async function clapToLatentStory(clap: ClapProject): Promise<LatentStory[
     ClapSegmentCategory.STORYBOARD
   ).at(0)
 
-  const title = filterSegments(
+  const comment = filterSegments(
     ClapSegmentFilteringMode.START,
     shot,
     clap.segments,
@@ -38,7 +38,7 @@
   ).at(0)
 
   const latentStory: LatentStory = {
-    title: title.prompt,
+    comment: comment.prompt,
     image: image.prompt,
     voice: voice.prompt,
   }
src/app/api/v1/edit/entities/generateEntityPrompts.ts CHANGED
@@ -72,7 +72,7 @@ Now please generate the output entities:`
     turbo,
   })
 
-  console.log("generateEntityPrompts(): rawString: ", rawString)
+  // console.log("generateEntityPrompts(): rawString: ", rawString)
 
   let results: EntityPromptResult[] = []
 
@@ -91,7 +91,7 @@ Now please generate the output entities:`
       turbo,
     })
 
-    console.log("generateEntityPrompts(): rawString: ", rawString)
+    // console.log("generateEntityPrompts(): rawString: ", rawString)
 
     maybeEntities = parseRawStringToYAML<LatentEntity[]>(rawString, [])
     if (!Array.isArray(maybeEntities) || maybeEntities.length === 0) {
@@ -142,7 +142,7 @@ Now please generate the output entities:`
     throw new Error(`Hugging Face Inference API failure (the model failed to generate the entities)`)
   }
 
-  console.log(`generateEntityPrompts(): generated ${results.length} entities with their images and voice ids`)
+  // console.log(`generateEntityPrompts(): generated ${results.length} entities with their images and voice ids`)
 
   return results
 }
src/app/api/v1/edit/entities/index.ts CHANGED
@@ -55,7 +55,7 @@ export async function editEntities({
 
       imagePrompt: "",
       imageSourceType: getClapAssetSourceType(identityImage),
-      imageEngine: "SDXL Lightning",
+      imageEngine: "SD Lightning",
       imageId: identityImage,
       audioPrompt: "",
       audioSourceType: getClapAssetSourceType(identityVoice),
@@ -101,7 +101,7 @@
 
       imagePrompt: "",
       imageSourceType: getClapAssetSourceType(identityImage),
-      imageEngine: "SDXL Lightning",
+      imageEngine: "SD Lightning",
       imageId: identityImage,
       audioPrompt: "",
       audioSourceType: getClapAssetSourceType(identityVoice),
@@ -172,7 +172,7 @@
     }
   }
 
-  console.log(`api/edit/entities(): returning the newerClap`)
+  // console.log(`api/edit/entities(): returning the newerClap`)
 
   return newerClap
 }
src/app/api/v1/edit/entities/route.ts CHANGED
@@ -11,9 +11,7 @@ import { ClapCompletionMode } from "@aitube/client"
 import { parseTurbo } from "@/app/api/parsers/parseTurbo"
 
 export async function POST(req: NextRequest) {
-  console.log("Hello!")
   await throwIfInvalidToken(req.headers.get("Authorization"))
-  console.log("world!")
   const qs = queryString.parseUrl(req.url || "")
   const query = (qs || {}).query
 
@@ -40,7 +38,7 @@
     turbo,
   })
 
-  console.log(`[api/edit/entities] returning the newer clap extended with the entities`)
+  // console.log(`[api/edit/entities] returning the newer clap extended with the entities`)
 
   return new NextResponse(await serializeClap(newerClap), {
     status: 200,
src/app/api/v1/edit/entities/systemPrompt.ts CHANGED
@@ -10,6 +10,8 @@ Your mission is to generate a list of entities/assets associated with each shot.
 # Important
 
 - You MUST reply by writing/completing a YAML list of objects.
+- Don't use Markdown, and don't write anything after the end of the YAML.
+- Don't comment on the feeling a scene gives, don't give your interpretation of the meaning.
 - Copy the structure of the examples, but not their content: come up with your own original ideas. Be creative!
 
 # Output schema:
@@ -29,15 +31,15 @@ Given the following inputs:
 "A king goes to see a witch to ask if or how he can win an upcoming and challenging battle"
 \`\`\`yaml
 - shot: 1
-  title: "King Arthus seeks the witch's guidance to win his imminent battle."
+  comment: "King Arthus seeks the witch's guidance to win his imminent battle."
   image: "Establishing shot of KING ARTHUS, nervous, wet brown hair. dressed in golden armor and a colorful cape. His face reveals a mix of concern and determination. He's standing in the bright sunshine, inside a castle's courtyard, under cloudy skies. Behind him, a group of soldiers can be seen marching towards the castle gates."
   voice: "Dark sorceress of the shadows, it is time for you to serve your Lord. Tell me the augur, tell me what you foreknow. Tell me how I will cleave my ennemies to the bone, and ravage them in battle to come up victorious."
 - shot: 2
-  title: "The witch gives her counsel but warns of an unknown cost."
+  comment: "The witch gives her counsel but warns of an unknown cost."
   image: "close-up shot of THE WITCH, smiling cunningly, raising a finger while speaking. Background bokeh, dim lightning, menacing, mysterious."
   voice: "Your Majesty, this will be a bloody battle, but I espy a way to victory for you. But if my advice you follow, victory I foresee, although at a great cost it will be."
 - shot: 3
-  title: "The words of the witch are sinking in, but King Arthus tries to appear strong"
+  comment: "The words of the witch are sinking in, but King Arthus tries to appear strong"
   image: "close-up shot on KING ARTHUS, looking concerned, somber, false confidence"
   voice: "Witch with the wicked tongue, what must be done will be done. I will do everything for my people's sake. Speak now, make know the path to glory."
 \`\`\`
@@ -67,4 +69,6 @@ ${
   audio: "a sneering old woman, speaking with a hoarse and raspy voice. She is confident, hiding something."
   shots: [2]
 \`\`\`
+# Final guidelines
+Please don't generate any category other than "character" for now - thank you!
 `
src/app/api/v1/edit/storyboards/processShot.ts CHANGED
@@ -30,7 +30,7 @@ export async function processShot({
 }): Promise<void> {
 
   const shotSegments: ClapSegment[] = filterSegments(
-    ClapSegmentFilteringMode.START,
+    ClapSegmentFilteringMode.BOTH,
     shotSegment,
     existingClap.segments
   )
@@ -72,7 +72,7 @@
       existingClap.entityIndex,
       ["high quality", "crisp", "detailed"]
     )
-    console.log(`[api/v1/edit/storyboards] processShot: generating storyboard prompt: ${shotStoryboardSegment.prompt}`)
+    // console.log(`[api/v1/edit/storyboards] processShot: generating storyboard prompt: ${shotStoryboardSegment.prompt}`)
   }
 
   // TASK 3: GENERATE MISSING STORYBOARD BITMAP
src/app/api/v1/edit/storyboards/route.ts CHANGED
@@ -31,10 +31,10 @@ export async function POST(req: NextRequest) {
 
   if (!existingClap?.segments) { throw new Error(`no segment found in the provided clap!`) }
 
-  console.log(`api/v1/edit/storyboards(): detected ${existingClap.segments.length} segments`)
+  // console.log(`api/v1/edit/storyboards(): detected ${existingClap.segments.length} segments`)
 
   const shotsSegments: ClapSegment[] = existingClap.segments.filter(s => s.category === ClapSegmentCategory.CAMERA)
-  console.log(`api/v1/edit/storyboards(): detected ${shotsSegments.length} shots`)
+  // console.log(`api/v1/edit/storyboards(): detected ${shotsSegments.length} shots`)
 
   if (shotsSegments.length > 32) {
     throw new Error(`Error, this endpoint being synchronous, it is designed for short stories only (max 32 shots).`)
src/app/api/v1/edit/videos/generateVideo.ts DELETED
@@ -1,63 +0,0 @@
-import { generateSeed, getValidNumber } from "@aitube/clap"
-
-import { newRender, getRender } from "@/app/api/providers/videochain/renderWithVideoChain"
-import { sleep } from "@/lib/utils/sleep"
-import { getNegativePrompt, getPositivePrompt } from "@/app/api/utils/imagePrompts"
-
-export async function generateVideo({
-  prompt,
-  // negativePrompt,
-  width,
-  height,
-  seed,
-  turbo = false,
-}: {
-  prompt: string
-  // negativePrompt?: string
-  width?: number
-  height?: number
-  seed?: number
-  turbo?: boolean
-}): Promise<string> {
-
-  // we want to keep it vertical
-  width = getValidNumber(width, 256, 8192, 288)
-  height = getValidNumber(height, 256, 8192, 512)
-
-  // console.log("calling await newRender")
-  prompt = getPositivePrompt(prompt)
-  const negativePrompt = getNegativePrompt()
-
-  let render = await newRender({
-    prompt,
-    negativePrompt,
-    nbFrames: 80,
-    nbFPS: 24,
-    nbSteps: turbo ? 4 : 8,
-    width,
-    height,
-    turbo,
-    shouldRenewCache: true,
-    seed: seed || generateSeed()
-  })
-
-  let attempts = 10
-
-  while (attempts-- > 0) {
-    if (render.status === "completed") {
-      return render.assetUrl
-    }
-
-    if (render.status === "error") {
-      console.error(render.error)
-      throw new Error(`failed to generate the video file ${render.error}`)
-    }
-
-    await sleep(2000) // minimum wait time
-
-    // console.log("asking getRender")
-    render = await getRender(render.renderId)
-  }
-
-  throw new Error(`failed to generate the video file`)
-}
src/app/api/v1/edit/videos/processShot.ts CHANGED
@@ -5,15 +5,17 @@ import {
   getClapAssetSourceType,
   newSegment,
   filterSegments,
-  ClapSegmentFilteringMode
+  ClapSegmentFilteringMode,
+  ClapOutputType,
+  ClapSegmentCategory,
+  parseMediaOrientation
 } from "@aitube/clap"
 import { ClapCompletionMode } from "@aitube/client"
 import { getVideoPrompt } from "@aitube/engine"
 
 import { getPositivePrompt } from "@/app/api/utils/imagePrompts"
 
-import { generateVideo } from "./generateVideo"
-
+import { render } from "@/app/api/v1/render"
 
 export async function processShot({
   shotSegment,
@@ -29,17 +31,19 @@
   turbo: boolean
 }): Promise<void> {
   const shotSegments: ClapSegment[] = filterSegments(
-    ClapSegmentFilteringMode.START,
+    ClapSegmentFilteringMode.BOTH,
     shotSegment,
     existingClap.segments
   )
 
   const shotVideoSegments: ClapSegment[] = shotSegments.filter(s =>
-    s.category === "video"
+    s.category === ClapSegmentCategory.VIDEO
   )
 
   let shotVideoSegment: ClapSegment | undefined = shotVideoSegments.at(0)
 
+  // console.log("bug here?", turbo)
+
   console.log(`[api/edit/videos] processShot: shot [${shotSegment.startTimeInMs}:${shotSegment.endTimeInMs}] has ${shotSegments.length} segments (${shotVideoSegments.length} videos)`)
 
   // TASK 1: GENERATE MISSING VIDEO SEGMENT
@@ -49,10 +53,10 @@
       startTimeInMs: shotSegment.startTimeInMs,
       endTimeInMs: shotSegment.endTimeInMs,
       assetDurationInMs: shotSegment.assetDurationInMs,
-      category: "video",
+      category: ClapSegmentCategory.VIDEO,
       prompt: "",
       assetUrl: "",
-      outputType: "video"
+      outputType: ClapOutputType.VIDEO
     })
 
     // we fix the existing clap
@@ -81,14 +85,38 @@
 
   // TASK 3: GENERATE MISSING VIDEO FILE
   if (!shotVideoSegment.assetUrl) {
-    console.log(`[api/edit/videos] processShot: generating video file..`)
-
+    // console.log(`[api/edit/videos] processShot: generating video file..`)
+
+    const debug = true
+
+    let width = existingClap.meta.width
+    let height = existingClap.meta.height
+
+    // if (turbo) {
+    //   width = Math.round(width / 2)
+    //   height = Math.round(height / 2)
+    // }
+
+    if (width > height) {
+      width = 512
+      height = 288
+    } else if (width < height) {
+      width = 288
+      height = 512
+    } else {
+      width = 512
+      height = 512
+    }
     try {
-      shotVideoSegment.assetUrl = await generateVideo({
+      shotVideoSegment.assetUrl = await render({
         prompt: getPositivePrompt(shotVideoSegment.prompt),
-        width: existingClap.meta.width,
-        height: existingClap.meta.height,
-        turbo,
+        seed: shotSegment.seed,
+        width,
+        height,
+        nbFrames: 80,
+        nbFPS: 24,
+        nbSteps: 4, // turbo ? 4 : 8,
+        debug,
       })
       shotVideoSegment.assetSourceType = getClapAssetSourceType(shotVideoSegment.assetUrl)
    } catch (err) {
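
The new width/height logic above snaps every request to one of three small, AnimateDiff-friendly resolutions while preserving orientation; the same rule as a standalone sketch (the helper name is ours, not part of the commit):

```ts
// keep the clip's orientation but clamp to a fixed render size
function clampRenderSize(width: number, height: number): { width: number; height: number } {
  if (width > height) { return { width: 512, height: 288 } } // landscape
  if (width < height) { return { width: 288, height: 512 } } // portrait
  return { width: 512, height: 512 }                         // square
}
```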
src/app/api/v1/edit/videos/route.ts CHANGED
@@ -8,6 +8,7 @@ import { throwIfInvalidToken } from "@/app/api/v1/auth/throwIfInvalidToken"
 
 import { processShot } from "./processShot"
 import { parseTurbo } from "@/app/api/parsers/parseTurbo"
+import { sleep } from "@/lib/utils/sleep"
 
 // a helper to generate videos for a Clap
 // this is mostly used by external apps such as the Stories Factory
@@ -31,11 +32,13 @@
 
   if (!existingClap?.segments) { throw new Error(`no segment found in the provided clap!`) }
 
-  console.log(`api/edit/videos(): detected ${existingClap.segments.length} segments`)
+  // console.log(`api/edit/videos(): detected ${existingClap.segments.length} segments`)
 
   const shotsSegments: ClapSegment[] = existingClap.segments.filter(s => s.category === ClapSegmentCategory.CAMERA)
-  console.log(`api/edit/videos(): detected ${shotsSegments.length} shots`)
-
+
+  // console.log(`api/edit/videos(): detected ${shotsSegments.length} shots`)
+
+
   if (shotsSegments.length > 32) {
     throw new Error(`Error, this endpoint being synchronous, it is designed for short stories only (max 32 shots).`)
   }
@@ -55,7 +58,21 @@
     })
   ))
 
-  console.log(`api/edit/videos(): returning the clap augmented with videos`)
+  // we currently have some parallelism issues..
+  /*
+  for (const shotSegment of shotsSegments) {
+    await processShot({
+      shotSegment,
+      existingClap,
+      newerClap,
+      mode,
+      turbo,
+    })
+    await sleep(500)
+  }
+  */
+
+  // `api/edit/videos(): returning the clap augmented with videos`)
 
   return new NextResponse(await serializeClap(newerClap), {
     status: 200,
src/app/api/v1/export/route.ts CHANGED
@@ -20,7 +20,7 @@ export async function POST(req: NextRequest, res: NextResponse) {
   // or rather, the non-turbo mode could be the one where we upscale
 
   // let's call our micro-service, which is currently open bar.
-  console.log("[api/v1/export] sending blob to ai-tube-clap-exporter.hf.space")
+  // console.log("[api/v1/export] sending blob to ai-tube-clap-exporter.hf.space")
 
   const result = await fetch(
     `https://jbilcke-hf-ai-tube-clap-exporter.hf.space?f=${format}`,
src/app/api/v1/render/cluster.ts ADDED
@@ -0,0 +1,50 @@
+import { sleep } from "@/lib/utils/sleep"
+
+export type ClusterMachine = {
+  id: number
+  url: string
+  busy: boolean
+}
+
+export const nbClusterMachines = 3
+// make sure the machines are running!!
+
+// https://huggingface.co/spaces/jbilcke-hf/ai-tube-model-adl-1/settings
+// https://huggingface.co/spaces/jbilcke-hf/ai-tube-model-adl-2/settings
+// https://huggingface.co/spaces/jbilcke-hf/ai-tube-model-adl-3/settings
+
+// we maintain a global cluster state
+
+export const clusterMachines: ClusterMachine[] = []
+for (let i = 0; i < nbClusterMachines; i++) {
+  clusterMachines.push({
+    id: i,
+    url: `https://jbilcke-hf-ai-tube-model-adl-${i + 1}.hf.space`,
+    busy: false
+  })
+}
+
+export async function getClusterMachine(maxWaitTimeInMs: number = 10000): Promise<ClusterMachine> {
+  let clusterMachine: ClusterMachine | undefined = undefined
+  let timeSpentWaitingInMs = 0
+  const intervalInMs = 500
+
+  while (true) {
+    clusterMachine = clusterMachines.find(m => !m.busy)
+    if (clusterMachine) { break }
+    if (timeSpentWaitingInMs > maxWaitTimeInMs) { break }
+    await sleep(intervalInMs)
+    timeSpentWaitingInMs += intervalInMs
+  }
+
+  if (!clusterMachine) {
+    throw new Error(`failed to find a cluster machine within ${maxWaitTimeInMs/1000} seconds`)
+  }
+
+  // change the global state
+  clusterMachine.busy = true
+
+  return clusterMachine
+}
+
+export const token = `${process.env.MICROSERVICE_API_SECRET_TOKEN || ""}`
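
The `render()` helper added below consumes this pool with a claim/release cycle; a minimal sketch of the intended usage (the wrapper function is illustrative):

```ts
import { getClusterMachine } from "@/app/api/v1/render/cluster"

// claim a machine, run the job, and always release the busy flag
async function withClusterMachine<T>(job: (url: string) => Promise<T>): Promise<T> {
  const machine = await getClusterMachine() // waits up to ~10s for a free machine
  try {
    return await job(machine.url)
  } finally {
    machine.busy = false // important: free the machine for the next caller
  }
}
```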
src/app/api/v1/render/index.ts ADDED
@@ -0,0 +1,121 @@
+import { generateSeed, getValidNumber } from "@aitube/clap"
+import { getClusterMachine, token } from "./cluster"
+
+export async function render(request: {
+  prompt?: string
+  seed?: number
+  width?: number
+  height?: number
+  nbFrames?: number
+  nbFPS?: number
+  nbSteps?: number
+  debug?: boolean
+}): Promise<string> {
+
+  const prompt = request.prompt || ""
+  if (!prompt) {
+    throw new Error(`missing prompt`)
+  }
+
+  const debug = !!request.debug
+
+  const seed = request?.seed || generateSeed()
+
+  // see https://huggingface.co/spaces/jbilcke-hf/ai-tube-model-animatediff-lightning/blob/main/app.py#L15-L18
+  const baseModel = "epiCRealism"
+
+  // the motion LoRA - could be useful one day
+  const motion = ""
+
+  // can be 1, 2, 4 or 8
+  // but values below 4 look bad
+  const nbSteps = getValidNumber(request.nbSteps, 1, 8, 4)
+  const width = getValidNumber(request.width, 256, 1024, 512)
+  const height = getValidNumber(request.height, 256, 1024, 288)
+
+  const nbFrames = getValidNumber(request.nbFrames, 10, 120, 10)
+  const nbFPS = getValidNumber(request.nbFPS, 10, 120, 10)
+
+  // by default AnimateDiff generates about 2 seconds of video at 10 fps
+  // the Gradio API now has some code to optionally fix that using FFmpeg,
+  // but this will add some delay overhead, so use with care!
+  const durationInSec = Math.round(nbFrames / nbFPS)
+  const framesPerSec = nbFPS
+
+  const machine = await getClusterMachine()
+
+  try {
+    if (debug) {
+      console.log(`calling AnimateDiff Lightning API with params (some are hidden):`, {
+        baseModel,
+        motion,
+        nbSteps,
+        width,
+        height,
+        nbFrames,
+        nbFPS,
+        durationInSec,
+        framesPerSec,
+      })
+    }
+
+    const res = await fetch(machine.url + (machine.url.endsWith("/") ? "" : "/") + "api/predict", {
+      method: "POST",
+      headers: {
+        "Content-Type": "application/json",
+        // Authorization: `Bearer ${token}`,
+      },
+      body: JSON.stringify({
+        fn_index: 0, // <- important! it is currently 4, not 1!
+        data: [
+          token,
+          prompt,
+          baseModel,
+          width,
+          height,
+          motion,
+          nbSteps,
+          durationInSec,
+          framesPerSec,
+        ],
+      }),
+
+      // necessary since we are using the fetch() provided by NextJS
+      cache: "no-store",
+
+      // we can also use this (see https://vercel.com/blog/vercel-cache-api-nextjs-cache)
+      // next: { revalidate: 1 }
+    })
+
+    // console.log("res:", res)
+
+    const { data } = await res.json()
+
+    // console.log("data:", data)
+    // Recommendation: handle errors
+    if (res.status !== 200 || !Array.isArray(data)) {
+      // This will activate the closest `error.js` Error Boundary
+      throw new Error(`Failed to fetch data (status: ${res.status})`)
+    }
+    // console.log("data:", data.slice(0, 50))
+
+    const base64Content = (data?.[0] || "") as string
+
+    if (!base64Content) {
+      throw new Error(`invalid response (no content)`)
+    }
+
+    // this API already emits a data-uri with a content type
+    // addBase64HeaderToMp4(base64Content)
+    return base64Content
+  } catch (err) {
+    if (debug) {
+      console.error(`failed to call the AnimateDiff Lightning API:`)
+      console.error(err)
+    }
+    throw err
+  } finally {
+    // important: we need to free up the machine!
+    machine.busy = false
+  }
+}
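
For reference, this is roughly how the edited `processShot` above invokes the new helper; the values mirror that call site, while the wrapper function itself is illustrative:

```ts
import { render } from "@/app/api/v1/render"

// returns a data-URI: the Gradio Space replies with base64-encoded video
async function renderShot(prompt: string, seed?: number): Promise<string> {
  return render({
    prompt,
    seed,
    width: 512,   // the landscape clamp used by processShot
    height: 288,
    nbFrames: 80, // ~3.3 seconds at 24 fps
    nbFPS: 24,
    nbSteps: 4,   // AnimateDiff Lightning supports 1/2/4/8; below 4 looks bad
    debug: true,
  })
}
```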
src/app/api/v1/render/route.ts ADDED
@@ -0,0 +1,56 @@
+import { NextResponse, NextRequest } from "next/server"
+import queryString from "query-string"
+import { ClapMediaOrientation, getValidNumber } from "@aitube/clap"
+
+import { throwIfInvalidToken } from "@/app/api/v1/auth/throwIfInvalidToken"
+import { getContentType } from "@/lib/data/getContentType"
+
+import { render } from "."
+
+export async function POST(req: NextRequest, res: NextResponse) {
+  await throwIfInvalidToken(req.headers.get("Authorization"))
+
+  const request = await req.json() as {
+    prompt: string
+    width: number
+    height: number
+    turbo: boolean
+    // can add more stuff for the V2 of Stories Factory
+  }
+
+  console.log("[api/v1/render] request:", request)
+
+  const qs = queryString.parseUrl(req.url || "")
+  const query = (qs || {}).query
+
+  const turbo = !!query?.turbo
+
+  const prompt = `${request?.prompt || ""}`.trim()
+  const width = getValidNumber(request?.width, 256, 8192, 1024)
+  const height = getValidNumber(request?.height, 256, 8192, 576)
+  const nbFrames = 80
+  const nbFPS = 24
+  const nbSteps = turbo ? 4 : 8
+  const debug = true
+
+  const assetUrl = await render({
+    prompt,
+    width,
+    height,
+    nbFrames,
+    nbFPS,
+    nbSteps,
+    debug,
+  })
+
+  const contentType = getContentType(assetUrl)
+  const base64String = assetUrl.split(";base64,").pop() || ""
+  const data = Buffer.from(base64String, "base64")
+  const headers = new Headers()
+  headers.set('Content-Type', contentType)
+  return new NextResponse(data, {
+    status: 200,
+    statusText: "OK",
+    headers
+  })
+}
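
The last few lines of this route turn the data-URI returned by `render()` into a binary HTTP response; the same trick as a standalone sketch (it inlines the content-type parsing that `getContentType` handles in the real code):

```ts
// convert "data:video/mp4;base64,..." into a binary Response
function dataUriToResponse(assetUrl: string): Response {
  const contentType = assetUrl.slice(5, assetUrl.indexOf(";")) // e.g. "video/mp4"
  const base64String = assetUrl.split(";base64,").pop() || ""
  const data = Buffer.from(base64String, "base64")
  const headers = new Headers()
  headers.set("Content-Type", contentType)
  return new Response(data, { status: 200, statusText: "OK", headers })
}
```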
src/app/api/v1/search/index.ts CHANGED
@@ -29,7 +29,7 @@ export async function search({
     prefix: "```yaml\n",
   })
 
-  console.log("rawString: ", rawString)
+  // console.log("rawString: ", rawString)
 
   const results = parseRawStringToYAML<BasicSearchResult[]>(rawString, [])
 
@@ -52,7 +52,7 @@ export async function extend({
     prefix: "```yaml\n",
   })
 
-  console.log("rawString: ", rawString)
+  // console.log("rawString: ", rawString)
 
   const results = parseRawStringToYAML<ExtendedSearchResult[]>(rawString, [])
 
src/app/api/v1/types.ts CHANGED
@@ -9,7 +9,7 @@ export type LatentEntity = {
 }
 
 export type LatentStory = {
-  title: string
+  comment: string
   image: string
   voice: string
 }
src/lib/on-device-ai/classifyFrame.ts CHANGED
@@ -38,10 +38,10 @@ const globalState: { classifier?: InteractiveImageClassifier } = {};
 })();
 
 export async function classifyFrame(frame: TexImageSource, x: number, y: number): Promise<ImageClassifierResult> {
-  console.log("classifyFrame: loading classifier..")
+  // console.log("classifyFrame: loading classifier..")
   globalState.classifier = globalState.classifier || (await getInteractiveImageClassifier())
 
-  console.log("classifyFrame: segmenting..")
+  // console.log("classifyFrame: segmenting..")
   return globalState.classifier(frame, x, y)
 }
 
src/lib/on-device-ai/getSegmentationCanvas.tsx CHANGED
@@ -26,7 +26,7 @@ export async function getSegmentationCanvas({
     height: `${height}px`,
   };
 
-  console.log("canvas:", canvas)
+  // console.log("canvas:", canvas)
   const CanvasComponent = () => (
     <canvas
       ref={(node) => {