jbilcke-hf HF staff commited on
Commit
8919651
1 Parent(s): 5513dc6

eh, not bad for a side project

Browse files
Files changed (34) hide show
  1. package-lock.json +19 -16
  2. package.json +1 -1
  3. src/app/api/generators/image/generateImageWithVideochain.ts +4 -1
  4. src/app/api/generators/search/unknownObjectToLatentSearchResults.ts +4 -4
  5. src/app/api/parsers/parseCompletionMode.ts +19 -4
  6. src/app/api/parsers/parseEntityPrompts.ts +11 -0
  7. src/app/api/parsers/parseSupportedExportFormat.ts +16 -0
  8. src/app/api/parsers/{parseString.ts → parseTrimmedString.ts} +1 -1
  9. src/app/api/v1/auth/config.ts +5 -0
  10. src/app/api/{auth → v1/auth}/getToken.ts +5 -5
  11. src/app/api/v1/auth/parseToken.ts +7 -0
  12. src/app/api/v1/auth/throwIfInvalidToken.ts +22 -0
  13. src/app/api/v1/create/index.ts +15 -11
  14. src/app/api/v1/create/route.ts +5 -2
  15. src/app/api/v1/create/systemPrompt.ts +15 -11
  16. src/app/api/v1/create/types.ts +0 -6
  17. src/app/api/v1/edit/dialogues/processShot.ts +9 -4
  18. src/app/api/v1/edit/dialogues/route.ts +4 -4
  19. src/app/api/v1/edit/entities/clapToLatentStory.ts +50 -0
  20. src/app/api/v1/edit/entities/generateEntityPrompts.ts +135 -0
  21. src/app/api/v1/edit/entities/generateImageID.ts +0 -1
  22. src/app/api/v1/edit/entities/index.ts +97 -7
  23. src/app/api/v1/edit/entities/route.ts +7 -3
  24. src/app/api/v1/edit/entities/systemPrompt.ts +62 -1
  25. src/app/api/v1/edit/storyboards/processShot.ts +10 -3
  26. src/app/api/v1/edit/storyboards/route.ts +6 -7
  27. src/app/api/v1/edit/types.ts +0 -8
  28. src/app/api/v1/edit/videos/processShot.ts +11 -3
  29. src/app/api/v1/edit/videos/route.ts +8 -10
  30. src/app/api/v1/export/route.ts +4 -9
  31. src/app/api/v1/types.ts +15 -0
  32. src/app/latent/search/page.tsx +1 -1
  33. src/app/latent/watch/page.tsx +1 -1
  34. src/types/general.ts +2 -0
package-lock.json CHANGED
@@ -9,7 +9,7 @@
9
  "version": "0.0.0",
10
  "dependencies": {
11
  "@aitube/clap": "0.0.10",
12
- "@aitube/client": "0.0.12",
13
  "@aitube/engine": "0.0.2",
14
  "@huggingface/hub": "0.12.3-oauth",
15
  "@huggingface/inference": "^2.6.7",
@@ -129,9 +129,12 @@
129
  }
130
  },
131
  "node_modules/@aitube/client": {
132
- "version": "0.0.12",
133
- "resolved": "https://registry.npmjs.org/@aitube/client/-/client-0.0.12.tgz",
134
- "integrity": "sha512-b/QFTtAKwr7H5dMSco+iXhwJRpPw/sT487EGpNjDbuQamIJ3FqdlVMTC/c5jdX8meFp+m35n/dY58Iy39Lle5A==",
 
 
 
135
  "peerDependencies": {
136
  "@aitube/clap": "0.0.10"
137
  }
@@ -922,9 +925,9 @@
922
  }
923
  },
924
  "node_modules/@floating-ui/dom": {
925
- "version": "1.6.4",
926
- "resolved": "https://registry.npmjs.org/@floating-ui/dom/-/dom-1.6.4.tgz",
927
- "integrity": "sha512-0G8R+zOvQsAG1pg2Q99P21jiqxqGBW1iRe/iXHsBRBxnpXKFI8QwbB4x5KmYLggNO5m34IQgOIu9SCRfR/WWiQ==",
928
  "dependencies": {
929
  "@floating-ui/core": "^1.0.0",
930
  "@floating-ui/utils": "^0.2.0"
@@ -2958,9 +2961,9 @@
2958
  "integrity": "sha512-dRLjCWHYg4oaA77cxO64oO+7JwCwnIzkZPdrrC71jQmQtlhM556pwKo5bUzqvZndkVbeFLIIi+9TC40JNF5hNQ=="
2959
  },
2960
  "node_modules/@types/lodash": {
2961
- "version": "4.17.0",
2962
- "resolved": "https://registry.npmjs.org/@types/lodash/-/lodash-4.17.0.tgz",
2963
- "integrity": "sha512-t7dhREVv6dbNj0q17X12j7yDG4bD/DHYX7o5/DbDxobP0HnGPgpRz2Ej77aL7TZT3DSw13fqUTj8J4mMnqa7WA=="
2964
  },
2965
  "node_modules/@types/lodash.debounce": {
2966
  "version": "4.0.9",
@@ -3740,9 +3743,9 @@
3740
  }
3741
  },
3742
  "node_modules/caniuse-lite": {
3743
- "version": "1.0.30001615",
3744
- "resolved": "https://registry.npmjs.org/caniuse-lite/-/caniuse-lite-1.0.30001615.tgz",
3745
- "integrity": "sha512-1IpazM5G3r38meiae0bHRnPhz+CBQ3ZLqbQMtrg+AsTPKAXgW38JNsXkyZ+v8waCsDmPq87lmfun5Q2AGysNEQ==",
3746
  "funding": [
3747
  {
3748
  "type": "opencollective",
@@ -6354,9 +6357,9 @@
6354
  }
6355
  },
6356
  "node_modules/minipass": {
6357
- "version": "7.0.4",
6358
- "resolved": "https://registry.npmjs.org/minipass/-/minipass-7.0.4.tgz",
6359
- "integrity": "sha512-jYofLM5Dam9279rdkWzqHozUo4ybjdZmCsDHePy5V/PbBcVMiSZR97gmAy45aqi8CK1lG2ECd356FU86avfwUQ==",
6360
  "engines": {
6361
  "node": ">=16 || 14 >=14.17"
6362
  }
 
9
  "version": "0.0.0",
10
  "dependencies": {
11
  "@aitube/clap": "0.0.10",
12
+ "@aitube/client": "0.0.15",
13
  "@aitube/engine": "0.0.2",
14
  "@huggingface/hub": "0.12.3-oauth",
15
  "@huggingface/inference": "^2.6.7",
 
129
  }
130
  },
131
  "node_modules/@aitube/client": {
132
+ "version": "0.0.15",
133
+ "resolved": "https://registry.npmjs.org/@aitube/client/-/client-0.0.15.tgz",
134
+ "integrity": "sha512-lGmdsBqjNVStBxZSH+Iig/nOyPdSpqpqU6M0OvOBMTwR4rohSvIQ7TnFJGvoc4WEFciNoCc6Vg6Q5W99ovG+fg==",
135
+ "dependencies": {
136
+ "query-string": "^9.0.0"
137
+ },
138
  "peerDependencies": {
139
  "@aitube/clap": "0.0.10"
140
  }
 
925
  }
926
  },
927
  "node_modules/@floating-ui/dom": {
928
+ "version": "1.6.5",
929
+ "resolved": "https://registry.npmjs.org/@floating-ui/dom/-/dom-1.6.5.tgz",
930
+ "integrity": "sha512-Nsdud2X65Dz+1RHjAIP0t8z5e2ff/IRbei6BqFrl1urT8sDVzM1HMQ+R0XcU5ceRfyO3I6ayeqIfh+6Wb8LGTw==",
931
  "dependencies": {
932
  "@floating-ui/core": "^1.0.0",
933
  "@floating-ui/utils": "^0.2.0"
 
2961
  "integrity": "sha512-dRLjCWHYg4oaA77cxO64oO+7JwCwnIzkZPdrrC71jQmQtlhM556pwKo5bUzqvZndkVbeFLIIi+9TC40JNF5hNQ=="
2962
  },
2963
  "node_modules/@types/lodash": {
2964
+ "version": "4.17.1",
2965
+ "resolved": "https://registry.npmjs.org/@types/lodash/-/lodash-4.17.1.tgz",
2966
+ "integrity": "sha512-X+2qazGS3jxLAIz5JDXDzglAF3KpijdhFxlf/V1+hEsOUc+HnWi81L/uv/EvGuV90WY+7mPGFCUDGfQC3Gj95Q=="
2967
  },
2968
  "node_modules/@types/lodash.debounce": {
2969
  "version": "4.0.9",
 
3743
  }
3744
  },
3745
  "node_modules/caniuse-lite": {
3746
+ "version": "1.0.30001616",
3747
+ "resolved": "https://registry.npmjs.org/caniuse-lite/-/caniuse-lite-1.0.30001616.tgz",
3748
+ "integrity": "sha512-RHVYKov7IcdNjVHJFNY/78RdG4oGVjbayxv8u5IO74Wv7Hlq4PnJE6mo/OjFijjVFNy5ijnCt6H3IIo4t+wfEw==",
3749
  "funding": [
3750
  {
3751
  "type": "opencollective",
 
6357
  }
6358
  },
6359
  "node_modules/minipass": {
6360
+ "version": "7.1.0",
6361
+ "resolved": "https://registry.npmjs.org/minipass/-/minipass-7.1.0.tgz",
6362
+ "integrity": "sha512-oGZRv2OT1lO2UF1zUcwdTb3wqUwI0kBGTgt/T7OdSj6M6N5m3o5uPf0AIW6lVxGGoiWUR7e2AwTE+xiwK8WQig==",
6363
  "engines": {
6364
  "node": ">=16 || 14 >=14.17"
6365
  }
package.json CHANGED
@@ -11,7 +11,7 @@
11
  },
12
  "dependencies": {
13
  "@aitube/clap": "0.0.10",
14
- "@aitube/client": "0.0.12",
15
  "@aitube/engine": "0.0.2",
16
  "@huggingface/hub": "0.12.3-oauth",
17
  "@huggingface/inference": "^2.6.7",
 
11
  },
12
  "dependencies": {
13
  "@aitube/clap": "0.0.10",
14
+ "@aitube/client": "0.0.15",
15
  "@aitube/engine": "0.0.2",
16
  "@huggingface/hub": "0.12.3-oauth",
17
  "@huggingface/inference": "^2.6.7",
src/app/api/generators/image/generateImageWithVideochain.ts CHANGED
@@ -10,6 +10,7 @@ const apiKey = `${process.env.VIDEOCHAIN_API_KEY || ""}`
10
  export async function newRender({
11
  prompt,
12
  negativePrompt,
 
13
  nbFrames,
14
  nbSteps,
15
  width,
@@ -20,6 +21,7 @@ export async function newRender({
20
  }: {
21
  prompt: string
22
  negativePrompt: string
 
23
  nbFrames: number
24
  nbSteps: number
25
  width: number
@@ -61,6 +63,7 @@ export async function newRender({
61
  body: JSON.stringify({
62
  prompt,
63
  negativePrompt,
 
64
  // nbFrames: 8 and nbSteps: 15 --> ~10 sec generation
65
  nbFrames, // when nbFrames is 1, we will only generate static images
66
  nbSteps, // 20 = fast, 30 = better, 50 = best
@@ -72,7 +75,7 @@ export async function newRender({
72
  upscalingFactor: 1, // let's disable upscaling right now
73
  turbo, // always use turbo mode (it's for images only anyway)
74
  // also what could be done iw that we could use the width and height to control this
75
- cache: shouldRenewCache ? "renew" : "use"
76
  } as Partial<RenderRequest>),
77
  cache: 'no-store',
78
  // we can also use this (see https://vercel.com/blog/vercel-cache-api-nextjs-cache)
 
10
  export async function newRender({
11
  prompt,
12
  negativePrompt,
13
+ identityImage,
14
  nbFrames,
15
  nbSteps,
16
  width,
 
21
  }: {
22
  prompt: string
23
  negativePrompt: string
24
+ identityImage: string
25
  nbFrames: number
26
  nbSteps: number
27
  width: number
 
63
  body: JSON.stringify({
64
  prompt,
65
  negativePrompt,
66
+ identityImage,
67
  // nbFrames: 8 and nbSteps: 15 --> ~10 sec generation
68
  nbFrames, // when nbFrames is 1, we will only generate static images
69
  nbSteps, // 20 = fast, 30 = better, 50 = best
 
75
  upscalingFactor: 1, // let's disable upscaling right now
76
  turbo, // always use turbo mode (it's for images only anyway)
77
  // also what could be done iw that we could use the width and height to control this
78
+ cache: shouldRenewCache ? "renew" : "use",
79
  } as Partial<RenderRequest>),
80
  cache: 'no-store',
81
  // we can also use this (see https://vercel.com/blog/vercel-cache-api-nextjs-cache)
src/app/api/generators/search/unknownObjectToLatentSearchResults.ts CHANGED
@@ -1,6 +1,6 @@
1
  import { generateSeed } from "@aitube/clap"
2
 
3
- import { parseString } from "../../parsers/parseString"
4
  import { parseStringArray } from "../../parsers/parseStringArray"
5
  import { LatentSearchResult, LatentSearchResults } from "./types"
6
 
@@ -9,9 +9,9 @@ export function unknownObjectToLatentSearchResults(something: any): LatentSearch
9
 
10
  if (Array.isArray(something)) {
11
  results = something.map(thing => ({
12
- label: parseString(thing && (thing?.label || thing?.title)),
13
- summary: parseString(thing && (thing?.summary || thing?.description || thing?.synopsis)),
14
- thumbnail: parseString(thing && (thing?.thumbnail)),
15
  tags: parseStringArray(thing && (thing?.tag)),
16
  seed: generateSeed(), // a seed is necessary for consistency between search results and viewer
17
  } as LatentSearchResult))
 
1
  import { generateSeed } from "@aitube/clap"
2
 
3
+ import { parseTrimmedString } from "../../parsers/parseTrimmedString"
4
  import { parseStringArray } from "../../parsers/parseStringArray"
5
  import { LatentSearchResult, LatentSearchResults } from "./types"
6
 
 
9
 
10
  if (Array.isArray(something)) {
11
  results = something.map(thing => ({
12
+ label: parseTrimmedString(thing && (thing?.label || thing?.title)),
13
+ summary: parseTrimmedString(thing && (thing?.summary || thing?.description || thing?.synopsis)),
14
+ thumbnail: parseTrimmedString(thing && (thing?.thumbnail)),
15
  tags: parseStringArray(thing && (thing?.tag)),
16
  seed: generateSeed(), // a seed is necessary for consistency between search results and viewer
17
  } as LatentSearchResult))
src/app/api/parsers/parseCompletionMode.ts CHANGED
@@ -1,10 +1,25 @@
1
- import { ClapCompletionMode } from "../v1/edit/types"
2
 
3
- export function parseCompletionMode(input?: any, defaultMode: ClapCompletionMode = "partial"): ClapCompletionMode {
 
 
 
4
  let mode = defaultMode
 
5
  try {
6
- let maybeMode = decodeURIComponent(`${input || ""}` || defaultMode).trim()
7
- mode = ["partial", "full"].includes(maybeMode) ? (maybeMode as ClapCompletionMode) : defaultMode
 
 
 
 
 
 
8
  } catch (err) {}
 
 
 
 
 
9
  return mode
10
  }
 
1
+ import { ClapCompletionMode } from "@aitube/client"
2
 
3
+ export function parseCompletionMode(
4
+ input?: any,
5
+ defaultMode: ClapCompletionMode = ClapCompletionMode.PARTIAL
6
+ ): ClapCompletionMode {
7
  let mode = defaultMode
8
+
9
  try {
10
+ let maybeMode = decodeURIComponent(`${input || ""}`).trim()
11
+
12
+ if (!maybeMode) {
13
+ maybeMode = defaultMode
14
+ }
15
+
16
+ mode = maybeMode as ClapCompletionMode
17
+
18
  } catch (err) {}
19
+
20
+ if (!Object.values(ClapCompletionMode).includes(mode)) {
21
+ throw new Error(`Invalid clap completion mode: "${mode}"`)
22
+ }
23
+
24
  return mode
25
  }
src/app/api/parsers/parseEntityPrompts.ts ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { ClapEntityPrompt } from "@aitube/client"
2
+ import { decode } from "js-base64"
3
+
4
+ export function parseClapEntityPrompts(input?: any): ClapEntityPrompt[] {
5
+ let basicResult = JSON.parse(decode(`${input || ""}`))
6
+ if (Array.isArray(basicResult)) {
7
+ return basicResult as ClapEntityPrompt[]
8
+ } else {
9
+ return []
10
+ }
11
+ }
src/app/api/parsers/parseSupportedExportFormat.ts ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { defaultExportFormat, SupportedExportFormat } from "@aitube/client"
2
+
3
+ export function parseSupportedExportFormat(
4
+ input?: any,
5
+ defaultFormat: SupportedExportFormat = defaultExportFormat
6
+ ): SupportedExportFormat {
7
+
8
+ let format: SupportedExportFormat = defaultFormat
9
+ try {
10
+ format = decodeURIComponent(`${input || ""}` || defaultFormat).trim() as SupportedExportFormat
11
+ if (format !== "mp4" && format !== "webm") {
12
+ format = defaultFormat
13
+ }
14
+ } catch (err) {}
15
+ return format
16
+ }
src/app/api/parsers/{parseString.ts → parseTrimmedString.ts} RENAMED
@@ -1,4 +1,4 @@
1
- export function parseString(something: any): string {
2
  let result: string = ""
3
  if (typeof something === "string") {
4
  result = `${something}`.trim()
 
1
+ export function parseTrimmedString(something: any): string {
2
  let result: string = ""
3
  if (typeof something === "string") {
4
  result = `${something}`.trim()
src/app/api/v1/auth/config.ts ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ import { createSecretKey } from "node:crypto"
2
+
3
+ export const secretKey = createSecretKey(`${process.env.API_SECRET_JWT_KEY || ""}`, 'utf-8')
4
+ export const issuer = `${process.env.API_SECRET_JWT_ISSUER || ""}`
5
+ export const audience = `${process.env.API_SECRET_JWT_AUDIENCE || ""}`
src/app/api/{auth → v1/auth}/getToken.ts RENAMED
@@ -1,20 +1,20 @@
1
- import { createSecretKey } from "crypto"
2
  import { SignJWT } from "jose"
3
 
 
 
4
  // https://jmswrnr.com/blog/protecting-next-js-api-routes-query-parameters
5
 
6
  export async function getToken(data: Record<string, any> = {}): Promise<string> {
7
- const secretKey = createSecretKey(`${process.env.API_SECRET_JWT_KEY || ""}`, 'utf-8');
8
 
9
  const jwtToken = await new SignJWT(data)
10
  .setProtectedHeader({
11
  alg: 'HS256'
12
  }) // algorithm
13
  .setIssuedAt()
14
- .setIssuer(`${process.env.API_SECRET_JWT_ISSUER || ""}`) // issuer
15
- .setAudience(`${process.env.API_SECRET_JWT_AUDIENCE || ""}`) // audience
16
  .setExpirationTime("1 day") // token expiration time - to prevent hackers from re-using our URLs more than a day
17
- .sign(secretKey); // secretKey generated from previous step
18
 
19
  return jwtToken
20
  }
 
 
1
  import { SignJWT } from "jose"
2
 
3
+ import { secretKey, issuer, audience } from "./config"
4
+
5
  // https://jmswrnr.com/blog/protecting-next-js-api-routes-query-parameters
6
 
7
  export async function getToken(data: Record<string, any> = {}): Promise<string> {
 
8
 
9
  const jwtToken = await new SignJWT(data)
10
  .setProtectedHeader({
11
  alg: 'HS256'
12
  }) // algorithm
13
  .setIssuedAt()
14
+ .setIssuer(issuer) // issuer
15
+ .setAudience(audience) // audience
16
  .setExpirationTime("1 day") // token expiration time - to prevent hackers from re-using our URLs more than a day
17
+ .sign(secretKey) // secretKey generated from previous step
18
 
19
  return jwtToken
20
  }
src/app/api/v1/auth/parseToken.ts ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ export function parseToken(input?: any): string {
2
+ try {
3
+ return (decodeURIComponent(`${input || ""}`).split("Bearer").pop() || "").trim()
4
+ } catch (err) {
5
+ return ""
6
+ }
7
+ }
src/app/api/v1/auth/throwIfInvalidToken.ts ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { jwtVerify } from "jose"
2
+
3
+ import { secretKey } from "./config"
4
+ import { parseToken } from "./parseToken"
5
+
6
+ export async function throwIfInvalidToken(input?: any): Promise<boolean> {
7
+
8
+ // note: this performs a decodeURI, but I'm not sure we need to
9
+ const token = parseToken(input)
10
+
11
+ // verify token
12
+ const { payload, protectedHeader } = await jwtVerify(token, secretKey, {
13
+ issuer: `${process.env.API_SECRET_JWT_ISSUER || ""}`, // issuer
14
+ audience: `${process.env.API_SECRET_JWT_AUDIENCE || ""}`, // audience
15
+ })
16
+
17
+ // log values to console
18
+ console.log(payload)
19
+ console.log(protectedHeader)
20
+
21
+ return true
22
+ }
src/app/api/v1/create/index.ts CHANGED
@@ -2,12 +2,12 @@
2
 
3
  import { ClapProject, getValidNumber, newClap, newSegment } from "@aitube/clap"
4
 
 
5
  import { predict } from "@/app/api/providers/huggingface/predictWithHuggingFace"
6
  import { parseRawStringToYAML } from "@/app/api/parsers/parseRawStringToYAML"
7
- import { sleep } from "@/lib/utils/sleep"
8
 
9
  import { systemPrompt } from "./systemPrompt"
10
- import { LatentStory } from "./types"
11
 
12
  // a helper to generate Clap stories from a few sentences
13
  // this is mostly used by external apps such as the Stories Factory
@@ -20,7 +20,6 @@ export async function create(request: {
20
  width: 1024,
21
  height: 576,
22
  }): Promise<ClapProject> {
23
-
24
  const prompt = `${request?.prompt || ""}`.trim()
25
 
26
  console.log("api/v1/create(): request:", request)
@@ -30,7 +29,9 @@ export async function create(request: {
30
  const width = getValidNumber(request?.width, 256, 8192, 1024)
31
  const height = getValidNumber(request?.height, 256, 8192, 576)
32
 
33
- const userPrompt = `Video story to generate: ${prompt}`
 
 
34
 
35
  const prefix = "```yaml\n"
36
  const nbMaxNewTokens = 1400
@@ -70,12 +71,15 @@ export async function create(request: {
70
  maybeShots = parseRawStringToYAML<LatentStory[]>(rawString, [])
71
  if (!Array.isArray(maybeShots) || maybeShots.length === 0) {
72
  console.log(`api/v1/create(): failed to generate shots for the second time, which indicates an issue with the Hugging Face API`)
73
- } else {
74
- shots = maybeShots
75
- }
76
- } else {
77
  shots = maybeShots
 
 
78
  }
 
79
  console.log(`api/v1/create(): generated ${shots.length} shots`)
80
 
81
  // this is approximate - TTS generation will determine the final duration of each shot
@@ -88,8 +92,8 @@ export async function create(request: {
88
  title: "Not needed", // we don't need a title actually
89
  description: "This video has been generated using AI",
90
  synopsis: "",
91
- licence: "Non Commercial",
92
- orientation: "vertical",
93
  width,
94
  height,
95
  isInteractive: false,
@@ -160,7 +164,7 @@ export async function create(request: {
160
  startTimeInMs: currentElapsedTimeInMs,
161
  assetDurationInMs: defaultSegmentDurationInMs,
162
  category: "camera",
163
- prompt: "vertical video",
164
  outputType: "text"
165
  }))
166
 
 
2
 
3
  import { ClapProject, getValidNumber, newClap, newSegment } from "@aitube/clap"
4
 
5
+ import { sleep } from "@/lib/utils/sleep"
6
  import { predict } from "@/app/api/providers/huggingface/predictWithHuggingFace"
7
  import { parseRawStringToYAML } from "@/app/api/parsers/parseRawStringToYAML"
8
+ import { LatentStory } from "@/app/api/v1/types"
9
 
10
  import { systemPrompt } from "./systemPrompt"
 
11
 
12
  // a helper to generate Clap stories from a few sentences
13
  // this is mostly used by external apps such as the Stories Factory
 
20
  width: 1024,
21
  height: 576,
22
  }): Promise<ClapProject> {
 
23
  const prompt = `${request?.prompt || ""}`.trim()
24
 
25
  console.log("api/v1/create(): request:", request)
 
29
  const width = getValidNumber(request?.width, 256, 8192, 1024)
30
  const height = getValidNumber(request?.height, 256, 8192, 576)
31
 
32
+ const userPrompt = `Movie story to generate: ${prompt}
33
+
34
+ Output: `
35
 
36
  const prefix = "```yaml\n"
37
  const nbMaxNewTokens = 1400
 
71
  maybeShots = parseRawStringToYAML<LatentStory[]>(rawString, [])
72
  if (!Array.isArray(maybeShots) || maybeShots.length === 0) {
73
  console.log(`api/v1/create(): failed to generate shots for the second time, which indicates an issue with the Hugging Face API`)
74
+ }
75
+ }
76
+
77
+ if (maybeShots.length) {
78
  shots = maybeShots
79
+ } else {
80
+ throw new Error(`Hugging Face Inference API failure (the model failed to generate the shots)`)
81
  }
82
+
83
  console.log(`api/v1/create(): generated ${shots.length} shots`)
84
 
85
  // this is approximate - TTS generation will determine the final duration of each shot
 
92
  title: "Not needed", // we don't need a title actually
93
  description: "This video has been generated using AI",
94
  synopsis: "",
95
+ licence: "",
96
+ orientation: width > height ? "landscape" : height > width ? "portrait" : "square",
97
  width,
98
  height,
99
  isInteractive: false,
 
164
  startTimeInMs: currentElapsedTimeInMs,
165
  assetDurationInMs: defaultSegmentDurationInMs,
166
  category: "camera",
167
+ prompt: "video",
168
  outputType: "text"
169
  }))
170
 
src/app/api/v1/create/route.ts CHANGED
@@ -1,11 +1,14 @@
1
  import { NextResponse, NextRequest } from "next/server"
2
  import { getValidNumber, serializeClap } from "@aitube/clap"
3
 
 
 
4
  import { create } from "."
5
 
6
  // a helper to generate Clap stories from a few sentences
7
  // this is mostly used by external apps such as the Stories Factory
8
  export async function POST(req: NextRequest) {
 
9
 
10
  const request = await req.json() as {
11
  prompt: string
@@ -17,9 +20,9 @@ export async function POST(req: NextRequest) {
17
  console.log("[api/v1/create] request:", request)
18
 
19
  const clap = await create({
20
- prompt: `${request?.prompt || ""}`.trim(),
21
  width: getValidNumber(request?.width, 256, 8192, 1024),
22
- height: getValidNumber(request?.height, 256, 8192, 576)
23
  })
24
 
25
  // TODO replace by Clap file streaming
 
1
  import { NextResponse, NextRequest } from "next/server"
2
  import { getValidNumber, serializeClap } from "@aitube/clap"
3
 
4
+ import { throwIfInvalidToken } from "@/app/api/v1/auth/throwIfInvalidToken"
5
+
6
  import { create } from "."
7
 
8
  // a helper to generate Clap stories from a few sentences
9
  // this is mostly used by external apps such as the Stories Factory
10
  export async function POST(req: NextRequest) {
11
+ await throwIfInvalidToken(req.headers.get("Authorization"))
12
 
13
  const request = await req.json() as {
14
  prompt: string
 
20
  console.log("[api/v1/create] request:", request)
21
 
22
  const clap = await create({
23
+ prompt: `${request?.prompt || ""}`.trim(),
24
  width: getValidNumber(request?.width, 256, 8192, 1024),
25
+ height: getValidNumber(request?.height, 256, 8192, 576)
26
  })
27
 
28
  // TODO replace by Clap file streaming
src/app/api/v1/create/systemPrompt.ts CHANGED
@@ -1,23 +1,29 @@
1
  export const systemPrompt: string =
2
  `# Context
3
  You are a server-side function generating stories from a single synopsis/brief (a "prompt").
4
- The video are vertical, so they can be displayed on mobile.
5
- They are meant to be shared on social media platform (Instagram, TikTok, Snapchat, Twitter, YouTube Shorts etc).
6
- Each video is composed of a sequence of static panels (a dozen in average),
7
- with a voice over and text.
8
 
9
  # Task
10
- Your mission is to generate a sequence of panels that will form the final video.
11
 
12
  You will be provided a "prompt" (for the story) and max number of images
13
- Each panel is composed of:
 
 
 
 
14
  - one title (which will be displayed as an overlay over the video, so keep it short eg. max 10/12 words),
15
- - one image (you must describe it using a Stable Diffusion prompt - about ~300 characters - using simple descriptive words and adjectives. Describe facts about characters, location, lights, texture, camera orientation, colors, clothes, movements etc. But don't give your opinion, don't talk about the emotions it evokes etc.)
16
  - one voice over (should be short too, about 10 to 15 words)
17
 
 
 
 
 
 
18
  # Examples
19
 
20
- You most reply by writing/completing a YAML list of objects.
21
  Here is a short example, the prompt was "a cute puppy who misbehaves in the kitchen, in 3 parts 🐶"
22
  Note how we asked for "3 parts". Sometimes the user will talk about steps, slides etc instead (that's fine, it means the same thing),
23
  or the user might omit to give the number (that's fine too, you can use 5 by default),
@@ -34,6 +40,4 @@ but if the user asks for large numbers, it should be ignored (our limit is 32).
34
  image: "medium-shot of a puppy eating a cake, on the kitchen table, birthday cake, eating, cute, instagram, funny, messy, vertical photo"
35
  voice: "Now my dog is eating my birtday cake. Please send help."
36
  \`\`\
37
-
38
- # Your turn:
39
- `
 
1
  export const systemPrompt: string =
2
  `# Context
3
  You are a server-side function generating stories from a single synopsis/brief (a "prompt").
4
+ The videos are meant to be shared on social media platform (Instagram, TikTok, Snapchat, Twitter, YouTube Shorts etc).
5
+ Each video is composed of a sequence of shots (a dozen in average), with a voice over and text.
 
 
6
 
7
  # Task
8
+ Your mission is to generate a sequence of shots that will form the final video.
9
 
10
  You will be provided a "prompt" (for the story) and max number of images
11
+
12
+ # Output schema
13
+
14
+ Each shot is composed of:
15
+
16
  - one title (which will be displayed as an overlay over the video, so keep it short eg. max 10/12 words),
17
+ - one image (you must describe it using a Stable Diffusion prompt - about ~300 chars - using simple descriptive words and adjectives. Describe facts about characters, location, lights, texture, camera orientation, colors, clothes, movements etc. But don't give your opinion, don't talk about the emotions it evokes etc.)
18
  - one voice over (should be short too, about 10 to 15 words)
19
 
20
+ # Important
21
+
22
+ You MUST reply by writing/completing a YAML list of objects.
23
+ Copy the structure of the examples, but not their content: come up with your own original ideal, you should be creativeç
24
+
25
  # Examples
26
 
 
27
  Here is a short example, the prompt was "a cute puppy who misbehaves in the kitchen, in 3 parts 🐶"
28
  Note how we asked for "3 parts". Sometimes the user will talk about steps, slides etc instead (that's fine, it means the same thing),
29
  or the user might omit to give the number (that's fine too, you can use 5 by default),
 
40
  image: "medium-shot of a puppy eating a cake, on the kitchen table, birthday cake, eating, cute, instagram, funny, messy, vertical photo"
41
  voice: "Now my dog is eating my birtday cake. Please send help."
42
  \`\`\
43
+ `
 
 
src/app/api/v1/create/types.ts DELETED
@@ -1,6 +0,0 @@
1
-
2
- export type LatentStory = {
3
- title: string
4
- image: string
5
- voice: string
6
- }
 
 
 
 
 
 
 
src/app/api/v1/edit/dialogues/processShot.ts CHANGED
@@ -1,12 +1,17 @@
1
 
2
- import { ClapProject, ClapSegment, getClapAssetSourceType, filterSegments, ClapSegmentFilteringMode } from "@aitube/clap"
 
 
 
 
 
 
 
3
  import { getSpeechBackgroundAudioPrompt } from "@aitube/engine"
4
 
5
  import { generateSpeechWithParlerTTS } from "@/app/api/generators/speech/generateVoiceWithParlerTTS"
6
  import { getMediaInfo } from "@/app/api/utils/getMediaInfo"
7
 
8
- import { ClapCompletionMode } from "../types"
9
-
10
  export async function processShot({
11
  shotSegment,
12
  existingClap,
@@ -70,7 +75,7 @@ export async function processShot({
70
  console.log(`[api/edit/dialogues] processShot: generated dialogue audio: ${shotDialogueSegment?.assetUrl?.slice?.(0, 50)}...`)
71
 
72
  // if it's partial, we need to manually add it
73
- if (mode === "partial") {
74
  newerClap.segments.push(shotDialogueSegment)
75
  }
76
  } else {
 
1
 
2
+ import {
3
+ ClapProject,
4
+ ClapSegment,
5
+ getClapAssetSourceType,
6
+ filterSegments,
7
+ ClapSegmentFilteringMode
8
+ } from "@aitube/clap"
9
+ import { ClapCompletionMode } from "@aitube/client"
10
  import { getSpeechBackgroundAudioPrompt } from "@aitube/engine"
11
 
12
  import { generateSpeechWithParlerTTS } from "@/app/api/generators/speech/generateVoiceWithParlerTTS"
13
  import { getMediaInfo } from "@/app/api/utils/getMediaInfo"
14
 
 
 
15
  export async function processShot({
16
  shotSegment,
17
  existingClap,
 
75
  console.log(`[api/edit/dialogues] processShot: generated dialogue audio: ${shotDialogueSegment?.assetUrl?.slice?.(0, 50)}...`)
76
 
77
  // if it's partial, we need to manually add it
78
+ if (mode !== ClapCompletionMode.FULL) {
79
  newerClap.segments.push(shotDialogueSegment)
80
  }
81
  } else {
src/app/api/v1/edit/dialogues/route.ts CHANGED
@@ -2,16 +2,16 @@ import { NextResponse, NextRequest } from "next/server"
2
 
3
  import { ClapProject, ClapSegment, newClap, parseClap, serializeClap } from "@aitube/clap"
4
 
5
- import { getToken } from "@/app/api/auth/getToken"
6
 
7
  import { processShot } from "./processShot"
8
  import queryString from "query-string"
9
  import { parseCompletionMode } from "@/app/api/parsers/parseCompletionMode"
 
 
10
 
11
  // a helper to generate speech for a Clap
12
  export async function POST(req: NextRequest) {
13
-
14
- const jwtToken = await getToken({ user: "anonymous" })
15
 
16
  const qs = queryString.parseUrl(req.url || "")
17
  const query = (qs || {}).query
@@ -33,7 +33,7 @@ export async function POST(req: NextRequest) {
33
  throw new Error(`Error, this endpoint being synchronous, it is designed for short stories only (max 32 shots).`)
34
  }
35
 
36
- const newerClap = mode === "full" ? existingClap : newClap()
37
 
38
  // we process the shots in parallel (this will increase the queue size in the Gradio spaces)
39
  await Promise.all(shotsSegments.map(shotSegment =>
 
2
 
3
  import { ClapProject, ClapSegment, newClap, parseClap, serializeClap } from "@aitube/clap"
4
 
 
5
 
6
  import { processShot } from "./processShot"
7
  import queryString from "query-string"
8
  import { parseCompletionMode } from "@/app/api/parsers/parseCompletionMode"
9
+ import { throwIfInvalidToken } from "@/app/api/v1/auth/throwIfInvalidToken"
10
+ import { ClapCompletionMode } from "@aitube/client"
11
 
12
  // a helper to generate speech for a Clap
13
  export async function POST(req: NextRequest) {
14
+ await throwIfInvalidToken(req.headers.get("Authorization"))
 
15
 
16
  const qs = queryString.parseUrl(req.url || "")
17
  const query = (qs || {}).query
 
33
  throw new Error(`Error, this endpoint being synchronous, it is designed for short stories only (max 32 shots).`)
34
  }
35
 
36
+ const newerClap = mode === ClapCompletionMode.FULL ? existingClap : newClap()
37
 
38
  // we process the shots in parallel (this will increase the queue size in the Gradio spaces)
39
  await Promise.all(shotsSegments.map(shotSegment =>
src/app/api/v1/edit/entities/clapToLatentStory.ts ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { ClapProject, ClapSegmentFilteringMode, filterSegments } from "@aitube/clap"
2
+
3
+ import { LatentStory } from "@/app/api/v1/types"
4
+
5
+ /**
6
+ * Extract the latent story from a ClapProject
7
+ *
8
+ * This is useful to pass a simplified representation of a story to a LLM
9
+ *
10
+ * @param clap
11
+ * @returns
12
+ */
13
+ export async function clapToLatentStory(clap: ClapProject): Promise<LatentStory[]> {
14
+ const shots = clap.segments.filter(s => s.category === "camera")
15
+
16
+ const latentStories: LatentStory[] = []
17
+
18
+ for (const shot of shots) {
19
+ const image = filterSegments(
20
+ ClapSegmentFilteringMode.START,
21
+ shot,
22
+ clap.segments,
23
+ "storyboard"
24
+ ).at(0)
25
+
26
+ const title = filterSegments(
27
+ ClapSegmentFilteringMode.START,
28
+ shot,
29
+ clap.segments,
30
+ "interface"
31
+ ).at(0)
32
+
33
+ const voice = filterSegments(
34
+ ClapSegmentFilteringMode.START,
35
+ shot,
36
+ clap.segments,
37
+ "dialogue"
38
+ ).at(0)
39
+
40
+ const latentStory: LatentStory = {
41
+ title: title.prompt,
42
+ image: image.prompt,
43
+ voice: voice.prompt,
44
+ }
45
+
46
+ latentStories.push(latentStory)
47
+ }
48
+
49
+ return latentStories
50
+ }
src/app/api/v1/edit/entities/generateEntityPrompts.ts ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ "use server"
2
+
3
+ import YAML from "yaml"
4
+ import { generateSeed } from "@aitube/clap"
5
+ import { ClapEntityPrompt } from "@aitube/client"
6
+
7
+ import { sleep } from "@/lib/utils/sleep"
8
+ import { predict } from "@/app/api/providers/huggingface/predictWithHuggingFace"
9
+ import { parseRawStringToYAML } from "@/app/api/parsers/parseRawStringToYAML"
10
+ import { LatentEntity, LatentStory } from "@/app/api/v1/types"
11
+
12
+ import { systemPrompt } from "./systemPrompt"
13
+ import { generateImageID } from "./generateImageID"
14
+
15
// The result of hallucinating one entity:
// the prompt describing it, plus the shots in which it appears.
export type EntityPromptResult = {
  entityPrompt: ClapEntityPrompt // name, category, variant, identity image/voice..
  shots: number[] // IDs of the shots where this entity is present
}
19
+
20
+ // a helper to generate Clap stories from a few sentences
21
+ // this is mostly used by external apps such as the Stories Factory
22
+ export async function generateEntityPrompts({
23
+ prompt = "",
24
+ latentStory = []
25
+ }: {
26
+ prompt?: string
27
+ latentStory?: LatentStory[]
28
+ } = {
29
+ prompt: "",
30
+ latentStory: []
31
+ }): Promise<EntityPromptResult[]> {
32
+
33
+ if (!prompt.length) { throw new Error(`please provide a prompt`) }
34
+ console.log("generateEntityPrompts(): prompt:", prompt)
35
+
36
+
37
+ if (!latentStory.length) { throw new Error(`please provide a story`) }
38
+
39
+ console.log("generateEntityPrompts(): latentStory:", latentStory)
40
+
41
+ const userPrompt = `The input story is about: ${prompt}.
42
+
43
+ The input story timeline is:
44
+ \`\`\`yaml
45
+ ${YAML.stringify(
46
+ // we need to help the LLM by marking the shots with a simple numeric ID
47
+ latentStory.map((shot, i) => ({
48
+ shot: i,
49
+ ...shot,
50
+ }))
51
+ )}
52
+ \`\`\`
53
+
54
+ Now please generate the output entities:`
55
+
56
+ const prefix = "```yaml\n"
57
+ const nbMaxNewTokens = 1400
58
+
59
+ // TODO use streaming for the Hugging Face prediction
60
+ //
61
+ // note that a Clap file is actually a YAML stream of documents
62
+ // so technically we could stream everything from end-to-end
63
+ // (but I haven't coded the helpers to do this yet)
64
+ let rawString = await predict({
65
+ systemPrompt,
66
+ userPrompt,
67
+ nbMaxNewTokens,
68
+ prefix,
69
+ })
70
+
71
+ console.log("generateEntityPrompts(): rawString: ", rawString)
72
+
73
+ let results: EntityPromptResult[] = []
74
+
75
+ let maybeEntities = parseRawStringToYAML<LatentEntity[]>(rawString, [])
76
+
77
+ if (!Array.isArray(maybeEntities) || maybeEntities.length === 0) {
78
+ console.log(`generateEntityPrompts(): failed to generate entities.. trying again`)
79
+
80
+ await sleep(2000)
81
+
82
+ rawString = await predict({
83
+ systemPrompt,
84
+ userPrompt: userPrompt + ".", // we trick the Hugging Face cache
85
+ nbMaxNewTokens,
86
+ prefix,
87
+ })
88
+
89
+ console.log("generateEntityPrompts(): rawString: ", rawString)
90
+
91
+ maybeEntities = parseRawStringToYAML<LatentEntity[]>(rawString, [])
92
+ if (!Array.isArray(maybeEntities) || maybeEntities.length === 0) {
93
+ console.log(`generateEntityPrompts(): failed to generate shots for the second time, which indicates an issue with the Hugging Face API`)
94
+ }
95
+ }
96
+
97
+ if (maybeEntities.length) {
98
+ results = await Promise.all(maybeEntities.map(async ({
99
+ name,
100
+ category,
101
+ image,
102
+ audio,
103
+ shots,
104
+ }) => {
105
+
106
+ const entityPrompt: ClapEntityPrompt = {
107
+ name,
108
+ category,
109
+ age: "",
110
+ variant: image,
111
+ region: "",
112
+ identityImage: await generateImageID({
113
+ prompt: image,
114
+ seed: generateSeed()
115
+ }),
116
+
117
+ // TODO later
118
+ identityVoice: "" // await generateAudioID({ prompt: e.audio, seed: generateSeed() })
119
+ }
120
+
121
+ const result: EntityPromptResult = {
122
+ entityPrompt,
123
+ shots
124
+ }
125
+
126
+ return result
127
+ }))
128
+ } else {
129
+ throw new Error(`Hugging Face Inference API failure (the model failed to generate the entities)`)
130
+ }
131
+
132
+ console.log(`generateEntityPrompts(): generated ${results.length} entities with their images and voice ids`)
133
+
134
+ return results
135
+ }
src/app/api/v1/edit/entities/generateImageID.ts CHANGED
@@ -2,7 +2,6 @@
2
  import { generateSeed } from "@aitube/clap"
3
 
4
  import { sleep } from "@/lib/utils/sleep"
5
- import { getValidNumber } from "@/lib/utils/getValidNumber"
6
 
7
  import { newRender, getRender } from "@/app/api/providers/videochain/renderWithVideoChain"
8
  import { getNegativePrompt, getPositivePrompt } from "@/app/api/utils/imagePrompts"
 
2
  import { generateSeed } from "@aitube/clap"
3
 
4
  import { sleep } from "@/lib/utils/sleep"
 
5
 
6
  import { newRender, getRender } from "@/app/api/providers/videochain/renderWithVideoChain"
7
  import { getNegativePrompt, getPositivePrompt } from "@/app/api/utils/imagePrompts"
src/app/api/v1/edit/entities/index.ts CHANGED
@@ -1,23 +1,113 @@
1
 
2
- import { ClapProject, getClapAssetSourceType, newClap } from "@aitube/clap"
 
3
 
4
  import { generateImageID } from "./generateImageID"
5
  import { generateAudioID } from "./generateAudioID"
6
-
7
- import { ClapCompletionMode } from "../types"
8
 
9
  export async function editEntities({
10
  existingClap,
11
  newerClap,
12
- mode
 
13
  }: {
14
  existingClap: ClapProject
15
  newerClap: ClapProject
16
- mode: ClapCompletionMode
 
17
  }) {
18
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  if (!existingClap.entities.length) { throw new Error(`please provide at least one entity`) }
20
 
 
 
21
  for (const entity of existingClap.entities) {
22
 
23
  let entityHasBeenModified = false
@@ -57,13 +147,13 @@ export async function editEntities({
57
  }
58
 
59
  // in case we are doing a partial update
60
- if (mode === "partial" && entityHasBeenModified && !newerClap.entityIndex[entity.id]) {
61
  newerClap.entities.push(entity)
62
  newerClap.entityIndex[entity.id] = entity
63
  }
64
  }
65
 
66
- console.log(`[api/edit/entities] returning the newerClap`)
67
 
68
  return newerClap
69
  }
 
1
 
2
+ import { ClapProject, getClapAssetSourceType, getValidNumber, newEntity } from "@aitube/clap"
3
+ import { ClapCompletionMode, ClapEntityPrompt } from "@aitube/client"
4
 
5
  import { generateImageID } from "./generateImageID"
6
  import { generateAudioID } from "./generateAudioID"
7
+ import { generateEntityPrompts } from "./generateEntityPrompts"
8
+ import { clapToLatentStory } from "./clapToLatentStory"
9
 
10
  export async function editEntities({
11
  existingClap,
12
  newerClap,
13
+ entityPrompts = [],
14
+ mode = ClapCompletionMode.PARTIAL
15
  }: {
16
  existingClap: ClapProject
17
  newerClap: ClapProject
18
+ entityPrompts?: ClapEntityPrompt[]
19
+ mode?: ClapCompletionMode
20
  }) {
21
 
22
+ // note that we can only handle either FULL or PARTIAL
23
+ // other modes such as MERGE, REPLACE.. are irrelevant since those are client-side modes
24
+ // so from a server point of view those correspond to PARTIAL
25
+ //
26
+ // it is also worth noting that the use of FULL should be discouraged
27
+ const isFull = mode === ClapCompletionMode.FULL
28
+ const isPartial = !isFull
29
+
30
+ // if we don't have existing entities, and user passed none,
31
+ // then we need to hallucinate them
32
+ if (existingClap.entities.length === 0 && entityPrompts.length === 0) {
33
+ const entityPromptsWithShots = await generateEntityPrompts({
34
+ prompt: existingClap.meta.description,
35
+ latentStory: await clapToLatentStory(existingClap)
36
+ })
37
+
38
+ for (const {
39
+ entityPrompt: { name, category, age, variant, region, identityImage, identityVoice },
40
+ shots
41
+ } of entityPromptsWithShots) {
42
+ const newEnt = newEntity({
43
+ category,
44
+ triggerName: name,
45
+ label: name,
46
+ description: name,
47
+ author: "auto",
48
+ thumbnailUrl: "",
49
+
50
+ imagePrompt: "",
51
+ imageSourceType: getClapAssetSourceType(identityImage),
52
+ imageEngine: "SDXL Lightning",
53
+ imageId: identityImage,
54
+ audioPrompt: "",
55
+ audioSourceType: getClapAssetSourceType(identityVoice),
56
+ audioEngine: "Parler-TTS", // <- TODO: use OpenVoice 2, that way it can be personalized
57
+ audioId: identityVoice,
58
+
59
+ // note: using a numeric age should be deprecated,
60
+ // instead we should be able to specify things using text,
61
+ // eg. "8 months", "25 years old", "12th century"
62
+ age: getValidNumber(age, 0, 120, 25),
63
+
64
+ // TODO: delete gender and appearance, replace by a single concept of "variant"
65
+ gender: "",
66
+ appearance: variant,
67
+ region: region,
68
+ })
69
+
70
+ existingClap.entities.push(newEnt)
71
+ }
72
+ }
73
+
74
+ // otherwise try to add what's new
75
+ for (const { name, category, age, variant, region, identityImage, identityVoice } of entityPrompts) {
76
+ const newEnt = newEntity({
77
+ category,
78
+ triggerName: name,
79
+ label: name,
80
+ description: name,
81
+ author: "auto",
82
+ thumbnailUrl: "",
83
+
84
+ imagePrompt: "",
85
+ imageSourceType: getClapAssetSourceType(identityImage),
86
+ imageEngine: "SDXL Lightning",
87
+ imageId: identityImage,
88
+ audioPrompt: "",
89
+ audioSourceType: getClapAssetSourceType(identityVoice),
90
+ audioEngine: "Parler-TTS", // <- TODO: use OpenVoice 2, that way it can be personalized
91
+ audioId: identityVoice,
92
+
93
+ // note: using a numeric age should be deprecated,
94
+ // instead we should be able to specify things using text,
95
+ // eg. "8 months", "25 years old", "12th century"
96
+ age: getValidNumber(age, 0, 120, 25),
97
+
98
+ // TODO: delete gender and appearance, replace by a single concept of "variant"
99
+ gender: "",
100
+ appearance: variant,
101
+ region: region,
102
+ })
103
+
104
+ existingClap.entities.push(newEnt)
105
+ }
106
+
107
  if (!existingClap.entities.length) { throw new Error(`please provide at least one entity`) }
108
 
109
+ // then we try to automatically repair, edit, complete.. all the existing entities
110
+
111
  for (const entity of existingClap.entities) {
112
 
113
  let entityHasBeenModified = false
 
147
  }
148
 
149
  // in case we are doing a partial update
150
+ if (mode !== ClapCompletionMode.FULL && entityHasBeenModified && !newerClap.entityIndex[entity.id]) {
151
  newerClap.entities.push(entity)
152
  newerClap.entityIndex[entity.id] = entity
153
  }
154
  }
155
 
156
+ console.log(`api/edit/entities(): returning the newerClap`)
157
 
158
  return newerClap
159
  }
src/app/api/v1/edit/entities/route.ts CHANGED
@@ -2,12 +2,15 @@ import { NextResponse, NextRequest } from "next/server"
2
  import queryString from "query-string"
3
  import { newClap, parseClap, serializeClap } from "@aitube/clap"
4
 
5
- import { getToken } from "@/app/api/auth/getToken"
6
  import { parseCompletionMode } from "@/app/api/parsers/parseCompletionMode"
 
 
7
 
8
  import { editEntities } from "."
 
9
 
10
  export async function POST(req: NextRequest) {
 
11
 
12
  const qs = queryString.parseUrl(req.url || "")
13
  const query = (qs || {}).query
@@ -15,17 +18,18 @@ export async function POST(req: NextRequest) {
15
  const mode = parseCompletionMode(query?.c)
16
  // const prompt = parsePrompt(query?.p)
17
 
18
- const jwtToken = await getToken({ user: "anonymous" })
19
 
20
  const blob = await req.blob()
21
 
22
  const existingClap = await parseClap(blob)
23
 
24
- const newerClap = mode === "full" ? existingClap : newClap()
25
 
26
  await editEntities({
27
  existingClap,
28
  newerClap,
 
29
  mode
30
  })
31
 
 
2
  import queryString from "query-string"
3
  import { newClap, parseClap, serializeClap } from "@aitube/clap"
4
 
 
5
  import { parseCompletionMode } from "@/app/api/parsers/parseCompletionMode"
6
+ import { parseClapEntityPrompts } from "@/app/api/parsers/parseEntityPrompts"
7
+ import { throwIfInvalidToken } from "@/app/api/v1/auth/throwIfInvalidToken"
8
 
9
  import { editEntities } from "."
10
+ import { ClapCompletionMode } from "@aitube/client"
11
 
12
  export async function POST(req: NextRequest) {
13
+ await throwIfInvalidToken(req.headers.get("Authorization"))
14
 
15
  const qs = queryString.parseUrl(req.url || "")
16
  const query = (qs || {}).query
 
18
  const mode = parseCompletionMode(query?.c)
19
  // const prompt = parsePrompt(query?.p)
20
 
21
+ const entityPrompts = parseClapEntityPrompts(query?.e)
22
 
23
  const blob = await req.blob()
24
 
25
  const existingClap = await parseClap(blob)
26
 
27
+ const newerClap = mode === ClapCompletionMode.FULL ? existingClap : newClap()
28
 
29
  await editEntities({
30
  existingClap,
31
  newerClap,
32
+ entityPrompts,
33
  mode
34
  })
35
 
src/app/api/v1/edit/entities/systemPrompt.ts CHANGED
@@ -1,3 +1,64 @@
 
 
 
 
 
1
 
 
 
2
 
3
- export const systemPrompt = ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ export const systemPrompt: string =
2
+ `# Context
3
+ You are a server-side function generating stories from a single synopsis/brief (a "prompt").
4
+ The video are meant to be shared on social media platform (Instagram, TikTok, Snapchat, Twitter, YouTube Shorts etc).
5
+ Each video is composed of a sequence of shots (a dozen in average), with a voice over and text.
6
 
7
+ # Task
8
+ You mission is to generate a list of entities/assets (characters, locations etc) associated with each shot.
9
 
10
+ # Important
11
+
12
+ - You MUST reply by writing/completing a YAML list of objects.
13
+ - Copy the structure of the examples, but not their content: come up with your own original ideal, you should be creativeç
14
+
15
+ # Output schema:
16
+
17
+ name: name of the entity
18
+ category: can be "character" or "location"
19
+ image: a description of the entity (you must describe it using a Stable Diffusion prompt - about ~300 chars - using simple descriptive words and adjectives. Describe facts about characters, location, lights, texture, camera orientation, colors, clothes, movements etc. But don't give your opinion, don't talk about the emotions it evokes etc.)
20
+ audio: a textual description of what and how the entity sounds like
21
+ shots: an array containing the shot IDs where the entity is present
22
+
23
+ # Short example
24
+
25
+ Given the following inputs:
26
+
27
+ "A king goes to see a witch to ask if or how he can win an upcoming and challenging battle"
28
+
29
+ \`\`\`yaml
30
+ - shot: 1
31
+ title: "King Arthus seeks the witch's guidance to win his imminent battle."
32
+ image: "Establishing shot of KING ARTHUS, nervous, wet brown hair. dressed in golden armor and a colorful cape. His face reveals a mix of concern and determination. He's standing in the bright sunshine, inside a castle's courtyard, under cloudy skies. Behind him, a group of soldiers can be seen marching towards the castle gates."
33
+ voice: "Dark sorceress of the shadows, it is time for you to serve your Lord. Tell me the augur, tell me what you foreknow. Tell me how I will cleave my ennemies to the bone, and ravage them in battle to come up victorious."
34
+ - shot: 2
35
+ title: "The witch gives her counsel but warns of an unknown cost."
36
+ image: "close-up shot of THE WITCH, smiling cunningly, raising a finger while speaking. Background bokeh, dim lightning, menacing, mysterious."
37
+ voice: "Your Majesty, this will be a bloody battle, but I espy a way to victory for you. But if my advice you follow, victory I foresee, although at a great cost it will be."
38
+ - shot: 3
39
+ title: "The words of the witch are sinking in, but King Arthus tries to appear strong"
40
+ image: "close-up shot on KING ARTHUS, looking concerned, somber, false confidence"
41
+ voice: "Witch with the wicked tongue, what must be done will be done. I will do everything for my people's sake. Speak now, make know the path to glory."
42
+ \`\`\
43
+
44
+ An example YAML output from the server-side function can be:
45
+
46
+ \`\`\`yaml
47
+ - name: "Castle's Courtyard"
48
+ category: "location"
49
+ image: "A medieval castle courtyard, ashlar walls, soldiers and horses, cloudy sky"
50
+ audio: "Background noises of voices, horses, birds, wind, carriages"
51
+ shots: [1, 2, 3]
52
+ - name: "King Arthus"
53
+ category: "character"
54
+ image: 1 middle-aged king, pepper-and-salt hair, beared. Dressed in golden armor and a dark purple cape. Majestic, imposing."
55
+ label: King Arthus seeks the witch's guidance to win his imminent battle."
56
+ audio: a middle-aged man speaking clearly, with a deep voice tone, confident, imposing, calm, overpowering."
57
+ shots: [1, 3]
58
+ - name: "The Witch"
59
+ category: "character"
60
+ image: "an old witch, with a villainous face full of warts, gray hair, and a hunchback. Gypsy look. Yellowed teeth, piercing eyes. She wears a crude robe, she has wrinkled hands with long dirty nails."
61
+ audio: "a sneering old woman, speaking with a hoarse and raspy voice. She is confident, hiding something."
62
+ shots: [2]
63
+ \`\`\
64
+ `
src/app/api/v1/edit/storyboards/processShot.ts CHANGED
@@ -1,10 +1,17 @@
1
- import { ClapProject, ClapSegment, getClapAssetSourceType, newSegment, filterSegments, ClapSegmentFilteringMode } from "@aitube/clap"
 
 
 
 
 
 
 
 
2
  import { getVideoPrompt } from "@aitube/engine"
3
 
4
  import { getPositivePrompt } from "@/app/api/utils/imagePrompts"
5
 
6
  import { generateStoryboard } from "./generateStoryboard"
7
- import { ClapCompletionMode } from "../types"
8
 
9
  export async function processShot({
10
  shotSegment,
@@ -84,7 +91,7 @@ export async function processShot({
84
 
85
  // if mode is full, newerClap already contains the ference to shotStoryboardSegment
86
  // but if it's partial, we need to manually add it
87
- if (mode === "partial") {
88
  newerClap.segments.push(shotStoryboardSegment)
89
  }
90
  } else {
 
1
+ import {
2
+ ClapProject,
3
+ ClapSegment,
4
+ getClapAssetSourceType,
5
+ newSegment,
6
+ filterSegments,
7
+ ClapSegmentFilteringMode
8
+ } from "@aitube/clap"
9
+ import { ClapCompletionMode } from "@aitube/client"
10
  import { getVideoPrompt } from "@aitube/engine"
11
 
12
  import { getPositivePrompt } from "@/app/api/utils/imagePrompts"
13
 
14
  import { generateStoryboard } from "./generateStoryboard"
 
15
 
16
  export async function processShot({
17
  shotSegment,
 
91
 
92
  // if mode is full, newerClap already contains the ference to shotStoryboardSegment
93
  // but if it's partial, we need to manually add it
94
+ if (mode !== ClapCompletionMode.FULL) {
95
  newerClap.segments.push(shotStoryboardSegment)
96
  }
97
  } else {
src/app/api/v1/edit/storyboards/route.ts CHANGED
@@ -2,11 +2,11 @@ import { NextResponse, NextRequest } from "next/server"
2
  import queryString from "query-string"
3
  import { ClapProject, ClapSegment, newClap, parseClap, serializeClap } from "@aitube/clap"
4
 
5
- import { getToken } from "@/app/api/auth/getToken"
6
-
7
  import { parseCompletionMode } from "@/app/api/parsers/parseCompletionMode"
 
8
 
9
  import { processShot } from "./processShot"
 
10
 
11
  // a helper to generate storyboards for a Clap
12
  // this is mostly used by external apps such as the Stories Factory
@@ -16,8 +16,7 @@ import { processShot } from "./processShot"
16
  // - add missing storyboard prompts
17
  // - add missing storyboard images
18
  export async function POST(req: NextRequest) {
19
-
20
- const jwtToken = await getToken({ user: "anonymous" })
21
 
22
  const qs = queryString.parseUrl(req.url || "")
23
  const query = (qs || {}).query
@@ -30,16 +29,16 @@ export async function POST(req: NextRequest) {
30
 
31
  if (!existingClap?.segments) { throw new Error(`no segment found in the provided clap!`) }
32
 
33
- console.log(`[api/v1/edit/storyboards] detected ${existingClap.segments.length} segments`)
34
 
35
  const shotsSegments: ClapSegment[] = existingClap.segments.filter(s => s.category === "camera")
36
- console.log(`[api/v1/edit/storyboards] detected ${shotsSegments.length} shots`)
37
 
38
  if (shotsSegments.length > 32) {
39
  throw new Error(`Error, this endpoint being synchronous, it is designed for short stories only (max 32 shots).`)
40
  }
41
 
42
- const newerClap = mode === "full" ? existingClap : newClap()
43
 
44
  // we process the shots in parallel (this will increase the queue size in the Gradio spaces)
45
  await Promise.all(shotsSegments.map(shotSegment =>
 
2
  import queryString from "query-string"
3
  import { ClapProject, ClapSegment, newClap, parseClap, serializeClap } from "@aitube/clap"
4
 
 
 
5
  import { parseCompletionMode } from "@/app/api/parsers/parseCompletionMode"
6
+ import { throwIfInvalidToken } from "@/app/api/v1/auth/throwIfInvalidToken"
7
 
8
  import { processShot } from "./processShot"
9
+ import { ClapCompletionMode } from "@aitube/client"
10
 
11
  // a helper to generate storyboards for a Clap
12
  // this is mostly used by external apps such as the Stories Factory
 
16
  // - add missing storyboard prompts
17
  // - add missing storyboard images
18
  export async function POST(req: NextRequest) {
19
+ await throwIfInvalidToken(req.headers.get("Authorization"))
 
20
 
21
  const qs = queryString.parseUrl(req.url || "")
22
  const query = (qs || {}).query
 
29
 
30
  if (!existingClap?.segments) { throw new Error(`no segment found in the provided clap!`) }
31
 
32
+ console.log(`api/v1/edit/storyboards(): detected ${existingClap.segments.length} segments`)
33
 
34
  const shotsSegments: ClapSegment[] = existingClap.segments.filter(s => s.category === "camera")
35
+ console.log(`api/v1/edit/storyboards(): detected ${shotsSegments.length} shots`)
36
 
37
  if (shotsSegments.length > 32) {
38
  throw new Error(`Error, this endpoint being synchronous, it is designed for short stories only (max 32 shots).`)
39
  }
40
 
41
+ const newerClap = mode === ClapCompletionMode.FULL ? existingClap : newClap()
42
 
43
  // we process the shots in parallel (this will increase the queue size in the Gradio spaces)
44
  await Promise.all(shotsSegments.map(shotSegment =>
src/app/api/v1/edit/types.ts DELETED
@@ -1,8 +0,0 @@
1
- export type ClapCompletionMode =
2
- // the full .clap is returned, containing both previous data and also new entries
3
- // this isn't the most optimized mode, obviously
4
- | "full"
5
-
6
- // only changes are
7
- | "partial"
8
-
 
 
 
 
 
 
 
 
 
src/app/api/v1/edit/videos/processShot.ts CHANGED
@@ -1,11 +1,19 @@
1
 
2
- import { ClapProject, ClapSegment, getClapAssetSourceType, newSegment,filterSegments, ClapSegmentFilteringMode } from "@aitube/clap"
 
 
 
 
 
 
 
 
3
  import { getVideoPrompt } from "@aitube/engine"
4
 
5
  import { getPositivePrompt } from "@/app/api/utils/imagePrompts"
6
 
7
  import { generateVideo } from "./generateVideo"
8
- import { ClapCompletionMode } from "../types"
9
 
10
  export async function processShot({
11
  shotSegment,
@@ -89,7 +97,7 @@ export async function processShot({
89
 
90
  // if mode is full, newerClap already contains the ference to shotVideoSegment
91
  // but if it's partial, we need to manually add it
92
- if (mode === "partial") {
93
  newerClap.segments.push(shotVideoSegment)
94
  }
95
 
 
1
 
2
+ import {
3
+ ClapProject,
4
+ ClapSegment,
5
+ getClapAssetSourceType,
6
+ newSegment,
7
+ filterSegments,
8
+ ClapSegmentFilteringMode
9
+ } from "@aitube/clap"
10
+ import { ClapCompletionMode } from "@aitube/client"
11
  import { getVideoPrompt } from "@aitube/engine"
12
 
13
  import { getPositivePrompt } from "@/app/api/utils/imagePrompts"
14
 
15
  import { generateVideo } from "./generateVideo"
16
+
17
 
18
  export async function processShot({
19
  shotSegment,
 
97
 
98
  // if mode is full, newerClap already contains the ference to shotVideoSegment
99
  // but if it's partial, we need to manually add it
100
+ if (mode !== ClapCompletionMode.FULL) {
101
  newerClap.segments.push(shotVideoSegment)
102
  }
103
 
src/app/api/v1/edit/videos/route.ts CHANGED
@@ -1,13 +1,12 @@
1
  import { NextResponse, NextRequest } from "next/server"
2
  import queryString from "query-string"
3
  import { ClapProject, ClapSegment, newClap, parseClap, serializeClap } from "@aitube/clap"
4
-
5
- import { getToken } from "@/app/api/auth/getToken"
6
-
7
- import { processShot } from "./processShot"
8
 
9
  import { parseCompletionMode } from "@/app/api/parsers/parseCompletionMode"
 
10
 
 
11
 
12
  // a helper to generate videos for a Clap
13
  // this is mostly used by external apps such as the Stories Factory
@@ -17,8 +16,7 @@ import { parseCompletionMode } from "@/app/api/parsers/parseCompletionMode"
17
  // - add missing video prompts
18
  // - add missing video files
19
  export async function POST(req: NextRequest) {
20
-
21
- const jwtToken = await getToken({ user: "anonymous" })
22
 
23
  const qs = queryString.parseUrl(req.url || "")
24
  const query = (qs || {}).query
@@ -31,16 +29,16 @@ export async function POST(req: NextRequest) {
31
 
32
  if (!existingClap?.segments) { throw new Error(`no segment found in the provided clap!`) }
33
 
34
- console.log(`[api/edit/videos] detected ${existingClap.segments.length} segments`)
35
 
36
  const shotsSegments: ClapSegment[] = existingClap.segments.filter(s => s.category === "camera")
37
- console.log(`[api/edit/videos] detected ${shotsSegments.length} shots`)
38
 
39
  if (shotsSegments.length > 32) {
40
  throw new Error(`Error, this endpoint being synchronous, it is designed for short stories only (max 32 shots).`)
41
  }
42
 
43
- const newerClap = mode === "full" ? existingClap : newClap()
44
 
45
  // we process the shots in parallel (this will increase the queue size in the Gradio spaces)
46
  await Promise.all(shotsSegments.map(shotSegment =>
@@ -52,7 +50,7 @@ export async function POST(req: NextRequest) {
52
  })
53
  ))
54
 
55
- console.log(`[api/edit/videos] returning the clap augmented with videos`)
56
 
57
  return new NextResponse(await serializeClap(newerClap), {
58
  status: 200,
 
1
  import { NextResponse, NextRequest } from "next/server"
2
  import queryString from "query-string"
3
  import { ClapProject, ClapSegment, newClap, parseClap, serializeClap } from "@aitube/clap"
4
+ import { ClapCompletionMode } from "@aitube/client"
 
 
 
5
 
6
  import { parseCompletionMode } from "@/app/api/parsers/parseCompletionMode"
7
+ import { throwIfInvalidToken } from "@/app/api/v1/auth/throwIfInvalidToken"
8
 
9
+ import { processShot } from "./processShot"
10
 
11
  // a helper to generate videos for a Clap
12
  // this is mostly used by external apps such as the Stories Factory
 
16
  // - add missing video prompts
17
  // - add missing video files
18
  export async function POST(req: NextRequest) {
19
+ await throwIfInvalidToken(req.headers.get("Authorization"))
 
20
 
21
  const qs = queryString.parseUrl(req.url || "")
22
  const query = (qs || {}).query
 
29
 
30
  if (!existingClap?.segments) { throw new Error(`no segment found in the provided clap!`) }
31
 
32
+ console.log(`api/edit/videos(): detected ${existingClap.segments.length} segments`)
33
 
34
  const shotsSegments: ClapSegment[] = existingClap.segments.filter(s => s.category === "camera")
35
+ console.log(`api/edit/videos(): detected ${shotsSegments.length} shots`)
36
 
37
  if (shotsSegments.length > 32) {
38
  throw new Error(`Error, this endpoint being synchronous, it is designed for short stories only (max 32 shots).`)
39
  }
40
 
41
+ const newerClap = mode === ClapCompletionMode.FULL ? existingClap : newClap()
42
 
43
  // we process the shots in parallel (this will increase the queue size in the Gradio spaces)
44
  await Promise.all(shotsSegments.map(shotSegment =>
 
50
  })
51
  ))
52
 
53
+ console.log(`api/edit/videos(): returning the clap augmented with videos`)
54
 
55
  return new NextResponse(await serializeClap(newerClap), {
56
  status: 200,
src/app/api/v1/export/route.ts CHANGED
@@ -1,22 +1,17 @@
1
  import { NextResponse, NextRequest } from "next/server"
2
  import queryString from "query-string"
3
 
4
- type SupportedExportFormat = "mp4" | "webm"
5
- const defaultExportFormat = "mp4"
6
 
7
  // we hide/wrap the micro-service under a unified AiTube API
8
  export async function POST(req: NextRequest, res: NextResponse) {
 
9
 
10
  const qs = queryString.parseUrl(req.url || "")
11
  const query = (qs || {}).query
12
 
13
- let format: SupportedExportFormat = defaultExportFormat
14
- try {
15
- format = decodeURIComponent(query?.f?.toString() || defaultExportFormat).trim() as SupportedExportFormat
16
- if (format !== "mp4" && format !== "webm") {
17
- format = defaultExportFormat
18
- }
19
- } catch (err) {}
20
 
21
  // let's call our micro-service, which is currently open bar.
22
  const result = await fetch(
 
1
  import { NextResponse, NextRequest } from "next/server"
2
  import queryString from "query-string"
3
 
4
+ import { parseSupportedExportFormat } from "@/app/api/parsers/parseSupportedExportFormat"
5
+ import { throwIfInvalidToken } from "@/app/api/v1/auth/throwIfInvalidToken"
6
 
7
  // we hide/wrap the micro-service under a unified AiTube API
8
  export async function POST(req: NextRequest, res: NextResponse) {
9
+ await throwIfInvalidToken(req.headers.get("Authorization"))
10
 
11
  const qs = queryString.parseUrl(req.url || "")
12
  const query = (qs || {}).query
13
 
14
+ const format = parseSupportedExportFormat(query?.f)
 
 
 
 
 
 
15
 
16
  // let's call our micro-service, which is currently open bar.
17
  const result = await fetch(
src/app/api/v1/types.ts ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { ClapSegmentCategory } from "@aitube/clap"
2
+
3
// A simplified representation of a story entity (character, location..)
// as hallucinated by the LLM (see the entities systemPrompt for the schema).
export type LatentEntity = {
  name: string
  category: ClapSegmentCategory // per the prompt schema: "character" or "location"
  image: string // Stable Diffusion-style visual description of the entity
  audio: string // textual description of what/how the entity sounds like
  shots: number[] // IDs of the shots in which the entity is present
}
10
+
11
// A simplified view of one shot of a story,
// used to pass a compact story representation to a LLM.
export type LatentStory = {
  title: string // short title/caption of the shot
  image: string // storyboard image prompt for the shot
  voice: string // voice-over / dialogue text for the shot
}
src/app/latent/search/page.tsx CHANGED
@@ -5,7 +5,7 @@ import { LatentQueryProps } from "@/types/general"
5
 
6
  import { Main } from "../../main"
7
  import { getNewMediaInfo } from "../../api/generators/search/getNewMediaInfo"
8
- import { getToken } from "../../api/auth/getToken"
9
 
10
  // https://jmswrnr.com/blog/protecting-next-js-api-routes-query-parameters
11
 
 
5
 
6
  import { Main } from "../../main"
7
  import { getNewMediaInfo } from "../../api/generators/search/getNewMediaInfo"
8
+ import { getToken } from "../../api/v1/auth/getToken"
9
 
10
  // https://jmswrnr.com/blog/protecting-next-js-api-routes-query-parameters
11
 
src/app/latent/watch/page.tsx CHANGED
@@ -6,7 +6,7 @@ import { parseBasicSearchResult } from '@/app/api/parsers/parseBasicSearchResult
6
 
7
  import { Main } from "../../main"
8
  import { getNewMediaInfo } from "../../api/generators/search/getNewMediaInfo"
9
- import { getToken } from "../../api/auth/getToken"
10
 
11
  // https://jmswrnr.com/blog/protecting-next-js-api-routes-query-parameters
12
 
 
6
 
7
  import { Main } from "../../main"
8
  import { getNewMediaInfo } from "../../api/generators/search/getNewMediaInfo"
9
+ import { getToken } from "../../api/v1/auth/getToken"
10
 
11
  // https://jmswrnr.com/blog/protecting-next-js-api-routes-query-parameters
12
 
src/types/general.ts CHANGED
@@ -54,6 +54,8 @@ export interface RenderRequest {
54
  wait: boolean // wait until the job is completed
55
 
56
  analyze: boolean // analyze the image to generate a caption (optional)
 
 
57
  }
58
 
59
  export interface ImageSegment {
 
54
  wait: boolean // wait until the job is completed
55
 
56
  analyze: boolean // analyze the image to generate a caption (optional)
57
+
58
+ identityImage: string // reference image for the main entity
59
  }
60
 
61
  export interface ImageSegment {