jbilcke-hf HF staff commited on
Commit
ec194c9
β€’
1 Parent(s): 8aa943e
package-lock.json CHANGED
@@ -26,6 +26,7 @@
26
  "nodejs-whisper": "^0.1.4",
27
  "openai": "^3.3.0",
28
  "puppeteer": "^20.8.0",
 
29
  "resize-base64": "^1.0.12",
30
  "sharp": "^0.32.4",
31
  "temp-dir": "^3.0.0",
@@ -4401,6 +4402,17 @@
4401
  "node": ">=0.10"
4402
  }
4403
  },
 
 
 
 
 
 
 
 
 
 
 
4404
  "node_modules/request": {
4405
  "version": "2.88.2",
4406
  "resolved": "https://registry.npmjs.org/request/-/request-2.88.2.tgz",
 
26
  "nodejs-whisper": "^0.1.4",
27
  "openai": "^3.3.0",
28
  "puppeteer": "^20.8.0",
29
+ "replicate": "^0.20.1",
30
  "resize-base64": "^1.0.12",
31
  "sharp": "^0.32.4",
32
  "temp-dir": "^3.0.0",
 
4402
  "node": ">=0.10"
4403
  }
4404
  },
4405
+ "node_modules/replicate": {
4406
+ "version": "0.20.1",
4407
+ "resolved": "https://registry.npmjs.org/replicate/-/replicate-0.20.1.tgz",
4408
+ "integrity": "sha512-QVyI1rowGsSfNuDrRmumYPdCHa/fN/RkI3NHpcK0i5hSSiWK69URAyheAC/0MIAiS3oUs4kD56PB9zEI4oHENw==",
4409
+ "engines": {
4410
+ "git": ">=2.11.0",
4411
+ "node": ">=18.0.0",
4412
+ "npm": ">=7.19.0",
4413
+ "yarn": ">=1.7.0"
4414
+ }
4415
+ },
4416
  "node_modules/request": {
4417
  "version": "2.88.2",
4418
  "resolved": "https://registry.npmjs.org/request/-/request-2.88.2.tgz",
package.json CHANGED
@@ -33,6 +33,7 @@
33
  "nodejs-whisper": "^0.1.4",
34
  "openai": "^3.3.0",
35
  "puppeteer": "^20.8.0",
 
36
  "resize-base64": "^1.0.12",
37
  "sharp": "^0.32.4",
38
  "temp-dir": "^3.0.0",
 
33
  "nodejs-whisper": "^0.1.4",
34
  "openai": "^3.3.0",
35
  "puppeteer": "^20.8.0",
36
+ "replicate": "^0.20.1",
37
  "resize-base64": "^1.0.12",
38
  "sharp": "^0.32.4",
39
  "temp-dir": "^3.0.0",
src/production/renderVideo.mts CHANGED
@@ -1,5 +1,5 @@
1
  import { RenderedScene, RenderRequest } from "../types.mts"
2
- import { generateVideo } from "../providers/video-generation/generateVideo.mts"
3
 
4
  export async function renderVideo(
5
  request: RenderRequest,
 
1
  import { RenderedScene, RenderRequest } from "../types.mts"
2
+ import { generateVideo } from "../providers/video-generation/generateVideoWithZeroscope.mts"
3
 
4
  export async function renderVideo(
5
  request: RenderRequest,
src/providers/lip-syncing/generateLipSyncVideo.mts ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+
2
+
3
+ // TODO: we should use
4
+
5
+
6
+ // or we can use Video ReTalking but it requires a video where the person is already talking I believe:
7
+ // https://twitter.com/camenduru/status/1713570931342237852
src/providers/music-generation/generateMusicWithReplicate.mts ADDED
@@ -0,0 +1 @@
 
 
1
+ // TODO use https://replicate.com/sakemin/musicgen-fine-tuner
src/providers/video-generation/defaultPrompts.mts ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // should we really have default prompts in here?
2
+ // I think they should probably be defined at the applicative software layer (ie. in the client)
3
+
4
+ export function addWordsIfNotPartOfThePrompt(prompt: string = "", words: string[] = []): string {
5
+ const promptWords = prompt.split(",").map(w => w.trim().toLocaleLowerCase())
6
+
7
+ return [
8
+ prompt,
9
+ // we add our keywords, unless they are already part of the prompt
10
+ ...words.filter(w => !promptWords.includes(w.toLocaleLowerCase()))
11
+ ].join(", ")
12
+ }
13
+
14
+ export function getPositivePrompt(prompt: string, triggerWord = "") {
15
+ return addWordsIfNotPartOfThePrompt(prompt, [
16
+ triggerWord,
17
+ "crisp",
18
+ "sharp",
19
+ "beautiful",
20
+ "4K",
21
+ "hd"
22
+ ])
23
+ }
24
+
25
+ export function getNegativePrompt(prompt: string) {
26
+ return addWordsIfNotPartOfThePrompt(prompt, [
27
+ "cropped",
28
+ // "underexposed", // <-- can be a desired style
29
+ // "overexposed", // <-- can be a desired style
30
+ "logo",
31
+ "censored",
32
+ "watermark",
33
+ "watermarked",
34
+ "extra digit",
35
+ "fewer digits",
36
+ "bad fingers",
37
+ "bad quality",
38
+ "worst quality",
39
+ "low quality",
40
+ "low resolution",
41
+ "glitch", // <-- keep or not? could be a desired style?
42
+ "deformed",
43
+ "mutated",
44
+ "ugly",
45
+ "disfigured",
46
+ "3D render", // <-- keep or not? could be a desired style?
47
+ "signature"
48
+ ])
49
+ }
src/providers/video-generation/generateVideoWithHotshotGradioAPI.mts ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { client } from "@gradio/client"
2
+
3
+ import { VideoGenerationOptions } from "./types.mts"
4
+ import { getNegativePrompt, getPositivePrompt } from "./defaultPrompts.mts"
5
+ import { generateSeed } from "../../utils/misc/generateSeed.mts"
6
+
7
+ // we don't use replicas yet, because it ain't easy to get their hostname
8
+ const instances: string[] = [
9
+ `${process.env.VC_HOTSHOT_XL_GRADIO_SPACE_API_URL_1 || ""}`,
10
+ ].filter(instance => instance?.length > 0)
11
+
12
+ const secretToken = `${process.env.VC_MICROSERVICE_SECRET_TOKEN || ""}`
13
+
14
+ export const generateVideo = async ({
15
+ positivePrompt,
16
+ negativePrompt = "",
17
+ seed,
18
+ nbFrames = 8, // for now the only values that make sense are 1 (for a jpg) or 8 (for a video)
19
+ videoDuration = 1000, // for now Hotshot doesn't really supports anything else
20
+ nbSteps = 30, // when rendering a final video, we want a value like 50 or 70 here
21
+ size = "768x320",
22
+
23
+ // for jbilcke-hf/sdxl-cinematic-2 it is "cinematic-2"
24
+ triggerWord = "cinematic-2",
25
+
26
+ huggingFaceLora = "jbilcke-hf/sdxl-cinematic-2",
27
+ }: VideoGenerationOptions) => {
28
+
29
+ const instance = instances.shift()
30
+ instances.push(instance)
31
+
32
+ const api = await client(instance, {
33
+ hf_token: `${process.env.VC_HF_API_TOKEN}` as any
34
+ })
35
+
36
+ // pimp the prompt
37
+ positivePrompt = getPositivePrompt(positivePrompt, triggerWord)
38
+ negativePrompt = getNegativePrompt(negativePrompt)
39
+
40
+ try {
41
+
42
+ const rawResponse = await api.predict('/run', [
43
+ secretToken,
44
+ positivePrompt, // string in 'Prompt' Textbox component
45
+ negativePrompt || "",
46
+ huggingFaceLora?.length || undefined, // string in 'Public LoRA ID' Textbox component
47
+ size || '512x512', // string (Option from: [('320x768', '320x768'), ('384x672', '384x672'), ('416x608', '416x608'), ('512x512', '512x512'), ('608x416', '608x416'), ('672x384', '672x384'), ('768x320', '768x320')]) in 'Size' Dropdown component
48
+ !isNaN(seed) && isFinite(seed) ? seed : generateSeed(), // number (numeric value between -1 and 423538377342) in 'Seed' Slider component, -1 to set to random
49
+ nbSteps || 30,
50
+ nbFrames || 8,
51
+ videoDuration || 1000,
52
+ ]) as any
53
+
54
+ // console.log("rawResponse:", rawResponse)
55
+
56
+ console.log("data:", rawResponse?.data)
57
+ const { name } = rawResponse?.data?.[0]?.[0] as { name: string, orig_name: string }
58
+
59
+ return `${instance}/file=${name}`
60
+ } catch (err) {
61
+ throw err
62
+ }
63
+ }
src/providers/video-generation/generateVideoWithHotshotReplicate.mts ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ "use server"
2
+
3
+ import Replicate from "replicate"
4
+
5
+ import { generateSeed } from "../../utils/misc/generateSeed.mts"
6
+ import { sleep } from "../../utils/misc/sleep.mts"
7
+ import { getNegativePrompt, getPositivePrompt } from "./defaultPrompts.mts"
8
+ import { VideoGenerationOptions } from "./types.mts"
9
+
10
+ const replicateToken = `${process.env.AUTH_REPLICATE_API_TOKEN || ""}`
11
+ const replicateModel = `${process.env.HOTSHOT_XL_REPLICATE_MODEL || ""}`
12
+ const replicateModelVersion = `${process.env.HOTSHOT_XL_REPLICATE_MODEL_VERSION || ""}`
13
+
14
+ if (!replicateToken) {
15
+ throw new Error(`you need to configure your AUTH_REPLICATE_API_TOKEN`)
16
+ }
17
+
18
+ const replicate = new Replicate({ auth: replicateToken })
19
+
20
+ /**
21
+ * Generate a video with hotshot through Replicate
22
+ *
23
+ * Note that if nbFrames == 1, then it will generate a jpg
24
+ *
25
+ */
26
+ export async function generateVideoWithHotshotReplicate({
27
+ positivePrompt,
28
+ negativePrompt = "",
29
+ seed,
30
+ nbFrames = 8, // for now the only values that make sense are 1 (for a jpg) or 8 (for a video)
31
+ videoDuration = 1000, // for now Hotshot doesn't really supports anything else
32
+ nbSteps = 30, // when rendering a final video, we want a value like 50 or 70 here
33
+ size = "768x320",
34
+
35
+ // for a replicate LoRa this is always the same ("In the style of TOK")
36
+ // triggerWord = "In the style of TOK",
37
+
38
+ // for jbilcke-hf/sdxl-cinematic-2 it is "cinematic-2"
39
+ triggerWord = "cinematic-2",
40
+
41
+ huggingFaceLora = "jbilcke-hf/sdxl-cinematic-2",
42
+
43
+ // url to the weight
44
+ replicateLora,
45
+ }: VideoGenerationOptions): Promise<string> {
46
+
47
+ if (!positivePrompt?.length) {
48
+ throw new Error(`prompt is too short!`)
49
+ }
50
+
51
+ if (!replicateModel) {
52
+ throw new Error(`you need to configure your HOTSHOT_XL_REPLICATE_MODEL`)
53
+ }
54
+
55
+ if (!replicateModelVersion) {
56
+ throw new Error(`you need to configure your HOTSHOT_XL_REPLICATE_MODEL_VERSION`)
57
+ }
58
+
59
+ // pimp the prompt
60
+ positivePrompt = getPositivePrompt(positivePrompt, triggerWord)
61
+ negativePrompt = getNegativePrompt(negativePrompt)
62
+
63
+ const [width, height] = size.split("x").map(x => Number(x))
64
+
65
+ // see an example here:
66
+ // https://replicate.com/p/incraplbv23g3zv6woinhgdira
67
+ // for params and doc see https://replicate.com/cloneofsimo/hotshot-xl-lora-controlnet
68
+ const prediction = await replicate.predictions.create({
69
+ version: replicateModelVersion,
70
+ input: {
71
+ prompt: positivePrompt,
72
+ negative_prompt: negativePrompt,
73
+
74
+ // this is not a URL but a model name
75
+ hf_lora_url: replicateLora?.length ? undefined : huggingFaceLora,
76
+
77
+ // this is a URL to the .tar (we can get it from the "trainings" page)
78
+ replicate_weights_url: huggingFaceLora?.length ? undefined : replicateLora,
79
+
80
+ width,
81
+ height,
82
+
83
+ // those are used to create an upsampling or downsampling
84
+ // original_width: width,
85
+ // original_height: height,
86
+ // target_width: width,
87
+ // target_height: height,
88
+
89
+ steps: nbSteps,
90
+
91
+
92
+ // note: right now it only makes sense to use either 1 (a jpg)
93
+ video_length: nbFrames, // nb frames
94
+
95
+ video_duration: videoDuration, // video duration in ms
96
+
97
+ seed: !isNaN(seed) && isFinite(seed) ? seed : generateSeed()
98
+ }
99
+ })
100
+
101
+ // console.log("prediction:", prediction)
102
+
103
+ // Replicate requires at least 30 seconds of mandatory delay
104
+ await sleep(30000)
105
+
106
+ let res: Response
107
+ let pollingCount = 0
108
+ do {
109
+ // Check every 5 seconds
110
+ await sleep(5000)
111
+
112
+ res = await fetch(`https://api.replicate.com/v1/predictions/${prediction.id}`, {
113
+ method: "GET",
114
+ headers: {
115
+ Authorization: `Token ${replicateToken}`,
116
+ },
117
+ cache: 'no-store',
118
+ })
119
+
120
+ if (res.status === 200) {
121
+ const response = (await res.json()) as any
122
+ const error = `${response?.error || ""}`
123
+ if (error) {
124
+ throw new Error(error)
125
+ }
126
+ }
127
+
128
+ pollingCount++
129
+
130
+ // To prevent indefinite polling, we can stop after a certain number, here 30 (i.e. about 2 and half minutes)
131
+ if (pollingCount >= 30) {
132
+ throw new Error('Request time out.')
133
+ }
134
+ } while (true)
135
+ }
src/providers/video-generation/generateVideoWithShow.mts ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ // we don't use replicas yet, because it ain't easy to get their hostname
3
+ const instances: string[] = [
4
+ `${process.env.VC_ZEROSCOPE_SPACE_API_URL_1 || ""}`,
5
+ ].filter(instance => instance?.length > 0)
6
+
7
+ const secretToken = `${process.env.VC_MICROSERVICE_SECRET_TOKEN || ""}`
8
+
9
+ export const generateVideo = async (prompt: string, options?: {
10
+ seed: number;
11
+ nbFrames: number;
12
+ nbSteps: number;
13
+ }) => {
14
+ throw new Error("Not implemented yet")
15
+ }
src/providers/video-generation/generateVideoWithZeroscope.mts ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { client } from "@gradio/client"
2
+
3
+ import { generateSeed } from "../../utils/misc/generateSeed.mts"
4
+ import { VideoGenerationOptions } from "./types.mts"
5
+ import { getPositivePrompt } from "./defaultPrompts.mts"
6
+
7
+ // we don't use replicas yet, because it ain't easy to get their hostname
8
+ const instances: string[] = [
9
+ `${process.env.VC_ZEROSCOPE_SPACE_API_URL_1 || ""}`,
10
+ `${process.env.VC_ZEROSCOPE_SPACE_API_URL_2 || ""}`,
11
+ `${process.env.VC_ZEROSCOPE_SPACE_API_URL_3 || ""}`,
12
+ `${process.env.VC_ZEROSCOPE_SPACE_API_URL_4 || ""}`,
13
+ ].filter(instance => instance?.length > 0)
14
+
15
+ const secretToken = `${process.env.VC_MICROSERVICE_SECRET_TOKEN || ""}`
16
+
17
+ export const generateVideo = async ({
18
+ positivePrompt,
19
+ seed,
20
+ nbFrames = 8, // for now the only values that make sense are 1 (for a jpg) or 8 (for a video)
21
+ nbSteps = 30, // when rendering a final video, we want a value like 50 or 70 here
22
+ }: VideoGenerationOptions) => {
23
+ try {
24
+ const instance = instances.shift()
25
+ instances.push(instance)
26
+
27
+ const api = await client(instance, {
28
+ hf_token: `${process.env.VC_HF_API_TOKEN}` as any
29
+ })
30
+
31
+ const rawResponse = await api.predict('/run', [
32
+ getPositivePrompt(positivePrompt), // string in 'Prompt' Textbox component
33
+
34
+ // we treat 0 as meaning "random seed"
35
+ !isNaN(seed) && isFinite(seed) && seed > 0 ? seed : generateSeed(), // number (numeric value between 0 and 2147483647) in 'Seed' Slider component
36
+ nbFrames || 24, // 24 // it is the nb of frames per seconds I think?
37
+ nbSteps || 35, // 10, (numeric value between 10 and 50) in 'Number of inference steps' Slider component
38
+ secretToken,
39
+ ]) as any
40
+
41
+ // console.log("rawResponse:", rawResponse)
42
+
43
+ const { name } = rawResponse?.data?.[0]?.[0] as { name: string, orig_name: string }
44
+
45
+ return `${instance}/file=${name}`
46
+ } catch (err) {
47
+ throw err
48
+ }
49
+ }
src/providers/video-generation/types.mts ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import { HotshotImageInferenceSize } from "../../types.mts"

// Options shared by the video-generation providers (Gradio Spaces, Replicate).
export type VideoGenerationOptions = {
  /**
   * Positive prompt to use
   */
  positivePrompt: string

  /**
   * Negative prompt to use
   */
  negativePrompt?: string

  /**
   * Seed.
   *
   * Depending on the vendor, if you use a negative value (eg -1) it should give you an always random value
   */
  seed?: number

  /**
   * Number of frames to generate
   */
  nbFrames?: number

  /**
   * Duration of the video, in milliseconds
   * (callers default this to 1000 and the Replicate provider forwards it
   * as "video duration in ms" — the previous "in seconds" wording was wrong)
   */
  videoDuration?: number

  /**
   * Number of inference steps (for final rendering use 70)
   */
  nbSteps?: number

  /**
   * Image size (which is actually a ratio)
   *
   * Note that Hotshot wasn't trained on all possible combinations,
   * and in particular by default it is only supposed to support 512x512 well
   */
  size?: HotshotImageInferenceSize

  /**
   * Trigger word
   *
   * for a replicate LoRa this is always the same ("In the style of TOK")
   * triggerWord = "In the style of TOK",
   * for jbilcke-hf/sdxl-cinematic-2 it is "cinematic-2"
   */
  triggerWord?: string

  /**
   * Owner + repo name of the Hugging Face LoRA
   */
  huggingFaceLora?: string

  /**
   * URL to the weights .tar (those can be hosted anywhere, it doesn't have to be on Replicate.com)
   */
  replicateLora?: string
}
src/providers/video-interpolation/interpolateVideoWithReplicate.mts ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ // this looks really great!
3
+ // https://replicate.com/zsxkib/st-mfnet?prediction=bufijj3b45cjoe43pzloqkcghy
4
+
5
+ "use server"
6
+
7
+ import Replicate from "replicate"
8
+
9
+ import { sleep } from "../../utils/misc/sleep.mts"
10
+
11
+ const replicateToken = `${process.env.AUTH_REPLICATE_API_TOKEN || ""}`
12
+ const replicateModel = `${process.env.STMFNET_REPLICATE_MODEL || ""}`
13
+ const replicateModelVersion = `${process.env.STMFNET_REPLICATE_MODEL_VERSION || ""}`
14
+
15
+ if (!replicateToken) {
16
+ throw new Error(`you need to configure your AUTH_REPLICATE_API_TOKEN`)
17
+ }
18
+
19
+ const replicate = new Replicate({ auth: replicateToken })
20
+
21
+ /**
22
+ * Interpolate a video using Replicate
23
+ *
24
+ * Important note: the video will lose its sound, if any!
25
+ *
26
+ * With the current settingd, duration of the original video will be preserved
27
+ * (but we could make slow-mo too)
28
+ */
29
+ export async function interpolateVideoWithReplicate({
30
+ video,
31
+
32
+ // so arguably 60 would look smoother, but we are tying to reach for a "movie" kind of feel here
33
+ nbFrames = 24,
34
+ }: {
35
+ video: string
36
+
37
+ /**
38
+ * Number of frame (duration of the original video will be preserved)
39
+ */
40
+ nbFrames?: number // min 1, max: 240
41
+ }): Promise<string> {
42
+
43
+ if (!video) {
44
+ throw new Error(`no video provided`)
45
+ }
46
+
47
+ if (!replicateModel) {
48
+ throw new Error(`you need to configure your STMFNET_REPLICATE_MODEL`)
49
+ }
50
+
51
+ if (!replicateModelVersion) {
52
+ throw new Error(`you need to configure your STMFNET_REPLICATE_MODEL_VERSION`)
53
+ }
54
+
55
+ // for params and doc see https://replicate.com/zsxkib/st-mfnet
56
+ const prediction = await replicate.predictions.create({
57
+ version: replicateModelVersion,
58
+ input: {
59
+ mp4: video, // I think it should be a base64 object?
60
+ framerate_multiplier: 2, // can be one of 2, 4, 8, 16, 32
61
+
62
+ // note: for now we use the simplest setting, which is to keep the original video duration
63
+ // if we don't keep the original duration, the video will look like a slow motion animation
64
+ // which may be a desired effect, but let's keep it simple for now
65
+ keep_original_duration: true, // false,
66
+ custom_fps: nbFrames // min 1, max: 240
67
+ }
68
+ })
69
+
70
+ // console.log("prediction:", prediction)
71
+
72
+ // Replicate requires at least 8 seconds of mandatory delay
73
+ await sleep(10000)
74
+
75
+ let res: Response
76
+ let pollingCount = 0
77
+ do {
78
+ // This is normally a fast model, so let's check every 2 seconds
79
+ await sleep(2000)
80
+
81
+ res = await fetch(`https://api.replicate.com/v1/predictions/${prediction.id}`, {
82
+ method: "GET",
83
+ headers: {
84
+ Authorization: `Token ${replicateToken}`,
85
+ },
86
+ cache: 'no-store',
87
+ })
88
+
89
+ if (res.status === 200) {
90
+ const response = (await res.json()) as any
91
+ const error = `${response?.error || ""}`
92
+ if (error) {
93
+ throw new Error(error)
94
+ }
95
+ }
96
+
97
+ pollingCount++
98
+
99
+ // To prevent indefinite polling, we can stop after a certain number
100
+ if (pollingCount >= 30) {
101
+ throw new Error('Request time out.')
102
+ }
103
+ } while (true)
104
+ }
src/providers/{video-generation/generateVideo.mts β†’ video-transformation/transformVideoWithHotshotReplicate.mts} RENAMED
@@ -9,9 +9,6 @@ export const state = {
9
  // we don't use replicas yet, because it ain't easy to get their hostname
10
  const instances: string[] = [
11
  `${process.env.VC_ZEROSCOPE_SPACE_API_URL_1 || ""}`,
12
- `${process.env.VC_ZEROSCOPE_SPACE_API_URL_2 || ""}`,
13
- `${process.env.VC_ZEROSCOPE_SPACE_API_URL_3 || ""}`,
14
- `${process.env.VC_ZEROSCOPE_SPACE_API_URL_4 || ""}`,
15
  ].filter(instance => instance?.length > 0)
16
 
17
  const secretToken = `${process.env.VC_MICROSERVICE_SECRET_TOKEN || ""}`
@@ -21,6 +18,7 @@ export const generateVideo = async (prompt: string, options?: {
21
  nbFrames: number;
22
  nbSteps: number;
23
  }) => {
 
24
 
25
  /* let's disable load control, and let it use the queue */
26
  /*
 
9
  // we don't use replicas yet, because it ain't easy to get their hostname
10
  const instances: string[] = [
11
  `${process.env.VC_ZEROSCOPE_SPACE_API_URL_1 || ""}`,
 
 
 
12
  ].filter(instance => instance?.length > 0)
13
 
14
  const secretToken = `${process.env.VC_MICROSERVICE_SECRET_TOKEN || ""}`
 
18
  nbFrames: number;
19
  nbSteps: number;
20
  }) => {
21
+ throw new Error("Not implemented yet")
22
 
23
  /* let's disable load control, and let it use the queue */
24
  /*
src/scheduler/processVideo.mts CHANGED
@@ -2,7 +2,7 @@ import { v4 as uuidv4 } from "uuid"
2
 
3
  import { Video, VideoShot } from "../types.mts"
4
 
5
- import { generateVideo } from "../providers/video-generation/generateVideo.mts"
6
  import { upscaleVideo } from "../providers/video-upscaling/upscaleVideo.mts"
7
  import { interpolateVideo } from "../providers/video-interpolation/interpolateVideo.mts"
8
  import { postInterpolation } from "../production/postInterpolation.mts"
 
2
 
3
  import { Video, VideoShot } from "../types.mts"
4
 
5
+ import { generateVideo } from "../providers/video-generation/generateVideoWithZeroscope.mts"
6
  import { upscaleVideo } from "../providers/video-upscaling/upscaleVideo.mts"
7
  import { interpolateVideo } from "../providers/video-interpolation/interpolateVideo.mts"
8
  import { postInterpolation } from "../production/postInterpolation.mts"
src/types.mts CHANGED
@@ -384,3 +384,17 @@ export type RenderingJob = {
384
 
385
  status: 'pending' | 'completed' | 'error'
386
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
384
 
385
  status: 'pending' | 'completed' | 'error'
386
  }
387

// vendor-specific types

// Sizes accepted by the Hotshot-XL provider ("size" is really an aspect-ratio
// preset); the last three entries are custom ratios that aren't officially
// supported and may not render properly.
export type HotshotImageInferenceSize =
| '320x768'
| '384x672'
| '416x608'
| '512x512'
| '608x416'
| '672x384'
| '768x320'
| '1024x1024' // custom ratio - this isn't supported / supposed to work properly
| '1024x512' // custom panoramic ratio - this isn't supported / supposed to work properly
| '1024x576' // movie ratio (16:9) this isn't supported / supposed to work properly