jbilcke-hf HF staff commited on
Commit
0176e5b
1 Parent(s): cf329f1

let's add some music

Browse files
package-lock.json CHANGED
@@ -9,7 +9,7 @@
9
  "version": "0.0.0",
10
  "dependencies": {
11
  "@aitube/clap": "0.0.16",
12
- "@aitube/client": "0.0.23",
13
  "@aitube/engine": "0.0.6",
14
  "@huggingface/hub": "0.12.3-oauth",
15
  "@huggingface/inference": "^2.6.7",
@@ -130,9 +130,9 @@
130
  }
131
  },
132
  "node_modules/@aitube/client": {
133
- "version": "0.0.23",
134
- "resolved": "https://registry.npmjs.org/@aitube/client/-/client-0.0.23.tgz",
135
- "integrity": "sha512-zZeGacE2WWSIO1h+HOQu6ExwWfJ01mzW1SreP3bN67vOmrau+bWRzZmX6Wg7DAHePnjvTkeR01TAiZVXskJOkw==",
136
  "dependencies": {
137
  "query-string": "^9.0.0"
138
  },
@@ -4323,9 +4323,9 @@
4323
  "integrity": "sha512-I88TYZWc9XiYHRQ4/3c5rjjfgkjhLyW2luGIheGERbNQ6OY7yTybanSpDXZa8y7VUP9YmDcYa+eyq4ca7iLqWA=="
4324
  },
4325
  "node_modules/electron-to-chromium": {
4326
- "version": "1.4.761",
4327
- "resolved": "https://registry.npmjs.org/electron-to-chromium/-/electron-to-chromium-1.4.761.tgz",
4328
- "integrity": "sha512-PIbxpiJGx6Bb8dQaonNc6CGTRlVntdLg/2nMa1YhnrwYOORY9a3ZgGN0UQYE6lAcj/lkyduJN7BPt/JiY+jAQQ=="
4329
  },
4330
  "node_modules/elliptic": {
4331
  "version": "6.5.4",
@@ -6691,9 +6691,9 @@
6691
  }
6692
  },
6693
  "node_modules/openai": {
6694
- "version": "4.43.0",
6695
- "resolved": "https://registry.npmjs.org/openai/-/openai-4.43.0.tgz",
6696
- "integrity": "sha512-4SMUB/XiqnO5IrEcdzEGGTcHoeXq7D/k82v36zoqSitrMUjenZXGH5JysIH7aF7Wr+gjvq0dT2mV6wLVKA7Seg==",
6697
  "dependencies": {
6698
  "@types/node": "^18.11.18",
6699
  "@types/node-fetch": "^2.6.4",
@@ -7922,9 +7922,9 @@
7922
  "integrity": "sha512-NnzSOEKyv4I83qbuKw9ROtJrrT6Z/Xt7I0HiP/e6H6GnpeTDvzwGIGeJ8slai+VwODSHQDooW2CAilJwT9SpRg=="
7923
  },
7924
  "node_modules/styled-components": {
7925
- "version": "6.1.10",
7926
- "resolved": "https://registry.npmjs.org/styled-components/-/styled-components-6.1.10.tgz",
7927
- "integrity": "sha512-4K8IKcn7iOt76riGLjvBhRyNPTkUKTvmnwoRFBOtJLswVvzy2VsoE2KOrfl9FJLQUYbITLJY2wfIZ3tjbkA/Zw==",
7928
  "dependencies": {
7929
  "@emotion/is-prop-valid": "1.2.2",
7930
  "@emotion/unitless": "0.8.1",
 
9
  "version": "0.0.0",
10
  "dependencies": {
11
  "@aitube/clap": "0.0.16",
12
+ "@aitube/client": "0.0.24",
13
  "@aitube/engine": "0.0.6",
14
  "@huggingface/hub": "0.12.3-oauth",
15
  "@huggingface/inference": "^2.6.7",
 
130
  }
131
  },
132
  "node_modules/@aitube/client": {
133
+ "version": "0.0.24",
134
+ "resolved": "https://registry.npmjs.org/@aitube/client/-/client-0.0.24.tgz",
135
+ "integrity": "sha512-9J3PhVabyc/aOcB5j7wF5Fxb4VORB6aoHTTY6Y+ciFi96zp1YcFeYBMmBpIDq/6KqucIJFtANtZDPoy5a/j1Og==",
136
  "dependencies": {
137
  "query-string": "^9.0.0"
138
  },
 
4323
  "integrity": "sha512-I88TYZWc9XiYHRQ4/3c5rjjfgkjhLyW2luGIheGERbNQ6OY7yTybanSpDXZa8y7VUP9YmDcYa+eyq4ca7iLqWA=="
4324
  },
4325
  "node_modules/electron-to-chromium": {
4326
+ "version": "1.4.762",
4327
+ "resolved": "https://registry.npmjs.org/electron-to-chromium/-/electron-to-chromium-1.4.762.tgz",
4328
+ "integrity": "sha512-rrFvGweLxPwwSwJOjIopy3Vr+J3cIPtZzuc74bmlvmBIgQO3VYJDvVrlj94iKZ3ukXUH64Ex31hSfRTLqvjYJQ=="
4329
  },
4330
  "node_modules/elliptic": {
4331
  "version": "6.5.4",
 
6691
  }
6692
  },
6693
  "node_modules/openai": {
6694
+ "version": "4.44.0",
6695
+ "resolved": "https://registry.npmjs.org/openai/-/openai-4.44.0.tgz",
6696
+ "integrity": "sha512-jVpDIJsBAR83rVbIHPuWRr9UkFc5DaH9ev2kt2IQAhKCs73DBRoFOa5SwtqfN7/CcBdIGBdygpmpc0gsFaV+Ow==",
6697
  "dependencies": {
6698
  "@types/node": "^18.11.18",
6699
  "@types/node-fetch": "^2.6.4",
 
7922
  "integrity": "sha512-NnzSOEKyv4I83qbuKw9ROtJrrT6Z/Xt7I0HiP/e6H6GnpeTDvzwGIGeJ8slai+VwODSHQDooW2CAilJwT9SpRg=="
7923
  },
7924
  "node_modules/styled-components": {
7925
+ "version": "6.1.11",
7926
+ "resolved": "https://registry.npmjs.org/styled-components/-/styled-components-6.1.11.tgz",
7927
+ "integrity": "sha512-Ui0jXPzbp1phYij90h12ksljKGqF8ncGx+pjrNPsSPhbUUjWT2tD1FwGo2LF6USCnbrsIhNngDfodhxbegfEOA==",
7928
  "dependencies": {
7929
  "@emotion/is-prop-valid": "1.2.2",
7930
  "@emotion/unitless": "0.8.1",
package.json CHANGED
@@ -11,7 +11,7 @@
11
  },
12
  "dependencies": {
13
  "@aitube/clap": "0.0.16",
14
- "@aitube/client": "0.0.23",
15
  "@aitube/engine": "0.0.6",
16
  "@huggingface/hub": "0.12.3-oauth",
17
  "@huggingface/inference": "^2.6.7",
 
11
  },
12
  "dependencies": {
13
  "@aitube/clap": "0.0.16",
14
+ "@aitube/client": "0.0.24",
15
  "@aitube/engine": "0.0.6",
16
  "@huggingface/hub": "0.12.3-oauth",
17
  "@huggingface/inference": "^2.6.7",
src/app/api/generators/music/generateMusicAsBase64.ts ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { sleep } from "@/lib/utils/sleep"
2
+ import { generateMusicWithMusicgen } from "./generateMusicWithMusicgen"
3
+
4
+ // apparently if we ask to generate like 4 minutes of audio, it crashes
5
+ const maxAudioDurationInSec = 120
6
+
7
+ // generate music
8
+ // this may generate multiple tracks (one after another)
9
+ // if the durationInSec parameter is larger than the max audio duration
10
+ export async function generateMusicAsBase64({
11
+ prompt,
12
+ durationInSec,
13
+ hd = false,
14
+ }: {
15
+ prompt: string
16
+ durationInSec: number
17
+
18
+ // use diffusion (increases quality, but requires more RAM)
19
+ hd?: boolean
20
+ }): Promise<string[]> {
21
+
22
+ const musicPrompt = prompt || ""
23
+
24
+ if (durationInSec < 1 || !musicPrompt) { return [] }
25
+
26
+ if (durationInSec > maxAudioDurationInSec) {
27
+ const halfTheDuration = Math.round(durationInSec / 2)
28
+
29
+ // no! we shouldn't generate them in parallel
30
+ // or at least, no now, because we only have ONE music server!
31
+ // const chunks = await Promise.all([
32
+ // generateMusic({ video, durationInSec: halfTheDuration })
33
+ //])
34
+ // return chunks.reduce((acc, tracks) => ([...acc, ...tracks]), [])
35
+
36
+ // instead, let's play it safe and generate them one after another
37
+ let chunks: string[] = []
38
+ const partA = await generateMusicAsBase64({ prompt, hd, durationInSec: halfTheDuration })
39
+ if (partA) { chunks = chunks.concat(partA) }
40
+
41
+ const partB = await generateMusicAsBase64({ prompt, hd, durationInSec: halfTheDuration })
42
+ if (partB) { chunks = chunks.concat(partB) }
43
+
44
+ return [...partA, ...partB]
45
+ }
46
+
47
+ let musicTracks: string[] = []
48
+
49
+ const musicParams = {
50
+ prompt: musicPrompt,
51
+ durationInSec,
52
+ hd,
53
+ }
54
+ try {
55
+ console.log(` |- generating ${durationInSec} seconds of music..`)
56
+ const musicTrack = await generateMusicWithMusicgen(musicParams)
57
+ if (!musicTrack?.length) { throw new Error("audio is too short to be valid!")}
58
+ musicTracks.push(musicTrack)
59
+ } catch (err) {
60
+ try {
61
+ await sleep(4000)
62
+ const musicTrack = await generateMusicWithMusicgen(musicParams)
63
+ if (!musicTrack?.length) { throw new Error("audio is too short to be valid!")}
64
+ musicTracks.push(musicTrack)
65
+ } catch (err2) {
66
+ console.error(` |- failed to generate the music (yes, we retried after a delay)`)
67
+ }
68
+ }
69
+
70
+
71
+ return musicTracks
72
+ }
src/app/api/generators/music/generateMusicWithMusicgen.ts ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { addBase64Header } from "@/lib/data/addBase64Header"
2
+
3
+ import { tryApiCalls } from "../../utils/tryApiCall"
4
+ import { MusicGenerationParams } from "./types"
5
+
6
+ const gradioSpaceApiUrl = `https://jbilcke-hf-ai-tube-model-musicgen.hf.space`
7
+ const huggingFaceSpace = "jbilcke-hf/ai-tube-model-musicgen"
8
+ const microserviceApiKey = `${process.env.MICROSERVICE_API_SECRET_TOKEN || ""}`
9
+
10
+ /**
11
+ * Note: this generates a base64 mp3 file
12
+ */
13
+ export async function generateMusicWithMusicgen({
14
+ prompt,
15
+ durationInSec,
16
+ hd,
17
+ debug = false,
18
+ neverThrow = false,
19
+ }: MusicGenerationParams): Promise<string> {
20
+
21
+ const actualFunction = async () => {
22
+
23
+ const res = await fetch(gradioSpaceApiUrl + (gradioSpaceApiUrl.endsWith("/") ? "" : "/") + "api/predict", {
24
+ method: "POST",
25
+ headers: {
26
+ "Content-Type": "application/json",
27
+ // Authorization: `Bearer ${token}`,
28
+ },
29
+ body: JSON.stringify({
30
+ fn_index: 0, // <- important!
31
+ data: [
32
+ microserviceApiKey, // string in 'Secret Token' Textbox component
33
+ "facebook/musicgen-stereo-large", // string in 'Model' Radio component
34
+ "", // string in 'Model Path (custom models)' Textbox component
35
+
36
+ // can be one of Default or MultiBand_Diffusion
37
+ // since speed isn't an issue for AI Tube,
38
+ // we can afford to use the MultiBand Decoder
39
+ hd ? "MultiBand_Diffusion" : "Default",
40
+
41
+ prompt, // string in 'Input Text' Textbox component
42
+ null, // blob in 'File' Audio component
43
+ durationInSec, // number (numeric value between 1 and 300) in 'Duration' Slider component
44
+ 250, // number in 'Top-k' Number component
45
+ 0, // number in 'Top-p' Number component
46
+ 1, // number in 'Temperature' Number component
47
+ 3, // number in 'Classifier Free Guidance' Number component
48
+ ],
49
+ }),
50
+ cache: "no-store",
51
+ // we can also use this (see https://vercel.com/blog/vercel-cache-api-nextjs-cache)
52
+ // next: { revalidate: 1 }
53
+ })
54
+
55
+ if (res.status !== 200) {
56
+ throw new Error('Failed to fetch data')
57
+ }
58
+
59
+
60
+ const { data } = await res.json()
61
+
62
+ // console.log("data:", data)
63
+ // Recommendation: handle errors
64
+ if (res.status !== 200 || !Array.isArray(data)) {
65
+ // This will activate the closest `error.js` Error Boundary
66
+ throw new Error(`Failed to fetch data (status: ${res.status})`)
67
+ }
68
+ // console.log("data:", data.slice(0, 50))
69
+
70
+ if (!data[0]) {
71
+ throw new Error(`the returned music was empty`)
72
+ }
73
+
74
+ console.log("data:", data[0].slice(0, 60))
75
+ return addBase64Header(data[0] as string, "mp3")
76
+ }
77
+
78
+ try {
79
+ if (!prompt?.length) {
80
+ throw new Error(`prompt is too short!`)
81
+ }
82
+
83
+ const result = await tryApiCalls({
84
+ func: actualFunction,
85
+ huggingFaceSpace,
86
+ debug,
87
+ failureMessage: "failed to generate the music"
88
+ })
89
+
90
+ return result
91
+ } catch (err) {
92
+ if (neverThrow) {
93
+ console.error(`generateVoiceWithMusicgen():`, err)
94
+ return ""
95
+ } else {
96
+ throw err
97
+ }
98
+ }
99
+ }
src/app/api/generators/music/types.ts ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
// Parameters accepted by the MusicGen generator.
export type MusicGenerationParams = {
  // text description of the music to generate
  prompt: string
  // requested track duration in seconds (the MusicGen Space accepts 1 to 300)
  durationInSec: number
  // use the MultiBand_Diffusion decoder (better quality, but requires more RAM)
  hd?: boolean
  // forwarded to the API-retry helper for verbose logging
  debug?: boolean
  // if true, failures are logged and an empty string is returned instead of throwing
  neverThrow?: boolean
}
src/app/api/generators/speech/generateVoiceWithParlerTTS.ts CHANGED
@@ -3,7 +3,7 @@ import { tryApiCalls } from "../../utils/tryApiCall"
3
 
4
  const gradioSpaceApiUrl = `https://jbilcke-hf-ai-tube-model-parler-tts-mini.hf.space`
5
  const huggingFaceSpace = "jbilcke-hf/ai-tube-model-parler-tts-mini"
6
- const apiKey = `${process.env.MICROSERVICE_API_SECRET_TOKEN || ""}`
7
 
8
  /**
9
  * Note: this generates a MP3 file
@@ -34,7 +34,7 @@ export async function generateSpeechWithParlerTTS({
34
  body: JSON.stringify({
35
  fn_index: 0, // <- important!
36
  data: [
37
- apiKey,
38
  text,
39
  audioId,
40
  ],
 
3
 
4
  const gradioSpaceApiUrl = `https://jbilcke-hf-ai-tube-model-parler-tts-mini.hf.space`
5
  const huggingFaceSpace = "jbilcke-hf/ai-tube-model-parler-tts-mini"
6
+ const microserviceApiKey = `${process.env.MICROSERVICE_API_SECRET_TOKEN || ""}`
7
 
8
  /**
9
  * Note: this generates a MP3 file
 
34
  body: JSON.stringify({
35
  fn_index: 0, // <- important!
36
  data: [
37
+ microserviceApiKey,
38
  text,
39
  audioId,
40
  ],
src/app/api/v1/create/index.ts CHANGED
@@ -8,6 +8,8 @@ import { parseRawStringToYAML } from "@/app/api/parsers/parseRawStringToYAML"
8
  import { LatentStory } from "@/app/api/v1/types"
9
 
10
  import { systemPrompt } from "./systemPrompt"
 
 
11
 
12
  // a helper to generate Clap stories from a few sentences
13
  // this is mostly used by external apps such as the Stories Factory
@@ -177,5 +179,18 @@ Output: `
177
  currentElapsedTimeInMs += defaultSegmentDurationInMs
178
  }
179
 
 
 
 
 
 
 
 
 
 
 
 
 
 
180
  return clap
181
  }
 
8
  import { LatentStory } from "@/app/api/v1/types"
9
 
10
  import { systemPrompt } from "./systemPrompt"
11
+ import { generateMusicPrompts } from "../edit/music/generateMusicPrompt"
12
+ import { clapToLatentStory } from "../edit/entities/clapToLatentStory"
13
 
14
  // a helper to generate Clap stories from a few sentences
15
  // this is mostly used by external apps such as the Stories Factory
 
179
  currentElapsedTimeInMs += defaultSegmentDurationInMs
180
  }
181
 
182
+ // one more thing: music!
183
+ let musicPrompts: string[] = []
184
+
185
+ try {
186
+ musicPrompts = await generateMusicPrompts({
187
+ prompt,
188
+ latentStory: await clapToLatentStory(clap)
189
+ })
190
+ } catch (err) {
191
+ console.error(`[api/v1/create] failed to generate music prompts`)
192
+ musicPrompts.push("lofi hiphop loop")
193
+ }
194
+
195
  return clap
196
  }
src/app/api/v1/edit/dialogues/route.ts CHANGED
@@ -1,15 +1,13 @@
1
  import { NextResponse, NextRequest } from "next/server"
2
-
3
  import { ClapProject, ClapSegment, ClapSegmentCategory, newClap, parseClap, serializeClap } from "@aitube/clap"
 
4
 
5
-
6
- import { processShot } from "./processShot"
7
- import queryString from "query-string"
8
  import { parseCompletionMode } from "@/app/api/parsers/parseCompletionMode"
9
  import { throwIfInvalidToken } from "@/app/api/v1/auth/throwIfInvalidToken"
10
- import { ClapCompletionMode } from "@aitube/client"
11
  import { parseTurbo } from "@/app/api/parsers/parseTurbo"
12
 
 
13
  // a helper to generate speech for a Clap
14
  export async function POST(req: NextRequest) {
15
  await throwIfInvalidToken(req.headers.get("Authorization"))
 
1
  import { NextResponse, NextRequest } from "next/server"
2
+ import queryString from "query-string"
3
  import { ClapProject, ClapSegment, ClapSegmentCategory, newClap, parseClap, serializeClap } from "@aitube/clap"
4
+ import { ClapCompletionMode } from "@aitube/client"
5
 
 
 
 
6
  import { parseCompletionMode } from "@/app/api/parsers/parseCompletionMode"
7
  import { throwIfInvalidToken } from "@/app/api/v1/auth/throwIfInvalidToken"
 
8
  import { parseTurbo } from "@/app/api/parsers/parseTurbo"
9
 
10
+ import { processShot } from "./processShot"
11
  // a helper to generate speech for a Clap
12
  export async function POST(req: NextRequest) {
13
  await throwIfInvalidToken(req.headers.get("Authorization"))
src/app/api/v1/edit/entities/generateEntityPrompts.ts CHANGED
@@ -1,5 +1,3 @@
1
- "use server"
2
-
3
  import YAML from "yaml"
4
  import { ClapSegmentCategory, generateSeed } from "@aitube/clap"
5
  import { ClapEntityPrompt } from "@aitube/client"
 
 
 
1
  import YAML from "yaml"
2
  import { ClapSegmentCategory, generateSeed } from "@aitube/clap"
3
  import { ClapEntityPrompt } from "@aitube/client"
src/app/api/v1/edit/music/generateMusic.ts ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import {
3
+ ClapProject,
4
+ ClapSegment,
5
+ getClapAssetSourceType,
6
+ filterSegments,
7
+ ClapSegmentFilteringMode,
8
+ ClapSegmentCategory,
9
+ newSegment
10
+ } from "@aitube/clap"
11
+ import { ClapCompletionMode } from "@aitube/client"
12
+ import { getSpeechBackgroundAudioPrompt } from "@aitube/engine"
13
+
14
+ import { generateSpeechWithParlerTTS } from "@/app/api/generators/speech/generateVoiceWithParlerTTS"
15
+ import { getMediaInfo } from "@/app/api/utils/getMediaInfo"
16
+ import { generateMusicWithMusicgen } from "@/app/api/generators/music/generateMusicWithMusicgen"
17
+
18
+ export async function generateMusic({
19
+ musicSegment,
20
+ existingClap,
21
+ newerClap,
22
+ mode,
23
+ turbo,
24
+ }: {
25
+ musicSegment?: ClapSegment
26
+ existingClap: ClapProject
27
+ newerClap: ClapProject
28
+ mode: ClapCompletionMode
29
+ turbo: boolean
30
+ }): Promise<void> {
31
+ if (!musicSegment) {
32
+ console.log(`generateMusic(): music segment is empty, so skipping music generation.`)
33
+ return
34
+ }
35
+ // for now we do something very basic
36
+ const prompt = musicSegment.prompt
37
+ if (!prompt) {
38
+ console.log(`generateMusic(): music prompt is empty, so skipping music generation.`)
39
+ return
40
+ }
41
+
42
+ const assetUrl = await generateMusicWithMusicgen({
43
+ prompt,
44
+ durationInSec: 10,
45
+ hd: false,
46
+ debug: true,
47
+ neverThrow: true,
48
+ })
49
+
50
+ if (!assetUrl || assetUrl?.length < 30) {
51
+ console.log(`generateMusic(): generated assetUrl is empty, so music generation failed.`)
52
+ return
53
+ }
54
+
55
+ if (mode !== ClapCompletionMode.FULL) {
56
+ console.log(`generateMusic(): adding music to a new clap file`)
57
+ newerClap.segments.push(newSegment({
58
+ ...musicSegment,
59
+ assetUrl,
60
+ }))
61
+ } else {
62
+ console.log(`generateMusic(): overwriting the music inside the existing clap file`)
63
+ // this will replace the existing clap (normally)
64
+ musicSegment.assetUrl = assetUrl
65
+ }
66
+ }
src/app/api/v1/edit/music/generateMusicPrompt.ts ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import YAML from "yaml"
3
+
4
+ import { predict } from "@/app/api/providers/huggingface/predictWithHuggingFace"
5
+ import { LatentStory } from "@/app/api/v1/types"
6
+
7
+ import { systemPrompt } from "./systemPrompt"
8
+
9
+ export async function generateMusicPrompts({
10
+ prompt = "",
11
+ latentStory = [],
12
+ turbo = false,
13
+ }: {
14
+ prompt?: string
15
+ latentStory?: LatentStory[]
16
+ turbo?: boolean
17
+ } = {
18
+ prompt: "",
19
+ latentStory: [],
20
+ turbo: false
21
+ }): Promise<string[]> {
22
+
23
+ if (!prompt.length) { throw new Error(`please provide a prompt`) }
24
+ console.log("generateMusicPrompts(): prompt:", prompt)
25
+
26
+
27
+ if (!latentStory.length) { throw new Error(`please provide a story`) }
28
+
29
+ console.log("generateMusicPrompts(): latentStory:", latentStory)
30
+
31
+ const userPrompt = `The input story is about: ${prompt}.
32
+
33
+ The input story is:
34
+ \`\`\`yaml
35
+ ${YAML.stringify(
36
+ // we need to help the LLM by marking the shots with a simple numeric ID
37
+ latentStory.map((shot, i) => ({
38
+ shot: i,
39
+ ...shot,
40
+ }))
41
+ )}
42
+ \`\`\`
43
+
44
+ # Output`
45
+
46
+ const prefix = "\""
47
+
48
+ // we don't need a lot here!
49
+ const nbMaxNewTokens = 120
50
+
51
+ // TODO use streaming for the Hugging Face prediction
52
+ //
53
+ // note that a Clap file is actually a YAML stream of documents
54
+ // so technically we could stream everything from end-to-end
55
+ // (but I haven't coded the helpers to do this yet)
56
+ let rawString = await predict({
57
+ systemPrompt,
58
+ userPrompt,
59
+ nbMaxNewTokens,
60
+ prefix,
61
+ turbo,
62
+ })
63
+
64
+ // console.log("generateEntityPrompts(): rawString: ", rawString)
65
+
66
+ let results: string[] = []
67
+
68
+ // we remove everything after the last ``` (or ``)
69
+ rawString = rawString.split(/```?/)[0].trim()
70
+ results.push(rawString)
71
+
72
+ if (!Array.isArray(results) || typeof results.at(0) !== "string" || !results) {
73
+ throw new Error(`failed to generate the output (rawString is: ${rawString})`)
74
+ }
75
+
76
+ return results
77
+ }
src/app/api/v1/edit/music/route.ts ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import { NextResponse, NextRequest } from "next/server"
import queryString from "query-string"
import { ClapProject, ClapSegment, ClapSegmentCategory, newClap, parseClap, serializeClap } from "@aitube/clap"
import { ClapCompletionMode } from "@aitube/client"

import { parseCompletionMode } from "@/app/api/parsers/parseCompletionMode"
import { throwIfInvalidToken } from "@/app/api/v1/auth/throwIfInvalidToken"
import { parseTurbo } from "@/app/api/parsers/parseTurbo"

import { generateMusic } from "./generateMusic"

// a helper to generate music for a Clap
// (the comment previously said "speech" — copy-paste from the dialogues route)
export async function POST(req: NextRequest) {
  await throwIfInvalidToken(req.headers.get("Authorization"))

  // parse the query string: c = completion mode, t = turbo flag
  const qs = queryString.parseUrl(req.url || "")
  const query = (qs || {}).query

  const mode = parseCompletionMode(query?.c)
  const turbo = parseTurbo(query?.t)

  // the request body is a serialized clap file
  const blob = await req.blob()

  const existingClap: ClapProject = await parseClap(blob)

  if (!existingClap?.segments) { throw new Error(`no segment found in the provided clap!`) }

  // console.log(`[api/edit/music] detected ${existingClap.segments.length} segments`)

  const musicSegments: ClapSegment[] = existingClap.segments.filter(s => s.category === ClapSegmentCategory.MUSIC)

  // in FULL mode we mutate and return the original clap;
  // otherwise we return a new clap carrying only the new segments
  const newerClap = mode === ClapCompletionMode.FULL ? existingClap : newClap({
    meta: existingClap.meta
  })

  if (musicSegments.length > 1) {
    throw new Error(`Error, only one music track can be generated with the V1 of the AiTube API`)
  }

  const musicSegment = musicSegments.at(0)

  await generateMusic({
    musicSegment,
    existingClap,
    newerClap,
    mode,
    turbo,
  })

  // console.log(`[api/edit/music] returning the clap augmented with music`)

  return new NextResponse(await serializeClap(newerClap), {
    status: 200,
    headers: new Headers({ "content-type": "application/x-gzip" }),
  })
}
src/app/api/v1/edit/music/systemPrompt.ts ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ export const systemPrompt: string = `
2
+ You are a backend API engine, designed to generate music prompt output from a story input.
3
+
4
+ ## Prompting guidelines
5
+
6
+ To create a music prompt, you need to combine styles with moods, plus a few other things.
7
+ 1. Please choose a base style among those categories: "Hip Hop and Rap track", "Classic track", "Jazz track", "Electronic and dance track", "Rock'n'Roll track", "Funk track", "Dubstep track", "Afrobeats", "Orchestral track", "Pop track", "Reggae track", "Metal track", "Country track", "Blues track", "Soul track", "R'n'B track", "Disco track", "Trap track", "Ambient track", "Lofi track", "Chill track", etc.
8
+ 2. Then choose a vibe: "with an happy vibe", "with a sad vibe", "with an angry vibe", "with a chill vibe", "with a romantic vibe", "with an epic vibe", "with an energetic vibe", "with a dreamy vibe", "with a mysterious vibe", "with a relaxing vibe", "with a dark vibe", "with an upbeat vibe", "with a motivational vibe", "with an inspiring vibe", "with a nostalgic vibe", "with a groovy vibe", "with a cheerful vibe", "with a melancholic vibe", "with a hopeful vibe", etc.
9
+ 3. build up a coherent description eg.: "80s pop track with bassy drums and synth", "90s rock song with loud guitars and heavy drums", "a light and cheerly EDM track, with syncopated drums, aery pads, and strong emotions bpm: 130", "A cheerful country song with acoustic guitars", "lofi slow bpm electro chill with organic samples" etc.
10
+
11
+ ## Example of input/output
12
+
13
+ Given the following input story, provided as YAML:
14
+
15
+ # Input
16
+
17
+ "A king goes to see a witch to ask if or how he can win an upcoming and challenging battle"
18
+
19
+ \`\`\`yaml
20
+ - shot: 1
21
+ comment: "King Arthus seeks the witch's guidance to win his imminent battle."
22
+ image: "Establishing shot of KING ARTHUS, nervous, wet brown hair. dressed in golden armor and a colorful cape. His face reveals a mix of concern and determination. He's standing in the bright sunshine, inside a castle's courtyard, under cloudy skies. Behind him, a group of soldiers can be seen marching towards the castle gates."
23
+ voice: "Dark sorceress of the shadows, it is time for you to serve your Lord. Tell me the augur, tell me what you foreknow. Tell me how I will cleave my ennemies to the bone, and ravage them in battle to come up victorious."
24
+ - shot: 2
25
+ comment: "The witch gives her counsel but warns of an unknown cost."
26
+ image: "close-up shot of THE WITCH, smiling cunningly, raising a finger while speaking. Background bokeh, dim lightning, menacing, mysterious."
27
+ voice: "Your Majesty, this will be a bloody battle, but I espy a way to victory for you. But if my advice you follow, victory I foresee, although at a great cost it will be."
28
+ - shot: 3
29
+ comment: "The words of the witch are sinking in, but King Arthus tries to appear strong"
30
+ image: "close-up shot on KING ARTHUS, looking concerned, somber, false confidence"
31
+ voice: "Witch with the wicked tongue, what must be done will be done. I will do everything for my people's sake. Speak now, make know the path to glory."
32
+ \`\`\
33
+
34
+ As you can see, the theme is medieval, this is for a fantasy movie. So you should generate a music like this:
35
+
36
+ ## Output
37
+
38
+ "Classical music with a symphonic orchestra, with medieval influences and a chilling mysterious vibe."
39
+ `