Spaces:
Running
Running
Commit
·
0176e5b
1
Parent(s):
cf329f1
let's add some music
Browse files- package-lock.json +13 -13
- package.json +1 -1
- src/app/api/generators/music/generateMusicAsBase64.ts +72 -0
- src/app/api/generators/music/generateMusicWithMusicgen.ts +99 -0
- src/app/api/generators/music/types.ts +7 -0
- src/app/api/generators/speech/generateVoiceWithParlerTTS.ts +2 -2
- src/app/api/v1/create/index.ts +15 -0
- src/app/api/v1/edit/dialogues/route.ts +3 -5
- src/app/api/v1/edit/entities/generateEntityPrompts.ts +0 -2
- src/app/api/v1/edit/music/generateMusic.ts +66 -0
- src/app/api/v1/edit/music/generateMusicPrompt.ts +77 -0
- src/app/api/v1/edit/music/route.ts +57 -0
- src/app/api/v1/edit/music/systemPrompt.ts +39 -0
package-lock.json
CHANGED
@@ -9,7 +9,7 @@
|
|
9 |
"version": "0.0.0",
|
10 |
"dependencies": {
|
11 |
"@aitube/clap": "0.0.16",
|
12 |
-
"@aitube/client": "0.0.
|
13 |
"@aitube/engine": "0.0.6",
|
14 |
"@huggingface/hub": "0.12.3-oauth",
|
15 |
"@huggingface/inference": "^2.6.7",
|
@@ -130,9 +130,9 @@
|
|
130 |
}
|
131 |
},
|
132 |
"node_modules/@aitube/client": {
|
133 |
-
"version": "0.0.
|
134 |
-
"resolved": "https://registry.npmjs.org/@aitube/client/-/client-0.0.
|
135 |
-
"integrity": "sha512-
|
136 |
"dependencies": {
|
137 |
"query-string": "^9.0.0"
|
138 |
},
|
@@ -4323,9 +4323,9 @@
|
|
4323 |
"integrity": "sha512-I88TYZWc9XiYHRQ4/3c5rjjfgkjhLyW2luGIheGERbNQ6OY7yTybanSpDXZa8y7VUP9YmDcYa+eyq4ca7iLqWA=="
|
4324 |
},
|
4325 |
"node_modules/electron-to-chromium": {
|
4326 |
-
"version": "1.4.
|
4327 |
-
"resolved": "https://registry.npmjs.org/electron-to-chromium/-/electron-to-chromium-1.4.
|
4328 |
-
"integrity": "sha512-
|
4329 |
},
|
4330 |
"node_modules/elliptic": {
|
4331 |
"version": "6.5.4",
|
@@ -6691,9 +6691,9 @@
|
|
6691 |
}
|
6692 |
},
|
6693 |
"node_modules/openai": {
|
6694 |
-
"version": "4.
|
6695 |
-
"resolved": "https://registry.npmjs.org/openai/-/openai-4.
|
6696 |
-
"integrity": "sha512-
|
6697 |
"dependencies": {
|
6698 |
"@types/node": "^18.11.18",
|
6699 |
"@types/node-fetch": "^2.6.4",
|
@@ -7922,9 +7922,9 @@
|
|
7922 |
"integrity": "sha512-NnzSOEKyv4I83qbuKw9ROtJrrT6Z/Xt7I0HiP/e6H6GnpeTDvzwGIGeJ8slai+VwODSHQDooW2CAilJwT9SpRg=="
|
7923 |
},
|
7924 |
"node_modules/styled-components": {
|
7925 |
-
"version": "6.1.
|
7926 |
-
"resolved": "https://registry.npmjs.org/styled-components/-/styled-components-6.1.
|
7927 |
-
"integrity": "sha512-
|
7928 |
"dependencies": {
|
7929 |
"@emotion/is-prop-valid": "1.2.2",
|
7930 |
"@emotion/unitless": "0.8.1",
|
|
|
9 |
"version": "0.0.0",
|
10 |
"dependencies": {
|
11 |
"@aitube/clap": "0.0.16",
|
12 |
+
"@aitube/client": "0.0.24",
|
13 |
"@aitube/engine": "0.0.6",
|
14 |
"@huggingface/hub": "0.12.3-oauth",
|
15 |
"@huggingface/inference": "^2.6.7",
|
|
|
130 |
}
|
131 |
},
|
132 |
"node_modules/@aitube/client": {
|
133 |
+
"version": "0.0.24",
|
134 |
+
"resolved": "https://registry.npmjs.org/@aitube/client/-/client-0.0.24.tgz",
|
135 |
+
"integrity": "sha512-9J3PhVabyc/aOcB5j7wF5Fxb4VORB6aoHTTY6Y+ciFi96zp1YcFeYBMmBpIDq/6KqucIJFtANtZDPoy5a/j1Og==",
|
136 |
"dependencies": {
|
137 |
"query-string": "^9.0.0"
|
138 |
},
|
|
|
4323 |
"integrity": "sha512-I88TYZWc9XiYHRQ4/3c5rjjfgkjhLyW2luGIheGERbNQ6OY7yTybanSpDXZa8y7VUP9YmDcYa+eyq4ca7iLqWA=="
|
4324 |
},
|
4325 |
"node_modules/electron-to-chromium": {
|
4326 |
+
"version": "1.4.762",
|
4327 |
+
"resolved": "https://registry.npmjs.org/electron-to-chromium/-/electron-to-chromium-1.4.762.tgz",
|
4328 |
+
"integrity": "sha512-rrFvGweLxPwwSwJOjIopy3Vr+J3cIPtZzuc74bmlvmBIgQO3VYJDvVrlj94iKZ3ukXUH64Ex31hSfRTLqvjYJQ=="
|
4329 |
},
|
4330 |
"node_modules/elliptic": {
|
4331 |
"version": "6.5.4",
|
|
|
6691 |
}
|
6692 |
},
|
6693 |
"node_modules/openai": {
|
6694 |
+
"version": "4.44.0",
|
6695 |
+
"resolved": "https://registry.npmjs.org/openai/-/openai-4.44.0.tgz",
|
6696 |
+
"integrity": "sha512-jVpDIJsBAR83rVbIHPuWRr9UkFc5DaH9ev2kt2IQAhKCs73DBRoFOa5SwtqfN7/CcBdIGBdygpmpc0gsFaV+Ow==",
|
6697 |
"dependencies": {
|
6698 |
"@types/node": "^18.11.18",
|
6699 |
"@types/node-fetch": "^2.6.4",
|
|
|
7922 |
"integrity": "sha512-NnzSOEKyv4I83qbuKw9ROtJrrT6Z/Xt7I0HiP/e6H6GnpeTDvzwGIGeJ8slai+VwODSHQDooW2CAilJwT9SpRg=="
|
7923 |
},
|
7924 |
"node_modules/styled-components": {
|
7925 |
+
"version": "6.1.11",
|
7926 |
+
"resolved": "https://registry.npmjs.org/styled-components/-/styled-components-6.1.11.tgz",
|
7927 |
+
"integrity": "sha512-Ui0jXPzbp1phYij90h12ksljKGqF8ncGx+pjrNPsSPhbUUjWT2tD1FwGo2LF6USCnbrsIhNngDfodhxbegfEOA==",
|
7928 |
"dependencies": {
|
7929 |
"@emotion/is-prop-valid": "1.2.2",
|
7930 |
"@emotion/unitless": "0.8.1",
|
package.json
CHANGED
@@ -11,7 +11,7 @@
|
|
11 |
},
|
12 |
"dependencies": {
|
13 |
"@aitube/clap": "0.0.16",
|
14 |
-
"@aitube/client": "0.0.
|
15 |
"@aitube/engine": "0.0.6",
|
16 |
"@huggingface/hub": "0.12.3-oauth",
|
17 |
"@huggingface/inference": "^2.6.7",
|
|
|
11 |
},
|
12 |
"dependencies": {
|
13 |
"@aitube/clap": "0.0.16",
|
14 |
+
"@aitube/client": "0.0.24",
|
15 |
"@aitube/engine": "0.0.6",
|
16 |
"@huggingface/hub": "0.12.3-oauth",
|
17 |
"@huggingface/inference": "^2.6.7",
|
src/app/api/generators/music/generateMusicAsBase64.ts
ADDED
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import { sleep } from "@/lib/utils/sleep"
|
2 |
+
import { generateMusicWithMusicgen } from "./generateMusicWithMusicgen"
|
3 |
+
|
4 |
+
// apparently if we ask to generate like 4 minutes of audio, it crashes
|
5 |
+
const maxAudioDurationInSec = 120
|
6 |
+
|
7 |
+
// generate music
|
8 |
+
// this may generate multiple tracks (one after another)
|
9 |
+
// if the durationInSec parameter is larger than the max audio duration
|
10 |
+
/**
 * Generate music for a prompt and return the resulting track(s).
 *
 * A request longer than `maxAudioDurationInSec` is split in two parts which are
 * generated one after another (recursively), so the result can hold several tracks.
 * A single generation is attempted twice (with a pause in between) before giving up.
 *
 * @param prompt - description of the music to generate
 * @param durationInSec - total duration to generate, in seconds
 * @param hd - use diffusion (increases quality, but requires more RAM)
 * @returns the generated tracks, in playback order; an empty array when the
 *          input is unusable or when both attempts failed
 */
export async function generateMusicAsBase64({
  prompt,
  durationInSec,
  hd = false,
}: {
  prompt: string
  durationInSec: number

  // use diffusion (increases quality, but requires more RAM)
  hd?: boolean
}): Promise<string[]> {

  const musicPrompt = prompt || ""

  // Number.isFinite() also rejects NaN (which would be sent as-is to the music
  // server) and Infinity (which would make the split below recurse forever)
  if (!Number.isFinite(durationInSec) || durationInSec < 1 || !musicPrompt) { return [] }

  if (durationInSec > maxAudioDurationInSec) {
    // the second part takes whatever is left, so both parts always add up
    // to the requested duration (two rounded halves could overshoot by 1s)
    const firstPartDurationInSec = Math.round(durationInSec / 2)
    const secondPartDurationInSec = durationInSec - firstPartDurationInSec

    // no! we shouldn't generate them in parallel
    // or at least, not now, because we only have ONE music server!
    // so let's play it safe and generate them one after another
    const partA = await generateMusicAsBase64({ prompt: musicPrompt, hd, durationInSec: firstPartDurationInSec })
    const partB = await generateMusicAsBase64({ prompt: musicPrompt, hd, durationInSec: secondPartDurationInSec })

    return [...partA, ...partB]
  }

  const musicParams = {
    prompt: musicPrompt,
    durationInSec,
    hd,
  }

  const nbMaxAttempts = 2
  const delayBeforeRetryInMs = 4000

  console.log(` |- generating ${durationInSec} seconds of music..`)

  for (let attempt = 1; attempt <= nbMaxAttempts; attempt++) {
    try {
      const musicTrack = await generateMusicWithMusicgen(musicParams)
      if (!musicTrack?.length) { throw new Error("audio is too short to be valid!") }
      return [musicTrack]
    } catch (err) {
      if (attempt < nbMaxAttempts) {
        // don't lose the reason of the first failure, it helps debugging the music server
        console.warn(` |- music generation failed, retrying after a delay..`, err)
        await sleep(delayBeforeRetryInMs)
      } else {
        console.error(` |- failed to generate the music (yes, we retried after a delay)`, err)
      }
    }
  }

  return []
}
|
src/app/api/generators/music/generateMusicWithMusicgen.ts
ADDED
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import { addBase64Header } from "@/lib/data/addBase64Header"
|
2 |
+
|
3 |
+
import { tryApiCalls } from "../../utils/tryApiCall"
|
4 |
+
import { MusicGenerationParams } from "./types"
|
5 |
+
|
6 |
+
const gradioSpaceApiUrl = `https://jbilcke-hf-ai-tube-model-musicgen.hf.space`
|
7 |
+
const huggingFaceSpace = "jbilcke-hf/ai-tube-model-musicgen"
|
8 |
+
const microserviceApiKey = `${process.env.MICROSERVICE_API_SECRET_TOKEN || ""}`
|
9 |
+
|
10 |
+
/**
 * Generate a music track by calling the MusicGen Gradio space.
 *
 * Note: this generates a base64 mp3 file
 *
 * @param prompt - description of the music to generate (must not be empty)
 * @param durationInSec - duration sent to the 'Duration' slider of the space
 * @param hd - when true the "MultiBand_Diffusion" decoder is requested, otherwise "Default"
 * @param debug - forwarded to `tryApiCalls`
 * @param neverThrow - when true, failures are logged and an empty string is returned
 * @returns the first item returned by the space, passed through `addBase64Header(..., "mp3")`,
 *          or "" when the generation failed and `neverThrow` is set
 * @throws when the prompt is empty or the generation failed (unless `neverThrow` is set)
 */
export async function generateMusicWithMusicgen({
  prompt,
  durationInSec,
  hd,
  debug = false,
  neverThrow = false,
}: MusicGenerationParams): Promise<string> {

  const actualFunction = async () => {

    const baseUrl = gradioSpaceApiUrl.endsWith("/") ? gradioSpaceApiUrl : `${gradioSpaceApiUrl}/`

    const res = await fetch(`${baseUrl}api/predict`, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        // Authorization: `Bearer ${token}`,
      },
      body: JSON.stringify({
        fn_index: 0, // <- important!
        data: [
          microserviceApiKey, // string in 'Secret Token' Textbox component
          "facebook/musicgen-stereo-large", // string in 'Model' Radio component
          "", // string in 'Model Path (custom models)' Textbox component

          // can be one of Default or MultiBand_Diffusion
          // since speed isn't an issue for AI Tube,
          // we can afford to use the MultiBand Decoder
          hd ? "MultiBand_Diffusion" : "Default",

          prompt, // string in 'Input Text' Textbox component
          null, // blob in 'File' Audio component
          durationInSec, // number (numeric value between 1 and 300) in 'Duration' Slider component
          250, // number in 'Top-k' Number component
          0, // number in 'Top-p' Number component
          1, // number in 'Temperature' Number component
          3, // number in 'Classifier Free Guidance' Number component
        ],
      }),
      cache: "no-store",
      // we can also use this (see https://vercel.com/blog/vercel-cache-api-nextjs-cache)
      // next: { revalidate: 1 }
    })

    // checked once, before reading the body, and with the status in the message
    // so that HTTP failures can be told apart from malformed payloads
    if (res.status !== 200) {
      throw new Error(`Failed to fetch data (status: ${res.status})`)
    }

    const { data } = await res.json()

    if (!Array.isArray(data)) {
      throw new Error(`Failed to fetch data (status: ${res.status})`)
    }

    const track = data[0]

    // the typeof check protects the .slice() call below against non-string payloads
    if (!track || typeof track !== "string") {
      throw new Error(`the returned music was empty`)
    }

    console.log("data:", track.slice(0, 60))
    return addBase64Header(track, "mp3")
  }

  try {
    if (!prompt?.length) {
      throw new Error(`prompt is too short!`)
    }

    const result = await tryApiCalls({
      func: actualFunction,
      huggingFaceSpace,
      debug,
      failureMessage: "failed to generate the music"
    })

    return result
  } catch (err) {
    if (!neverThrow) {
      throw err
    }
    console.error(`generateMusicWithMusicgen():`, err)
    return ""
  }
}
|
src/app/api/generators/music/types.ts
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
/**
 * Parameters accepted by the music generators.
 */
export type MusicGenerationParams = {
  /** text description of the music to generate */
  prompt: string

  /** duration of the track to generate, in seconds */
  durationInSec: number

  /** request the "MultiBand_Diffusion" decoder instead of the "Default" one */
  hd?: boolean

  /** forwarded to the API call helper */
  debug?: boolean

  /** when true, a failed generation returns an empty string instead of throwing */
  neverThrow?: boolean
}
|
src/app/api/generators/speech/generateVoiceWithParlerTTS.ts
CHANGED
@@ -3,7 +3,7 @@ import { tryApiCalls } from "../../utils/tryApiCall"
|
|
3 |
|
4 |
const gradioSpaceApiUrl = `https://jbilcke-hf-ai-tube-model-parler-tts-mini.hf.space`
|
5 |
const huggingFaceSpace = "jbilcke-hf/ai-tube-model-parler-tts-mini"
|
6 |
-
const
|
7 |
|
8 |
/**
|
9 |
* Note: this generates a MP3 file
|
@@ -34,7 +34,7 @@ export async function generateSpeechWithParlerTTS({
|
|
34 |
body: JSON.stringify({
|
35 |
fn_index: 0, // <- important!
|
36 |
data: [
|
37 |
-
|
38 |
text,
|
39 |
audioId,
|
40 |
],
|
|
|
3 |
|
4 |
const gradioSpaceApiUrl = `https://jbilcke-hf-ai-tube-model-parler-tts-mini.hf.space`
|
5 |
const huggingFaceSpace = "jbilcke-hf/ai-tube-model-parler-tts-mini"
|
6 |
+
const microserviceApiKey = `${process.env.MICROSERVICE_API_SECRET_TOKEN || ""}`
|
7 |
|
8 |
/**
|
9 |
* Note: this generates a MP3 file
|
|
|
34 |
body: JSON.stringify({
|
35 |
fn_index: 0, // <- important!
|
36 |
data: [
|
37 |
+
microserviceApiKey,
|
38 |
text,
|
39 |
audioId,
|
40 |
],
|
src/app/api/v1/create/index.ts
CHANGED
@@ -8,6 +8,8 @@ import { parseRawStringToYAML } from "@/app/api/parsers/parseRawStringToYAML"
|
|
8 |
import { LatentStory } from "@/app/api/v1/types"
|
9 |
|
10 |
import { systemPrompt } from "./systemPrompt"
|
|
|
|
|
11 |
|
12 |
// a helper to generate Clap stories from a few sentences
|
13 |
// this is mostly used by external apps such as the Stories Factory
|
@@ -177,5 +179,18 @@ Output: `
|
|
177 |
currentElapsedTimeInMs += defaultSegmentDurationInMs
|
178 |
}
|
179 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
180 |
return clap
|
181 |
}
|
|
|
8 |
import { LatentStory } from "@/app/api/v1/types"
|
9 |
|
10 |
import { systemPrompt } from "./systemPrompt"
|
11 |
+
import { generateMusicPrompts } from "../edit/music/generateMusicPrompt"
|
12 |
+
import { clapToLatentStory } from "../edit/entities/clapToLatentStory"
|
13 |
|
14 |
// a helper to generate Clap stories from a few sentences
|
15 |
// this is mostly used by external apps such as the Stories Factory
|
|
|
179 |
currentElapsedTimeInMs += defaultSegmentDurationInMs
|
180 |
}
|
181 |
|
182 |
+
// one more thing: music!
|
183 |
+
let musicPrompts: string[] = []
|
184 |
+
|
185 |
+
try {
|
186 |
+
musicPrompts = await generateMusicPrompts({
|
187 |
+
prompt,
|
188 |
+
latentStory: await clapToLatentStory(clap)
|
189 |
+
})
|
190 |
+
} catch (err) {
|
191 |
+
console.error(`[api/v1/create] failed to generate music prompts`)
|
192 |
+
musicPrompts.push("lofi hiphop loop")
|
193 |
+
}
|
194 |
+
|
195 |
return clap
|
196 |
}
|
src/app/api/v1/edit/dialogues/route.ts
CHANGED
@@ -1,15 +1,13 @@
|
|
1 |
import { NextResponse, NextRequest } from "next/server"
|
2 |
-
|
3 |
import { ClapProject, ClapSegment, ClapSegmentCategory, newClap, parseClap, serializeClap } from "@aitube/clap"
|
|
|
4 |
|
5 |
-
|
6 |
-
import { processShot } from "./processShot"
|
7 |
-
import queryString from "query-string"
|
8 |
import { parseCompletionMode } from "@/app/api/parsers/parseCompletionMode"
|
9 |
import { throwIfInvalidToken } from "@/app/api/v1/auth/throwIfInvalidToken"
|
10 |
-
import { ClapCompletionMode } from "@aitube/client"
|
11 |
import { parseTurbo } from "@/app/api/parsers/parseTurbo"
|
12 |
|
|
|
13 |
// a helper to generate speech for a Clap
|
14 |
export async function POST(req: NextRequest) {
|
15 |
await throwIfInvalidToken(req.headers.get("Authorization"))
|
|
|
1 |
import { NextResponse, NextRequest } from "next/server"
|
2 |
+
import queryString from "query-string"
|
3 |
import { ClapProject, ClapSegment, ClapSegmentCategory, newClap, parseClap, serializeClap } from "@aitube/clap"
|
4 |
+
import { ClapCompletionMode } from "@aitube/client"
|
5 |
|
|
|
|
|
|
|
6 |
import { parseCompletionMode } from "@/app/api/parsers/parseCompletionMode"
|
7 |
import { throwIfInvalidToken } from "@/app/api/v1/auth/throwIfInvalidToken"
|
|
|
8 |
import { parseTurbo } from "@/app/api/parsers/parseTurbo"
|
9 |
|
10 |
+
import { processShot } from "./processShot"
|
11 |
// a helper to generate speech for a Clap
|
12 |
export async function POST(req: NextRequest) {
|
13 |
await throwIfInvalidToken(req.headers.get("Authorization"))
|
src/app/api/v1/edit/entities/generateEntityPrompts.ts
CHANGED
@@ -1,5 +1,3 @@
|
|
1 |
-
"use server"
|
2 |
-
|
3 |
import YAML from "yaml"
|
4 |
import { ClapSegmentCategory, generateSeed } from "@aitube/clap"
|
5 |
import { ClapEntityPrompt } from "@aitube/client"
|
|
|
|
|
|
|
1 |
import YAML from "yaml"
|
2 |
import { ClapSegmentCategory, generateSeed } from "@aitube/clap"
|
3 |
import { ClapEntityPrompt } from "@aitube/client"
|
src/app/api/v1/edit/music/generateMusic.ts
ADDED
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
import {
|
3 |
+
ClapProject,
|
4 |
+
ClapSegment,
|
5 |
+
getClapAssetSourceType,
|
6 |
+
filterSegments,
|
7 |
+
ClapSegmentFilteringMode,
|
8 |
+
ClapSegmentCategory,
|
9 |
+
newSegment
|
10 |
+
} from "@aitube/clap"
|
11 |
+
import { ClapCompletionMode } from "@aitube/client"
|
12 |
+
import { getSpeechBackgroundAudioPrompt } from "@aitube/engine"
|
13 |
+
|
14 |
+
import { generateSpeechWithParlerTTS } from "@/app/api/generators/speech/generateVoiceWithParlerTTS"
|
15 |
+
import { getMediaInfo } from "@/app/api/utils/getMediaInfo"
|
16 |
+
import { generateMusicWithMusicgen } from "@/app/api/generators/music/generateMusicWithMusicgen"
|
17 |
+
|
18 |
+
/**
 * Generate the audio of a music segment and store it in a clap.
 *
 * Nothing happens when the segment is missing, when it has no prompt, or when
 * the generation did not return a usable asset.
 *
 * @param musicSegment - the segment holding the music prompt
 * @param existingClap - the clap the segment comes from (not read here)
 * @param newerClap - the clap which receives a new segment when `mode` is not FULL
 * @param mode - FULL writes the asset into `musicSegment` itself, any other
 *               mode pushes a copy of the segment into `newerClap`
 * @param turbo - not used yet by the music generation
 */
export async function generateMusic({
  musicSegment,
  existingClap,
  newerClap,
  mode,
  turbo,
}: {
  musicSegment?: ClapSegment
  existingClap: ClapProject
  newerClap: ClapProject
  mode: ClapCompletionMode
  turbo: boolean
}): Promise<void> {
  if (!musicSegment) {
    console.log(`generateMusic(): music segment is empty, so skipping music generation.`)
    return
  }

  // for now we do something very basic
  const { prompt } = musicSegment
  if (!prompt) {
    console.log(`generateMusic(): music prompt is empty, so skipping music generation.`)
    return
  }

  const generatedAssetUrl = await generateMusicWithMusicgen({
    prompt,
    durationInSec: 10,
    hd: false,
    debug: true,
    neverThrow: true,
  })

  // anything shorter than 30 characters cannot be a real audio asset
  const isUsableAsset = Boolean(generatedAssetUrl) && generatedAssetUrl.length >= 30
  if (!isUsableAsset) {
    console.log(`generateMusic(): generated assetUrl is empty, so music generation failed.`)
    return
  }

  if (mode === ClapCompletionMode.FULL) {
    console.log(`generateMusic(): overwriting the music inside the existing clap file`)
    // this will replace the existing clap (normally)
    musicSegment.assetUrl = generatedAssetUrl
    return
  }

  console.log(`generateMusic(): adding music to a new clap file`)
  const segmentWithMusic = newSegment({
    ...musicSegment,
    assetUrl: generatedAssetUrl,
  })
  newerClap.segments.push(segmentWithMusic)
}
|
src/app/api/v1/edit/music/generateMusicPrompt.ts
ADDED
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
import YAML from "yaml"
|
3 |
+
|
4 |
+
import { predict } from "@/app/api/providers/huggingface/predictWithHuggingFace"
|
5 |
+
import { LatentStory } from "@/app/api/v1/types"
|
6 |
+
|
7 |
+
import { systemPrompt } from "./systemPrompt"
|
8 |
+
|
9 |
+
/**
 * Ask the LLM for a music prompt matching a story.
 *
 * @param prompt - short description of what the story is about
 * @param latentStory - the shots of the story (sent to the LLM as YAML)
 * @param turbo - forwarded to `predict`
 * @returns an array holding the generated music prompt (one item for now)
 * @throws when the prompt or the story is missing, or when the LLM answer is empty
 */
export async function generateMusicPrompts({
  prompt = "",
  latentStory = [],
  turbo = false,
}: {
  prompt?: string
  latentStory?: LatentStory[]
  turbo?: boolean
} = {}): Promise<string[]> {

  if (!prompt.length) { throw new Error(`please provide a prompt`) }
  console.log("generateMusicPrompts(): prompt:", prompt)

  if (!latentStory.length) { throw new Error(`please provide a story`) }

  console.log("generateMusicPrompts(): latentStory:", latentStory)

  const storyAsYaml = YAML.stringify(
    // we need to help the LLM by marking the shots with a simple numeric ID
    latentStory.map((shot, i) => ({
      shot: i,
      ...shot,
    }))
  )

  const userPrompt = `The input story is about: ${prompt}.

The input story is:
\`\`\`yaml
${storyAsYaml}
\`\`\`

# Output`

  const prefix = "\""

  // we don't need a lot here!
  const nbMaxNewTokens = 120

  // TODO use streaming for the Hugging Face prediction
  //
  // note that a Clap file is actually a YAML stream of documents
  // so technically we could stream everything from end-to-end
  // (but I haven't coded the helpers to do this yet)
  const rawString = await predict({
    systemPrompt,
    userPrompt,
    nbMaxNewTokens,
    prefix,
    turbo,
  })

  // console.log("generateMusicPrompts(): rawString: ", rawString)

  // we only keep what comes before the first code fence (``` or ``)
  const musicPrompt = `${rawString || ""}`.split(/```?/)[0].trim()

  // an empty answer is a failure: throwing lets the caller use its own fallback
  // instead of sending an empty prompt to the music generator
  if (!musicPrompt) {
    throw new Error(`failed to generate the output (rawString is: ${rawString})`)
  }

  return [musicPrompt]
}
|
src/app/api/v1/edit/music/route.ts
ADDED
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import { NextResponse, NextRequest } from "next/server"
|
2 |
+
import queryString from "query-string"
|
3 |
+
import { ClapProject, ClapSegment, ClapSegmentCategory, newClap, parseClap, serializeClap } from "@aitube/clap"
|
4 |
+
import { ClapCompletionMode } from "@aitube/client"
|
5 |
+
|
6 |
+
import { parseCompletionMode } from "@/app/api/parsers/parseCompletionMode"
|
7 |
+
import { throwIfInvalidToken } from "@/app/api/v1/auth/throwIfInvalidToken"
|
8 |
+
import { parseTurbo } from "@/app/api/parsers/parseTurbo"
|
9 |
+
|
10 |
+
import { generateMusic } from "./generateMusic"
|
11 |
+
|
12 |
+
// a helper to generate music for a Clap
//
// query parameters:
// - c: the completion mode (parsed by parseCompletionMode)
// - t: the turbo flag (parsed by parseTurbo)
//
// the request body is the clap to process, the response is a serialized clap
export async function POST(req: NextRequest) {
  await throwIfInvalidToken(req.headers.get("Authorization"))

  const parsedUrl = queryString.parseUrl(req.url || "")
  const query = (parsedUrl || {}).query

  const mode = parseCompletionMode(query?.c)
  const turbo = parseTurbo(query?.t)

  const requestBody = await req.blob()
  const existingClap: ClapProject = await parseClap(requestBody)

  if (!existingClap?.segments) { throw new Error(`no segment found in the provided clap!`) }

  const musicSegments: ClapSegment[] = existingClap.segments.filter(
    segment => segment.category === ClapSegmentCategory.MUSIC
  )

  // in FULL mode we work on the clap we received,
  // otherwise we answer with a fresh clap sharing the same metadata
  let newerClap: ClapProject
  if (mode === ClapCompletionMode.FULL) {
    newerClap = existingClap
  } else {
    newerClap = newClap({
      meta: existingClap.meta
    })
  }

  if (musicSegments.length > 1) {
    throw new Error(`Error, only one music track can be generated with the V1 of the AiTube API`)
  }

  // may be undefined: generateMusic() will then skip the generation
  const musicSegment = musicSegments[0]

  await generateMusic({
    musicSegment,
    existingClap,
    newerClap,
    mode,
    turbo,
  })

  const serializedClap = await serializeClap(newerClap)

  return new NextResponse(serializedClap, {
    status: 200,
    headers: new Headers({ "content-type": "application/x-gzip" }),
  })
}
|
src/app/api/v1/edit/music/systemPrompt.ts
ADDED
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
// System prompt used to turn a story (given as YAML) into a single music prompt.
// Note: backticks have to be escaped one by one inside this template literal,
// and a backslash at the end of a line would swallow the line break.
export const systemPrompt: string = `
You are a backend API engine, designed to generate music prompt output from a story input.

## Prompting guidelines

To create a music prompt, you need to combine styles with moods, plus a few other things.
1. Please choose a base style among those categories: "Hip Hop and Rap track", "Classic track", "Jazz track", "Electronic and dance track", "Rock'n'Roll track", "Funk track", "Dubstep track", "Afrobeats", "Orchestral track", "Pop track", "Reggae track", "Metal track", "Country track", "Blues track", "Soul track", "R'n'B track", "Disco track", "Trap track", "Ambient track", "Lofi track", "Chill track", etc.
2. Then choose a vibe: "with an happy vibe", "with a sad vibe", "with an angry vibe", "with a chill vibe", "with a romantic vibe", "with an epic vibe", "with an energetic vibe", "with a dreamy vibe", "with a mysterious vibe", "with a relaxing vibe", "with a dark vibe", "with an upbeat vibe", "with a motivational vibe", "with an inspiring vibe", "with a nostalgic vibe", "with a groovy vibe", "with a cheerful vibe", "with a melancholic vibe", "with a hopeful vibe", etc.
3. build up a coherent description eg.: "80s pop track with bassy drums and synth", "90s rock song with loud guitars and heavy drums", "a light and cheerly EDM track, with syncopated drums, aery pads, and strong emotions bpm: 130", "A cheerful country song with acoustic guitars", "lofi slow bpm electro chill with organic samples" etc.

## Example of input/output

Given the following input story, provided as YAML:

# Input

"A king goes to see a witch to ask if or how he can win an upcoming and challenging battle"

\`\`\`yaml
- shot: 1
  comment: "King Arthus seeks the witch's guidance to win his imminent battle."
  image: "Establishing shot of KING ARTHUS, nervous, wet brown hair. dressed in golden armor and a colorful cape. His face reveals a mix of concern and determination. He's standing in the bright sunshine, inside a castle's courtyard, under cloudy skies. Behind him, a group of soldiers can be seen marching towards the castle gates."
  voice: "Dark sorceress of the shadows, it is time for you to serve your Lord. Tell me the augur, tell me what you foreknow. Tell me how I will cleave my ennemies to the bone, and ravage them in battle to come up victorious."
- shot: 2
  comment: "The witch gives her counsel but warns of an unknown cost."
  image: "close-up shot of THE WITCH, smiling cunningly, raising a finger while speaking. Background bokeh, dim lightning, menacing, mysterious."
  voice: "Your Majesty, this will be a bloody battle, but I espy a way to victory for you. But if my advice you follow, victory I foresee, although at a great cost it will be."
- shot: 3
  comment: "The words of the witch are sinking in, but King Arthus tries to appear strong"
  image: "close-up shot on KING ARTHUS, looking concerned, somber, false confidence"
  voice: "Witch with the wicked tongue, what must be done will be done. I will do everything for my people's sake. Speak now, make know the path to glory."
\`\`\`

As you can see, the theme is medieval, this is for a fantasy movie. So you should generate a music like this:

## Output

"Classical music with a symphonic orchestra, with medieval influences and a chilling mysterious vibe."
`
|