jbilcke-hf's picture
jbilcke-hf HF staff
update the musicgen cluster + make ffprobe optional
874c9ec
raw
history blame
No virus
2.93 kB
import {
ClapProject,
ClapSegment,
getClapAssetSourceType,
filterSegments,
ClapSegmentFilteringMode,
ClapSegmentCategory
} from "@aitube/clap"
import { ClapCompletionMode } from "@aitube/client"
import { getSpeechBackgroundAudioPrompt } from "@aitube/engine"
import { generateSpeechWithParlerTTS } from "@/app/api/generators/speech/generateVoiceWithParlerTTS"
import { getMediaInfo } from "@/app/api/utils/getMediaInfo"
/**
 * Generates the missing dialogue audio for a single shot.
 *
 * The segments related to `shotSegment` are looked up in `existingClap`, and
 * the first one whose category is DIALOGUE is selected. If that segment has
 * no `assetUrl` yet, speech is generated for its prompt, the resulting URL
 * and source type are written back onto the segment, and (when the generated
 * media has audio and lasts more than one second) the measured duration is
 * propagated to the shot and to the segments of that shot.
 *
 * Segments are mutated in place. When `mode` is not `ClapCompletionMode.FULL`,
 * the updated dialogue segment is also pushed into `newerClap.segments`.
 *
 * @param shotSegment - the shot to process
 * @param existingClap - project in which the shot's segments are looked up
 * @param newerClap - project receiving the updated dialogue segment in non-FULL mode
 * @param mode - completion mode; decides whether `newerClap` is appended to
 * @param turbo - accepted for interface compatibility; not read by this function
 * @throws rethrows (after logging) any error raised while generating the
 *         speech or while probing the generated media
 */
export async function processShot({
  shotSegment,
  existingClap,
  newerClap,
  mode,
  turbo,
}: {
  shotSegment: ClapSegment
  existingClap: ClapProject
  newerClap: ClapProject
  mode: ClapCompletionMode
  turbo: boolean
}): Promise<void> {
  const shotSegments: ClapSegment[] = filterSegments(
    ClapSegmentFilteringMode.BOTH,
    shotSegment,
    existingClap.segments
  )

  const shotDialogueSegments: ClapSegment[] = shotSegments.filter(s =>
    s.category === ClapSegmentCategory.DIALOGUE
  )

  // only the first dialogue segment of the shot is handled
  const shotDialogueSegment: ClapSegment | undefined = shotDialogueSegments.at(0)

  console.log(`[api/edit/dialogues] processShot: shot [${shotSegment.startTimeInMs}:${shotSegment.endTimeInMs}] has ${shotSegments.length} segments (${shotDialogueSegments.length} dialogues)`)

  // nothing to generate: the shot has no dialogue at all
  if (!shotDialogueSegment) {
    console.log(`[api/edit/dialogues] processShot: there is no dialogue segment in this shot`)
    return
  }

  // nothing to generate: the dialogue already has an audio asset
  if (shotDialogueSegment.assetUrl) {
    console.log(`[api/edit/dialogues] processShot: there is already a dialogue audio: ${shotDialogueSegment.assetUrl?.slice?.(0, 50)}...`)
    return
  }

  // console.log(`[api/edit/dialogues] generating audio..`)

  try {
    // this generates a mp3
    shotDialogueSegment.assetUrl = await generateSpeechWithParlerTTS({
      text: shotDialogueSegment.prompt,
      audioId: getSpeechBackgroundAudioPrompt(
        shotSegments,
        existingClap.entityIndex,
        // TODO: use the entity description if it exists
        ["high quality", "crisp", "detailed"]
      ),
      debug: true,
    })

    shotDialogueSegment.assetSourceType = getClapAssetSourceType(shotDialogueSegment.assetUrl)

    const { durationInMs, hasAudio } = await getMediaInfo(shotDialogueSegment.assetUrl)

    // durations of one second or less (or media without audio) are ignored
    if (hasAudio && durationInMs > 1000) {
      shotDialogueSegment.assetDurationInMs = durationInMs
      shotSegment.assetDurationInMs = durationInMs

      // we update the duration of all the segments for this shot
      // (it is possible that this makes the two previous lines redundant)
      //
      // Only this shot's segments are touched: iterating over
      // existingClap.segments here would overwrite the duration of every
      // segment in the whole project.
      for (const segment of shotSegments) {
        segment.assetDurationInMs = durationInMs
      }
    }
  } catch (err) {
    console.log(`[api/edit/dialogues] processShot: failed to generate audio: ${err}`)
    throw err
  }

  console.log(`[api/edit/dialogues] processShot: generated dialogue audio: ${shotDialogueSegment.assetUrl?.slice?.(0, 50)}...`)

  // if it's partial, we need to manually add it
  if (mode !== ClapCompletionMode.FULL) {
    newerClap.segments.push(shotDialogueSegment)
  }
}