"use server"

import { ClapProject, getValidNumber, newClap, newSegment, ClapSegmentCategory, ClapOutputType, ClapMediaOrientation } from "@aitube/clap"

import { sleep } from "@/lib/utils/sleep"
import { predict } from "@/app/api/providers/huggingface/predictWithHuggingFace"
import { parseRawStringToYAML } from "@/app/api/parsers/parseRawStringToYAML"
import { LatentStory } from "@/app/api/v1/types"

import { systemPrompt } from "./systemPrompt"
import { generateMusicPrompts } from "../edit/music/generateMusicPrompt"
import { clapToLatentStory } from "../edit/entities/clapToLatentStory"

// a helper to generate Clap stories from a few sentences
// this is mostly used by external apps such as the Stories Factory
export async function create(request: {
  prompt?: string
  width?: number
  height?: number
  turbo?: boolean
}= {
  prompt: "",
  width: 1024,
  height: 576,
  turbo: false,
}): Promise<ClapProject> {

  // we limit to 512 characters
  const prompt = `${request?.prompt || ""}`.trim().slice(0, 512)

  console.log("api/v1/create(): request:", request)

  if (!prompt.length) { throw new Error(`please provide a prompt`) }

  const width = getValidNumber(request?.width, 256, 8192, 1024)
  const height = getValidNumber(request?.height, 256, 8192, 576)
  const turbo = request?.turbo ? true : false
  
  const userPrompt = `Movie story to generate: ${prompt}

Output: `

  const prefix = "```yaml\n"
  const nbMaxNewTokens = 1400

  // TODO use streaming for the Hugging Face prediction
  //
  // note that a Clap file is actually a YAML stream of documents
  // so technically we could stream everything from end-to-end
  // (but I haven't coded the helpers to do this yet)
  let rawString = await predict({
    systemPrompt,
    userPrompt,
    nbMaxNewTokens,
    prefix,
    turbo,
  })

  // console.log("api/v1/create(): rawString: ", rawString)

  let shots: LatentStory[] = []
  
  let maybeShots = parseRawStringToYAML<LatentStory[]>(rawString, [])

  if (!Array.isArray(maybeShots) || maybeShots.length === 0) {
    console.log(`api/v1/create(): failed to generate shots.. trying again`)
    
    await sleep(2000)

    rawString = await predict({
      systemPrompt,
      userPrompt: userPrompt + ".", // we trick the Hugging Face cache
      nbMaxNewTokens,
      prefix,
      turbo,
    })
  
    // console.log("api/v1/create(): rawString: ", rawString)
  
    maybeShots = parseRawStringToYAML<LatentStory[]>(rawString, [])
    if (!Array.isArray(maybeShots) || maybeShots.length === 0) {
      console.log(`api/v1/create(): failed to generate shots for the second time, which indicates an issue with the Hugging Face API`)
    }
  }

  if (maybeShots.length) {
    shots = maybeShots
  } else {
    throw new Error(`Hugging Face Inference API failure (the model failed to generate the shots)`)
  }

  console.log(`api/v1/create(): generated ${shots.length} shots`)

  // this is approximate - TTS generation will determine the final duration of each shot
  const defaultSegmentDurationInMs = 7000

  let currentElapsedTimeInMs = 0

  const clap: ClapProject = newClap({
    meta: {
      title: prompt.split(",").shift() || "",
      description: prompt,
      synopsis: "",
      licence: "",
      orientation:
        width > height ? ClapMediaOrientation.LANDSCAPE :
        height > width ? ClapMediaOrientation.PORTRAIT :
        ClapMediaOrientation.SQUARE,
      width,
      height,
      isInteractive: false,
      isLoop: false,
      durationInMs: shots.length * defaultSegmentDurationInMs,
      defaultVideoModel: "AnimateDiff-Lightning",
    }
  })

  for (const { comment, image, voice } of shots) {

    console.log(`api/v1/create():  - ${comment}`)

    // note: it would be nice if we could have a convention saying that
    // track 0 is for videos and track 1 storyboards
    // 
    // however, that's a bit constraining as people will generate .clap
    // using all kind of tools and development experience,
    // and they may not wish to learn the Clap protocol format completely
    //
    // TL;DR: 
    // we should fix the Clap file editor to make it able to react videos
    // from any track number

    clap.segments.push(newSegment({
      track: 0,
      startTimeInMs: currentElapsedTimeInMs,
      endTimeInMs: currentElapsedTimeInMs + defaultSegmentDurationInMs,
      assetDurationInMs: defaultSegmentDurationInMs,
      category: ClapSegmentCategory.VIDEO,
      prompt: image,
      outputType: ClapOutputType.VIDEO,
      status: "to_generate",
    }))

    clap.segments.push(newSegment({
      track: 1,
      startTimeInMs: currentElapsedTimeInMs,
      endTimeInMs: currentElapsedTimeInMs + defaultSegmentDurationInMs,
      assetDurationInMs: defaultSegmentDurationInMs,
      category: ClapSegmentCategory.STORYBOARD,
      prompt: image,
      outputType: ClapOutputType.IMAGE,
      status: "to_generate",
    }))

    clap.segments.push(newSegment({
      track: 2,
      startTimeInMs: currentElapsedTimeInMs,
      endTimeInMs: currentElapsedTimeInMs + defaultSegmentDurationInMs,
      assetDurationInMs: defaultSegmentDurationInMs,
      category: ClapSegmentCategory.INTERFACE,
      prompt: comment,
      // assetUrl: `data:text/plain;base64,${btoa(comment)}`,
      assetUrl: comment,
      outputType: ClapOutputType.TEXT,
      status: "to_generate",
    }))

    clap.segments.push(newSegment({
      track: 3,
      startTimeInMs: currentElapsedTimeInMs,
      endTimeInMs: currentElapsedTimeInMs + defaultSegmentDurationInMs,
      assetDurationInMs: defaultSegmentDurationInMs,
      category: ClapSegmentCategory.DIALOGUE,
      prompt: voice,
      outputType: ClapOutputType.AUDIO,
      status: "to_generate",
    }))

    // the presence of a camera is mandatory
    clap.segments.push(newSegment({
      track: 4,
      startTimeInMs: currentElapsedTimeInMs,
      endTimeInMs: currentElapsedTimeInMs + defaultSegmentDurationInMs,
      assetDurationInMs: defaultSegmentDurationInMs,
      category: ClapSegmentCategory.CAMERA,
      prompt: "video",
      outputType: ClapOutputType.TEXT,
      status: "to_generate",
    }))

    currentElapsedTimeInMs += defaultSegmentDurationInMs
  }

  // one more thing: music!
  let musicPrompts: string[] = []
  
  try {
    musicPrompts = await generateMusicPrompts({
      prompt,
      latentStory: await clapToLatentStory(clap)
    })
    const musicPrompt = musicPrompts.at(0)
    if (!musicPrompt) { throw new Error(`not enough music prompts`) }

    // console.log("musicPrompt:", musicPrompt)

    clap.segments.push(newSegment({
      track: 5,
      startTimeInMs: 0,
      endTimeInMs: currentElapsedTimeInMs,
      assetDurationInMs: currentElapsedTimeInMs,
      category: ClapSegmentCategory.MUSIC,
      prompt: musicPrompt,
      outputType: ClapOutputType.AUDIO,
      status: "to_generate",
    }))
  } catch (err) {
    console.error(`[api/v1/create] failed to generate music prompts`)
    musicPrompts.push("lofi hiphop loop")
  }

  return clap
}