| """Generate image + video prompts from segments using an LLM. |
| |
| Takes segments.json (lyrics mapped to beat intervals) and produces two |
| prompts per segment via two separate LLM calls: |
| 1. Image prompt — short, SDXL-optimized (≤77 CLIP tokens) |
| 2. Video prompt — detailed motion/action description for I2V (no token limit) |
| |
| Consistency: LLM keeps all scenes within a shared setting from the style guidance. |
| Variety: LLM picks different subjects, camera angles, compositions per segment. |
| Narrative: LLM derives an overarching visual story from the lyrics. |
| """ |
|
|
import json
import os
from pathlib import Path
from typing import Optional


import anthropic
from dotenv import load_dotenv


# Load variables from a local .env file at import time so the Anthropic
# client can find its credentials — presumably ANTHROPIC_API_KEY; confirm
# against deployment config. NOTE(review): `os` is not used in this view.
load_dotenv()
|
|
| |
# Camera angles assigned round-robin to segments (index % len). Keeping the
# assignment in code (rather than asking the LLM) guarantees shot variety
# across consecutive segments.
CAMERA_ANGLES = [
    "wide establishing shot",
    "close-up",
    "aerial view",
    "low angle shot",
    "medium shot",
    "extreme wide shot",
    "over-the-shoulder perspective",
    "dutch angle",
    "tracking shot from the side",
    "bird's eye view",
    "ground-level shot",
    "silhouette against the sky",
]
|
|
| |
# Fallback quality tags appended to every image prompt when the caller does
# not supply a style-specific quality_suffix.
DEFAULT_QUALITY_SUFFIX = "8K, cinematic, atmospheric, sharp details"


# SDXL negative prompt attached verbatim to every segment to suppress common
# generation artifacts (text overlays, watermarks, cartoonish output, ...).
NEGATIVE_PROMPT = (
    "text, watermark, logo, blurry, low quality, deformed, "
    "ugly, oversaturated, cartoon, anime"
)
|
|
| |
| |
| |
|
|
# System prompt for LLM call #1: turns lyrics + setting into one short,
# SDXL-optimized scene description per segment. The model must return JSON
# of the form [{"segment": n, "scene": "..."}] (parsed by _parse_llm_json).
# Style guidance from the styles registry may be appended below this text.
IMAGE_SYSTEM_PROMPT = """\
You are a music video director. Given song lyrics, a SETTING, and a list of \
segments (each ~2 seconds long), create a visually compelling shot list for \
IMAGE generation (Stable Diffusion XL).

Rules:
1. A SETTING will be provided at the end of these instructions. ALL scenes \
MUST take place within that setting — treat it as the world of a short film. \
Never leave this world.
2. Use the LYRICS to shape the MOOD, ENERGY, and EMOTIONAL ARC of each scene. \
The lyrics dictate the vibe — if they're dark and melancholic, the visuals \
should feel heavy and somber even within the setting. If they're upbeat, the \
visuals should feel energetic.
3. When lyrics are CONCRETE and naturally fit the setting, lean into them \
heavily. For example, if the setting is a coastal drive and the lyrics say \
"waves crashing down", make that segment literally about waves crashing \
against rocks as the car passes. If the lyrics say "fading light", show the \
sun dropping below the horizon. The more specific the lyrics, the more \
directly they should influence the scene.
4. When lyrics are ABSTRACT or metaphorical (e.g. "lost in your eyes", \
"falling apart"), translate the emotion into something visual and physical \
within the setting — don't try to literally depict abstract concepts.
5. Each segment gets a UNIQUE SHOT within the shared setting — vary the \
subject, angle, and composition, but NEVER leave the world. \
CRITICAL: Every scene MUST depict ACTION or MOTION — something must be \
happening. These will be turned into short video clips, so static subjects \
like "a wooden floor", "a parked car", or "an empty room" are useless. \
Show vehicles driving, waves crashing, lights flickering, rain falling, \
fires burning — dynamic scenes only.
6. Use the assigned camera angle for each segment.
7. Segments WITHOUT lyrics (instrumental): use atmospheric, mood-driven \
details from the setting (environmental motion, weather, ambient action).
8. Write prompts as SDXL-optimized natural language descriptions. \
Keep each scene between 25-35 words. Be specific — name exact objects, \
materials, colors, and weather details. Every word must earn its place. \
Focus on CONCRETE OBJECTS and ACTIONS — what is physically in the frame \
and what is happening. SDXL needs to know what to draw, not how to feel. \
BAD: "reflections layering over glass, interior light diffused through water" — abstract mood. \
GOOD: "taxi splashing through puddle on wet street, rain falling past neon bar sign" — objects + action. \
BAD: "streetlights bleeding through downpour, darkness stretching ahead" — vague atmosphere. \
GOOD: "car windshield wipers sweeping rain, blurred traffic lights ahead, wet dashboard" — specific things. \
BAD: "water sheeting off canvas edge in a thick curtain" — SDXL will draw a curtain. \
GOOD: "water pouring off awning edge, rain splashing on sidewalk below" — plain description. \
Write like you're telling a 10-year-old what's in the picture. Simple, plain words. \
Name the objects. Name the action. Lighting and mood come from the SETTING, \
you don't need to describe them — describe what's HAPPENING. \
Use LITERAL language only — no metaphors, no poetic phrasing. SDXL interprets \
words literally. BANNED words: bleeding, drowning, bathed, kissed, dancing, \
breathing, alive, whispering, haunting, cascading, diffusing, fragmenting. \
These cause SDXL to generate unintended objects. \
Also avoid describing PROCESSES or PHYSICS — SDXL generates a single frame, \
not a sequence. "ripples expanding", "light fragmenting and reforming", \
"reflections scattering" are processes, not objects. Instead describe the \
RESULT: "rippled puddle", "blurry neon reflection in water", "wet glass". \
Say exactly what a camera would capture in ONE freeze-frame. \
Before finalizing each scene, sanity-check it: does this make physical \
sense? Could this actually exist? "pooled water on a car hood" — no, car \
hoods are curved and water runs off. "rain falling upward" — no. \
"neon sign reflected in a brick wall" — no, brick doesn't reflect. \
Only write scenes that obey basic physics and real-world logic. \
Strip camera angle phrasing from the scene text (angles are metadata, not prompt words).
9. Include lighting and color in every scene. Derive from the SETTING — \
a sunset drive = warm golden-hour light, lens flares, long shadows; \
a rainy city night = cold neon on wet surfaces, streetlight halos; \
a stormy harbour = overcast grey, dramatic cloud breaks. \
Keep lighting consistent across all scenes.
10. Do NOT include style, quality, or technical tags in the scene — these \
are appended automatically. BANNED from scenes: "cinematic", "moody", \
"atmospheric", "dramatic lighting", "film grain", "color grade", "bokeh", \
"depth of field", "35mm", "8K", "masterpiece", "best quality". \
Your scene should contain ONLY objects, actions, and setting-derived light.
11. Do NOT include text, words, or typography in the scenes.
12. Do NOT end scenes with periods. Use commas to separate phrases. \
Every character counts — periods waste a token.

Return ONLY valid JSON: a list of objects with "segment" (number) and \
"scene" (the creative description). No markdown, no explanation.\
"""
|
|
| |
| |
| |
|
|
# System prompt for LLM call #2: given the already-written image scenes,
# produce a motion description per segment for the image-to-video model.
# The model must return JSON of the form
# [{"segment": n, "video_prompt": "..."}] (parsed by _parse_llm_json).
VIDEO_SYSTEM_PROMPT = """\
You are a music video director creating motion descriptions for an \
image-to-video AI model. You will receive a list of segments, each with \
an image scene description already written. Your job is to describe \
HOW each scene should MOVE and ANIMATE.

Rules:
1. For each segment, write a detailed "video_prompt" (2-4 sentences) \
describing all motion in the scene:
   - SUBJECT MOTION: what the subject does (walking, turning, reaching, \
driving, dancing, running, etc.)
   - CAMERA MOTION: how the camera moves (slow pan left, dolly forward, \
tracking shot, crane up, handheld shake, static with zoom, etc.)
   - ENVIRONMENTAL MOTION: ambient movement (wind blowing hair/clothes, \
rain falling, leaves drifting, smoke rising, lights flickering, waves \
crashing, clouds moving, reflections rippling, etc.)
   - PACING: match the emotional energy — slow and contemplative for \
quiet moments, faster and more dynamic for intense moments.
2. Be specific and physical. Not "things move around" but "the camera \
slowly tracks forward as rain streaks across the windshield and the \
wipers sweep left to right."
3. Keep the motion consistent with the shared setting — all scenes are \
part of the same story.
4. Do NOT describe visual style, colors, or lighting — the image already \
has those. Focus ONLY on motion and action.
5. CRITICAL — ONLY animate what exists in the scene description. Do NOT \
introduce new subjects, people, or objects that are not explicitly \
mentioned. If the scene describes a landscape with no people, describe \
ONLY environmental motion (wind, water, light changes, camera movement). \
NEVER add a person walking into frame unless the scene already mentions \
a person or figure.

Return ONLY valid JSON: a list of objects with "segment" (number) and \
"video_prompt" (the motion description). No markdown, no explanation.\
"""
|
|
|
|
def _build_user_prompt(
    segments: list[dict], song_name: str, style_description: str = "",
) -> str:
    """Build the user message for the image prompt LLM call.

    Args:
        segments: Segment dicts with "segment", "start", "end" and
            (optionally) "lyrics" keys.
        song_name: Song title, included to help the LLM set the mood.
        style_description: Optional visual style direction line.

    Returns:
        Multi-line prompt: song header, full lyrics, then one line per
        segment with its time span, assigned camera angle, and lyrics
        (or "instrumental").
    """
    # Concatenate all non-empty lyrics so the LLM sees the whole arc up
    # front. Use .get() for consistency with _build_video_user_prompt and
    # to tolerate segments missing the "lyrics" key entirely.
    all_lyrics = " ".join(
        seg["lyrics"] for seg in segments if seg.get("lyrics")
    ).strip()

    lines = [
        f'Song: "{song_name}"',
        f'Full lyrics in this clip: "{all_lyrics}"',
        f"Number of segments: {len(segments)}",
    ]

    if style_description:
        lines.append(f'Visual style direction: "{style_description}"')

    lines += ["", "Segments:"]

    for i, seg in enumerate(segments):
        # Angles cycle deterministically so each segment gets framing
        # variety without relying on the LLM to pick angles.
        angle = CAMERA_ANGLES[i % len(CAMERA_ANGLES)]
        lyrics = seg.get("lyrics")
        lyrics_note = f'lyrics: "{lyrics}"' if lyrics else "instrumental"
        lines.append(
            f' {seg["segment"]}. ({seg["start"]:.1f}s–{seg["end"]:.1f}s) '
            f'[{angle}] {lyrics_note}'
        )

    return "\n".join(lines)
|
|
|
|
| def _build_video_user_prompt(segments: list[dict]) -> str: |
| """Build the user message for the video prompt LLM call.""" |
| lines = [ |
| "Generate motion descriptions for each segment.", |
| "IMPORTANT: ONLY animate elements that exist in the scene description.", |
| "Do NOT add people, figures, or objects that aren't mentioned.", |
| "", |
| "Image scenes:", |
| "", |
| ] |
|
|
| for seg in segments: |
| lyrics_note = f' (lyrics: "{seg["lyrics"]}")' if seg.get("lyrics") else " (instrumental)" |
| lines.append( |
| f' Segment {seg["segment"]}: "{seg["scene"]}"{lyrics_note}' |
| ) |
|
|
| return "\n".join(lines) |
|
|
|
|
| def _parse_llm_json(raw: str) -> list[dict]: |
| """Parse JSON from LLM response, stripping markdown fences if present.""" |
| raw = raw.strip() |
| if raw.startswith("```"): |
| raw = raw.split("\n", 1)[1] |
| raw = raw.rsplit("```", 1)[0] |
| return json.loads(raw) |
|
|
|
|
def generate_prompts(
    segments: list[dict],
    song_name: str = "Unknown",
    style_description: str = "",
    image_prompt_guidance: str = "",
    quality_suffix: str = "",
    model: str = "claude-sonnet-4-6",
) -> list[dict]:
    """Generate image + video prompts for each segment using two LLM calls.

    Call 1 produces one short SDXL scene per segment; call 2 produces a
    detailed motion description per scene for image-to-video.

    Args:
        segments: Segment dicts from the segmenter (with lyrics).
        song_name: Song title passed to the LLM for mood context.
        style_description: Description of the visual style (styles registry).
        image_prompt_guidance: Style-specific creative direction appended to
            the image system prompt (styles registry).
        quality_suffix: Style-specific quality tags appended to each prompt;
            DEFAULT_QUALITY_SUFFIX is used when empty.
        model: Anthropic model identifier.

    Returns:
        The same segments list, each dict mutated in place with added keys:
        "prompt" (scene + style suffix), "video_prompt" (I2V motion text),
        "negative_prompt", "camera_angle", and "scene" (raw LLM scene).
    """
    client = anthropic.Anthropic()

    # ---- Pass 1: one SDXL scene description per segment ----
    print(" Generating image prompts...")
    system_prompt = IMAGE_SYSTEM_PROMPT
    if image_prompt_guidance:
        system_prompt = f"{system_prompt}\n\n{image_prompt_guidance}"

    image_response = client.messages.create(
        model=model,
        max_tokens=2048,
        system=system_prompt,
        messages=[
            {
                "role": "user",
                "content": _build_user_prompt(
                    segments, song_name, style_description
                ),
            }
        ],
    )

    scene_by_segment = {
        item["segment"]: item
        for item in _parse_llm_json(image_response.content[0].text)
    }

    tail = quality_suffix or DEFAULT_QUALITY_SUFFIX
    for idx, seg in enumerate(segments):
        entry = scene_by_segment.get(seg["segment"], {})
        # Fall back to a generic scene if the LLM skipped a segment.
        scene_text = entry.get("scene", "atmospheric landscape")

        seg["scene"] = scene_text
        seg["camera_angle"] = CAMERA_ANGLES[idx % len(CAMERA_ANGLES)]
        seg["prompt"] = f"{scene_text}, {tail}"
        seg["negative_prompt"] = NEGATIVE_PROMPT

    # ---- Pass 2: motion description per scene ----
    print(" Generating video prompts...")
    video_response = client.messages.create(
        model=model,
        max_tokens=4096,
        system=VIDEO_SYSTEM_PROMPT,
        messages=[
            {"role": "user", "content": _build_video_user_prompt(segments)}
        ],
    )

    video_by_segment = {
        item["segment"]: item
        for item in _parse_llm_json(video_response.content[0].text)
    }

    for seg in segments:
        # Fall back to generic motion if the LLM skipped a segment.
        seg["video_prompt"] = video_by_segment.get(seg["segment"], {}).get(
            "video_prompt", f"smooth cinematic motion, {seg['scene']}"
        )

    return segments
|
|
|
|
| def save_segments( |
| segments: list[dict], |
| output_path: str | Path, |
| ) -> Path: |
| """Save prompt-enriched segments to JSON.""" |
| output_path = Path(output_path) |
| output_path.parent.mkdir(parents=True, exist_ok=True) |
|
|
| with open(output_path, "w") as f: |
| json.dump(segments, f, indent=2) |
|
|
| return output_path |
|
|
|
|
def run(
    data_dir: str | Path,
    song_name: Optional[str] = None,
    style_description: str = "",
    image_prompt_guidance: str = "",
    quality_suffix: str = "",
) -> list[dict]:
    """Full prompt generation pipeline: load segments, generate prompts, save.

    Reads segments.json from *data_dir*, enriches it via generate_prompts(),
    and writes the result back to the same file.

    Args:
        data_dir: Run directory containing segments.json
            (e.g. data/Gone/run_001/).
        song_name: Name of the song; defaults to the parent directory name.
        style_description: Description of the visual style (styles registry).
        image_prompt_guidance: Style-specific creative direction for image
            prompts.
        quality_suffix: Style-specific quality tags appended to each prompt.

    Returns:
        List of prompt-enriched segment dicts.
    """
    run_dir = Path(data_dir)
    resolved_name = run_dir.parent.name if song_name is None else song_name

    segments_file = run_dir / "segments.json"
    with open(segments_file) as f:
        segments = json.load(f)

    enriched = generate_prompts(
        segments,
        song_name=resolved_name,
        style_description=style_description,
        image_prompt_guidance=image_prompt_guidance,
        quality_suffix=quality_suffix,
    )
    save_segments(enriched, segments_file)

    return enriched
|
|
|
|
if __name__ == "__main__":
    import sys

    # CLI: <data_dir> is required, [song_name] optional.
    args = sys.argv[1:]
    if not args:
        print("Usage: python -m src.prompt_generator <data_dir> [song_name]")
        print(" e.g. python -m src.prompt_generator data/Gone 'Gone'")
        sys.exit(1)

    song_arg = args[1] if len(args) > 1 else None
    enriched = run(args[0], song_name=song_arg)

    # Print a truncated summary of every generated prompt for quick review.
    print(f"Generated prompts for {len(enriched)} segments:\n")
    for entry in enriched:
        lyrics_tag = f' [{entry["lyrics"]}]' if entry["lyrics"] else ""
        print(f" Seg {entry['segment']}{lyrics_tag}")
        print(f" Scene: {entry['scene']}")
        print(f" Video: {entry['video_prompt'][:100]}...")
        print(f" Prompt: {entry['prompt'][:100]}...")
        print()
|
|