Spaces:

pratik-250620
/

MultiModal-Coherence-AI

Sleeping

App Files Files Community

MultiModal-Coherence-AI / src /planner /schema_to_text.py

pratik-250620

Upload folder using huggingface_hub

6835659 verified 3 months ago

raw

history blame contribute delete

9.1 kB

	from __future__ import annotations

	from typing import Any, Dict, List


	def _norm_list(x: Any) -> List[str]:
	if not x:
	return []
	if isinstance(x, list):
	return [str(v).strip() for v in x if str(v).strip()]
	return [str(x).strip()]


	def _join(items: List[str], sep: str = ", ") -> str:
	items = [i.strip() for i in items if i and i.strip()]
	return sep.join(items)


	def _sent(items: List[str]) -> str:
	"""Sentence-ish join. Keeps it readable."""
	items = [i.strip() for i in items if i and i.strip()]
	if not items:
	return ""
	if len(items) == 1:
	return items[0]
	return "; ".join(items)


	def plan_to_prompts(plan: Any) -> Dict[str, str]:
	    """
	    Convert the UnifiedPlanner JSON schema output into STRICT, modality-specific prompts.

	    Every prompt is built from the same extracted fields, so the text, image and
	    audio generators all receive the same semantic contract.

	    Args:
	        plan: An object exposing ``model_dump()`` or ``dict()`` (pydantic-style
	            models), a plain ``dict``, or anything accepted by ``dict(plan)``.

	    Returns:
	        {
	            "text_prompt": "...",
	            "image_prompt": "...",
	            "audio_prompt": "...",
	            "shared_brief": "..."
	        }
	    """
	    # Accept either a pydantic model or something dict-like. ``dict()`` is tried
	    # before the generic fallback because ``dict(model)`` would leave nested
	    # models unconverted and the sections below would then be ignored.
	    if hasattr(plan, "model_dump"):
	        p = plan.model_dump()
	    elif isinstance(plan, dict):
	        p = plan
	    elif callable(getattr(plan, "dict", None)):
	        p = plan.dict()
	    else:
	        # last resort
	        p = dict(plan)

	    scene_summary = _plan_text(p.get("scene_summary"))
	    domain = _plan_text(p.get("domain"))

	    # Nested sections (UnifiedPlan schema). Any of them may be missing or None;
	    # the _plan_field_* helpers treat a non-dict section as empty.
	    core_sem = p.get("core_semantics", {})
	    style_ctrl = p.get("style_controls", {})
	    img_const = p.get("image_constraints", {})
	    aud_const = p.get("audio_constraints", {})

	    # Primary entities from core_semantics.main_subjects
	    primary = _plan_field_list(core_sem, "main_subjects")
	    # Secondary entities (not in schema, but check for compatibility)
	    secondary = _norm_list(p.get("secondary_entities", []))

	    # Visual attributes from style_controls and image_constraints
	    visual_style = _plan_field_list(style_ctrl, "visual_style")
	    color_palette = _plan_field_list(style_ctrl, "color_palette")
	    lighting = _plan_field_list(style_ctrl, "lighting")
	    img_objects = _plan_field_list(img_const, "objects")
	    env_details = _plan_field_list(img_const, "environment_details")
	    visual_attrs = visual_style + color_palette + lighting + img_objects + env_details

	    # Style, mood and tone from style_controls (visual_style doubles as style)
	    style = visual_style
	    mood = _plan_field_list(style_ctrl, "mood_emotion")
	    tone = _plan_field_list(style_ctrl, "narrative_tone")

	    # Audio from audio_constraints
	    audio_intent = _plan_field_list(aud_const, "audio_intent")
	    sound_sources = _plan_field_list(aud_const, "sound_sources")
	    ambience = _plan_field_list(aud_const, "ambience")
	    tempo = _plan_field_text(aud_const, "tempo")

	    # Scene context from core_semantics, coerced to text so that non-string
	    # values cannot break ``.lower()`` or ``", ".join`` further down.
	    setting = _plan_field_text(core_sem, "setting")
	    time_of_day = _plan_field_text(core_sem, "time_of_day")
	    weather = _plan_field_text(core_sem, "weather")

	    # Must include/avoid: only the image constraints are used for now
	    must_include = _plan_field_list(img_const, "must_include")
	    must_avoid = _plan_field_list(img_const, "must_avoid")

	    # -------------------------
	    # SHARED BRIEF (NO INSTRUCTIONS)
	    # -------------------------
	    # Important: This is NOT "do X". It's "X is present".
	    brief_parts: List[str] = []

	    if scene_summary:
	        brief_parts.append(scene_summary)

	    if domain:
	        brief_parts.append(f"Domain: {domain}.")

	    if primary:
	        brief_parts.append(f"Primary entities: {_join(primary)}.")
	    if secondary:
	        brief_parts.append(f"Secondary entities: {_join(secondary)}.")

	    if visual_attrs:
	        brief_parts.append(f"Visual attributes: {_join(visual_attrs)}.")
	    if style:
	        brief_parts.append(f"Style: {_join(style)}.")
	    if mood:
	        brief_parts.append(f"Mood/emotion: {_join(mood)}.")
	    if tone:
	        brief_parts.append(f"Narrative tone: {_join(tone)}.")

	    if must_include:
	        brief_parts.append(f"Must include: {_join(must_include)}.")
	    if must_avoid:
	        brief_parts.append(f"Must avoid: {_join(must_avoid)}.")

	    shared_brief = " ".join([b.strip() for b in brief_parts if b.strip()])

	    # -------------------------
	    # TEXT PROMPT (STRICT)
	    # -------------------------
	    # Goal: stop instruction-echo. The model is asked for a short, literal,
	    # grounded depiction and is told not to emit instructions or meta commentary.
	    text_lines: List[str] = [
	        "Write a vivid, literal description of the exact scene below.",
	        "Do not include instructions, bullets, headings, or meta commentary.",
	        "Do not mention 'prompt' or 'plan'.",
	        "",
	        shared_brief,
	        "",
	        "Constraints:",
	    ]
	    if must_include:
	        text_lines.append(f"- Include: {_join(must_include)}")
	    if must_avoid:
	        text_lines.append(f"- Avoid: {_join(must_avoid)}")
	    text_lines.append("- Length: 3 to 6 sentences.")

	    text_prompt = "\n".join(text_lines).strip()

	    # -------------------------
	    # IMAGE PROMPT (STRICT VISUAL CONTRACT)
	    # -------------------------
	    # Build a rich, specific prompt for better image retrieval
	    img_parts: List[str] = []

	    # Core scene
	    if scene_summary:
	        img_parts.append(scene_summary)

	    # Main subjects (most important for matching)
	    if primary:
	        img_parts.append(_join(primary))

	    # Visual details: only the first few, to avoid overly long prompts
	    key_visuals = visual_attrs[:5]
	    if key_visuals:
	        img_parts.append(_join(key_visuals))

	    # Style and mood (limited to two tags each). Style tags are the head of
	    # visual_attrs, so any tag already emitted above is skipped rather than
	    # repeated in the prompt.
	    extra_style = [tag for tag in style[:2] if tag not in key_visuals]
	    if extra_style:
	        img_parts.append(_join(extra_style))
	    if mood:
	        img_parts.append(_join(mood[:2]))

	    # Core semantics for context
	    if setting:
	        img_parts.append(setting)
	    if time_of_day:
	        img_parts.append(time_of_day)
	    if weather:
	        img_parts.append(weather)

	    # Build final prompt - more specific for retrieval
	    image_prompt = ", ".join([part for part in img_parts if part]).strip()

	    # Fallback if empty
	    if not image_prompt:
	        image_prompt = scene_summary or "scene"

	    # -------------------------
	    # AUDIO PROMPT (STRICT AUDIO CONTRACT)
	    # -------------------------
	    # Build a specific, detailed audio prompt for AudioLDM
	    aud_parts: List[str] = []

	    # Core scene context
	    if scene_summary:
	        aud_parts.append(scene_summary)

	    # Audio elements (most important), truncated to keep the prompt short
	    if sound_sources:
	        aud_parts.append("sounds of " + _join(sound_sources[:4]))
	    if ambience:
	        aud_parts.append("ambient " + _join(ambience[:3]))
	    if audio_intent:
	        aud_parts.append(_join(audio_intent))

	    # Context from core semantics; "clear"/"sunny" add no weather sound cue
	    if weather and weather.lower() not in ("clear", "sunny"):
	        aud_parts.append(weather.lower() + " weather sounds")
	    if setting:
	        aud_parts.append(setting.lower() + " environment")

	    # Tempo from audio constraints
	    if tempo:
	        aud_parts.append(tempo + " tempo")

	    # Build final prompt - specific and concise for AudioLDM
	    audio_prompt = ", ".join([part for part in aud_parts if part]).strip()

	    # Fallback if empty
	    if not audio_prompt:
	        audio_prompt = scene_summary or "ambient soundscape"

	    # Add a quality hint for AudioLDM unless the prompt already ends with one.
	    # "soundscape" is included so the fallback is not turned into
	    # "ambient soundscape soundscape".
	    if not audio_prompt.endswith(("sound", "audio", "soundscape")):
	        audio_prompt += " soundscape"

	    return {
	        "text_prompt": text_prompt,
	        "image_prompt": image_prompt,
	        "audio_prompt": audio_prompt,
	        "shared_brief": shared_brief,
	    }


	def _plan_text(value: Any) -> str:
	    """Return ``value`` as stripped text, mapping ``None`` to "" instead of "None"."""
	    return "" if value is None else str(value).strip()


	def _plan_field_list(section: Any, key: str) -> List[str]:
	    """Return ``section[key]`` as a normalized string list, or [] if ``section`` is not a dict."""
	    return _norm_list(section.get(key)) if isinstance(section, dict) else []


	def _plan_field_text(section: Any, key: str) -> str:
	    """Return ``section[key]`` as stripped text, or "" if ``section`` is not a dict."""
	    return _plan_text(section.get(key)) if isinstance(section, dict) else ""


	# Backward compatible function name (if older code imports it)
	def plan_to_canonical_text(plan: Any) -> str:
	    """
	    Legacy entry point: build all prompts and hand back only the shared brief.
	    Retained so that older imports of this name keep working.
	    """
	    prompts = plan_to_prompts(plan)
	    return prompts["shared_brief"]