Spaces:

Isshi14
/

CHECK

No application file

App Files Files Community

CHECK / script_gen.py

Isshi14

Upload 12 files

ebd182e verified 3 months ago

raw

history blame contribute delete

14.3 kB

	"""
	VoiceVerse AI — Script Generation Module.

	Delivery Modes:
	Summary — single-speaker flowing narration
	Podcast — ALEX / SAM two-host dialogue
	Song/Rap — rhythmic retention content (rap lyrics)
	Debate — MAYA (female, for) vs RYAN (male, against) structured debate
	Story — narrative retelling in short paragraphs
	"""

	import os
	import re
	from huggingface_hub import InferenceClient
	from utils import logger

	# Model repo ID sent as `model=` on every chat_completion request.
	MODEL_ID = "HuggingFaceTB/SmolLM3-3B"
	# Upper bound on generated tokens per request (sent as `max_tokens=`).
	MAX_NEW_TOKENS = 2048
	# Sampling temperature sent with every request.
	TEMPERATURE = 0.5


	# ══════════════════════════════════════════════════════════════════════════════
	# Prompts
	# ══════════════════════════════════════════════════════════════════════════════

	# ── Summary ───────────────────────────────────────────────────────────────────
	# System prompt for Summary mode. generate_script also uses this pair as the
	# fallback when the requested mode is not recognised.
	_SUMMARY_SYSTEM = """\
	You are a professional narrator. Produce a clear spoken summary strictly from the source material.
	RULES:
	1. Use ONLY facts from the source. Do NOT add outside knowledge.
	2. Write as one continuous flowing narration. Do NOT use any section headings, labels, or structural markers like "Introduction", "Intro", "Key Points", "Conclusion", "Summary", "Section 1", etc.
	3. Use smooth spoken transitions instead of headings. For example say "Let's start with..." or "Now moving on to..." or "To wrap things up..." instead of labeling sections.
	4. Plain text only — no markdown, no bullets, no headers, no labels of any kind.
	5. Write for the ear: short sentences, conversational tone.
	6. Never say "the document says". Speak as the expert.
	7. Output ONLY the spoken narration text, nothing else. It should read like someone is naturally talking."""

	# User-turn template; `{context}` is filled with str.format(context=...) in
	# generate_script.
	_SUMMARY_USER = """\
	SOURCE MATERIAL:
	{context}

	Write a flowing spoken summary in plain sentences. Do NOT include any headings or labels like Intro, Conclusion, etc. Just speak naturally as if talking to a listener."""


	# ── Podcast ───────────────────────────────────────────────────────────────────
	# System prompt for Podcast mode. The ALEX / SAM speaker tags requested here
	# are the same tags generate_script passes to _clean_dialogue, which drops
	# any non-blank line that does not start with one of them.
	_PODCAST_SYSTEM = """\
	You are a podcast script writer. Write a two-host conversation strictly from the source material.

	STRICT FORMAT — every single line must start with a speaker tag:
	ALEX: <what Alex says>
	SAM: <what Sam says>

	RULES:
	1. Alternate ALEX and SAM. Never same host twice in a row.
	2. ALEX introduces topics and asks questions.
	3. SAM explains concepts and answers.
	4. Use ONLY information from the source. No hallucination.
	5. Conversational, engaging tone.
	6. No markdown, no stage directions, no lines without a speaker tag.
	7. Aim for 16–24 exchanges."""

	# User-turn template; `{context}` is filled with str.format(context=...) in
	# generate_script.
	_PODCAST_USER = """\
	SOURCE MATERIAL:
	{context}

	Write the full podcast. Every line must start with ALEX: or SAM:"""


	# ── Rap ───────────────────────────────────────────────────────────────────────
	# System prompt used when the requested mode contains "rap" (e.g. "Song / Rap").
	# The output is post-processed with _clean, which also strips bracketed
	# section labels in case the model emits them anyway.
	_RAP_SYSTEM = """\
	You are a lyricist. Two steps:
	STEP 1 — silently extract 5–7 key ideas from the source.
	STEP 2 — write a punchy rhythmic RAP from those ideas.

	RULES:
	- Short punchy lines (5–8 words), fast-flow rhyme (AABB or ABAB).
	- Do NOT use any section labels like [VERSE 1], [CHORUS], [HOOK], [BRIDGE] etc.
	- Just write the rap lines continuously. Use a blank line to separate verses.
	- The hook/chorus should repeat naturally without a label.
	- Wordplay and repetition to aid retention.
	- Do NOT invent facts not in the source.
	- Output ONLY the lyrics, no labels, no headers."""

	# User-turn template; `{context}` is filled with str.format(context=...) in
	# generate_script.
	_RAP_USER = """\
	SOURCE MATERIAL:
	{context}

	Extract the key ideas, then write the full rap. No section labels."""


	# ── Debate ────────────────────────────────────────────────────────────────────
	# System prompt used when the requested mode contains "debate". The MAYA / RYAN
	# speaker tags requested here are the same tags generate_script passes to
	# _clean_dialogue, which drops any non-blank line without one of them.
	_DEBATE_SYSTEM = """\
	You are a debate script writer. Write a structured two-person debate strictly grounded \
	in the provided source material.

	STRICT FORMAT — every single line must start with a speaker tag:
	MAYA: <what Maya says>
	RYAN: <what Ryan says>

	CHARACTER PROFILES:
	- MAYA: Takes the PRO / supporting position. Tone is confident, optimistic, forward-thinking.
	- RYAN: Takes the CON / critical position. Tone is skeptical, cautious, questioning.

	DEBATE STRUCTURE:
	1. MAYA opens with a strong statement supporting the topic.
	2. RYAN immediately challenges with a counterpoint.
	3. They alternate, each directly responding to the other's previous point.
	4. Both use evidence and logic from the source material only.
	5. End with each debater giving a brief closing statement.

	RULES:
	- Alternate MAYA and RYAN. Never same debater twice in a row.
	- Use ONLY information from the source material. No hallucination.
	- Each turn should be 1–3 sentences — punchy, not long speeches.
	- No markdown, no stage directions, no narration outside the speaker tags.
	- Aim for 16–22 exchanges total."""

	# User-turn template; `{context}` is filled with str.format(context=...) in
	# generate_script.
	_DEBATE_USER = """\
	SOURCE MATERIAL:
	{context}

	Write the full debate on the key topics from this material. \
	Every line must start with MAYA: or RYAN:"""


	# ── Story ─────────────────────────────────────────────────────────────────────
	# System prompt used when the requested mode contains "story". The output is
	# post-processed with _clean (single-voice text, no speaker tags).
	_STORY_SYSTEM = """\
	You are a master storyteller. Retell the ideas from the source material as an \
	immersive narrative story written for slow, expressive audio delivery.

	RULES:
	1. Transform factual content into a story — use characters, scenes, a narrative arc \
	(beginning, middle, end). Characters can be fictional stand-ins for real concepts.
	2. Use ONLY information and ideas from the source. Do NOT invent new facts.
	3. Warm, descriptive storytelling voice. Vivid but calm.
	4. Short paragraphs, 1–3 sentences each, separated by blank lines.
	5. Plain text only — no markdown, no bullets, no headers.
	6. Begin with an evocative scene-setting sentence.
	7. End with a closing reflection or lesson drawn from the source.
	8. Output ONLY the story text, nothing else."""

	# User-turn template; `{context}` is filled with str.format(context=...) in
	# generate_script.
	_STORY_USER = """\
	SOURCE MATERIAL:
	{context}

	Transform this into a rich narrative story for slow, expressive audio. \
	Use short paragraphs with blank lines between them."""


	# ══════════════════════════════════════════════════════════════════════════════
	# Post-processing
	# ══════════════════════════════════════════════════════════════════════════════

	def _clean(text: str) -> str:
	"""Remove all markdown and XML artifacts from LLM output."""
	text = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL)
	text = re.sub(r"<[^>]+>", "", text)
	text = re.sub(r"^#{1,6}\s+", "", text, flags=re.MULTILINE)

	# Remove heading-like labels that TTS would read aloud
	# e.g. "Introduction:", "Intro:", "Conclusion:", "Key Points:", "Summary:" etc.
	text = re.sub(
	r"^(?:Introduction\|Intro\|Conclusion\|Summary\|Key\s*Points?\|Overview\|"
	r"Closing\|Opening\|Final\sThoughts?\|In\sSummary\|To\sConclude)\s[:\-—]?\s*$",
	"", text, flags=re.MULTILINE \| re.IGNORECASE
	)
	# Also remove inline heading labels at the start of a line followed by content
	text = re.sub(
	r"^(?:Introduction\|Intro\|Conclusion\|Summary\|Key\s*Points?\|Overview\|"
	r"Closing\|Opening\|Final\sThoughts?)\s[:\-—]\s+",
	"", text, flags=re.MULTILINE \| re.IGNORECASE
	)
	# Remove [VERSE 1], [CHORUS], [HOOK], [BRIDGE] etc. labels from rap/song output
	text = re.sub(r"\[(?:VERSE\|CHORUS\|HOOK\|BRIDGE\|INTRO\|OUTRO)\s\d\]", "", text, flags=re.IGNORECASE)
	text = re.sub(r"\{1,3}([^]+)\*{1,3}", r"\1", text)
	text = re.sub(r"_{1,3}([^_]+)_{1,3}", r"\1", text)
	text = re.sub(r"\[([^\]]+)\]$[^)]+$", r"\1", text)
	text = re.sub(r"```[^`]*```", "", text, flags=re.DOTALL)
	text = re.sub(r"`([^`]+)`", r"\1", text)
	text = re.sub(r"^[\s][-+]\s+", "", text, flags=re.MULTILINE)
	text = re.sub(r"^[\s]*\d+\.\s+", "", text, flags=re.MULTILINE)
	text = re.sub(r"^>\s+", "", text, flags=re.MULTILINE)
	text = re.sub(r"^[-_]{3,}\s$", "", text, flags=re.MULTILINE)
	text = re.sub(r"\n{3,}", "\n\n", text)
	text = re.sub(r" {2,}", " ", text)
	return text.strip()


	def _clean_dialogue(text: str, tag_a: str, tag_b: str) -> str:
	    """Clean two-speaker output (podcast or debate) and keep only tagged lines.

	    The text is first passed through ``_clean``. Speaker-tag variants the
	    model may produce are then rewritten to the canonical tags, and every
	    non-blank line that does not start with ``tag_a:`` or ``tag_b:`` is
	    dropped.

	    Args:
	        text: Raw model output.
	        tag_a: Canonical tag of the first speaker. "ALEX" enables the podcast
	            aliases (Host 1 / Host 2), "MAYA" enables the debate aliases
	            (Debater A/B, Pro/Con, Speaker A/B). Any other value skips alias
	            normalisation.
	        tag_b: Canonical tag of the second speaker.

	    Returns:
	        The remaining lines joined with newlines and stripped. May be the
	        empty string when no line carried a valid tag.
	    """
	    text = _clean(text)

	    # Canonical tag -> alternation of accepted spellings (case-insensitive).
	    if tag_a == "ALEX":
	        aliases = (
	            ("ALEX", r"alex|host[\s_-]*1"),
	            ("SAM", r"sam|host[\s_-]*2"),
	        )
	    elif tag_a == "MAYA":
	        aliases = (
	            ("MAYA", r"maya|debater[\s_-]*a|pro|speaker[\s_-]*a"),
	            ("RYAN", r"ryan|debater[\s_-]*b|con|speaker[\s_-]*b"),
	        )
	    else:
	        aliases = ()

	    for canonical, variants in aliases:
	        # Anchored to the start of a line: a speaker tag only counts there, and
	        # anchoring keeps mid-sentence text such as "the pro: it is cheap" from
	        # being rewritten into a speaker tag.
	        text = re.sub(
	            rf"^[ \t]*(?:{variants})[ \t]*:",
	            f"{canonical}:",
	            text,
	            flags=re.MULTILINE | re.IGNORECASE,
	        )

	    # Keep blank lines (turn separators) and lines with a valid speaker tag.
	    valid_prefixes = (f"{tag_a}:", f"{tag_b}:")
	    kept = [
	        ln for ln in text.splitlines()
	        if not ln.strip() or ln.strip().startswith(valid_prefixes)
	    ]
	    return "\n".join(kept).strip()


	# ══════════════════════════════════════════════════════════════════════════════
	# LLM client
	# ══════════════════════════════════════════════════════════════════════════════

	def _get_client() -> InferenceClient:
	    """Build an InferenceClient authenticated with the HF_TOKEN env variable.

	    Returns:
	        A client created with provider "hf-inference" and the token.

	    Raises:
	        EnvironmentError: If HF_TOKEN is missing or empty.
	    """
	    hf_token = os.environ.get("HF_TOKEN")
	    if hf_token:
	        return InferenceClient(provider="hf-inference", token=hf_token)
	    raise EnvironmentError(
	        "HF_TOKEN not set. Add your Hugging Face token as a Space secret."
	    )


	def _call_llm(system: str, user: str) -> str:
	    """Send one system + user chat turn to MODEL_ID and return the reply text.

	    Args:
	        system: Content of the system message.
	        user: Content of the user message.

	    Returns:
	        The content of the first returned choice, stripped of outer whitespace.

	    Raises:
	        EnvironmentError: Propagated from ``_get_client`` when HF_TOKEN is unset.
	        RuntimeError: If the response has no choices, or the first choice has
	            no (or only whitespace) content.
	    """
	    client = _get_client()
	    response = client.chat_completion(
	        model=MODEL_ID,
	        messages=[
	            {"role": "system", "content": system},
	            {"role": "user", "content": user},
	        ],
	        max_tokens=MAX_NEW_TOKENS,
	        temperature=TEMPERATURE,
	        top_p=0.9,
	    )
	    # `choices` can be empty and `message.content` can be None; treat both like
	    # an empty reply so the caller gets the RuntimeError below rather than an
	    # IndexError / AttributeError.
	    choices = getattr(response, "choices", None) or []
	    content = choices[0].message.content if choices else None
	    raw = (content or "").strip()
	    if not raw:
	        raise RuntimeError("Model returned empty response. Please try again.")
	    return raw


	# ══════════════════════════════════════════════════════════════════════════════
	# Public entry point
	# ══════════════════════════════════════════════════════════════════════════════

	def generate_script(
	    context_chunks: list[str],
	    mode: str = "Summary",
	    sub_mode: str = "Rap",
	    topic: str = "the key ideas from this document",
	) -> str:
	    """Generate a spoken script from retrieved document chunks.

	    Non-blank chunks are joined with blank lines, truncated to 6000
	    characters, inserted into the prompt for the requested mode, sent to the
	    model, and the reply is cleaned.

	    Args:
	        context_chunks: Source text chunks. The list is not modified. A bare
	            string is treated as a single chunk. None and whitespace-only
	            chunks are ignored.
	        mode: Delivery mode, matched case-insensitively: exactly "Summary" or
	            "Podcast", or any value containing "rap", "debate" or "story"
	            (checked in that order). Anything else falls back to Summary
	            with a warning.
	        sub_mode: Only written to the log; it does not change the prompt.
	        topic: Currently unused; kept for interface compatibility.

	    Returns:
	        The cleaned script. Podcast scripts contain only ``ALEX:`` / ``SAM:``
	        lines and debate scripts only ``MAYA:`` / ``RYAN:`` lines; the other
	        modes return untagged text.

	    Raises:
	        ValueError: If there is no non-blank context.
	        RuntimeError: If the model reply is empty, or empty after cleaning.
	    """
	    if isinstance(context_chunks, str):
	        # "\n\n".join("text") would interleave blank lines between characters.
	        context_chunks = [context_chunks]
	    chunks = [c for c in (context_chunks or []) if c and c.strip()]
	    if not chunks:
	        raise ValueError("No document context. Please upload or paste content first.")

	    context = "\n\n".join(chunks)
	    if len(context) > 6000:
	        context = context[:6000]
	        logger.warning("Context truncated to 6000 chars")

	    logger.info("generate_script | mode=%s sub_mode=%s ctx=%d chars", mode, sub_mode, len(context))

	    m = (mode or "").strip().lower()

	    # (matches, system prompt, user template, speaker tags or None).
	    # Evaluated in order; the first match wins.
	    routes = (
	        (m == "summary", _SUMMARY_SYSTEM, _SUMMARY_USER, None),
	        (m == "podcast", _PODCAST_SYSTEM, _PODCAST_USER, ("ALEX", "SAM")),
	        ("rap" in m, _RAP_SYSTEM, _RAP_USER, None),
	        ("debate" in m, _DEBATE_SYSTEM, _DEBATE_USER, ("MAYA", "RYAN")),
	        ("story" in m, _STORY_SYSTEM, _STORY_USER, None),
	    )
	    for matches, system, user_template, tags in routes:
	        if matches:
	            break
	    else:
	        logger.warning("Unknown mode '%s' — falling back to Summary", mode)
	        system, user_template, tags = _SUMMARY_SYSTEM, _SUMMARY_USER, None

	    raw = _call_llm(system, user_template.format(context=context))
	    # Dialogue modes must keep their speaker tags; everything else is plain text.
	    script = _clean_dialogue(raw, *tags) if tags else _clean(raw)

	    if not script:
	        raise RuntimeError("Script was empty after cleaning. Please try again.")

	    logger.info("Script ready: %d chars", len(script))
	    return script