"""
VoiceVerse AI — Script Generation Module.

Delivery Modes:
    Summary   — single-speaker structured narration
    Podcast   — ALEX / SAM two-host dialogue
    Song/Rap  — rhythmic retention content
    Debate    — MAYA (female, for) vs RYAN (male, against) structured debate
    Story     — narrative retelling of the source for slow, expressive audio
"""
|
|
| import os |
| import re |
| from huggingface_hub import InferenceClient |
| from utils import logger |
|
|
# Model identifier passed as ``model=`` to the chat-completion call.
MODEL_ID: str = "HuggingFaceTB/SmolLM3-3B"

# Generation settings passed as ``max_tokens=`` and ``temperature=``.
MAX_NEW_TOKENS: int = 2048
TEMPERATURE: float = 0.5
|
|
|
|
| |
| |
| |
|
|
| |
# System prompt for Summary mode: one continuous spoken narration with no
# headings, labels or markup.
_SUMMARY_SYSTEM = """\
You are a professional narrator. Produce a clear spoken summary strictly from the source material.
RULES:
1. Use ONLY facts from the source. Do NOT add outside knowledge.
2. Write as one continuous flowing narration. Do NOT use any section headings, labels, or structural markers like "Introduction", "Intro", "Key Points", "Conclusion", "Summary", "Section 1", etc.
3. Use smooth spoken transitions instead of headings. For example say "Let's start with..." or "Now moving on to..." or "To wrap things up..." instead of labeling sections.
4. Plain text only — no markdown, no bullets, no headers, no labels of any kind.
5. Write for the ear: short sentences, conversational tone.
6. Never say "the document says". Speak as the expert.
7. Output ONLY the spoken narration text, nothing else. It should read like someone is naturally talking."""


# User-message template for Summary mode; ``{context}`` is filled in with
# str.format() before the request is sent.
_SUMMARY_USER = """\
SOURCE MATERIAL:
{context}

Write a flowing spoken summary in plain sentences. Do NOT include any headings or labels like Intro, Conclusion, etc. Just speak naturally as if talking to a listener."""
|
|
|
|
| |
# System prompt for Podcast mode: a two-host dialogue in which every line is
# prefixed with an ALEX: or SAM: speaker tag.
_PODCAST_SYSTEM = """\
You are a podcast script writer. Write a two-host conversation strictly from the source material.

STRICT FORMAT — every single line must start with a speaker tag:
ALEX: <what Alex says>
SAM: <what Sam says>

RULES:
1. Alternate ALEX and SAM. Never same host twice in a row.
2. ALEX introduces topics and asks questions.
3. SAM explains concepts and answers.
4. Use ONLY information from the source. No hallucination.
5. Conversational, engaging tone.
6. No markdown, no stage directions, no lines without a speaker tag.
7. Aim for 16–24 exchanges."""


# User-message template for Podcast mode; ``{context}`` is filled in with
# str.format() before the request is sent.
_PODCAST_USER = """\
SOURCE MATERIAL:
{context}

Write the full podcast. Every line must start with ALEX: or SAM:"""
|
|
|
|
| |
# System prompt for the rap mode: unlabeled rhyming lines built from key ideas
# extracted from the source.
_RAP_SYSTEM = """\
You are a lyricist. Two steps:
STEP 1 — silently extract 5–7 key ideas from the source.
STEP 2 — write a punchy rhythmic RAP from those ideas.

RULES:
- Short punchy lines (5–8 words), fast-flow rhyme (AABB or ABAB).
- Do NOT use any section labels like [VERSE 1], [CHORUS], [HOOK], [BRIDGE] etc.
- Just write the rap lines continuously. Use a blank line to separate verses.
- The hook/chorus should repeat naturally without a label.
- Wordplay and repetition to aid retention.
- Do NOT invent facts not in the source.
- Output ONLY the lyrics, no labels, no headers."""


# User-message template for the rap mode; ``{context}`` is filled in with
# str.format() before the request is sent.
_RAP_USER = """\
SOURCE MATERIAL:
{context}

Extract the key ideas, then write the full rap. No section labels."""
|
|
|
|
| |
# System prompt for Debate mode: MAYA argues for, RYAN argues against, and
# every line is prefixed with one of those two speaker tags.
_DEBATE_SYSTEM = """\
You are a debate script writer. Write a structured two-person debate strictly grounded \
in the provided source material.

STRICT FORMAT — every single line must start with a speaker tag:
MAYA: <what Maya says>
RYAN: <what Ryan says>

CHARACTER PROFILES:
- MAYA: Takes the PRO / supporting position. Tone is confident, optimistic, forward-thinking.
- RYAN: Takes the CON / critical position. Tone is skeptical, cautious, questioning.

DEBATE STRUCTURE:
1. MAYA opens with a strong statement supporting the topic.
2. RYAN immediately challenges with a counterpoint.
3. They alternate, each directly responding to the other's previous point.
4. Both use evidence and logic from the source material only.
5. End with each debater giving a brief closing statement.

RULES:
- Alternate MAYA and RYAN. Never same debater twice in a row.
- Use ONLY information from the source material. No hallucination.
- Each turn should be 1–3 sentences — punchy, not long speeches.
- No markdown, no stage directions, no narration outside the speaker tags.
- Aim for 16–22 exchanges total."""


# User-message template for Debate mode; ``{context}`` is filled in with
# str.format() before the request is sent.
_DEBATE_USER = """\
SOURCE MATERIAL:
{context}

Write the full debate on the key topics from this material. \
Every line must start with MAYA: or RYAN:"""
|
|
|
|
| |
# System prompt for Story mode: retell the source as a short-paragraph
# narrative in plain text.
_STORY_SYSTEM = """\
You are a master storyteller. Retell the ideas from the source material as an \
immersive narrative story written for slow, expressive audio delivery.

RULES:
1. Transform factual content into a story — use characters, scenes, a narrative arc \
(beginning, middle, end). Characters can be fictional stand-ins for real concepts.
2. Use ONLY information and ideas from the source. Do NOT invent new facts.
3. Warm, descriptive storytelling voice. Vivid but calm.
4. Short paragraphs, 1–3 sentences each, separated by blank lines.
5. Plain text only — no markdown, no bullets, no headers.
6. Begin with an evocative scene-setting sentence.
7. End with a closing reflection or lesson drawn from the source.
8. Output ONLY the story text, nothing else."""


# User-message template for Story mode; ``{context}`` is filled in with
# str.format() before the request is sent.
_STORY_USER = """\
SOURCE MATERIAL:
{context}

Transform this into a rich narrative story for slow, expressive audio. \
Use short paragraphs with blank lines between them."""
|
|
|
|
| |
| |
| |
|
|
# Words that models tend to emit as section headings despite the prompts.
_HEADING_WORDS = (
    r"Introduction|Intro|Conclusion|Summary|Key\s*Points?|Overview|"
    r"Closing|Opening|Final\s*Thoughts?"
)

# Characters that may follow a heading word: colon, hyphen, en dash, em dash.
# The dashes are written as escapes so the pattern survives re-encoding.
_HEADING_SEP = r"[:\-\u2013\u2014]"

# Ordered (compiled pattern, replacement) pairs applied in sequence by
# _clean(). Order matters: reasoning blocks go before generic tags, fenced
# code before inline code, and whitespace collapsing runs last.
_CLEAN_RULES = tuple(
    (re.compile(pattern, flags), replacement)
    for pattern, flags, replacement in (
        # Model reasoning blocks, then any remaining XML/HTML-like tag.
        (r"<think>.*?</think>", re.DOTALL, ""),
        (r"<[^>]+>", 0, ""),
        # Markdown heading markers.
        (r"^#{1,6}\s+", re.MULTILINE, ""),
        # A heading word alone on its line (optionally followed by a separator).
        (
            rf"^(?:{_HEADING_WORDS}|In\s*Summary|To\s*Conclude)\s*{_HEADING_SEP}?\s*$",
            re.MULTILINE | re.IGNORECASE,
            "",
        ),
        # A heading word used as a "Label: " prefix; keep the rest of the line.
        (
            rf"^(?:{_HEADING_WORDS})\s*{_HEADING_SEP}\s+",
            re.MULTILINE | re.IGNORECASE,
            "",
        ),
        # Song section labels such as [VERSE 1] or [CHORUS].
        (r"\[(?:VERSE|CHORUS|HOOK|BRIDGE|INTRO|OUTRO)\s*\d*\]", re.IGNORECASE, ""),
        # *emphasis* / **bold** / ***both*** -> inner text.
        (r"\*{1,3}([^*]+)\*{1,3}", 0, r"\1"),
        # _emphasis_ -> inner text. Only when the underscores are not glued to
        # letters or digits and both sit on one line, so identifiers such as
        # snake_case_name or HOST_1 keep their underscores.
        (r"(?<![A-Za-z0-9])_{1,3}([^_\n]+)_{1,3}(?![A-Za-z0-9])", 0, r"\1"),
        # [label](url) -> label.
        (r"\[([^\]]+)\]\([^)]+\)", 0, r"\1"),
        # Fenced code blocks are dropped; inline code keeps its text.
        (r"```[^`]*```", re.DOTALL, ""),
        (r"`([^`]+)`", 0, r"\1"),
        # Bullet, numbered-list and blockquote prefixes.
        (r"^[\s]*[-*+]\s+", re.MULTILINE, ""),
        (r"^[\s]*\d+\.\s+", re.MULTILINE, ""),
        (r"^>\s+", re.MULTILINE, ""),
        # Horizontal rules.
        (r"^[-*_]{3,}\s*$", re.MULTILINE, ""),
        # Collapse the gaps the removals above leave behind.
        (r"\n{3,}", 0, "\n\n"),
        (r" {2,}", 0, " "),
    )
)


def _clean(text: str) -> str:
    """Strip markdown, XML-style tags and section labels from LLM output.

    Applies every rule in ``_CLEAN_RULES`` in order and trims the result.

    Args:
        text: Raw model output.

    Returns:
        Plain text with ``<think>`` blocks, tags, heading markers and heading
        labels, song section labels, emphasis markers, links, code fences,
        list/quote prefixes and horizontal rules removed; runs of blank lines
        and spaces are collapsed and the ends are stripped.
    """
    for pattern, replacement in _CLEAN_RULES:
        text = pattern.sub(replacement, text)
    return text.strip()
|
|
|
|
# Alternative spellings, as regex fragments, that are rewritten to each
# canonical speaker tag when they open a line.
_SPEAKER_ALIASES = {
    "ALEX": (r"host[\s_-]*1",),
    "SAM": (r"host[\s_-]*2",),
    "MAYA": (r"debater[\s_-]*a", r"pro", r"speaker[\s_-]*a"),
    "RYAN": (r"debater[\s_-]*b", r"con", r"speaker[\s_-]*b"),
}


def _speaker_pattern(tag: str) -> re.Pattern:
    """Compile a case-insensitive line-start matcher for *tag* and its aliases."""
    alternatives = "|".join((re.escape(tag), *_SPEAKER_ALIASES.get(tag, ())))
    # Group 1 captures leading whitespace so it can be put back unchanged.
    return re.compile(rf"^(\s*)(?:{alternatives})\s*:", re.IGNORECASE)


def _clean_dialogue(text: str, tag_a: str, tag_b: str) -> str:
    """Clean two-speaker output (podcast or debate) down to tagged lines.

    The text is first passed through ``_clean``. Each remaining line is then
    kept only if it is blank or opens with one of the two speaker tags (in any
    letter case, or as one of the aliases in ``_SPEAKER_ALIASES``); the opening
    tag is rewritten to the canonical ``TAG:`` form. Tags are matched at the
    start of a line only, so words such as "pro:" or "con:" inside a sentence
    are left alone.

    Args:
        text: Raw model output.
        tag_a: Canonical tag of the first speaker, e.g. ``"ALEX"``.
        tag_b: Canonical tag of the second speaker, e.g. ``"SAM"``.

    Returns:
        The tagged lines joined by newlines, stripped at both ends; an empty
        string when no line carries a valid tag.
    """
    text = _clean(text)
    matchers = [(tag, _speaker_pattern(tag)) for tag in (tag_a, tag_b)]

    kept = []
    for line in text.splitlines():
        if not line.strip():
            # Blank lines are preserved as turn separators.
            kept.append(line)
            continue
        for tag, pattern in matchers:
            normalised, hits = pattern.subn(
                lambda m, tag=tag: f"{m.group(1)}{tag}:", line, count=1
            )
            if hits:
                kept.append(normalised)
                break
        # Lines matching neither speaker are dropped.

    # Dropping untagged lines can leave runs of blank lines; collapse them.
    return re.sub(r"\n{3,}", "\n\n", "\n".join(kept)).strip()
|
|
|
|
| |
| |
| |
|
|
def _get_client() -> InferenceClient:
    """Build an inference client authenticated with the ``HF_TOKEN`` env var.

    Returns:
        An ``InferenceClient`` created with ``provider="hf-inference"`` and the
        token read from the environment.

    Raises:
        EnvironmentError: If ``HF_TOKEN`` is unset or empty.
    """
    hf_token = os.environ.get("HF_TOKEN")
    if hf_token:
        return InferenceClient(provider="hf-inference", token=hf_token)
    raise EnvironmentError(
        "HF_TOKEN not set. Add your Hugging Face token as a Space secret."
    )
|
|
|
|
def _call_llm(system: str, user: str) -> str:
    """Send one system + user exchange to the model and return its reply.

    Args:
        system: System prompt describing the delivery mode.
        user: User message carrying the source material.

    Returns:
        The reply text of the first choice, stripped of surrounding whitespace.

    Raises:
        EnvironmentError: Propagated from ``_get_client`` when ``HF_TOKEN`` is
            missing.
        RuntimeError: If the response has no choices, or the reply content is
            missing or blank.
    """
    client = _get_client()
    response = client.chat_completion(
        model=MODEL_ID,
        messages=[
            {"role": "system", "content": system},
            {"role": "user", "content": user},
        ],
        max_tokens=MAX_NEW_TOKENS,
        temperature=TEMPERATURE,
        top_p=0.9,
    )
    # The content field may be None and the choices list may be empty; treat
    # both as an empty reply so the caller gets the RuntimeError below rather
    # than an AttributeError or IndexError.
    choices = response.choices
    content = choices[0].message.content if choices else None
    raw = (content or "").strip()
    if not raw:
        raise RuntimeError("Model returned empty response. Please try again.")
    return raw
|
|
|
|
| |
| |
| |
|
|
def generate_script(
    context_chunks: list[str],
    mode: str = "Summary",
    sub_mode: str = "Rap",
    topic: str = "the key ideas from this document",
    *,
    max_context_chars: int = 6000,
) -> str:
    """Generate a spoken script from retrieved context chunks.

    The chunks are joined with blank lines, truncated to
    ``max_context_chars``, inserted into the prompt pair for the selected mode
    and sent to the model; the reply is then cleaned.

    Args:
        context_chunks: Source text chunks; the list is not modified.
        mode: Delivery mode, matched case-insensitively after stripping.
            ``"summary"`` and ``"podcast"`` must match exactly; any mode
            containing ``"rap"`` (e.g. ``"Song / Rap"``), ``"debate"`` or
            ``"story"`` selects that mode. Anything else falls back to
            Summary with a warning.
        sub_mode: Currently only written to the log; it does not change the
            prompt.
        topic: Currently unused.
        max_context_chars: Maximum number of characters of joined context
            sent to the model.

    Returns:
        The cleaned script. Podcast scripts keep their ``ALEX:`` / ``SAM:``
        line prefixes and Debate scripts their ``MAYA:`` / ``RYAN:`` prefixes.

    Raises:
        ValueError: If ``context_chunks`` is empty or contains only whitespace.
        RuntimeError: If the script is empty after cleaning (or the model
            returned nothing).
    """
    if not context_chunks:
        raise ValueError("No document context. Please upload or paste content first.")

    context = "\n\n".join(context_chunks)
    # Chunks that are all empty/whitespace would send an empty source to the
    # model, so reject them the same way as an empty list.
    if not context.strip():
        raise ValueError("No document context. Please upload or paste content first.")

    if len(context) > max_context_chars:
        context = context[:max_context_chars]
        logger.warning("Context truncated to %d chars", max_context_chars)

    logger.info("generate_script | mode=%s sub_mode=%s ctx=%d chars", mode, sub_mode, len(context))

    m = mode.strip().lower()

    # Pick the prompt pair; speaker_tags is set only for the dialogue modes.
    speaker_tags = None
    if m == "summary":
        system, user = _SUMMARY_SYSTEM, _SUMMARY_USER
    elif m == "podcast":
        system, user = _PODCAST_SYSTEM, _PODCAST_USER
        speaker_tags = ("ALEX", "SAM")
    elif "rap" in m:
        system, user = _RAP_SYSTEM, _RAP_USER
    elif "debate" in m:
        system, user = _DEBATE_SYSTEM, _DEBATE_USER
        speaker_tags = ("MAYA", "RYAN")
    elif "story" in m:
        system, user = _STORY_SYSTEM, _STORY_USER
    else:
        logger.warning("Unknown mode '%s' \u2014 falling back to Summary", mode)
        system, user = _SUMMARY_SYSTEM, _SUMMARY_USER

    raw = _call_llm(system, user.format(context=context))
    if speaker_tags:
        script = _clean_dialogue(raw, *speaker_tags)
    else:
        script = _clean(raw)

    if not script:
        raise RuntimeError("Script was empty after cleaning. Please try again.")

    logger.info("Script ready: %d chars", len(script))
    return script
|
|