"""
modules/llm_backbone.py
──────────────────────────────────────────────────────────────────────────────
VoiceVerse Pro — LLM Script Generation Layer

Model  : set by DEFAULT_MODEL below (currently mistralai/Mistral-7B-Instruct-v0.2).
         Swap via LLMConfig.model_id for any HF-hosted chat model.
Backend: huggingface_hub.InferenceClient with provider="hf-inference"
         Forces HF's own serverless inference endpoint — avoids third-party
         providers (e.g. Together) that independently deprecate models and
         return 410 Gone errors.
Format : ChatCompletion messages API (system + user roles)

WHY NOT HuggingFaceEndpoint?
    langchain-huggingface's HuggingFaceEndpoint internally calls
    InferenceClient.post(), which was REMOVED in huggingface_hub >= 0.26.
    Using InferenceClient.chat_completion() directly is the stable path.

DESIGN RULES:
  - The LLM NEVER generates without retrieved context.
  - Context is injected verbatim into every prompt via the user message.
  - Output is structured spoken prose (transcript) or [HOST]/[GUEST] tagged
    dialogue (podcast), depending on output_mode.
  - Temperature, max_new_tokens are runtime-configurable.
"""
|
|
| from __future__ import annotations |
|
|
| import logging |
| import os |
| from dataclasses import dataclass |
| from typing import Optional |
|
|
| logger = logging.getLogger(__name__) |
|
|
|
|
| |
| |
| |
|
|
# Chat models known to work with this module's InferenceClient backend.
# NOTE(review): the module docstring mentions meta-llama/Llama-3.1-8B-Instruct
# as the default, but only Mistral is listed here — confirm which is intended.
SUPPORTED_MODELS = [
    "mistralai/Mistral-7B-Instruct-v0.2",
]


# Model used when LLMConfig.model_id is not overridden at runtime.
DEFAULT_MODEL = SUPPORTED_MODELS[0]
|
|
|
|
| |
| |
| |
|
|
@dataclass
class LLMConfig:
    """Runtime-tunable LLM generation parameters.

    Attributes:
        model_id: HF model repo id used for chat completion.
        max_new_tokens: Upper bound on tokens generated per call.
        temperature: Sampling temperature (higher = more varied output).
        hf_token: Explicit HF API token; when None, the client falls back
            to the HUGGINGFACEHUB_API_TOKEN environment variable.
        provider: Inference provider routing for InferenceClient.
    """

    model_id: str = DEFAULT_MODEL
    max_new_tokens: int = 1024
    temperature: float = 0.65
    hf_token: Optional[str] = None

    # Pinned to HF's own serverless endpoint per the module's design notes:
    # "auto" may route to third-party providers (e.g. Together) that
    # independently deprecate models and return 410 Gone errors.
    # (Previously defaulted to "auto", contradicting that documented design.)
    provider: str = "hf-inference"
|
|
|
|
| |
| |
| |
|
|
# System prompt for single-voice transcript mode: grounds the model strictly
# in retrieved context and enforces TTS-friendly plain spoken prose.
# (Fixed mojibake: a garbled "β" sequence restored to an em dash.)
SYSTEM_PROMPT = """\
You are VoiceVerse, a world-class scriptwriter for spoken-audio content.
Your ONLY source of facts is the RETRIEVED CONTEXT provided in the user message.
You MUST NOT introduce information not present in that context.
Write in a warm, engaging, conversational spoken-English style.
No markdown, no bullet points, no headers — pure spoken prose only.
The script will be read aloud by a TTS engine."""
|
|
# User-message template for transcript mode.
# Placeholders: {context}, {task_description}, {target_words}.
# (Fixed mojibake: separator rules, "•" bullets and "–" ranges restored.)
USER_TEMPLATE = """\
─────────────────────────────────────────────────────────────
RETRIEVED CONTEXT (your SOLE factual source):
{context}
─────────────────────────────────────────────────────────────

TASK:
{task_description}

FORMAT REQUIREMENTS:
• Open with a compelling hook (1–2 sentences).
• Develop the topic across 3–5 natural paragraphs drawn ONLY from the context.
• Close with a memorable takeaway or question to the listener.
• No markdown. No lists. No headers. Pure spoken prose.
• Target length: {target_words} words."""
|
|
|
|
| |
|
|
# System prompt for two-speaker podcast mode: same context-grounding rules,
# plus a strict [HOST]/[GUEST] line-tag format for dual-voice TTS splitting.
# (Fixed mojibake: garbled "β" sequences restored to em dashes.)
PODCAST_SYSTEM_PROMPT = """\
You are VoiceVerse, a world-class podcast scriptwriter.
Your ONLY source of facts is the RETRIEVED CONTEXT provided in the user message.
You MUST NOT introduce information not present in that context.
Write a natural back-and-forth dialogue between two speakers:
  HOST  — female, warm and inquisitive, guides the conversation
  GUEST — male, knowledgeable and enthusiastic, elaborates on topics
Each line MUST start with exactly "[HOST]" or "[GUEST]" followed by a space and the spoken text.
No markdown, no stage directions, no descriptions — only spoken dialogue lines.
The script will be read aloud by a TTS engine with two distinct voices."""
|
|
# User-message template for podcast mode.
# Placeholders: {context}, {task_description}, {target_words}.
# (Fixed mojibake: separator rules, "•" bullets and "–" ranges restored.)
PODCAST_USER_TEMPLATE = """\
─────────────────────────────────────────────────────────────
RETRIEVED CONTEXT (your SOLE factual source):
{context}
─────────────────────────────────────────────────────────────

TASK:
{task_description}

FORMAT REQUIREMENTS (STRICTLY FOLLOW):
• Every line must start with [HOST] or [GUEST] followed by their spoken words.
• Alternate naturally between HOST and GUEST. Aim for 8–16 exchanges.
• HOST opens and closes the episode.
• Draw ALL facts ONLY from the context above.
• No markdown. No stage directions. No headers. Only dialogue lines.
• Target total length: {target_words} words of dialogue.

Example format:
[HOST] Welcome to VoiceVerse. Today we're diving into something fascinating.
[GUEST] Thanks for having me. I've been looking forward to this conversation.
[HOST] Let's start with the basics. What should our listeners know first?
[GUEST] Great question. The most important thing to understand is..."""
|
|
|
|
| |
| |
| |
|
|
class LLMBackbone:
    """
    Generates grounded spoken-style scripts via
    huggingface_hub.InferenceClient.chat_completion().

    The inference provider comes from LLMConfig.provider; the module's design
    notes recommend "hf-inference" (HF's own serverless endpoint) to avoid
    third-party providers that independently deprecate models and return
    410 Gone errors.

    Supported output modes:
      - "Audio Transcript": plain spoken prose.
      - "Podcast (2 Speakers)": [HOST]/[GUEST] tagged dialogue for
        dual-voice TTS.
    """

    def __init__(self, config: Optional[LLMConfig] = None) -> None:
        """Store the config (defaults to LLMConfig()) and defer client creation.

        No network access happens here; the InferenceClient is created lazily
        on the first generation call via _get_client().
        """
        self.config = config or LLMConfig()
        self._client = None  # lazily built by _get_client()
        logger.info(
            "LLMBackbone initialised | model=%s | provider=%s",
            self.config.model_id,
            self.config.provider,
        )

    def generate_script(
        self,
        context_text: str,
        task_description: str,
        target_words: int = 400,
        output_mode: str = "Audio Transcript",
    ) -> str:
        """
        Generate a grounded script from retrieved context.

        Args:
            context_text: Retrieved context from RAGEngine (REQUIRED).
            task_description: High-level user instruction for the script.
            target_words: Approximate word count target.
            output_mode: "Audio Transcript" or "Podcast (2 Speakers)".

        Returns:
            Clean script text. Podcast scripts have [HOST]/[GUEST] line prefixes.

        Raises:
            ValueError: If context_text is empty or whitespace-only.
            RuntimeError: If the inference call fails (original error chained).
        """
        if not context_text or not context_text.strip():
            raise ValueError(
                "context_text must not be empty. "
                "The LLM requires retrieved context to generate."
            )

        is_podcast = output_mode == "Podcast (2 Speakers)"
        messages = self._build_messages(
            context_text, task_description, target_words, is_podcast
        )

        logger.info(
            "Calling chat_completion | model=%s | mode=%s | ~%d context chars",
            self.config.model_id,
            output_mode,
            len(context_text),
        )

        try:
            response = self._get_client().chat_completion(
                messages=messages,
                max_tokens=self.config.max_new_tokens,
                temperature=self.config.temperature,
            )
            # The message content field is optional in the chat-completion
            # schema and may be None; coerce to "" so post-processing never
            # crashes on NoneType.
            raw_output: str = response.choices[0].message.content or ""
        except Exception as exc:
            logger.error("InferenceClient call failed: %s", exc)
            raise RuntimeError(f"LLM generation failed: {exc}") from exc

        script = self._post_process(raw_output)
        logger.info(
            "Script generated | %d words | podcast=%s",
            len(script.split()),
            is_podcast,
        )
        return script

    def _build_messages(
        self,
        context: str,
        task: str,
        target_words: int,
        is_podcast: bool = False,
    ) -> list[dict]:
        """Assemble the system + user chat messages for the requested mode.

        Args:
            context: Retrieved context, injected verbatim into the user message.
            task: High-level instruction describing what to write.
            target_words: Approximate word count target.
            is_podcast: Selects the podcast prompts over the transcript prompts.

        Returns:
            Two-element list of {"role", "content"} dicts (system, then user).
        """
        if is_podcast:
            system = PODCAST_SYSTEM_PROMPT
            user_content = PODCAST_USER_TEMPLATE.format(
                context=context,
                task_description=task,
                target_words=target_words,
            )
        else:
            system = SYSTEM_PROMPT
            user_content = USER_TEMPLATE.format(
                context=context,
                task_description=task,
                target_words=target_words,
            )
        return [
            {"role": "system", "content": system},
            {"role": "user", "content": user_content},
        ]

    @staticmethod
    def _post_process(raw: str) -> str:
        """Strip stray chat-template tokens and collapse excess blank lines.

        Removes leaked instruction-format markers (e.g. [INST], <s>) that some
        models echo into their output, trims trailing whitespace per line, and
        limits consecutive blank lines to at most two.

        Args:
            raw: Raw model output.

        Returns:
            Cleaned script text with surrounding whitespace stripped.
        """
        for tag in ("[INST]", "[/INST]", "</s>", "<s>", "<<SYS>>", "<</SYS>>"):
            raw = raw.replace(tag, "")
        lines = [line.rstrip() for line in raw.splitlines()]
        cleaned: list[str] = []
        blank_count = 0
        for line in lines:
            if not line.strip():
                blank_count += 1
                # Keep at most two consecutive blank lines.
                if blank_count <= 2:
                    cleaned.append("")
            else:
                blank_count = 0
                cleaned.append(line)
        return "\n".join(cleaned).strip()

    def _get_client(self):
        """
        Lazy-load huggingface_hub.InferenceClient bound to the configured
        model and provider.

        The module's design notes recommend provider="hf-inference" (HF's own
        serverless endpoint) to avoid third-party providers (e.g. Together)
        that independently deprecate models and return 410 Gone.

        Returns:
            The cached InferenceClient instance (created on first call).

        Raises:
            EnvironmentError: If no HF API token is available via config or
                the HUGGINGFACEHUB_API_TOKEN environment variable.
        """
        if self._client is None:
            # Imported lazily so the module can be loaded without
            # huggingface_hub installed (e.g. for prompt inspection/tests).
            from huggingface_hub import InferenceClient

            token = self.config.hf_token or os.getenv("HUGGINGFACEHUB_API_TOKEN")
            if not token:
                raise EnvironmentError(
                    "Hugging Face API token not found. "
                    "Set HUGGINGFACEHUB_API_TOKEN in your .env file "
                    "or paste it in the sidebar."
                )

            logger.info(
                "Initialising InferenceClient | model=%s | provider=%s",
                self.config.model_id,
                self.config.provider,
            )
            self._client = InferenceClient(
                model=self.config.model_id,
                token=token,
                provider=self.config.provider,
            )
            logger.info(
                "InferenceClient ready | provider=%s | model=%s",
                self.config.provider,
                self.config.model_id,
            )
        return self._client