import re
from dataclasses import dataclass, field
from typing import Optional

# Matches an inline citation marker such as "[1]" and captures its digits.
_INLINE_REF_RE = re.compile(r"\[(\d+)\]")

# Characters that must be backslash-escaped inside a Markdown link label:
# an unescaped "]" would end the label early, and a trailing "\" would
# escape the label's own closing bracket.
_LABEL_SPECIALS_RE = re.compile(r"([\\\[\]])")

# Characters that would terminate or corrupt a Markdown link destination
# written in the plain "(url)" form; they are percent-encoded instead.
_URL_ESCAPES = str.maketrans({" ": "%20", "(": "%28", ")": "%29"})


@dataclass
class Citation:
    """One cited source passage.

    The formatting helpers in this file read ``id``, ``page_title``,
    ``section``, ``source_url`` and ``snippet``; ``chunk_id`` and ``score``
    are stored but not read by them.
    """

    id: int  # number shown in "[id]" markers and in the rendered source list
    chunk_id: int
    source_url: str
    page_title: str
    section: str
    snippet: str
    score: float = 0.0


def _escape_link_label(text: str) -> str:
    """Backslash-escape ``\\``, ``[`` and ``]`` so *text* is a safe link label."""
    return _LABEL_SPECIALS_RE.sub(r"\\\1", text)


def _escape_link_url(url: str) -> str:
    """Percent-encode spaces and parentheses so *url* is a safe link destination."""
    return url.translate(_URL_ESCAPES)


def build_context_block(citations: list[Citation]) -> str:
    """Format citations as numbered context for the LLM prompt.

    Each citation becomes a ``[id] (Page: ... | Section: ... | URL: ...)``
    header line followed by its snippet; entries are separated by a
    ``---`` rule surrounded by blank lines.

    Args:
        citations: Citations to include, emitted in the given order.

    Returns:
        The combined context text, or an empty string when *citations*
        is empty.
    """
    return "\n\n---\n\n".join(
        f"[{c.id}] (Page: {c.page_title} | Section: {c.section} | URL: {c.source_url})"
        f"\n{c.snippet}"
        for c in citations
    )


def extract_inline_refs(text: str) -> set[int]:
    """Return the set of citation IDs referenced inline, e.g. [1], [2].

    Only markers consisting of digits alone between square brackets are
    recognised; forms such as ``[1, 2]`` or ``[a]`` are ignored.

    Args:
        text: Text to scan for ``[N]`` markers.

    Returns:
        The distinct integer IDs found (empty when there are none).
    """
    return {int(ref) for ref in _INLINE_REF_RE.findall(text)}


def render_citation_markdown(
    citations: list[Citation], used_ids: Optional[set[int]] = None
) -> str:
    """Render a Markdown "Sources" section listing the given citations.

    Args:
        citations: Candidate citations, listed in the given order.
        used_ids: When not ``None``, only citations whose ``id`` is in this
            set are listed. ``None`` lists every citation.

    Returns:
        A ``### Sources`` heading followed by one ``id. [label](url)`` line
        per listed citation. The heading is returned on its own when no
        citation is listed.
    """
    # NOTE(review): CommonMark takes an ordered list's start number from its
    # first item and ignores the numbers of later items, so a filtered list
    # such as "1." / "3." may be displayed as 1, 2 and no longer match the
    # inline "[3]" marker. Confirm how the target renderer handles this.
    lines = ["### Sources"]
    for c in citations:
        if used_ids is not None and c.id not in used_ids:
            continue
        # Skip the separator when there is no section, avoiding "Title — ".
        label = f"{c.page_title} — {c.section}" if c.section else c.page_title
        lines.append(
            f"{c.id}. [{_escape_link_label(label)}]({_escape_link_url(c.source_url)})"
        )
    return "\n".join(lines)