Spaces:
Running
Running
"""
PDF/HTML extraction with layout-aware preference and graceful fallback.

Default path: **Docling** (IBM, layout-aware). Preserves heading hierarchy,
list ordering, table structure, and de-duplicates page-header repetition.
Critical for multi-column source documents (e.g. Physiotherapy Standards
framework PDF, where pymupdf scrambled bullet ordering).

Fallback path: **markitdown**. Used when Docling import or convert fails
on a specific source. Markitdown is faster and lighter; it just doesn't
handle multi-column layouts well.

Usage in build scripts::

    from extract_pdf import extract_to_markdown
    body = extract_to_markdown(cache_path)

The function handles both PDFs and HTML uniformly. PDFs go through Docling
by default; HTML always uses markitdown (Docling's HTML support is less
mature than its PDF support, and our HTML pages are mostly SilverStripe
with semantic markup that markitdown handles well).
"""
| from __future__ import annotations | |
| from pathlib import Path | |
# Optional-dependency probe — evaluated once, when this module is imported.
# Docling is preferred but not required: if the import fails, the flag is
# False and DocumentConverter is bound to None so the name still exists.
try:
    from docling.document_converter import DocumentConverter
except ImportError:
    DocumentConverter = None  # type: ignore[assignment]
    _DOCLING_AVAILABLE = False
else:
    _DOCLING_AVAILABLE = True
def extract_to_markdown(path: str | Path, format_hint: str | None = None) -> str:
    """Convert a PDF or HTML file to markdown text.

    Args:
        path: Filesystem path to the source file.
        format_hint: Optional ``"pdf"`` or ``"html"``. Matched
            case-insensitively and with any leading dot ignored, so
            ``"PDF"`` and ``".pdf"`` select the PDF path as well. If omitted
            (or empty), the format is inferred from the file extension.

    Returns:
        Markdown text content (stripped of leading/trailing whitespace).

    Raises:
        Exceptions raised by the markitdown path are not caught here and
        propagate to the caller. Docling failures are reported on stdout
        and trigger the markitdown fallback instead of propagating.

    Strategy:
        - .pdf → Docling preferred (when installed), markitdown fallback
          if Docling errors
        - .html / .htm → markitdown (Docling's HTML support less mature
          than PDF)
        - other → markitdown (general-purpose)
    """
    p = Path(path)
    # Normalise the hint exactly like the suffix; otherwise "PDF" or ".pdf"
    # would silently skip the Docling path.
    fmt = (format_hint or p.suffix).strip().lstrip(".").lower()

    if fmt == "pdf" and _DOCLING_AVAILABLE:
        try:
            return _docling_extract(p)
        except Exception as e:  # deliberate best-effort: any Docling failure falls back
            print(f" ⚠ Docling failed on {p.name}: {e!r}; falling back to markitdown")

    return _markitdown_extract(p)
# Shared Docling converter, created on first use by _docling_extract().
_docling_converter = None


def _docling_extract(path: Path) -> str:
    """Run Docling on ``path`` and return the stripped markdown export.

    The converter is built on the first call and reused afterwards, so a
    build script that extracts many PDFs constructs it only once
    (presumably the costly part is Docling's pipeline/model setup — see the
    Docling documentation).

    NOTE(review): the shared instance assumes conversions run sequentially;
    confirm before calling this from multiple threads.

    Raises:
        RuntimeError: if Docling is not installed (``DocumentConverter`` is
            ``None``).
        Exception: anything Docling raises during conversion propagates to
            the caller.
    """
    global _docling_converter

    if DocumentConverter is None:
        # Clearer than the "'NoneType' object is not callable" TypeError
        # that calling the missing class would produce.
        raise RuntimeError("Docling is not installed; cannot run Docling extraction")

    if _docling_converter is None:
        _docling_converter = DocumentConverter()

    result = _docling_converter.convert(str(path))
    return result.document.export_to_markdown().strip()
def _markitdown_extract(path: Path) -> str:
    """Convert ``path`` with markitdown and return its stripped text content."""
    # Imported at call time, so importing this module does not itself
    # require markitdown.
    from markitdown import MarkItDown

    converted = MarkItDown().convert(str(path))
    text = converted.text_content
    return text.strip()
def is_docling_available() -> bool:
    """Report whether Docling was importable when this module was loaded.

    Public flag — handy for build scripts that want to log which extractor
    is in use.
    """
    available: bool = _DOCLING_AVAILABLE
    return available
# ---- Heading demotion --------------------------------------------------

import re as _re

# ATX heading marker at the very start of a line: 1-6 hashes, then a space.
_HEADING_LINE_RE = _re.compile(r"^(#{1,6}) ", _re.MULTILINE)

# Fence line of a fenced code block: up to three spaces of indentation, then
# a run of 3+ backticks (with no further backtick on the line, which would
# make it inline code) or 3+ tildes.
_FENCE_LINE_RE = _re.compile(r"^ {0,3}(`{3,}(?=[^`]*$)|~{3,})")


def demote_headings(text: str, levels: int = 1, max_depth: int = 6) -> str:
    """Demote markdown headings by ``levels`` levels, capped at H``max_depth``.

    Used by build scripts to nest extracted body content under a higher-level
    heading injected by the build script. Without demotion, Docling-extracted
    body H2s collide with the build script's source-level H2 wrapper at the
    same tree depth.

    Example::

        ## Source Title (build script wrapper)
        ## Introduction   <-- collides at H2 with other sources' Introductions
        Body...

    becomes::

        ## Source Title (build script wrapper)
        ### Introduction  <-- now a child of the wrapper
        Body...

    Multiple sources can each have their own "### Introduction" without
    colliding because each is scoped under its own H2 parent.

    Caps demotion at ``max_depth`` (default H6) since markdown beyond H6 is
    treated as paragraph text.

    Lines inside fenced code blocks (``` or ~~~) are left untouched, so a
    ``# comment`` in a code sample is not rewritten as a heading. An
    unclosed fence extends to the end of ``text``.

    Args:
        text: Markdown source. Only headings that start at column 0 with
            1-6 ``#`` followed by a space are affected.
        levels: Number of levels to add. A negative value promotes
            headings; the result is never fewer than one ``#``.
        max_depth: Maximum number of ``#`` a rewritten heading may have.

    Returns:
        ``text`` with heading markers adjusted; everything else, including
        line endings, is preserved.
    """

    def _demote(m: "_re.Match[str]") -> str:
        hashes = m.group(1)
        # Clamp to at least one '#', so the heading marker is never erased.
        new_count = max(1, min(len(hashes) + levels, max_depth))
        return "#" * new_count + " "

    out = []
    open_fence = None  # marker run that opened the current fenced block, if any
    # Split on "\n" only: this mirrors where "^" matches under re.MULTILINE,
    # so line boundaries are exactly those the heading regex recognises.
    for line in text.split("\n"):
        fence = _FENCE_LINE_RE.match(line)
        if open_fence is None:
            if fence:
                open_fence = fence.group(1)
            else:
                line = _HEADING_LINE_RE.sub(_demote, line, count=1)
        elif (
            fence
            and fence.group(1)[0] == open_fence[0]
            and len(fence.group(1)) >= len(open_fence)
            and not line[fence.end():].strip()
        ):
            # A closing fence uses the same character, is at least as long
            # as the opener, and has nothing but whitespace after it.
            open_fence = None
        out.append(line)
    return "\n".join(out)