Spaces:
Sleeping
Sleeping
| """Readiness validator — computes how ready a document is for export. | |
| Produces AltoReadiness / PageXmlReadiness per page and DocumentReadiness | |
| at document level. Does NOT decide whether to allow export — that's the | |
| export eligibility validator's job. | |
| """ | |
| from __future__ import annotations | |
| from typing import TYPE_CHECKING | |
| from src.app.domain.models.readiness import ( | |
| AltoReadiness, | |
| DocumentReadiness, | |
| PageXmlReadiness, | |
| ) | |
| from src.app.domain.models.status import ( | |
| GeometryStatus, | |
| MissingCapability, | |
| ReadinessLevel, | |
| ) | |
| if TYPE_CHECKING: | |
| from src.app.domain.models import CanonicalDocument, Page | |
| def compute_page_alto_readiness(page: Page) -> AltoReadiness: | |
| """Compute ALTO readiness for a single page. | |
| ALTO full requires: page dimensions, block bbox, line bbox, word bbox, word text. | |
| """ | |
| missing: list[MissingCapability] = [] | |
| if page.width <= 0 or page.height <= 0: | |
| missing.append(MissingCapability.PAGE_DIMENSIONS) | |
| has_blocks = len(page.text_regions) > 0 | |
| has_lines = False | |
| has_words = False | |
| has_word_geo = True | |
| has_word_text = True | |
| has_confidence = True | |
| for region in page.text_regions: | |
| if ( | |
| region.geometry.status == GeometryStatus.UNKNOWN | |
| and MissingCapability.BLOCK_GEOMETRY not in missing | |
| ): | |
| missing.append(MissingCapability.BLOCK_GEOMETRY) | |
| for line in region.lines: | |
| has_lines = True | |
| if ( | |
| line.geometry.status == GeometryStatus.UNKNOWN | |
| and MissingCapability.LINE_GEOMETRY not in missing | |
| ): | |
| missing.append(MissingCapability.LINE_GEOMETRY) | |
| for word in line.words: | |
| has_words = True | |
| if word.geometry.status == GeometryStatus.UNKNOWN: | |
| has_word_geo = False | |
| if not word.text: | |
| has_word_text = False | |
| if word.confidence is None: | |
| has_confidence = False | |
| if not has_blocks or not has_lines or not has_words: | |
| if not has_words: | |
| missing.append(MissingCapability.WORD_TEXT) | |
| if not has_lines: | |
| missing.append(MissingCapability.LINE_GEOMETRY) | |
| if not has_word_geo and MissingCapability.WORD_GEOMETRY not in missing: | |
| missing.append(MissingCapability.WORD_GEOMETRY) | |
| if not has_word_text and MissingCapability.WORD_TEXT not in missing: | |
| missing.append(MissingCapability.WORD_TEXT) | |
| if not has_confidence and MissingCapability.CONFIDENCE not in missing: | |
| missing.append(MissingCapability.CONFIDENCE) | |
| if not page.reading_order: | |
| missing.append(MissingCapability.READING_ORDER) | |
| level = _level_from_missing(missing, critical={ | |
| MissingCapability.PAGE_DIMENSIONS, | |
| MissingCapability.WORD_TEXT, | |
| MissingCapability.WORD_GEOMETRY, | |
| MissingCapability.LINE_GEOMETRY, | |
| }) | |
| return AltoReadiness(level=level, missing=missing) | |
| def compute_page_pagexml_readiness(page: Page) -> PageXmlReadiness: | |
| """Compute PAGE XML readiness for a single page. | |
| PAGE XML is more lenient: regions + lines are often sufficient. | |
| Word-level geometry is nice-to-have, not required. | |
| """ | |
| missing: list[MissingCapability] = [] | |
| if page.width <= 0 or page.height <= 0: | |
| missing.append(MissingCapability.PAGE_DIMENSIONS) | |
| has_regions = len(page.text_regions) > 0 | |
| has_lines = False | |
| for region in page.text_regions: | |
| if ( | |
| region.geometry.status == GeometryStatus.UNKNOWN | |
| and MissingCapability.BLOCK_GEOMETRY not in missing | |
| ): | |
| missing.append(MissingCapability.BLOCK_GEOMETRY) | |
| for line in region.lines: | |
| has_lines = True | |
| if ( | |
| line.geometry.status == GeometryStatus.UNKNOWN | |
| and MissingCapability.LINE_GEOMETRY not in missing | |
| ): | |
| missing.append(MissingCapability.LINE_GEOMETRY) | |
| if not has_regions: | |
| missing.append(MissingCapability.BLOCK_GEOMETRY) | |
| if not has_lines and MissingCapability.LINE_GEOMETRY not in missing: | |
| missing.append(MissingCapability.LINE_GEOMETRY) | |
| if not page.reading_order: | |
| missing.append(MissingCapability.READING_ORDER) | |
| level = _level_from_missing(missing, critical={ | |
| MissingCapability.PAGE_DIMENSIONS, | |
| MissingCapability.BLOCK_GEOMETRY, | |
| }) | |
| return PageXmlReadiness(level=level, missing=missing) | |
| def compute_document_readiness(doc: CanonicalDocument) -> DocumentReadiness: | |
| """Compute overall document readiness from per-page readiness.""" | |
| page_levels: list[ReadinessLevel] = [] | |
| for page in doc.pages: | |
| alto = compute_page_alto_readiness(page) | |
| page_levels.append(alto.level) | |
| if not page_levels: | |
| return DocumentReadiness(level=ReadinessLevel.NONE) | |
| if all(lv == ReadinessLevel.FULL for lv in page_levels): | |
| overall = ReadinessLevel.FULL | |
| elif all(lv == ReadinessLevel.NONE for lv in page_levels): | |
| overall = ReadinessLevel.NONE | |
| elif any(lv == ReadinessLevel.NONE for lv in page_levels): | |
| overall = ReadinessLevel.DEGRADED | |
| else: | |
| overall = ReadinessLevel.PARTIAL | |
| return DocumentReadiness(level=overall, page_readiness=page_levels) | |
| def _level_from_missing( | |
| missing: list[MissingCapability], | |
| critical: set[MissingCapability], | |
| ) -> ReadinessLevel: | |
| """Determine readiness level from missing capabilities.""" | |
| if not missing: | |
| return ReadinessLevel.FULL | |
| has_critical = any(m in critical for m in missing) | |
| if has_critical: | |
| return ReadinessLevel.NONE | |
| return ReadinessLevel.PARTIAL | |