XmLLM / tests /unit /test_structural_validator.py
Claude
Code quality: fix all ruff warnings, add CI/CD, improve test coverage
bbbfba8 unverified
"""Tests for the structural validator."""
from __future__ import annotations
from src.app.domain.models import (
AltoReadiness,
CanonicalDocument,
EvidenceType,
Geometry,
GeometryStatus,
NonTextRegion,
Page,
PageXmlReadiness,
Provenance,
ReadinessLevel,
Source,
TextLine,
TextRegion,
Word,
)
from src.app.domain.models.status import InputType, NonTextKind
from src.app.validators.structural_validator import validate_structure
def _prov() -> Provenance:
    """Return the fixed provenance record shared by every test fixture."""
    fields = {
        "provider": "test",
        "adapter": "v1",
        "source_ref": "$",
        "evidence_type": EvidenceType.PROVIDER_NATIVE,
    }
    return Provenance(**fields)
def _geo(x: float, y: float, w: float, h: float) -> Geometry:
    """Wrap ``(x, y, w, h)`` as a bbox in a geometry flagged ``EXACT``."""
    box = (x, y, w, h)
    return Geometry(bbox=box, status=GeometryStatus.EXACT)
def _word(wid: str, x: float, y: float, w: float, h: float) -> Word:
    """Create a word fixture with ID ``wid``, the text ``"word"`` and the given bbox."""
    geometry = _geo(x, y, w, h)
    provenance = _prov()
    return Word(id=wid, text="word", geometry=geometry, provenance=provenance)
def _line(
    lid: str, x: float, y: float, w: float, h: float, words: list[Word],
) -> TextLine:
    """Create a text-line fixture with ID ``lid``, the given bbox and ``words``."""
    geometry = _geo(x, y, w, h)
    provenance = _prov()
    return TextLine(id=lid, geometry=geometry, provenance=provenance, words=words)
def _region(
    rid: str, x: float, y: float, w: float, h: float, lines: list[TextLine],
) -> TextRegion:
    """Create a text-region fixture with ID ``rid``, the given bbox and ``lines``."""
    geometry = _geo(x, y, w, h)
    provenance = _prov()
    return TextRegion(id=rid, geometry=geometry, provenance=provenance, lines=lines)
def _doc(
    regions: list[TextRegion],
    width: float = 1000,
    height: float = 1000,
    reading_order: list[str] | None = None,
    non_text: list[NonTextRegion] | None = None,
) -> CanonicalDocument:
    """Assemble a single-page document fixture around ``regions``.

    Args:
        regions: Text regions placed on the page.
        width: Page width.
        height: Page height.
        reading_order: Region IDs in reading order. When ``None`` the order
            is derived from ``regions`` as given; an explicit (even empty)
            list is used verbatim.
        non_text: Non-text regions for the page; a falsy value becomes ``[]``.

    Returns:
        A document with ID ``"test"``, an image source and one page ``"p1"``
        whose ALTO and PAGE XML readiness are both ``FULL``.
    """
    if reading_order is None:
        order = [region.id for region in regions]
    else:
        order = reading_order

    source = Source(input_type=InputType.IMAGE)
    page = Page(
        id="p1",
        page_index=0,
        width=width,
        height=height,
        alto_readiness=AltoReadiness(level=ReadinessLevel.FULL),
        page_readiness=PageXmlReadiness(level=ReadinessLevel.FULL),
        reading_order=order,
        text_regions=regions,
        non_text_regions=non_text or [],
    )
    return CanonicalDocument(document_id="test", source=source, pages=[page])
class TestIdUniqueness:
    """Checks that ``validate_structure`` flags duplicated element IDs."""

    def test_all_unique_passes(self) -> None:
        """A document with distinct IDs at every level is reported valid."""
        word = _word("w1", 0, 0, 50, 30)
        line = _line("tl1", 0, 0, 500, 40, [word])
        region = _region("tb1", 0, 0, 500, 200, [line])

        report = validate_structure(_doc([region]))

        assert report.is_valid

    def test_duplicate_word_ids(self) -> None:
        """Two words sharing an ID invalidate the document and name the ID."""
        first = _word("w1", 0, 0, 50, 30)
        clash = _word("w1", 60, 0, 50, 30)  # duplicate
        line = _line("tl1", 0, 0, 500, 40, [first, clash])
        region = _region("tb1", 0, 0, 500, 200, [line])

        report = validate_structure(_doc([region]))

        assert not report.is_valid
        hits = [err for err in report.errors if "Duplicate ID 'w1'" in err.message]
        assert hits

    def test_duplicate_across_levels(self) -> None:
        """A line reusing its region's ID invalidates the document."""
        # line ID = region ID
        word = _word("w1", 0, 0, 50, 30)
        line = _line("same_id", 0, 0, 500, 40, [word])
        region = _region("same_id", 0, 0, 500, 200, [line])

        report = validate_structure(_doc([region]))

        assert not report.is_valid

    def test_duplicate_with_non_text_region(self) -> None:
        """A non-text region reusing a text region's ID invalidates the document."""
        illustration = NonTextRegion(
            id="tb1",
            kind=NonTextKind.ILLUSTRATION,
            geometry=_geo(600, 0, 100, 100),
            provenance=_prov(),
        )
        word = _word("w1", 0, 0, 50, 30)
        line = _line("tl1", 0, 0, 500, 40, [word])
        region = _region("tb1", 0, 0, 500, 200, [line])

        report = validate_structure(_doc([region], non_text=[illustration]))

        assert not report.is_valid
class TestReadingOrder:
    """Checks how ``validate_structure`` treats reading-order references."""

    def test_valid_references(self) -> None:
        """A reading order listing only existing region IDs is valid."""
        word = _word("w1", 0, 0, 50, 30)
        line = _line("tl1", 0, 0, 500, 40, [word])
        region = _region("tb1", 0, 0, 500, 200, [line])

        report = validate_structure(_doc([region], reading_order=["tb1"]))

        assert report.is_valid

    def test_invalid_reference(self) -> None:
        """A reading order naming a missing region yields an error."""
        word = _word("w1", 0, 0, 50, 30)
        line = _line("tl1", 0, 0, 500, 40, [word])
        region = _region("tb1", 0, 0, 500, 200, [line])
        order = ["tb1", "tb_nonexistent"]

        report = validate_structure(_doc([region], reading_order=order))

        assert not report.is_valid
        hits = [err for err in report.errors if "unknown region ID" in err.message]
        assert hits
class TestBboxContainment:
    """Checks the bbox containment warnings emitted by ``validate_structure``."""

    def test_all_contained_passes(self) -> None:
        """Properly nested boxes produce no warnings."""
        word = _word("w1", 25, 22, 50, 25)
        line = _line("tl1", 20, 20, 150, 30, [word])
        region = _region("tb1", 10, 10, 200, 100, [line])

        report = validate_structure(_doc([region]))

        assert report.warning_count == 0

    def test_word_exceeds_line(self) -> None:
        """With zero tolerance, a word wider than its line is warned about."""
        word = _word("w1", 20, 20, 200, 30)  # word wider than line
        line = _line("tl1", 20, 20, 100, 30, [word])
        region = _region("tb1", 10, 10, 400, 100, [line])

        report = validate_structure(_doc([region]), bbox_tolerance=0)

        assert report.warning_count > 0
        codes = [warning.code or "" for warning in report.warnings]
        assert any("word_exceeds_line" in code for code in codes)

    def test_tolerance_allows_small_overflow(self) -> None:
        """An overflow smaller than the tolerance produces no warnings."""
        word = _word("w1", 20, 20, 103, 30)  # 3px overflow
        line = _line("tl1", 20, 20, 100, 30, [word])
        region = _region("tb1", 10, 10, 200, 100, [line])

        report = validate_structure(_doc([region]), bbox_tolerance=5)

        assert report.warning_count == 0

    def test_tolerance_rejects_large_overflow(self) -> None:
        """An overflow larger than the tolerance still produces a warning."""
        word = _word("w1", 20, 20, 120, 30)  # 20px overflow
        line = _line("tl1", 20, 20, 100, 30, [word])
        region = _region("tb1", 10, 10, 200, 100, [line])

        report = validate_structure(_doc([region]), bbox_tolerance=5)

        assert report.warning_count > 0

    def test_region_exceeds_page(self) -> None:
        """A region extending past the page bounds is warned about."""
        word = _word("w1", 900, 900, 50, 25)
        line = _line("tl1", 900, 900, 100, 30, [word])
        region = _region("tb1", 900, 900, 200, 200, [line])  # exceeds 1000x1000 page
        doc = _doc([region], width=1000, height=1000)

        report = validate_structure(doc, bbox_tolerance=0)

        codes = [warning.code or "" for warning in report.warnings]
        assert any("region_exceeds_page" in code for code in codes)

    def test_line_exceeds_region(self) -> None:
        """A line wider than its region is warned about."""
        word = _word("w1", 10, 10, 50, 25)
        line = _line("tl1", 10, 10, 200, 30, [word])  # line wider than region
        region = _region("tb1", 10, 10, 100, 50, [line])

        report = validate_structure(_doc([region]), bbox_tolerance=0)

        codes = [warning.code or "" for warning in report.warnings]
        assert any("line_exceeds_region" in code for code in codes)