# pdfinspector / layout_utils.py
# Last commit 0d61aa0 (rianders): Fix file load errors and implement auto-refresh functionality
"""
Layout Utilities Module
Contains shared logic for block extraction, ordering, and data structures
to avoid circular dependencies between app.py and other modules.
"""
from dataclasses import dataclass
from typing import List, Tuple, Any, Dict, Optional
import pymupdf as fitz
import re
@dataclass
class SpanInfo:
    """A run of text inside a text block, together with its font attributes."""

    # Rectangle of the span as (x0, y0, x1, y1).
    bbox: Tuple[float, float, float, float]
    # Text content of the span.
    text: str
    # Font name recorded for the span.
    font: str
    # Font size recorded for the span.
    size: float
@dataclass
class BlockInfo:
    """A layout block of a page and the text spans collected from it."""

    # Rectangle of the block as (x0, y0, x1, y1).
    bbox: Tuple[float, float, float, float]
    # Text of the block.
    text: str
    # PyMuPDF block type code: 0 = text, 1 = image; some outputs also use
    # 2 = drawing.
    block_type: int
    # Spans that make up the block.
    spans: List[SpanInfo]
@dataclass
class PageDiagnostic:
    """Per-page diagnostic record used by batch processing.

    This class only stores the values; they are computed by the code that
    builds the record. ``processing_time_ms`` is optional and defaults to
    ``None``.
    """

    # Page this record belongs to (whether 0- or 1-based is decided by the
    # producer -- TODO confirm).
    page_num: int
    tagged_pdf: bool

    # Counters.
    text_len: int
    image_block_count: int
    font_count: int

    # Boolean findings.
    has_type3_fonts: bool
    suspicious_garbled_text: bool
    likely_scanned_image_page: bool
    likely_text_as_vector_outlines: bool
    multi_column_guess: bool

    # Time spent on the page in milliseconds, when it was measured.
    processing_time_ms: Optional[int] = None
@dataclass
class BatchAnalysisResult:
    """Results of a batch run, aggregated over all analysed pages."""

    total_pages: int
    pages_analyzed: int
    summary_stats: Dict[str, int]
    per_page_results: List[PageDiagnostic]
    common_issues: List[str]
    critical_pages: List[int]
    processing_time_sec: float

    def to_dict(self) -> Dict[str, Any]:
        """Return the result as a dict that can be serialised to JSON.

        Every entry of ``per_page_results`` is flattened into a plain dict;
        the remaining attributes are passed through as they are (the same
        objects, not copies).
        """
        # Attributes copied from each page record, in output order.
        page_keys = (
            "page_num",
            "tagged_pdf",
            "text_len",
            "image_block_count",
            "font_count",
            "has_type3_fonts",
            "suspicious_garbled_text",
            "likely_scanned_image_page",
            "likely_text_as_vector_outlines",
            "multi_column_guess",
            "processing_time_ms",
        )
        pages = [
            {key: getattr(page, key) for key in page_keys}
            for page in self.per_page_results
        ]
        return {
            "total_pages": self.total_pages,
            "pages_analyzed": self.pages_analyzed,
            "summary_stats": self.summary_stats,
            "per_page_results": pages,
            "common_issues": self.common_issues,
            "critical_pages": self.critical_pages,
            "processing_time_sec": self.processing_time_sec,
        }
def _safe_str(x: Any, max_len: int = 400) -> str:
s = str(x)
if len(s) > max_len:
s = s[:max_len] + "…"
return s
def _looks_like_math(text: str) -> bool:
# Heuristic: mathy glyphs/symbols and patterns
if not text:
return False
math_syms = r"[βˆ‘βˆ«βˆšβ‰ˆβ‰ β‰€β‰₯βˆžΒ±Γ—Γ·βˆ‚βˆ‡βˆˆβˆ©βˆͺβŠ‚βŠ†βŠ‡βŠƒβ†’β†¦βˆ€βˆƒβ„β„€β„šβ„•]"
latexy = r"(\\frac|\\sqrt|\\sum|\\int|_|\^|\b(?:sin|cos|tan|log|ln)\b)"
return bool(re.search(math_syms, text) or re.search(latexy, text))
def extract_blocks_spans(doc: fitz.Document, page_index: int) -> List[BlockInfo]:
    """Extract the layout blocks of one page, with span detail for text blocks.

    Args:
        doc: Open PyMuPDF document.
        page_index: Index of the page to read, as accepted by ``doc[...]``.

    Returns:
        One ``BlockInfo`` per block reported by ``page.get_text("dict")``, in
        the order PyMuPDF reports them. Block and span rectangles are
        multiplied by ``page.rotation_matrix``. For text blocks (type 0)
        ``spans`` holds every non-empty span and ``text`` holds the block
        text with one line per PyMuPDF line; for other block types both are
        empty.
    """
    page = doc[page_index]
    raw = page.get_text("dict")  # includes blocks/lines/spans with bboxes
    mat = page.rotation_matrix

    blocks: List[BlockInfo] = []
    for b in raw.get("blocks", []):
        btype = int(b.get("type", -1))
        # Transform block bbox to visual coordinates.
        bbox = tuple(fitz.Rect(b.get("bbox", (0, 0, 0, 0))) * mat)

        line_texts: List[str] = []
        spans: List[SpanInfo] = []
        if btype == 0:  # text
            for line in b.get("lines", []):
                span_texts: List[str] = []
                for sp in line.get("spans", []):
                    t = sp.get("text", "")
                    if not t:
                        continue
                    span_texts.append(t)
                    # Transform span bbox to visual coordinates.
                    sp_bbox_rect = fitz.Rect(sp.get("bbox", (0, 0, 0, 0))) * mat
                    spans.append(
                        SpanInfo(
                            bbox=tuple(sp_bbox_rect),
                            text=t,
                            font=_safe_str(sp.get("font", "")),
                            size=float(sp.get("size", 0.0)),
                        )
                    )
                if span_texts:
                    line_texts.append("".join(span_texts))

        # Spans of one line are contiguous, but the dict output puts no
        # separator between lines; joining everything with "" would glue the
        # last word of a line to the first word of the next one.
        text = "\n".join(line_texts).strip()
        blocks.append(BlockInfo(bbox=bbox, text=text, block_type=btype, spans=spans))
    return blocks
def order_blocks(blocks: "List[BlockInfo]", mode: str) -> "List[Tuple[int, BlockInfo]]":
    """Return ``(index, block)`` pairs of *blocks* in the requested order.

    ``index`` is the position of the block in the input list.

    Modes:
        "raw": input order, unchanged.
        "tblr": top-to-bottom, then left-to-right, comparing the integer
            parts of the top-left corner ``(y0, x0)``.
        "columns": simple two-column heuristic. Blocks are assigned to a
            left or right column by their horizontal centre, each column is
            sorted like "tblr", and the left column is returned first.
        anything else: same as "tblr".

    The column heuristic is only meaningful for pages that really have two
    columns; tagged PDFs should make it unnecessary.
    """
    indexed = list(enumerate(blocks))
    if mode == "raw":
        return indexed

    def key_tblr(item: "Tuple[int, BlockInfo]") -> Tuple[int, int]:
        x0, y0, _x1, _y1 = item[1].bbox
        return (int(y0), int(x0))

    if mode == "columns":
        # Horizontal centres of the blocks wide enough (> 5 units) to say
        # something about the column layout.
        centers = [
            (b.bbox[0] + b.bbox[2]) / 2.0
            for _, b in indexed
            if (b.bbox[2] - b.bbox[0]) > 5
        ]
        if not centers:
            return sorted(indexed, key=key_tblr)

        # Split halfway between the left-most and right-most centre. The
        # median centre is not a usable split point: when one column holds
        # more blocks than the other the median falls inside that column and
        # the two columns end up interleaved.
        mid = (min(centers) + max(centers)) / 2.0

        left = []
        right = []
        for it in indexed:
            x0, _y0, x1, _y1 = it[1].bbox
            (left if (x0 + x1) / 2.0 < mid else right).append(it)

        # Read left column first, then right.
        return sorted(left, key=key_tblr) + sorted(right, key=key_tblr)

    # "tblr" and fallback for unknown modes.
    return sorted(indexed, key=key_tblr)