"""
Core Document Intelligence Schemas

Pydantic models for OCR regions, layout regions, chunks, and evidence.
These form the foundation of the document processing pipeline.
"""

import hashlib
from datetime import datetime, timezone
from enum import Enum
from typing import Any, Dict, List, Optional, Tuple

from pydantic import BaseModel, Field, field_validator


class BoundingBox(BaseModel):
    """
    Bounding box in normalized (0-1) or pixel coordinates.
    Uses xyxy format: (x_min, y_min, x_max, y_max).
    """
    x_min: float = Field(..., description="Left edge coordinate")
    y_min: float = Field(..., description="Top edge coordinate")
    x_max: float = Field(..., description="Right edge coordinate")
    y_max: float = Field(..., description="Bottom edge coordinate")

    normalized: bool = Field(default=False, description="True if coordinates are 0-1 normalized")
    page_width: Optional[int] = Field(default=None, description="Original page width in pixels")
    page_height: Optional[int] = Field(default=None, description="Original page height in pixels")

    @field_validator('x_max')
    @classmethod
    def x_max_not_less_than_x_min(cls, v, info):
        if 'x_min' in info.data and v < info.data['x_min']:
            raise ValueError('x_max must be >= x_min')
        return v

    @field_validator('y_max')
    @classmethod
    def y_max_not_less_than_y_min(cls, v, info):
        if 'y_min' in info.data and v < info.data['y_min']:
            raise ValueError('y_max must be >= y_min')
        return v

    @property
    def width(self) -> float:
        return self.x_max - self.x_min

    @property
    def height(self) -> float:
        return self.y_max - self.y_min

    @property
    def area(self) -> float:
        return self.width * self.height

    @property
    def center(self) -> Tuple[float, float]:
        return ((self.x_min + self.x_max) / 2, (self.y_min + self.y_max) / 2)

    def to_xyxy(self) -> Tuple[float, float, float, float]:
        """Return as (x_min, y_min, x_max, y_max) tuple."""
        return (self.x_min, self.y_min, self.x_max, self.y_max)

    def to_xywh(self) -> Tuple[float, float, float, float]:
        """Return as (x, y, width, height) tuple."""
        return (self.x_min, self.y_min, self.width, self.height)

    def normalize(self, width: int, height: int) -> "BoundingBox":
        """Convert pixel coordinates to normalized (0-1) coordinates."""
        if self.normalized:
            return self
        return BoundingBox(
            x_min=self.x_min / width,
            y_min=self.y_min / height,
            x_max=self.x_max / width,
            y_max=self.y_max / height,
            normalized=True,
            page_width=width,
            page_height=height,
        )

    def denormalize(self, width: int, height: int) -> "BoundingBox":
        """Convert normalized coordinates to pixel coordinates."""
        if not self.normalized:
            return self
        return BoundingBox(
            x_min=self.x_min * width,
            y_min=self.y_min * height,
            x_max=self.x_max * width,
            y_max=self.y_max * height,
            normalized=False,
            page_width=width,
            page_height=height,
        )

    def iou(self, other: "BoundingBox") -> float:
        """Calculate Intersection over Union with another bbox.

        Both boxes must be in the same coordinate space (both normalized
        or both pixel); the result is meaningless otherwise.
        """
        # Intersection rectangle; empty if the boxes do not overlap.
        x1 = max(self.x_min, other.x_min)
        y1 = max(self.y_min, other.y_min)
        x2 = min(self.x_max, other.x_max)
        y2 = min(self.y_max, other.y_max)

        if x2 < x1 or y2 < y1:
            return 0.0

        intersection = (x2 - x1) * (y2 - y1)
        union = self.area + other.area - intersection
        return intersection / union if union > 0 else 0.0

    def contains(self, other: "BoundingBox") -> bool:
        """Check if this bbox fully contains another."""
        return (
            self.x_min <= other.x_min and
            self.y_min <= other.y_min and
            self.x_max >= other.x_max and
            self.y_max >= other.y_max
        )

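# Usage sketch (the coordinates below are illustrative, not from a real page):
# convert a pixel-space box on a 612x792 page to normalized form, then compare
# two boxes that share a coordinate space.
#
#     header = BoundingBox(x_min=72.0, y_min=64.0, x_max=540.0, y_max=120.0)
#     norm = header.normalize(width=612, height=792)
#     candidate = BoundingBox(x_min=80.0, y_min=70.0, x_max=530.0, y_max=130.0)
#     overlap = header.iou(candidate)   # high overlap; both boxes in pixels
#     header.contains(BoundingBox(x_min=100.0, y_min=80.0, x_max=200.0, y_max=100.0))  # True
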
class OCRRegion(BaseModel):
    """
    Result from OCR processing for a single text region.
    Includes text, confidence, and precise location.
    """
    text: str = Field(..., description="Recognized text content")
    confidence: float = Field(..., ge=0.0, le=1.0, description="OCR confidence score")
    bbox: BoundingBox = Field(..., description="Bounding box of the text region")
    polygon: Optional[List[Tuple[float, float]]] = Field(
        default=None,
        description="Polygon points for non-rectangular regions"
    )
    page: int = Field(..., ge=0, description="Zero-indexed page number")
    line_id: Optional[int] = Field(default=None, description="Line grouping ID")
    word_id: Optional[int] = Field(default=None, description="Word index within line")

    engine: str = Field(default="unknown", description="OCR engine used (paddle/tesseract)")
    language: Optional[str] = Field(default=None, description="Detected language code")

    def __hash__(self):
        return hash((self.text, self.page, self.bbox.x_min, self.bbox.y_min))

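# Illustrative construction, as an OCR engine adapter might emit it (the text,
# score, and coordinates below are made up):
#
#     region = OCRRegion(
#         text="Invoice",
#         confidence=0.97,
#         bbox=BoundingBox(x_min=72.0, y_min=64.0, x_max=150.0, y_max=88.0),
#         page=0,
#         line_id=0,
#         engine="paddle",
#         language="en",
#     )
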
class LayoutType(str, Enum):
    """Document layout region types."""
    TEXT = "text"
    TITLE = "title"
    HEADING = "heading"
    PARAGRAPH = "paragraph"
    LIST = "list"
    TABLE = "table"
    FIGURE = "figure"
    CHART = "chart"
    FORMULA = "formula"
    HEADER = "header"
    FOOTER = "footer"
    PAGE_NUMBER = "page_number"
    CAPTION = "caption"
    FOOTNOTE = "footnote"
    WATERMARK = "watermark"
    LOGO = "logo"
    SIGNATURE = "signature"
    UNKNOWN = "unknown"

class LayoutRegion(BaseModel):
    """
    Result from layout detection for a document region.
    Identifies structural elements like tables, figures, and paragraphs.
    """
    id: str = Field(..., description="Unique region identifier")
    type: LayoutType = Field(..., description="Region type classification")
    confidence: float = Field(..., ge=0.0, le=1.0, description="Detection confidence")
    bbox: BoundingBox = Field(..., description="Bounding box of the region")
    page: int = Field(..., ge=0, description="Zero-indexed page number")

    reading_order: Optional[int] = Field(
        default=None,
        description="Position in reading order (0 = first)"
    )

    parent_id: Optional[str] = Field(default=None, description="Parent region ID")
    children_ids: List[str] = Field(default_factory=list, description="Child region IDs")

    ocr_region_ids: List[int] = Field(
        default_factory=list,
        description="Indices of OCR regions within this layout region"
    )

    extra: Dict[str, Any] = Field(default_factory=dict, description="Type-specific metadata")

    def __hash__(self):
        return hash(self.id)

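# Sketch of a parent/child pair (IDs are arbitrary): a detected table and its
# caption, linked both ways and placed in reading order.
#
#     table = LayoutRegion(
#         id="p0-r2", type=LayoutType.TABLE, confidence=0.91, page=0,
#         bbox=BoundingBox(x_min=0.1, y_min=0.30, x_max=0.9, y_max=0.60, normalized=True),
#         reading_order=2, children_ids=["p0-r3"],
#     )
#     caption = LayoutRegion(
#         id="p0-r3", type=LayoutType.CAPTION, confidence=0.88, page=0,
#         bbox=BoundingBox(x_min=0.1, y_min=0.61, x_max=0.9, y_max=0.64, normalized=True),
#         reading_order=3, parent_id="p0-r2",
#     )
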
class ChunkType(str, Enum):
    """Document chunk types for semantic segmentation."""
    TEXT = "text"
    TITLE = "title"
    HEADING = "heading"
    PARAGRAPH = "paragraph"
    LIST_ITEM = "list_item"
    TABLE = "table"
    TABLE_CELL = "table_cell"
    FIGURE = "figure"
    CHART = "chart"
    FORMULA = "formula"
    CAPTION = "caption"
    FOOTNOTE = "footnote"
    HEADER = "header"
    FOOTER = "footer"
    METADATA = "metadata"

class DocumentChunk(BaseModel):
    """
    Semantic chunk of a document for retrieval and processing.
    Contains text, location evidence, and metadata for grounding.
    """
    chunk_id: str = Field(..., description="Unique chunk identifier")
    chunk_type: ChunkType = Field(..., description="Semantic type of chunk")
    text: str = Field(..., description="Text content of the chunk")
    bbox: BoundingBox = Field(..., description="Bounding box covering the chunk")
    page: int = Field(..., ge=0, description="Zero-indexed page number")

    document_id: str = Field(..., description="Parent document identifier")
    source_path: Optional[str] = Field(default=None, description="Original file path")

    sequence_index: int = Field(..., ge=0, description="Position in document reading order")

    confidence: float = Field(
        default=1.0,
        ge=0.0,
        le=1.0,
        description="Chunk extraction confidence"
    )

    table_cell_ids: Optional[List[str]] = Field(
        default=None,
        description="Cell IDs if this is a table chunk"
    )
    row_index: Optional[int] = Field(default=None, description="Table row index")
    col_index: Optional[int] = Field(default=None, description="Table column index")

    caption: Optional[str] = Field(default=None, description="Associated caption text")
    references: List[str] = Field(
        default_factory=list,
        description="References to other chunks"
    )

    embedding: Optional[List[float]] = Field(
        default=None,
        description="Vector embedding for retrieval"
    )

    extra: Dict[str, Any] = Field(default_factory=dict, description="Additional metadata")

    @property
    def content_hash(self) -> str:
        """Generate a hash of chunk content for deduplication."""
        # Use the enum's value explicitly: f-string rendering of a
        # (str, Enum) member is version-dependent across Python releases.
        content = f"{self.text}:{self.page}:{self.chunk_type.value}"
        return hashlib.sha256(content.encode()).hexdigest()[:16]

    def to_retrieval_dict(self) -> Dict[str, Any]:
        """Convert to dictionary for vector store metadata."""
        return {
            "chunk_id": self.chunk_id,
            "chunk_type": self.chunk_type.value,
            "page": self.page,
            "document_id": self.document_id,
            "source_path": self.source_path,
            "bbox_xyxy": self.bbox.to_xyxy(),
            "sequence_index": self.sequence_index,
            "confidence": self.confidence,
        }

    def __hash__(self):
        return hash(self.chunk_id)

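# Minimal sketch of building a chunk and preparing vector-store metadata
# (identifiers and text are placeholders):
#
#     chunk = DocumentChunk(
#         chunk_id="doc-001-p0-c3",
#         chunk_type=ChunkType.PARAGRAPH,
#         text="Total revenue increased 12% year over year.",
#         bbox=BoundingBox(x_min=0.1, y_min=0.40, x_max=0.9, y_max=0.45, normalized=True),
#         page=0,
#         document_id="doc-001",
#         sequence_index=3,
#     )
#     chunk.content_hash          # stable 16-hex-char digest for deduplication
#     chunk.to_retrieval_dict()   # JSON-safe metadata for the vector store
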
class EvidenceRef(BaseModel):
    """
    Evidence reference for grounding extracted information.
    Links extracted data back to source document locations.
    """
    chunk_id: str = Field(..., description="Referenced chunk ID")
    page: int = Field(..., ge=0, description="Page number")
    bbox: BoundingBox = Field(..., description="Bounding box of evidence")
    source_type: str = Field(..., description="Type of source (text/table/figure)")
    snippet: str = Field(..., max_length=500, description="Text snippet as evidence")
    confidence: float = Field(..., ge=0.0, le=1.0, description="Evidence confidence")

    image_base64: Optional[str] = Field(
        default=None,
        description="Base64-encoded crop of the evidence region"
    )

    extra: Dict[str, Any] = Field(default_factory=dict, description="Additional metadata")

    def to_citation(self) -> str:
        """Format as a human-readable citation."""
        # Only append an ellipsis when the snippet is actually truncated.
        snippet = self.snippet if len(self.snippet) <= 100 else f"{self.snippet[:100]}..."
        return f'[Page {self.page + 1}, {self.source_type}]: "{snippet}"'

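# Sketch of grounding an extracted value (all values are placeholders). This
# citation renders as: [Page 1, text]: "Total revenue increased 12% year over year."
#
#     ev = EvidenceRef(
#         chunk_id="doc-001-p0-c3",
#         page=0,
#         bbox=BoundingBox(x_min=0.1, y_min=0.40, x_max=0.9, y_max=0.45, normalized=True),
#         source_type="text",
#         snippet="Total revenue increased 12% year over year.",
#         confidence=0.92,
#     )
#     print(ev.to_citation())
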
class ExtractionResult(BaseModel):
    """
    Result of a field extraction or analysis task.
    Always includes evidence for grounding.
    """
    data: Dict[str, Any] = Field(..., description="Extracted data dictionary")
    evidence: List[EvidenceRef] = Field(
        default_factory=list,
        description="Evidence supporting the extraction"
    )
    warnings: List[str] = Field(
        default_factory=list,
        description="Warnings or issues encountered"
    )
    confidence: float = Field(
        default=1.0,
        ge=0.0,
        le=1.0,
        description="Overall extraction confidence"
    )

    abstained_fields: List[str] = Field(
        default_factory=list,
        description="Fields where extraction was abstained due to low confidence"
    )

    processing_time_ms: Optional[float] = Field(
        default=None,
        description="Processing time in milliseconds"
    )
    model_used: Optional[str] = Field(default=None, description="Model used for extraction")

    @property
    def is_grounded(self) -> bool:
        """True if at least one evidence reference exists and no fields were abstained."""
        return len(self.evidence) > 0 and len(self.abstained_fields) == 0

    def add_warning(self, warning: str):
        """Add a warning message."""
        self.warnings.append(warning)

    def abstain(self, field: str, reason: str):
        """Mark a field as abstained, recording the reason as a warning."""
        self.abstained_fields.append(field)
        self.warnings.append(f"Abstained from extracting '{field}': {reason}")

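# Sketch of the abstention workflow: record low-confidence fields instead of
# guessing, so is_grounded reports whether the result is fully evidence-backed
# (the field names and reason below are illustrative).
#
#     result = ExtractionResult(data={"invoice_total": "1204.50"}, evidence=[ev])
#     result.abstain("due_date", "no matching region above 0.5 confidence")
#     result.is_grounded   # False: a field was abstained
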
class DocumentMetadata(BaseModel):
    """Metadata about a processed document."""
    document_id: str = Field(..., description="Unique document identifier")
    source_path: str = Field(..., description="Original file path")
    filename: str = Field(..., description="Original filename")
    file_type: str = Field(..., description="File type (pdf/image/etc)")
    file_size_bytes: int = Field(..., ge=0, description="File size in bytes")

    num_pages: int = Field(..., ge=1, description="Total number of pages")
    page_dimensions: List[Tuple[int, int]] = Field(
        default_factory=list,
        description="(width, height) for each page"
    )

    # Timezone-aware timestamp; datetime.utcnow() is deprecated in Python 3.12+.
    created_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
    processed_at: Optional[datetime] = Field(default=None)

    total_chunks: int = Field(default=0, description="Number of chunks extracted")
    total_characters: int = Field(default=0, description="Total character count")

    detected_language: Optional[str] = Field(default=None, description="Primary language")
    language_confidence: Optional[float] = Field(default=None)

    ocr_confidence_avg: Optional[float] = Field(default=None)
    layout_confidence_avg: Optional[float] = Field(default=None)

    extra: Dict[str, Any] = Field(default_factory=dict)

class ProcessedDocument(BaseModel):
    """
    Complete processed document with all extracted information.
    This is the main output of the document processing pipeline.
    """
    metadata: DocumentMetadata = Field(..., description="Document metadata")

    ocr_regions: List[OCRRegion] = Field(
        default_factory=list,
        description="All OCR regions"
    )

    layout_regions: List[LayoutRegion] = Field(
        default_factory=list,
        description="All layout regions"
    )

    chunks: List[DocumentChunk] = Field(
        default_factory=list,
        description="Document chunks for retrieval"
    )

    full_text: str = Field(default="", description="Full text in reading order")

    status: str = Field(default="completed", description="Processing status")
    errors: List[str] = Field(default_factory=list, description="Processing errors")
    warnings: List[str] = Field(default_factory=list, description="Processing warnings")

    def get_page_chunks(self, page: int) -> List[DocumentChunk]:
        """Get all chunks for a specific page."""
        return [c for c in self.chunks if c.page == page]

    def get_chunks_by_type(self, chunk_type: ChunkType) -> List[DocumentChunk]:
        """Get all chunks of a specific type."""
        return [c for c in self.chunks if c.chunk_type == chunk_type]

    def to_json(self, indent: int = 2) -> str:
        """Serialize to a JSON string."""
        return self.model_dump_json(indent=indent)

    @classmethod
    def from_json(cls, json_str: str) -> "ProcessedDocument":
        """Deserialize from a JSON string."""
        return cls.model_validate_json(json_str)

    def save(self, path: str):
        """Save to a JSON file."""
        with open(path, "w", encoding="utf-8") as f:
            f.write(self.to_json())

    @classmethod
    def load(cls, path: str) -> "ProcessedDocument":
        """Load from a JSON file."""
        with open(path, "r", encoding="utf-8") as f:
            return cls.from_json(f.read())

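# End-to-end sketch (paths and values are placeholders): assemble a result
# and round-trip it through JSON.
#
#     doc = ProcessedDocument(
#         metadata=DocumentMetadata(
#             document_id="doc-001",
#             source_path="/data/in/report.pdf",
#             filename="report.pdf",
#             file_type="pdf",
#             file_size_bytes=48213,
#             num_pages=1,
#         ),
#         chunks=[chunk],
#     )
#     doc.save("/tmp/doc-001.json")
#     restored = ProcessedDocument.load("/tmp/doc-001.json")
#     assert restored.metadata.document_id == "doc-001"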