"""
Evidence Builder for Document Grounding

Creates evidence references for extracted information.
Handles image cropping and base64 encoding.
"""

import base64
import io
from typing import Dict, List, Optional

import numpy as np
from loguru import logger
from PIL import Image
from pydantic import BaseModel, Field

from ..schemas.core import (
    BoundingBox,
    ChunkType,
    DocumentChunk,
    EvidenceRef,
    OCRRegion,
)


class GroundingConfig(BaseModel):
    """Configuration for grounding and evidence generation."""

    include_images: bool = Field(
        default=True,
        description="Include cropped images in evidence",
    )
    crop_padding: int = Field(
        default=10,
        ge=0,
        description="Padding around crop regions in pixels",
    )
    max_image_size: int = Field(
        default=512,
        ge=64,
        description="Maximum dimension for cropped images",
    )
    image_format: str = Field(
        default="PNG",
        description="Image format for encoding (PNG/JPEG)",
    )
    image_quality: int = Field(
        default=85,
        ge=1,
        le=100,
        description="JPEG quality if using JPEG format",
    )

    max_snippet_length: int = Field(
        default=200,
        ge=50,
        description="Maximum length of text snippets",
    )
    include_context: bool = Field(
        default=True,
        description="Include surrounding context in snippets",
    )
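

# Example configuration (values are illustrative, not recommendations):
# smaller JPEG crops trade evidence fidelity for payload size.
#
#     config = GroundingConfig(
#         image_format="JPEG",
#         image_quality=75,
#         max_image_size=256,
#     )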


def crop_region_image(
    image: np.ndarray,
    bbox: BoundingBox,
    padding: int = 10,
    max_size: Optional[int] = None,
) -> np.ndarray:
    """
    Crop a region from an image.

    Args:
        image: Source image (RGB, HWC format)
        bbox: Bounding box to crop
        padding: Padding around the crop
        max_size: Maximum dimension (will resize if larger)

    Returns:
        Cropped image as numpy array
    """
    height, width = image.shape[:2]

    # Clamp the padded crop window to the image bounds
    x1 = max(0, int(bbox.x_min) - padding)
    y1 = max(0, int(bbox.y_min) - padding)
    x2 = min(width, int(bbox.x_max) + padding)
    y2 = min(height, int(bbox.y_max) + padding)

    cropped = image[y1:y2, x1:x2]

    # Downscale if the crop exceeds the maximum dimension
    if max_size and max(cropped.shape[:2]) > max_size:
        pil_img = Image.fromarray(cropped)
        pil_img.thumbnail((max_size, max_size), Image.Resampling.LANCZOS)
        cropped = np.array(pil_img)

    return cropped
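

# Usage sketch (coordinates below are made up for illustration, assuming
# BoundingBox accepts keyword fields): padding is clamped to the image
# bounds, and `max_size` caps the longest side of the result.
#
#     page = np.zeros((100, 200, 3), dtype=np.uint8)
#     box = BoundingBox(x_min=20, y_min=10, x_max=120, y_max=60)
#     crop = crop_region_image(page, box, padding=5, max_size=64)
#     # max(crop.shape[:2]) <= 64 after the thumbnail resize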


def encode_image_base64(
    image: np.ndarray,
    format: str = "PNG",
    quality: int = 85,
) -> str:
    """
    Encode image to base64 string.

    Args:
        image: Image as numpy array
        format: Image format (PNG/JPEG)
        quality: JPEG quality if applicable

    Returns:
        Base64-encoded string
    """
    pil_img = Image.fromarray(image)

    # JPEG cannot store alpha; normalize everything to RGB first
    if pil_img.mode != "RGB":
        pil_img = pil_img.convert("RGB")

    buffer = io.BytesIO()
    if format.upper() == "JPEG":
        pil_img.save(buffer, format="JPEG", quality=quality)
    else:
        pil_img.save(buffer, format="PNG")

    buffer.seek(0)
    return base64.b64encode(buffer.read()).decode("utf-8")
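

# Round-trip sketch (illustrative): the encoded string decodes back to a
# loadable image, which makes for a cheap sanity check in tests.
#
#     img = np.zeros((8, 8, 3), dtype=np.uint8)
#     raw = base64.b64decode(encode_image_base64(img, format="JPEG", quality=90))
#     Image.open(io.BytesIO(raw)).size  # -> (8, 8)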


def create_evidence_ref(
    chunk: DocumentChunk,
    source_type: str = "text",
    snippet: Optional[str] = None,
    confidence: float = 1.0,
    image: Optional[np.ndarray] = None,
    config: Optional[GroundingConfig] = None,
) -> EvidenceRef:
    """
    Create an evidence reference from a document chunk.

    Args:
        chunk: Source chunk
        source_type: Type of source (text/table/figure)
        snippet: Optional specific snippet (defaults to chunk text)
        confidence: Confidence score
        image: Optional page image for cropping
        config: Grounding configuration

    Returns:
        EvidenceRef instance
    """
    config = config or GroundingConfig()

    # Default the snippet to truncated chunk text, marking truncation
    if snippet is None:
        snippet = chunk.text[:config.max_snippet_length]
        if len(chunk.text) > config.max_snippet_length:
            snippet += "..."

    evidence = EvidenceRef(
        chunk_id=chunk.chunk_id,
        page=chunk.page,
        bbox=chunk.bbox,
        source_type=source_type,
        snippet=snippet,
        confidence=confidence,
    )

    # Attach a cropped page image when available; cropping failures degrade
    # gracefully to text-only evidence
    if image is not None and config.include_images:
        try:
            cropped = crop_region_image(
                image,
                chunk.bbox,
                padding=config.crop_padding,
                max_size=config.max_image_size,
            )
            evidence.image_base64 = encode_image_base64(
                cropped,
                format=config.image_format,
                quality=config.image_quality,
            )
        except Exception as e:
            logger.warning(f"Failed to crop evidence image: {e}")

    return evidence
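

# Usage sketch (illustrative; `page_rgb` is a hypothetical RGB page array):
#
#     ref = create_evidence_ref(chunk, source_type="table", image=page_rgb)
#     # ref.snippet holds the truncated chunk text; ref.image_base64 is set
#     # only if cropping and encoding succeeded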


class EvidenceBuilder:
    """
    Builder for creating evidence references.

    Handles:
    - Evidence from chunks
    - Evidence from OCR regions
    - Evidence aggregation
    - Image cropping and encoding
    """

    def __init__(self, config: Optional[GroundingConfig] = None):
        """Initialize evidence builder."""
        self.config = config or GroundingConfig()

    def from_chunk(
        self,
        chunk: DocumentChunk,
        image: Optional[np.ndarray] = None,
        additional_context: Optional[str] = None,
    ) -> EvidenceRef:
        """
        Create evidence reference from a chunk.

        Args:
            chunk: Source chunk
            image: Optional page image for visual evidence
            additional_context: Optional additional context

        Returns:
            EvidenceRef
        """
        source_type = chunk.chunk_type.value

        # Build the snippet, prepending any caller-supplied context and
        # marking truncation with an ellipsis
        snippet = chunk.text[:self.config.max_snippet_length]
        if additional_context:
            snippet = f"{additional_context}\n{snippet}"
        if len(chunk.text) > self.config.max_snippet_length:
            snippet += "..."

        return create_evidence_ref(
            chunk=chunk,
            source_type=source_type,
            snippet=snippet,
            confidence=chunk.confidence,
            image=image,
            config=self.config,
        )

    def from_ocr_region(
        self,
        region: OCRRegion,
        chunk_id: str,
        document_id: str,
        image: Optional[np.ndarray] = None,
    ) -> EvidenceRef:
        """
        Create evidence reference from an OCR region.

        Args:
            region: OCR region
            chunk_id: ID to assign
            document_id: Parent document ID
            image: Optional page image

        Returns:
            EvidenceRef
        """
        # Wrap the OCR region in a transient chunk so the chunk path
        # handles snippeting and image cropping uniformly
        chunk = DocumentChunk(
            chunk_id=chunk_id,
            chunk_type=ChunkType.TEXT,
            text=region.text,
            bbox=region.bbox,
            page=region.page,
            document_id=document_id,
            source_path=None,
            sequence_index=0,
            confidence=region.confidence,
        )

        return self.from_chunk(chunk, image)

    def aggregate_evidence(
        self,
        evidence_list: List[EvidenceRef],
        combine_snippets: bool = True,
    ) -> List[EvidenceRef]:
        """
        Aggregate and deduplicate evidence references.

        Args:
            evidence_list: List of evidence references
            combine_snippets: Whether to combine snippets from same chunk

        Returns:
            Deduplicated evidence list
        """
        if not evidence_list:
            return []

        # Group evidence by source chunk
        by_chunk: Dict[str, List[EvidenceRef]] = {}
        for ev in evidence_list:
            by_chunk.setdefault(ev.chunk_id, []).append(ev)

        # Keep one reference per chunk, preferring the highest confidence
        result = []
        for chunk_id, evidences in by_chunk.items():
            if len(evidences) == 1:
                result.append(evidences[0])
            else:
                best = max(evidences, key=lambda e: e.confidence)
                if combine_snippets:
                    # Deduplicate snippets while preserving first-seen order
                    all_snippets = list(dict.fromkeys(e.snippet for e in evidences))
                    combined = " ... ".join(all_snippets[:3])
                    best = EvidenceRef(
                        chunk_id=best.chunk_id,
                        page=best.page,
                        bbox=best.bbox,
                        source_type=best.source_type,
                        snippet=combined[:self.config.max_snippet_length],
                        confidence=best.confidence,
                        image_base64=best.image_base64,
                    )
                result.append(best)

        # Order evidence by reading position: page, then top-to-bottom,
        # then left-to-right
        result.sort(key=lambda e: (e.page, e.bbox.y_min, e.bbox.x_min))

        return result
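
    # Dedup sketch (illustrative): two refs to the same chunk collapse into
    # one, keeping the highest-confidence ref and joining distinct snippets:
    #
    #     merged = builder.aggregate_evidence([ev_a, ev_b])
    #     assert len(merged) == 1  # when ev_a.chunk_id == ev_b.chunk_id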

    def create_grounding_context(
        self,
        evidence_list: List[EvidenceRef],
        include_images: bool = False,
    ) -> str:
        """
        Create a text context from evidence for LLM prompting.

        Args:
            evidence_list: Evidence references
            include_images: Whether to include image markers

        Returns:
            Formatted context string
        """
        if not evidence_list:
            return ""

        lines = ["Evidence from document:"]
        for i, ev in enumerate(evidence_list, 1):
            # Pages are stored zero-indexed; display them one-indexed
            lines.append(
                f"\n[{i}] Page {ev.page + 1}, {ev.source_type} "
                f"(confidence: {ev.confidence:.2f}):"
            )
            lines.append(f' "{ev.snippet}"')

            if include_images and ev.image_base64:
                lines.append(" [Image available]")

        return "\n".join(lines)