# Source: dots-ocr-idcard/src/kybtech_dots_ocr/field_extraction.py
# Last commit 5537ceb (tommulder): "style: format Python files with Black"
"""Field extraction utilities for OCR text processing.
This module provides field extraction and mapping from OCR results
to structured KYB field formats.
"""
import re
from typing import Optional
from .api_models import ExtractedField, IdCardFields, MRZData
class FieldExtractor:
"""Field extraction and mapping from OCR results."""
# Field mapping patterns for Dutch ID cards
FIELD_PATTERNS = {
"document_number": [
r"documentnummer[:\s]*([A-Z0-9]+)",
r"document\s*number[:\s]*([A-Z0-9]+)",
r"nr[:\s]*([A-Z0-9]+)",
],
"surname": [
r"achternaam[:\s]*([A-Z]+)",
r"surname[:\s]*([A-Z]+)",
r"family\s*name[:\s]*([A-Z]+)",
],
"given_names": [
r"voornamen[:\s]*([A-Z]+)",
r"given\s*names[:\s]*([A-Z]+)",
r"first\s*name[:\s]*([A-Z]+)",
],
"nationality": [
r"nationaliteit[:\s]*([A-Za-z]+)",
r"nationality[:\s]*([A-Za-z]+)",
],
"date_of_birth": [
r"geboortedatum[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})",
r"date\s*of\s*birth[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})",
r"born[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})",
],
"gender": [r"geslacht[:\s]*([MF])", r"gender[:\s]*([MF])", r"sex[:\s]*([MF])"],
"place_of_birth": [
r"geboorteplaats[:\s]*([A-Za-z\s]+)",
r"place\s*of\s*birth[:\s]*([A-Za-z\s]+)",
r"born\s*in[:\s]*([A-Za-z\s]+)",
],
"date_of_issue": [
r"uitgiftedatum[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})",
r"date\s*of\s*issue[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})",
r"issued[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})",
],
"date_of_expiry": [
r"vervaldatum[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})",
r"date\s*of\s*expiry[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})",
r"expires[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})",
],
"personal_number": [
r"persoonsnummer[:\s]*(\d{9})",
r"personal\s*number[:\s]*(\d{9})",
r"bsn[:\s]*(\d{9})",
],
}
@classmethod
def extract_fields(cls, ocr_text: str) -> IdCardFields:
"""Extract structured fields from OCR text.
Args:
ocr_text: Raw OCR text from document processing
Returns:
IdCardFields object with extracted field data
"""
fields = {}
for field_name, patterns in cls.FIELD_PATTERNS.items():
value = None
confidence = 0.0
for pattern in patterns:
match = re.search(pattern, ocr_text, re.IGNORECASE)
if match:
value = match.group(1).strip()
confidence = 0.8 # Base confidence for pattern match
break
if value:
fields[field_name] = ExtractedField(
field_name=field_name,
value=value,
confidence=confidence,
source="ocr",
)
return IdCardFields(**fields)
@classmethod
def extract_mrz(cls, ocr_text: str) -> Optional[MRZData]:
"""Extract MRZ data from OCR text.
Args:
ocr_text: Raw OCR text from document processing
Returns:
MRZData object if MRZ detected, None otherwise
"""
# Look for MRZ patterns (TD1, TD2, TD3)
mrz_patterns = [
r"(P<[A-Z0-9<]+\n[A-Z0-9<]+)", # Generic passport format (try first)
r"([A-Z0-9<]{30}\n[A-Z0-9<]{30})", # TD1 format
r"([A-Z0-9<]{44}\n[A-Z0-9<]{44})", # TD2 format
r"([A-Z0-9<]{44}\n[A-Z0-9<]{44}\n[A-Z0-9<]{44})", # TD3 format
]
for pattern in mrz_patterns:
match = re.search(pattern, ocr_text, re.MULTILINE)
if match:
raw_mrz = match.group(1)
# Basic MRZ parsing (simplified)
return MRZData(
raw_text=raw_mrz,
format_type="TD3" if len(raw_mrz.split("\n")) == 3 else "TD2",
is_valid=True, # Assume valid if present
checksum_errors=[], # Not implemented in basic version
confidence=0.9,
)
return None