Spaces:
Paused
Paused
| """Field extraction utilities for OCR text processing. | |
| This module provides field extraction and mapping from OCR results | |
| to structured KYB field formats. | |
| """ | |
| import re | |
| from typing import Optional | |
| from .api_models import ExtractedField, IdCardFields, MRZData | |
class FieldExtractor:
    """Field extraction and mapping from OCR results.

    All public entry points are classmethods, so the class is used without
    instantiation: ``FieldExtractor.extract_fields(text)`` and
    ``FieldExtractor.extract_mrz(text)``.
    """

    # Field mapping patterns for Dutch ID cards.
    #
    # Each field maps to an ordered list of regular expressions (Dutch label
    # first, then English variants).  Patterns are applied case-insensitively
    # and capture group 1 is the field value.  ``[:\s]*`` after the label
    # deliberately allows the value to sit on the line below its label.
    FIELD_PATTERNS = {
        "document_number": [
            r"documentnummer[:\s]*([A-Z0-9]+)",
            r"document\s*number[:\s]*([A-Z0-9]+)",
            r"nr[:\s]*([A-Z0-9]+)",
        ],
        "surname": [
            r"achternaam[:\s]*([A-Z]+)",
            r"surname[:\s]*([A-Z]+)",
            r"family\s*name[:\s]*([A-Z]+)",
        ],
        "given_names": [
            r"voornamen[:\s]*([A-Z]+)",
            r"given\s*names[:\s]*([A-Z]+)",
            r"first\s*name[:\s]*([A-Z]+)",
        ],
        "nationality": [
            r"nationaliteit[:\s]*([A-Za-z]+)",
            r"nationality[:\s]*([A-Za-z]+)",
        ],
        "date_of_birth": [
            r"geboortedatum[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})",
            r"date\s*of\s*birth[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})",
            r"born[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})",
        ],
        "gender": [r"geslacht[:\s]*([MF])", r"gender[:\s]*([MF])", r"sex[:\s]*([MF])"],
        # The captured value is restricted to a single line (letters, spaces,
        # tabs).  Using ``\s`` inside the capture would let it run across the
        # line break and swallow the next label (e.g. "Amsterdam\nGeslacht").
        "place_of_birth": [
            r"geboorteplaats[:\s]*([A-Za-z][A-Za-z \t]*)",
            r"place\s*of\s*birth[:\s]*([A-Za-z][A-Za-z \t]*)",
            r"born\s*in[:\s]*([A-Za-z][A-Za-z \t]*)",
        ],
        "date_of_issue": [
            r"uitgiftedatum[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})",
            r"date\s*of\s*issue[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})",
            r"issued[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})",
        ],
        "date_of_expiry": [
            r"vervaldatum[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})",
            r"date\s*of\s*expiry[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})",
            r"expires[:\s]*(\d{2}[./-]\d{2}[./-]\d{4})",
        ],
        "personal_number": [
            r"persoonsnummer[:\s]*(\d{9})",
            r"personal\s*number[:\s]*(\d{9})",
            r"bsn[:\s]*(\d{9})",
        ],
    }

    # MRZ layouts as (format label, pattern), tried in order.
    #
    # The first three are the ICAO Doc 9303 layouts, anchored to whole lines
    # (the patterns are searched with re.MULTILINE) so that a 30-character
    # pattern cannot match a slice of a longer line:
    #   TD1 = 3 lines x 30 chars, TD3 = 2 lines x 44 chars,
    #   TD2 = 2 lines x 36 chars.
    # The last two are the original loose patterns, kept as a best-effort
    # fallback for imperfect OCR so that text which used to be detected still
    # is.  A "P<" prefix marks a passport, which uses the TD3 layout; the
    # loose two-line match keeps its legacy "TD2" label.
    _MRZ_PATTERNS = (
        ("TD1", r"^([A-Z0-9<]{30}\n[A-Z0-9<]{30}\n[A-Z0-9<]{30})$"),
        ("TD3", r"^([A-Z0-9<]{44}\n[A-Z0-9<]{44})$"),
        ("TD2", r"^([A-Z0-9<]{36}\n[A-Z0-9<]{36})$"),
        ("TD3", r"(P<[A-Z0-9<]+\n[A-Z0-9<]+)"),
        ("TD2", r"([A-Z0-9<]{30}\n[A-Z0-9<]{30})"),
    )

    # Base confidence assigned to any value found by a label pattern.
    _FIELD_CONFIDENCE = 0.8
    # Confidence assigned to a detected MRZ block.
    _MRZ_CONFIDENCE = 0.9

    @staticmethod
    def _first_match(patterns: list, text: str) -> Optional[str]:
        """Return the first non-empty capture produced by *patterns*, in order.

        Args:
            patterns: Regular expressions whose group 1 is the field value
            text: Text to search (case-insensitive)

        Returns:
            The stripped captured value, or None if no pattern yields one
        """
        for pattern in patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                value = match.group(1).strip()
                # An empty capture is not a value; keep trying later patterns.
                if value:
                    return value
        return None

    @classmethod
    def _find_mrz(cls, ocr_text: str) -> Optional[tuple]:
        """Locate an MRZ block and return ``(raw_text, format_type)``.

        Returns None when no pattern in ``_MRZ_PATTERNS`` matches.
        """
        for format_type, pattern in cls._MRZ_PATTERNS:
            match = re.search(pattern, ocr_text, re.MULTILINE)
            if match:
                return match.group(1), format_type
        return None

    @classmethod
    def extract_fields(cls, ocr_text: str) -> "IdCardFields":
        """Extract structured fields from OCR text.

        For every entry in ``FIELD_PATTERNS`` the patterns are tried in order
        and the first one that yields a value wins.  Fields with no match are
        simply omitted from the result.

        Args:
            ocr_text: Raw OCR text from document processing

        Returns:
            IdCardFields object with extracted field data
        """
        text = ocr_text or ""  # treat missing OCR output as "nothing found"
        fields = {}
        for field_name, patterns in cls.FIELD_PATTERNS.items():
            value = cls._first_match(patterns, text)
            if value is None:
                continue
            fields[field_name] = ExtractedField(
                field_name=field_name,
                value=value,
                confidence=cls._FIELD_CONFIDENCE,
                source="ocr",
            )
        return IdCardFields(**fields)

    @classmethod
    def extract_mrz(cls, ocr_text: str) -> Optional["MRZData"]:
        """Extract MRZ data from OCR text.

        Only detection is performed: the MRZ is not parsed and its check
        digits are not verified, so ``is_valid`` is always True and
        ``checksum_errors`` is always empty for a detected block.

        Args:
            ocr_text: Raw OCR text from document processing

        Returns:
            MRZData object if MRZ detected, None otherwise
        """
        if not ocr_text:
            return None
        found = cls._find_mrz(ocr_text)
        if found is None:
            return None
        raw_mrz, format_type = found
        # Basic MRZ handling (simplified)
        return MRZData(
            raw_text=raw_mrz,
            format_type=format_type,
            is_valid=True,  # Assume valid if present
            checksum_errors=[],  # Not implemented in basic version
            confidence=cls._MRZ_CONFIDENCE,
        )