Spaces:
Paused
Paused
File size: 3,023 Bytes
211e423 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 |
"""Tests for field extraction functionality."""
import pytest
from src.kybtech_dots_ocr.enhanced_field_extraction import EnhancedFieldExtractor
class TestEnhancedFieldExtractor:
    """Test cases for EnhancedFieldExtractor.

    Every test creates its own extractor instance and feeds it a small,
    hand-written text sample, then checks the attributes of the returned
    result object.
    """

    def test_extract_fields_dutch_id(self):
        """Test field extraction with Dutch ID card text.

        Checks that the document number, surname and given names are picked
        up from Dutch-labelled lines.
        """
        extractor = EnhancedFieldExtractor()
        text = (
            "\n"
            "IDENTITEITSKAART\n"
            "Documentnummer: NLD123456789\n"
            "Achternaam: MULDER\n"
            "Voornamen: THOMAS JAN\n"
            "Nationaliteit: NLD\n"
            "Geboortedatum: 15-03-1990\n"
            "Geslacht: M\n"
        )

        fields = extractor.extract_fields(text)

        assert fields.document_number is not None
        assert fields.document_number.value == "NLD123456789"
        assert fields.surname is not None
        assert fields.surname.value == "MULDER"
        assert fields.given_names is not None
        assert fields.given_names.value == "THOMAS JAN"

    def test_extract_fields_english_id(self):
        """Test field extraction with English ID card text.

        Checks that the document number and surname are picked up from
        English-labelled lines.
        """
        extractor = EnhancedFieldExtractor()
        text = (
            "\n"
            "IDENTITY CARD\n"
            "Document Number: NLD123456789\n"
            "Surname: MULDER\n"
            "Given Names: THOMAS JAN\n"
            "Nationality: NLD\n"
            "Date of Birth: 15-03-1990\n"
            "Gender: M\n"
        )

        fields = extractor.extract_fields(text)

        assert fields.document_number is not None
        assert fields.document_number.value == "NLD123456789"
        assert fields.surname is not None
        assert fields.surname.value == "MULDER"

    def test_extract_mrz_data(self):
        """Test MRZ data extraction.

        Expects a result whose ``format_type`` is ``"TD3"`` and whose
        ``confidence`` is above 0.8 for a two-line machine-readable-zone
        sample.
        """
        extractor = EnhancedFieldExtractor()
        # Synthetic sample; its check digits and line lengths are not
        # validated by this test.
        text = (
            "\n"
            "P<NLDMULDER<<THOMAS<<<<<<<<<<<<<<<<<<<<<<<<<\n"
            "NLD123456789NLD9003151M300101123456789<<<<<<<<\n"
        )

        mrz_data = extractor.extract_mrz(text)

        assert mrz_data is not None
        assert mrz_data.format_type == "TD3"
        assert mrz_data.confidence > 0.8

    def test_extract_fields_empty_text(self):
        """Test field extraction with empty text.

        An empty input string must yield no document number and no surname.
        """
        extractor = EnhancedFieldExtractor()

        fields = extractor.extract_fields("")

        # Should return empty fields
        assert fields.document_number is None
        assert fields.surname is None

    def test_confidence_scoring(self):
        """Test confidence scoring functionality.

        Compares the document-number confidence of a fully labelled sample
        against an abbreviated one; the former must score at least as high.
        The test is reported as skipped (rather than silently passing) when
        either sample produces no document number to compare.
        """
        extractor = EnhancedFieldExtractor()

        # High quality text
        high_quality = "Documentnummer: NLD123456789 Achternaam: MULDER"
        fields_high = extractor.extract_fields(high_quality)

        # Lower quality text
        low_quality = "doc nr: NLD123"
        fields_low = extractor.extract_fields(low_quality)

        doc_high = fields_high.document_number
        doc_low = fields_low.document_number

        # Same truthiness guard as before, but surface the "nothing to
        # compare" case in the test report instead of passing vacuously.
        if not (doc_high and doc_low):
            pytest.skip(
                "document number not extracted from both samples; "
                "confidence comparison not possible"
            )

        assert doc_high.confidence >= doc_low.confidence
|