# document-ocr-demo/tests/test_extractor.py
# Uploaded by vincentlo using huggingface_hub (commit 2548af8, verified).
import os
import pytest
from src.extractor import AzureExtractor
@pytest.fixture
def extractor():
    """Provide an AzureExtractor configured from the environment.

    Reads ``AZURE_ENDPOINT`` and ``AZURE_KEY`` from ``os.environ``; a
    missing variable raises ``KeyError`` when the fixture is set up.
    """
    azure_endpoint = os.environ['AZURE_ENDPOINT']
    azure_key = os.environ['AZURE_KEY']
    return AzureExtractor(endpoint=azure_endpoint, key=azure_key)
def test_extract_image_in_content_mode(extractor, image_path):
    """Content-mode extraction of the image returns a one-entry dict.

    The only entry must be 'content', and its value must contain the
    text 'CREDIT APPLICATION'.
    """
    extracted = extractor.extract(image_path, mode='content')

    assert isinstance(extracted, dict)
    assert 'content' in extracted
    assert len(extracted) == 1

    text = extracted['content']
    assert 'CREDIT APPLICATION' in text
def test_extract_image_in_keypair_mode(extractor, image_path):
    """Key-pair extraction of the image returns a dict with more than one entry."""
    pairs = extractor.extract(image_path, mode='key_pair')

    assert isinstance(pairs, dict)
    entry_count = len(pairs)
    assert entry_count > 1
def test_extract_pdf_in_content_mode(extractor, pdf_path):
    """Check content-mode extraction of the sample PDF.

    The result must be a dict whose 'content' entry has a length greater
    than 10 and contains both 'CREDIT APPLICATION' and 'Student ID'.
    """
    result = extractor.extract(pdf_path, mode='content')

    # Check the type before probing for keys, as the other tests in this
    # file do, so a non-dict result (e.g. None) fails on this assertion
    # instead of raising TypeError from the membership test below.
    assert isinstance(result, dict)
    assert 'content' in result

    content = result['content']
    assert len(content) > 10
    assert 'CREDIT APPLICATION' in content
    assert 'Student ID' in content
def test_extract_pdf_in_keypair_mode(extractor, pdf_path):
    """Key-pair extraction of the PDF returns the expected phone and e-mail values.

    The 'TELEPHONE NO.' and 'Student e-mail' entries are compared against
    the exact strings expected for the sample document.
    """
    pairs = extractor.extract(pdf_path, mode='key_pair')

    assert isinstance(pairs, dict)

    telephone = pairs['TELEPHONE NO.']
    assert telephone == '(243) 555-2309'

    email = pairs['Student e-mail']
    assert email == 'john.doe@example.com'