# zeroshotGPU / tests / test_external_parser_adapters.py
# Arjunvir Singh
# Initial commit: zeroshotGPU MVP with full eval surface
# db06ffa
import unittest
from unittest.mock import patch
from zsgdp.config import load_config
from zsgdp.normalize.normalize_unstructured import normalize_unstructured_parts
from zsgdp.parsers.external import MinerUParser, OlmOCRParser, PaddleOCRParser
from zsgdp.schema import DocumentProfile, PageProfile
class ExternalParserAdapterTests(unittest.TestCase):
    """Tests for the external parser adapters and the unstructured normalizer."""

    def test_command_backed_parsers_normalize_markdown(self):
        """Run each command-backed parser against canned markdown output.

        For every adapter, ``available`` is patched to return True and
        ``run_external_parser_to_markdown`` is patched to return a fixed
        markdown document (one heading plus one table). The resulting
        candidate is then checked for parser name, element provenance,
        table count and requested pages.
        """
        # Canned markdown handed back by the patched external runner.
        stub_markdown = "# Report\n\n| A | B |\n| --- | --- |\n| 1 | 2 |"
        runner_target = "zsgdp.parsers.external.run_external_parser_to_markdown"
        adapters = (
            (MinerUParser, "mineru"),
            (OlmOCRParser, "olmocr"),
            (PaddleOCRParser, "paddleocr"),
        )
        doc_profile = _profile()

        for adapter_cls, expected_name in adapters:
            # Context managers are entered in the same order as a single
            # combined ``with``: subTest, then the two patches.
            with self.subTest(parser=expected_name):
                with patch.object(adapter_cls, "available", return_value=True):
                    with patch(runner_target, return_value=stub_markdown):
                        result = adapter_cls().parse("sample.pdf", doc_profile, load_config())

                        self.assertEqual(result.parser_name, expected_name)
                        self.assertEqual(result.elements[0].source_parser, expected_name)
                        self.assertEqual(len(result.tables), 1)
                        self.assertEqual(result.provenance["requested_pages"], [1])

    def test_unstructured_normalizer_preserves_page_and_title_metadata(self):
        """Feed two stand-in parts to the normalizer and inspect the first element.

        The stand-ins expose only ``category``, ``metadata.page_number`` and
        ``__str__``; the first one is a title on page 2.
        """

        # Minimal stand-ins for the parts passed to the normalizer.
        class Metadata:
            page_number = 2

        class Title:
            category = "Title"
            metadata = Metadata()

            def __str__(self):
                return "Executive Summary"

        class Narrative:
            category = "NarrativeText"
            metadata = Metadata()

            def __str__(self):
                return "The document parser keeps provenance."

        fake_parts = [Title(), Narrative()]
        result = normalize_unstructured_parts(
            parts=fake_parts,
            profile=_profile(),
            source_path="sample.pdf",
        )
        leading = result.elements[0]

        self.assertEqual(result.parser_name, "unstructured")
        self.assertEqual(leading.page_num, 2)
        self.assertEqual(leading.type, "title")
        self.assertEqual(leading.markdown, "# Executive Summary")
def _profile():
    """Return a minimal profile for a one-page PDF named ``sample.pdf``."""
    # The single page is numbered 1 and reports 20 digital text characters.
    sole_page = PageProfile(page_num=1, digital_text_chars=20)
    return DocumentProfile(
        doc_id="d1",
        source_path="sample.pdf",
        file_type="pdf",
        page_count=1,
        extension=".pdf",
        pages=[sole_page],
    )
# Run the test suite when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()