# zeroshotGPU/tests/test_marker_parser.py
# Author: Arjunvir Singh
# Commit: Initial commit: zeroshotGPU MVP with full eval surface
# Revision: db06ffa
import tempfile
import unittest
from pathlib import Path
from unittest.mock import patch
from zsgdp.config import load_config
from zsgdp.parsers.external import MarkerParser, _read_external_markdown, _read_marker_markdown, normalize_marker_markdown
from zsgdp.schema import DocumentProfile, PageProfile
class MarkerParserTests(unittest.TestCase):
    """Tests for ``MarkerParser`` and the markdown helpers in ``zsgdp.parsers.external``."""

    @staticmethod
    def _make_profile() -> "DocumentProfile":
        """Return the one-page PDF profile shared by the parser tests.

        A fresh instance is built on every call so tests never share state.
        """
        return DocumentProfile(
            doc_id="d1",
            source_path="sample.pdf",
            file_type="pdf",
            page_count=1,
            extension=".pdf",
            pages=[PageProfile(page_num=1, digital_text_chars=20)],
        )

    def test_normalize_marker_markdown_emits_common_schema(self):
        """Markdown with one pipe table and one image yields one table and one figure."""
        candidate = normalize_marker_markdown(
            markdown="# Report\n\n| A | B |\n| --- | --- |\n| 1 | 2 |\n\n![Chart](chart.png)",
            profile=self._make_profile(),
            source_path="sample.pdf",
        )

        self.assertEqual(candidate.parser_name, "marker")
        self.assertEqual(len(candidate.tables), 1)
        self.assertEqual(len(candidate.figures), 1)
        # Fail with a readable message instead of an IndexError when empty.
        self.assertTrue(candidate.pages, "normalizer produced no pages")
        self.assertEqual(candidate.pages[0]["source_parser"], "marker")

    def test_marker_parser_runs_markdown_through_normalizer(self):
        """``MarkerParser.parse`` tags its output as coming from "marker".

        ``available`` is patched to return True and ``run_marker_to_markdown``
        is patched to return fixed markdown, so the test supplies the parser's
        markdown input itself.
        """
        profile = self._make_profile()

        with patch.object(MarkerParser, "available", return_value=True), patch(
            "zsgdp.parsers.external.run_marker_to_markdown",
            return_value="# Report\n\nBody.",
        ):
            candidate = MarkerParser().parse("sample.pdf", profile, load_config())

        self.assertEqual(candidate.parser_name, "marker")
        # Fail with a readable message instead of an IndexError when empty.
        self.assertTrue(candidate.elements, "parser produced no elements")
        self.assertEqual(candidate.elements[0].source_parser, "marker")
        self.assertEqual(candidate.provenance["requested_pages"], [1])

    def test_read_marker_markdown_prefers_markdown_file(self):
        """With ``other.md`` and ``markdown.md`` side by side, ``markdown.md`` is read."""
        with tempfile.TemporaryDirectory() as tmp:
            root = Path(tmp)
            nested = root / "sample"
            nested.mkdir()
            (nested / "other.md").write_text("# Other", encoding="utf-8")
            (nested / "markdown.md").write_text("# Preferred", encoding="utf-8")

            markdown = _read_marker_markdown(root)

        self.assertEqual(markdown, "# Preferred")

    def test_read_external_markdown_falls_back_to_stdout(self):
        """An empty output directory makes the reader return the given stdout text."""
        with tempfile.TemporaryDirectory() as tmp:
            markdown = _read_external_markdown(
                Path(tmp), parser_name="mineru", stdout="# From stdout"
            )

        self.assertEqual(markdown, "# From stdout")
# Run this module's tests when the file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()