# Spaces: Running on Zero
| import tempfile | |
| import unittest | |
| from pathlib import Path | |
| from unittest.mock import patch | |
| from zsgdp.config import load_config | |
| from zsgdp.parsers.external import MarkerParser, _read_external_markdown, _read_marker_markdown, normalize_marker_markdown | |
| from zsgdp.schema import DocumentProfile, PageProfile | |
class MarkerParserTests(unittest.TestCase):
    """Tests for the Marker adapter and markdown readers in ``zsgdp.parsers.external``."""

    @staticmethod
    def _single_page_pdf_profile():
        """Return the one-page PDF ``DocumentProfile`` fixture used by the parser tests."""
        return DocumentProfile(
            doc_id="d1",
            source_path="sample.pdf",
            file_type="pdf",
            page_count=1,
            extension=".pdf",
            pages=[PageProfile(page_num=1, digital_text_chars=20)],
        )

    def test_normalize_marker_markdown_emits_common_schema(self):
        """Normalized Marker markdown reports parser name, tables, figures and page origin."""
        doc_profile = self._single_page_pdf_profile()
        # NOTE(review): this literal contains a heading and a table but no visible
        # image reference, while exactly one figure is asserted below -- confirm the
        # markdown was not truncated when this file was extracted.
        markdown_text = "# Report\n\n| A | B |\n| --- | --- |\n| 1 | 2 |\n\n"

        result = normalize_marker_markdown(
            markdown=markdown_text,
            profile=doc_profile,
            source_path="sample.pdf",
        )

        self.assertEqual(result.parser_name, "marker")
        self.assertEqual(len(result.tables), 1)
        self.assertEqual(len(result.figures), 1)
        first_page = result.pages[0]
        self.assertEqual(first_page["source_parser"], "marker")

    def test_marker_parser_runs_markdown_through_normalizer(self):
        """``MarkerParser.parse`` turns the (patched) Marker markdown into a candidate."""
        doc_profile = self._single_page_pdf_profile()

        # Pretend Marker is installed and stub the markdown conversion so the
        # test never invokes the real external tool.
        with patch.object(MarkerParser, "available", return_value=True):
            with patch(
                "zsgdp.parsers.external.run_marker_to_markdown",
                return_value="# Report\n\nBody.",
            ):
                parser = MarkerParser()
                result = parser.parse("sample.pdf", doc_profile, load_config())

        self.assertEqual(result.parser_name, "marker")
        leading_element = result.elements[0]
        self.assertEqual(leading_element.source_parser, "marker")
        self.assertEqual(result.provenance["requested_pages"], [1])

    def test_read_marker_markdown_prefers_markdown_file(self):
        """With ``other.md`` and ``markdown.md`` side by side, ``markdown.md`` is returned."""
        with tempfile.TemporaryDirectory() as scratch:
            output_root = Path(scratch)
            sample_dir = output_root / "sample"
            sample_dir.mkdir()
            # Written in the same order as before: the decoy first, then the
            # file that is expected to win.
            for file_name, content in (
                ("other.md", "# Other"),
                ("markdown.md", "# Preferred"),
            ):
                (sample_dir / file_name).write_text(content, encoding="utf-8")

            found = _read_marker_markdown(output_root)

        self.assertEqual(found, "# Preferred")

    def test_read_external_markdown_falls_back_to_stdout(self):
        """An output directory with no files yields the captured stdout text."""
        with tempfile.TemporaryDirectory() as scratch:
            empty_dir = Path(scratch)
            found = _read_external_markdown(
                empty_dir,
                parser_name="mineru",
                stdout="# From stdout",
            )

        self.assertEqual(found, "# From stdout")
if __name__ == "__main__":
    # Let the module be executed directly as a script to run its tests.
    unittest.main()