Arjunvir Singh committed
Commit db06ffa · 0 parents

Initial commit: zeroshotGPU MVP with full eval surface


Profiler, router, parser registry, schema, merger with conflict detection,
verifier (coverage/reading-order/table/figure/formula/chunk-readiness plus
GT-comparison: layout F1, table structure, formula CER, retrieval recall),
iterative repair loop with optional GPU escalation, agentic chunker,
benchmark suite (per-doc + per-parser + ablation + cross-dataset),
Gradio Spaces UI with abuse guards + per-artifact downloads, structured
JSON logging, preflight runner, regression-fixture format with perf
floors, .env loading, pre-commit/pre-push hooks, CONTRIBUTING.md +
docs/space_smoke.md, scripts/run_space_smoke.py runner.

Test count: 240/240 passing.

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full set.
Files changed (50)
  1. .env.example +16 -0
  2. .gitignore +15 -0
  3. .pre-commit-config.yaml +44 -0
  4. CHANGELOG.md +274 -0
  5. CONTRIBUTING.md +235 -0
  6. Makefile +49 -0
  7. README.md +287 -0
  8. app.py +251 -0
  9. configs/default.yaml +159 -0
  10. configs/docling.yaml +29 -0
  11. configs/gpu.yaml +43 -0
  12. configs/parsers.yaml +33 -0
  13. configs/routing.yaml +8 -0
  14. docs/space_smoke.md +269 -0
  15. examples/parse_folder.py +27 -0
  16. examples/parse_pdf.py +25 -0
  17. examples/run_benchmark.py +33 -0
  18. pyproject.toml +41 -0
  19. requirements.txt +33 -0
  20. scripts/__init__.py +0 -0
  21. scripts/run_space_smoke.py +455 -0
  22. tests/__init__.py +1 -0
  23. tests/regression/README.md +97 -0
  24. tests/regression/__init__.py +0 -0
  25. tests/regression/fixtures/markdown_basic.expected.json +31 -0
  26. tests/regression/fixtures/markdown_basic.input.md +14 -0
  27. tests/regression/test_regression.py +255 -0
  28. tests/test_ablation_runner.py +133 -0
  29. tests/test_app.py +141 -0
  30. tests/test_artifacts.py +82 -0
  31. tests/test_benchmark.py +55 -0
  32. tests/test_chunking.py +286 -0
  33. tests/test_cli_help.py +91 -0
  34. tests/test_conflict_detection.py +89 -0
  35. tests/test_cross_dataset.py +123 -0
  36. tests/test_datasets.py +152 -0
  37. tests/test_deployment.py +43 -0
  38. tests/test_docling_parser.py +39 -0
  39. tests/test_embedding_retriever.py +190 -0
  40. tests/test_env_loading.py +110 -0
  41. tests/test_external_parser_adapters.py +69 -0
  42. tests/test_gpu_runner.py +185 -0
  43. tests/test_gpu_runtime.py +47 -0
  44. tests/test_gpu_tasks.py +99 -0
  45. tests/test_layout_f1.py +190 -0
  46. tests/test_logging.py +125 -0
  47. tests/test_markdown_normalizer.py +63 -0
  48. tests/test_marker_parser.py +73 -0
  49. tests/test_merge.py +134 -0
  50. tests/test_parser_disagreement.py +177 -0
.env.example ADDED
@@ -0,0 +1,16 @@
+ # Copy to .env and fill in. .env is gitignored; .env.example is committed.
+ # Loaded automatically by zsgdp.config.load_env_file() when CLI / app starts.
+
+ # Hugging Face Hub access token. Required for gated models like jina-v3
+ # (the embedding retriever) and any private model id used in gpu.models.
+ # Read transparently by transformers / sentence-transformers when set.
+ HF_TOKEN=
+
+ # Logging — see zsgdp/logging_config.py.
+ # ZSGDP_LOG_LEVEL=INFO
+ # ZSGDP_LOG_JSON=1
+
+ # Pipeline overrides.
+ # ZSGDP_CONFIG_PATH=configs/docling.yaml
+ # ZSGDP_MAX_UPLOAD_BYTES=52428800
+ # ZSGDP_MAX_PAGE_COUNT=200
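
A minimal sketch of the non-overriding loader this file assumes (illustrative; the real `zsgdp.config.load_env_file()` ships in this commit but is outside the 50-file view):

```python
import os
from pathlib import Path


def load_env_file(path: str = ".env") -> None:
    # Pre-set environment variables always win, so Space-side secrets
    # are never overridden by a stray local file.
    env = Path(path)
    if not env.exists():
        return
    for line in env.read_text().splitlines():
        line = line.strip()
        if not line or line.startswith("#") or "=" not in line:
            continue
        key, value = line.split("=", 1)
        os.environ.setdefault(key.strip(), value.strip())
```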
.gitignore ADDED
@@ -0,0 +1,15 @@
+ __pycache__/
+ *.py[cod]
+ .pytest_cache/
+ .mypy_cache/
+ .ruff_cache/
+ .venv/
+ venv/
+ out/
+ parsed/
+ benchmarks/results/
+
+ # Secrets — never commit. Loaded by zsgdp.config.load_env_file() at runtime.
+ .env
+ .env.*
+ !.env.example
.pre-commit-config.yaml ADDED
@@ -0,0 +1,44 @@
+ # Pre-commit and pre-push hooks for zeroshotGPU.
+ #
+ # Install once with:
+ #   python -m pip install pre-commit
+ #   pre-commit install --install-hooks --hook-type pre-commit --hook-type pre-push
+ #
+ # pre-commit runs only fast static checks on every commit so the developer
+ # loop stays tight. The slow `preflight` runs at pre-push time so it gates
+ # what reaches the remote without slowing down individual commits.
+
+ default_language_version:
+   python: python3.11
+
+ repos:
+   - repo: https://github.com/pre-commit/pre-commit-hooks
+     rev: v5.0.0
+     hooks:
+       - id: trailing-whitespace
+         stages: [pre-commit]
+       - id: end-of-file-fixer
+         stages: [pre-commit]
+       - id: check-yaml
+         stages: [pre-commit]
+         # The simple YAML in configs/*.yaml uses a tiny subset; check-yaml
+         # is fine. `app_file` etc. in README.md aren't real YAML headers
+         # — they're HF Spaces front-matter and excluded from this hook.
+         exclude: ^README\.md$
+       - id: check-json
+         stages: [pre-commit]
+       - id: check-added-large-files
+         stages: [pre-commit]
+         args: ["--maxkb=2048"]
+       - id: check-merge-conflict
+         stages: [pre-commit]
+
+   - repo: local
+     hooks:
+       - id: zsgdp-preflight
+         name: zsgdp preflight (unit + regression + space-check + parsers)
+         entry: python -m zsgdp.cli preflight --root .
+         language: system
+         pass_filenames: false
+         stages: [pre-push]
+         always_run: true
CHANGELOG.md ADDED
@@ -0,0 +1,274 @@
+ # Changelog
+
+ All notable changes to zeroshotGPU. Format follows
+ [Keep a Changelog](https://keepachangelog.com/en/1.1.0/); versions follow
+ [Semantic Versioning](https://semver.org/spec/v2.0.0.html), but the project is
+ pre-1.0 so minor bumps may include breaking changes.
+
+ ## [Unreleased]
+
+ ### Documentation — README restructured
+
+ - Reorganised into Install → Quick start → Opt-ins → Outputs →
+   Architecture map → Production benchmark numbers → Deployment →
+   Contributing.
+ - New "Production benchmark numbers" placeholder table with §29
+   success criteria recalled inline; columns are
+   `Metric / Dataset / Value / Date / Run` so the operator pastes real
+   numbers in after running `make space-smoke` and `make benchmark`
+   on the Space.
+ - Optional-extras table (`embedding`, `gpu_repair`, `spaces`)
+   documents what each extra adds and the config flag that requires it.
+ - Architecture quick-map turned into a table; one row per top-level
+   module with its responsibility.
+ - Deployment section is now a numbered checklist that ends with
+   "update the production-benchmark table."
+
+ ### Added — Space smoke validation runner
+
+ - `scripts/run_space_smoke.py` automates the five smokes documented in
+   `docs/space_smoke.md`. One command runs whichever smokes have their
+   deps installed; missing deps surface as `skip` results with explicit
+   `pip install` hints, not crashes.
+ - Five smokes: `lexical` (model-free benchmark), `ablation` (per-parser
+   runner), `embedding` (sentence-transformers + jina-v3 lazy-load
+   probe), `gpu_repair` (dry-run plan + repair-loop iteration check —
+   *does not* download multi-GB Qwen weights, defers live invocation
+   to `run-gpu-tasks --execute`), `marker` (binary detection +
+   registry availability).
+ - `--strict` mode treats skipped smokes as failures; `--output PATH`
+   emits a structured JSON report with per-smoke `detail`, elapsed
+   seconds, status (`pass`/`fail`/`skip`/`error`), and aggregate
+   summary counts.
+ - 14 new tests covering registry membership, report aggregation,
+   text formatting per status, strict-mode skip-as-failure, end-to-end
+   smoke execution for the three model-free smokes, and skip-path
+   structure for the model-dependent ones.
+
+ ### Added — per-artifact downloads in the Space UI
+
+ - New "Artifacts" tab in `app.py` exposes each top-level artifact
+   (`parsed_document.json`, `document.md`, `chunks.jsonl`,
+   `quality_report.json`, etc. — 16 candidate files) as an individual
+   download via `gr.Files`. The bundled zip stays as it was for
+   archival, and nested asset dirs (`assets/pages/*.png`,
+   `assets/tables/*.png`) are intentionally excluded from the
+   per-artifact list — they can be large and the zip already covers
+   them.
+ - The artifact list is built from `_INDIVIDUAL_ARTIFACT_NAMES` in
+   declaration order so the UI listing is stable across runs. Missing
+   files are silently skipped (different parses emit different subsets;
+   e.g. `conflict_report.json` only when multiple parsers ran).
+ - All return paths in `parse_uploaded_document` now go through a
+   single `_empty_outputs(...)` helper so the tuple width can't drift
+   between success and the four error paths. A new drift-guard test
+   asserts `len(outputs) == 11` for every error path (see the sketch
+   after this section).
+ - Summary JSON now includes `individual_artifact_count`.
+
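+ A minimal sketch of that drift guard (test name and the way the error
+ path is driven are illustrative; the 11-wide tuple contract comes from
+ the `app.py` committed in this change):
+
+ ```python
+ import unittest
+
+ import app  # the Space entrypoint from this commit; needs gradio installed
+
+
+ class TestOutputWidth(unittest.TestCase):
+     def test_empty_outputs_matches_ui_width(self):
+         # _empty_outputs centralises every error return; the Gradio click
+         # handler wires exactly 11 output components, so the tuple must
+         # stay 11-wide.
+         outputs = app._empty_outputs("boom", None, rejected=False, runtime={})
+         self.assertEqual(len(outputs), 11)
+ ```
+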
+ ### Added — CLI help with examples
+
+ - Each non-trivial CLI subcommand (`parse`, `parse-folder`, `space-check`,
+   `run-gpu-tasks`, `benchmark`, `benchmark-ablate`, `preflight`,
+   `combine-benchmarks`, `export-chunks`, `validate-artifacts`, plus the
+   top-level help) now ships with an `Examples:` block in its `--help`
+   output. Multi-line shell snippets render via
+   `argparse.RawDescriptionHelpFormatter` + a textwrap-dedent helper so
+   the source-side indentation doesn't leak into the rendered output.
+ - `zsgdp run-gpu-tasks --help` now explicitly contrasts the dry-run
+   default against `--execute`, matching the safety contract of
+   `repair.execute_gpu_escalations` in config.
+ - 9 new tests guarding: the epilog dedent helper, blank-line preservation
+   in epilogs, top-level help listing examples, and per-subcommand
+   examples covering their distinguishing flags (e.g. `benchmark` shows
+   all three dataset modes; `combine-benchmarks` shows label pairing).
+
+ ### Added — contributor onboarding
+
+ - `CONTRIBUTING.md` documenting setup, hooks, test layout, fixture
+   format, parser/metric/schema-bump procedure, logging conventions,
+   PR checklist, and an architecture quick-map.
+ - `.pre-commit-config.yaml` with two stages:
+   - **pre-commit**: trailing whitespace, end-of-file fixer, JSON/YAML
+     syntax, large-file guard (2 MB cap), merge-conflict markers.
+   - **pre-push**: runs `python -m zsgdp.cli preflight` so failing
+     preflight blocks the push. External hook repo is pinned to a
+     specific tag (no `master`/`HEAD` references).
+ - `tests/test_repo_hygiene.py` (6 tests) — guards that `.env` is in
+   `.gitignore`, `.env.example` is committed and contains no
+   real-shape secrets, the pre-commit config has the preflight hook on
+   the pre-push stage with a pinned external repo, `CONTRIBUTING.md`
+   references the preflight workflow and Space smoke checklist, and
+   `CHANGELOG.md` has an `[Unreleased]` section.
+
+ ### Added — performance baselines
+
+ - Regression fixture format gains an optional `performance` block:
+   `repeats`, `max_elapsed_seconds`, `min_pages_per_second`,
+   `always_enforce`. The runner parses each fixture N times and compares
+   the median against the floor — the cold-import outlier on the first
+   run is stripped automatically (see the sketch after this section).
+ - Enforcement is opt-in via `ZSGDP_REGRESSION_PERF=1`; per-fixture
+   override via `always_enforce: true`. Floors are intended as
+   catastrophic-regression guards, not tight perf bars.
+ - Seed fixture `markdown_basic` ships with a 2.0s / 0.5pps floor
+   (~80x slack against measured ~6ms median) so it exercises the path
+   without flaking on slow CI.
+ - 5 new unit tests for the perf evaluator: max-elapsed and
+   min-pps trip correctly, median strips cold outliers, env-var gating
+   honours `always_enforce`.
+
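+ A minimal sketch of the floor check (helper name and exact stripping
+ rule are illustrative; the keys and the median-vs-floor comparison come
+ from the fixture format above):
+
+ ```python
+ import statistics
+
+
+ def perf_floor_violations(timings_s: list[float], pages: int, spec: dict) -> list[str]:
+     """Compare the median timing against the fixture's perf floors.
+
+     timings_s holds one elapsed time per repeat; the first repeat is
+     dropped as the cold-import outlier when there are enough samples.
+     """
+     warm = timings_s[1:] if len(timings_s) > 1 else timings_s
+     median = statistics.median(warm)
+     problems = []
+     if "max_elapsed_seconds" in spec and median > spec["max_elapsed_seconds"]:
+         problems.append(f"median {median:.3f}s exceeds {spec['max_elapsed_seconds']}s")
+     if "min_pages_per_second" in spec and pages / median < spec["min_pages_per_second"]:
+         problems.append(f"{pages / median:.2f} pps below {spec['min_pages_per_second']} pps")
+     return problems
+ ```
+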
+ ### Added — preflight + secrets
+
+ - Preflight runner: `zsgdp preflight` CLI subcommand and `make preflight`
+   target. Chains `unittest discover`, regression fixtures, `space-check`,
+   and `parsers` registry sanity. `--benchmark` adds an end-to-end smoke
+   against the regression fixtures. Each step's output is suppressed on
+   success and surfaced on failure; a one-line summary is always printed.
+ - `Makefile` with targets `test`, `regression`, `space-check`, `parsers`,
+   `preflight`, `preflight-full`, `benchmark`, `clean`.
+ - `.env` loading via `zsgdp.config.load_env_file()`. Read at CLI start
+   and `app.py` import; pre-set environment variables always win. Never
+   overrides Space-side secrets. `.env.example` shipped as the template.
+ - `.env`/`.env.*` added to `.gitignore` (`.env.example` whitelisted).
+ - `zsgdp.config.hf_token()` resolves `HF_TOKEN`,
+   `HUGGING_FACE_HUB_TOKEN`, and `HUGGINGFACE_TOKEN` in priority order
+   (see the sketch after this section).
+
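+ A minimal sketch of that resolution order (the three variable names and
+ their priority come from the entry above; the implementation itself is
+ illustrative):
+
+ ```python
+ import os
+
+
+ def hf_token() -> str | None:
+     # First match wins: HF_TOKEN outranks the two legacy spellings.
+     for key in ("HF_TOKEN", "HUGGING_FACE_HUB_TOKEN", "HUGGINGFACE_TOKEN"):
+         value = os.environ.get(key)
+         if value:
+             return value
+     return None
+ ```
+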
+ ### Added — structured logging
+
+ - `zsgdp.logging_config` with idempotent `configure_logging()`. Default
+   level WARNING; opt in via `ZSGDP_LOG_LEVEL`. Optional one-line JSON
+   records via `ZSGDP_LOG_JSON=1`; structured `extra={...}` fields are
+   promoted to top-level keys for HF Spaces logs / Loki / Datadog.
+ - Wired into the pipeline (`parse_start`, `parser_candidate`,
+   `parser_failed`, `parse_end`), repair controller (`repair_iteration`),
+   GPU worker (`gpu_task_executed`, `gpu_task_blocked`), CLI, and
+   `app.py`. The app auto-enables JSON mode when `SPACE_ID` is set.
+
+ ### Added — deployment-readiness pass
+
+ - Pinned upper bounds on all `requirements.txt` and `pyproject.toml`
+   dependencies. Added explicit `embedding` and `gpu_repair` extras so the
+   optional sentence-transformers / transformers stacks can be installed
+   without dragging the whole `spaces` extra in.
+ - Abuse / cost guards in the Gradio Space entrypoint (`app.py`):
+   `MAX_UPLOAD_BYTES` (50 MB default) and `MAX_PAGE_COUNT` (200 default),
+   both overridable via `ZSGDP_MAX_*` env vars. Oversized uploads are
+   rejected with a clear UI error before parsing starts.
+ - `SCHEMA_VERSION` constant and `ParsedDocument.schema_version` field.
+   Surfaced into the artifact manifest as
+   `parsed_document_schema_version` alongside the existing manifest
+   `schema_version`. The validation report echoes both so consumers can gate.
+ - Regression fixture format under `tests/regression/`: a
+   `*.expected.json` tolerance spec paired with an input document. The
+   runner auto-discovers fixtures and asserts on tolerances (counts, score,
+   markdown contains/excludes, repair/disagreement rate ranges). One seed
+   fixture shipped (`markdown_basic`).
+
+ ### Added — eval surface
+
+ - Per-parser GT-comparison metrics within a single merged run
+   (`zsgdp/benchmarks/per_parser_metrics.py`). Reads pre-merge candidate
+   snapshots from `parsed.provenance.candidates` and computes layout F1 /
+   table structure / formula CER per parser against the same GT.
+   Surfaced as `per_parser_metrics.csv` and the per-doc field
+   `per_parser_metrics`.
+ - Per-parser cross-doc leaderboard rollup
+   (`per_parser_gt_leaderboard.csv`) with truth-aware filtering: a metric
+   contributes to a parser's mean only when that parser was actually
+   evaluated against truths for that metric on that document.
+ - Cross-dataset comparison (`zsgdp/benchmarks/cross_dataset.py`) with a
+   `combine-benchmarks` CLI subcommand. Combines multiple
+   `results.json` summaries into `dataset_summary.csv` and a
+   parser-vs-dataset matrix. Missing metrics surface as `None` rather
+   than 0.0 so callers can distinguish absent from true-zero.
+ - Embedding-based retriever (`zsgdp/benchmarks/embedding_retriever.py`)
+   satisfying the `Retriever` protocol. Defaults to lexical (model-free,
+   CI-safe); opt in via `benchmarks.retriever.backend=embedding` in
+   config. Lazy-loads `sentence-transformers` on first use; falls back
+   cleanly when unavailable.
+ - Layout F1 against ground-truth bbox annotations
+   (`zsgdp/verify/layout_f1.py`). Class-aware and class-agnostic scores
+   side-by-side, per-category breakdown. DocLayNet COCO and OmniDocBench
+   JSON adapters in `zsgdp/benchmarks/ground_truth.py`.
+ - Table structure similarity (`zsgdp/verify/table_structure.py`):
+   shape similarity × multiset cell-content F1, greedy bipartite
+   matching.
+ - Formula extraction CER (`zsgdp/verify/formula_extraction.py`):
+   Levenshtein-based, normalized for whitespace and `$`/`$$` delimiters
+   (see the sketch after this section).
+ - Retrieval-readiness metrics (`zsgdp/verify/retrieval.py`): recall@k,
+   citation accuracy@k, mean reciprocal rank. Synthetic QA generator
+   (`zsgdp/benchmarks/retrieval.py`) using distinctive sentences.
+ - Parser-disagreement rate
+   (`zsgdp/verify/parser_disagreement.py`): conflict count over parser
+   pair count, from the merger's existing conflict report.
+ - Repair success / regression rates
+   (`zsgdp/verify/repair_success.py`): pre/post issue identity diff;
+   iteration history, score delta, action counts.
+ - Parser contribution counts: which parser's elements survived the
+   merge, surfaced as per-doc and aggregate fractions.
+ - Parser ablation runner (`zsgdp/benchmarks/ablation_runner.py`) with a
+   `benchmark-ablate` CLI subcommand. Runs the benchmark once per parser
+   in isolation plus a merged arm, and emits a comparison CSV.
+ - Three dataset loaders (`zsgdp/benchmarks/datasets.py`):
+   `custom_folder`, `omnidocbench`, `doclaynet`. `DatasetDocument`
+   dataclass; registry pattern for downstream extension.
+
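+ A minimal sketch of the CER computation (normalisation beyond whitespace
+ and `$`/`$$` stripping is illustrative; Levenshtein distance over the
+ reference length is the standard CER definition):
+
+ ```python
+ def _levenshtein(a: str, b: str) -> int:
+     # Classic two-row dynamic programme for edit distance.
+     prev = list(range(len(b) + 1))
+     for i, ca in enumerate(a, 1):
+         cur = [i]
+         for j, cb in enumerate(b, 1):
+             cur.append(min(prev[j] + 1, cur[j - 1] + 1, prev[j - 1] + (ca != cb)))
+         prev = cur
+     return prev[-1]
+
+
+ def formula_cer(predicted: str, reference: str) -> float:
+     # Normalise: drop $ / $$ delimiters and all whitespace before comparing.
+     norm = lambda s: "".join(s.replace("$$", "").replace("$", "").split())
+     p, r = norm(predicted), norm(reference)
+     if not r:
+         # Empty reference: perfect if the prediction is also empty,
+         # else capped at 1.0 (a sketch-level choice).
+         return 0.0 if not p else 1.0
+     return _levenshtein(p, r) / len(r)
+ ```
+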
+ ### Added — pipeline
+
+ - Iterative repair loop in `pipeline.py`: bounded by
+   `repair.max_iterations`, terminates on quality-accepted OR
+   no-changes-this-pass. Per-iteration history lands under
+   `provenance.repair_iterations`.
+ - GPU repair escalation wired into `repair/controller.py`. Plans
+   same-schema GPU tasks for invalid tables, OCR/text coverage issues,
+   reading-order failures, and figure issues, then dispatches via
+   `GPUWorker`. The default is safe (`repair.gpu_escalation=true`,
+   `repair.execute_gpu_escalations=false`); flip the second to invoke
+   the configured backend.
+ - Per-parser candidate snapshots persisted in
+   `parsed.provenance.candidates` so per-parser GT metrics can be
+   computed without re-parsing.
+ - Real Marker and Unstructured normalizers
+   (`zsgdp/normalize/normalize_marker.py` and
+   `normalize_unstructured.py`) wired through `parsers/external.py`.
+
+ ### Changed
+
+ - `requirements.txt` no longer pins `torch`; the HF Spaces image
+   preinstalls a CUDA-matched build and pinning here would override it.
+ - `--gpu-workers` flag help text clarified — the value is recorded for
+   downstream task-execution accounting, but document parsing uses
+   `--workers`.
+ - The `--dataset` benchmark flag now selects the loader name
+   (default `custom_folder`); `custom`/`folder`/`default` are accepted as
+   aliases. The previous behaviour was a freeform reporting label only.
+ - The embedding-retriever toy hashing test now uses
+   `hashlib.md5`-based stable hashing instead of `builtins.hash()`,
+   fixing per-process flakiness.
+
+ ### Documentation
+
+ - `tests/regression/README.md` documents the fixture format.
+ - `configs/default.yaml` and `configs/docling.yaml` are annotated to
+   explain the new `repair.execute_gpu_escalations` flag and the deliberate
+   Docling+PyMuPDF dual-enable for the disagreement metric.
+
+ ### Test count
+
+ - 181 tests pass (was 4 at the start of the eval-surface work).
+
+ ## [0.1.0] — initial MVP
+
+ - Profiler, page router, parser registry (text, pymupdf, docling, plus
+   shell-out adapters for marker / mineru / olmocr / paddleocr /
+   unstructured).
+ - Canonical schema (`Element`, `TableObject`, `FigureObject`, `Chunk`,
+   `ParsedDocument`, `QualityReport`).
+ - Merger with conflict detection, quality verifier (coverage, reading
+   order, table validity, chunk readiness), deterministic repair
+   controller.
+ - Agentic chunker with fixed-token / recursive-structure / parent-child
+   / page-level / table / figure strategies; semantic / late /
+   vision-guided / proposition stubs.
+ - Artifact manifest with SHA-256 checksums, `validate-artifacts` CLI.
+ - Gradio Spaces entrypoint, `space-check` deployment-readiness CLI.
CONTRIBUTING.md ADDED
@@ -0,0 +1,235 @@
+ # Contributing to zeroshotGPU
+
+ Thanks for working on this. Three things to know up front:
+
+ 1. **Run `make preflight` before pushing.** It's the same suite that runs
+    at pre-push if you have the hooks installed (see below). A green
+    preflight is the local signal that the branch is ready for the
+    [Space smoke checklist](docs/space_smoke.md).
+ 2. **Keep it dependency-light by default.** New runtime dependencies need
+    a corresponding entry in `pyproject.toml` extras and an explicit
+    gate (config flag, lazy import, or feature-detection fallback). The
+    `embedding` extra is the model: opt-in, lazy-imported on first use,
+    raises a clean `RuntimeError` when missing.
+ 3. **Don't change schema shapes silently.** Bump
+    `zsgdp.schema.SCHEMA_VERSION` whenever the on-disk shape of
+    `parsed_document.json`, `chunks.jsonl`, etc. changes. See
+    [Schema versioning](#schema-versioning) below.
+
+ ---
+
+ ## Setup
+
+ ```bash
+ git clone <repo>
+ cd "Document Parser"
+ python3.11 -m venv .venv && source .venv/bin/activate
+ python -m pip install -e ".[pdf,yaml,docling,dev]"
+ ```
+
+ Optional extras:
+
+ - `.[embedding]` — sentence-transformers + transformers for the embedding
+   retriever. Only needed when you set `benchmarks.retriever.backend=embedding`.
+ - `.[gpu_repair]` — transformers for live GPU repair. Only needed when you
+   set `repair.execute_gpu_escalations=true`.
+ - `.[spaces]` — mirrors the root `requirements.txt` so an editable install
+   matches a Space deploy.
+
+ Set up `.env` for local secrets:
+
+ ```bash
+ cp .env.example .env
+ # Fill in HF_TOKEN if you need gated models.
+ ```
+
+ `.env` is gitignored. The CLI and `app.py` load it automatically; pre-set
+ environment variables always win, so a Space's secrets never get
+ overridden by a stray local file.
+
+ ---
+
+ ## Pre-commit / pre-push hooks
+
+ ```bash
+ python -m pip install pre-commit
+ pre-commit install --install-hooks --hook-type pre-commit --hook-type pre-push
+ ```
+
+ Two stages:
+
+ - **pre-commit** — fast static checks: trailing whitespace, end-of-file
+   newline, JSON/YAML syntax, large-file guard, merge-conflict markers.
+   Runs on every `git commit`.
+ - **pre-push** — runs `python -m zsgdp.cli preflight`. Same as
+   `make preflight`. Failing this blocks the push.
+
+ Skip on a specific commit with `git commit --no-verify` if you genuinely
+ need to (e.g. WIP). Skip the pre-push gate with `git push --no-verify`,
+ but only if you have a separately verified preflight run.
+
+ ---
+
+ ## Running tests
+
+ ```bash
+ make test            # full unittest discover
+ make regression      # snapshot fixture suite
+ make preflight       # everything except the benchmark smoke
+ make preflight-full  # adds an end-to-end benchmark smoke
+ make benchmark       # parses tests/regression/fixtures/ via the CLI
+ ```
+
+ Or directly:
+
+ ```bash
+ python -m unittest discover
+ python -m unittest tests.regression.test_regression
+ python -m zsgdp.cli preflight --root . --benchmark
+ ```
+
+ Performance regressions are gated behind `ZSGDP_REGRESSION_PERF=1`:
+
+ ```bash
+ ZSGDP_REGRESSION_PERF=1 python -m unittest tests.regression.test_regression
+ ```
+
+ See [tests/regression/README.md](tests/regression/README.md) for the
+ fixture format, including the `performance` block.
+
+ ---
+
+ ## Adding a regression fixture
+
+ 1. Drop the input under `tests/regression/fixtures/<name>.input.<ext>`.
+ 2. Parse it once locally and inspect the output:
+    ```bash
+    python -m zsgdp.cli parse --input tests/regression/fixtures/<name>.input.<ext> --output /tmp/sanity
+    ```
+ 3. Hand-write `tests/regression/fixtures/<name>.expected.json` with the
+    tolerances you want to lock down (a sketch follows this list). Prefer
+    ranges over exact counts where reasonable variance exists.
+ 4. Optional: add a `performance` block with `max_elapsed_seconds` set to
+    ~50–100x your local median (a catastrophic-regression guard, not a
+    tight bar).
+ 5. Run `make regression` to confirm the fixture is picked up.
+
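+ A minimal sketch of an `*.expected.json` tolerance spec (the top-level
+ key names are illustrative; the tolerance families, counts, score,
+ markdown contains/excludes, and the `performance` keys come from the
+ changelog and the regression README):
+
+ ```json
+ {
+   "element_count": {"min": 10, "max": 14},
+   "quality_score": {"min": 0.88},
+   "markdown_contains": ["# Heading"],
+   "markdown_excludes": ["\ufffd"],
+   "performance": {
+     "repeats": 5,
+     "max_elapsed_seconds": 2.0,
+     "min_pages_per_second": 0.5,
+     "always_enforce": false
+   }
+ }
+ ```
+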
+ ---
+
+ ## Adding a parser adapter
+
+ 1. Subclass `BaseParser` in `zsgdp/parsers/<name>_parser.py` (or extend
+    `external.py` for shell-out adapters).
+ 2. Set `name` and `supported_file_types`; implement `available()` and
+    `parse(path, profile, config, *, pages=None)` (see the skeleton after
+    this list).
+ 3. Register in `zsgdp/parsers/registry.py`.
+ 4. If the parser produces Markdown, write a normalizer under
+    `zsgdp/normalize/normalize_<name>.py` that returns a `ParseCandidate`
+    via `normalize_markdown_candidate(...)`.
+ 5. Add a config block to `configs/default.yaml` with `enabled: false`
+    plus any CLI flags the adapter needs.
+ 6. Add the dependency to `pyproject.toml` as an optional extra. Don't
+    pin it in the top-level `requirements.txt` unless it's free to
+    install on every Space build.
+
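+ A minimal skeleton under the contract above (the hook names and `parse`
+ signature come from this list; the `BaseParser` import path, return
+ shape, and lazy-import pattern are assumptions):
+
+ ```python
+ # zsgdp/parsers/acme_parser.py — hypothetical adapter for an "acme" library.
+ from zsgdp.parsers.base import BaseParser  # assumed module path
+
+
+ class AcmeParser(BaseParser):
+     name = "acme"
+     supported_file_types = ("pdf",)
+
+     def available(self) -> bool:
+         # Feature-detect instead of importing at module load, so the
+         # registry can list the adapter without the dependency installed.
+         try:
+             import acme  # noqa: F401  (optional extra, hypothetical)
+         except ImportError:
+             return False
+         return True
+
+     def parse(self, path, profile, config, *, pages=None):
+         import acme
+
+         # Parse only the routed pages when given; conversion into the
+         # canonical schema belongs in zsgdp/normalize/normalize_acme.py.
+         return acme.parse(str(path), pages=pages)
+ ```
+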
+ ---
+
+ ## Adding a metric
+
+ Pure metrics live under `zsgdp/verify/`:
+
+ 1. Define inputs as plain dicts/lists (not `ParsedDocument`-keyed) so
+    the same metric works on per-parser candidate snapshots, not just
+    the merged document.
+ 2. Pin definitions in the module docstring — exact denominator,
+    handling of empty inputs, what each return key means.
+ 3. Surface it in `zsgdp/benchmarks/parser_quality.py`:
+    - Add per-document fields to the `doc_record`.
+    - Add aggregated means to the top-level `summary` dict.
+    - Add a per-document CSV writer if it has detail worth its own file.
+ 4. Add tests for: perfect input, no-match input, partial overlap, the
+    vacuous empty/empty case, and a benchmark-integration test that
+    asserts the metric appears in `summary["documents"][0]`. A sketch of
+    the module shape follows this list.
+
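+ A minimal sketch of the module shape those rules describe (the metric
+ and its field names are illustrative):
+
+ ```python
+ # zsgdp/verify/caption_coverage.py — hypothetical metric module.
+ """Fraction of figures that carry a caption.
+
+ Denominator: number of figures. Empty input: coverage 1.0 (vacuously
+ covered) with figure_count 0. Keys: caption_coverage, figure_count.
+ """
+
+
+ def caption_coverage(figures: list[dict]) -> dict:
+     # Plain dicts in, plain dict out, so candidate snapshots work too.
+     if not figures:
+         return {"caption_coverage": 1.0, "figure_count": 0}
+     captioned = sum(1 for f in figures if f.get("caption"))
+     return {"caption_coverage": captioned / len(figures), "figure_count": len(figures)}
+ ```
+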
+ ---
+
+ ## Schema versioning
+
+ `zsgdp.schema.SCHEMA_VERSION` lives in
+ [zsgdp/schema/document.py](zsgdp/schema/document.py). It's surfaced into
+ `artifact_manifest.json` as `parsed_document_schema_version` so a
+ consumer reading old output can gate on it.
+
+ Bump rules:
+
+ - **Additive change** (new optional field with a default) — bump the
+   minor version (1.0 → 1.1).
+ - **Breaking change** (renamed/removed field, semantics changed) — bump
+   the major version (1.0 → 2.0). Update the regression fixtures in the
+   same PR; downstream consumers will need a migration.
+ - **No change** — leave it alone.
+
+ When you bump, add an entry to `CHANGELOG.md` under
+ "### Schema" with the version and what changed.
+
+ ---
+
+ ## Logging
+
+ Use `from zsgdp.logging_config import get_logger`, then
+ `logger = get_logger(__name__)`. Call `.info`/`.warning`/`.error` with
+ structured `extra={...}` fields rather than f-string-formatted messages
+ where possible — the JSON formatter promotes `extra` keys to top-level
+ fields so the HF Spaces logs page is greppable. A usage sketch follows.
+
+ Default log level is WARNING (CLI summaries are unaffected). Opt in with
+ `ZSGDP_LOG_LEVEL=INFO` and `ZSGDP_LOG_JSON=1` for Space-style output.
+
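+ A minimal usage sketch (the import and the `extra={...}` convention come
+ from this section; the event name and fields are illustrative):
+
+ ```python
+ from zsgdp.logging_config import get_logger
+
+ logger = get_logger(__name__)
+
+ # Event name as the message, details as structured fields. In JSON mode
+ # the formatter promotes doc_id and parser to top-level keys.
+ logger.info("parser_candidate", extra={"doc_id": "doc-001", "parser": "docling"})
+ ```
+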
+ ---
+
+ ## Pull request checklist
+
+ Before opening a PR:
+
+ - [ ] `make preflight` passes locally.
+ - [ ] If you added a metric or an adapter, or changed the schema, you
+       updated `CHANGELOG.md`.
+ - [ ] If you changed parser behavior, you ran `make regression` and any
+       fixture drift is intentional (and the snapshot was regenerated
+       explicitly).
+ - [ ] If your change touches GPU/model code paths, you flagged it for
+       Space-side smoke testing in the PR description (the
+       [smoke checklist](docs/space_smoke.md) covers what to run).
+ - [ ] You did **not** commit `.env` or any secret. The `.gitignore`
+       should catch this; if you suspect a leak, treat the token as
+       compromised and rotate it.
+
+ ---
+
+ ## Architecture quick map
+
+ - `zsgdp/profiling/` — page-level features and labels.
+ - `zsgdp/routing/` — deterministic page → expert mapping.
+ - `zsgdp/parsers/` — adapters; one canonical schema regardless of source.
+ - `zsgdp/normalize/` — convert each parser's output into the schema.
+ - `zsgdp/merge/` — align candidates, dedupe, detect conflicts.
+ - `zsgdp/verify/` — coverage, reading order, table/figure/formula/chunk
+   quality, GT-comparison metrics (layout F1, table structure, formula
+   CER, retrieval recall), parser disagreement and repair success rates.
+ - `zsgdp/repair/` — deterministic header/table fixes plus GPU
+   escalation that dispatches to `gpu/worker.py`.
+ - `zsgdp/chunking/` — agentic planner + structure-aware / parent-child /
+   table / figure / page chunk builders, with semantic / late /
+   vision-guided / proposition deterministic stubs.
+ - `zsgdp/gpu/` — task planning, batching, dry-run worker, transformers
+   and vLLM clients.
+ - `zsgdp/benchmarks/` — dataset loaders, metric runners, ablation,
+   cross-dataset comparison, retrieval (lexical + embedding).
+ - `zsgdp/cli.py` — single entry point exposing all of the above.
+ - `app.py` — Gradio Space front-end.
+
+ The full spec lives in
+ [zero_shot_gpu_document_parser_project_spec.md](zero_shot_gpu_document_parser_project_spec.md).
+ The 2000-line read isn't required to contribute, but §10 (schema)
+ and §17 (chunking ladder) are worth skimming if you're touching those
+ modules.
Makefile ADDED
@@ -0,0 +1,49 @@
+ PYTHON ?= python3.11
+
+ .PHONY: help test regression space-check parsers preflight preflight-full benchmark space-smoke space-smoke-strict clean
+
+ help:
+ 	@echo "Targets:"
+ 	@echo "  test               - run the full unittest discover suite"
+ 	@echo "  regression         - run the regression fixture snapshot suite"
+ 	@echo "  space-check        - run the HF Space readiness check"
+ 	@echo "  parsers            - print the parser registry"
+ 	@echo "  preflight          - run test + regression + space-check + parsers"
+ 	@echo "  preflight-full     - preflight + an end-to-end benchmark smoke"
+ 	@echo "  benchmark          - run zsgdp benchmark against tests/regression/fixtures"
+ 	@echo "  space-smoke        - run docs/space_smoke.md smokes (deps-permitting)"
+ 	@echo "  space-smoke-strict - same, but treat skipped smokes as failures"
+ 	@echo "  clean              - remove __pycache__ and benchmark output"
+
+ test:
+ 	$(PYTHON) -m unittest discover
+
+ regression:
+ 	$(PYTHON) -m unittest tests.regression.test_regression -v
+
+ space-check:
+ 	$(PYTHON) -m zsgdp.cli space-check --root .
+
+ parsers:
+ 	$(PYTHON) -m zsgdp.cli parsers
+
+ preflight:
+ 	$(PYTHON) -m zsgdp.cli preflight --root .
+
+ preflight-full:
+ 	$(PYTHON) -m zsgdp.cli preflight --root . --benchmark
+
+ benchmark:
+ 	$(PYTHON) -m zsgdp.cli benchmark \
+ 		--input tests/regression/fixtures \
+ 		--output out/preflight_benchmark
+
+ space-smoke:
+ 	$(PYTHON) -m scripts.run_space_smoke --output out/space_smoke_report.json
+
+ space-smoke-strict:
+ 	$(PYTHON) -m scripts.run_space_smoke --strict --output out/space_smoke_report.json
+
+ clean:
+ 	find . -type d -name __pycache__ -exec rm -rf {} + 2>/dev/null || true
+ 	rm -rf out/preflight_benchmark
README.md ADDED
@@ -0,0 +1,287 @@
+ ---
+ title: zeroshotGPU
+ sdk: gradio
+ app_file: app.py
+ python_version: 3.11
+ suggested_hardware: l4x1
+ short_description: Agentic zero-shot document parser with parser metrics and chunk artifacts.
+ ---
+
+ # Zero-Shot GPU Document Parser
+
+ A self-hosted parsing control plane that profiles documents, routes pages to
+ parser experts, normalizes outputs, verifies quality with GT-comparison
+ metrics, repairs weak regions through a bounded verify/repair loop (with
+ optional GPU escalation), and emits auditable parsed-document artifacts plus
+ strategy-aware chunks. Implements the project described in
+ [`zero_shot_gpu_document_parser_project_spec.md`](zero_shot_gpu_document_parser_project_spec.md).
+
+ The codebase is intentionally dependency-light by default. Text and Markdown
+ work with the standard library; PyMuPDF, Docling, Marker, MinerU, olmOCR,
+ PaddleOCR, and Unstructured plug in via optional extras. Live GPU repair
+ (Qwen2.5-VL-3B) and the embedding retriever (jina-embeddings-v3) are gated
+ behind explicit config flags so a fresh clone never silently downloads
+ multi-gigabyte weights.
+
+ ---
+
+ ## Install
+
+ For the local MVP (text + PyMuPDF + Docling):
+
+ ```bash
+ python -m pip install -e ".[pdf,yaml,docling,dev]"
+ ```
+
+ Optional extras:
+
+ | Extra         | Adds                                             | Required for                                   |
+ |---------------|--------------------------------------------------|------------------------------------------------|
+ | `embedding`   | `sentence-transformers`, `transformers`          | `benchmarks.retriever.backend=embedding`       |
+ | `gpu_repair`  | `transformers`                                   | `repair.execute_gpu_escalations=true`          |
+ | `spaces`      | mirrors `requirements.txt` for HF Spaces parity  | running `app.py` locally as a Space simulant   |
+
+ External parser CLIs (Marker, MinerU, olmOCR, PaddleOCR) install separately;
+ configure each via `parsers.<name>.command`, `output_args`, and `extra_args`
+ in your YAML config.
+
+ Secrets:
+
+ ```bash
+ cp .env.example .env
+ # Set HF_TOKEN if you'll use gated models (jina-embeddings-v3, private repos).
+ ```
+
+ `.env` is gitignored. The CLI and `app.py` load it on startup; pre-set
+ environment variables (e.g. Space-side secrets) always win.
+
+ ---
+
+ ## Quick start
+
+ ### Parse one document or a folder
+
+ ```bash
+ python -m zsgdp.cli parse --input ./docs/sample.md --output ./out/sample
+ python -m zsgdp.cli parse-folder --input ./docs --output ./parsed --workers 4
+ python -m zsgdp.cli parse --input ./docs/report.pdf --output ./out/report --config configs/docling.yaml
+ ```
+
+ Each parse writes a full artifact bundle. `parsed_document.json` is the
+ canonical record; `chunks.jsonl` is the retrieval-ready output;
+ `quality_report.json` carries every metric the verifier computed.
+
+ ### Run a benchmark
+
+ ```bash
+ # Custom corpus, no GT — runs every metric that doesn't need labels:
+ python -m zsgdp.cli benchmark --input ./docs --output ./bench
+
+ # Labelled datasets — adds layout F1 / table structure / formula CER:
+ python -m zsgdp.cli benchmark --input ./omnidocbench --dataset omnidocbench --output ./bench/omni
+ python -m zsgdp.cli benchmark --input ./doclaynet --dataset doclaynet --output ./bench/doclay
+ ```
+
+ ### Compare parsers (ablation)
+
+ ```bash
+ python -m zsgdp.cli benchmark-ablate \
+     --input ./docs --output ./bench/ablation \
+     --parser docling --parser pymupdf --parser text
+ ```
+
+ Runs the benchmark once per parser plus a merged arm; emits
+ `ablation_comparison.csv`.
+
+ ### Compare across datasets
+
+ ```bash
+ python -m zsgdp.cli combine-benchmarks \
+     --input ./bench/omni --label omnidocbench \
+     --input ./bench/doclay --label doclaynet \
+     --output ./bench/cross
+ ```
+
+ Emits `dataset_summary.csv` and `parser_matrix.csv` (parser × dataset).
+
+ ### Before pushing to a Space — preflight
+
+ ```bash
+ make preflight       # unit + regression + space-check + parsers (~10s)
+ make preflight-full  # ...plus an end-to-end benchmark smoke
+ ```
+
+ A green preflight is the local signal that the branch is ready for the
+ Space. Pre-commit and pre-push hooks (see [CONTRIBUTING.md](CONTRIBUTING.md))
+ make this automatic on every `git push`.
+
+ ### On the Space — smoke validation
+
+ Once deployed, exercise the deferred GPU/model paths:
+
+ ```bash
+ make space-smoke     # runs whichever of the 5 smokes have their deps
+ python -m scripts.run_space_smoke --strict --output ./space_smoke.json
+ ```
+
+ See [docs/space_smoke.md](docs/space_smoke.md) for the manual fallback
+ procedure (real PDF uploads, full Marker parses) and per-smoke
+ acceptance criteria.
+
+ ---
+
+ ## Opt-ins
+
+ ### Embedding retriever
+
+ The default retriever is lexical TF-IDF (zero deps). To use a real embedder:
+
+ ```yaml
+ # configs/myrun.yaml
+ benchmarks:
+   retriever:
+     backend: embedding
+     model_id: jinaai/jina-embeddings-v3  # or any sentence-transformers model
+     task: retrieval.passage
+ ```
+
+ ```bash
+ python -m pip install -e ".[embedding]"
+ python -m zsgdp.cli benchmark --input ./docs --output ./bench --config configs/myrun.yaml
+ ```
+
+ The first call lazy-loads the model; subsequent calls reuse it in-process.
+ Set `HF_TOKEN` in `.env` for gated models.
+
+ ### Live GPU repair
+
+ The repair controller plans GPU tasks for verification failures (invalid
+ tables, OCR coverage gaps, reading-order issues, missing figure captions).
+ By default these are dry-run only. To execute:
+
+ ```yaml
+ # configs/myrun.yaml
+ repair:
+   gpu_escalation: true
+   execute_gpu_escalations: true  # invokes the configured backend
+ gpu:
+   backend: transformers  # or "vllm" for OpenAI-compat
+   models:
+     table:
+       model_id: Qwen/Qwen2.5-VL-3B-Instruct
+ ```
+
+ Each executed task writes its output back into the merged document with a
+ `gpu_repair_task_id` provenance field.
+
+ ---
+
+ ## Outputs
+
+ Every parse writes:
+
+ - `parsed_document.json` — canonical record (carries `schema_version`).
+ - `document.md` — human-readable Markdown reconstruction.
+ - `elements.jsonl` / `tables.jsonl` / `figures.jsonl` / `chunks.jsonl` — JSONL streams (a consumption sketch follows this list).
+ - `chunking_plan.json` — strategy ladder + per-strategy metadata.
+ - `parser_metrics.json` — per-parser candidate-level stats.
+ - `quality_report.json` — every verifier metric (text coverage, reading order, table validity, parser disagreement, repair resolution/regression rates, GT-comparison metrics when applicable).
+ - `routing_report.json` — page → parser routing decisions.
+ - `profile.json` — document profiler output.
+ - `gpu_runtime.json` — detected GPU/device state at parse time.
+ - `gpu_tasks.jsonl` (when model-backed work is planned) and `gpu_task_report.json` (preflight validation).
+ - `conflict_report.json` (when multiple parsers ran).
+ - `artifact_manifest.json` with SHA-256 checksums and the parsed-document schema version.
+ - `assets/pages/*.png`, `assets/tables/*.png`, `assets/figures/*.png` — rendered PDF pages and region crops.
+
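+ A minimal consumption sketch for `chunks.jsonl` (the JSONL framing comes
+ from this list; the per-chunk field names are assumptions, not the
+ committed schema):
+
+ ```python
+ import json
+ from pathlib import Path
+
+ # One JSON object per line — stream it rather than loading one blob.
+ for line in Path("out/sample/chunks.jsonl").read_text().splitlines():
+     chunk = json.loads(line)
+     # "chunk_id" and "text" are illustrative field names.
+     print(chunk.get("chunk_id"), len(chunk.get("text", "")))
+ ```
+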
+ Benchmark runs additionally write:
+
+ - `results.json` — full structured summary including aggregate means.
+ - `leaderboard.csv` and `per_parser_gt_leaderboard.csv` — parser leaderboards (without and with GT comparison).
+ - `per_parser_metrics.csv` — per-document, per-parser GT-comparison breakdown.
+ - `layout_runs.csv`, `table_structure_runs.csv`, `formula_runs.csv`, `retrieval_runs.csv`, `repair_runs.csv` — per-document detail per metric family.
+ - `parser_runs.csv`, `chunk_runs.csv`, `structure_runs.csv`, `chunk_quality.csv`, `throughput_runs.csv`, `ablations.json` — additional detail.
+
+ `benchmark-ablate` adds `ablation_comparison.csv`. `combine-benchmarks`
+ adds `dataset_summary.csv`, `parser_matrix.csv`, and
+ `cross_dataset_comparison.json`.
+
+ ---
+
+ ## Architecture map
+
+ | Module | Responsibility |
+ |-------------------------|-------------------------------------------------------------------------|
+ | `zsgdp/profiling/` | Cheap per-page features (scanned-score, table density, columns, etc.) |
+ | `zsgdp/routing/` | Deterministic page → parser-expert decisions with budget |
+ | `zsgdp/parsers/` | Adapters; one canonical schema regardless of source |
+ | `zsgdp/normalize/` | Convert each parser's output into the schema |
+ | `zsgdp/merge/` | Align candidates, dedupe, detect conflicts |
+ | `zsgdp/verify/` | Coverage / reading order / table / figure / formula / chunk readiness, plus GT-comparison: layout F1, table structure, formula CER, retrieval recall, parser disagreement, repair success |
+ | `zsgdp/repair/` | Deterministic header/table fixes plus GPU escalation through `gpu/worker.py` |
+ | `zsgdp/chunking/` | Agentic planner + structure / parent-child / table / figure / page chunkers, with semantic / late / vision / proposition deterministic stubs |
+ | `zsgdp/gpu/` | Task planning, batching, dry-run worker, transformers + vLLM clients |
+ | `zsgdp/benchmarks/` | Dataset loaders, metric runners, ablation, cross-dataset, retrieval |
+ | `zsgdp/cli.py` | All entry points |
+ | `app.py` | Gradio Space UI |
+
+ The full spec is in [`zero_shot_gpu_document_parser_project_spec.md`](zero_shot_gpu_document_parser_project_spec.md). §10 (schema) and §17 (chunking ladder) are the most useful sections to skim before touching those modules.
+
+ ---
+
+ ## Production benchmark numbers
+
+ Once the Space deploy is live and `make space-smoke` is green, run the
+ benchmark against your representative corpus and paste the headline
+ metrics here. Spec §29 success criteria, for reference:
+
+ - **MVP:** the full agentic loop improves table QA by ≥20% over the best single parser; agentic chunking improves citation accuracy by ≥10% over the recursive baseline.
+ - **Production-style (HR / financial reports / etc.):** retrieval recall@5 ≥ 90%, citation accuracy ≥ 90%, table QA exactness ≥ 85%, manual review rate ≤ 10%, parser blocking-failure rate ≤ 5%.
+
+ | Metric                          | Dataset / Corpus | Value  | Date   | Run    |
+ |---------------------------------|------------------|--------|--------|--------|
+ | `mean_quality_score`            | _todo_           | _todo_ | _todo_ | _todo_ |
+ | `mean_layout_f1`                | _todo_           | _todo_ | _todo_ | _todo_ |
+ | `mean_table_structure_score`    | _todo_           | _todo_ | _todo_ | _todo_ |
+ | `mean_formula_cer`              | _todo_           | _todo_ | _todo_ | _todo_ |
+ | `mean_retrieval_recall_at_5`    | _todo_           | _todo_ | _todo_ | _todo_ |
+ | `mean_parser_disagreement_rate` | _todo_           | _todo_ | _todo_ | _todo_ |
+ | `mean_repair_resolution_rate`   | _todo_           | _todo_ | _todo_ | _todo_ |
+ | `mean_pages_per_second`         | _todo_           | _todo_ | _todo_ | _todo_ |
+
+ Source rows are individual `results.json` files under each Space-side
+ benchmark output; commit the directory or a redacted summary so the
+ numbers above are reproducible.
+
+ ---
+
+ ## Deployment
+
+ Target: Hugging Face Spaces, hardware `l4x1`, GPU/model target
+ `zeroshotGPU`.
+
+ Pre-deploy gate:
+
+ 1. `make preflight` (local).
+ 2. `make preflight-full` (local, with the end-to-end benchmark smoke).
+ 3. Duplicate the Space, then set `HF_TOKEN` and any other secrets under **Variables and secrets**.
+ 4. Push.
+ 5. `make space-smoke` from the Space's JupyterLab terminal.
+ 6. Inspect [docs/space_smoke.md](docs/space_smoke.md) Smoke 3 (live GPU repair) manually if the runner-level wiring smoke passed but you want full model-invocation validation.
+ 7. Run `python -m zsgdp.cli benchmark` against your representative corpus and update the table above.
+
+ The Space defaults to `configs/docling.yaml` (Docling + PyMuPDF
+ co-enabled so the parser-disagreement rate has signal). Override via
+ `ZSGDP_CONFIG_PATH` in Space variables for custom configs.
+
+ ---
+
+ ## Contributing
+
+ See [CONTRIBUTING.md](CONTRIBUTING.md) for setup, hooks, test layout,
+ fixture format, parser/metric/schema-bump procedures, and the PR checklist.
+
+ For changes touching the on-disk schema, bump `zsgdp.schema.SCHEMA_VERSION`
+ and add an entry under `### Schema` in [CHANGELOG.md](CHANGELOG.md). The
+ artifact manifest surfaces the version under
+ `parsed_document_schema_version` so downstream consumers can gate on it
+ (a gating sketch follows).
+
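+ A minimal sketch of that consumer-side gate (the manifest key name comes
+ from this README; the major-version comparison rule is illustrative):
+
+ ```python
+ import json
+ from pathlib import Path
+
+ EXPECTED_MAJOR = 1  # the major version this consumer was written against
+
+ manifest = json.loads(Path("out/sample/artifact_manifest.json").read_text())
+ version = str(manifest["parsed_document_schema_version"])
+ if int(version.split(".")[0]) != EXPECTED_MAJOR:
+     raise RuntimeError(f"unsupported parsed_document schema {version}")
+ ```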
app.py ADDED
@@ -0,0 +1,251 @@
+ """Hugging Face Spaces entrypoint for zeroshotGPU."""
+
+ from __future__ import annotations
+
+ import os
+ import shutil
+ import tempfile
+ from pathlib import Path
+ from typing import Any
+
+ try:
+     import gradio as gr
+ except ImportError as exc:  # pragma: no cover - only used when launching the Space UI.
+     raise RuntimeError("Gradio is required for the Spaces UI. Install with `python -m pip install -r requirements.txt`.") from exc
+
+ from zsgdp.artifacts import validate_artifact_manifest
+ from zsgdp.config import load_config, load_env_file
+ from zsgdp.gpu import collect_gpu_runtime_status
+ from zsgdp.logging_config import configure_logging, get_logger
+ from zsgdp.pipeline import parse_document
+ from zsgdp.profiling import profile_document
+
+ # Load .env first so any keys it sets (HF_TOKEN, ZSGDP_LOG_LEVEL, etc.) are
+ # visible before we read environment defaults below. Pre-set Space variables
+ # always win — load_env_file does not override existing env entries.
+ load_env_file()
+
+ # Default to JSON logs on the Space so the HF Spaces logs page is greppable.
+ # Override locally with `ZSGDP_LOG_JSON=0` for human-readable text output.
+ os.environ.setdefault("ZSGDP_LOG_LEVEL", "INFO")
+ os.environ.setdefault("ZSGDP_LOG_JSON", "1" if os.environ.get("SPACE_ID") else "0")
+ configure_logging()
+ _logger = get_logger(__name__)
+
+ ROOT = Path(__file__).resolve().parent
+ DOCLING_CONFIG = ROOT / "configs" / "docling.yaml"
+
+ # Abuse guards. Override at deployment time via env vars to relax for trusted
+ # Spaces or tighten further for public ones.
+ MAX_UPLOAD_BYTES = int(os.environ.get("ZSGDP_MAX_UPLOAD_BYTES", str(50 * 1024 * 1024)))  # 50 MB
+ MAX_PAGE_COUNT = int(os.environ.get("ZSGDP_MAX_PAGE_COUNT", "200"))
+
+
+ class UploadRejected(Exception):
+     """Raised when an upload exceeds an abuse-guard limit."""
+
+
+ def _validate_upload(path: Path) -> None:
+     """Reject oversized uploads or PDFs with too many pages before parsing.
+
+     Cheap to compute (file stat + profiler page count) and avoids spending
+     GPU/CPU minutes on inputs the Space wasn't sized for.
+     """
+
+     if not path.exists():
+         raise UploadRejected("Uploaded file is missing on disk.")
+     size = path.stat().st_size
+     if size > MAX_UPLOAD_BYTES:
+         raise UploadRejected(
+             f"Upload is {size / 1024 / 1024:.1f} MB; the Space limit is "
+             f"{MAX_UPLOAD_BYTES / 1024 / 1024:.0f} MB. Set ZSGDP_MAX_UPLOAD_BYTES to override."
+         )
+     try:
+         profile = profile_document(path)
+     except Exception:  # pragma: no cover - profiler is robust; this is belt-and-braces.
+         return
+     if profile.page_count > MAX_PAGE_COUNT:
+         raise UploadRejected(
+             f"Document has {profile.page_count} pages; the Space limit is "
+             f"{MAX_PAGE_COUNT}. Set ZSGDP_MAX_PAGE_COUNT to override."
+         )
+
+
+ # Top-level artifact files surfaced as individual downloads. Nested
+ # directories like assets/ stay bundled in the zip only — they can be
+ # large for multi-page PDFs and would clutter the per-artifact list.
+ _INDIVIDUAL_ARTIFACT_NAMES = (
+     "parsed_document.json",
+     "document.md",
+     "elements.jsonl",
+     "tables.jsonl",
+     "figures.jsonl",
+     "chunks.jsonl",
+     "chunking_plan.json",
+     "parser_metrics.json",
+     "quality_report.json",
+     "routing_report.json",
+     "profile.json",
+     "gpu_runtime.json",
+     "gpu_tasks.jsonl",
+     "gpu_task_report.json",
+     "artifact_manifest.json",
+     "conflict_report.json",
+ )
+
+
+ def _collect_artifact_files(output_dir: Path) -> list[str]:
+     """Return absolute paths for the top-level artifacts the Space surfaces.
+
+     Order matches _INDIVIDUAL_ARTIFACT_NAMES so the UI listing is stable.
+     Missing files are silently skipped (different parse runs emit different
+     subsets — e.g. conflict_report.json only when multiple parsers ran).
+     """
+
+     paths: list[str] = []
+     for name in _INDIVIDUAL_ARTIFACT_NAMES:
+         candidate = output_dir / name
+         if candidate.exists():
+             paths.append(str(candidate))
+     return paths
+
+
+ def _empty_outputs(reason: str, source: Path | None, *, rejected: bool, runtime: dict) -> tuple:
+     """Return-shape used for every error path. Centralised so the tuple width
+     can't drift between the success path and the four error paths."""
+
+     summary: dict[str, Any] = {"error": reason}
+     if source is not None:
+         summary["source"] = str(source)
+     if rejected:
+         summary["rejected"] = True
+     return ("", summary, {}, {}, {}, runtime, [], {}, {}, None, [])
+
+
+ def parse_uploaded_document(file_obj: Any, pipeline_mode: str):
+     if file_obj is None:
+         return _empty_outputs("Upload a document first.", None, rejected=False, runtime={})
+
+     source = Path(file_obj.name)
+     work_dir = Path(tempfile.mkdtemp(prefix="zeroshotgpu_"))
+     output_dir = work_dir / "parsed"
+     config_path = _config_path_for_mode(pipeline_mode)
+
+     try:
+         _validate_upload(source)
+     except UploadRejected as exc:
+         _logger.warning(
+             "space_upload_rejected",
+             extra={"source_path": str(source), "reason": str(exc)},
+         )
+         runtime = runtime_status_for_mode(pipeline_mode)
+         return _empty_outputs(str(exc), source, rejected=True, runtime=runtime)
+
+     try:
+         parsed = parse_document(source, output_dir, config_path=config_path)
+     except Exception as exc:  # pragma: no cover - surfaced in the Space UI.
+         runtime = runtime_status_for_mode(pipeline_mode)
+         return _empty_outputs(str(exc), source, rejected=False, runtime=runtime)
+
+     artifact_validation = validate_artifact_manifest(output_dir)
+     archive_path = shutil.make_archive(str(output_dir), "zip", output_dir)
+     individual_files = _collect_artifact_files(output_dir)
+     runtime = parsed.provenance.get("gpu_runtime", {})
+     summary = {
+         "doc_id": parsed.doc_id,
+         "file_type": parsed.file_type,
+         "elements": len(parsed.elements),
+         "tables": len(parsed.tables),
+         "figures": len(parsed.figures),
+         "chunks": len(parsed.chunks),
+         "quality_score": parsed.quality_report.score,
+         "blocking": parsed.quality_report.has_blocking_failures,
+         "deployment": parsed.provenance.get("config_deployment", {}),
+         "runtime_device": runtime.get("device"),
+         "running_on_huggingface_space": runtime.get("running_on_huggingface_space"),
+         "artifact_manifest_valid": artifact_validation.get("valid"),
+         "artifact_count": artifact_validation.get("artifact_count"),
+         "artifact_checked_count": artifact_validation.get("checked_count"),
+         "individual_artifact_count": len(individual_files),
+     }
+     return (
+         parsed.to_markdown(),
+         summary,
+         parsed.quality_report.to_dict(),
+         parsed.provenance.get("parser_metrics", {}),
+         parsed.provenance.get("chunking", {}),
+         runtime,
+         parsed.provenance.get("gpu_tasks", []),
+         parsed.provenance.get("gpu_task_report", {}),
+         artifact_validation,
+         archive_path,
+         individual_files,
+     )
+
+
+ def _config_path_for_mode(pipeline_mode: str) -> Path | None:
+     env_config = os.environ.get("ZSGDP_CONFIG_PATH")
+     if env_config:
+         return Path(env_config)
+     if pipeline_mode == "Docling + PyMuPDF" and DOCLING_CONFIG.exists():
+         return DOCLING_CONFIG
+     return None
+
+
+ def runtime_status_for_mode(pipeline_mode: str) -> dict:
+     return collect_gpu_runtime_status(load_config(_config_path_for_mode(pipeline_mode))).to_dict()
+
+
+ with gr.Blocks(title="zeroshotGPU") as demo:
+     gr.Markdown("# zeroshotGPU")
+     with gr.Row():
+         upload = gr.File(label="Document", file_types=[".pdf", ".md", ".txt", ".html"])
+         with gr.Column():
+             pipeline = gr.Dropdown(
+                 choices=["Docling + PyMuPDF", "Default lightweight"],
+                 value="Docling + PyMuPDF",
+                 label="Pipeline",
+             )
+             parse_button = gr.Button("Parse", variant="primary")
+             archive = gr.File(label="Artifacts (zip)")
+     with gr.Tabs():
+         with gr.Tab("Markdown"):
+             markdown = gr.Markdown(label="Canonical Markdown")
+         with gr.Tab("Run"):
+             summary = gr.JSON(label="Summary")
+             quality = gr.JSON(label="Quality Report")
+             parser_metrics = gr.JSON(label="Parser Metrics")
+             chunking = gr.JSON(label="Chunking Plan")
+             artifact_validation = gr.JSON(label="Artifact Manifest Validation")
+         with gr.Tab("Artifacts"):
+             gr.Markdown(
+                 "Each top-level artifact is downloadable individually. "
+                 "Nested assets (page renders, table/figure crops) stay bundled "
+                 "in the zip above."
+             )
+             individual_artifacts = gr.Files(label="Individual artifacts")
+         with gr.Tab("Runtime"):
+             runtime = gr.JSON(label="GPU Runtime", value=runtime_status_for_mode("Docling + PyMuPDF"))
+             gpu_tasks = gr.JSON(label="Planned GPU Tasks")
+             gpu_task_report = gr.JSON(label="GPU Task Preflight")
+     parse_button.click(
+         parse_uploaded_document,
+         inputs=[upload, pipeline],
+         outputs=[
+             markdown,
+             summary,
+             quality,
+             parser_metrics,
+             chunking,
+             runtime,
+             gpu_tasks,
+             gpu_task_report,
+             artifact_validation,
+             archive,
+             individual_artifacts,
+         ],
+     )
+
+
+ if __name__ == "__main__":
+     demo.launch()
configs/default.yaml ADDED
@@ -0,0 +1,159 @@
1
+ parsers:
2
+ text:
3
+ enabled: true
4
+ pymupdf:
5
+ enabled: true
6
+ docling:
7
+ enabled: false
8
+ do_ocr: false
9
+ do_table_structure: false
10
+ force_backend_text: true
11
+ marker:
12
+ enabled: false
13
+ command: null
14
+ timeout_seconds: 300
15
+ output_args: "--output_dir {output_dir} --output_format markdown"
16
+ extra_args: ""
17
+ mineru:
18
+ enabled: false
19
+ command: null
20
+ timeout_seconds: 600
21
+ output_args: "--output_dir {output_dir}"
22
+ extra_args: ""
23
+ olmocr:
24
+ enabled: false
25
+ command: null
26
+ timeout_seconds: 600
27
+ output_args: "--output_dir {output_dir}"
28
+ extra_args: ""
29
+ paddleocr:
30
+ enabled: false
31
+ command: null
32
+ timeout_seconds: 600
33
+ output_args: "--output_dir {output_dir}"
34
+ extra_args: ""
35
+ unstructured:
36
+ enabled: false
37
+
38
+ routing:
39
+ run_multiple_on_hard_pages: true
40
+ max_primary_parsers_per_page: 2
41
+ hard_page_threshold: 0.65
42
+ scanned_text_threshold: 0.40
43
+ table_density_threshold: 0.25
44
+ formula_density_threshold: 0.15
45
+ figure_density_threshold: 0.20
46
+
47
+ repair:
48
+ enabled: true
49
+ max_iterations: 3
50
+ # Plan and dry-run GPU escalations for verification failures.
51
+ gpu_escalation: true
52
+ # Actually invoke the configured GPU/VLM backend on flagged regions.
53
+ # Defaults to false to avoid surprise model downloads on local runs;
54
+ # set true on the Space once GPU models are warm.
55
+ execute_gpu_escalations: false
56
+ table_repair: true
57
+ reading_order_repair: true
58
+ figure_repair: true
59
+ ocr_repair: true
60
+
61
+ gpu:
62
+ backend: transformers
63
+ provider: huggingface_spaces
64
+ space_name: zeroshotGPU
65
+ batch_pages: true
66
+ validate_tasks: true
67
+ max_batch_size: 4
68
+ max_gpu_seconds_per_doc: 120
69
+ max_vlm_calls_per_doc: 30
70
+ models:
71
+ vlm:
72
+ model_id: Qwen/Qwen2.5-VL-3B-Instruct
73
+ task: image-text-to-text
74
+ device: auto
75
+ dtype: bfloat16
76
+ max_batch_size: 1
77
+ ocr:
78
+ model_id: Qwen/Qwen2.5-VL-3B-Instruct
79
+ task: document-ocr
80
+ device: auto
81
+ dtype: bfloat16
82
+ max_batch_size: 1
83
+ table:
84
+ model_id: Qwen/Qwen2.5-VL-3B-Instruct
85
+ task: table-repair
86
+ device: auto
87
+ dtype: bfloat16
88
+ max_batch_size: 1
89
+ embedding:
90
+ model_id: jinaai/jina-embeddings-v3
91
+ task: retrieval.passage
92
+ device: auto
93
+ dtype: bfloat16
94
+ max_batch_size: 16
95
+ task_model_roles:
96
+ vlm_route_repair: vlm
97
+ ocr_page: ocr
98
+ table_vlm_repair: table
99
+ figure_description: vlm
100
+
101
+ pdf:
102
+ render_pages: true
103
+ render_dpi: 150
104
+ crop_tables: true
105
+ crop_figures: true
106
+ asset_dir: assets
107
+
108
+ quality:
109
+ accept_threshold: 0.88
110
+ blocking_failures:
111
+ - empty_page
112
+ - invalid_table
113
+ - missing_text_coverage
114
+ - reading_order_failure
115
+
116
+ chunking:
117
+ enabled: true
118
+ planner: agentic
119
+ baseline_strategy: recursive_structure
120
+ target_tokens: 512
121
+ min_tokens: 120
122
+ overlap_ratio: 0.15
123
+ parent_child: true
124
+ parent_target_tokens: 1600
125
+ page_level_for_paginated_docs: true
126
+ table_chunks: true
127
+ figure_chunks: true
128
+ contextual_prefix: false
129
+ contextual_retrieval: false
130
+ semantic_similarity_threshold: 0.18
131
+ max_propositions_per_source: 8
132
+ max_proposition_chunks: 64
133
+ semantic_chunking: false
134
+ late_chunking: false
135
+ vision_guided: false
136
+ agentic_proposition_chunking: false
137
+ strategy_ladder:
138
+ - fixed_token_baseline
139
+ - recursive_structure
140
+ - metadata_enriched
141
+ - parent_child
142
+ - contextual_retrieval
143
+ - late_chunking
144
+ - semantic_chunking
145
+ - vision_guided
146
+ - agentic_proposition
147
+
148
+ benchmarks:
149
+ retriever:
150
+ # `lexical` (default, model-free TF-IDF) or `embedding` (sentence-transformers).
151
+ # The `embedding` backend pulls model_id and task from gpu.models.embedding
152
+ # unless overridden here. Requires `pip install sentence-transformers`.
153
+ backend: lexical
154
+ model_id: null
155
+ task: null
156
+
157
+ deployment:
158
+ target: huggingface_spaces
159
+ gpu_models_target: zeroshotGPU
configs/docling.yaml ADDED
@@ -0,0 +1,29 @@
1
+ parsers:
2
+ # Both docling and pymupdf are enabled deliberately so the parser
3
+ # disagreement-rate metric has a comparison surface on PDF inputs.
4
+ # Disable one if you only need a single-parser baseline.
5
+ docling:
6
+ enabled: true
7
+ do_ocr: false
8
+ do_table_structure: false
9
+ force_backend_text: true
10
+ generate_page_images: false
11
+ generate_picture_images: false
12
+ generate_table_images: false
13
+ do_picture_description: false
14
+ do_picture_classification: false
15
+ do_formula_enrichment: false
16
+ do_code_enrichment: false
17
+ marker:
18
+ enabled: false
19
+ pymupdf:
20
+ enabled: true
21
+
22
+ routing:
23
+ run_multiple_on_hard_pages: true
24
+ max_primary_parsers_per_page: 2
25
+
26
+ pdf:
27
+ render_pages: true
28
+ crop_tables: true
29
+ crop_figures: true
configs/gpu.yaml ADDED
@@ -0,0 +1,43 @@
1
+ gpu:
2
+ backend: transformers
3
+ provider: huggingface_spaces
4
+ space_name: zeroshotGPU
5
+ batch_pages: true
6
+ validate_tasks: true
7
+ max_batch_size: 4
8
+ max_gpu_seconds_per_doc: 120
9
+ max_vlm_calls_per_doc: 30
10
+ models:
11
+ vlm:
12
+ model_id: Qwen/Qwen2.5-VL-3B-Instruct
13
+ task: image-text-to-text
14
+ device: auto
15
+ dtype: bfloat16
16
+ max_batch_size: 1
17
+ ocr:
18
+ model_id: Qwen/Qwen2.5-VL-3B-Instruct
19
+ task: document-ocr
20
+ device: auto
21
+ dtype: bfloat16
22
+ max_batch_size: 1
23
+ table:
24
+ model_id: Qwen/Qwen2.5-VL-3B-Instruct
25
+ task: table-repair
26
+ device: auto
27
+ dtype: bfloat16
28
+ max_batch_size: 1
29
+ embedding:
30
+ model_id: jinaai/jina-embeddings-v3
31
+ task: retrieval.passage
32
+ device: auto
33
+ dtype: bfloat16
34
+ max_batch_size: 16
35
+ task_model_roles:
36
+ vlm_route_repair: vlm
37
+ ocr_page: ocr
38
+ table_vlm_repair: table
39
+ figure_description: vlm
40
+
41
+ deployment:
42
+ target: huggingface_spaces
43
+ gpu_models_target: zeroshotGPU
configs/parsers.yaml ADDED
@@ -0,0 +1,33 @@
1
+ parsers:
2
+ text:
3
+ enabled: true
4
+ pymupdf:
5
+ enabled: true
6
+ docling:
7
+ enabled: false
8
+ marker:
9
+ enabled: false
10
+ command: null
11
+ timeout_seconds: 300
12
+ output_args: "--output_dir {output_dir} --output_format markdown"
13
+ extra_args: ""
14
+ mineru:
15
+ enabled: false
16
+ command: null
17
+ timeout_seconds: 600
18
+ output_args: "--output_dir {output_dir}"
19
+ extra_args: ""
20
+ olmocr:
21
+ enabled: false
22
+ command: null
23
+ timeout_seconds: 600
24
+ output_args: "--output_dir {output_dir}"
25
+ extra_args: ""
26
+ paddleocr:
27
+ enabled: false
28
+ command: null
29
+ timeout_seconds: 600
30
+ output_args: "--output_dir {output_dir}"
31
+ extra_args: ""
32
+ unstructured:
33
+ enabled: false
configs/routing.yaml ADDED
@@ -0,0 +1,8 @@
1
+ routing:
2
+ run_multiple_on_hard_pages: true
3
+ max_primary_parsers_per_page: 2
4
+ hard_page_threshold: 0.65
5
+ scanned_text_threshold: 0.40
6
+ table_density_threshold: 0.25
7
+ formula_density_threshold: 0.15
8
+ figure_density_threshold: 0.20
docs/space_smoke.md ADDED
@@ -0,0 +1,269 @@
1
+ # Hugging Face Space smoke-test checklist
2
+
3
+ This is the deferred deployment-readiness work that can only be exercised on
4
+ real GPU hardware against real models / external CLIs. Run each smoke once
5
+ against a duplicated `zeroshotGPU` Space (or your own dev Space). Each entry
6
+ gives the exact env vars / config flips, the command to trigger, and the
7
+ structured log lines you should expect.
8
+
9
+ All log lines below assume the Space is run with `ZSGDP_LOG_LEVEL=INFO` and
10
+ `ZSGDP_LOG_JSON=1`. `app.py` sets these automatically when `SPACE_ID` is in
11
+ the environment, so on a normal Space you do not need to set them yourself.
12
+ The HF Spaces logs page will surface the JSON records on stderr.
13
+
14
+ ---
15
+
16
+ ## Pre-flight
17
+
18
+ 1. Duplicate the Space, give it `l4x1` hardware.
19
+ 2. Make sure these are set in **Space settings → Variables and secrets**:
20
+ - `ZSGDP_LOG_LEVEL=INFO`
21
+ - `ZSGDP_LOG_JSON=1`
22
+ - (Optional, only for parser smokes that hit a private repo) `HF_TOKEN`.
23
+ 3. In the Space's `requirements.txt`, uncomment the dependency block matching
24
+ the smoke you are running. Do **one smoke per Space deploy** — combining
25
+ them risks an OOM or slow cold-start on the L4.
26
+ 4. Push and wait for the Space to build. First-build cold-start with a model
27
+ download is ~5-10 minutes; subsequent restarts are seconds.
28
+
29
+ After deploy, watch the **Logs** tab for the `parse_start` event. If you do
30
+ not see structured JSON lines there, the logging config is not active —
31
+ double-check `ZSGDP_LOG_JSON=1` in the Space variables.
32
+
33
+ ## Automated runner
34
+
35
+ Each smoke below has an automated counterpart in
36
+ `scripts/run_space_smoke.py`. From a Space JupyterLab terminal (or any
37
+ shell with the project installed):
38
+
39
+ ```bash
40
+ # Run all smokes whose deps are installed; skip the rest with hints:
41
+ python -m scripts.run_space_smoke --output ./space_smoke_report.json
42
+
43
+ # Run only specific smokes:
44
+ python -m scripts.run_space_smoke --smoke lexical --smoke ablation
45
+
46
+ # CI-strict mode: treat skipped smokes as failures (use after you've
47
+ # uncommented the deps for the smoke you intend to run):
48
+ python -m scripts.run_space_smoke --smoke embedding --strict
49
+ ```
50
+
51
+ The runner reports `pass` / `fail` / `skip` / `error` per smoke, plus
52
+ elapsed seconds and a `detail` block with the metrics it gathered. The
53
+ manual procedure below is the fallback when you want to inspect the UI
54
+ directly or test something the runner doesn't cover (e.g. uploading a
55
+ specific real PDF rather than a synthetic fixture).
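+
+ For CI wiring, a minimal sketch of consuming the JSON report (field names
+ match `SmokeReport.to_dict` in `scripts/run_space_smoke.py`):
+
+ ```python
+ import json
+
+ with open("space_smoke_report.json", encoding="utf-8") as fh:
+     report = json.load(fh)
+ for smoke in report["smokes"]:
+     print(f"{smoke['name']}: {smoke['status']} ({smoke['elapsed_seconds']}s)")
+ failed = report["summary"]["failed"] + report["summary"]["errored"]
+ raise SystemExit(1 if failed else 0)
+ ```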
56
+
57
+ ---
58
+
59
+ ## Smoke 1 — Lexical retriever benchmark (model-free)
60
+
61
+ Confirms the Space's parsing + benchmark plumbing works end-to-end before
62
+ adding any model dependency.
63
+
64
+ **Setup:**
65
+ - Default `requirements.txt` (no uncommenting needed).
66
+ - Default config (no flips).
67
+
68
+ **Trigger:** upload a small markdown file via the Gradio UI.
69
+
70
+ **Expected log lines (in order):**
71
+ - `parse_start` with `doc_id`, `file_type`, `device` (likely `cuda`).
72
+ - One `parser_candidate` per parser that ran (typically `text`, possibly
73
+ `pymupdf` and `docling` if the file was a PDF).
74
+ - Possibly one or more `repair_iteration` records if quality < threshold.
75
+ - `parse_end` with `quality_score`, `repair_iterations`, `chunk_count`.
76
+
77
+ **Pass criteria:**
78
+ - All log lines appear with `doc_id` populated.
79
+ - `parse_end.quality_score >= 0.85` for a clean markdown doc.
80
+ - No `parser_failed` or `gpu_task_blocked` records.
81
+
82
+ ---
83
+
84
+ ## Smoke 2 — Embedding retriever (jina-embeddings-v3)
85
+
86
+ Confirms `sentence-transformers` lazy-load path and that jina-v3 specifically
87
+ runs on the L4 with `trust_remote_code=True`.
88
+
89
+ **Setup:**
90
+ - In `requirements.txt`, uncomment `transformers` and `sentence-transformers`
91
+ lines.
92
+ - Add `configs/space_embedding.yaml` to the repo with:
93
+
94
+ ```yaml
95
+ benchmarks:
96
+ retriever:
97
+ backend: embedding
98
+ model_id: jinaai/jina-embeddings-v3
99
+ task: retrieval.passage
100
+ ```
101
+
102
+ - In `app.py` set `os.environ["ZSGDP_CONFIG_PATH"] = "configs/space_embedding.yaml"`,
103
+ or pass via the env var configured in Space variables.
104
+
105
+ **Trigger:** the benchmark CLI is not reachable from the Gradio UI today, so
106
+ run `zsgdp benchmark --input ./fixtures --output ./out` from a Space
107
+ **JupyterLab** session against a small input dir. Uploading a markdown/PDF
108
+ via the UI only exercises the parse path, not the retrieval benchmark.
109
+
110
+ **Expected log lines:**
111
+ - First call: a 30–90s pause while jina-v3 weights download (no log lines
112
+ during this — torch logs go to its own logger). Then `parse_start`.
113
+ - After the first parse, subsequent calls are fast (model is in memory).
114
+
115
+ **Pass criteria:**
116
+ - Benchmark completes without an exception.
117
+ - `summary["mean_retrieval_recall_at_5"] >= 0.7` on a small distinct-text
118
+ corpus.
119
+ - No `gpu_task_blocked` records (those are repair-related, not retrieval).
120
+ - The parse_end record's `device` field reads `cuda`.
121
+
122
+ **Failure modes to watch:**
123
+ - `RuntimeError: EmbeddingRetriever requires sentence-transformers` →
124
+ package not in `requirements.txt`.
125
+ - CUDA OOM → switch to a smaller embedding model
126
+ (`sentence-transformers/all-MiniLM-L6-v2`) for the smoke and confirm the
127
+ wiring before retrying jina-v3.
128
+
129
+ ---
130
+
131
+ ## Smoke 3 — Live GPU repair on a malformed table
132
+
133
+ Confirms the repair loop's GPU escalation path actually invokes the
134
+ configured VLM and that the result is applied to the merged document.
135
+
136
+ **Setup:**
137
+ - In `requirements.txt`, uncomment `transformers` (sentence-transformers
138
+ not needed for this smoke).
139
+ - Add `configs/space_gpu_repair.yaml`:
140
+
141
+ ```yaml
142
+ parsers:
143
+ docling:
144
+ enabled: true
145
+ pymupdf:
146
+ enabled: true
147
+ repair:
148
+ enabled: true
149
+ gpu_escalation: true
150
+ execute_gpu_escalations: true # the bit that flips the live path on
151
+ gpu:
152
+ backend: transformers
153
+ models:
154
+ table:
155
+ model_id: Qwen/Qwen2.5-VL-3B-Instruct
156
+ task: table-repair
157
+ device: auto
158
+ dtype: bfloat16
159
+ ```
160
+
161
+ - Set `ZSGDP_CONFIG_PATH=configs/space_gpu_repair.yaml` on the Space.
162
+
163
+ **Trigger:** upload a PDF that contains a table the parsers will likely
164
+ mangle. A two-column financial statement page works well; if you don't
165
+ have one handy, take a Wikipedia article PDF that has a comparison table.
166
+
167
+ **Expected log lines (in order):**
168
+ - `parse_start`.
169
+ - `parser_candidate` for docling and pymupdf (both should fire on a PDF).
170
+ - `repair_iteration` with `iteration=1`, `gpu_task_count >= 1`,
171
+ `gpu_dry_run=false`.
172
+ - One `gpu_task_executed` record per GPU task. `status` should be
173
+ `executed` and `elapsed_seconds` 1-10s for a 3B-param VLM on L4.
174
+ - A second `repair_iteration` with `iteration=2` only if iteration 1
175
+ changed something and quality is still below threshold; otherwise the
176
+ loop terminates.
177
+ - `parse_end` with `repair_iterations >= 1`.
178
+
179
+ **Pass criteria:**
180
+ - At least one `gpu_task_executed` with `status=executed`.
181
+ - The output `parsed_document.json` shows `parsed.tables[i].provenance.gpu_repair_task_id` set.
182
+ - No `gpu_task_blocked` records (would mean missing image_path or doc_id).
183
+
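+ To check the provenance criterion above programmatically — a sketch that
+ assumes `parsed_document.json` serializes tables with a `provenance` dict,
+ as the criterion implies:
+
+ ```python
+ import json
+ from pathlib import Path
+
+ doc = json.loads(Path("out/parsed_document.json").read_text(encoding="utf-8"))
+ repaired = [
+     t for t in doc.get("tables", [])
+     if t.get("provenance", {}).get("gpu_repair_task_id")
+ ]
+ assert repaired, "no table carries gpu_repair_task_id; live repair did not apply"
+ print(f"tables with GPU repair provenance: {len(repaired)}")
+ ```
+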
184
+ **Failure modes to watch:**
185
+ - All `gpu_task_executed` records show `status=execution_failed` →
186
+ inspect `output.error` field; common causes are missing image_path
187
+ (the PDF doesn't render page crops because `pdf.crop_tables=true` isn't
188
+ set) or a CUDA OOM.
189
+ - No `repair_iteration` records → the verifier didn't flag any
190
+ blocking issues; pick a different input PDF.
191
+
192
+ ---
193
+
194
+ ## Smoke 4 — Per-parser ablation across docling + pymupdf
195
+
196
+ Confirms the ablation runner produces a comparison CSV and that each arm's
197
+ artifacts are isolated. No GPU dependency, runs on default Space hardware.
198
+
199
+ **Setup:** default config, no requirements.txt changes.
200
+
201
+ **Trigger:** Space JupyterLab terminal:
202
+
203
+ ```bash
204
+ zsgdp benchmark-ablate \
205
+ --input ./fixtures/pdfs \
206
+ --output ./out/ablation \
207
+ --parser docling --parser pymupdf
208
+ ```
209
+
210
+ **Expected log lines:** one parse cycle per arm (parse_start through
211
+ parse_end), three arms total (docling-only, pymupdf-only, merged).
212
+
213
+ **Pass criteria:**
214
+ - `out/ablation/ablation_comparison.csv` has 3 rows.
215
+ - Each arm's `mean_quality_score` is non-zero.
216
+ - The merged arm's `mean_quality_score` is `>= max(per-parser arms)`.
217
+
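+ A sketch of checking those criteria from the CSV (column names `arm` and
+ `mean_quality_score` are inferred from the criteria above — adjust if the
+ runner's header differs):
+
+ ```python
+ import csv
+
+ with open("out/ablation/ablation_comparison.csv", newline="", encoding="utf-8") as fh:
+     rows = list(csv.DictReader(fh))
+ assert len(rows) == 3, f"expected 3 arms, got {len(rows)}"
+ scores = {row["arm"]: float(row["mean_quality_score"]) for row in rows}
+ assert all(score > 0 for score in scores.values()), scores
+ assert scores["merged"] >= max(v for k, v in scores.items() if k != "merged"), scores
+ ```
+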
218
+ ---
219
+
220
+ ## Smoke 5 — External parser CLI (Marker)
221
+
222
+ The riskiest of the four external adapters, because Marker's argv schema
223
+ has changed several times. Run it in its own Space deploy; do not bundle it with other smokes.
224
+
225
+ **Setup:**
226
+ - Uncomment `marker-pdf` in `requirements.txt`.
227
+ - Add `configs/space_marker.yaml`:
228
+
229
+ ```yaml
230
+ parsers:
231
+ text:
232
+ enabled: false
233
+ pymupdf:
234
+ enabled: false
235
+ marker:
236
+ enabled: true
237
+ timeout_seconds: 300
238
+ output_args: ["--output_dir", "{output_dir}", "--output_format", "markdown"]
239
+ extra_args: []
240
+ ```
241
+
242
+ - Set `ZSGDP_CONFIG_PATH=configs/space_marker.yaml`.
243
+
244
+ **Trigger:** upload a small PDF (1–3 pages) via the Gradio UI.
245
+
246
+ **Expected log lines:**
247
+ - `parse_start`.
248
+ - `parser_candidate` for `marker` with non-zero `element_count`.
249
+ - `parse_end` with `candidate_parsers=["marker"]`.
250
+
251
+ **Pass criteria:**
252
+ - No `parser_failed` record for marker.
253
+ - Output Markdown has reasonable content (open the artifact zip and check).
254
+ - If `parser_failed` fires, look at `extra.error` — most common cause is
255
+ argv schema drift; tweak `output_args` in the config and retry.
256
+
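+ Before retrying, a quick probe of the installed CLI's argv surface
+ (assumes the binary exposes `--help`):
+
+ ```python
+ import shutil
+ import subprocess
+
+ cli = shutil.which("marker_single") or shutil.which("marker")
+ assert cli, "marker CLI not on PATH"
+ print(subprocess.run([cli, "--help"], capture_output=True, text=True).stdout[:2000])
+ ```
+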
257
+ ---
258
+
259
+ ## What "deployment ready" means after this checklist
260
+
261
+ If smokes 1–3 pass on a fresh duplicated Space, the project is genuinely
262
+ deployable for the Docling + PyMuPDF + Qwen2.5-VL-3B repair stack. Smokes 4
263
+ and 5 are nice-to-have — the per-parser ablation works locally too, and
264
+ external parsers stay flagged "experimental" until you actively need them.
265
+
266
+ Open the `parsed_document.json` from each smoke, copy the `quality_score`,
267
+ `mean_layout_f1` (where applicable), and any §29-relevant metric into
268
+ `README.md` under a new "Production benchmark numbers" section. That
269
+ publishes evidence that the success criteria are met against real data.
examples/parse_folder.py ADDED
@@ -0,0 +1,27 @@
1
+ """Parse a folder sequentially."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+ from pathlib import Path
7
+
8
+ from zsgdp import parse_document
9
+
10
+
11
+ def main() -> int:
12
+ parser = argparse.ArgumentParser()
13
+ parser.add_argument("input")
14
+ parser.add_argument("output")
15
+ args = parser.parse_args()
16
+
17
+ input_dir = Path(args.input)
18
+ output_dir = Path(args.output)
19
+ output_dir.mkdir(parents=True, exist_ok=True)
20
+ for path in sorted(item for item in input_dir.iterdir() if item.is_file()):
21
+ parsed = parse_document(path, output_dir / path.stem)
22
+ print(f"{path.name}: score={parsed.quality_report.score:.2f}")
23
+ return 0
24
+
25
+
26
+ if __name__ == "__main__":
27
+ raise SystemExit(main())
examples/parse_pdf.py ADDED
@@ -0,0 +1,25 @@
1
+ """Parse one PDF with the MVP pipeline."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+
7
+ from zsgdp import parse_document
8
+
9
+
10
+ def main() -> int:
11
+ parser = argparse.ArgumentParser()
12
+ parser.add_argument("input")
13
+ parser.add_argument("output")
14
+ args = parser.parse_args()
15
+ parsed = parse_document(args.input, args.output)
16
+ print(
17
+ f"score={parsed.quality_report.score:.2f} "
18
+ f"elements={len(parsed.elements)} tables={len(parsed.tables)} "
19
+ f"figures={len(parsed.figures)} chunks={len(parsed.chunks)}"
20
+ )
21
+ return 0
22
+
23
+
24
+ if __name__ == "__main__":
25
+ raise SystemExit(main())
examples/run_benchmark.py ADDED
@@ -0,0 +1,33 @@
1
+ """Minimal benchmark runner placeholder."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+ from pathlib import Path
7
+ from time import perf_counter
8
+
9
+ from zsgdp import parse_document
10
+ from zsgdp.benchmarks.throughput import pages_per_second
11
+
12
+
13
+ def main() -> int:
14
+ parser = argparse.ArgumentParser()
15
+ parser.add_argument("input")
16
+ parser.add_argument("output")
17
+ args = parser.parse_args()
18
+
19
+ input_dir = Path(args.input)
20
+ output_dir = Path(args.output)
21
+ output_dir.mkdir(parents=True, exist_ok=True)
22
+ total_pages = 0
23
+ started = perf_counter()
24
+ for path in sorted(item for item in input_dir.iterdir() if item.is_file()):
25
+ parsed = parse_document(path, output_dir / path.stem)
26
+ total_pages += len(parsed.pages)
27
+ elapsed = perf_counter() - started
28
+ print(f"pages={total_pages} seconds={elapsed:.2f} pages_per_second={pages_per_second(total_pages, elapsed):.2f}")
29
+ return 0
30
+
31
+
32
+ if __name__ == "__main__":
33
+ raise SystemExit(main())
pyproject.toml ADDED
@@ -0,0 +1,41 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "zero-shot-gpu-doc-parser"
7
+ version = "0.1.0"
8
+ description = "Zero-shot GPU document parsing and agentic chunking control plane."
9
+ readme = "README.md"
10
+ requires-python = ">=3.11"
11
+ license = { text = "MIT" }
12
+ authors = [{ name = "Zero-Shot GPU Document Parser Contributors" }]
13
+ dependencies = []
14
+
15
+ [project.optional-dependencies]
16
+ pdf = ["pymupdf>=1.24.0,<1.28.0"]
17
+ yaml = ["pyyaml>=6.0.1,<7.0.0"]
18
+ docling = ["docling>=2.0.0,<3.0.0"]
19
+ # `spaces` mirrors requirements.txt at the root, which is what HF Spaces
20
+ # installs verbatim. Keep these two in sync; torch is intentionally absent
21
+ # because the l4x1 Space image preinstalls a CUDA-matched build.
22
+ spaces = [
23
+ "gradio>=4.44.0,<7.0.0",
24
+ "pymupdf>=1.24.0,<1.28.0",
25
+ "pyyaml>=6.0.1,<7.0.0",
26
+ "docling>=2.0.0,<3.0.0",
27
+ ]
28
+ embedding = ["sentence-transformers>=3.0.0,<4.0.0", "transformers>=4.45.0,<6.0.0"]
29
+ gpu_repair = ["transformers>=4.45.0,<6.0.0"]
30
+ dev = ["pytest>=8.0.0"]
31
+
32
+ [project.scripts]
33
+ zsgdp = "zsgdp.cli:main"
34
+
35
+ [tool.setuptools.packages.find]
36
+ where = ["."]
37
+ include = ["zsgdp*"]
38
+
39
+ [tool.pytest.ini_options]
40
+ testpaths = ["tests"]
41
+ pythonpath = ["."]
requirements.txt ADDED
@@ -0,0 +1,33 @@
1
+ # Hugging Face Spaces dependencies for zeroshotGPU.
2
+ #
3
+ # Versions are pinned to tested upper bounds within each major. Bump them
4
+ # only after `python -m unittest discover` and the benchmark suite both
5
+ # pass against the new release.
6
+ #
7
+ # Torch is intentionally NOT pinned here. The l4x1 Space image preinstalls a
8
+ # CUDA-matched torch build; pinning torch in this file overrides it and risks
9
+ # a runtime/driver mismatch. If you're running locally without the Space
10
+ # preinstall, install torch separately via the recommended channel for your
11
+ # platform (e.g. `pip install torch --index-url https://download.pytorch.org/whl/cu121`).
12
+
13
+ gradio>=4.44.0,<7.0.0
14
+ pymupdf>=1.24.0,<1.28.0
15
+ pyyaml>=6.0.1,<7.0.0
16
+ docling>=2.0.0,<3.0.0
17
+
18
+ # Optional GPU/embedding stack. Uncomment to enable the embedding retriever
19
+ # (benchmarks.retriever.backend=embedding) and live GPU repair escalations
20
+ # (repair.execute_gpu_escalations=true). Both are off by default.
21
+ #
22
+ # transformers>=4.45.0,<6.0.0
23
+ # sentence-transformers>=3.0.0,<4.0.0
24
+
25
+ # Optional external parser CLIs. Each adds a non-trivial install footprint;
26
+ # enable only the ones the Space hardware can support. Adapter shells out to
27
+ # the CLI binary (see zsgdp/parsers/external.py); these adapters have not
28
+ # been smoke-tested against a live install — verify the argv schema before
29
+ # enabling in production.
30
+ #
31
+ # marker-pdf>=1.0.0
32
+ # mineru
33
+ # unstructured>=0.15.0
scripts/__init__.py ADDED
File without changes
scripts/run_space_smoke.py ADDED
@@ -0,0 +1,455 @@
1
+ """Space-side smoke validation runner.
2
+
3
+ Automates the smokes documented in docs/space_smoke.md so a Space operator
4
+ can run one command and get a JSON report of which smokes passed, which
5
+ were skipped (missing deps), and which failed (with diagnostic context).
6
+
7
+ Usage:
8
+
9
+ # Run all smokes that have their deps installed:
10
+ python -m scripts.run_space_smoke --output ./space_smoke_report.json
11
+
12
+ # Run only a subset:
13
+ python -m scripts.run_space_smoke --smoke lexical --smoke ablation
14
+
15
+ # Force-fail on skipped smokes (CI-style strict mode):
16
+ python -m scripts.run_space_smoke --strict
17
+
18
+ The runner does NOT install missing dependencies — that's deliberately the
19
+ operator's job (each smoke's deps add Space build time and download cost).
20
+ A skipped smoke prints the exact `pip install` line you'd need.
21
+
22
+ Smokes mirror docs/space_smoke.md:
23
+
24
+ lexical - model-free benchmark on a synthetic markdown corpus
25
+ ablation - per-parser ablation runner (text vs pymupdf)
26
+ embedding - sentence-transformers / jina-embeddings-v3 retrieval
27
+ gpu_repair - dry-run GPU escalation wiring against a malformed table
28
+ marker - probe that the Marker CLI adapter is installed and available
29
+ """
30
+
31
+ from __future__ import annotations
32
+
33
+ import argparse
34
+ import importlib.util
35
+ import json
36
+ import shutil
37
+ import tempfile
40
+ import time
41
+ from dataclasses import dataclass, field
42
+ from pathlib import Path
43
+ from typing import Any, Callable
44
+
45
+
47
+
48
+ @dataclass(slots=True)
49
+ class SmokeResult:
50
+ name: str
51
+ status: str # "pass" | "fail" | "skip" | "error"
52
+ elapsed_seconds: float = 0.0
53
+ detail: dict[str, Any] = field(default_factory=dict)
54
+ skip_reason: str = ""
55
+ install_hint: str = ""
56
+
57
+
58
+ @dataclass(slots=True)
59
+ class SmokeReport:
60
+ smokes: list[SmokeResult] = field(default_factory=list)
61
+
62
+ @property
63
+ def passed(self) -> bool:
64
+ return all(item.status in {"pass", "skip"} for item in self.smokes)
65
+
66
+ def to_dict(self) -> dict[str, Any]:
67
+ return {
68
+ "smokes": [
69
+ {
70
+ "name": item.name,
71
+ "status": item.status,
72
+ "elapsed_seconds": round(item.elapsed_seconds, 3),
73
+ "detail": item.detail,
74
+ "skip_reason": item.skip_reason,
75
+ "install_hint": item.install_hint,
76
+ }
77
+ for item in self.smokes
78
+ ],
79
+ "summary": {
80
+ "total": len(self.smokes),
81
+ "passed": sum(1 for item in self.smokes if item.status == "pass"),
82
+ "failed": sum(1 for item in self.smokes if item.status == "fail"),
83
+ "errored": sum(1 for item in self.smokes if item.status == "error"),
84
+ "skipped": sum(1 for item in self.smokes if item.status == "skip"),
85
+ },
86
+ }
87
+
88
+
89
+ # --- Individual smokes -------------------------------------------------------
90
+
91
+
92
+ def _make_distinctive_corpus(root: Path) -> Path:
93
+ """Build a small corpus with three sentences distinct enough that the
94
+ synthetic-QA generator picks one query per chunk."""
95
+
96
+ src = root / "in"
97
+ src.mkdir()
98
+ (src / "doc.md").write_text(
99
+ "# Sample Doc\n\n"
100
+ "Apples grow on trees in the orchard during autumn harvest season.\n\n"
101
+ "Submarines navigate beneath the ocean using sonar pulses across waters.\n\n"
102
+ "Mountains rise above the clouds in the distant horizon line.\n",
103
+ encoding="utf-8",
104
+ )
105
+ return src
106
+
107
+
108
+ def smoke_lexical() -> SmokeResult:
109
+ started = time.perf_counter()
110
+ from zsgdp.benchmarks.parser_quality import run_parser_benchmark
111
+
112
+ with tempfile.TemporaryDirectory() as tmp:
113
+ tmp_path = Path(tmp)
114
+ src = _make_distinctive_corpus(tmp_path)
115
+ out = tmp_path / "out"
116
+ try:
117
+ summary = run_parser_benchmark(src, out, dataset_name="custom_folder")
118
+ except Exception as exc:
119
+ return SmokeResult(
120
+ name="lexical",
121
+ status="error",
122
+ elapsed_seconds=time.perf_counter() - started,
123
+ detail={"exception": str(exc)},
124
+ )
125
+
126
+ quality = float(summary.get("mean_quality_score", 0.0))
127
+ recall = float(summary.get("mean_retrieval_recall_at_1", 0.0))
128
+ passed = quality >= 0.85 and recall >= 0.7
129
+ return SmokeResult(
130
+ name="lexical",
131
+ status="pass" if passed else "fail",
132
+ elapsed_seconds=time.perf_counter() - started,
133
+ detail={
134
+ "mean_quality_score": quality,
135
+ "mean_retrieval_recall_at_1": recall,
136
+ "documents_evaluated": summary.get("document_count"),
137
+ },
138
+ )
139
+
140
+
141
+ def smoke_ablation() -> SmokeResult:
142
+ started = time.perf_counter()
143
+ from zsgdp.benchmarks.ablation_runner import run_parser_ablations
144
+
145
+ with tempfile.TemporaryDirectory() as tmp:
146
+ tmp_path = Path(tmp)
147
+ src = _make_distinctive_corpus(tmp_path)
148
+ out = tmp_path / "out"
149
+ try:
150
+ comparison = run_parser_ablations(
151
+ src,
152
+ out,
153
+ parsers=["text", "pymupdf"],
154
+ dataset_name="custom_folder",
155
+ )
156
+ except Exception as exc:
157
+ return SmokeResult(
158
+ name="ablation",
159
+ status="error",
160
+ elapsed_seconds=time.perf_counter() - started,
161
+ detail={"exception": str(exc)},
162
+ )
163
+
164
+ comparison_csv_exists = (out / "ablation_comparison.csv").exists()
165
+
166
+ arms = [row["arm"] for row in comparison["rows"]]
167
+ expected_arms = {"text", "pymupdf", "merged"}
168
+ passed = comparison["arm_count"] == 3 and set(arms) == expected_arms and comparison_csv_exists
169
+ return SmokeResult(
170
+ name="ablation",
171
+ status="pass" if passed else "fail",
172
+ elapsed_seconds=time.perf_counter() - started,
173
+ detail={
174
+ "arm_count": comparison["arm_count"],
175
+ "arms": arms,
176
+ "comparison_csv_emitted": comparison_csv_exists,
177
+ },
178
+ )
179
+
180
+
181
+ def smoke_embedding() -> SmokeResult:
182
+ started = time.perf_counter()
183
+ if importlib.util.find_spec("sentence_transformers") is None:
184
+ return SmokeResult(
185
+ name="embedding",
186
+ status="skip",
187
+ elapsed_seconds=time.perf_counter() - started,
188
+ skip_reason="sentence-transformers not installed",
189
+ install_hint="python -m pip install 'zero-shot-gpu-doc-parser[embedding]'",
190
+ )
191
+
192
+ from zsgdp.benchmarks.embedding_retriever import EmbeddingRetriever
193
+ from zsgdp.benchmarks.parser_quality import run_parser_benchmark
194
+
195
+ # Try to load the configured embedding model. If the load fails (no HF
196
+ # token, download error, OOM at import time), we report it as a skip
197
+ # with the exception text so the operator sees what to fix without the
198
+ # whole smoke run blowing up.
199
+ try:
200
+ retriever = EmbeddingRetriever()
201
+ retriever._ensure_embedder() # type: ignore[attr-defined] # private but intentional
202
+ except Exception as exc:
203
+ return SmokeResult(
204
+ name="embedding",
205
+ status="skip",
206
+ elapsed_seconds=time.perf_counter() - started,
207
+ skip_reason=f"embedding model failed to load: {exc}",
208
+ install_hint="Set HF_TOKEN if the model is gated, or downsize via "
209
+ "benchmarks.retriever.model_id (e.g. sentence-transformers/all-MiniLM-L6-v2).",
210
+ )
211
+
212
+ with tempfile.TemporaryDirectory() as tmp:
214
+ tmp_path = Path(tmp)
215
+ src = _make_distinctive_corpus(tmp_path)
216
+ out = tmp_path / "out"
217
+ config_path = tmp_path / "config.yaml"
218
+ # Inline config write — keeps the smoke self-contained.
219
+ config_path.write_text(
220
+ "benchmarks:\n retriever:\n backend: embedding\n",
221
+ encoding="utf-8",
222
+ )
223
+ try:
224
+ summary = run_parser_benchmark(src, out, config_path=config_path, dataset_name="custom_folder")
225
+ except Exception as exc:
226
+ return SmokeResult(
227
+ name="embedding",
228
+ status="error",
229
+ elapsed_seconds=time.perf_counter() - started,
230
+ detail={"exception": str(exc)},
231
+ )
232
+
233
+ recall_5 = float(summary.get("mean_retrieval_recall_at_5", 0.0))
234
+ passed = recall_5 >= 0.7
235
+ return SmokeResult(
236
+ name="embedding",
237
+ status="pass" if passed else "fail",
238
+ elapsed_seconds=time.perf_counter() - started,
239
+ detail={
240
+ "mean_retrieval_recall_at_5": recall_5,
241
+ "mean_retrieval_recall_at_1": float(summary.get("mean_retrieval_recall_at_1", 0.0)),
242
+ "documents_evaluated": summary.get("document_count"),
243
+ },
244
+ )
245
+
246
+
247
+ def smoke_gpu_repair() -> SmokeResult:
248
+ started = time.perf_counter()
249
+ if importlib.util.find_spec("transformers") is None:
250
+ return SmokeResult(
251
+ name="gpu_repair",
252
+ status="skip",
253
+ elapsed_seconds=time.perf_counter() - started,
254
+ skip_reason="transformers not installed",
255
+ install_hint="python -m pip install 'zero-shot-gpu-doc-parser[gpu_repair]'",
256
+ )
257
+
258
+ # Don't actually instantiate the transformers pipeline here — it would
259
+ # download multi-GB Qwen2.5-VL weights even on a dry probe. Instead, we
260
+ # smoke-test the wiring: a dry-run task plan, and report whether the
261
+ # underlying client class can be imported. Operators who want a real
262
+ # model invocation should use `run-gpu-tasks --execute` against a parsed
263
+ # output directory; the result lands in repair.gpu_escalation.results.
264
+ from zsgdp.gpu.transformers_client import TransformersClient
265
+ from zsgdp.pipeline import parse_document
266
+
267
+ with tempfile.TemporaryDirectory() as tmp:
268
+ tmp_path = Path(tmp)
269
+ src = tmp_path / "report.md"
270
+ # Malformed table (header has 2 columns; data row has 3) forces the
271
+ # repair loop to plan a table_vlm_repair task.
272
+ src.write_text(
273
+ "# Report\n\n| A | B |\n| --- | --- |\n| 1 | 2 | 3 |\n",
274
+ encoding="utf-8",
275
+ )
276
+ out = tmp_path / "out"
277
+ try:
278
+ parsed = parse_document(src, out)
279
+ except Exception as exc:
280
+ return SmokeResult(
281
+ name="gpu_repair",
282
+ status="error",
283
+ elapsed_seconds=time.perf_counter() - started,
284
+ detail={"exception": str(exc)},
285
+ )
286
+
287
+ repair = parsed.provenance.get("repair", {})
288
+ gpu_escalation = repair.get("gpu_escalation") or {}
289
+ task_count = int(gpu_escalation.get("task_count") or 0)
290
+ iterations = parsed.provenance.get("repair_iterations") or []
291
+ # We can confirm:
292
+ # * Dry-run plan ran (task_count >= 1 for the malformed table)
293
+ # * The repair loop iterated at least once
294
+ # * The TransformersClient class is importable for live execution
295
+ can_execute = TransformersClient is not None
296
+ passed = task_count >= 1 and len(iterations) >= 1 and can_execute
297
+ return SmokeResult(
298
+ name="gpu_repair",
299
+ status="pass" if passed else "fail",
300
+ elapsed_seconds=time.perf_counter() - started,
301
+ detail={
302
+ "dry_run_task_count": task_count,
303
+ "repair_iterations": len(iterations),
304
+ "transformers_client_importable": can_execute,
305
+ "note": "This smoke verifies wiring only. To verify model invocation "
306
+ "end-to-end, set repair.execute_gpu_escalations=true in config "
307
+ "and run zsgdp run-gpu-tasks --execute against a parsed dir.",
308
+ },
309
+ )
310
+
311
+
312
+ def smoke_marker() -> SmokeResult:
313
+ started = time.perf_counter()
314
+ if shutil.which("marker_single") is None and shutil.which("marker") is None:
315
+ return SmokeResult(
316
+ name="marker",
317
+ status="skip",
318
+ elapsed_seconds=time.perf_counter() - started,
319
+ skip_reason="neither `marker_single` nor `marker` found on PATH",
320
+ install_hint="python -m pip install marker-pdf",
321
+ )
322
+
323
+ # Marker is heavy enough that even a probe call can take 30+s on first
324
+ # invocation (model load). We confirm the registry adapter reports
325
+ # available, but don't run a full parse here — surface that as a manual
326
+ # follow-up via the smoke checklist.
327
+ from zsgdp.parsers.registry import get_parser
328
+
329
+ try:
330
+ adapter = get_parser("marker")
331
+ except KeyError as exc:
332
+ return SmokeResult(
333
+ name="marker",
334
+ status="error",
335
+ elapsed_seconds=time.perf_counter() - started,
336
+ detail={"exception": str(exc)},
337
+ )
338
+ available = bool(adapter.available())
339
+ return SmokeResult(
340
+ name="marker",
341
+ status="pass" if available else "fail",
342
+ elapsed_seconds=time.perf_counter() - started,
343
+ detail={
344
+ "adapter_reports_available": available,
345
+ "note": "End-to-end Marker parse is intentionally not run here "
346
+ "(cold-load is heavy). See docs/space_smoke.md Smoke 5 "
347
+ "for the manual upload-and-parse procedure.",
348
+ },
349
+ )
350
+
351
+
352
+ SMOKE_REGISTRY: dict[str, Callable[[], SmokeResult]] = {
353
+ "lexical": smoke_lexical,
354
+ "ablation": smoke_ablation,
355
+ "embedding": smoke_embedding,
356
+ "gpu_repair": smoke_gpu_repair,
357
+ "marker": smoke_marker,
358
+ }
359
+
360
+
361
+ # --- Driver ------------------------------------------------------------------
362
+
363
+
364
+ def run_smokes(names: list[str] | None = None) -> SmokeReport:
365
+ selected = names or list(SMOKE_REGISTRY)
366
+ report = SmokeReport()
367
+ for name in selected:
368
+ smoke = SMOKE_REGISTRY.get(name)
369
+ if smoke is None:
370
+ report.smokes.append(
371
+ SmokeResult(
372
+ name=name,
373
+ status="error",
374
+ detail={"exception": f"unknown smoke: {name}"},
375
+ )
376
+ )
377
+ continue
378
+ try:
379
+ result = smoke()
380
+ except Exception as exc:
381
+ result = SmokeResult(
382
+ name=name,
383
+ status="error",
384
+ detail={"exception": f"{type(exc).__name__}: {exc}"},
385
+ )
386
+ report.smokes.append(result)
387
+ return report
388
+
389
+
390
+ def format_text_summary(report: SmokeReport, *, strict: bool = False) -> str:
391
+ lines: list[str] = []
392
+ for item in report.smokes:
393
+ marker = {
394
+ "pass": "ok",
395
+ "fail": "FAIL",
396
+ "skip": "skip",
397
+ "error": "ERROR",
398
+ }.get(item.status, item.status.upper())
399
+ line = f" [{marker}] {item.name} ({item.elapsed_seconds:.2f}s)"
400
+ if item.status == "skip":
401
+ line += f" reason={item.skip_reason}"
402
+ elif item.status == "fail":
403
+ line += f" detail={json.dumps(item.detail, default=str)}"
404
+ elif item.status == "error":
405
+ line += f" detail={json.dumps(item.detail, default=str)}"
406
+ lines.append(line)
407
+
408
+ summary = report.to_dict()["summary"]
409
+ overall = "PASS" if (report.passed and (not strict or summary["skipped"] == 0)) else "FAIL"
410
+ lines.append(
411
+ f"smoke: {overall} passed={summary['passed']} failed={summary['failed']} "
412
+ f"errored={summary['errored']} skipped={summary['skipped']}"
413
+ )
414
+ return "\n".join(lines)
415
+
416
+
417
+ def main(argv: list[str] | None = None) -> int:
418
+ parser = argparse.ArgumentParser(
419
+ prog="run_space_smoke",
420
+ description="Run zsgdp Space-side smoke validations.",
421
+ )
422
+ parser.add_argument(
423
+ "--smoke",
424
+ action="append",
425
+ dest="smokes",
426
+ choices=list(SMOKE_REGISTRY),
427
+ help="Smoke to run. Repeat to run multiple. Default: all registered smokes.",
428
+ )
429
+ parser.add_argument("--output", help="Optional JSON report path.")
430
+ parser.add_argument(
431
+ "--strict",
432
+ action="store_true",
433
+ help="Treat skipped smokes as failures (useful in CI when all deps must be present).",
434
+ )
435
+ args = parser.parse_args(argv)
436
+
437
+ report = run_smokes(args.smokes)
438
+ print(format_text_summary(report, strict=args.strict))
439
+
440
+ if args.output:
441
+ Path(args.output).write_text(
442
+ json.dumps(report.to_dict(), indent=2, ensure_ascii=False) + "\n",
443
+ encoding="utf-8",
444
+ )
445
+
446
+ summary = report.to_dict()["summary"]
447
+ if summary["failed"] or summary["errored"]:
448
+ return 1
449
+ if args.strict and summary["skipped"]:
450
+ return 1
451
+ return 0
452
+
453
+
454
+ if __name__ == "__main__":
455
+ raise SystemExit(main())
tests/__init__.py ADDED
@@ -0,0 +1 @@
1
+ """Test package."""
tests/regression/README.md ADDED
@@ -0,0 +1,97 @@
1
+ # Regression fixtures
2
+
3
+ Each fixture is a `(<name>.input.<ext>, <name>.expected.json)` pair under
4
+ `fixtures/`. The runner in `test_regression.py` parses every input through
5
+ `parse_document` and compares the resulting `ParsedDocument` against the
6
+ snapshot in `<name>.expected.json` with explicit tolerances.
7
+
8
+ ## Fixture file shape
9
+
10
+ `<name>.expected.json` has these keys (all optional except `name`):
11
+
12
+ ```json
13
+ {
14
+ "name": "human-readable identifier",
15
+ "config": "configs/docling.yaml",
16
+ "selected_parsers": ["text"],
17
+ "tolerances": {
18
+ "quality_score_min": 0.85,
19
+ "element_count_range": [3, 6],
20
+ "table_count": 1,
21
+ "figure_count": 0,
22
+ "chunk_count_min": 1,
23
+ "blocking_failures": false,
24
+ "must_contain_markdown": ["# Report", "Apples grow"],
25
+ "must_not_contain_markdown": ["TODO", "FIXME"]
26
+ }
27
+ }
28
+ ```
29
+
30
+ Tolerance keys (all optional):
31
+
32
+ - `quality_score_min` (float): assert `parsed.quality_report.score >= value`.
33
+ - `quality_score_max` (float): assert `parsed.quality_report.score <= value`.
34
+ - `element_count` (int) or `element_count_range` ([min, max]).
35
+ - `table_count` (int) or `table_count_range`.
36
+ - `figure_count` (int) or `figure_count_range`.
37
+ - `chunk_count_min` (int): assert at least N chunks.
38
+ - `chunk_count_max` (int): assert at most N chunks.
39
+ - `blocking_failures` (bool): assert `quality_report.has_blocking_failures` matches.
40
+ - `must_contain_markdown` (list[str]): each string must appear in
41
+ `parsed.to_markdown()`.
42
+ - `must_not_contain_markdown` (list[str]): each string must NOT appear.
43
+ - `must_contain_quality_metrics` (list[str]): each metric key must appear in
44
+ `quality_report.metrics`.
45
+ - `parser_disagreement_rate_max` (float): assert disagreement <= value.
46
+ - `repair_resolution_rate_min` (float): assert resolution >= value.
47
+
48
+ Missing keys are not asserted (no false failures from over-specification).
49
+
50
+ ## Adding a fixture
51
+
52
+ 1. Drop the input document under `fixtures/`. PDFs, markdown, html, txt all
53
+ work via the standard pipeline.
54
+ 2. Run a one-off `parse_document` against it locally and inspect the output.
55
+ 3. Hand-write `<name>.expected.json` with the constraints you want to lock
56
+ down. Prefer ranges over exact counts where reasonable variance exists.
57
+ 4. Run `python3.11 -m unittest tests.regression.test_regression`. It auto-discovers.
58
+
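+ A one-off inspection sketch for step 2 (public API, as used in
+ `examples/parse_pdf.py`; the fixture path is illustrative):
+
+ ```python
+ from zsgdp import parse_document
+
+ parsed = parse_document("tests/regression/fixtures/my_fixture.input.md", "/tmp/fixture_out")
+ print(parsed.quality_report.score, len(parsed.elements), len(parsed.tables), len(parsed.chunks))
+ print(parsed.to_markdown()[:500])
+ ```
+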
59
+ ## Performance baselines (opt-in)
60
+
61
+ A fixture may include a `performance` block with throughput floors:
62
+
63
+ ```json
64
+ {
65
+ "performance": {
66
+ "repeats": 2,
67
+ "max_elapsed_seconds": 2.0,
68
+ "min_pages_per_second": 0.5,
69
+ "always_enforce": false
70
+ }
71
+ }
72
+ ```
73
+
74
+ Keys:
75
+
76
+ - `repeats` (int, default 2): number of warm parses to time. The median
77
+ elapsed is compared against the floor so a single cold-import outlier
78
+ does not flag.
79
+ - `max_elapsed_seconds`: parse must finish under this in median.
80
+ - `min_pages_per_second`: median pages/sec must meet or beat this.
81
+ - `always_enforce` (bool, default false): when true, perf is always checked.
82
+
83
+ Otherwise perf is gated on `ZSGDP_REGRESSION_PERF=1` so slow CI runners
84
+ don't get noisy. Floors should be **catastrophic-regression guards** — set
85
+ them ~50–100x slacker than your local median, not tight perf bars. The
86
+ point is to catch "parsing a tiny markdown doc now takes 30 seconds,"
87
+ not to track 5% perf shifts.
88
+
89
+ To set a baseline for a new fixture: parse it 5 times locally, take the
90
+ median, multiply by ~10–80x for the `max_elapsed_seconds` floor.
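+
+ A minimal sketch of that procedure (synchronous `parse_document` calls, as
+ in `examples/parse_pdf.py`; the fixture path is illustrative):
+
+ ```python
+ import statistics
+ import tempfile
+ import time
+ from pathlib import Path
+
+ from zsgdp import parse_document
+
+ times = []
+ for _ in range(5):
+     with tempfile.TemporaryDirectory() as tmp:
+         started = time.perf_counter()
+         parse_document("tests/regression/fixtures/my_fixture.input.md", Path(tmp) / "out")
+         times.append(time.perf_counter() - started)
+ # ~10-80x slack over the warm median; pick the multiplier per fixture.
+ print(f"max_elapsed_seconds floor: {statistics.median(times) * 50:.2f}")
+ ```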
91
+
92
+ ## When a regression fires
93
+
94
+ The failure message points at the specific tolerance that broke. Don't blindly
95
+ loosen the tolerance — investigate whether the regression is real first
96
+ (parser-version bump, repair-loop drift, chunk planner change). If the new
97
+ behavior is intentional and better, regenerate the snapshot.
tests/regression/__init__.py ADDED
File without changes
tests/regression/fixtures/markdown_basic.expected.json ADDED
@@ -0,0 +1,31 @@
1
+ {
2
+ "name": "markdown_basic",
3
+ "tolerances": {
4
+ "quality_score_min": 0.9,
5
+ "blocking_failures": false,
6
+ "element_count_range": [4, 8],
7
+ "table_count": 1,
8
+ "figure_count": 0,
9
+ "chunk_count_min": 4,
10
+ "must_contain_markdown": [
11
+ "# Quarterly Report",
12
+ "Apples grow on trees in the orchard",
13
+ "| Region | Q1 | Q2 |",
14
+ "Submarines navigate beneath the ocean"
15
+ ],
16
+ "must_not_contain_markdown": ["TODO", "FIXME"],
17
+ "must_contain_quality_metrics": [
18
+ "document_text_coverage",
19
+ "parser_disagreement_rate",
20
+ "repair_resolution_rate"
21
+ ],
22
+ "parser_disagreement_rate_max": 0.5,
23
+ "repair_resolution_rate_min": 0.5
24
+ },
25
+ "performance": {
26
+ "_comment": "Floors are catastrophic-regression guards, not tight perf bars. Median of 2 warm runs (cold-import outlier dropped) was ~6ms locally; the 2.0s floor leaves roughly 300x headroom to absorb slow CI. Enable with ZSGDP_REGRESSION_PERF=1 or set always_enforce: true.",
27
+ "repeats": 2,
28
+ "max_elapsed_seconds": 2.0,
29
+ "min_pages_per_second": 0.5
30
+ }
31
+ }
tests/regression/fixtures/markdown_basic.input.md ADDED
@@ -0,0 +1,14 @@
1
+ # Quarterly Report
2
+
3
+ Apples grow on trees in the orchard during the autumn harvest season.
4
+
5
+ ## Revenue
6
+
7
+ | Region | Q1 | Q2 |
8
+ | --- | --- | --- |
9
+ | North America | 10 | 12 |
10
+ | Europe | 8 | 9 |
11
+
12
+ ## Outlook
13
+
14
+ Submarines navigate beneath the ocean using sonar pulses across waters.
tests/regression/test_regression.py ADDED
@@ -0,0 +1,255 @@
1
+ """Snapshot regression tests against fixtures in this directory.
2
+
3
+ Discovery: every <name>.expected.json under fixtures/ pairs with a sibling
4
+ <name>.input.<ext>. The runner parses the input, then asserts each tolerance
5
+ in the expected file. Tolerance keys are documented in fixtures/README.md.
6
+
7
+ Performance baselines are opt-in per fixture via a `performance` block in
8
+ the expected file. They run only when ZSGDP_REGRESSION_PERF=1 (or when the
9
+ performance block has `always_enforce: true`) so a slow CI runner does not
10
+ fail on transient noise. When enabled, the parse is run twice and the
11
+ median elapsed time is compared against the floor.
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import json
17
+ import os
18
+ import statistics
19
+ import tempfile
20
+ import time
21
+ import unittest
22
+ import unittest.mock
23
+ from pathlib import Path
24
+ from typing import Any
25
+
26
+ from zsgdp.pipeline import parse_document
27
+
28
+ FIXTURE_DIR = Path(__file__).parent / "fixtures"
29
+
30
+
31
+ def _discover_fixtures() -> list[tuple[str, Path, Path]]:
32
+ pairs: list[tuple[str, Path, Path]] = []
33
+ if not FIXTURE_DIR.exists():
34
+ return pairs
35
+ for expected in sorted(FIXTURE_DIR.glob("*.expected.json")):
36
+ name = expected.name[: -len(".expected.json")]
37
+ candidates = sorted(FIXTURE_DIR.glob(f"{name}.input.*"))
38
+ if not candidates:
39
+ continue
40
+ pairs.append((name, candidates[0], expected))
41
+ return pairs
42
+
43
+
44
+ def _check_int_or_range(actual: int, exact: Any, range_value: Any, label: str) -> str | None:
45
+ if exact is not None and int(exact) != actual:
46
+ return f"{label}: expected {exact}, got {actual}"
47
+ if isinstance(range_value, (list, tuple)) and len(range_value) == 2:
48
+ lo, hi = int(range_value[0]), int(range_value[1])
49
+ if not (lo <= actual <= hi):
50
+ return f"{label}: expected in [{lo}, {hi}], got {actual}"
51
+ return None
52
+
53
+
54
+ def _evaluate(parsed, tolerances: dict[str, Any]) -> list[str]:
55
+ failures: list[str] = []
56
+ score = float(parsed.quality_report.score)
57
+ if "quality_score_min" in tolerances and score < float(tolerances["quality_score_min"]):
58
+ failures.append(f"quality_score: {score:.3f} < {tolerances['quality_score_min']}")
59
+ if "quality_score_max" in tolerances and score > float(tolerances["quality_score_max"]):
60
+ failures.append(f"quality_score: {score:.3f} > {tolerances['quality_score_max']}")
61
+
62
+ for label, count, exact_key, range_key in (
63
+ ("element_count", len(parsed.elements), "element_count", "element_count_range"),
64
+ ("table_count", len(parsed.tables), "table_count", "table_count_range"),
65
+ ("figure_count", len(parsed.figures), "figure_count", "figure_count_range"),
66
+ ):
67
+ message = _check_int_or_range(count, tolerances.get(exact_key), tolerances.get(range_key), label)
68
+ if message:
69
+ failures.append(message)
70
+
71
+ chunk_count = len(parsed.chunks)
72
+ if "chunk_count_min" in tolerances and chunk_count < int(tolerances["chunk_count_min"]):
73
+ failures.append(f"chunk_count: {chunk_count} < {tolerances['chunk_count_min']}")
74
+ if "chunk_count_max" in tolerances and chunk_count > int(tolerances["chunk_count_max"]):
75
+ failures.append(f"chunk_count: {chunk_count} > {tolerances['chunk_count_max']}")
76
+
77
+ if "blocking_failures" in tolerances:
78
+ actual = parsed.quality_report.has_blocking_failures
79
+ expected = bool(tolerances["blocking_failures"])
80
+ if actual != expected:
81
+ failures.append(f"blocking_failures: expected {expected}, got {actual}")
82
+
83
+ md = parsed.to_markdown()
84
+ for needle in tolerances.get("must_contain_markdown", []) or []:
85
+ if str(needle) not in md:
86
+ failures.append(f"must_contain_markdown: {needle!r} not found")
87
+ for needle in tolerances.get("must_not_contain_markdown", []) or []:
88
+ if str(needle) in md:
89
+ failures.append(f"must_not_contain_markdown: {needle!r} present")
90
+
91
+ metrics = parsed.quality_report.metrics
92
+ for key in tolerances.get("must_contain_quality_metrics", []) or []:
93
+ if key not in metrics:
94
+ failures.append(f"must_contain_quality_metrics: {key!r} missing")
95
+
96
+ if "parser_disagreement_rate_max" in tolerances:
97
+ rate = float(metrics.get("parser_disagreement_rate", 0.0))
98
+ if rate > float(tolerances["parser_disagreement_rate_max"]):
99
+ failures.append(
100
+ f"parser_disagreement_rate: {rate:.3f} > {tolerances['parser_disagreement_rate_max']}"
101
+ )
102
+ if "repair_resolution_rate_min" in tolerances:
103
+ rate = float(metrics.get("repair_resolution_rate", 1.0))
104
+ if rate < float(tolerances["repair_resolution_rate_min"]):
105
+ failures.append(
106
+ f"repair_resolution_rate: {rate:.3f} < {tolerances['repair_resolution_rate_min']}"
107
+ )
108
+
109
+ return failures
110
+
111
+
112
+ def _perf_enforcement_enabled(performance: dict[str, Any]) -> bool:
113
+ if performance.get("always_enforce"):
114
+ return True
115
+ return os.environ.get("ZSGDP_REGRESSION_PERF", "").strip().lower() in {"1", "true", "yes"}
116
+
117
+
118
+ def _measure_parse(input_path: Path, *, config_path: Path | None, selected_parsers, repeats: int) -> tuple[Any, list[float]]:
119
+ """Parse the input N times, returning (last_parsed, list_of_elapsed_seconds).
120
+
121
+ Uses a fresh temp output directory for each run so disk caching effects
122
+ are roughly equal across runs. The last parsed document is returned for
123
+ tolerance evaluation; per-run elapsed times feed the perf assertion.
124
+ """
125
+
126
+ elapsed: list[float] = []
127
+ parsed = None
128
+ for _ in range(max(1, repeats)):
129
+ with tempfile.TemporaryDirectory() as tmp:
130
+ started = time.perf_counter()
131
+ parsed = parse_document(
132
+ input_path,
133
+ Path(tmp) / "out",
134
+ config_path=config_path if config_path else None,
135
+ selected_parsers=selected_parsers,
136
+ )
137
+ elapsed.append(time.perf_counter() - started)
138
+ return parsed, elapsed
139
+
140
+
141
+ def _evaluate_performance(parsed, performance: dict[str, Any], elapsed_seconds: list[float]) -> list[str]:
142
+ failures: list[str] = []
143
+ if not elapsed_seconds:
144
+ return failures
145
+
146
+ median_elapsed = statistics.median(elapsed_seconds)
147
+ page_count = max(len(parsed.pages), 1)
148
+ median_pages_per_second = page_count / median_elapsed if median_elapsed > 0 else float("inf")
149
+
150
+ max_elapsed = performance.get("max_elapsed_seconds")
151
+ if max_elapsed is not None and median_elapsed > float(max_elapsed):
152
+ failures.append(
153
+ f"performance.max_elapsed_seconds: median {median_elapsed:.2f}s > {max_elapsed}s "
154
+ f"(runs={len(elapsed_seconds)})"
155
+ )
156
+
157
+ min_pps = performance.get("min_pages_per_second")
158
+ if min_pps is not None and median_pages_per_second < float(min_pps):
159
+ failures.append(
160
+ f"performance.min_pages_per_second: median {median_pages_per_second:.2f} < {min_pps} "
161
+ f"(runs={len(elapsed_seconds)})"
162
+ )
163
+
164
+ return failures
165
+
166
+
167
+ class RegressionFixturesTest(unittest.TestCase):
168
+ def test_regression_fixtures_match_snapshots(self):
169
+ fixtures = _discover_fixtures()
170
+ if not fixtures:
171
+ self.skipTest("No regression fixtures present.")
172
+
173
+ all_failures: list[str] = []
174
+ for name, input_path, expected_path in fixtures:
175
+ with self.subTest(fixture=name):
176
+ expected = json.loads(expected_path.read_text(encoding="utf-8"))
177
+ tolerances = expected.get("tolerances") or {}
178
+ performance = expected.get("performance") or {}
179
+ config_rel = expected.get("config")
180
+ config_path = Path(config_rel) if config_rel else None
181
+ if config_path and not config_path.is_absolute():
182
+ config_path = Path(__file__).resolve().parents[2] / config_path
183
+ selected_parsers = expected.get("selected_parsers")
184
+
185
+ perf_enabled = bool(performance) and _perf_enforcement_enabled(performance)
186
+ repeats = int(performance.get("repeats", 2)) if perf_enabled else 1
187
+
188
+ parsed, elapsed = _measure_parse(
189
+ input_path,
190
+ config_path=config_path,
191
+ selected_parsers=selected_parsers,
192
+ repeats=repeats,
193
+ )
194
+
195
+ failures = _evaluate(parsed, tolerances)
196
+ if perf_enabled:
197
+ failures.extend(_evaluate_performance(parsed, performance, elapsed))
198
+ if failures:
199
+ all_failures.append(f"[{name}] " + "; ".join(failures))
200
+
201
+ if all_failures:
202
+ self.fail("\n".join(all_failures))
203
+
204
+
205
+ class PerformanceEvaluatorTests(unittest.TestCase):
206
+ """Unit tests for the perf-evaluation helpers, separate from fixture discovery."""
207
+
208
+ def test_max_elapsed_floor_fires_when_too_slow(self):
209
+ from types import SimpleNamespace
210
+
211
+ parsed = SimpleNamespace(pages=[{"page_num": 1}])
212
+ failures = _evaluate_performance(parsed, {"max_elapsed_seconds": 0.1}, [0.5, 0.5])
213
+ self.assertEqual(len(failures), 1)
214
+ self.assertIn("max_elapsed_seconds", failures[0])
215
+
216
+ def test_min_pages_per_second_fires_when_too_slow(self):
217
+ from types import SimpleNamespace
218
+
219
+ parsed = SimpleNamespace(pages=[{"page_num": 1}])
220
+ # 1 page in 10s => 0.1 pps, floor 1.0 => fail.
221
+ failures = _evaluate_performance(parsed, {"min_pages_per_second": 1.0}, [10.0, 10.0])
222
+ self.assertEqual(len(failures), 1)
223
+ self.assertIn("min_pages_per_second", failures[0])
224
+
225
+ def test_passing_floors_yield_no_failures(self):
226
+ from types import SimpleNamespace
227
+
228
+ parsed = SimpleNamespace(pages=[{"page_num": 1}, {"page_num": 2}])
229
+ # 2 pages in 0.5s => 4 pps; floor 1.0 pps and max 2s.
230
+ failures = _evaluate_performance(
231
+ parsed,
232
+ {"max_elapsed_seconds": 2.0, "min_pages_per_second": 1.0},
233
+ [0.5, 0.5, 0.5],
234
+ )
235
+ self.assertEqual(failures, [])
236
+
237
+ def test_median_strips_cold_outlier(self):
238
+ from types import SimpleNamespace
239
+
240
+ parsed = SimpleNamespace(pages=[{"page_num": 1}])
241
+ # First run cold (5s), next two warm (0.1s). Median = 0.1s; floor 1s passes.
242
+ failures = _evaluate_performance(parsed, {"max_elapsed_seconds": 1.0}, [5.0, 0.1, 0.1])
243
+ self.assertEqual(failures, [])
244
+
245
+ def test_perf_enforcement_gating(self):
246
+ with unittest.mock.patch.dict("os.environ", {"ZSGDP_REGRESSION_PERF": "0"}, clear=False):
247
+ self.assertFalse(_perf_enforcement_enabled({"max_elapsed_seconds": 1.0}))
248
+ self.assertTrue(_perf_enforcement_enabled({"always_enforce": True}))
249
+
250
+ with unittest.mock.patch.dict("os.environ", {"ZSGDP_REGRESSION_PERF": "1"}, clear=False):
251
+ self.assertTrue(_perf_enforcement_enabled({"max_elapsed_seconds": 1.0}))
252
+
253
+
254
+ if __name__ == "__main__":
255
+ unittest.main()
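The fixtures these tests discover live as sibling input/expected pairs, as _discover_fixtures() implies. A minimal expected-file sketch follows; the key names are exactly the ones the test above reads, every key is optional, and the values are illustrative only:

    {
      "config": "configs/default.yaml",
      "selected_parsers": ["text"],
      "tolerances": {
        "must_contain_quality_metrics": ["parser_disagreement_rate"],
        "parser_disagreement_rate_max": 0.25,
        "repair_resolution_rate_min": 0.75
      },
      "performance": {
        "repeats": 3,
        "max_elapsed_seconds": 5.0,
        "min_pages_per_second": 0.5,
        "always_enforce": false
      }
    }

Note the gating: the performance floors stay dormant unless always_enforce is true in the fixture or ZSGDP_REGRESSION_PERF=1 is exported, so ordinary test runs are not timing-sensitive.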
tests/test_ablation_runner.py ADDED
@@ -0,0 +1,133 @@
+ """Tests for parser-contribution metrics and the ablation runner."""
+
+ from __future__ import annotations
+
+ import json
+ import tempfile
+ import unittest
+ from pathlib import Path
+
+ from zsgdp.benchmarks.ablation_runner import ABLATION_METRIC_KEYS, run_parser_ablations
+ from zsgdp.benchmarks.parser_quality import run_parser_benchmark
+
+
+ class TestParserContribution(unittest.TestCase):
+     def test_contribution_counts_appear_in_summary(self):
+         with tempfile.TemporaryDirectory() as tmp:
+             tmp = Path(tmp)
+             src = tmp / "in"
+             src.mkdir()
+             (src / "doc.md").write_text("# Doc\n\nA paragraph.\n", encoding="utf-8")
+
+             summary = run_parser_benchmark(src, tmp / "out", dataset_name="custom_folder")
+
+             doc = summary["documents"][0]
+             self.assertIn("parser_contribution_counts", doc)
+             self.assertIn("parser_contribution_fractions", doc)
+             self.assertGreater(sum(doc["parser_contribution_counts"].values()), 0)
+             # The sum of fractions should be ~1.0 across parsers.
+             total_fraction = sum(doc["parser_contribution_fractions"].values())
+             self.assertAlmostEqual(total_fraction, 1.0, places=6)
+
+             top_summary = summary["parser_contribution_summary"]
+             self.assertGreater(top_summary["total"], 0)
+             self.assertEqual(set(top_summary["counts"]), set(top_summary["fractions"]))
+
+     def test_text_parser_dominates_markdown_doc(self):
+         with tempfile.TemporaryDirectory() as tmp:
+             tmp = Path(tmp)
+             src = tmp / "in"
+             src.mkdir()
+             (src / "doc.md").write_text("# Doc\n\nPara one.\n\nPara two.\n", encoding="utf-8")
+
+             summary = run_parser_benchmark(src, tmp / "out", dataset_name="custom_folder")
+
+             top_counts = summary["parser_contribution_summary"]["counts"]
+             self.assertIn("text", top_counts)
+             text_count = top_counts["text"]
+             other_count = sum(value for parser, value in top_counts.items() if parser != "text")
+             self.assertGreaterEqual(text_count, other_count)
+
+
+ class TestRunParserAblations(unittest.TestCase):
+     def test_two_arms_plus_merged(self):
+         with tempfile.TemporaryDirectory() as tmp:
+             tmp = Path(tmp)
+             src = tmp / "in"
+             src.mkdir()
+             (src / "doc.md").write_text("# Doc\n\nPara one.\n\nPara two.\n", encoding="utf-8")
+             out = tmp / "out"
+
+             comparison = run_parser_ablations(
+                 src,
+                 out,
+                 parsers=["text", "pymupdf"],
+                 dataset_name="custom_folder",
+             )
+
+             self.assertEqual(comparison["arm_count"], 3)
+             arms = sorted(row["arm"] for row in comparison["rows"])
+             self.assertEqual(arms, ["merged", "pymupdf", "text"])
+             self.assertTrue((out / "arm_text").exists())
+             self.assertTrue((out / "arm_pymupdf").exists())
+             self.assertTrue((out / "arm_merged").exists())
+             self.assertTrue((out / "ablation_comparison.csv").exists())
+             self.assertTrue((out / "ablation_summary.json").exists())
+
+             # Each arm record carries the canonical metric keys (subset of those present).
+             for row in comparison["rows"]:
+                 self.assertIn("mean_quality_score", row)
+
+     def test_no_merged_when_disabled(self):
+         with tempfile.TemporaryDirectory() as tmp:
+             tmp = Path(tmp)
+             src = tmp / "in"
+             src.mkdir()
+             (src / "doc.md").write_text("# Doc\n\nPara.\n", encoding="utf-8")
+
+             comparison = run_parser_ablations(
+                 src,
+                 tmp / "out",
+                 parsers=["text", "pymupdf"],
+                 dataset_name="custom_folder",
+                 include_merged=False,
+             )
+             self.assertEqual(comparison["arm_count"], 2)
+             self.assertNotIn("merged", {row["arm"] for row in comparison["rows"]})
+
+     def test_single_parser_ablation_skips_merged_arm(self):
+         with tempfile.TemporaryDirectory() as tmp:
+             tmp = Path(tmp)
+             src = tmp / "in"
+             src.mkdir()
+             (src / "doc.md").write_text("# Doc\n\nPara.\n", encoding="utf-8")
+
+             comparison = run_parser_ablations(
+                 src,
+                 tmp / "out",
+                 parsers=["text"],
+                 dataset_name="custom_folder",
+             )
+             # Single parser + include_merged defaults true, but len(parsers) == 1
+             # so merged would be redundant and is skipped.
+             self.assertEqual(comparison["arm_count"], 1)
+             self.assertEqual(comparison["rows"][0]["arm"], "text")
+
+     def test_empty_parsers_raises(self):
+         with self.assertRaises(ValueError):
+             run_parser_ablations(".", "./out", parsers=[])
+
+     def test_metric_keys_constant_matches_summary_shape(self):
+         with tempfile.TemporaryDirectory() as tmp:
+             tmp = Path(tmp)
+             src = tmp / "in"
+             src.mkdir()
+             (src / "doc.md").write_text("# Doc\n\nPara.\n", encoding="utf-8")
+
+             summary = run_parser_benchmark(src, tmp / "out", dataset_name="custom_folder")
+             for key in ABLATION_METRIC_KEYS:
+                 self.assertIn(key, summary, f"benchmark summary missing key {key}")
+
+
+ if __name__ == "__main__":
+     unittest.main()
tests/test_app.py ADDED
@@ -0,0 +1,141 @@
+ import tempfile
+ import unittest
+ from pathlib import Path
+ from unittest.mock import patch
+
+ try:
+     import app as space_app
+ except RuntimeError as exc:
+     space_app = None
+     APP_IMPORT_ERROR = str(exc)
+ else:
+     APP_IMPORT_ERROR = ""
+
+
+ class _UploadedFile:
+     def __init__(self, name: str):
+         self.name = name
+
+
+ class AppTests(unittest.TestCase):
+     def test_parse_uploaded_document_returns_artifact_validation(self):
+         if space_app is None:
+             self.skipTest(APP_IMPORT_ERROR)
+
+         with tempfile.TemporaryDirectory() as tmp:
+             input_path = Path(tmp) / "sample.md"
+             input_path.write_text("# Report\n\nHello from the Space UI.\n", encoding="utf-8")
+
+             outputs = space_app.parse_uploaded_document(_UploadedFile(str(input_path)), "Default lightweight")
+
+             self.assertEqual(len(outputs), 11)
+             summary = outputs[1]
+             artifact_validation = outputs[8]
+             archive_path = outputs[9]
+             individual_files = outputs[10]
+             self.assertTrue(summary["artifact_manifest_valid"])
+             self.assertTrue(artifact_validation["valid"])
+             self.assertTrue(Path(archive_path).exists())
+             # Per-artifact downloads.
+             self.assertIsInstance(individual_files, list)
+             self.assertGreater(len(individual_files), 0)
+             names = [Path(p).name for p in individual_files]
+             # Core artifacts every parse should produce.
+             for required in ("parsed_document.json", "document.md", "chunks.jsonl", "artifact_manifest.json"):
+                 self.assertIn(required, names)
+             # Each path actually exists on disk so Gradio can serve it.
+             for path in individual_files:
+                 self.assertTrue(Path(path).exists(), f"missing: {path}")
+             # The archive zip is a separate artifact and must NOT appear in the
+             # per-artifact list (zip is the bundled-everything view).
+             self.assertNotIn(Path(archive_path).name, names)
+             # Summary records the per-artifact count.
+             self.assertEqual(summary["individual_artifact_count"], len(individual_files))
+
+
+ class UploadGuardTests(unittest.TestCase):
+     def test_oversized_upload_rejected_with_clear_message(self):
+         if space_app is None:
+             self.skipTest(APP_IMPORT_ERROR)
+
+         with tempfile.TemporaryDirectory() as tmp:
+             input_path = Path(tmp) / "huge.md"
+             input_path.write_text("# Big\n\n" + "x" * 4096, encoding="utf-8")
+
+             with patch.object(space_app, "MAX_UPLOAD_BYTES", 1024):
+                 outputs = space_app.parse_uploaded_document(
+                     _UploadedFile(str(input_path)), "Default lightweight"
+                 )
+
+             summary = outputs[1]
+             self.assertTrue(summary.get("rejected"))
+             self.assertIn("MB", summary["error"])
+
+     def test_high_page_count_rejected(self):
+         if space_app is None:
+             self.skipTest(APP_IMPORT_ERROR)
+
+         with tempfile.TemporaryDirectory() as tmp:
+             input_path = Path(tmp) / "doc.md"
+             input_path.write_text("# Doc\n\nSomething small.\n", encoding="utf-8")
+
+             class _FakeProfile:
+                 page_count = 1000
+
+             with patch.object(space_app, "MAX_PAGE_COUNT", 50), patch.object(
+                 space_app, "profile_document", return_value=_FakeProfile()
+             ):
+                 outputs = space_app.parse_uploaded_document(
+                     _UploadedFile(str(input_path)), "Default lightweight"
+                 )
+
+             summary = outputs[1]
+             self.assertTrue(summary.get("rejected"))
+             self.assertIn("pages", summary["error"])
+
+     def test_missing_upload_path_rejected(self):
+         if space_app is None:
+             self.skipTest(APP_IMPORT_ERROR)
+
+         outputs = space_app.parse_uploaded_document(
+             _UploadedFile("/tmp/zsgdp-does-not-exist.md"), "Default lightweight"
+         )
+         summary = outputs[1]
+         self.assertTrue(summary.get("rejected"))
+         self.assertIn("missing", summary["error"].lower())
+
+     def test_error_paths_return_full_tuple_width(self):
+         # Drift guard: every return path (success + error) must yield 11 outputs
+         # so the Gradio click handler doesn't error on shape mismatch.
+         if space_app is None:
+             self.skipTest(APP_IMPORT_ERROR)
+
+         # No upload at all.
+         outputs = space_app.parse_uploaded_document(None, "Default lightweight")
+         self.assertEqual(len(outputs), 11)
+         self.assertEqual(outputs[10], [])
+
+         # Missing-file rejection.
+         outputs = space_app.parse_uploaded_document(
+             _UploadedFile("/tmp/zsgdp-does-not-exist-xyz.md"), "Default lightweight"
+         )
+         self.assertEqual(len(outputs), 11)
+         self.assertEqual(outputs[10], [])
+
+     def test_normal_upload_passes_guards(self):
+         if space_app is None:
+             self.skipTest(APP_IMPORT_ERROR)
+
+         with tempfile.TemporaryDirectory() as tmp:
+             input_path = Path(tmp) / "ok.md"
+             input_path.write_text("# OK\n\nA normal document.\n", encoding="utf-8")
+             outputs = space_app.parse_uploaded_document(
+                 _UploadedFile(str(input_path)), "Default lightweight"
+             )
+
+         summary = outputs[1]
+         self.assertNotIn("rejected", summary)
+
+
+ if __name__ == "__main__":
+     unittest.main()
tests/test_artifacts.py ADDED
@@ -0,0 +1,82 @@
+ import json
+ import tempfile
+ import unittest
+ from pathlib import Path
+
+ from zsgdp.artifacts import MANIFEST_SCHEMA_VERSION, validate_artifact_manifest
+ from zsgdp.cli import main
+ from zsgdp.pipeline import parse_document
+ from zsgdp.schema import SCHEMA_VERSION
+
+
+ class ArtifactManifestTests(unittest.TestCase):
+     def test_parse_writes_valid_artifact_manifest(self):
+         with tempfile.TemporaryDirectory() as tmp:
+             tmp_path = Path(tmp)
+             input_path = tmp_path / "sample.md"
+             output_dir = tmp_path / "out"
+             input_path.write_text("# Report\n\nHello world.\n", encoding="utf-8")
+
+             parsed = parse_document(input_path, output_dir)
+             manifest = json.loads((output_dir / "artifact_manifest.json").read_text(encoding="utf-8"))
+             validation = validate_artifact_manifest(output_dir)
+
+             self.assertEqual(manifest["doc_id"], parsed.doc_id)
+             self.assertEqual(manifest["counts"]["chunks"], len(parsed.chunks))
+             self.assertTrue(any(record["path"] == "parsed_document.json" for record in manifest["files"]))
+             self.assertTrue(validation["valid"])
+             self.assertEqual(validation["checked_count"], manifest["artifact_count"])
+
+     def test_manifest_records_schema_versions(self):
+         with tempfile.TemporaryDirectory() as tmp:
+             tmp_path = Path(tmp)
+             input_path = tmp_path / "sample.md"
+             output_dir = tmp_path / "out"
+             input_path.write_text("# Report\n\nHello.\n", encoding="utf-8")
+
+             parsed = parse_document(input_path, output_dir)
+             manifest = json.loads((output_dir / "artifact_manifest.json").read_text(encoding="utf-8"))
+
+             # Manifest format version is its own integer; parsed-document
+             # schema version is a string echoed from the dataclass.
+             self.assertEqual(manifest["schema_version"], MANIFEST_SCHEMA_VERSION)
+             self.assertEqual(manifest["parsed_document_schema_version"], SCHEMA_VERSION)
+             self.assertEqual(parsed.schema_version, SCHEMA_VERSION)
+
+             # Validation echoes both versions so callers can gate on them.
+             validation = validate_artifact_manifest(output_dir)
+             self.assertEqual(validation["manifest_schema_version"], MANIFEST_SCHEMA_VERSION)
+             self.assertEqual(validation["parsed_document_schema_version"], SCHEMA_VERSION)
+
+     def test_validate_artifact_manifest_detects_checksum_mismatch(self):
+         with tempfile.TemporaryDirectory() as tmp:
+             tmp_path = Path(tmp)
+             input_path = tmp_path / "sample.md"
+             output_dir = tmp_path / "out"
+             input_path.write_text("# Report\n\nHello world.\n", encoding="utf-8")
+             parse_document(input_path, output_dir)
+
+             (output_dir / "document.md").write_text("tampered\n", encoding="utf-8")
+             validation = validate_artifact_manifest(output_dir)
+
+             self.assertFalse(validation["valid"])
+             self.assertTrue(any("SHA-256 mismatch: document.md" == error for error in validation["errors"]))
+
+     def test_validate_artifacts_cli_writes_report(self):
+         with tempfile.TemporaryDirectory() as tmp:
+             tmp_path = Path(tmp)
+             input_path = tmp_path / "sample.md"
+             output_dir = tmp_path / "out"
+             report_path = tmp_path / "validation.json"
+             input_path.write_text("# Report\n\nHello world.\n", encoding="utf-8")
+             parse_document(input_path, output_dir)
+
+             code = main(["validate-artifacts", "--parsed", str(output_dir), "--output", str(report_path)])
+
+             self.assertEqual(code, 0)
+             self.assertTrue(report_path.exists())
+             self.assertTrue(json.loads(report_path.read_text(encoding="utf-8"))["valid"])
+
+
+ if __name__ == "__main__":
+     unittest.main()
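Taken together, these assertions pin down the manifest's top-level shape. A sketch of artifact_manifest.json as the tests constrain it; the field set is inferred from the assertions above, the per-file checksum field is implied by the "SHA-256 mismatch" error string rather than shown directly, and real manifests may carry additional fields:

    {
      "doc_id": "…",
      "schema_version": 1,
      "parsed_document_schema_version": "…",
      "artifact_count": 4,
      "counts": {"chunks": 3},
      "files": [
        {"path": "parsed_document.json", "sha256": "…"},
        {"path": "document.md", "sha256": "…"}
      ]
    }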
tests/test_benchmark.py ADDED
@@ -0,0 +1,55 @@
+ import tempfile
+ import unittest
+ from pathlib import Path
+
+ from zsgdp.benchmarks.parser_quality import run_parser_benchmark
+ from zsgdp.cli import main
+
+
+ class BenchmarkTests(unittest.TestCase):
+     def test_run_parser_benchmark_writes_results(self):
+         with tempfile.TemporaryDirectory() as tmp:
+             tmp_path = Path(tmp)
+             docs = tmp_path / "docs"
+             out = tmp_path / "bench"
+             docs.mkdir()
+             (docs / "one.md").write_text("# One\n\nHello world", encoding="utf-8")
+
+             summary = run_parser_benchmark(docs, out)
+
+             self.assertEqual(summary["document_count"], 1)
+             self.assertIn("fixed_token_baseline", summary["documents"][0]["chunk_strategy_counts"])
+             self.assertTrue(summary["chunk_strategy_leaderboard"])
+             self.assertIn("structure_quality", summary)
+             self.assertIn("chunking_quality", summary)
+             self.assertIn("throughput", summary)
+             self.assertIn("ablation_plan", summary)
+             self.assertTrue((out / "results.json").exists())
+             self.assertTrue((out / "leaderboard.csv").exists())
+             self.assertTrue((out / "parser_runs.csv").exists())
+             self.assertTrue((out / "chunk_runs.csv").exists())
+             self.assertTrue((out / "structure_runs.csv").exists())
+             self.assertTrue((out / "chunk_quality.csv").exists())
+             self.assertTrue((out / "throughput_runs.csv").exists())
+             self.assertTrue((out / "ablations.json").exists())
+
+     def test_benchmark_cli(self):
+         with tempfile.TemporaryDirectory() as tmp:
+             tmp_path = Path(tmp)
+             docs = tmp_path / "docs"
+             out = tmp_path / "bench"
+             docs.mkdir()
+             (docs / "one.md").write_text("# One\n\nHello world", encoding="utf-8")
+
+             code = main(["benchmark", "--input", str(docs), "--output", str(out), "--parsers", "text"])
+
+             self.assertEqual(code, 0)
+             self.assertTrue((out / "leaderboard.csv").exists())
+             self.assertTrue((out / "chunk_runs.csv").exists())
+             self.assertTrue((out / "structure_runs.csv").exists())
+             self.assertTrue((out / "chunk_quality.csv").exists())
+             self.assertTrue((out / "throughput_runs.csv").exists())
+
+
+ if __name__ == "__main__":
+     unittest.main()
tests/test_chunking.py ADDED
@@ -0,0 +1,286 @@
+ import unittest
+
+ from zsgdp.chunking import build_agentic_chunks
+ from zsgdp.config import load_config
+ from zsgdp.schema import DocumentProfile, Element, FigureObject, PageProfile, ParsedDocument, QualityReport, TableObject
+ from zsgdp.verify import verify_chunks
+
+
+ class ChunkingTests(unittest.TestCase):
+     def test_agentic_chunking_builds_parent_child_chunks(self):
+         profile = DocumentProfile(
+             doc_id="d1",
+             source_path="sample.md",
+             file_type="markdown",
+             page_count=1,
+             extension=".md",
+             pages=[PageProfile(page_num=1, digital_text_chars=120, digital_text_quality=1.0)],
+         )
+         parsed = ParsedDocument(
+             doc_id="d1",
+             source_path="sample.md",
+             file_type="markdown",
+             quality_report=QualityReport(score=0.95),
+         )
+         parsed.elements.extend(
+             [
+                 Element("e1", "d1", 1, "title", markdown="# Report", reading_order=1, source_parser="text"),
+                 Element("e2", "d1", 1, "paragraph", text=" ".join(["alpha"] * 80), reading_order=2, source_parser="text"),
+             ]
+         )
+
+         chunks = build_agentic_chunks(parsed, profile, load_config())
+
+         self.assertTrue(any(chunk.content_type == "parent" for chunk in chunks))
+         self.assertTrue(any(chunk.parent_chunk_id for chunk in chunks))
+         self.assertEqual(parsed.provenance["chunking"]["plan"]["target_tokens"], 512)
+
+     def test_chunk_readiness_adds_metrics(self):
+         profile = DocumentProfile(
+             doc_id="d1",
+             source_path="sample.md",
+             file_type="markdown",
+             page_count=1,
+             extension=".md",
+             pages=[PageProfile(page_num=1, digital_text_chars=120, digital_text_quality=1.0)],
+         )
+         parsed = ParsedDocument(
+             doc_id="d1",
+             source_path="sample.md",
+             file_type="markdown",
+             quality_report=QualityReport(score=0.95),
+         )
+         parsed.elements.append(
+             Element("e1", "d1", 1, "paragraph", text=" ".join(["alpha"] * 80), reading_order=1, source_parser="text")
+         )
+         parsed.chunks = build_agentic_chunks(parsed, profile, load_config())
+
+         report = verify_chunks(parsed, load_config())
+
+         self.assertEqual(report.metrics["chunk_count"], len(parsed.chunks))
+         self.assertIn("fixed_token_baseline", report.metrics["chunk_strategy_counts"])
+         self.assertIn("recursive_structure", report.metrics["chunk_strategy_counts"])
+
+     def test_fixed_token_baseline_chunks_are_emitted_with_provenance(self):
+         profile = DocumentProfile(
+             doc_id="d1",
+             source_path="sample.md",
+             file_type="markdown",
+             page_count=2,
+             extension=".md",
+             pages=[
+                 PageProfile(page_num=1, digital_text_chars=120, digital_text_quality=1.0),
+                 PageProfile(page_num=2, digital_text_chars=120, digital_text_quality=1.0),
+             ],
+         )
+         parsed = ParsedDocument(
+             doc_id="d1",
+             source_path="sample.md",
+             file_type="markdown",
+             quality_report=QualityReport(score=0.95),
+         )
+         parsed.elements.extend(
+             [
+                 Element("e1", "d1", 1, "paragraph", text=" ".join(["alpha"] * 18), reading_order=1, source_parser="text"),
+                 Element("e2", "d1", 2, "paragraph", text=" ".join(["beta"] * 18), reading_order=1, source_parser="text"),
+             ]
+         )
+         config = load_config(overrides={"chunking": {"target_tokens": 10, "overlap_ratio": 0.2}})
+
+         chunks = build_agentic_chunks(parsed, profile, config)
+         baseline_chunks = [chunk for chunk in chunks if chunk.strategy == "fixed_token_baseline"]
+
+         self.assertGreaterEqual(len(baseline_chunks), 4)
+         self.assertEqual(baseline_chunks[0].element_ids, ["e1"])
+         self.assertEqual(baseline_chunks[-1].page_end, 2)
+         self.assertEqual(parsed.provenance["chunking"]["fixed_token_baseline_count"], len(baseline_chunks))
+
+     def test_figure_without_caption_still_gets_visual_chunk(self):
+         profile = DocumentProfile(
+             doc_id="d1",
+             source_path="sample.pdf",
+             file_type="pdf",
+             page_count=1,
+             extension=".pdf",
+             pages=[PageProfile(page_num=1, digital_text_chars=20, digital_text_quality=1.0)],
+         )
+         parsed = ParsedDocument(
+             doc_id="d1",
+             source_path="sample.pdf",
+             file_type="pdf",
+             quality_report=QualityReport(score=0.90),
+         )
+         parsed.elements.append(Element("e1", "d1", 1, "paragraph", text="hello world", reading_order=1, source_parser="pymupdf"))
+         parsed.figures.append(
+             FigureObject(
+                 figure_id="f1",
+                 page_num=1,
+                 image_path="/tmp/figure.png",
+                 confidence=0.5,
+                 source_parser="pymupdf",
+             )
+         )
+
+         parsed.chunks = build_agentic_chunks(parsed, profile, load_config())
+         report = verify_chunks(parsed, load_config())
+
+         self.assertTrue(any(chunk.figure_ids == ["f1"] for chunk in parsed.chunks))
+         self.assertEqual(report.metrics["figure_chunk_coverage"], 1.0)
+
+     def test_table_chunk_keeps_multimodal_metadata(self):
+         profile = DocumentProfile(
+             doc_id="d1",
+             source_path="sample.pdf",
+             file_type="pdf",
+             page_count=1,
+             extension=".pdf",
+             pages=[PageProfile(page_num=1, digital_text_chars=20, digital_text_quality=1.0)],
+         )
+         parsed = ParsedDocument(
+             doc_id="d1",
+             source_path="sample.pdf",
+             file_type="pdf",
+             quality_report=QualityReport(score=0.90),
+         )
+         parsed.elements.append(Element("e1", "d1", 1, "paragraph", text="hello world", reading_order=1, source_parser="pymupdf"))
+         parsed.tables.append(
+             TableObject(
+                 table_id="t1",
+                 page_nums=[1],
+                 bbox=[(1.0, 2.0, 3.0, 4.0)],
+                 markdown="| A | B |\n| --- | --- |\n| 1 | 2 |",
+                 natural_language_rendering="Table with columns A, B. Rows: 1: B=2.",
+                 confidence=0.82,
+                 source_parser="pymupdf",
+                 provenance={"crop_path": "/tmp/table.png", "source_parsers": ["pymupdf", "docling"]},
+             )
+         )
+
+         parsed.chunks = build_agentic_chunks(parsed, profile, load_config())
+         table_chunk = next(chunk for chunk in parsed.chunks if chunk.strategy == "table_object")
+
+         self.assertEqual(table_chunk.text, "Table with columns A, B. Rows: 1: B=2.")
+         self.assertEqual(table_chunk.metadata["markdown"], "| A | B |\n| --- | --- |\n| 1 | 2 |")
+         self.assertEqual(table_chunk.metadata["bbox"], [(1.0, 2.0, 3.0, 4.0)])
+         self.assertEqual(table_chunk.metadata["crop_path"], "/tmp/table.png")
+         self.assertEqual(table_chunk.metadata["source_parsers"], ["pymupdf", "docling"])
+
+     def test_vision_guided_chunking_exports_visual_regions(self):
+         profile = DocumentProfile(
+             doc_id="d1",
+             source_path="sample.pdf",
+             file_type="pdf",
+             page_count=1,
+             extension=".pdf",
+             pages=[PageProfile(page_num=1, digital_text_chars=20, digital_text_quality=1.0)],
+         )
+         parsed = ParsedDocument(
+             doc_id="d1",
+             source_path="sample.pdf",
+             file_type="pdf",
+             quality_report=QualityReport(score=0.90),
+         )
+         parsed.elements.append(Element("e1", "d1", 1, "paragraph", text="hello world", reading_order=1, source_parser="pymupdf"))
+         parsed.tables.append(TableObject(table_id="t1", page_nums=[1], bbox=[(1.0, 2.0, 3.0, 4.0)], markdown="| A | B |\n| --- | --- |\n| 1 | 2 |"))
+         parsed.figures.append(FigureObject(figure_id="f1", page_num=1, bbox=(5.0, 6.0, 7.0, 8.0), source_parser="pymupdf"))
+         config = load_config(overrides={"chunking": {"vision_guided": True}})
+
+         parsed.chunks = build_agentic_chunks(parsed, profile, config)
+
+         visual_chunks = [chunk for chunk in parsed.chunks if chunk.content_type in {"table", "figure"}]
+         self.assertTrue(all(chunk.requires_visual_context for chunk in visual_chunks))
+         self.assertEqual(len(parsed.provenance["chunking"]["vision_regions"]), 2)
+         self.assertEqual(parsed.provenance["chunking"]["vision_regions"][0]["region_id"], "t1")
+
+     def test_advanced_chunking_flags_emit_strategy_chunks(self):
+         profile = DocumentProfile(
+             doc_id="d1",
+             source_path="sample.pdf",
+             file_type="pdf",
+             page_count=2,
+             extension=".pdf",
+             pages=[
+                 PageProfile(page_num=1, digital_text_chars=200, digital_text_quality=1.0),
+                 PageProfile(page_num=2, digital_text_chars=200, digital_text_quality=1.0),
+             ],
+         )
+         parsed = ParsedDocument(
+             doc_id="d1",
+             source_path="sample.pdf",
+             file_type="pdf",
+             quality_report=QualityReport(score=0.92),
+         )
+         parsed.elements.extend(
+             [
+                 Element("e1", "d1", 1, "heading", markdown="## Revenue", reading_order=1, source_parser="pymupdf"),
+                 Element(
+                     "e2",
+                     "d1",
+                     1,
+                     "paragraph",
+                     text="Revenue increased by 12 percent in Q1. Gross margin improved due to pricing.",
+                     reading_order=2,
+                     source_parser="pymupdf",
+                 ),
+                 Element("e3", "d1", 2, "heading", markdown="## Safety", reading_order=1, source_parser="pymupdf"),
+                 Element(
+                     "e4",
+                     "d1",
+                     2,
+                     "paragraph",
+                     text="Safety inspections found three unresolved risks. Corrective actions are due in June.",
+                     reading_order=2,
+                     source_parser="pymupdf",
+                 ),
+             ]
+         )
+         parsed.tables.append(
+             TableObject(
+                 table_id="t1",
+                 page_nums=[1],
+                 markdown="| Metric | Value |\n| --- | --- |\n| Revenue | 12% |",
+                 natural_language_rendering="Table t1 reports revenue growth of 12 percent.",
+                 source_parser="pymupdf",
+             )
+         )
+         parsed.figures.append(
+             FigureObject(
+                 figure_id="f1",
+                 page_num=2,
+                 caption="Risk trend chart shows open safety findings.",
+                 source_parser="pymupdf",
+             )
+         )
+         config = load_config(
+             overrides={
+                 "chunking": {
+                     "contextual_retrieval": True,
+                     "semantic_chunking": True,
+                     "late_chunking": True,
+                     "vision_guided": True,
+                     "agentic_proposition_chunking": True,
+                 }
+             }
+         )
+
+         parsed.chunks = build_agentic_chunks(parsed, profile, config)
+         strategies = {chunk.strategy for chunk in parsed.chunks}
+
+         self.assertIn("semantic", strategies)
+         self.assertIn("late", strategies)
+         self.assertIn("contextual_retrieval", strategies)
+         self.assertIn("vision_guided", strategies)
+         self.assertIn("agentic_proposition", strategies)
+         self.assertGreater(parsed.provenance["chunking"]["semantic_chunk_count"], 0)
+         self.assertGreater(parsed.provenance["chunking"]["late_chunk_count"], 0)
+         self.assertGreater(parsed.provenance["chunking"]["contextual_retrieval_chunk_count"], 0)
+         semantic_chunk = next(chunk for chunk in parsed.chunks if chunk.strategy == "semantic")
+         self.assertEqual(semantic_chunk.metadata["execution_mode"], "lexical_similarity_proxy")
+         contextual_chunk = next(chunk for chunk in parsed.chunks if chunk.strategy == "contextual_retrieval")
+         self.assertIn("source_chunk_id", contextual_chunk.metadata)
+         late_chunk = next(chunk for chunk in parsed.chunks if chunk.strategy == "late")
+         self.assertTrue(late_chunk.metadata["requires_token_level_embeddings"])
+
+
+ if __name__ == "__main__":
+     unittest.main()
tests/test_cli_help.py ADDED
@@ -0,0 +1,91 @@
+ """Tests guarding CLI help text — examples must render and stay clean."""
+
+ from __future__ import annotations
+
+ import io
+ import unittest
+ from contextlib import redirect_stdout
+
+ from zsgdp.cli import _epilog, main
+
+
+ def _capture_help(argv: list[str]) -> str:
+     """Run `zsgdp <argv> --help` and return captured stdout. SystemExit is normal."""
+
+     buffer = io.StringIO()
+     with redirect_stdout(buffer):
+         try:
+             main(argv + ["--help"])
+         except SystemExit:
+             pass
+     return buffer.getvalue()
+
+
+ class EpilogFormatterTests(unittest.TestCase):
+     def test_epilog_dedents_indented_source_string(self):
+         rendered = _epilog(
+             """
+             zsgdp parse --input ./a --output ./b
+             zsgdp parse --input ./c --output ./d
+             """
+         )
+         # No double-indentation; first non-blank line begins with two spaces only.
+         lines = rendered.splitlines()
+         self.assertEqual(lines[0], "Examples:")
+         self.assertTrue(lines[1].startswith("  zsgdp parse"))
+         # No source-indent leak.
+         self.assertNotIn("    zsgdp", rendered)
+
+     def test_epilog_preserves_blank_lines_as_separators(self):
+         rendered = _epilog(
+             """
+             line one
+
+             line two
+             """
+         )
+         self.assertIn("\n\n", rendered)
+
+
+ class SubcommandHelpTests(unittest.TestCase):
+     def test_top_level_help_lists_examples_section(self):
+         text = _capture_help([])
+         self.assertIn("Examples:", text)
+         self.assertIn("zsgdp parse", text)
+         self.assertIn("docs/space_smoke.md", text)
+
+     def test_parse_help_has_examples(self):
+         text = _capture_help(["parse"])
+         self.assertIn("Examples:", text)
+         self.assertIn("zsgdp parse --input", text)
+         self.assertIn("--config configs/docling.yaml", text)
+
+     def test_benchmark_help_covers_three_dataset_modes(self):
+         text = _capture_help(["benchmark"])
+         self.assertIn("Examples:", text)
+         self.assertIn("--dataset omnidocbench", text)
+         self.assertIn("--dataset doclaynet", text)
+
+     def test_benchmark_ablate_shows_merged_arm_pattern(self):
+         text = _capture_help(["benchmark-ablate"])
+         self.assertIn("--parser docling --parser pymupdf", text)
+         self.assertIn("--no-merged", text)
+
+     def test_run_gpu_tasks_documents_dry_run_vs_execute(self):
+         text = _capture_help(["run-gpu-tasks"])
+         self.assertIn("Dry-run", text)
+         self.assertIn("--execute", text)
+
+     def test_combine_benchmarks_shows_label_pairing(self):
+         text = _capture_help(["combine-benchmarks"])
+         self.assertIn("--label omnidocbench", text)
+         self.assertIn("--label doclaynet", text)
+
+     def test_preflight_help_documents_skip_flags(self):
+         text = _capture_help(["preflight"])
+         self.assertIn("--benchmark", text)
+         self.assertIn("--skip-unit", text)
+
+
+ if __name__ == "__main__":
+     unittest.main()
tests/test_conflict_detection.py ADDED
@@ -0,0 +1,89 @@
+ import tempfile
+ import unittest
+ from pathlib import Path
+
+ from zsgdp.export import export_parsed_document
+ from zsgdp.merge.conflict_detection import build_candidate_conflict_report, detect_candidate_conflicts
+ from zsgdp.merge.merge_candidates import merge_candidates
+ from zsgdp.schema import DocumentProfile, Element, PageProfile, ParseCandidate, TableObject
+
+
+ class ConflictDetectionTests(unittest.TestCase):
+     def test_conflict_report_flags_reading_order_and_table_structure(self):
+         candidates = [_candidate("docling", ["Alpha", "Beta", "Gamma"], 3), _candidate("pymupdf", ["Gamma", "Beta", "Alpha"], 2)]
+
+         report = build_candidate_conflict_report(candidates)
+         issues = detect_candidate_conflicts(candidates)
+
+         conflict_types = {conflict["type"] for conflict in report["conflicts"]}
+         self.assertIn("reading_order_disagreement", conflict_types)
+         self.assertIn("table_structure_disagreement", conflict_types)
+         self.assertTrue(issues)
+         self.assertTrue(all(issue.issue_type == "parser_disagreement" for issue in issues))
+
+     def test_merge_stores_and_exports_conflict_report(self):
+         profile = DocumentProfile(
+             doc_id="d1",
+             source_path="sample.pdf",
+             file_type="pdf",
+             page_count=1,
+             extension=".pdf",
+             pages=[PageProfile(page_num=1, digital_text_chars=30)],
+         )
+         parsed = merge_candidates(
+             [_candidate("docling", ["Alpha", "Beta", "Gamma"], 3), _candidate("pymupdf", ["Gamma", "Beta", "Alpha"], 2)],
+             profile,
+         )
+
+         with tempfile.TemporaryDirectory() as tmp:
+             output_dir = Path(tmp) / "out"
+             export_parsed_document(parsed, output_dir)
+
+             self.assertTrue((output_dir / "conflict_report.json").exists())
+
+         self.assertIn("conflict_report", parsed.provenance)
+         self.assertGreater(parsed.provenance["conflict_report"]["conflict_count"], 0)
+
+
+ def _candidate(parser_name: str, ordered_text: list[str], table_columns: int) -> ParseCandidate:
+     elements = [
+         Element(
+             element_id=f"{parser_name}_e{index}",
+             doc_id="d1",
+             page_num=1,
+             type="paragraph",
+             text=text,
+             reading_order=index,
+             confidence=0.8,
+             source_parser=parser_name,
+         )
+         for index, text in enumerate(ordered_text, start=1)
+     ]
+     return ParseCandidate(
+         parser_name=parser_name,
+         doc_id="d1",
+         source_path="sample.pdf",
+         file_type="pdf",
+         pages=[{"page_num": 1, "source_parser": parser_name}],
+         elements=elements,
+         tables=[
+             TableObject(
+                 table_id=f"{parser_name}_t1",
+                 page_nums=[1],
+                 markdown=_table_markdown(table_columns),
+                 confidence=0.8,
+                 source_parser=parser_name,
+             )
+         ],
+         confidence=0.8,
+     )
+
+
+ def _table_markdown(columns: int) -> str:
+     if columns == 3:
+         return "| Region | Q1 | Q2 |\n| --- | --- | --- |\n| NA | 10 | 12 |"
+     return "| Region | Q1 |\n| --- | --- |\n| NA | 10 |"
+
+
+ if __name__ == "__main__":
+     unittest.main()
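A conflict_report.json sketch consistent with these assertions; only the fields the tests actually touch are shown, and the entries are illustrative (real reports likely carry more detail per conflict):

    {
      "conflict_count": 2,
      "conflicts": [
        {"type": "reading_order_disagreement"},
        {"type": "table_structure_disagreement"}
      ]
    }

The merge test also shows the same report object being embedded under provenance["conflict_report"], so downstream callers can gate on conflict_count without re-reading the exported file.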
tests/test_cross_dataset.py ADDED
@@ -0,0 +1,123 @@
+ """Tests for cross-dataset benchmark comparison."""
+
+ from __future__ import annotations
+
+ import json
+ import tempfile
+ import unittest
+ from pathlib import Path
+
+ from zsgdp.benchmarks.cross_dataset import (
+     combine_benchmark_summaries,
+     write_cross_dataset_outputs,
+ )
+
+
+ def _summary(dataset_name: str, *, layout_f1: float, leaderboard: list[dict] | None = None) -> dict:
+     return {
+         "dataset_name": dataset_name,
+         "dataset_root": f"/tmp/{dataset_name}",
+         "document_count": 5,
+         "mean_quality_score": 0.9,
+         "mean_layout_f1": layout_f1,
+         "mean_retrieval_recall_at_5": 0.7,
+         "mean_table_structure_score": 0.6,
+         "mean_formula_cer": 0.2,
+         "per_parser_gt_leaderboard": leaderboard or [],
+     }
+
+
+ class TestCombineBenchmarkSummaries(unittest.TestCase):
+     def test_two_runs_produce_two_rows(self):
+         runs = [
+             ("docs_a", _summary("docs_a", layout_f1=0.5)),
+             ("docs_b", _summary("docs_b", layout_f1=0.8)),
+         ]
+         comparison = combine_benchmark_summaries(runs)
+         self.assertEqual(comparison["run_count"], 2)
+         self.assertEqual(comparison["labels"], ["docs_a", "docs_b"])
+         self.assertEqual([row["label"] for row in comparison["dataset_summary"]], ["docs_a", "docs_b"])
+         layouts = {row["label"]: row["mean_layout_f1"] for row in comparison["dataset_summary"]}
+         self.assertEqual(layouts, {"docs_a": 0.5, "docs_b": 0.8})
+
+     def test_parser_matrix_aligns_parsers_across_runs(self):
+         leaderboard_a = [
+             {"parser": "docling", "mean_layout_class_aware_f1": 0.9, "document_count": 3},
+             {"parser": "pymupdf", "mean_layout_class_aware_f1": 0.4, "document_count": 3},
+         ]
+         leaderboard_b = [
+             {"parser": "docling", "mean_layout_class_aware_f1": 0.7, "document_count": 5},
+             # marker only appears in run B.
+             {"parser": "marker", "mean_layout_class_aware_f1": 0.6, "document_count": 5},
+         ]
+         runs = [
+             ("a", _summary("a", layout_f1=0.5, leaderboard=leaderboard_a)),
+             ("b", _summary("b", layout_f1=0.7, leaderboard=leaderboard_b)),
+         ]
+         comparison = combine_benchmark_summaries(runs)
+
+         matrix = comparison["parser_matrix"]
+         parsers = sorted(row["parser"] for row in matrix)
+         self.assertEqual(parsers, ["docling", "marker", "pymupdf"])
+
+         by_parser = {row["parser"]: row for row in matrix}
+         # Docling appears in both runs.
+         self.assertEqual(by_parser["docling"]["a__mean_layout_class_aware_f1"], 0.9)
+         self.assertEqual(by_parser["docling"]["b__mean_layout_class_aware_f1"], 0.7)
+         # Marker missing in run A -> None, present in B.
+         self.assertIsNone(by_parser["marker"]["a__mean_layout_class_aware_f1"])
+         self.assertEqual(by_parser["marker"]["b__mean_layout_class_aware_f1"], 0.6)
+         # PyMuPDF missing in run B -> None.
+         self.assertIsNone(by_parser["pymupdf"]["b__mean_layout_class_aware_f1"])
+
+     def test_duplicate_labels_raise(self):
+         with self.assertRaises(ValueError):
+             combine_benchmark_summaries(
+                 [
+                     ("same", _summary("a", layout_f1=0.5)),
+                     ("same", _summary("b", layout_f1=0.7)),
+                 ]
+             )
+
+     def test_summary_loaded_from_path(self):
+         with tempfile.TemporaryDirectory() as tmp:
+             tmp = Path(tmp)
+             (tmp / "results.json").write_text(json.dumps(_summary("from_path", layout_f1=0.42)))
+
+             comparison = combine_benchmark_summaries([("a", tmp)])
+             self.assertEqual(comparison["dataset_summary"][0]["mean_layout_f1"], 0.42)
+
+     def test_missing_metric_yields_none_not_zero(self):
+         # A summary missing mean_formula_cer (e.g. from an older run) is reported as None.
+         sparse_summary = {"dataset_name": "old_run", "document_count": 1}
+         comparison = combine_benchmark_summaries([("old", sparse_summary)])
+         row = comparison["dataset_summary"][0]
+         self.assertEqual(row["document_count"], 1)
+         self.assertIsNone(row["mean_layout_f1"])
+         self.assertIsNone(row["mean_formula_cer"])
+
+
+ class TestWriteCrossDatasetOutputs(unittest.TestCase):
+     def test_writes_json_and_csvs(self):
+         leaderboard = [{"parser": "docling", "mean_layout_class_aware_f1": 0.9, "document_count": 3}]
+         comparison = combine_benchmark_summaries(
+             [("a", _summary("a", layout_f1=0.5, leaderboard=leaderboard))]
+         )
+         with tempfile.TemporaryDirectory() as tmp:
+             tmp = Path(tmp)
+             write_cross_dataset_outputs(comparison, tmp)
+
+             self.assertTrue((tmp / "cross_dataset_comparison.json").exists())
+             self.assertTrue((tmp / "dataset_summary.csv").exists())
+             self.assertTrue((tmp / "parser_matrix.csv").exists())
+
+             ds_csv = (tmp / "dataset_summary.csv").read_text()
+             self.assertIn("mean_layout_f1", ds_csv.splitlines()[0])
+             self.assertIn("a", ds_csv.splitlines()[1])
+
+             matrix_csv = (tmp / "parser_matrix.csv").read_text()
+             self.assertIn("a__mean_layout_class_aware_f1", matrix_csv.splitlines()[0])
+
+
+ if __name__ == "__main__":
+     unittest.main()
tests/test_datasets.py ADDED
@@ -0,0 +1,152 @@
+ """Dataset loader tests."""
+
+ from __future__ import annotations
+
+ import json
+ import tempfile
+ import unittest
+ from pathlib import Path
+
+ from zsgdp.benchmarks.datasets import (
+     DatasetDocument,
+     get_dataset_loader,
+     iter_dataset,
+     list_dataset_loaders,
+     register_dataset_loader,
+ )
+
+
+ class TestDatasetRegistry(unittest.TestCase):
+     def test_built_in_loaders_registered(self):
+         loaders = list_dataset_loaders()
+         self.assertIn("custom_folder", loaders)
+         self.assertIn("omnidocbench", loaders)
+         self.assertIn("doclaynet", loaders)
+
+     def test_custom_alias_resolves_to_custom_folder(self):
+         loader_default = get_dataset_loader("default")
+         loader_alias = get_dataset_loader("custom")
+         loader_canonical = get_dataset_loader("custom_folder")
+         self.assertIs(loader_default, loader_canonical)
+         self.assertIs(loader_alias, loader_canonical)
+
+     def test_unknown_loader_raises(self):
+         with self.assertRaises(KeyError):
+             get_dataset_loader("not_a_real_dataset")
+
+
+ class TestCustomFolderLoader(unittest.TestCase):
+     def test_yields_files_with_no_ground_truth(self):
+         with tempfile.TemporaryDirectory() as tmp:
+             root = Path(tmp)
+             (root / "a.md").write_text("# A\n")
+             (root / "b.md").write_text("# B\n")
+             (root / "subdir").mkdir()
+             (root / "subdir" / "ignored.md").write_text("# nope\n")
+
+             documents = list(iter_dataset("custom_folder", root))
+
+             ids = sorted(document.doc_id for document in documents)
+             self.assertEqual(ids, ["a", "b"])
+             for document in documents:
+                 self.assertIsNone(document.ground_truth)
+                 self.assertEqual(document.dataset_id, "custom_folder")
+                 self.assertTrue(document.path.exists())
+
+     def test_missing_root_raises(self):
+         with self.assertRaises(FileNotFoundError):
+             list(iter_dataset("custom_folder", "/tmp/this-path-should-not-exist-zsgdp"))
+
+
+ class TestOmniDocBenchLoader(unittest.TestCase):
+     def test_pairs_pdf_with_sibling_json(self):
+         with tempfile.TemporaryDirectory() as tmp:
+             root = Path(tmp)
+             (root / "doc1.pdf").write_bytes(b"%PDF-1.4\n%%EOF\n")
+             (root / "doc1.json").write_text(json.dumps({"reading_order": ["e1", "e2"]}))
+             (root / "doc2.pdf").write_bytes(b"%PDF-1.4\n%%EOF\n")  # no GT
+
+             documents = list(iter_dataset("omnidocbench", root))
+
+             by_id = {document.doc_id: document for document in documents}
+             self.assertEqual(set(by_id), {"doc1", "doc2"})
+
+             self.assertIsNotNone(by_id["doc1"].ground_truth)
+             self.assertEqual(by_id["doc1"].ground_truth["reading_order"], ["e1", "e2"])
+             self.assertTrue(by_id["doc1"].metadata["has_ground_truth"])
+
+             self.assertIsNone(by_id["doc2"].ground_truth)
+             self.assertFalse(by_id["doc2"].metadata["has_ground_truth"])
+
+     def test_no_pdfs_raises(self):
+         with tempfile.TemporaryDirectory() as tmp:
+             with self.assertRaises(FileNotFoundError):
+                 list(iter_dataset("omnidocbench", tmp))
+
+
+ class TestDocLayNetLoader(unittest.TestCase):
+     def test_yields_one_document_per_image_with_filtered_annotations(self):
+         with tempfile.TemporaryDirectory() as tmp:
+             root = Path(tmp)
+             (root / "page1.png").write_bytes(b"\x89PNG\r\n\x1a\n")
+             (root / "page2.png").write_bytes(b"\x89PNG\r\n\x1a\n")
+             (root / "annotations.json").write_text(
+                 json.dumps(
+                     {
+                         "images": [
+                             {"id": 1, "file_name": "page1.png", "width": 800, "height": 1100},
+                             {"id": 2, "file_name": "page2.png", "width": 800, "height": 1100},
+                         ],
+                         "annotations": [
+                             {"id": 10, "image_id": 1, "category_id": 1, "bbox": [0, 0, 100, 50]},
+                             {"id": 11, "image_id": 1, "category_id": 2, "bbox": [0, 60, 100, 50]},
+                             {"id": 12, "image_id": 2, "category_id": 1, "bbox": [0, 0, 100, 50]},
+                         ],
+                         "categories": [
+                             {"id": 1, "name": "Title"},
+                             {"id": 2, "name": "Text"},
+                         ],
+                     }
+                 )
+             )
+
+             documents = list(iter_dataset("doclaynet", root))
+
+             by_id = {document.doc_id: document for document in documents}
+             self.assertEqual(set(by_id), {"page1.png", "page2.png"})
+
+             self.assertEqual(len(by_id["page1.png"].ground_truth["annotations"]), 2)
+             self.assertEqual(len(by_id["page2.png"].ground_truth["annotations"]), 1)
+             self.assertEqual(by_id["page1.png"].ground_truth["categories"][1]["name"], "Title")
+
+     def test_missing_annotations_raises(self):
+         with tempfile.TemporaryDirectory() as tmp:
+             root = Path(tmp)
+             (root / "page1.png").write_bytes(b"\x89PNG\r\n\x1a\n")
+             with self.assertRaises(FileNotFoundError):
+                 list(iter_dataset("doclaynet", root))
+
+
+ class TestRegisterDatasetLoader(unittest.TestCase):
+     def test_register_and_use_custom_loader(self):
+         marker = []
+
+         def fake_loader(root: Path):
+             marker.append(root)
+             yield DatasetDocument(dataset_id="fake", doc_id="x", path=root)
+
+         register_dataset_loader("zsgdp_test_fake", fake_loader)
+         try:
+             documents = list(iter_dataset("zsgdp_test_fake", Path("/tmp/whatever")))
+         finally:
+             from zsgdp.benchmarks.datasets import _LOADERS
+
+             _LOADERS.pop("zsgdp_test_fake", None)
+
+         self.assertEqual(len(documents), 1)
+         self.assertEqual(documents[0].dataset_id, "fake")
+         self.assertEqual(marker, [Path("/tmp/whatever")])
+
+
+ if __name__ == "__main__":
+     unittest.main()
tests/test_deployment.py ADDED
@@ -0,0 +1,43 @@
+ import json
+ import tempfile
+ import unittest
+ from pathlib import Path
+
+ from zsgdp.cli import main
+ from zsgdp.deployment import check_huggingface_space
+
+
+ class DeploymentReadinessTests(unittest.TestCase):
+     def test_space_check_accepts_current_project(self):
+         report = check_huggingface_space(Path.cwd())
+
+         self.assertTrue(report["valid"])
+         self.assertEqual(report["target"], "huggingface_spaces")
+         self.assertEqual(report["space_name"], "zeroshotGPU")
+         self.assertEqual(report["gpu_models_target"], "zeroshotGPU")
+         self.assertEqual(report["failure_count"], 0)
+         self.assertTrue(any(check["status"] == "warn" for check in report["checks"]))
+
+     def test_space_check_cli_writes_report(self):
+         with tempfile.TemporaryDirectory() as tmp:
+             output_path = Path(tmp) / "space_report.json"
+
+             code = main(["space-check", "--root", str(Path.cwd()), "--output", str(output_path)])
+
+             self.assertEqual(code, 0)
+             self.assertTrue(output_path.exists())
+             self.assertTrue(json.loads(output_path.read_text(encoding="utf-8"))["valid"])
+
+     def test_space_check_reports_missing_files(self):
+         with tempfile.TemporaryDirectory() as tmp:
+             root = Path(tmp)
+
+             report = check_huggingface_space(root)
+
+             self.assertFalse(report["valid"])
+             self.assertGreater(report["failure_count"], 0)
+             self.assertTrue(any(check["id"] == "required_file" and check["status"] == "fail" for check in report["checks"]))
+
+
+ if __name__ == "__main__":
+     unittest.main()
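A space-check report sketch matching the fields these tests assert on; values are illustrative, and real reports include more check entries than shown:

    {
      "valid": true,
      "target": "huggingface_spaces",
      "space_name": "zeroshotGPU",
      "gpu_models_target": "zeroshotGPU",
      "failure_count": 0,
      "checks": [
        {"id": "required_file", "status": "pass"},
        {"id": "…", "status": "warn"}
      ]
    }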
tests/test_docling_parser.py ADDED
@@ -0,0 +1,39 @@
+ import unittest
+
+ from zsgdp.parsers.docling_parser import _export_markdown, normalize_docling_markdown
+ from zsgdp.schema import DocumentProfile, PageProfile
+
+
+ class FakeDoclingDocument:
+     def export_to_markdown(self):
+         return "# Report\n\n| A | B |\n| --- | --- |\n| 1 | 2 |"
+
+
+ class DoclingParserTests(unittest.TestCase):
+     def test_export_markdown_uses_docling_method(self):
+         self.assertEqual(_export_markdown(FakeDoclingDocument()), "# Report\n\n| A | B |\n| --- | --- |\n| 1 | 2 |")
+
+     def test_normalize_docling_markdown_emits_schema(self):
+         profile = DocumentProfile(
+             doc_id="d1",
+             source_path="sample.pdf",
+             file_type="pdf",
+             page_count=1,
+             extension=".pdf",
+             pages=[PageProfile(page_num=1, digital_text_chars=20)],
+         )
+
+         candidate = normalize_docling_markdown(
+             markdown="# Report\n\n| A | B |\n| --- | --- |\n| 1 | 2 |",
+             profile=profile,
+             source_path="sample.pdf",
+         )
+
+         self.assertEqual(candidate.parser_name, "docling")
+         self.assertEqual(len(candidate.elements), 2)
+         self.assertEqual(len(candidate.tables), 1)
+         self.assertEqual(candidate.pages[0]["source_parser"], "docling")
+
+
+ if __name__ == "__main__":
+     unittest.main()
tests/test_embedding_retriever.py ADDED
@@ -0,0 +1,190 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests for the embedding-based retriever and the build_retriever factory."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import math
6
+ import tempfile
7
+ import unittest
8
+ from pathlib import Path
9
+ from unittest.mock import patch
+
+ from zsgdp.benchmarks.embedding_retriever import (
+     EmbeddingRetriever,
+     build_retriever,
+ )
+ from zsgdp.benchmarks.parser_quality import run_parser_benchmark
+ from zsgdp.benchmarks.retrieval import LexicalRetriever, run_retrieval_for_document
+ from zsgdp.schema import Chunk, ParsedDocument, QualityReport
+
+
+ def _chunk(chunk_id: str, text: str) -> Chunk:
+     return Chunk(
+         chunk_id=chunk_id,
+         doc_id="d1",
+         page_start=1,
+         page_end=1,
+         section_path=[],
+         content_type="prose",
+         text=text,
+         token_count=len(text.split()),
+     )
+
+
+ def _hashing_embedder(dim: int = 32):
+     """Deterministic toy embedder: tokens hashed into a fixed-dim vector.
+
+     Uses a process-stable hash (hashlib.md5) instead of builtins.hash(), which
+     is randomized per Python process and would make ranking non-deterministic
+     across test runs.
+     """
+
+     import hashlib
+
+     def stable_hash(token: str) -> int:
+         return int.from_bytes(hashlib.md5(token.encode("utf-8")).digest()[:8], "big")
+
+     def encode(texts):
+         out = []
+         for text in texts:
+             vector = [0.0] * dim
+             for token in text.lower().split():
+                 vector[stable_hash(token) % dim] += 1.0
+             out.append(vector)
+         return out
+
+     return encode
+
+
+ class TestEmbeddingRetriever(unittest.TestCase):
+     def test_finds_distinctive_chunk_with_injected_embedder(self):
+         chunks = [
+             _chunk("c1", "Apples grow on trees in the orchard."),
+             _chunk("c2", "Cars drive on highways across the country."),
+             _chunk("c3", "Boats sail on rivers and oceans."),
+         ]
+         retriever = EmbeddingRetriever(embedder=_hashing_embedder())
+         retriever.index(chunks)
+
+         ranking = retriever.query("apples orchard", top_k=3)
+         self.assertEqual(ranking[0], "c1")
+
+     def test_empty_index_returns_empty(self):
+         retriever = EmbeddingRetriever(embedder=_hashing_embedder())
+         self.assertEqual(retriever.query("anything", top_k=3), [])
+
+     def test_zero_norm_vector_skipped(self):
+         retriever = EmbeddingRetriever(embedder=lambda texts: [[0.0, 0.0, 0.0]] * len(texts))
+         retriever.index([_chunk("c1", "anything")])
+         # Query embedder also returns zero vector, normalization fails -> empty.
+         self.assertEqual(retriever.query("anything", top_k=3), [])
+
+     def test_embedder_returning_wrong_count_raises(self):
+         bad = lambda texts: [[1.0]]  # always returns one vector
+         retriever = EmbeddingRetriever(embedder=bad)
+         with self.assertRaises(RuntimeError):
+             retriever.index([_chunk("c1", "a"), _chunk("c2", "b")])
+
+     def test_lazy_load_path_raises_if_sentence_transformers_missing(self):
+         retriever = EmbeddingRetriever(model_id="fake/model")
+         # Force the import to fail by patching builtins.__import__.
+         import builtins
+
+         real_import = builtins.__import__
+
+         def fake_import(name, *args, **kwargs):
+             if name == "sentence_transformers":
+                 raise ImportError("not installed")
+             return real_import(name, *args, **kwargs)
+
+         with patch("builtins.__import__", side_effect=fake_import):
+             with self.assertRaises(RuntimeError) as ctx:
+                 retriever.index([_chunk("c1", "anything")])
+         self.assertIn("sentence-transformers", str(ctx.exception))
+
+
+ class TestBuildRetriever(unittest.TestCase):
+     def test_default_returns_lexical(self):
+         retriever = build_retriever({})
+         self.assertIsInstance(retriever, LexicalRetriever)
+
+     def test_explicit_lexical_backend(self):
+         retriever = build_retriever({"benchmarks": {"retriever": {"backend": "lexical"}}})
+         self.assertIsInstance(retriever, LexicalRetriever)
+
+     def test_embedding_backend_uses_gpu_models_embedding_default(self):
+         config = {
+             "benchmarks": {"retriever": {"backend": "embedding"}},
+             "gpu": {"models": {"embedding": {"model_id": "custom/model", "task": "retrieval.query"}}},
+         }
+         retriever = build_retriever(config)
+         self.assertIsInstance(retriever, EmbeddingRetriever)
+         self.assertEqual(retriever._model_id, "custom/model")
+         self.assertEqual(retriever._task, "retrieval.query")
+
+     def test_explicit_model_id_overrides_gpu_default(self):
+         config = {
+             "benchmarks": {"retriever": {"backend": "embedding", "model_id": "explicit/model"}},
+             "gpu": {"models": {"embedding": {"model_id": "ignored/model"}}},
+         }
+         retriever = build_retriever(config)
+         self.assertEqual(retriever._model_id, "explicit/model")
+
+     def test_unknown_backend_raises(self):
+         with self.assertRaises(ValueError):
+             build_retriever({"benchmarks": {"retriever": {"backend": "magic"}}})
+
+
+ class TestRunRetrievalWithEmbedding(unittest.TestCase):
+     def test_run_retrieval_for_document_accepts_embedding_retriever(self):
+         parsed = ParsedDocument(
+             doc_id="d1",
+             source_path="/tmp/d1.md",
+             file_type="markdown",
+             chunks=[
+                 _chunk("c1", "Apples grow on trees in the orchard during autumn."),
+                 _chunk("c2", "Submarines navigate beneath the ocean using sonar."),
+             ],
+             quality_report=QualityReport(),
+         )
+         retriever = EmbeddingRetriever(embedder=_hashing_embedder())
+         run = run_retrieval_for_document(parsed, retriever=retriever)
+         self.assertTrue(run["evaluated"])
+         self.assertGreater(run["query_count"], 0)
+         for result in run["results"]:
+             truth = result["truths"][0]
+             self.assertEqual(result["retrieved"][0], truth)
+
+
+ class TestBenchmarkOptInToEmbeddingBackend(unittest.TestCase):
+     def test_benchmark_uses_embedding_when_config_says_so(self):
+         # Patch build_retriever to return an EmbeddingRetriever with our toy embedder
+         # so the benchmark exercises the opt-in code path without loading a real model.
+         toy = EmbeddingRetriever(embedder=_hashing_embedder())
+
+         with tempfile.TemporaryDirectory() as tmp:
+             tmp = Path(tmp)
+             src = tmp / "in"
+             src.mkdir()
+             (src / "doc.md").write_text(
+                 "# Doc\n\n"
+                 "Apples grow on trees in the orchard during autumn season.\n\n"
+                 "Submarines navigate beneath the ocean using sonar pulses across waters.\n",
+                 encoding="utf-8",
+             )
+
+             with patch("zsgdp.benchmarks.parser_quality.load_config") as load_config:
+                 load_config.return_value = {
+                     "benchmarks": {"retriever": {"backend": "embedding"}},
+                 }
+                 with patch(
+                     "zsgdp.benchmarks.embedding_retriever.build_retriever",
+                     return_value=toy,
+                 ) as build_call:
+                     summary = run_parser_benchmark(src, tmp / "out", dataset_name="custom_folder")
+
+             self.assertGreaterEqual(build_call.call_count, 1)
+             self.assertTrue(summary["documents"][0]["retrieval_evaluated"])
+
+
+ if __name__ == "__main__":
+     unittest.main()
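
The hashing embedder doubles as a spec for what the retriever must do with whatever vectors come back: normalize, rank by cosine similarity, skip zero-norm vectors, and return chunk ids. A self-contained sketch of that contract, with illustrative names rather than the actual zsgdp internals:

```python
import math


def rank_by_cosine(query_vec, indexed, top_k=3):
    """Rank (chunk_id, vector) pairs by cosine similarity to query_vec.

    Zero-norm vectors are skipped on both sides, matching the behaviour
    the tests above assert for degenerate embeddings.
    """
    def norm(vec):
        return math.sqrt(sum(x * x for x in vec))

    q_norm = norm(query_vec)
    if q_norm == 0.0:
        return []  # query embedding carries no signal
    scored = []
    for chunk_id, vec in indexed:
        v_norm = norm(vec)
        if v_norm == 0.0:
            continue  # skip degenerate chunk embeddings
        cosine = sum(a * b for a, b in zip(query_vec, vec)) / (q_norm * v_norm)
        scored.append((cosine, chunk_id))
    scored.sort(key=lambda pair: pair[0], reverse=True)
    return [chunk_id for _, chunk_id in scored[:top_k]]
```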
tests/test_env_loading.py ADDED
@@ -0,0 +1,110 @@
+ """Tests for .env loading and HF_TOKEN resolution."""
+
+ from __future__ import annotations
+
+ import os
+ import tempfile
+ import unittest
+ from pathlib import Path
+ from unittest.mock import patch
+
+ from zsgdp.config import hf_token, load_env_file
+
+
+ class LoadEnvFileTests(unittest.TestCase):
+     def test_loads_simple_key_value(self):
+         with tempfile.TemporaryDirectory() as tmp:
+             env = Path(tmp) / ".env"
+             env.write_text("HF_TOKEN=hf_test_value_123\nOTHER=foo\n", encoding="utf-8")
+
+             with patch.dict("os.environ", {}, clear=False):
+                 os.environ.pop("HF_TOKEN", None)
+                 os.environ.pop("OTHER", None)
+                 applied = load_env_file(env)
+
+             self.assertEqual(applied["HF_TOKEN"], "hf_test_value_123")
+             self.assertEqual(applied["OTHER"], "foo")
+
+     def test_skips_comments_and_blank_lines(self):
+         with tempfile.TemporaryDirectory() as tmp:
+             env = Path(tmp) / ".env"
+             env.write_text(
+                 "# top comment\n\nFOO=bar\n # indented\n\nBAZ=qux\n",
+                 encoding="utf-8",
+             )
+             with patch.dict("os.environ", {}, clear=False):
+                 os.environ.pop("FOO", None)
+                 os.environ.pop("BAZ", None)
+                 applied = load_env_file(env)
+
+             self.assertEqual(set(applied), {"FOO", "BAZ"})
+
+     def test_quoted_values_unquoted(self):
+         with tempfile.TemporaryDirectory() as tmp:
+             env = Path(tmp) / ".env"
+             env.write_text('A="quoted value"\nB=\'single\'\nC=plain\n', encoding="utf-8")
+             with patch.dict("os.environ", {}, clear=False):
+                 for key in ("A", "B", "C"):
+                     os.environ.pop(key, None)
+                 applied = load_env_file(env)
+
+             self.assertEqual(applied["A"], "quoted value")
+             self.assertEqual(applied["B"], "single")
+             self.assertEqual(applied["C"], "plain")
+
+     def test_export_prefix_stripped(self):
+         with tempfile.TemporaryDirectory() as tmp:
+             env = Path(tmp) / ".env"
+             env.write_text("export FOO=bar\n", encoding="utf-8")
+             with patch.dict("os.environ", {}, clear=False):
+                 os.environ.pop("FOO", None)
+                 applied = load_env_file(env)
+
+             self.assertEqual(applied["FOO"], "bar")
+
+     def test_existing_env_wins_unless_override(self):
+         with tempfile.TemporaryDirectory() as tmp:
+             env = Path(tmp) / ".env"
+             env.write_text("FOO=from_file\n", encoding="utf-8")
+
+             with patch.dict("os.environ", {"FOO": "from_env"}, clear=False):
+                 applied = load_env_file(env)
+                 # Default behaviour: don't override.
+                 self.assertNotIn("FOO", applied)
+                 self.assertEqual(os.environ["FOO"], "from_env")
+
+                 # With override=True, file wins.
+                 applied = load_env_file(env, override=True)
+                 self.assertEqual(applied["FOO"], "from_file")
+                 self.assertEqual(os.environ["FOO"], "from_file")
+
+     def test_missing_file_returns_empty_no_error(self):
+         self.assertEqual(load_env_file(Path("/tmp/zsgdp_does_not_exist.env")), {})
+
+
+ class HFTokenResolverTests(unittest.TestCase):
+     def test_picks_up_hf_token(self):
+         with patch.dict(
+             "os.environ",
+             {"HF_TOKEN": "primary", "HUGGING_FACE_HUB_TOKEN": "secondary"},
+             clear=False,
+         ):
+             self.assertEqual(hf_token(), "primary")
+
+     def test_falls_through_alternative_names(self):
+         with patch.dict("os.environ", {}, clear=True):
+             os.environ["HUGGINGFACE_TOKEN"] = "fallback"
+             self.assertEqual(hf_token(), "fallback")
+
+     def test_recognises_hf_access_token_alias(self):
+         with patch.dict("os.environ", {}, clear=True):
+             os.environ["HF_ACCESS_TOKEN"] = "from_alias"
+             self.assertEqual(hf_token(), "from_alias")
+
+     def test_returns_none_when_unset(self):
+         with patch.dict("os.environ", {}, clear=True):
+             self.assertIsNone(hf_token())
+
+
+ if __name__ == "__main__":
+     unittest.main()
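
Collectively these tests specify the .env parser tightly: comments and blank lines skipped, `export` prefixes tolerated, matching quotes stripped, existing environment winning unless `override=True`, and a missing file returning an empty mapping. A compliant sketch (the real `zsgdp.config.load_env_file` may differ in detail):

```python
import os
from pathlib import Path


def load_env_file_sketch(path: Path, override: bool = False) -> dict:
    """Apply KEY=VALUE pairs from a .env file; return what was applied."""
    applied = {}
    if not path.exists():
        return applied  # missing file is not an error
    for raw in path.read_text(encoding="utf-8").splitlines():
        line = raw.strip()
        if not line or line.startswith("#"):
            continue  # blank lines and comments (even indented) are skipped
        if line.startswith("export "):
            line = line[len("export "):]  # shell-style prefix is tolerated
        key, _, value = line.partition("=")
        key, value = key.strip(), value.strip()
        if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
            value = value[1:-1]  # strip matching quotes
        if key in os.environ and not override:
            continue  # existing environment wins by default
        os.environ[key] = value
        applied[key] = value
    return applied
```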
tests/test_external_parser_adapters.py ADDED
@@ -0,0 +1,69 @@
+ import unittest
+ from unittest.mock import patch
+
+ from zsgdp.config import load_config
+ from zsgdp.normalize.normalize_unstructured import normalize_unstructured_parts
+ from zsgdp.parsers.external import MinerUParser, OlmOCRParser, PaddleOCRParser
+ from zsgdp.schema import DocumentProfile, PageProfile
+
+
+ class ExternalParserAdapterTests(unittest.TestCase):
+     def test_command_backed_parsers_normalize_markdown(self):
+         cases = [
+             (MinerUParser, "mineru"),
+             (OlmOCRParser, "olmocr"),
+             (PaddleOCRParser, "paddleocr"),
+         ]
+         profile = _profile()
+
+         for parser_class, parser_name in cases:
+             with self.subTest(parser=parser_name), patch.object(parser_class, "available", return_value=True), patch(
+                 "zsgdp.parsers.external.run_external_parser_to_markdown",
+                 return_value="# Report\n\n| A | B |\n| --- | --- |\n| 1 | 2 |",
+             ):
+                 candidate = parser_class().parse("sample.pdf", profile, load_config())
+
+             self.assertEqual(candidate.parser_name, parser_name)
+             self.assertEqual(candidate.elements[0].source_parser, parser_name)
+             self.assertEqual(len(candidate.tables), 1)
+             self.assertEqual(candidate.provenance["requested_pages"], [1])
+
+     def test_unstructured_normalizer_preserves_page_and_title_metadata(self):
+         class Metadata:
+             page_number = 2
+
+         class Title:
+             category = "Title"
+             metadata = Metadata()
+
+             def __str__(self):
+                 return "Executive Summary"
+
+         class Narrative:
+             category = "NarrativeText"
+             metadata = Metadata()
+
+             def __str__(self):
+                 return "The document parser keeps provenance."
+
+         candidate = normalize_unstructured_parts(parts=[Title(), Narrative()], profile=_profile(), source_path="sample.pdf")
+
+         self.assertEqual(candidate.parser_name, "unstructured")
+         self.assertEqual(candidate.elements[0].page_num, 2)
+         self.assertEqual(candidate.elements[0].type, "title")
+         self.assertEqual(candidate.elements[0].markdown, "# Executive Summary")
+
+
+ def _profile():
+     return DocumentProfile(
+         doc_id="d1",
+         source_path="sample.pdf",
+         file_type="pdf",
+         page_count=1,
+         extension=".pdf",
+         pages=[PageProfile(page_num=1, digital_text_chars=20)],
+     )
+
+
+ if __name__ == "__main__":
+     unittest.main()
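
The nested Title/Narrative classes spell out the duck-typed surface the normalizer consumes: a `category` string, a `metadata.page_number`, and `str(part)` for the text. A minimal sketch of that mapping (illustrative helper, not the zsgdp API, which builds full ParseCandidate objects):

```python
def part_to_markdown(part):
    """Map one unstructured-style part to (page_num, markdown).

    Only the duck-typed attributes exercised above are consulted.
    """
    page_num = getattr(part.metadata, "page_number", 1)
    text = str(part)
    if part.category == "Title":
        return page_num, f"# {text}"  # titles become headings
    return page_num, text  # narrative text passes through
```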
tests/test_gpu_runner.py ADDED
@@ -0,0 +1,185 @@
+ import json
+ import tempfile
+ import unittest
+ from pathlib import Path
+ from unittest.mock import patch
+
+ from zsgdp.cli import main
+ from zsgdp.config import load_config
+ from zsgdp.gpu.batching import batch_gpu_tasks
+ from zsgdp.gpu.runner import dry_run_gpu_tasks, load_gpu_tasks, run_gpu_task_manifest
+ from zsgdp.gpu.worker import GPUWorker
+ from zsgdp.utils import write_jsonl
+
+
+ class GPURunnerTests(unittest.TestCase):
+     def test_batch_gpu_tasks_groups_by_task_type_and_batch_size(self):
+         tasks = [
+             {"task_id": "a", "task_type": "figure_description", "priority": 1},
+             {"task_id": "b", "task_type": "figure_description", "priority": 2},
+             {"task_id": "c", "task_type": "table_vlm_repair", "priority": 3},
+         ]
+
+         batches = batch_gpu_tasks(tasks, max_batch_size=1)
+
+         self.assertEqual(len(batches), 3)
+         self.assertEqual(batches[0]["task_count"], 1)
+         self.assertEqual({batch["task_type"] for batch in batches}, {"figure_description", "table_vlm_repair"})
+
+     def test_worker_reports_missing_image_path(self):
+         worker = GPUWorker(load_config())
+
+         result = worker.run(
+             {
+                 "task_id": "gt1",
+                 "task_type": "figure_description",
+                 "doc_id": "d1",
+                 "page_nums": [1],
+                 "image_path": "/tmp/does-not-exist.png",
+             }
+         )
+
+         self.assertEqual(result["status"], "blocked_missing_inputs")
+         self.assertIn("image_path", result["readiness"]["missing_inputs"])
+
+     def test_run_gpu_task_manifest_writes_report(self):
+         with tempfile.TemporaryDirectory() as tmp:
+             tmp_path = Path(tmp)
+             image_path = tmp_path / "figure.png"
+             image_path.write_bytes(b"fake")
+             tasks_path = tmp_path / "gpu_tasks.jsonl"
+             report_path = tmp_path / "report.json"
+             write_jsonl(
+                 tasks_path,
+                 [
+                     {
+                         "task_id": "gt1",
+                         "task_type": "figure_description",
+                         "doc_id": "d1",
+                         "page_nums": [1],
+                         "image_path": str(image_path),
+                         "priority": 60,
+                     }
+                 ],
+             )
+
+             report = run_gpu_task_manifest(tmp_path, config=load_config(), output_path=report_path)
+
+             self.assertEqual(report["task_count"], 1)
+             self.assertEqual(report["ready_count"], 1)
+             self.assertTrue(report_path.exists())
+             self.assertEqual(json.loads(report_path.read_text(encoding="utf-8"))["batch_count"], 1)
+
+     def test_dry_run_gpu_tasks_accepts_in_memory_tasks(self):
+         with tempfile.TemporaryDirectory() as tmp:
+             image_path = Path(tmp) / "figure.png"
+             image_path.write_bytes(b"fake")
+
+             report = dry_run_gpu_tasks(
+                 [
+                     {
+                         "task_id": "gt1",
+                         "task_type": "figure_description",
+                         "doc_id": "d1",
+                         "page_nums": [1],
+                         "image_path": str(image_path),
+                         "priority": 60,
+                     }
+                 ],
+                 config=load_config(),
+             )
+
+             self.assertEqual(report["ready_count"], 1)
+             self.assertEqual(report["blocked_count"], 0)
+
+     def test_execute_gpu_tasks_dispatches_transformers_client(self):
+         with tempfile.TemporaryDirectory() as tmp:
+             image_path = Path(tmp) / "figure.png"
+             image_path.write_bytes(b"fake")
+             task = {
+                 "task_id": "gt1",
+                 "task_type": "figure_description",
+                 "doc_id": "d1",
+                 "page_nums": [1],
+                 "image_path": str(image_path),
+                 "priority": 60,
+                 "backend": "transformers",
+                 "model_role": "vlm",
+                 "model_id": "local-test-model",
+             }
+
+             with patch("zsgdp.gpu.worker.TransformersClient") as client_class:
+                 client_class.return_value.execute_task.return_value = {"status": "executed", "text": "Figure description."}
+                 report = dry_run_gpu_tasks([task], config=load_config(), dry_run=False)
+
+             self.assertFalse(report["dry_run"])
+             self.assertEqual(report["executed_count"], 1)
+             self.assertEqual(report["failed_count"], 0)
+             self.assertEqual(report["batches"][0]["status"], "execute_complete")
+             client_class.return_value.execute_task.assert_called_once()
+
+     def test_load_gpu_tasks_accepts_file_path(self):
+         with tempfile.TemporaryDirectory() as tmp:
+             tasks_path = Path(tmp) / "tasks.jsonl"
+             write_jsonl(tasks_path, [{"task_id": "gt1"}])
+
+             self.assertEqual(load_gpu_tasks(tasks_path)[0]["task_id"], "gt1")
+
+     def test_run_gpu_tasks_cli(self):
+         with tempfile.TemporaryDirectory() as tmp:
+             tmp_path = Path(tmp)
+             tasks_path = tmp_path / "gpu_tasks.jsonl"
+             report_path = tmp_path / "report.json"
+             write_jsonl(
+                 tasks_path,
+                 [
+                     {
+                         "task_id": "gt1",
+                         "task_type": "figure_description",
+                         "doc_id": "d1",
+                         "page_nums": [1],
+                         "image_path": str(tmp_path / "missing.png"),
+                         "priority": 60,
+                     }
+                 ],
+             )
+
+             code = main(["run-gpu-tasks", "--input", str(tasks_path), "--output", str(report_path)])
+
+             self.assertEqual(code, 0)
+             self.assertTrue(report_path.exists())
+
+     def test_run_gpu_tasks_cli_execute(self):
+         with tempfile.TemporaryDirectory() as tmp:
+             tmp_path = Path(tmp)
+             image_path = tmp_path / "figure.png"
+             image_path.write_bytes(b"fake")
+             tasks_path = tmp_path / "gpu_tasks.jsonl"
+             report_path = tmp_path / "report.json"
+             write_jsonl(
+                 tasks_path,
+                 [
+                     {
+                         "task_id": "gt1",
+                         "task_type": "figure_description",
+                         "doc_id": "d1",
+                         "page_nums": [1],
+                         "image_path": str(image_path),
+                         "priority": 60,
+                         "backend": "transformers",
+                         "model_role": "vlm",
+                         "model_id": "local-test-model",
+                     }
+                 ],
+             )
+
+             with patch("zsgdp.gpu.worker.TransformersClient") as client_class:
+                 client_class.return_value.execute_task.return_value = {"status": "executed", "text": "done"}
+                 code = main(["run-gpu-tasks", "--input", str(tasks_path), "--output", str(report_path), "--execute"])
+
+             self.assertEqual(code, 0)
+             self.assertEqual(json.loads(report_path.read_text(encoding="utf-8"))["executed_count"], 1)
+
+
+ if __name__ == "__main__":
+     unittest.main()
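
The first test pins down the batching contract: group by task_type, split each group at max_batch_size, and report a task_count per batch. A simplified stand-in for `zsgdp.gpu.batching.batch_gpu_tasks` (the priority ordering within a group is an assumption here):

```python
from collections import defaultdict


def batch_tasks(tasks, max_batch_size):
    """Group tasks by task_type, then split each group into capped batches."""
    groups = defaultdict(list)
    for task in tasks:
        groups[task["task_type"]].append(task)
    batches = []
    for task_type, group in groups.items():
        group.sort(key=lambda t: t.get("priority", 0), reverse=True)  # assumed ordering
        for start in range(0, len(group), max_batch_size):
            chunk = group[start:start + max_batch_size]
            batches.append({"task_type": task_type, "task_count": len(chunk), "tasks": chunk})
    return batches
```

With the three tasks above and `max_batch_size=1`, this yields three single-task batches spanning both task types, matching the assertions.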
tests/test_gpu_runtime.py ADDED
@@ -0,0 +1,47 @@
+ import unittest
+ from unittest.mock import patch
+
+ from zsgdp.config import load_config
+ from zsgdp.gpu import GPUModelConfig, collect_gpu_runtime_status
+
+
+ class GPURuntimeTests(unittest.TestCase):
+     def test_model_config_reads_gpu_section(self):
+         config = load_config(overrides={"gpu": {"backend": "vllm", "provider": "huggingface_spaces", "space_name": "zeroshotGPU", "max_batch_size": 8}})
+
+         model_config = GPUModelConfig.from_config(config)
+
+         self.assertEqual(model_config.backend, "vllm")
+         self.assertEqual(model_config.provider, "huggingface_spaces")
+         self.assertEqual(model_config.space_name, "zeroshotGPU")
+         self.assertEqual(model_config.max_batch_size, 8)
+
+     def test_collect_runtime_detects_space_environment(self):
+         config = load_config()
+
+         with patch.dict("os.environ", {"SPACE_ID": "user/zeroshotGPU", "SPACE_HARDWARE": "l4x1"}, clear=False):
+             status = collect_gpu_runtime_status(config).to_dict()
+
+         self.assertEqual(status["provider"], "huggingface_spaces")
+         self.assertEqual(status["space_name"], "zeroshotGPU")
+         self.assertEqual(status["gpu_models_target"], "zeroshotGPU")
+         self.assertTrue(status["running_on_huggingface_space"])
+         self.assertEqual(status["space_id"], "user/zeroshotGPU")
+         self.assertEqual(status["hardware"], "l4x1")
+         self.assertIn(status["device"], {"cpu", "cuda", "mps"})
+         self.assertIn("torch_available", status)
+         self.assertEqual(status["configured_models"]["vlm"]["model_id"], "Qwen/Qwen2.5-VL-3B-Instruct")
+         self.assertEqual(status["configured_models"]["embedding"]["model_id"], "jinaai/jina-embeddings-v3")
+
+     def test_collect_runtime_reports_local_note(self):
+         config = load_config()
+
+         with patch.dict("os.environ", {"SPACE_ID": "", "SPACE_HOST": "", "SPACE_HARDWARE": ""}, clear=False):
+             status = collect_gpu_runtime_status(config)
+
+         self.assertFalse(status.running_on_huggingface_space)
+         self.assertTrue(any("local run" in note for note in status.notes))
+
+
+ if __name__ == "__main__":
+     unittest.main()
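
Space detection here is driven entirely by environment variables Hugging Face sets inside a running Space (SPACE_ID, SPACE_HARDWARE). A sketch of the inference the assertions imply (the real status collector also folds in the configured space_name and torch device probing):

```python
import os


def detect_space():
    """Infer Space context from env vars set inside a running HF Space."""
    space_id = os.environ.get("SPACE_ID", "")
    return {
        "running_on_huggingface_space": bool(space_id),
        "space_id": space_id or None,
        "hardware": os.environ.get("SPACE_HARDWARE") or None,
        # "user/zeroshotGPU" -> "zeroshotGPU"
        "space_name": space_id.rsplit("/", 1)[-1] if space_id else None,
    }
```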
tests/test_gpu_tasks.py ADDED
@@ -0,0 +1,99 @@
+ import unittest
+
+ from zsgdp.config import load_config
+ from zsgdp.gpu import plan_gpu_tasks
+ from zsgdp.routing import RouteDecision
+ from zsgdp.routing.budgets import Budget
+ from zsgdp.schema import DocumentProfile, FigureObject, PageProfile, ParsedDocument, TableObject
+
+
+ class GPUTaskTests(unittest.TestCase):
+     def test_plan_gpu_tasks_includes_route_ocr_table_and_figure(self):
+         config = load_config(overrides={"chunking": {"vision_guided": True}})
+         profile = DocumentProfile(
+             doc_id="d1",
+             source_path="sample.pdf",
+             file_type="pdf",
+             page_count=1,
+             extension=".pdf",
+             pages=[
+                 PageProfile(page_num=1, scanned_score=0.8, digital_text_chars=0, digital_text_quality=0.0),
+             ],
+         )
+         parsed = ParsedDocument(
+             doc_id="d1",
+             source_path="sample.pdf",
+             file_type="pdf",
+             pages=[
+                 {
+                     "page_num": 1,
+                     "parser_pages": [
+                         {"rendered_page": {"image_path": "/tmp/page.png"}},
+                     ],
+                 }
+             ],
+         )
+         parsed.tables.append(
+             TableObject(
+                 table_id="t1",
+                 page_nums=[1],
+                 bbox=[(1.0, 2.0, 3.0, 4.0)],
+                 markdown="| A | B |\n| --- | --- |\n| 1 | 2 |",
+                 provenance={"crop_path": "/tmp/table.png"},
+             )
+         )
+         parsed.figures.append(FigureObject(figure_id="f1", page_num=1, image_path="/tmp/figure.png"))
+         routes = [
+             RouteDecision(
+                 page_id=1,
+                 experts=["pymupdf", "vlm_figure_repair"],
+                 reason="figure-heavy page",
+                 budget=Budget(),
+                 labels=["figure_heavy"],
+             )
+         ]
+
+         tasks = plan_gpu_tasks(profile, parsed, config, routes)
+
+         task_types = [task["task_type"] for task in tasks]
+         self.assertIn("vlm_route_repair", task_types)
+         self.assertIn("ocr_page", task_types)
+         self.assertIn("table_vlm_repair", task_types)
+         self.assertIn("figure_description", task_types)
+         self.assertEqual(tasks[0]["task_type"], "vlm_route_repair")
+         self.assertTrue(all(task["provider"] == "huggingface_spaces" for task in tasks))
+         self.assertTrue(all(task["space_name"] == "zeroshotGPU" for task in tasks))
+         self.assertTrue(all(task["model_id"] for task in tasks))
+         self.assertEqual(_task_by_type(tasks, "ocr_page")["model_role"], "ocr")
+         self.assertEqual(_task_by_type(tasks, "table_vlm_repair")["model_role"], "table")
+         self.assertEqual(_task_by_type(tasks, "figure_description")["model_role"], "vlm")
+         self.assertEqual(_task_by_type(tasks, "figure_description")["model_id"], "Qwen/Qwen2.5-VL-3B-Instruct")
+
+     def test_plan_gpu_tasks_respects_max_vlm_calls(self):
+         config = load_config(overrides={"gpu": {"max_vlm_calls_per_doc": 1}, "chunking": {"vision_guided": True}})
+         profile = DocumentProfile(
+             doc_id="d1",
+             source_path="sample.pdf",
+             file_type="pdf",
+             page_count=1,
+             extension=".pdf",
+             pages=[PageProfile(page_num=1, scanned_score=0.8)],
+         )
+         parsed = ParsedDocument(doc_id="d1", source_path="sample.pdf", file_type="pdf")
+         parsed.figures.append(FigureObject(figure_id="f1", page_num=1, image_path="/tmp/figure.png"))
+
+         tasks = plan_gpu_tasks(profile, parsed, config)
+
+         self.assertEqual(len(tasks), 1)
+         self.assertEqual(tasks[0]["task_type"], "ocr_page")
+
+
+ def _task_by_type(tasks, task_type):
+     for task in tasks:
+         if task["task_type"] == task_type:
+             return task
+     raise AssertionError(f"Missing task type: {task_type}")
+
+
+ if __name__ == "__main__":
+     unittest.main()
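
The budget test shows discretionary VLM work being dropped once max_vlm_calls_per_doc is spent, while OCR work survives. One plausible way to enforce such a cap, offered as a guess at the mechanism rather than the planner's actual code (which also assigns providers, model roles, and model ids):

```python
def cap_vlm_tasks(tasks, max_vlm_calls):
    """Keep at most max_vlm_calls tasks whose model_role is 'vlm'."""
    kept, vlm_used = [], 0
    for task in tasks:  # assumed already sorted by descending priority
        if task.get("model_role") == "vlm":
            if vlm_used >= max_vlm_calls:
                continue  # budget spent: drop discretionary VLM work
            vlm_used += 1
        kept.append(task)
    return kept
```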
tests/test_layout_f1.py ADDED
@@ -0,0 +1,190 @@
+ """Tests for layout F1 metric and ground-truth adapters."""
+
+ from __future__ import annotations
+
+ import unittest
+
+ from zsgdp.benchmarks.ground_truth import (
+     canonical_category,
+     doclaynet_layout_truths,
+     omnidocbench_layout_truths,
+     parsed_layout_predictions,
+ )
+ from zsgdp.schema import Element, FigureObject, ParsedDocument, QualityReport, TableObject
+ from zsgdp.verify.layout_f1 import compute_layout_f1
+
+
+ def _item(bbox, category="paragraph", page_num=1):
+     return {"bbox": bbox, "category": category, "page_num": page_num}
+
+
+ class TestComputeLayoutF1(unittest.TestCase):
+     def test_perfect_match_yields_f1_1(self):
+         predictions = [_item((0, 0, 100, 50)), _item((0, 60, 100, 110), "table")]
+         truths = [_item((0, 0, 100, 50)), _item((0, 60, 100, 110), "table")]
+         result = compute_layout_f1(predictions, truths)
+         self.assertEqual(result["class_aware"]["f1"], 1.0)
+         self.assertEqual(result["class_agnostic"]["f1"], 1.0)
+         self.assertEqual(result["class_aware"]["tp"], 2)
+
+     def test_zero_match_yields_f1_0(self):
+         predictions = [_item((0, 0, 50, 50))]
+         truths = [_item((1000, 1000, 1100, 1100))]
+         result = compute_layout_f1(predictions, truths)
+         self.assertEqual(result["class_aware"]["f1"], 0.0)
+         self.assertEqual(result["class_aware"]["fp"], 1)
+         self.assertEqual(result["class_aware"]["fn"], 1)
+
+     def test_iou_below_threshold_misses(self):
+         # 50% overlap by area in one axis only -> IoU < 0.5
+         predictions = [_item((0, 0, 100, 100))]
+         truths = [_item((60, 0, 160, 100))]
+         result = compute_layout_f1(predictions, truths, iou_threshold=0.5)
+         self.assertEqual(result["class_aware"]["tp"], 0)
+
+     def test_class_aware_vs_agnostic(self):
+         # Same bbox, different category -> agnostic matches, aware doesn't.
+         predictions = [_item((0, 0, 100, 100), "paragraph")]
+         truths = [_item((0, 0, 100, 100), "title")]
+         result = compute_layout_f1(predictions, truths)
+         self.assertEqual(result["class_aware"]["tp"], 0)
+         self.assertEqual(result["class_agnostic"]["tp"], 1)
+
+     def test_per_category_breakdown(self):
+         predictions = [_item((0, 0, 100, 100), "title"), _item((0, 200, 100, 300), "table")]
+         truths = [_item((0, 0, 100, 100), "title")]
+         result = compute_layout_f1(predictions, truths)
+         per_category = result["per_category"]
+         self.assertEqual(per_category["title"]["tp"], 1)
+         self.assertEqual(per_category["table"]["fp"], 1)
+
+     def test_empty_inputs_are_vacuously_correct(self):
+         self.assertEqual(compute_layout_f1([], [])["class_aware"]["f1"], 1.0)
+
+     def test_predictions_only_yields_zero(self):
+         result = compute_layout_f1([_item((0, 0, 10, 10))], [])
+         self.assertEqual(result["class_aware"]["fp"], 1)
+         self.assertEqual(result["class_aware"]["f1"], 0.0)
+
+     def test_page_num_must_match(self):
+         predictions = [_item((0, 0, 100, 100), "table", page_num=1)]
+         truths = [_item((0, 0, 100, 100), "table", page_num=2)]
+         result = compute_layout_f1(predictions, truths)
+         self.assertEqual(result["class_aware"]["tp"], 0)
+
+
+ class TestDocLayNetAdapter(unittest.TestCase):
+     def test_xywh_converted_and_categories_normalized(self):
+         ground_truth = {
+             "image": {"id": 5, "file_name": "p.png", "page_no": 5},
+             "annotations": [
+                 {"image_id": 5, "category_id": 1, "bbox": [10, 20, 50, 60]},
+                 {"image_id": 5, "category_id": 2, "bbox": [100, 0, 40, 30]},
+             ],
+             "categories": {1: {"name": "Title"}, 2: {"name": "Section-header"}},
+         }
+         truths = doclaynet_layout_truths(ground_truth)
+         self.assertEqual(len(truths), 2)
+         self.assertEqual(truths[0]["bbox"], (10.0, 20.0, 60.0, 80.0))
+         self.assertEqual(truths[0]["category"], "title")
+         self.assertEqual(truths[0]["page_num"], 5)
+         self.assertEqual(truths[1]["category"], "heading")
+
+     def test_invalid_annotations_dropped(self):
+         ground_truth = {
+             "image": {"id": 1, "file_name": "p.png"},
+             "annotations": [
+                 {"image_id": 1, "category_id": 1, "bbox": [0, 0, 0, 0]},
+                 {"image_id": 1, "category_id": 1},
+             ],
+             "categories": {1: {"name": "Text"}},
+         }
+         self.assertEqual(doclaynet_layout_truths(ground_truth), [])
+
+
+ class TestOmniDocBenchAdapter(unittest.TestCase):
+     def test_picks_layout_dets_first(self):
+         ground_truth = {
+             "layout_dets": [
+                 {"bbox": [0, 0, 100, 50], "category": "title", "page_num": 1},
+                 {"bbox": [0, 100, 100, 150], "category": "Table", "page": 1},
+             ]
+         }
+         truths = omnidocbench_layout_truths(ground_truth)
+         self.assertEqual(len(truths), 2)
+         self.assertEqual(truths[0]["category"], "title")
+         self.assertEqual(truths[1]["category"], "table")
+
+     def test_pages_nested_records(self):
+         ground_truth = {
+             "pages": [
+                 {"page_num": 1, "elements": [{"bbox": [0, 0, 10, 10], "category": "paragraph"}]},
+                 {"page_num": 2, "elements": [{"bbox": [0, 0, 10, 10], "category": "table"}]},
+             ]
+         }
+         truths = omnidocbench_layout_truths(ground_truth)
+         self.assertEqual(len(truths), 2)
+         self.assertEqual(truths[0]["page_num"], 1)
+         self.assertEqual(truths[1]["page_num"], 2)
+
+     def test_unknown_shape_returns_empty(self):
+         self.assertEqual(omnidocbench_layout_truths({"weird": "shape"}), [])
+         self.assertEqual(omnidocbench_layout_truths(None), [])
+
+
+ class TestParsedPredictions(unittest.TestCase):
+     def test_extracts_bboxes_from_elements_tables_figures(self):
+         parsed = ParsedDocument(
+             doc_id="d1",
+             source_path="/tmp/d1.pdf",
+             file_type="pdf",
+             elements=[
+                 Element(
+                     element_id="e1",
+                     doc_id="d1",
+                     page_num=1,
+                     type="title",
+                     text="Title",
+                     bbox=(0.0, 0.0, 100.0, 30.0),
+                 ),
+                 Element(
+                     element_id="e2",
+                     doc_id="d1",
+                     page_num=1,
+                     type="paragraph",
+                     text="No bbox",
+                 ),
+             ],
+             tables=[
+                 TableObject(
+                     table_id="t1",
+                     page_nums=[1],
+                     bbox=[(0.0, 100.0, 200.0, 200.0)],
+                 )
+             ],
+             figures=[
+                 FigureObject(
+                     figure_id="f1",
+                     page_num=2,
+                     bbox=(50.0, 50.0, 150.0, 250.0),
+                 )
+             ],
+             quality_report=QualityReport(),
+         )
+         predictions = parsed_layout_predictions(parsed)
+         categories = sorted(prediction["category"] for prediction in predictions)
+         self.assertEqual(categories, ["figure", "table", "title"])
+         self.assertEqual(len(predictions), 3)
+
+
+ class TestCanonicalCategory(unittest.TestCase):
+     def test_canonical_mapping(self):
+         self.assertEqual(canonical_category("Picture"), "figure")
+         self.assertEqual(canonical_category("Section-header"), "heading")
+         self.assertEqual(canonical_category("Page-footer"), "footer")
+         self.assertEqual(canonical_category("formula"), "formula")
+         self.assertEqual(canonical_category("Mystery"), "mystery")
+
+
+ if __name__ == "__main__":
+     unittest.main()
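
The matching rule these tests encode is standard detection scoring: a prediction matches a truth only when pages agree, IoU clears the threshold, and (in the class-aware variant) categories agree; F1 then follows from TP/FP/FN, with the empty-vs-empty case defined as vacuously perfect. A self-contained sketch of that computation:

```python
def iou(a, b):
    """Intersection-over-union of two (x0, y0, x1, y1) boxes."""
    ix0, iy0 = max(a[0], b[0]), max(a[1], b[1])
    ix1, iy1 = min(a[2], b[2]), min(a[3], b[3])
    inter = max(0.0, ix1 - ix0) * max(0.0, iy1 - iy0)
    area_a = (a[2] - a[0]) * (a[3] - a[1])
    area_b = (b[2] - b[0]) * (b[3] - b[1])
    union = area_a + area_b - inter
    return inter / union if union > 0 else 0.0


def layout_f1_sketch(predictions, truths, iou_threshold=0.5, class_aware=True):
    """Greedy one-to-one matching, then precision/recall/F1 from TP/FP/FN."""
    unmatched = list(truths)
    tp = 0
    for pred in predictions:
        for truth in unmatched:
            if pred["page_num"] != truth["page_num"]:
                continue  # boxes on different pages can never match
            if class_aware and pred["category"] != truth["category"]:
                continue
            if iou(pred["bbox"], truth["bbox"]) >= iou_threshold:
                unmatched.remove(truth)
                tp += 1
                break
    fp, fn = len(predictions) - tp, len(unmatched)
    if tp == fp == fn == 0:
        return {"tp": 0, "fp": 0, "fn": 0, "f1": 1.0}  # vacuously correct
    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    return {"tp": tp, "fp": fp, "fn": fn, "f1": f1}
```

For the sub-threshold case above, boxes (0, 0, 100, 100) and (60, 0, 160, 100) intersect in 4000 of 16000 union units, so IoU = 0.25 < 0.5 and no match is recorded.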
tests/test_logging.py ADDED
@@ -0,0 +1,125 @@
+ """Tests for the logging configuration and structured log emission."""
+
+ from __future__ import annotations
+
+ import io
+ import json
+ import logging
+ import tempfile
+ import unittest
+ from pathlib import Path
+ from unittest.mock import patch
+
+ from zsgdp.logging_config import configure_logging, get_logger
+ from zsgdp.pipeline import parse_document
+
+
+ class ConfigureLoggingTests(unittest.TestCase):
+     def setUp(self) -> None:
+         # Reset between tests so each one configures cleanly.
+         root = logging.getLogger("zsgdp")
+         for handler in list(root.handlers):
+             root.removeHandler(handler)
+
+     def test_idempotent_configuration(self):
+         stream = io.StringIO()
+         configure_logging(level="INFO", json_format=False, stream=stream)
+         configure_logging(level="INFO", json_format=False, stream=stream)
+         root = logging.getLogger("zsgdp")
+         # Idempotent: still exactly one handler attached.
+         self.assertEqual(len(root.handlers), 1)
+
+     def test_text_format_emits_human_readable(self):
+         stream = io.StringIO()
+         configure_logging(level="INFO", json_format=False, stream=stream)
+         get_logger("zsgdp.test").info("hello", extra={"doc_id": "d1"})
+         output = stream.getvalue()
+         self.assertIn("INFO", output)
+         self.assertIn("zsgdp.test", output)
+         self.assertIn("hello", output)
+
+     def test_json_format_emits_one_line_records(self):
+         stream = io.StringIO()
+         configure_logging(level="INFO", json_format=True, stream=stream)
+         get_logger("zsgdp.test").info("event", extra={"doc_id": "abc", "count": 3})
+         output = stream.getvalue().strip()
+         record = json.loads(output)
+         self.assertEqual(record["level"], "INFO")
+         self.assertEqual(record["logger"], "zsgdp.test")
+         self.assertEqual(record["message"], "event")
+         self.assertEqual(record["doc_id"], "abc")
+         self.assertEqual(record["count"], 3)
+
+     def test_default_level_is_warning(self):
+         stream = io.StringIO()
+         with patch.dict("os.environ", {"ZSGDP_LOG_LEVEL": "", "ZSGDP_LOG_JSON": ""}, clear=False):
+             configure_logging(stream=stream)
+         get_logger("zsgdp.test").info("hidden_at_default_level")
+         self.assertNotIn("hidden_at_default_level", stream.getvalue())
+         get_logger("zsgdp.test").warning("visible_at_default_level")
+         self.assertIn("visible_at_default_level", stream.getvalue())
+
+     def test_get_logger_namespacing(self):
+         self.assertEqual(get_logger().name, "zsgdp")
+         self.assertEqual(get_logger("zsgdp.foo").name, "zsgdp.foo")
+         # Bare names get prefixed.
+         self.assertEqual(get_logger("foo").name, "zsgdp.foo")
+
+
+ class PipelineLogEmissionTests(unittest.TestCase):
+     def test_parse_emits_start_and_end_records(self):
+         # Reset handlers so assertLogs works against the named logger.
+         root = logging.getLogger("zsgdp")
+         for handler in list(root.handlers):
+             root.removeHandler(handler)
+         root.setLevel(logging.DEBUG)
+         root.propagate = True
+
+         with tempfile.TemporaryDirectory() as tmp:
+             input_path = Path(tmp) / "doc.md"
+             input_path.write_text("# Doc\n\nHello.\n", encoding="utf-8")
+
+             with self.assertLogs("zsgdp.pipeline", level="INFO") as captured:
+                 parse_document(input_path, Path(tmp) / "out")
+
+         messages = [record.message for record in captured.records]
+         self.assertIn("parse_start", messages)
+         self.assertIn("parse_end", messages)
+
+         # Find a parse_end record and assert structured fields are present.
+         parse_end = next(record for record in captured.records if record.message == "parse_end")
+         self.assertTrue(hasattr(parse_end, "doc_id"))
+         self.assertTrue(hasattr(parse_end, "elapsed_seconds"))
+         self.assertTrue(hasattr(parse_end, "quality_score"))
+         self.assertTrue(hasattr(parse_end, "element_count"))
+
+
+ class RepairLogEmissionTests(unittest.TestCase):
+     def test_repair_emits_iteration_record(self):
+         root = logging.getLogger("zsgdp")
+         for handler in list(root.handlers):
+             root.removeHandler(handler)
+         root.setLevel(logging.DEBUG)
+         root.propagate = True
+
+         with tempfile.TemporaryDirectory() as tmp:
+             input_path = Path(tmp) / "report.md"
+             # Malformed table forces a repair iteration.
+             input_path.write_text(
+                 "# Report\n\n| A | B |\n| --- | --- |\n| 1 | 2 | 3 |\n",
+                 encoding="utf-8",
+             )
+
+             with self.assertLogs("zsgdp.repair.controller", level="INFO") as captured:
+                 parse_document(input_path, Path(tmp) / "out")
+
+         repair_records = [r for r in captured.records if r.message == "repair_iteration"]
+         self.assertGreaterEqual(len(repair_records), 1)
+         # Each record carries the iteration index.
+         for record in repair_records:
+             self.assertTrue(hasattr(record, "iteration"))
+             self.assertTrue(hasattr(record, "status"))
+
+
+ if __name__ == "__main__":
+     unittest.main()
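
The JSON test fixes the record shape: one JSON object per line carrying level, logger, and message, plus anything passed via `extra=`. A minimal formatter with that shape (the real zsgdp.logging_config presumably adds more, such as timestamps; the class name here is illustrative):

```python
import json
import logging

# Attributes present on every stock LogRecord; anything else came in via extra=.
_STANDARD = set(vars(logging.LogRecord("", 0, "", 0, "", (), None)))


class JsonLineFormatter(logging.Formatter):
    """Emit one JSON object per record, folding in extra= fields."""

    def format(self, record: logging.LogRecord) -> str:
        payload = {
            "level": record.levelname,
            "logger": record.name,
            "message": record.getMessage(),
        }
        for key, value in vars(record).items():
            if key not in _STANDARD and key not in payload:
                payload[key] = value  # fields supplied via extra={...}
        return json.dumps(payload)
```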
tests/test_markdown_normalizer.py ADDED
@@ -0,0 +1,63 @@
+ import unittest
+
+ from zsgdp.normalize.markdown import markdown_to_blocks, normalize_markdown_candidate, normalize_markdown_table
+
+
+ class MarkdownNormalizerTests(unittest.TestCase):
+     def test_markdown_to_blocks_preserves_pages_tables_and_images(self):
+         markdown = """# Report
+
+ Intro paragraph.
+
+ | Region | Q1 |
+ | --- | --- |
+ | NA | 10 |
+
+ <!-- page:2 -->
+
+ ## Figure Section
+
+ ![Chart caption](chart.png)
+ """
+
+         candidate = normalize_markdown_candidate(
+             markdown=markdown,
+             doc_id="d1",
+             source_path="sample.md",
+             file_type="markdown",
+             parser_name="test",
+         )
+
+         self.assertEqual([page["page_num"] for page in candidate.pages], [1, 2])
+         self.assertEqual(len(candidate.tables), 1)
+         self.assertEqual(candidate.tables[0].page_nums, [1])
+         self.assertEqual(len(candidate.figures), 1)
+         self.assertEqual(candidate.figures[0].page_num, 2)
+         self.assertEqual(candidate.figures[0].image_path, "chart.png")
+
+     def test_normalize_markdown_table_repairs_separator(self):
+         table = "| A | B |\n| --- | --- |\n| 1 | 2 |"
+
+         self.assertEqual(normalize_markdown_table(table), "| A | B |\n| --- | --- |\n| 1 | 2 |")
+
+     def test_normalize_plain_aligned_table(self):
+         table = "Region  Q1  Q2\nNorth America  10  12\nEurope  8  7"
+
+         self.assertEqual(
+             normalize_markdown_table(table),
+             "| Region | Q1 | Q2 |\n| --- | --- | --- |\n| North America | 10 | 12 |\n| Europe | 8 | 7 |",
+         )
+
+     def test_markdown_to_blocks_detects_plain_aligned_table(self):
+         blocks = markdown_to_blocks("# Report\n\nRegion  Q1  Q2\nNorth America  10  12\nEurope  8  7")
+
+         self.assertEqual(blocks[1].block_type, "table")
+
+     def test_markdown_to_blocks_classifies_caption(self):
+         blocks = markdown_to_blocks("Figure 1 Revenue trend")
+
+         self.assertEqual(blocks[0].block_type, "caption")
+
+
+ if __name__ == "__main__":
+     unittest.main()
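
Note the aligned-table fixtures use runs of two or more spaces as column separators (single spaces were collapsed by the diff viewer); that is what lets "North America" survive as one cell. The repair then amounts to splitting each row on those runs and emitting pipe rows with a separator after the header. A sketch under that assumption (the real normalizer also has to decide the block is a table at all):

```python
import re


def aligned_table_to_markdown(text: str) -> str:
    """Convert a whitespace-aligned plain table into a markdown pipe table."""
    rows = [re.split(r"\s{2,}", line.strip()) for line in text.splitlines() if line.strip()]
    header, body = rows[0], rows[1:]
    out = ["| " + " | ".join(header) + " |"]
    out.append("| " + " | ".join(["---"] * len(header)) + " |")
    out.extend("| " + " | ".join(row) + " |" for row in body)
    return "\n".join(out)
```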
tests/test_marker_parser.py ADDED
@@ -0,0 +1,73 @@
+ import tempfile
+ import unittest
+ from pathlib import Path
+ from unittest.mock import patch
+
+ from zsgdp.config import load_config
+ from zsgdp.parsers.external import MarkerParser, _read_external_markdown, _read_marker_markdown, normalize_marker_markdown
+ from zsgdp.schema import DocumentProfile, PageProfile
+
+
+ class MarkerParserTests(unittest.TestCase):
+     def test_normalize_marker_markdown_emits_common_schema(self):
+         profile = DocumentProfile(
+             doc_id="d1",
+             source_path="sample.pdf",
+             file_type="pdf",
+             page_count=1,
+             extension=".pdf",
+             pages=[PageProfile(page_num=1, digital_text_chars=20)],
+         )
+
+         candidate = normalize_marker_markdown(
+             markdown="# Report\n\n| A | B |\n| --- | --- |\n| 1 | 2 |\n\n![Chart](chart.png)",
+             profile=profile,
+             source_path="sample.pdf",
+         )
+
+         self.assertEqual(candidate.parser_name, "marker")
+         self.assertEqual(len(candidate.tables), 1)
+         self.assertEqual(len(candidate.figures), 1)
+         self.assertEqual(candidate.pages[0]["source_parser"], "marker")
+
+     def test_marker_parser_runs_markdown_through_normalizer(self):
+         profile = DocumentProfile(
+             doc_id="d1",
+             source_path="sample.pdf",
+             file_type="pdf",
+             page_count=1,
+             extension=".pdf",
+             pages=[PageProfile(page_num=1, digital_text_chars=20)],
+         )
+
+         with patch.object(MarkerParser, "available", return_value=True), patch(
+             "zsgdp.parsers.external.run_marker_to_markdown",
+             return_value="# Report\n\nBody.",
+         ):
+             candidate = MarkerParser().parse("sample.pdf", profile, load_config())
+
+         self.assertEqual(candidate.parser_name, "marker")
+         self.assertEqual(candidate.elements[0].source_parser, "marker")
+         self.assertEqual(candidate.provenance["requested_pages"], [1])
+
+     def test_read_marker_markdown_prefers_markdown_file(self):
+         with tempfile.TemporaryDirectory() as tmp:
+             root = Path(tmp)
+             nested = root / "sample"
+             nested.mkdir()
+             (nested / "other.md").write_text("# Other", encoding="utf-8")
+             (nested / "markdown.md").write_text("# Preferred", encoding="utf-8")
+
+             markdown = _read_marker_markdown(root)
+
+             self.assertEqual(markdown, "# Preferred")
+
+     def test_read_external_markdown_falls_back_to_stdout(self):
+         with tempfile.TemporaryDirectory() as tmp:
+             markdown = _read_external_markdown(Path(tmp), parser_name="mineru", stdout="# From stdout")
+
+             self.assertEqual(markdown, "# From stdout")
+
+
+ if __name__ == "__main__":
+     unittest.main()
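
The two `_read_*` tests define the output-resolution order for command-backed parsers: prefer a markdown.md anywhere under the tool's output tree, fall back to any other .md file, and finally to captured stdout. A sketch of that chain with a simplified signature (the real helpers take a parser_name and may order candidates differently):

```python
from pathlib import Path


def read_tool_markdown(output_dir: Path, stdout: str = "") -> str:
    """Resolve external-tool output: markdown.md, then any *.md, then stdout."""
    candidates = sorted(output_dir.rglob("*.md"))
    preferred = [path for path in candidates if path.name == "markdown.md"]
    if preferred:
        return preferred[0].read_text(encoding="utf-8")
    if candidates:
        return candidates[0].read_text(encoding="utf-8")
    return stdout  # nothing written to disk: fall back to captured stdout
```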
tests/test_merge.py ADDED
@@ -0,0 +1,134 @@
+ import unittest
+
+ from zsgdp.merge.dedupe import dedupe_elements, dedupe_tables
+ from zsgdp.schema import Element, TableObject
+
+
+ class MergeDedupeTests(unittest.TestCase):
+     def test_merges_docling_heading_with_pymupdf_bbox(self):
+         elements = [
+             Element(
+                 element_id="docling_p1_e1",
+                 doc_id="d1",
+                 page_num=1,
+                 type="heading",
+                 text="## Revenue Summary",
+                 markdown="## Revenue Summary",
+                 reading_order=1,
+                 confidence=0.88,
+                 source_parser="docling",
+             ),
+             Element(
+                 element_id="pymupdf_p1_e1",
+                 doc_id="d1",
+                 page_num=1,
+                 type="paragraph",
+                 text="Revenue Summary",
+                 bbox=(72.0, 100.0, 200.0, 124.0),
+                 reading_order=1,
+                 confidence=0.86,
+                 source_parser="pymupdf",
+             ),
+         ]
+
+         deduped = dedupe_elements(elements)
+
+         self.assertEqual(len(deduped), 1)
+         self.assertEqual(deduped[0].source_parser, "docling")
+         self.assertEqual(deduped[0].bbox, (72.0, 100.0, 200.0, 124.0))
+         self.assertEqual(deduped[0].provenance["bbox_source_parser"], "pymupdf")
+
+     def test_drops_paragraph_duplicate_of_structured_table(self):
+         elements = [
+             Element(
+                 element_id="docling_p1_e1",
+                 doc_id="d1",
+                 page_num=1,
+                 type="paragraph",
+                 text="Region Q1 Q2 North America 10 12 Europe 8 7",
+                 reading_order=1,
+                 confidence=0.88,
+                 source_parser="docling",
+             ),
+             Element(
+                 element_id="pymupdf_p1_e1",
+                 doc_id="d1",
+                 page_num=1,
+                 type="table",
+                 markdown="| Region | Q1 | Q2 |\n| --- | --- | --- |\n| North America | 10 | 12 |\n| Europe | 8 | 7 |",
+                 reading_order=1,
+                 confidence=0.72,
+                 source_parser="pymupdf",
+             ),
+         ]
+
+         deduped = dedupe_elements(elements)
+
+         self.assertEqual(len(deduped), 1)
+         self.assertEqual(deduped[0].type, "table")
+
+     def test_merges_duplicate_table_elements_and_keeps_better_grid(self):
+         elements = [
+             Element(
+                 element_id="docling_p1_e3",
+                 doc_id="d1",
+                 page_num=1,
+                 type="table",
+                 markdown="| Region | Q1 | Q2 North America | 10 | 12 Europe | 8 | 7 |\n| --- | --- | --- | --- | --- | --- | --- |",
+                 reading_order=3,
+                 confidence=0.88,
+                 source_parser="docling",
+             ),
+             Element(
+                 element_id="pymupdf_p1_e3",
+                 doc_id="d1",
+                 page_num=1,
+                 type="table",
+                 bbox=(72.0, 144.0, 237.0, 186.0),
+                 markdown="| Region | Q1 | Q2 |\n| --- | --- | --- |\n| North America | 10 | 12 |\n| Europe | 8 | 7 |",
+                 reading_order=3,
+                 confidence=0.72,
+                 source_parser="pymupdf",
+             ),
+         ]
+
+         deduped = dedupe_elements(elements)
+
+         self.assertEqual(len(deduped), 1)
+         self.assertEqual(deduped[0].source_parser, "pymupdf")
+         self.assertEqual(deduped[0].confidence, 0.88)
+         self.assertIn("| North America | 10 | 12 |", deduped[0].markdown or "")
+         self.assertEqual(deduped[0].bbox, (72.0, 144.0, 237.0, 186.0))
+
+     def test_merges_duplicate_tables_and_keeps_better_grid_assets(self):
+         tables = [
+             TableObject(
+                 table_id="docling_t1",
+                 page_nums=[1],
+                 markdown="| Region | Q1 | Q2 North America | 10 | 12 Europe | 8 | 7 |\n| --- | --- | --- | --- | --- | --- | --- |",
+                 confidence=0.84,
+                 source_parser="docling",
+             ),
+             TableObject(
+                 table_id="pymupdf_t1",
+                 page_nums=[1],
+                 bbox=[(72.0, 144.0, 237.0, 186.0)],
+                 markdown="| Region | Q1 | Q2 |\n| --- | --- | --- |\n| North America | 10 | 12 |\n| Europe | 8 | 7 |",
+                 confidence=0.72,
+                 source_parser="pymupdf",
+                 provenance={"crop_path": "/tmp/table.png"},
+             ),
+         ]
+
+         deduped = dedupe_tables(tables)
+
+         self.assertEqual(len(deduped), 1)
+         self.assertEqual(deduped[0].source_parser, "pymupdf")
+         self.assertEqual(deduped[0].confidence, 0.84)
+         self.assertEqual(deduped[0].bbox, [(72.0, 144.0, 237.0, 186.0)])
+         self.assertEqual(deduped[0].provenance["crop_path"], "/tmp/table.png")
+         self.assertEqual(deduped[0].provenance["source_parsers"], ["pymupdf", "docling"])
+
+
+ if __name__ == "__main__":
+     unittest.main()
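
"Better grid" in these fixtures separates a header-and-separator-only blob (the deliberately fused docling markdown) from a table with real body rows. A crude scoring heuristic, offered as one plausible reading rather than the actual dedupe logic, which also merges bbox, confidence, and provenance from the losing candidate as the assertions show:

```python
def grid_score(markdown: str) -> tuple[int, int]:
    """Crude table-grid quality: (has_body_rows, total_rows); higher wins."""
    rows = [line for line in markdown.splitlines() if line.strip().startswith("|")]
    body_rows = max(0, len(rows) - 2)  # rows beyond header + separator
    return (1 if body_rows else 0, len(rows))
```

By this score the pymupdf candidate, at (1, 4), beats docling's fused two-line grid at (0, 2).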
tests/test_parser_disagreement.py ADDED
@@ -0,0 +1,177 @@
+ """Tests for parser-disagreement and repair-success metrics."""
+
+ from __future__ import annotations
+
+ import tempfile
+ import unittest
+ from pathlib import Path
+
+ from zsgdp.merge.conflict_detection import build_candidate_conflict_report
+ from zsgdp.pipeline import parse_document
+ from zsgdp.schema import DocumentProfile, Element, ParseCandidate, PageProfile, TableObject
+ from zsgdp.verify.parser_disagreement import compute_parser_disagreement
+ from zsgdp.verify.repair_success import compute_repair_success
+
+
+ def _profile() -> DocumentProfile:
+     return DocumentProfile(
+         doc_id="d1",
+         source_path="/tmp/d1.md",
+         file_type="markdown",
+         page_count=1,
+         extension=".md",
+         pages=[PageProfile(page_num=1, digital_text_chars=400, digital_text_quality=0.9)],
+     )
+
+
+ def _candidate(name: str, *, text: str, table_count: int = 0) -> ParseCandidate:
+     elements = [
+         Element(
+             element_id=f"{name}_e1",
+             doc_id="d1",
+             page_num=1,
+             type="paragraph",
+             text=text,
+             reading_order=1,
+             source_parser=name,
+         )
+     ]
+     tables: list[TableObject] = []
+     for index in range(table_count):
+         tables.append(
+             TableObject(
+                 table_id=f"{name}_t{index + 1}",
+                 page_nums=[1],
+                 markdown="| A | B |\n| --- | --- |\n| 1 | 2 |",
+                 source_parser=name,
+             )
+         )
+     return ParseCandidate(
+         parser_name=name,
+         doc_id="d1",
+         source_path="/tmp/d1.md",
+         file_type="markdown",
+         elements=elements,
+         tables=tables,
+         figures=[],
+         pages=[{"page_num": 1, "source_parser": name}],
+         confidence=0.8,
+     )
+
+
+ class TestParserDisagreement(unittest.TestCase):
+     def test_disagreement_rate_uses_pair_count_denominator(self):
+         candidates = [
+             _candidate("docling", text="A" * 800, table_count=4),
+             _candidate("pymupdf", text="A" * 100, table_count=0),
+         ]
+         report = build_candidate_conflict_report(candidates)
+         parser_metrics = {
+             "docling": {"parser": "docling", "failed": False},
+             "pymupdf": {"parser": "pymupdf", "failed": False},
+         }
+
+         result = compute_parser_disagreement(report, parser_metrics)
+
+         self.assertEqual(result["candidate_count"], 2)
+         self.assertEqual(result["parser_pair_count"], 1)
+         self.assertGreater(result["conflict_count"], 0)
+         self.assertGreater(result["disagreement_rate"], 0.0)
+         self.assertIn("text_coverage_gap", result["disagreement_by_type"])
+         self.assertIn("docling|pymupdf", result["disagreement_by_parser_pair"])
+
+     def test_disagreement_rate_zero_when_single_parser(self):
+         result = compute_parser_disagreement(
+             {"conflicts": []},
+             {"docling": {"parser": "docling", "failed": False}},
+         )
+
+         self.assertEqual(result["candidate_count"], 1)
+         self.assertEqual(result["parser_pair_count"], 0)
+         self.assertEqual(result["disagreement_rate"], 0.0)
+
+     def test_failed_parsers_excluded_from_pair_count(self):
+         result = compute_parser_disagreement(
+             {"conflicts": []},
+             {
+                 "docling": {"parser": "docling", "failed": False},
+                 "marker": {"parser": "marker", "failed": True, "error": "boom"},
+                 "pymupdf": {"parser": "pymupdf", "failed": False},
+             },
+         )
+
+         self.assertEqual(result["candidate_count"], 2)
+         self.assertEqual(result["parser_pair_count"], 1)
+
+
+ class TestRepairSuccess(unittest.TestCase):
+     def test_resolution_rate_when_blocking_issue_resolved(self):
+         pre = {"score": 0.5, "issues": [{"issue_type": "invalid_table", "blocking": True, "page_num": 1, "region_id": "t1"}]}
+         post = {"score": 0.9, "issues": []}
+         history = [{"iteration": 1, "before_score": 0.5, "after_score": 0.9, "actions": [{"action": "repair_table"}]}]
+
+         result = compute_repair_success(pre, post, history)
+
+         self.assertEqual(result["pre_repair_blocking_count"], 1)
+         self.assertEqual(result["post_repair_blocking_count"], 0)
+         self.assertEqual(result["resolved_blocking_count"], 1)
+         self.assertEqual(result["repair_resolution_rate"], 1.0)
+         self.assertEqual(result["repair_regression_rate"], 0.0)
+         self.assertEqual(result["iteration_count"], 1)
+         self.assertAlmostEqual(result["score_delta"], 0.4, places=6)
+
+     def test_regression_rate_counts_new_blocking_issues(self):
+         pre = {"score": 0.7, "issues": [{"issue_type": "invalid_table", "blocking": True, "region_id": "t1"}]}
+         post = {
+             "score": 0.6,
+             "issues": [
+                 {"issue_type": "invalid_table", "blocking": True, "region_id": "t1"},
+                 {"issue_type": "missing_text_coverage", "blocking": True, "page_num": 2},
+             ],
+         }
+         history = [{"iteration": 1, "before_score": 0.7, "after_score": 0.6, "actions": []}]
+
+         result = compute_repair_success(pre, post, history)
+
+         self.assertEqual(result["resolved_blocking_count"], 0)
+         self.assertEqual(result["regressed_blocking_count"], 1)
+         self.assertEqual(result["repair_regression_rate"], 1.0)
+         self.assertEqual(result["repair_resolution_rate"], 0.0)
+
+     def test_vacuous_success_when_no_pre_repair_blocking_issues(self):
+         result = compute_repair_success(
+             {"score": 1.0, "issues": []},
+             {"score": 1.0, "issues": []},
+             [],
+         )
+
+         self.assertEqual(result["repair_resolution_rate"], 1.0)
+         self.assertEqual(result["repair_regression_rate"], 0.0)
+         self.assertEqual(result["iteration_count"], 0)
+
+
+ class TestRepairSuccessIntegration(unittest.TestCase):
+     def test_pipeline_records_resolution_for_iterative_table_repair(self):
+         with tempfile.TemporaryDirectory() as tmp:
+             input_path = Path(tmp) / "report.md"
+             input_path.write_text(
+                 "# Report\n\n| A | B |\n| --- | --- |\n| 1 | 2 | 3 |\n",
+                 encoding="utf-8",
+             )
+
+             parsed = parse_document(input_path, Path(tmp) / "out")
+
+         metrics = parsed.quality_report.metrics
+         self.assertIn("repair_resolution_rate", metrics)
+         self.assertIn("repair_regression_rate", metrics)
+         self.assertIn("parser_disagreement_rate", metrics)
+
+         success = parsed.provenance["repair_success"]
+         self.assertGreaterEqual(success["pre_repair_issue_count"], 1)
+         self.assertGreaterEqual(success["resolved_any_count"], 1)
+         self.assertGreaterEqual(success["repair_resolution_rate_any"], 0.0)
+         self.assertGreater(success["iteration_count"], 0)
+
+
+ if __name__ == "__main__":
+     unittest.main()
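
The disagreement metric normalizes conflict count by the number of comparable parser pairs, where failed parsers form no pairs, so a single surviving parser scores 0.0 by definition. A sketch of that denominator logic (field names follow the test fixtures; the real function also buckets conflicts by type and by parser pair):

```python
from itertools import combinations


def disagreement_rate(conflicts, parser_metrics):
    """conflicts: list of conflict records; parser_metrics: name -> {"failed": bool}."""
    active = [name for name, m in parser_metrics.items() if not m.get("failed")]
    pair_count = len(list(combinations(active, 2)))  # C(n, 2) comparable pairs
    if pair_count == 0:
        return 0.0  # one parser (or none): nothing to disagree with
    return len(conflicts) / pair_count
```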