Spaces · Running on Zero
Arjunvir Singh committed · db06ffa

Initial commit: zeroshotGPU MVP with full eval surface
Profiler, router, parser registry, schema, merger with conflict detection,
verifier (coverage/reading-order/table/figure/formula/chunk-readiness plus
GT-comparison: layout F1, table structure, formula CER, retrieval recall),
iterative repair loop with optional GPU escalation, agentic chunker,
benchmark suite (per-doc + per-parser + ablation + cross-dataset),
Gradio Spaces UI with abuse guards + per-artifact downloads, structured
JSON logging, preflight runner, regression-fixture format with perf
floors, .env loading, pre-commit/pre-push hooks, CONTRIBUTING.md +
docs/space_smoke.md, scripts/run_space_smoke.py runner.
Test count: 240/240 passing.
This view is limited to 50 files because the commit contains too many changes.
- .env.example +16 -0
- .gitignore +15 -0
- .pre-commit-config.yaml +44 -0
- CHANGELOG.md +274 -0
- CONTRIBUTING.md +235 -0
- Makefile +49 -0
- README.md +287 -0
- app.py +251 -0
- configs/default.yaml +159 -0
- configs/docling.yaml +29 -0
- configs/gpu.yaml +43 -0
- configs/parsers.yaml +33 -0
- configs/routing.yaml +8 -0
- docs/space_smoke.md +269 -0
- examples/parse_folder.py +27 -0
- examples/parse_pdf.py +25 -0
- examples/run_benchmark.py +33 -0
- pyproject.toml +41 -0
- requirements.txt +33 -0
- scripts/__init__.py +0 -0
- scripts/run_space_smoke.py +455 -0
- tests/__init__.py +1 -0
- tests/regression/README.md +97 -0
- tests/regression/__init__.py +0 -0
- tests/regression/fixtures/markdown_basic.expected.json +31 -0
- tests/regression/fixtures/markdown_basic.input.md +14 -0
- tests/regression/test_regression.py +255 -0
- tests/test_ablation_runner.py +133 -0
- tests/test_app.py +141 -0
- tests/test_artifacts.py +82 -0
- tests/test_benchmark.py +55 -0
- tests/test_chunking.py +286 -0
- tests/test_cli_help.py +91 -0
- tests/test_conflict_detection.py +89 -0
- tests/test_cross_dataset.py +123 -0
- tests/test_datasets.py +152 -0
- tests/test_deployment.py +43 -0
- tests/test_docling_parser.py +39 -0
- tests/test_embedding_retriever.py +190 -0
- tests/test_env_loading.py +110 -0
- tests/test_external_parser_adapters.py +69 -0
- tests/test_gpu_runner.py +185 -0
- tests/test_gpu_runtime.py +47 -0
- tests/test_gpu_tasks.py +99 -0
- tests/test_layout_f1.py +190 -0
- tests/test_logging.py +125 -0
- tests/test_markdown_normalizer.py +63 -0
- tests/test_marker_parser.py +73 -0
- tests/test_merge.py +134 -0
- tests/test_parser_disagreement.py +177 -0
.env.example
ADDED
@@ -0,0 +1,16 @@
# Copy to .env and fill in. .env is gitignored; .env.example is committed.
# Loaded automatically by zsgdp.config.load_env_file() when CLI / app starts.

# Hugging Face Hub access token. Required for gated models like jina-v3
# (the embedding retriever) and any private model id used in gpu.models.
# Read transparently by transformers / sentence-transformers when set.
HF_TOKEN=

# Logging — see zsgdp/logging_config.py.
# ZSGDP_LOG_LEVEL=INFO
# ZSGDP_LOG_JSON=1

# Pipeline overrides.
# ZSGDP_CONFIG_PATH=configs/docling.yaml
# ZSGDP_MAX_UPLOAD_BYTES=52428800
# ZSGDP_MAX_PAGE_COUNT=200

.gitignore
ADDED
@@ -0,0 +1,15 @@
__pycache__/
*.py[cod]
.pytest_cache/
.mypy_cache/
.ruff_cache/
.venv/
venv/
out/
parsed/
benchmarks/results/

# Secrets — never commit. Loaded by zsgdp.config.load_env_file() at runtime.
.env
.env.*
!.env.example

.pre-commit-config.yaml
ADDED
@@ -0,0 +1,44 @@
# Pre-commit and pre-push hooks for zeroshotGPU.
#
# Install once with:
#   python -m pip install pre-commit
#   pre-commit install --install-hooks --hook-type pre-commit --hook-type pre-push
#
# pre-commit runs only fast static checks on every commit so the developer
# loop stays tight. The slow `preflight` runs at pre-push time so it gates
# what reaches the remote without slowing down individual commits.

default_language_version:
  python: python3.11

repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v5.0.0
    hooks:
      - id: trailing-whitespace
        stages: [pre-commit]
      - id: end-of-file-fixer
        stages: [pre-commit]
      - id: check-yaml
        stages: [pre-commit]
        # The simple YAML in configs/*.yaml uses a tiny subset; check-yaml
        # is fine. `app_file` etc. in README.md aren't real YAML headers
        # — they're HF Spaces front-matter and excluded from this hook.
        exclude: ^README\.md$
      - id: check-json
        stages: [pre-commit]
      - id: check-added-large-files
        stages: [pre-commit]
        args: ["--maxkb=2048"]
      - id: check-merge-conflict
        stages: [pre-commit]

  - repo: local
    hooks:
      - id: zsgdp-preflight
        name: zsgdp preflight (unit + regression + space-check + parsers)
        entry: python -m zsgdp.cli preflight --root .
        language: system
        pass_filenames: false
        stages: [pre-push]
        always_run: true

CHANGELOG.md
ADDED
@@ -0,0 +1,274 @@
# Changelog

All notable changes to zeroshotGPU. Format follows
[Keep a Changelog](https://keepachangelog.com/en/1.1.0/); versions follow
[Semantic Versioning](https://semver.org/spec/v2.0.0.html), but the project is
pre-1.0, so minor bumps may include breaking changes.

## [Unreleased]

### Documentation — README restructured

- Reorganised into Install → Quick start → Opt-ins → Outputs →
  Architecture map → Production benchmark numbers → Deployment →
  Contributing.
- New "Production benchmark numbers" placeholder table with §29 success
  criteria recalled inline; columns are `Metric / Dataset / Value / Date / Run`
  so the operator pastes real numbers in after running `make space-smoke`
  and `make benchmark` on the Space.
- Optional-extras table (`embedding`, `gpu_repair`, `spaces`) documents
  what each extra adds and the config flag that requires it.
- Architecture quick-map turned into a table; one row per top-level
  module with its responsibility.
- Deployment section is now a numbered checklist that ends with
  "update the production-benchmark table."

### Added — Space smoke validation runner

- `scripts/run_space_smoke.py` automates the five smokes documented in
  `docs/space_smoke.md`. One command runs whichever smokes have their
  deps installed; missing deps surface as `skip` results with explicit
  `pip install` hints, not crashes.
- Five smokes: `lexical` (model-free benchmark), `ablation` (per-parser
  runner), `embedding` (sentence-transformers + jina-v3 lazy-load probe),
  `gpu_repair` (dry-run plan + repair-loop iteration check — *does not*
  download multi-GB Qwen weights, and defers live invocation to
  `run-gpu-tasks --execute`), and `marker` (binary detection + registry
  availability).
- `--strict` mode treats skipped smokes as failures; `--output PATH`
  emits a structured JSON report with per-smoke `detail`, elapsed
  seconds, status (`pass`/`fail`/`skip`/`error`), and aggregate summary
  counts (a shape sketch follows after this list).
- 14 new tests covering registry membership, report aggregation,
  text formatting per status, strict-mode skip-as-failure, end-to-end
  smoke execution for the three model-free smokes, and skip-path
  structure for the model-dependent ones.
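
For orientation, a sketch of the JSON report shape those bullets describe. The field names follow the prose above (per-smoke `detail`, elapsed seconds, four-valued status, aggregate counts), but the exact keys and nesting are assumptions, not the runner's confirmed schema:

```json
{
  "smokes": [
    {"name": "lexical", "status": "pass", "elapsed_seconds": 3.2,
     "detail": "model-free benchmark completed"},
    {"name": "embedding", "status": "skip",
     "detail": "sentence-transformers not installed; pip install '.[embedding]'"}
  ],
  "summary": {"pass": 1, "fail": 0, "skip": 1, "error": 0}
}
```
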
### Added — per-artifact downloads in the Space UI

- New "Artifacts" tab in `app.py` exposes each top-level artifact
  (`parsed_document.json`, `document.md`, `chunks.jsonl`,
  `quality_report.json`, etc. — 16 candidate files) as an individual
  download via `gr.Files`. The bundled zip stays as it was for archival,
  and nested asset dirs (`assets/pages/*.png`, `assets/tables/*.png`) are
  intentionally excluded from the per-artifact list — they can be large
  and the zip already covers them.
- The artifact list is built from `_INDIVIDUAL_ARTIFACT_NAMES` in
  declaration order so the UI listing is stable across runs. Missing
  files are silently skipped (different parses emit different subsets;
  e.g. `conflict_report.json` only appears when multiple parsers ran).
- All return paths in `parse_uploaded_document` now go through a single
  `_empty_outputs(...)` helper so the tuple width can't drift between
  success and the four error paths (sketched after this list). A new
  drift-guard test asserts `len(outputs) == 11` for every error path.
- Summary JSON now includes `individual_artifact_count`.
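
The drift-guard pattern reduces to one helper that owns the tuple width. A minimal sketch, assuming slot 0 is the status box (the real slot layout isn't shown in this view):

```python
_N_OUTPUTS = 11  # assumed width, matching the drift-guard test above

def _empty_outputs(error_message: str) -> tuple:
    """Full-width Gradio output tuple with only the status slot populated."""
    outputs: list = [None] * _N_OUTPUTS
    outputs[0] = error_message  # assumption: slot 0 renders the error
    return tuple(outputs)
```

With every error path returning through one helper, a widened UI only has to update `_N_OUTPUTS` and the success path.
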
### Added — CLI help with examples

- Each non-trivial CLI subcommand (`parse`, `parse-folder`, `space-check`,
  `run-gpu-tasks`, `benchmark`, `benchmark-ablate`, `preflight`,
  `combine-benchmarks`, `export-chunks`, `validate-artifacts`, plus the
  top-level help) now ships with an `Examples:` block in its `--help`
  output. Multi-line shell snippets render via
  `argparse.RawDescriptionHelpFormatter` plus a textwrap-dedent helper so
  the source-side indentation doesn't leak into the rendered output (see
  the sketch after this list).
- `zsgdp run-gpu-tasks --help` now explicitly contrasts the dry-run
  default against `--execute`, matching the safety contract of
  `repair.execute_gpu_escalations` in config.
- 9 new tests guarding: the epilog dedent helper, blank-line preservation
  in epilogs, the top-level help listing examples, and per-subcommand
  examples covering their distinguishing flags (e.g. `benchmark` shows
  all three dataset modes; `combine-benchmarks` shows label pairing).
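
The rendering mechanism is plain stdlib argparse. A minimal sketch with a hypothetical `_epilog` helper (the project's actual helper name isn't shown in this diff):

```python
import argparse
import textwrap

def _epilog(text: str) -> str:
    """Dedent a triple-quoted epilog so source indentation doesn't render."""
    return textwrap.dedent(text).strip("\n")

parser = argparse.ArgumentParser(
    prog="zsgdp benchmark",
    formatter_class=argparse.RawDescriptionHelpFormatter,  # preserve line breaks
    epilog=_epilog(
        """
        Examples:
          python -m zsgdp.cli benchmark --input ./docs --output ./bench
          python -m zsgdp.cli benchmark --input ./omni --dataset omnidocbench --output ./bench/omni
        """
    ),
)
```
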
### Added — contributor onboarding

- `CONTRIBUTING.md` documenting setup, hooks, test layout, fixture
  format, the parser/metric/schema-bump procedures, logging conventions,
  the PR checklist, and an architecture quick-map.
- `.pre-commit-config.yaml` with two stages:
  - **pre-commit**: trailing whitespace, end-of-file fixer, JSON/YAML
    syntax, large-file guard (2 MB cap), merge-conflict markers.
  - **pre-push**: runs `python -m zsgdp.cli preflight` so a failing
    preflight blocks the push. The external hook repo is pinned to a
    specific tag (no `master`/`HEAD` references).
- `tests/test_repo_hygiene.py` (6 tests) — guards that `.env` is in
  `.gitignore`, that `.env.example` is committed and contains no
  real-shape secrets, that the pre-commit config has the preflight hook
  on the pre-push stage with a pinned external repo, that
  `CONTRIBUTING.md` references the preflight workflow and Space smoke
  checklist, and that `CHANGELOG.md` has an `[Unreleased]` section.

### Added — performance baselines

- Regression fixture format gains an optional `performance` block:
  `repeats`, `max_elapsed_seconds`, `min_pages_per_second`,
  `always_enforce`. The runner parses each fixture N times and compares
  the median against the floor — the cold-import outlier on the first
  run is stripped automatically (a sketch follows after this list).
- Default opt-in via `ZSGDP_REGRESSION_PERF=1`; per-fixture override via
  `always_enforce: true`. Floors are intended as catastrophic-regression
  guards, not tight perf bars.
- Seed fixture `markdown_basic` ships with a 2.0 s / 0.5 pps floor
  (~80x slack against a measured ~6 ms median) so it exercises the path
  without flaking on slow CI.
- 5 new unit tests for the perf evaluator: the max-elapsed and min-pps
  floors trip correctly, the median strips cold outliers, and env-var
  gating honours `always_enforce`.
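
The evaluator's core comparison is small. A sketch of the median check, assuming a plain list of per-repeat timings (the median, not the mean, is what makes the cold first run harmless):

```python
import statistics

def elapsed_floor_ok(samples_s: list[float], max_elapsed_s: float) -> bool:
    """True when the median of N repeat timings is under the fixture's floor.

    A single cold-import outlier on the first run cannot move the median,
    so it is effectively stripped without special-casing it.
    """
    return statistics.median(samples_s) <= max_elapsed_s
```
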
### Added — preflight + secrets

- Preflight runner: `zsgdp preflight` CLI subcommand and `make preflight`
  target. Chains `unittest discover`, regression fixtures, `space-check`,
  and `parsers` registry sanity. `--benchmark` adds an end-to-end smoke
  against the regression fixtures. Each step's output is suppressed on
  success and surfaced on failure; a one-line summary is always printed.
- `Makefile` with targets `test`, `regression`, `space-check`, `parsers`,
  `preflight`, `preflight-full`, `benchmark`, `clean`.
- `.env` loading via `zsgdp.config.load_env_file()`. Read at CLI start
  and at `app.py` import; pre-set environment variables always win, so
  Space-side secrets are never overridden. `.env.example` shipped as the
  template.
- `.env`/`.env.*` added to `.gitignore` (`.env.example` whitelisted).
- `zsgdp.config.hf_token()` resolves `HF_TOKEN`,
  `HUGGING_FACE_HUB_TOKEN`, and `HUGGINGFACE_TOKEN` in priority order
  (a sketch follows after this list).
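
A minimal sketch of that resolution order; the function name and variable order come from the bullet, while returning `None` when nothing is set is an assumption:

```python
import os

_TOKEN_VARS = ("HF_TOKEN", "HUGGING_FACE_HUB_TOKEN", "HUGGINGFACE_TOKEN")

def hf_token() -> str | None:
    """First non-empty token variable wins, in the priority order above."""
    for name in _TOKEN_VARS:
        value = os.environ.get(name, "").strip()
        if value:
            return value
    return None
```
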
### Added — structured logging

- `zsgdp.logging_config` with an idempotent `configure_logging()`. The
  default level is WARNING; opt in via `ZSGDP_LOG_LEVEL`. Optional
  one-line JSON records via `ZSGDP_LOG_JSON=1`; structured `extra={...}`
  fields are promoted to top-level keys for HF Spaces logs / Loki /
  Datadog (a formatter sketch follows after this list).
- Wired into the pipeline (`parse_start`, `parser_candidate`,
  `parser_failed`, `parse_end`), the repair controller
  (`repair_iteration`), the GPU worker (`gpu_task_executed`,
  `gpu_task_blocked`), the CLI, and `app.py`. The app auto-enables JSON
  mode when `SPACE_ID` is set.
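
Promoting `extra={...}` fields is a standard stdlib-logging trick. A compact sketch, not the project's actual formatter: it diffs a record's attributes against a baseline `LogRecord` to find caller-supplied fields.

```python
import json
import logging

# Attributes every LogRecord has; anything else came from extra={...}.
_STD_ATTRS = set(vars(logging.LogRecord("", 0, "", 0, "", (), None)))

class JsonLineFormatter(logging.Formatter):
    """One-line JSON records with extra fields promoted to top level."""

    def format(self, record: logging.LogRecord) -> str:
        payload = {
            "level": record.levelname,
            "logger": record.name,
            "message": record.getMessage(),
        }
        payload.update(
            {k: v for k, v in vars(record).items() if k not in _STD_ATTRS}
        )
        return json.dumps(payload, default=str)
```
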
### Added — deployment-readiness pass

- Pinned upper bounds on all `requirements.txt` and `pyproject.toml`
  dependencies. Added explicit `embedding` and `gpu_repair` extras so the
  optional sentence-transformers / transformers stacks can be installed
  without dragging the whole `spaces` extra in.
- Abuse / cost guards in the Gradio Space entrypoint (`app.py`):
  `MAX_UPLOAD_BYTES` (50 MB default) and `MAX_PAGE_COUNT` (200 default),
  both overridable via `ZSGDP_MAX_*` env vars. Oversized uploads are
  rejected with a clear UI error before parsing starts.
- `SCHEMA_VERSION` constant and `ParsedDocument.schema_version` field.
  Surfaced into the artifact manifest as
  `parsed_document_schema_version` alongside the existing manifest
  `schema_version`. The validation report echoes both so consumers can
  gate.
- Regression fixture format under `tests/regression/`: a YAML-style
  `*.expected.json` tolerance spec paired with an input document. The
  runner auto-discovers fixtures and asserts on tolerances (counts,
  score, markdown contains/excludes, repair/disagreement rate ranges).
  One seed fixture shipped (`markdown_basic`).

### Added — eval surface

- Per-parser GT-comparison metrics within a single merged run
  (`zsgdp/benchmarks/per_parser_metrics.py`). Reads pre-merge candidate
  snapshots from `parsed.provenance.candidates` and computes layout F1 /
  table structure / formula CER per parser against the same GT. Surfaced
  as `per_parser_metrics.csv` and the per-doc field `per_parser_metrics`.
- Per-parser cross-doc leaderboard rollup
  (`per_parser_gt_leaderboard.csv`) with truth-aware filtering: a metric
  contributes to a parser's mean only when that parser was actually
  evaluated against truths for that metric on that document.
- Cross-dataset comparison (`zsgdp/benchmarks/cross_dataset.py`) with a
  `combine-benchmarks` CLI subcommand. Combines multiple `results.json`
  summaries into `dataset_summary.csv` and a parser-vs-dataset matrix.
  Missing metrics surface as `None` rather than 0.0 so callers can
  distinguish absent from true-zero.
- Embedding-based retriever (`zsgdp/benchmarks/embedding_retriever.py`)
  satisfying the `Retriever` protocol. Defaults to lexical (model-free,
  CI-safe); opt in via `benchmarks.retriever.backend=embedding` in
  config. Lazy-loads `sentence-transformers` on first use; falls back
  cleanly when unavailable.
- Layout F1 against ground-truth bbox annotations
  (`zsgdp/verify/layout_f1.py`). Class-aware and class-agnostic scores
  side-by-side, with a per-category breakdown. DocLayNet COCO and
  OmniDocBench JSON adapters in `zsgdp/benchmarks/ground_truth.py`.
- Table structure similarity (`zsgdp/verify/table_structure.py`):
  shape similarity × multiset cell-content F1, greedy bipartite matching.
- Formula extraction CER (`zsgdp/verify/formula_extraction.py`):
  Levenshtein-based, normalized for whitespace and `$`/`$$` delimiters
  (a sketch follows after this list).
- Retrieval-readiness metrics (`zsgdp/verify/retrieval.py`): recall@k,
  citation accuracy@k, mean reciprocal rank. Synthetic QA generator
  (`zsgdp/benchmarks/retrieval.py`) using distinctive sentences.
- Parser-disagreement rate (`zsgdp/verify/parser_disagreement.py`):
  conflict count over parser-pair count from the merger's existing
  conflict report.
- Repair success / regression rates (`zsgdp/verify/repair_success.py`):
  pre/post issue identity diff; iteration history, score delta, action
  counts.
- Parser contribution counts: which parser's elements survived the
  merge, surfaced as per-doc and aggregate fractions.
- Parser ablation runner (`zsgdp/benchmarks/ablation_runner.py`) with a
  `benchmark-ablate` CLI subcommand. Runs the benchmark once per parser
  in isolation plus a merged arm, and emits a comparison CSV.
- Three dataset loaders (`zsgdp/benchmarks/datasets.py`):
  `custom_folder`, `omnidocbench`, `doclaynet`. `DatasetDocument`
  dataclass; registry pattern for downstream extension.
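
A minimal sketch of a CER along those lines. The normalization scope (collapse whitespace, strip `$`/`$$` delimiters) comes from the bullet; the DP edit distance and the vacuous empty-truth case are assumptions:

```python
import re

def _normalize(tex: str) -> str:
    """Strip $ / $$ delimiters and collapse runs of whitespace."""
    return re.sub(r"\s+", " ", tex.strip().strip("$")).strip()

def _levenshtein(a: str, b: str) -> int:
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        cur = [i]
        for j, cb in enumerate(b, 1):
            cur.append(min(prev[j] + 1,                 # delete from a
                           cur[-1] + 1,                 # insert into a
                           prev[j - 1] + (ca != cb)))   # substitute
        prev = cur
    return prev[-1]

def formula_cer(predicted: str, truth: str) -> float:
    pred, gt = _normalize(predicted), _normalize(truth)
    if not gt:
        return 0.0 if not pred else 1.0  # assumption: empty truth is vacuous
    return _levenshtein(pred, gt) / len(gt)
```
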
### Added — pipeline

- Iterative repair loop in `pipeline.py`: bounded by
  `repair.max_iterations`, terminates on quality-accepted OR
  no-changes-this-pass. Per-iteration history lands under
  `provenance.repair_iterations` (a control-flow sketch follows after
  this list).
- GPU repair escalation wired into `repair/controller.py`. Plans
  same-schema GPU tasks for invalid tables, OCR/text coverage issues,
  reading-order failures, and figure issues, then dispatches via
  `GPUWorker`. The default is safe (`repair.gpu_escalation=true`,
  `repair.execute_gpu_escalations=false`); flip the second to invoke
  the configured backend.
- Per-parser candidate snapshots persisted in
  `parsed.provenance.candidates` so per-parser GT metrics can be
  computed without re-parsing.
- Real Marker and Unstructured normalizers
  (`zsgdp/normalize/normalize_marker.py` and
  `normalize_unstructured.py`) wired through `parsers/external.py`.
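
The termination contract reduces to a short loop. A sketch with the verifier and repairer passed in as callables, since their real signatures aren't shown here:

```python
from typing import Any, Callable

def repair_loop(
    doc: Any,
    verify: Callable[[Any], tuple[bool, float]],       # -> (accepted, score)
    apply_repairs: Callable[[Any], tuple[Any, bool]],  # -> (doc, changed)
    max_iterations: int = 3,
) -> tuple[Any, list[dict]]:
    """Bounded verify/repair: stop on acceptance or a pass that changes nothing."""
    history: list[dict] = []
    for iteration in range(max_iterations):
        accepted, score = verify(doc)
        if accepted:
            break
        doc, changed = apply_repairs(doc)
        history.append({"iteration": iteration, "score": score, "changed": changed})
        if not changed:
            break  # a no-change pass would never converge; bail out
    return doc, history
```
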
### Changed

- `requirements.txt` no longer pins `torch`; the HF Spaces image
  preinstalls a CUDA-matched build, and pinning here would override it.
- `--gpu-workers` flag help text clarified — the value is recorded for
  downstream task-execution accounting, but document parsing uses
  `--workers`.
- The `--dataset` benchmark flag now selects the loader name
  (default `custom_folder`); `custom` / `folder` / `default` are accepted
  as aliases. The previous behaviour was a freeform reporting label only.
- The embedding-retriever toy hashing test now uses `hashlib.md5`-based
  stable hashing instead of `builtins.hash()`, fixing per-process
  flakiness.

### Documentation

- `tests/regression/README.md` documents the fixture format.
- `configs/default.yaml` and `configs/docling.yaml` annotated to explain
  the new `repair.execute_gpu_escalations` flag and the deliberate
  Docling+PyMuPDF dual-enable for the disagreement metric.

### Test count

- 181 tests pass (was 4 at the start of the eval-surface work).

## [0.1.0] — initial MVP

- Profiler, page router, parser registry (text, pymupdf, docling, plus
  shell-out adapters for marker / mineru / olmocr / paddleocr /
  unstructured).
- Canonical schema (`Element`, `TableObject`, `FigureObject`, `Chunk`,
  `ParsedDocument`, `QualityReport`).
- Merger with conflict detection, quality verifier (coverage, reading
  order, table validity, chunk readiness), deterministic repair
  controller.
- Agentic chunker with fixed-token / recursive-structure / parent-child /
  page-level / table / figure strategies; semantic / late /
  vision-guided / proposition stubs.
- Artifact manifest with SHA-256 checksums, `validate-artifacts` CLI.
- Gradio Spaces entrypoint, `space-check` deployment-readiness CLI.

CONTRIBUTING.md
ADDED
@@ -0,0 +1,235 @@
# Contributing to zeroshotGPU

Thanks for working on this. Three things to know up front:

1. **Run `make preflight` before pushing.** It's the same suite that runs
   in pre-push if you have the hooks installed (see below). A green
   preflight is the local signal that the branch is ready for the
   [Space smoke checklist](docs/space_smoke.md).
2. **Keep it dependency-light by default.** New runtime dependencies need
   a corresponding entry in `pyproject.toml` extras and an explicit
   gate (config flag, lazy import, or feature-detection fallback). The
   `embedding` extra is the model: opt-in, lazy-imported on first use,
   raises a clean `RuntimeError` when missing.
3. **Don't change schema shapes silently.** Bump
   `zsgdp.schema.SCHEMA_VERSION` whenever the on-disk shape of
   `parsed_document.json`, `chunks.jsonl`, etc. changes. See
   [Schema versioning](#schema-versioning) below.

---

## Setup

```bash
git clone <repo>
cd "Document Parser"
python3.11 -m venv .venv && source .venv/bin/activate
python -m pip install -e ".[pdf,yaml,docling,dev]"
```

Optional extras:

- `.[embedding]` — sentence-transformers + transformers for the embedding
  retriever. Only needed when you set `benchmarks.retriever.backend=embedding`.
- `.[gpu_repair]` — transformers for live GPU repair. Only needed when you
  set `repair.execute_gpu_escalations=true`.
- `.[spaces]` — mirrors the root `requirements.txt` so an editable install
  matches a Space deploy.

Set up `.env` for local secrets:

```bash
cp .env.example .env
# Fill in HF_TOKEN if you need gated models.
```

`.env` is gitignored. The CLI and `app.py` load it automatically; pre-set
environment variables always win, so a Space's secrets never get
overridden by a stray local file.

---

## Pre-commit / pre-push hooks

```bash
python -m pip install pre-commit
pre-commit install --install-hooks --hook-type pre-commit --hook-type pre-push
```

Two stages:

- **pre-commit** — fast static checks: trailing whitespace, end-of-file
  newline, JSON/YAML syntax, large-file guard, merge-conflict markers.
  Runs on every `git commit`.
- **pre-push** — runs `python -m zsgdp.cli preflight`. Same as
  `make preflight`. Failing this blocks the push.

Skip the commit-stage hooks on a specific commit with
`git commit --no-verify` if you genuinely need to (e.g. WIP). Skip the
pre-push gate with `git push --no-verify`, but only if you have a
separately verified preflight run.

---

## Running tests

```bash
make test            # full unittest discover
make regression      # snapshot fixture suite
make preflight       # everything except the benchmark smoke
make preflight-full  # adds an end-to-end benchmark smoke
make benchmark       # parses tests/regression/fixtures/ via the CLI
```

Or directly:

```bash
python -m unittest discover
python -m unittest tests.regression.test_regression
python -m zsgdp.cli preflight --root . --benchmark
```

Performance regressions are gated behind `ZSGDP_REGRESSION_PERF=1`:

```bash
ZSGDP_REGRESSION_PERF=1 python -m unittest tests.regression.test_regression
```

See [tests/regression/README.md](tests/regression/README.md) for the
fixture format, including the `performance` block.

---

## Adding a regression fixture

1. Drop the input under `tests/regression/fixtures/<name>.input.<ext>`.
2. Parse it once locally and inspect the output:
   ```bash
   python -m zsgdp.cli parse --input tests/regression/fixtures/<name>.input.<ext> --output /tmp/sanity
   ```
3. Hand-write `tests/regression/fixtures/<name>.expected.json` with the
   tolerances you want to lock down. Prefer ranges over exact counts
   where reasonable variance exists (a sketch follows after this list).
4. Optional: add a `performance` block with `max_elapsed_seconds` set to
   ~50–100x your local median (a catastrophic-regression guard, not a
   tight bar).
5. Run `make regression` to confirm the fixture is picked up.
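
A hedged example of what such a spec could look like. The tolerance key names below are illustrative guesses based on the categories the changelog lists (counts, score, markdown contains/excludes, the `performance` block); the authoritative format is `tests/regression/README.md`:

```json
{
  "element_count": {"min": 10, "max": 14},
  "quality_score": {"min": 0.8},
  "markdown_contains": ["## Background"],
  "markdown_excludes": ["<table"],
  "performance": {
    "repeats": 5,
    "max_elapsed_seconds": 2.0,
    "min_pages_per_second": 0.5,
    "always_enforce": false
  }
}
```
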
---

## Adding a parser adapter

1. Subclass `BaseParser` in `zsgdp/parsers/<name>_parser.py` (or extend
   `external.py` for shell-out adapters).
2. Set `name` and `supported_file_types`; implement `available()` and
   `parse(path, profile, config, *, pages=None)` (see the sketch after
   this list).
3. Register it in `zsgdp/parsers/registry.py`.
4. If the parser produces Markdown, write a normalizer under
   `zsgdp/normalize/normalize_<name>.py` that returns a `ParseCandidate`
   via `normalize_markdown_candidate(...)`.
5. Add a config block to `configs/default.yaml` with `enabled: false`
   plus any CLI flags the adapter needs.
6. Add the dependency to `pyproject.toml` as an optional extra. Don't
   pin it in the top-level `requirements.txt` unless it's free to
   install on every Space build.
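
A skeletal adapter following those steps. The attribute and method names come from step 2; the `zsgdp.parsers.base` import path and everything inside the method bodies are illustrative assumptions:

```python
from pathlib import Path

from zsgdp.parsers.base import BaseParser  # assumed module path


class FooParser(BaseParser):
    """Sketch only: replace `foo` with the real backend package."""

    name = "foo"
    supported_file_types = (".pdf",)

    def available(self) -> bool:
        try:
            import foo  # noqa: F401 (hypothetical backend dependency)
        except ImportError:
            return False
        return True

    def parse(self, path: Path, profile, config, *, pages=None):
        # Produce schema output here, via normalize_markdown_candidate(...)
        # if the backend emits Markdown (step 4).
        raise NotImplementedError("sketch only")
```
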
---

## Adding a metric

Pure metrics live under `zsgdp/verify/`:

1. Define inputs as plain dicts/lists (not `ParsedDocument`-keyed) so
   the same metric works on per-parser candidate snapshots, not just
   the merged document (see the sketch after this list).
2. Pin definitions in the module docstring — the exact denominator,
   the handling of empty inputs, and what each return key means.
3. Surface it in `zsgdp/benchmarks/parser_quality.py`:
   - Add per-document fields to the `doc_record`.
   - Add aggregated means to the top-level `summary` dict.
   - Add a per-document CSV writer if it has detail worth its own file.
4. Add tests for: perfect input, no-match input, partial overlap, the
   vacuous empty/empty case, and a benchmark-integration test that
   asserts the metric appears in `summary["documents"][0]`.
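
A shape-level sketch of such a metric: plain-dict inputs, the definition pinned in the docstring, and an explicit vacuous case. The element shape (`{"text": ...}`) is an assumption for illustration:

```python
def text_overlap_f1(predicted: list[dict], truth: list[dict]) -> dict:
    """F1 over exact matches between distinct element texts.

    Precision denominator: distinct predicted texts. Recall denominator:
    distinct truth texts. Empty/empty is vacuously perfect (1.0).
    Returns {"f1", "precision", "recall"}.
    """
    if not predicted and not truth:
        return {"f1": 1.0, "precision": 1.0, "recall": 1.0}
    pred_texts = {e.get("text", "") for e in predicted}
    true_texts = {e.get("text", "") for e in truth}
    hits = len(pred_texts & true_texts)
    precision = hits / len(pred_texts) if pred_texts else 0.0
    recall = hits / len(true_texts) if true_texts else 0.0
    denom = precision + recall
    f1 = 2 * precision * recall / denom if denom else 0.0
    return {"f1": f1, "precision": precision, "recall": recall}
```
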
---

## Schema versioning

`zsgdp.schema.SCHEMA_VERSION` lives in
[zsgdp/schema/document.py](zsgdp/schema/document.py). It's surfaced into
`artifact_manifest.json` as `parsed_document_schema_version` so a
consumer reading old output can gate on it.

Bump rules:

- **Additive change** (new optional field with a default) — bump the
  minor component (1.0 → 1.1).
- **Breaking change** (renamed/removed field, changed semantics) — bump
  the major component (1.0 → 2.0). Update the regression fixtures in the
  same PR; downstream consumers will need a migration.
- **No change** — leave it alone.

When you bump, add an entry to `CHANGELOG.md` under
"### Schema" with the version and what changed.

---

## Logging

Use `from zsgdp.logging_config import get_logger`, then
`logger = get_logger(__name__)`. Call `.info`/`.warning`/`.error` with
structured `extra={...}` fields rather than f-string-formatted messages
where possible — the JSON formatter promotes `extra` keys to top-level
fields so the HF Spaces logs page is greppable; a caller-side example
follows below.

The default log level is WARNING (CLI summaries unaffected). Opt in with
`ZSGDP_LOG_LEVEL=INFO` and `ZSGDP_LOG_JSON=1` for Space-style output.
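
Caller-side shape, per the convention above; the event name and field names are illustrative:

```python
from zsgdp.logging_config import get_logger

logger = get_logger(__name__)

# The JSON formatter promotes these extra fields to top-level keys, so
# the Spaces logs page can be grepped for e.g. `"doc": "report.pdf"`.
logger.info(
    "parse_end",
    extra={"doc": "report.pdf", "pages": 12, "quality_score": 0.91},
)
```
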
---

## Pull request checklist

Before opening a PR:

- [ ] `make preflight` passes locally.
- [ ] If you added a metric or an adapter, or changed the schema, you
      updated `CHANGELOG.md`.
- [ ] If you changed parser behavior, you ran `make regression` and any
      fixture drift is intentional (and the snapshot was regenerated
      explicitly).
- [ ] If your change touches GPU/model code paths, you flagged it for
      Space-side smoke testing in the PR description (the
      [smoke checklist](docs/space_smoke.md) covers what to run).
- [ ] You did **not** commit `.env` or any secret. The `.gitignore`
      should catch this; if you suspect a leak, treat the token as
      compromised and rotate it.

---

## Architecture quick map

- `zsgdp/profiling/` — page-level features and labels.
- `zsgdp/routing/` — deterministic page → expert mapping.
- `zsgdp/parsers/` — adapters; one canonical schema regardless of source.
- `zsgdp/normalize/` — convert each parser's output into the schema.
- `zsgdp/merge/` — align candidates, dedupe, detect conflicts.
- `zsgdp/verify/` — coverage, reading order, table/figure/formula/chunk
  quality, GT-comparison metrics (layout F1, table structure, formula
  CER, retrieval recall), parser disagreement and repair success rates.
- `zsgdp/repair/` — deterministic header/table fixes plus GPU
  escalation that dispatches to `gpu/worker.py`.
- `zsgdp/chunking/` — agentic planner + structure-aware / parent-child /
  table / figure / page chunk builders, with semantic / late /
  vision-guided / proposition deterministic stubs.
- `zsgdp/gpu/` — task planning, batching, dry-run worker, transformers
  and vLLM clients.
- `zsgdp/benchmarks/` — dataset loaders, metric runners, ablation,
  cross-dataset comparison, retrieval (lexical + embedding).
- `zsgdp/cli.py` — single entry point exposing all of the above.
- `app.py` — Gradio Space front-end.

The full spec lives in
[zero_shot_gpu_document_parser_project_spec.md](zero_shot_gpu_document_parser_project_spec.md).
The 2000-line read isn't required to contribute, but §10 (schema) and
§17 (chunking ladder) are worth skimming if you're touching those
modules.

Makefile
ADDED
@@ -0,0 +1,49 @@
PYTHON ?= python3.11

.PHONY: help test regression space-check parsers preflight preflight-full benchmark space-smoke space-smoke-strict clean

help:
	@echo "Targets:"
	@echo "  test               - run the full unittest discover suite"
	@echo "  regression         - run the regression fixture snapshot suite"
	@echo "  space-check        - run the HF Space readiness check"
	@echo "  parsers            - print the parser registry"
	@echo "  preflight          - run test + regression + space-check + parsers"
	@echo "  preflight-full     - preflight + an end-to-end benchmark smoke"
	@echo "  benchmark          - run zsgdp benchmark against tests/regression/fixtures"
	@echo "  space-smoke        - run docs/space_smoke.md smokes (deps-permitting)"
	@echo "  space-smoke-strict - same, but treat skipped smokes as failures"
	@echo "  clean              - remove __pycache__ and benchmark output"

test:
	$(PYTHON) -m unittest discover

regression:
	$(PYTHON) -m unittest tests.regression.test_regression -v

space-check:
	$(PYTHON) -m zsgdp.cli space-check --root .

parsers:
	$(PYTHON) -m zsgdp.cli parsers

preflight:
	$(PYTHON) -m zsgdp.cli preflight --root .

preflight-full:
	$(PYTHON) -m zsgdp.cli preflight --root . --benchmark

benchmark:
	$(PYTHON) -m zsgdp.cli benchmark \
		--input tests/regression/fixtures \
		--output out/preflight_benchmark

space-smoke:
	$(PYTHON) -m scripts.run_space_smoke --output out/space_smoke_report.json

space-smoke-strict:
	$(PYTHON) -m scripts.run_space_smoke --strict --output out/space_smoke_report.json

clean:
	find . -type d -name __pycache__ -exec rm -rf {} + 2>/dev/null || true
	rm -rf out/preflight_benchmark

README.md
ADDED
@@ -0,0 +1,287 @@
---
title: zeroshotGPU
sdk: gradio
app_file: app.py
python_version: 3.11
suggested_hardware: l4x1
short_description: Agentic zero-shot document parser with parser metrics and chunk artifacts.
---

# Zero-Shot GPU Document Parser

A self-hosted parsing control plane that profiles documents, routes pages to
parser experts, normalizes outputs, verifies quality with GT-comparison
metrics, repairs weak regions through a bounded verify/repair loop (with
optional GPU escalation), and emits auditable parsed-document artifacts plus
strategy-aware chunks. Implements the project described in
[`zero_shot_gpu_document_parser_project_spec.md`](zero_shot_gpu_document_parser_project_spec.md).

The codebase is intentionally dependency-light by default. Text and Markdown
work with the standard library; PyMuPDF, Docling, Marker, MinerU, olmOCR,
PaddleOCR, and Unstructured plug in via optional extras. Live GPU repair
(Qwen2.5-VL-3B) and the embedding retriever (jina-embeddings-v3) are gated
behind explicit config flags so a fresh clone never silently downloads
multi-gigabyte weights.

---

## Install

For the local MVP (text + PyMuPDF + Docling):

```bash
python -m pip install -e ".[pdf,yaml,docling,dev]"
```

Optional extras:

| Extra        | Adds                                            | Required for                                 |
|--------------|-------------------------------------------------|----------------------------------------------|
| `embedding`  | `sentence-transformers`, `transformers`         | `benchmarks.retriever.backend=embedding`     |
| `gpu_repair` | `transformers`                                  | `repair.execute_gpu_escalations=true`        |
| `spaces`     | mirrors `requirements.txt` for HF Spaces parity | running `app.py` locally as a Space simulant |

External parser CLIs (Marker, MinerU, olmOCR, PaddleOCR) install separately;
configure each via `parsers.<name>.command`, `output_args`, and `extra_args`
in your YAML config.

Secrets:

```bash
cp .env.example .env
# Set HF_TOKEN if you'll use gated models (jina-embeddings-v3, private repos).
```

`.env` is gitignored. The CLI and `app.py` load it on startup; pre-set
environment variables (e.g. Space-side secrets) always win.

---

## Quick start

### Parse one document or a folder

```bash
python -m zsgdp.cli parse --input ./docs/sample.md --output ./out/sample
python -m zsgdp.cli parse-folder --input ./docs --output ./parsed --workers 4
python -m zsgdp.cli parse --input ./docs/report.pdf --output ./out/report --config configs/docling.yaml
```

Each parse writes a full artifact bundle. `parsed_document.json` is the
canonical record; `chunks.jsonl` is the retrieval-ready output;
`quality_report.json` carries every metric the verifier computed.

### Run a benchmark

```bash
# Custom corpus, no GT — runs every metric that doesn't need labels:
python -m zsgdp.cli benchmark --input ./docs --output ./bench

# Labelled datasets — adds layout F1 / table structure / formula CER:
python -m zsgdp.cli benchmark --input ./omnidocbench --dataset omnidocbench --output ./bench/omni
python -m zsgdp.cli benchmark --input ./doclaynet --dataset doclaynet --output ./bench/doclay
```

### Compare parsers (ablation)

```bash
python -m zsgdp.cli benchmark-ablate \
  --input ./docs --output ./bench/ablation \
  --parser docling --parser pymupdf --parser text
```

Runs the benchmark once per parser plus a merged arm; emits
`ablation_comparison.csv`.

### Compare across datasets

```bash
python -m zsgdp.cli combine-benchmarks \
  --input ./bench/omni --label omnidocbench \
  --input ./bench/doclay --label doclaynet \
  --output ./bench/cross
```

Emits `dataset_summary.csv` and `parser_matrix.csv` (parser × dataset).

### Before pushing to a Space — preflight

```bash
make preflight       # unit + regression + space-check + parsers (~10s)
make preflight-full  # ...plus an end-to-end benchmark smoke
```

A green preflight is the local signal that the branch is ready for the
Space. Pre-commit and pre-push hooks (see [CONTRIBUTING.md](CONTRIBUTING.md))
make this automatic on every `git push`.

### On the Space — smoke validation

Once deployed, exercise the deferred GPU/model paths:

```bash
make space-smoke     # runs whichever of the 5 smokes have their deps
python -m scripts.run_space_smoke --strict --output ./space_smoke.json
```

See [docs/space_smoke.md](docs/space_smoke.md) for the manual fallback
procedure (real PDF uploads, full Marker parses) and per-smoke
acceptance criteria.

---

## Opt-ins

### Embedding retriever

The default retriever is lexical TF-IDF (zero deps). To use a real embedder:

```yaml
# configs/myrun.yaml
benchmarks:
  retriever:
    backend: embedding
    model_id: jinaai/jina-embeddings-v3  # or any sentence-transformers model
    task: retrieval.passage
```

```bash
python -m pip install -e ".[embedding]"
python -m zsgdp.cli benchmark --input ./docs --output ./bench --config configs/myrun.yaml
```

The first call lazy-loads the model; subsequent calls reuse it in-process.
Set `HF_TOKEN` in `.env` for gated models.

### Live GPU repair

The repair controller plans GPU tasks for verification failures (invalid
tables, OCR coverage gaps, reading-order issues, missing figure captions).
By default these are dry-run only. To execute:

```yaml
# configs/myrun.yaml
repair:
  gpu_escalation: true
  execute_gpu_escalations: true  # invokes the configured backend
gpu:
  backend: transformers  # or "vllm" for OpenAI-compat
  models:
    table:
      model_id: Qwen/Qwen2.5-VL-3B-Instruct
```

Each executed task writes its output back into the merged document with a
`gpu_repair_task_id` provenance field.

---

## Outputs

Every parse writes:

- `parsed_document.json` — canonical record (carries `schema_version`).
- `document.md` — human-readable Markdown reconstruction.
- `elements.jsonl` / `tables.jsonl` / `figures.jsonl` / `chunks.jsonl` — JSONL streams.
- `chunking_plan.json` — strategy ladder + per-strategy metadata.
- `parser_metrics.json` — per-parser candidate-level stats.
- `quality_report.json` — every verifier metric (text coverage, reading order, table validity, parser disagreement, repair resolution/regression rates, GT-comparison metrics when applicable).
- `routing_report.json` — page → parser routing decisions.
- `profile.json` — document profiler output.
- `gpu_runtime.json` — detected GPU/device state at parse time.
- `gpu_tasks.jsonl` (when model-backed work is planned) and `gpu_task_report.json` (preflight validation).
- `conflict_report.json` (when multiple parsers ran).
- `artifact_manifest.json` with SHA-256 checksums and the parsed-document schema version.
- `assets/pages/*.png`, `assets/tables/*.png`, `assets/figures/*.png` — rendered PDF page and region crops.

Benchmark runs additionally write:

- `results.json` — full structured summary including aggregate means.
- `leaderboard.csv` and `per_parser_gt_leaderboard.csv` — parser leaderboards (without and with GT comparison).
- `per_parser_metrics.csv` — per-document, per-parser GT-comparison breakdown.
- `layout_runs.csv`, `table_structure_runs.csv`, `formula_runs.csv`, `retrieval_runs.csv`, `repair_runs.csv` — per-document detail per metric family.
- `parser_runs.csv`, `chunk_runs.csv`, `structure_runs.csv`, `chunk_quality.csv`, `throughput_runs.csv`, `ablations.json` — additional detail.

`benchmark-ablate` adds `ablation_comparison.csv`. `combine-benchmarks`
adds `dataset_summary.csv`, `parser_matrix.csv`, and
`cross_dataset_comparison.json`.

---

## Architecture map

| Module | Responsibility |
|--------|----------------|
| `zsgdp/profiling/` | Cheap per-page features (scanned-score, table density, columns, etc.) |
| `zsgdp/routing/` | Deterministic page → parser-expert decisions with budget |
| `zsgdp/parsers/` | Adapters; one canonical schema regardless of source |
| `zsgdp/normalize/` | Convert each parser's output into the schema |
| `zsgdp/merge/` | Align candidates, dedupe, detect conflicts |
| `zsgdp/verify/` | Coverage / reading order / table / figure / formula / chunk readiness, plus GT-comparison: layout F1, table structure, formula CER, retrieval recall, parser disagreement, repair success |
| `zsgdp/repair/` | Deterministic header/table fixes plus GPU escalation through `gpu/worker.py` |
| `zsgdp/chunking/` | Agentic planner + structure / parent-child / table / figure / page chunkers, with semantic / late / vision / proposition deterministic stubs |
| `zsgdp/gpu/` | Task planning, batching, dry-run worker, transformers + vLLM clients |
| `zsgdp/benchmarks/` | Dataset loaders, metric runners, ablation, cross-dataset, retrieval |
| `zsgdp/cli.py` | All entry points |
| `app.py` | Gradio Space UI |

The full spec is in [`zero_shot_gpu_document_parser_project_spec.md`](zero_shot_gpu_document_parser_project_spec.md). §10 (schema) and §17 (chunking ladder) are the most useful sections to skim before touching those modules.

---

## Production benchmark numbers

Once the Space deploy is live and `make space-smoke` is green, run the
benchmark against your representative corpus and paste the headline
metrics here. Spec §29 success criteria, for reference:

- **MVP:** the full agentic loop improves table QA by ≥20% over the best single parser; agentic chunking improves citation accuracy by ≥10% over the recursive baseline.
- **Production-style (HR / financial reports / etc.):** retrieval recall@5 ≥ 90%, citation accuracy ≥ 90%, table QA exactness ≥ 85%, manual review rate ≤ 10%, parser blocking-failure rate ≤ 5%.

| Metric | Dataset / Corpus | Value | Date | Run |
|---------------------------------|------------------|--------|--------|--------|
| `mean_quality_score` | _todo_ | _todo_ | _todo_ | _todo_ |
| `mean_layout_f1` | _todo_ | _todo_ | _todo_ | _todo_ |
| `mean_table_structure_score` | _todo_ | _todo_ | _todo_ | _todo_ |
| `mean_formula_cer` | _todo_ | _todo_ | _todo_ | _todo_ |
| `mean_retrieval_recall_at_5` | _todo_ | _todo_ | _todo_ | _todo_ |
| `mean_parser_disagreement_rate` | _todo_ | _todo_ | _todo_ | _todo_ |
| `mean_repair_resolution_rate` | _todo_ | _todo_ | _todo_ | _todo_ |
| `mean_pages_per_second` | _todo_ | _todo_ | _todo_ | _todo_ |

Source rows are individual `results.json` files under each Space-side
benchmark output; commit the directory or a redacted summary so the
numbers above are reproducible.

---

## Deployment

Target: Hugging Face Spaces, hardware `l4x1`, GPU/model target
`zeroshotGPU`.

Pre-deploy gate:

1. `make preflight` (local).
2. `make preflight-full` (local, with the end-to-end benchmark smoke).
3. Duplicate the Space; set `HF_TOKEN` and any other secrets in **Variables and secrets**.
4. Push.
5. `make space-smoke` from the Space's JupyterLab terminal.
6. Inspect [docs/space_smoke.md](docs/space_smoke.md) Smoke 3 (live GPU repair) manually if the runner-level wiring smoke passed but you want full model-invocation validation.
7. Run `python -m zsgdp.cli benchmark` against your representative corpus and update the table above.

The Space defaults to `configs/docling.yaml` (Docling + PyMuPDF
co-enabled so the parser-disagreement rate has signal). Override via
`ZSGDP_CONFIG_PATH` in Space variables for custom configs.

---

## Contributing

See [CONTRIBUTING.md](CONTRIBUTING.md) for setup, hooks, test layout,
fixture format, parser/metric/schema-bump procedures, and the PR checklist.

For changes touching the on-disk schema, bump `zsgdp.schema.SCHEMA_VERSION`
and add an entry under `### Schema` in [CHANGELOG.md](CHANGELOG.md). The
artifact manifest surfaces the version under
`parsed_document_schema_version` so downstream consumers can gate.

app.py
ADDED
@@ -0,0 +1,251 @@
"""Hugging Face Spaces entrypoint for zeroshotGPU."""

from __future__ import annotations

import os
import shutil
import tempfile
from pathlib import Path
from typing import Any

try:
    import gradio as gr
except ImportError as exc:  # pragma: no cover - only used when launching the Space UI.
    raise RuntimeError("Gradio is required for the Spaces UI. Install with `python -m pip install -r requirements.txt`.") from exc

from zsgdp.artifacts import validate_artifact_manifest
from zsgdp.config import load_config, load_env_file
from zsgdp.gpu import collect_gpu_runtime_status
from zsgdp.logging_config import configure_logging, get_logger
from zsgdp.pipeline import parse_document
from zsgdp.profiling import profile_document

# Load .env first so any keys it sets (HF_TOKEN, ZSGDP_LOG_LEVEL, etc.) are
# visible before we read environment defaults below. Pre-set Space variables
# always win — load_env_file does not override existing env entries.
load_env_file()

# Default to JSON logs on the Space so the HF Spaces logs page is greppable.
# Override locally with `ZSGDP_LOG_JSON=0` for human-readable text output.
os.environ.setdefault("ZSGDP_LOG_LEVEL", "INFO")
os.environ.setdefault("ZSGDP_LOG_JSON", "1" if os.environ.get("SPACE_ID") else "0")
configure_logging()
_logger = get_logger(__name__)

ROOT = Path(__file__).resolve().parent
DOCLING_CONFIG = ROOT / "configs" / "docling.yaml"

# Abuse guards. Override at deployment time via env vars to relax for trusted
# Spaces or tighten further for public ones.
MAX_UPLOAD_BYTES = int(os.environ.get("ZSGDP_MAX_UPLOAD_BYTES", str(50 * 1024 * 1024)))  # 50 MB
MAX_PAGE_COUNT = int(os.environ.get("ZSGDP_MAX_PAGE_COUNT", "200"))


class UploadRejected(Exception):
    """Raised when an upload exceeds an abuse-guard limit."""


def _validate_upload(path: Path) -> None:
    """Reject oversized uploads or PDFs with too many pages before parsing.

    Cheap to compute (file stat + profiler page count) and avoids spending
    GPU/CPU minutes on inputs the Space wasn't sized for.
    """

    if not path.exists():
        raise UploadRejected("Uploaded file is missing on disk.")
    size = path.stat().st_size
    if size > MAX_UPLOAD_BYTES:
        raise UploadRejected(
            f"Upload is {size / 1024 / 1024:.1f} MB; the Space limit is "
            f"{MAX_UPLOAD_BYTES / 1024 / 1024:.0f} MB. Set ZSGDP_MAX_UPLOAD_BYTES to override."
        )
    try:
        profile = profile_document(path)
    except Exception:  # pragma: no cover - profiler is robust; this is belt-and-braces.
        return
    if profile.page_count > MAX_PAGE_COUNT:
        raise UploadRejected(
            f"Document has {profile.page_count} pages; the Space limit is "
            f"{MAX_PAGE_COUNT}. Set ZSGDP_MAX_PAGE_COUNT to override."
        )


# Top-level artifact files surfaced as individual downloads. Nested
# directories like assets/ stay bundled in the zip only — they can be
# large for multi-page PDFs and would clutter the per-artifact list.
_INDIVIDUAL_ARTIFACT_NAMES = (
    "parsed_document.json",
    "document.md",
    "elements.jsonl",
    "tables.jsonl",
    "figures.jsonl",
    "chunks.jsonl",
    "chunking_plan.json",
    "parser_metrics.json",
    "quality_report.json",
    "routing_report.json",
    "profile.json",
    "gpu_runtime.json",
    "gpu_tasks.jsonl",
    "gpu_task_report.json",
    "artifact_manifest.json",
    "conflict_report.json",
)


def _collect_artifact_files(output_dir: Path) -> list[str]:
    """Return absolute paths for the top-level artifacts the Space surfaces.

    Order matches _INDIVIDUAL_ARTIFACT_NAMES so the UI listing is stable.
    Missing files are silently skipped (different parse runs emit different
    subsets — e.g. conflict_report.json only when multiple parsers ran).
    """

    paths: list[str] = []
    for name in _INDIVIDUAL_ARTIFACT_NAMES:
        candidate = output_dir / name
        if candidate.exists():
            paths.append(str(candidate))
    return paths


def _empty_outputs(reason: str, source: Path | None, *, rejected: bool, runtime: dict) -> tuple:
    """Return-shape used for every error path. Centralised so the tuple width
    can't drift between the success path and the four error paths."""

    summary: dict[str, Any] = {"error": reason}
    if source is not None:
        summary["source"] = str(source)
    if rejected:
        summary["rejected"] = True
    return ("", summary, {}, {}, {}, runtime, [], {}, {}, None, [])


def parse_uploaded_document(file_obj: Any, pipeline_mode: str):
    if file_obj is None:
        return _empty_outputs("Upload a document first.", None, rejected=False, runtime={})

    source = Path(file_obj.name)
    work_dir = Path(tempfile.mkdtemp(prefix="zeroshotgpu_"))
    output_dir = work_dir / "parsed"
    config_path = _config_path_for_mode(pipeline_mode)

    try:
        _validate_upload(source)
    except UploadRejected as exc:
        _logger.warning(
            "space_upload_rejected",
            extra={"source_path": str(source), "reason": str(exc)},
        )
        runtime = runtime_status_for_mode(pipeline_mode)
        return _empty_outputs(str(exc), source, rejected=True, runtime=runtime)

    try:
        parsed = parse_document(source, output_dir, config_path=config_path)
    except Exception as exc:  # pragma: no cover - surfaced in the Space UI.
        runtime = runtime_status_for_mode(pipeline_mode)
        return _empty_outputs(str(exc), source, rejected=False, runtime=runtime)

    artifact_validation = validate_artifact_manifest(output_dir)
    archive_path = shutil.make_archive(str(output_dir), "zip", output_dir)
    individual_files = _collect_artifact_files(output_dir)
    runtime = parsed.provenance.get("gpu_runtime", {})
    summary = {
        "doc_id": parsed.doc_id,
        "file_type": parsed.file_type,
        "elements": len(parsed.elements),
        "tables": len(parsed.tables),
        "figures": len(parsed.figures),
        "chunks": len(parsed.chunks),
        "quality_score": parsed.quality_report.score,
        "blocking": parsed.quality_report.has_blocking_failures,
        "deployment": parsed.provenance.get("config_deployment", {}),
        "runtime_device": runtime.get("device"),
        "running_on_huggingface_space": runtime.get("running_on_huggingface_space"),
        "artifact_manifest_valid": artifact_validation.get("valid"),
        "artifact_count": artifact_validation.get("artifact_count"),
        "artifact_checked_count": artifact_validation.get("checked_count"),
        "individual_artifact_count": len(individual_files),
    }
    return (
        parsed.to_markdown(),
        summary,
        parsed.quality_report.to_dict(),
        parsed.provenance.get("parser_metrics", {}),
        parsed.provenance.get("chunking", {}),
        runtime,
        parsed.provenance.get("gpu_tasks", []),
        parsed.provenance.get("gpu_task_report", {}),
        artifact_validation,
        archive_path,
        individual_files,
    )


def _config_path_for_mode(pipeline_mode: str) -> Path | None:
    env_config = os.environ.get("ZSGDP_CONFIG_PATH")
    if env_config:
        return Path(env_config)
    if pipeline_mode == "Docling + PyMuPDF" and DOCLING_CONFIG.exists():
        return DOCLING_CONFIG
    return None


def runtime_status_for_mode(pipeline_mode: str) -> dict:
    return collect_gpu_runtime_status(load_config(_config_path_for_mode(pipeline_mode))).to_dict()


with gr.Blocks(title="zeroshotGPU") as demo:
    gr.Markdown("# zeroshotGPU")
    with gr.Row():
        upload = gr.File(label="Document", file_types=[".pdf", ".md", ".txt", ".html"])
        with gr.Column():
            pipeline = gr.Dropdown(
                choices=["Docling + PyMuPDF", "Default lightweight"],
                value="Docling + PyMuPDF",
                label="Pipeline",
            )
            parse_button = gr.Button("Parse", variant="primary")
            archive = gr.File(label="Artifacts (zip)")
    with gr.Tabs():
        with gr.Tab("Markdown"):
            markdown = gr.Markdown(label="Canonical Markdown")
        with gr.Tab("Run"):
            summary = gr.JSON(label="Summary")
            quality = gr.JSON(label="Quality Report")
            parser_metrics = gr.JSON(label="Parser Metrics")
            chunking = gr.JSON(label="Chunking Plan")
            artifact_validation = gr.JSON(label="Artifact Manifest Validation")
        with gr.Tab("Artifacts"):
            gr.Markdown(
                "Each top-level artifact is downloadable individually. "
                "Nested assets (page renders, table/figure crops) stay bundled "
                "in the zip above."
            )
            individual_artifacts = gr.Files(label="Individual artifacts")
        with gr.Tab("Runtime"):
            runtime = gr.JSON(label="GPU Runtime", value=runtime_status_for_mode("Docling + PyMuPDF"))
            gpu_tasks = gr.JSON(label="Planned GPU Tasks")
            gpu_task_report = gr.JSON(label="GPU Task Preflight")
    parse_button.click(
        parse_uploaded_document,
        inputs=[upload, pipeline],
        outputs=[
            markdown,
            summary,
            quality,
            parser_metrics,
            chunking,
            runtime,
            gpu_tasks,
            gpu_task_report,
            artifact_validation,
            archive,
            individual_artifacts,
        ],
    )


if __name__ == "__main__":
    demo.launch()
configs/default.yaml
ADDED
@@ -0,0 +1,159 @@
parsers:
  text:
    enabled: true
  pymupdf:
    enabled: true
  docling:
    enabled: false
    do_ocr: false
    do_table_structure: false
    force_backend_text: true
  marker:
    enabled: false
    command: null
    timeout_seconds: 300
    output_args: "--output_dir {output_dir} --output_format markdown"
    extra_args: ""
  mineru:
    enabled: false
    command: null
    timeout_seconds: 600
    output_args: "--output_dir {output_dir}"
    extra_args: ""
  olmocr:
    enabled: false
    command: null
    timeout_seconds: 600
    output_args: "--output_dir {output_dir}"
    extra_args: ""
  paddleocr:
    enabled: false
    command: null
    timeout_seconds: 600
    output_args: "--output_dir {output_dir}"
    extra_args: ""
  unstructured:
    enabled: false

routing:
  run_multiple_on_hard_pages: true
  max_primary_parsers_per_page: 2
  hard_page_threshold: 0.65
  scanned_text_threshold: 0.40
  table_density_threshold: 0.25
  formula_density_threshold: 0.15
  figure_density_threshold: 0.20

repair:
  enabled: true
  max_iterations: 3
  # Plan and dry-run GPU escalations for verification failures.
  gpu_escalation: true
  # Actually invoke the configured GPU/VLM backend on flagged regions.
  # Defaults to false to avoid surprise model downloads on local runs;
  # set true on the Space once GPU models are warm.
  execute_gpu_escalations: false
  table_repair: true
  reading_order_repair: true
  figure_repair: true
  ocr_repair: true

gpu:
  backend: transformers
  provider: huggingface_spaces
  space_name: zeroshotGPU
  batch_pages: true
  validate_tasks: true
  max_batch_size: 4
  max_gpu_seconds_per_doc: 120
  max_vlm_calls_per_doc: 30
  models:
    vlm:
      model_id: Qwen/Qwen2.5-VL-3B-Instruct
      task: image-text-to-text
      device: auto
      dtype: bfloat16
      max_batch_size: 1
    ocr:
      model_id: Qwen/Qwen2.5-VL-3B-Instruct
      task: document-ocr
      device: auto
      dtype: bfloat16
      max_batch_size: 1
    table:
      model_id: Qwen/Qwen2.5-VL-3B-Instruct
      task: table-repair
      device: auto
      dtype: bfloat16
      max_batch_size: 1
    embedding:
      model_id: jinaai/jina-embeddings-v3
      task: retrieval.passage
      device: auto
      dtype: bfloat16
      max_batch_size: 16
  task_model_roles:
    vlm_route_repair: vlm
    ocr_page: ocr
    table_vlm_repair: table
    figure_description: vlm

pdf:
  render_pages: true
  render_dpi: 150
  crop_tables: true
  crop_figures: true
  asset_dir: assets

quality:
  accept_threshold: 0.88
  blocking_failures:
    - empty_page
    - invalid_table
    - missing_text_coverage
    - reading_order_failure

chunking:
  enabled: true
  planner: agentic
  baseline_strategy: recursive_structure
  target_tokens: 512
  min_tokens: 120
  overlap_ratio: 0.15
  parent_child: true
  parent_target_tokens: 1600
  page_level_for_paginated_docs: true
  table_chunks: true
  figure_chunks: true
  contextual_prefix: false
  contextual_retrieval: false
  semantic_similarity_threshold: 0.18
  max_propositions_per_source: 8
  max_proposition_chunks: 64
  semantic_chunking: false
  late_chunking: false
  vision_guided: false
  agentic_proposition_chunking: false
  strategy_ladder:
    - fixed_token_baseline
    - recursive_structure
    - metadata_enriched
    - parent_child
    - contextual_retrieval
    - late_chunking
    - semantic_chunking
    - vision_guided
    - agentic_proposition

benchmarks:
  retriever:
    # `lexical` (default, model-free TF-IDF) or `embedding` (sentence-transformers).
    # The `embedding` backend pulls model_id and task from gpu.models.embedding
    # unless overridden here. Requires `pip install sentence-transformers`.
    backend: lexical
    model_id: null
    task: null

deployment:
  target: huggingface_spaces
  gpu_models_target: zeroshotGPU
configs/docling.yaml
ADDED
@@ -0,0 +1,29 @@
parsers:
  # Both docling and pymupdf are enabled deliberately so the parser
  # disagreement-rate metric has a comparison surface on PDF inputs.
  # Disable one if you only need a single-parser baseline.
  docling:
    enabled: true
    do_ocr: false
    do_table_structure: false
    force_backend_text: true
    generate_page_images: false
    generate_picture_images: false
    generate_table_images: false
    do_picture_description: false
    do_picture_classification: false
    do_formula_enrichment: false
    do_code_enrichment: false
  marker:
    enabled: false
  pymupdf:
    enabled: true

routing:
  run_multiple_on_hard_pages: true
  max_primary_parsers_per_page: 2

pdf:
  render_pages: true
  crop_tables: true
  crop_figures: true
configs/gpu.yaml
ADDED
@@ -0,0 +1,43 @@
gpu:
  backend: transformers
  provider: huggingface_spaces
  space_name: zeroshotGPU
  batch_pages: true
  validate_tasks: true
  max_batch_size: 4
  max_gpu_seconds_per_doc: 120
  max_vlm_calls_per_doc: 30
  models:
    vlm:
      model_id: Qwen/Qwen2.5-VL-3B-Instruct
      task: image-text-to-text
      device: auto
      dtype: bfloat16
      max_batch_size: 1
    ocr:
      model_id: Qwen/Qwen2.5-VL-3B-Instruct
      task: document-ocr
      device: auto
      dtype: bfloat16
      max_batch_size: 1
    table:
      model_id: Qwen/Qwen2.5-VL-3B-Instruct
      task: table-repair
      device: auto
      dtype: bfloat16
      max_batch_size: 1
    embedding:
      model_id: jinaai/jina-embeddings-v3
      task: retrieval.passage
      device: auto
      dtype: bfloat16
      max_batch_size: 16
  task_model_roles:
    vlm_route_repair: vlm
    ocr_page: ocr
    table_vlm_repair: table
    figure_description: vlm

deployment:
  target: huggingface_spaces
  gpu_models_target: zeroshotGPU
configs/parsers.yaml
ADDED
@@ -0,0 +1,33 @@
parsers:
  text:
    enabled: true
  pymupdf:
    enabled: true
  docling:
    enabled: false
  marker:
    enabled: false
    command: null
    timeout_seconds: 300
    output_args: "--output_dir {output_dir} --output_format markdown"
    extra_args: ""
  mineru:
    enabled: false
    command: null
    timeout_seconds: 600
    output_args: "--output_dir {output_dir}"
    extra_args: ""
  olmocr:
    enabled: false
    command: null
    timeout_seconds: 600
    output_args: "--output_dir {output_dir}"
    extra_args: ""
  paddleocr:
    enabled: false
    command: null
    timeout_seconds: 600
    output_args: "--output_dir {output_dir}"
    extra_args: ""
  unstructured:
    enabled: false
configs/routing.yaml
ADDED
@@ -0,0 +1,8 @@
routing:
  run_multiple_on_hard_pages: true
  max_primary_parsers_per_page: 2
  hard_page_threshold: 0.65
  scanned_text_threshold: 0.40
  table_density_threshold: 0.25
  formula_density_threshold: 0.15
  figure_density_threshold: 0.20
docs/space_smoke.md
ADDED
@@ -0,0 +1,269 @@
# Hugging Face Space smoke-test checklist

This is the deferred deployment-readiness work that can only be exercised on
real GPU hardware against real models / external CLIs. Run each smoke once
against a duplicated `zeroshotGPU` Space (or your own dev Space). Each entry
gives the exact env vars / config flips, the command to trigger, and the
structured log lines you should expect.

All log lines below assume the Space is run with `ZSGDP_LOG_LEVEL=INFO` and
`ZSGDP_LOG_JSON=1`. `app.py` sets these automatically when `SPACE_ID` is in
the environment, so on a normal Space you do not need to set them yourself.
The HF Spaces logs page will surface the JSON records on stderr.

---

## Pre-flight

1. Duplicate the Space, give it `l4x1` hardware.
2. Make sure these are set in **Space settings → Variables and secrets**:
   - `ZSGDP_LOG_LEVEL=INFO`
   - `ZSGDP_LOG_JSON=1`
   - (Optional, only for parser smokes that hit a private repo) `HF_TOKEN`.
3. In the Space's `requirements.txt`, uncomment the dependency block matching
   the smoke you are running. Do **one smoke per Space deploy** — combining
   them risks an OOM or slow cold-start on the L4.
4. Push and wait for the Space to build. First-build cold-start with a model
   download is ~5-10 minutes; subsequent restarts are seconds.

After deploy, watch the **Logs** tab for the `parse_start` event. If you do
not see structured JSON lines there, the logging config is not active —
double-check `ZSGDP_LOG_JSON=1` in the Space variables.
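
If you save the Logs output to a file, a minimal sketch for filtering the
pipeline records offline — the `"event"` key name is an assumption here
(check `zsgdp/logging_config.py` for the field the JSON formatter actually
emits):

```python
# Sketch: pull lifecycle records out of a saved Space log dump.
# Assumes one JSON object per line; non-JSON lines (build output) are skipped.
import json
import sys

WANTED = {"parse_start", "parser_candidate", "repair_iteration", "parse_end"}

for line in sys.stdin:
    try:
        record = json.loads(line)
    except ValueError:
        continue  # build noise, tracebacks, other non-JSON output
    if record.get("event") in WANTED:  # "event" key name is an assumption
        print(json.dumps(record, indent=2))
```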

## Automated runner

Each smoke below has an automated counterpart in
`scripts/run_space_smoke.py`. From a Space JupyterLab terminal (or any
shell with the project installed):

```bash
# Run all smokes whose deps are installed; skip the rest with hints:
python -m scripts.run_space_smoke --output ./space_smoke_report.json

# Run only specific smokes:
python -m scripts.run_space_smoke --smoke lexical --smoke ablation

# CI-strict mode: treat skipped smokes as failures (use after you've
# uncommented the deps for the smoke you intend to run):
python -m scripts.run_space_smoke --smoke embedding --strict
```

The runner reports `pass` / `fail` / `skip` / `error` per smoke, plus
elapsed seconds and a `detail` block with the metrics it gathered. The
manual procedure below is the fallback when you want to inspect the UI
directly or test something the runner doesn't cover (e.g. uploading a
specific real PDF rather than a synthetic fixture).
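
For CI wiring, a short sketch that consumes the `--output` report (the shape
is taken from `SmokeReport.to_dict()` in `scripts/run_space_smoke.py`):

```python
# Sketch: gate a CI step on the smoke report written by --output.
import json
import sys
from pathlib import Path

report = json.loads(Path("space_smoke_report.json").read_text(encoding="utf-8"))
summary = report["summary"]
print(f"{summary['passed']}/{summary['total']} passed, {summary['skipped']} skipped")
for smoke in report["smokes"]:
    if smoke["status"] in {"fail", "error"}:
        print(f"{smoke['name']}: {smoke['status']} -> {smoke['detail']}")
if summary["failed"] or summary["errored"]:
    sys.exit(1)
```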

---

## Smoke 1 — Lexical retriever benchmark (model-free)

Confirms the Space's parsing + benchmark plumbing works end-to-end before
adding any model dependency.

**Setup:**
- Default `requirements.txt` (no uncommenting needed).
- Default config (no flips).

**Trigger:** upload a small markdown file via the Gradio UI.

**Expected log lines (in order):**
- `parse_start` with `doc_id`, `file_type`, `device` (likely `cuda`).
- One `parser_candidate` per parser that ran (typically `text`, possibly
  `pymupdf` and `docling` if the file was a PDF).
- Possibly one or more `repair_iteration` records if quality < threshold.
- `parse_end` with `quality_score`, `repair_iterations`, `chunk_count`.

**Pass criteria:**
- All log lines appear with `doc_id` populated.
- `parse_end.quality_score >= 0.85` for a clean markdown doc.
- No `parser_failed` or `gpu_task_blocked` records.

---

## Smoke 2 — Embedding retriever (jina-embeddings-v3)

Confirms the `sentence-transformers` lazy-load path and that jina-v3
specifically runs on the L4 with `trust_remote_code=True`.

**Setup:**
- In `requirements.txt`, uncomment the `transformers` and
  `sentence-transformers` lines.
- Add `configs/space_embedding.yaml` to the repo with:

```yaml
benchmarks:
  retriever:
    backend: embedding
    model_id: jinaai/jina-embeddings-v3
    task: retrieval.passage
```

- In `app.py` set `os.environ["ZSGDP_CONFIG_PATH"] = "configs/space_embedding.yaml"`,
  or pass via the env var configured in Space variables.

**Trigger:** the benchmark CLI is not reachable from the Gradio UI today, so
for the embedding-retriever smoke run
`zsgdp benchmark --input ./fixtures --output ./out` from a Space
**JupyterLab** session against a small input dir.

**Expected log lines:**
- First call: a 30–90s pause while jina-v3 weights download (no log lines
  during this — torch logs go to its own logger). Then `parse_start`.
- After the first parse, subsequent calls are fast (model is in memory).

**Pass criteria:**
- Benchmark completes without an exception.
- `summary["mean_retrieval_recall_at_5"] >= 0.7` on a small distinct-text
  corpus.
- No `gpu_task_blocked` records (those are repair-related, not retrieval).
- The `parse_end` record's `device` field reads `cuda`.

**Failure modes to watch:**
- `RuntimeError: EmbeddingRetriever requires sentence-transformers` →
  package not in `requirements.txt`.
- CUDA OOM → switch to a smaller embedding model
  (`sentence-transformers/all-MiniLM-L6-v2`) for the smoke and confirm the
  wiring before retrying jina-v3.

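Before running the full benchmark you can sanity-check the load path by
hand; a minimal sketch using the public sentence-transformers API (jina-v3
needs `trust_remote_code=True`, per the setup above):

```python
# Sketch: standalone jina-v3 load + encode probe, independent of the pipeline.
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("jinaai/jina-embeddings-v3", trust_remote_code=True)
vectors = model.encode(["a short sanity-check sentence"], normalize_embeddings=True)
print(vectors.shape)  # expect (1, <embedding_dim>)
```
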
---

## Smoke 3 — Live GPU repair on a malformed table

Confirms the repair loop's GPU escalation path actually invokes the
configured VLM and that the result is applied to the merged document.

**Setup:**
- In `requirements.txt`, uncomment `transformers` (sentence-transformers
  not needed for this smoke).
- Add `configs/space_gpu_repair.yaml`:

```yaml
parsers:
  docling:
    enabled: true
  pymupdf:
    enabled: true
repair:
  enabled: true
  gpu_escalation: true
  execute_gpu_escalations: true  # the bit that flips the live path on
gpu:
  backend: transformers
  models:
    table:
      model_id: Qwen/Qwen2.5-VL-3B-Instruct
      task: table-repair
      device: auto
      dtype: bfloat16
```

- Set `ZSGDP_CONFIG_PATH=configs/space_gpu_repair.yaml` on the Space.

**Trigger:** upload a PDF that contains a table the parsers will likely
mangle. A two-column financial statement page works well; if you don't
have one handy, take a Wikipedia article PDF that has a comparison table.

**Expected log lines (in order):**
- `parse_start`.
- `parser_candidate` for docling and pymupdf (both should fire on a PDF).
- `repair_iteration` with `iteration=1`, `gpu_task_count >= 1`,
  `gpu_dry_run=false`.
- One `gpu_task_executed` record per GPU task. `status` should be
  `executed` and `elapsed_seconds` 1-10s for a 3B-param VLM on L4.
- A second `repair_iteration` with `iteration=2` only if iteration 1
  changed something and quality is still below threshold; otherwise the
  loop terminates.
- `parse_end` with `repair_iterations >= 1`.

**Pass criteria:**
- At least one `gpu_task_executed` with `status=executed`.
- The output `parsed_document.json` shows
  `parsed.tables[i].provenance.gpu_repair_task_id` set.
- No `gpu_task_blocked` records (would mean missing image_path or doc_id).

**Failure modes to watch:**
- All `gpu_task_executed` records show `status=execution_failed` →
  inspect the `output.error` field; common causes are missing image_path
  (the PDF doesn't render page crops because `pdf.crop_tables=true` isn't
  set) or a CUDA OOM.
- No `repair_iteration` records → the verifier didn't flag any
  blocking issues; pick a different input PDF.

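To check the provenance criterion mechanically, a sketch assuming
`parsed_document.json` mirrors the in-memory shape referenced above
(`tables[i].provenance.gpu_repair_task_id`; the `parsed/` path is whatever
directory you parsed into):

```python
# Sketch: count tables whose provenance records a GPU repair task.
import json
from pathlib import Path

doc = json.loads(Path("parsed/parsed_document.json").read_text(encoding="utf-8"))
tables = doc.get("tables", [])
repaired = [t for t in tables if (t.get("provenance") or {}).get("gpu_repair_task_id")]
print(f"{len(repaired)}/{len(tables)} tables carry gpu_repair_task_id")
```
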
---

## Smoke 4 — Per-parser ablation across docling + pymupdf

Confirms the ablation runner produces a comparison CSV and that each arm's
artifacts are isolated. No GPU dependency, runs on default Space hardware.

**Setup:** default config, no requirements.txt changes.

**Trigger:** Space JupyterLab terminal:

```bash
zsgdp benchmark-ablate \
  --input ./fixtures/pdfs \
  --output ./out/ablation \
  --parser docling --parser pymupdf
```

**Expected log lines:** one parse cycle per arm (parse_start through
parse_end), three arms total (docling-only, pymupdf-only, merged).

**Pass criteria:**
- `out/ablation/ablation_comparison.csv` has 3 rows.
- Each arm's `mean_quality_score` is non-zero.
- The merged arm's `mean_quality_score` is `>= max(per-parser arms)`.

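The pass criteria can also be checked mechanically; a sketch assuming the
CSV has `arm` and `mean_quality_score` columns (column names inferred from
the criteria above, not confirmed against the runner's writer):

```python
# Sketch: verify the merged arm dominates the per-parser arms.
import csv
from pathlib import Path

with Path("out/ablation/ablation_comparison.csv").open(encoding="utf-8") as handle:
    rows = list(csv.DictReader(handle))

assert len(rows) == 3, f"expected 3 arms, got {len(rows)}"
scores = {row["arm"]: float(row["mean_quality_score"]) for row in rows}
merged = scores.pop("merged")
assert merged >= max(scores.values()), f"merged {merged} below best single arm"
print("ablation arms:", scores, "merged:", merged)
```
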
---

## Smoke 5 — External parser CLI (Marker)

The riskiest of the four external adapters because Marker's argv schema
has changed several times. Run it in its own Space deploy; do not bundle
it with other smokes.

**Setup:**
- Uncomment `marker-pdf` in `requirements.txt`.
- Add `configs/space_marker.yaml`:

```yaml
parsers:
  text:
    enabled: false
  pymupdf:
    enabled: false
  marker:
    enabled: true
    timeout_seconds: 300
    output_args: ["--output_dir", "{output_dir}", "--output_format", "markdown"]
    extra_args: []
```

- Set `ZSGDP_CONFIG_PATH=configs/space_marker.yaml`.

**Trigger:** upload a small PDF (1–3 pages) via the Gradio UI.

**Expected log lines:**
- `parse_start`.
- `parser_candidate` for `marker` with a non-zero `element_count`.
- `parse_end` with `candidate_parsers=["marker"]`.

**Pass criteria:**
- No `parser_failed` record for marker.
- Output Markdown has reasonable content (open the artifact zip and check).
- If `parser_failed` fires, look at `extra.error` — the most common cause is
  argv schema drift; tweak `output_args` in the config and retry.

---

## What "deployment ready" means after this checklist

If smokes 1–3 pass on a fresh duplicated Space, the project is genuinely
deployable for the Docling + PyMuPDF + Qwen2.5-VL-3B repair stack. Smokes 4
and 5 are nice-to-have — the per-parser ablation works locally too, and
external parsers stay flagged "experimental" until you actively need them.

Open the `parsed_document.json` from each smoke, copy the `quality_score`,
`mean_layout_f1` (where applicable), and any §29-relevant metric into
`README.md` under a new "Production benchmark numbers" section. That
publishes evidence that the success criteria are met against real data.
examples/parse_folder.py
ADDED
@@ -0,0 +1,27 @@
"""Parse a folder sequentially."""

from __future__ import annotations

import argparse
from pathlib import Path

from zsgdp import parse_document


def main() -> int:
    parser = argparse.ArgumentParser()
    parser.add_argument("input")
    parser.add_argument("output")
    args = parser.parse_args()

    input_dir = Path(args.input)
    output_dir = Path(args.output)
    output_dir.mkdir(parents=True, exist_ok=True)
    for path in sorted(item for item in input_dir.iterdir() if item.is_file()):
        parsed = parse_document(path, output_dir / path.stem)
        print(f"{path.name}: score={parsed.quality_report.score:.2f}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
examples/parse_pdf.py
ADDED
@@ -0,0 +1,25 @@
"""Parse one PDF with the MVP pipeline."""

from __future__ import annotations

import argparse

from zsgdp import parse_document


def main() -> int:
    parser = argparse.ArgumentParser()
    parser.add_argument("input")
    parser.add_argument("output")
    args = parser.parse_args()
    parsed = parse_document(args.input, args.output)
    print(
        f"score={parsed.quality_report.score:.2f} "
        f"elements={len(parsed.elements)} tables={len(parsed.tables)} "
        f"figures={len(parsed.figures)} chunks={len(parsed.chunks)}"
    )
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
examples/run_benchmark.py
ADDED
@@ -0,0 +1,33 @@
"""Minimal benchmark runner placeholder."""

from __future__ import annotations

import argparse
from pathlib import Path
from time import perf_counter

from zsgdp import parse_document
from zsgdp.benchmarks.throughput import pages_per_second


def main() -> int:
    parser = argparse.ArgumentParser()
    parser.add_argument("input")
    parser.add_argument("output")
    args = parser.parse_args()

    input_dir = Path(args.input)
    output_dir = Path(args.output)
    output_dir.mkdir(parents=True, exist_ok=True)
    total_pages = 0
    started = perf_counter()
    for path in sorted(item for item in input_dir.iterdir() if item.is_file()):
        parsed = parse_document(path, output_dir / path.stem)
        total_pages += len(parsed.pages)
    elapsed = perf_counter() - started
    print(f"pages={total_pages} seconds={elapsed:.2f} pages_per_second={pages_per_second(total_pages, elapsed):.2f}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
pyproject.toml
ADDED
@@ -0,0 +1,41 @@
[build-system]
requires = ["setuptools>=68", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "zero-shot-gpu-doc-parser"
version = "0.1.0"
description = "Zero-shot GPU document parsing and agentic chunking control plane."
readme = "README.md"
requires-python = ">=3.11"
license = { text = "MIT" }
authors = [{ name = "Zero-Shot GPU Document Parser Contributors" }]
dependencies = []

[project.optional-dependencies]
pdf = ["pymupdf>=1.24.0,<1.28.0"]
yaml = ["pyyaml>=6.0.1,<7.0.0"]
docling = ["docling>=2.0.0,<3.0.0"]
# `spaces` mirrors requirements.txt at the root, which is what HF Spaces
# installs verbatim. Keep these two in sync; torch is intentionally absent
# because the l4x1 Space image preinstalls a CUDA-matched build.
spaces = [
    "gradio>=4.44.0,<7.0.0",
    "pymupdf>=1.24.0,<1.28.0",
    "pyyaml>=6.0.1,<7.0.0",
    "docling>=2.0.0,<3.0.0",
]
embedding = ["sentence-transformers>=3.0.0,<4.0.0", "transformers>=4.45.0,<6.0.0"]
gpu_repair = ["transformers>=4.45.0,<6.0.0"]
dev = ["pytest>=8.0.0"]

[project.scripts]
zsgdp = "zsgdp.cli:main"

[tool.setuptools.packages.find]
where = ["."]
include = ["zsgdp*"]

[tool.pytest.ini_options]
testpaths = ["tests"]
pythonpath = ["."]
requirements.txt
ADDED
@@ -0,0 +1,33 @@
# Hugging Face Spaces dependencies for zeroshotGPU.
#
# Versions are pinned to tested upper bounds within each major. Bump these
# when you have run `python -m unittest discover` and the benchmark suite
# successfully against a new release.
#
# Torch is intentionally NOT pinned here. The l4x1 Space image preinstalls a
# CUDA-matched torch build; pinning torch in this file overrides it and risks
# a runtime/driver mismatch. If you're running locally without the Space
# preinstall, install torch separately via the recommended channel for your
# platform (e.g. `pip install torch --index-url https://download.pytorch.org/whl/cu121`).

gradio>=4.44.0,<7.0.0
pymupdf>=1.24.0,<1.28.0
pyyaml>=6.0.1,<7.0.0
docling>=2.0.0,<3.0.0

# Optional GPU/embedding stack. Uncomment to enable the embedding retriever
# (benchmarks.retriever.backend=embedding) and live GPU repair escalations
# (repair.execute_gpu_escalations=true). Both are off by default.
#
# transformers>=4.45.0,<6.0.0
# sentence-transformers>=3.0.0,<4.0.0

# Optional external parser CLIs. Each adds a non-trivial install footprint;
# enable only the ones the Space hardware can support. Adapter shells out to
# the CLI binary (see zsgdp/parsers/external.py); these adapters have not
# been smoke-tested against a live install — verify the argv schema before
# enabling in production.
#
# marker-pdf>=1.0.0
# mineru
# unstructured>=0.15.0
scripts/__init__.py
ADDED
File without changes
scripts/run_space_smoke.py
ADDED
@@ -0,0 +1,455 @@
"""Space-side smoke validation runner.

Automates the smokes documented in docs/space_smoke.md so a Space operator
can run one command and get a JSON report of which smokes passed, which
were skipped (missing deps), and which failed (with diagnostic context).

Usage:

    # Run all smokes that have their deps installed:
    python -m scripts.run_space_smoke --output ./space_smoke_report.json

    # Run only a subset:
    python -m scripts.run_space_smoke --smoke lexical --smoke ablation

    # Force-fail on skipped smokes (CI-style strict mode):
    python -m scripts.run_space_smoke --strict

The runner does NOT install missing dependencies — that's deliberately the
operator's job (each smoke's deps add Space build time and download cost).
A skipped smoke prints the exact `pip install` line you'd need.

Smokes mirror docs/space_smoke.md:

    lexical    - model-free benchmark on a synthetic markdown corpus
    ablation   - per-parser ablation runner (text vs pymupdf)
    embedding  - sentence-transformers / jina-embeddings-v3 retrieval
    gpu_repair - live Qwen2.5-VL invocation against a malformed table
    marker     - shell out to marker_single on a small PDF (if installed)
"""

from __future__ import annotations

import argparse
import importlib.util
import json
import shutil
import subprocess
import sys
import tempfile
import time
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Callable

REPO_ROOT = Path(__file__).resolve().parents[1]


@dataclass(slots=True)
class SmokeResult:
    name: str
    status: str  # "pass" | "fail" | "skip" | "error"
    elapsed_seconds: float = 0.0
    detail: dict[str, Any] = field(default_factory=dict)
    skip_reason: str = ""
    install_hint: str = ""


@dataclass(slots=True)
class SmokeReport:
    smokes: list[SmokeResult] = field(default_factory=list)

    @property
    def passed(self) -> bool:
        return all(item.status in {"pass", "skip"} for item in self.smokes)

    def to_dict(self) -> dict[str, Any]:
        return {
            "smokes": [
                {
                    "name": item.name,
                    "status": item.status,
                    "elapsed_seconds": round(item.elapsed_seconds, 3),
                    "detail": item.detail,
                    "skip_reason": item.skip_reason,
                    "install_hint": item.install_hint,
                }
                for item in self.smokes
            ],
            "summary": {
                "total": len(self.smokes),
                "passed": sum(1 for item in self.smokes if item.status == "pass"),
                "failed": sum(1 for item in self.smokes if item.status == "fail"),
                "errored": sum(1 for item in self.smokes if item.status == "error"),
                "skipped": sum(1 for item in self.smokes if item.status == "skip"),
            },
        }


# --- Individual smokes -------------------------------------------------------


def _make_distinctive_corpus(root: Path) -> Path:
    """Build a small corpus with three sentences distinct enough that the
    synthetic-QA generator picks one query per chunk."""

    src = root / "in"
    src.mkdir()
    (src / "doc.md").write_text(
        "# Sample Doc\n\n"
        "Apples grow on trees in the orchard during autumn harvest season.\n\n"
        "Submarines navigate beneath the ocean using sonar pulses across waters.\n\n"
        "Mountains rise above the clouds in the distant horizon line.\n",
        encoding="utf-8",
    )
    return src


def smoke_lexical() -> SmokeResult:
    started = time.perf_counter()
    from zsgdp.benchmarks.parser_quality import run_parser_benchmark

    with tempfile.TemporaryDirectory() as tmp:
        tmp_path = Path(tmp)
        src = _make_distinctive_corpus(tmp_path)
        out = tmp_path / "out"
        try:
            summary = run_parser_benchmark(src, out, dataset_name="custom_folder")
        except Exception as exc:
            return SmokeResult(
                name="lexical",
                status="error",
                elapsed_seconds=time.perf_counter() - started,
                detail={"exception": str(exc)},
            )

        quality = float(summary.get("mean_quality_score", 0.0))
        recall = float(summary.get("mean_retrieval_recall_at_1", 0.0))
        passed = quality >= 0.85 and recall >= 0.7
        return SmokeResult(
            name="lexical",
            status="pass" if passed else "fail",
            elapsed_seconds=time.perf_counter() - started,
            detail={
                "mean_quality_score": quality,
                "mean_retrieval_recall_at_1": recall,
                "documents_evaluated": summary.get("document_count"),
            },
        )


def smoke_ablation() -> SmokeResult:
    started = time.perf_counter()
    from zsgdp.benchmarks.ablation_runner import run_parser_ablations

    with tempfile.TemporaryDirectory() as tmp:
        tmp_path = Path(tmp)
        src = _make_distinctive_corpus(tmp_path)
        out = tmp_path / "out"
        try:
            comparison = run_parser_ablations(
                src,
                out,
                parsers=["text", "pymupdf"],
                dataset_name="custom_folder",
            )
        except Exception as exc:
            return SmokeResult(
                name="ablation",
                status="error",
                elapsed_seconds=time.perf_counter() - started,
                detail={"exception": str(exc)},
            )

        comparison_csv_exists = (out / "ablation_comparison.csv").exists()

        arms = [row["arm"] for row in comparison["rows"]]
        expected_arms = {"text", "pymupdf", "merged"}
        passed = comparison["arm_count"] == 3 and set(arms) == expected_arms and comparison_csv_exists
        return SmokeResult(
            name="ablation",
            status="pass" if passed else "fail",
            elapsed_seconds=time.perf_counter() - started,
            detail={
                "arm_count": comparison["arm_count"],
                "arms": arms,
                "comparison_csv_emitted": comparison_csv_exists,
            },
        )


def smoke_embedding() -> SmokeResult:
    started = time.perf_counter()
    if importlib.util.find_spec("sentence_transformers") is None:
        return SmokeResult(
            name="embedding",
            status="skip",
            elapsed_seconds=time.perf_counter() - started,
            skip_reason="sentence-transformers not installed",
            install_hint="python -m pip install 'zero-shot-gpu-doc-parser[embedding]'",
        )

    from zsgdp.benchmarks.embedding_retriever import EmbeddingRetriever
    from zsgdp.benchmarks.parser_quality import run_parser_benchmark

    # Try to load the configured embedding model. If the load fails (no HF
    # token, download error, OOM at import time), we report it as a skip
    # with the exception text so the operator sees what to fix without the
    # whole smoke run blowing up.
    try:
        retriever = EmbeddingRetriever()
        retriever._ensure_embedder()  # type: ignore[attr-defined] # private but intentional
    except Exception as exc:
        return SmokeResult(
            name="embedding",
            status="skip",
            elapsed_seconds=time.perf_counter() - started,
            skip_reason=f"embedding model failed to load: {exc}",
            install_hint="Set HF_TOKEN if the model is gated, or downsize via "
            "benchmarks.retriever.model_id (e.g. sentence-transformers/all-MiniLM-L6-v2).",
        )

    config_overrides = {"benchmarks": {"retriever": {"backend": "embedding"}}}
    with tempfile.TemporaryDirectory() as tmp:
        tmp_path = Path(tmp)
        src = _make_distinctive_corpus(tmp_path)
        out = tmp_path / "out"
        config_path = tmp_path / "config.yaml"
        # Inline config write — keeps the smoke self-contained.
        config_path.write_text(
            "benchmarks:\n  retriever:\n    backend: embedding\n",
            encoding="utf-8",
        )
        try:
            summary = run_parser_benchmark(src, out, config_path=config_path, dataset_name="custom_folder")
        except Exception as exc:
            return SmokeResult(
                name="embedding",
                status="error",
                elapsed_seconds=time.perf_counter() - started,
                detail={"exception": str(exc)},
            )

        recall_5 = float(summary.get("mean_retrieval_recall_at_5", 0.0))
        passed = recall_5 >= 0.7
        return SmokeResult(
            name="embedding",
            status="pass" if passed else "fail",
            elapsed_seconds=time.perf_counter() - started,
            detail={
                "mean_retrieval_recall_at_5": recall_5,
                "mean_retrieval_recall_at_1": float(summary.get("mean_retrieval_recall_at_1", 0.0)),
                "documents_evaluated": summary.get("document_count"),
            },
        )


def smoke_gpu_repair() -> SmokeResult:
    started = time.perf_counter()
    if importlib.util.find_spec("transformers") is None:
        return SmokeResult(
            name="gpu_repair",
            status="skip",
            elapsed_seconds=time.perf_counter() - started,
            skip_reason="transformers not installed",
            install_hint="python -m pip install 'zero-shot-gpu-doc-parser[gpu_repair]'",
        )

    # Don't actually instantiate the transformers pipeline here — it would
    # download multi-GB Qwen2.5-VL weights even on a dry probe. Instead, we
    # smoke-test the wiring: a dry-run task plan, and report whether the
    # underlying client class can be imported. Operators who want a real
    # model invocation should use `run-gpu-tasks --execute` against a parsed
    # output directory; the result lands in repair.gpu_escalation.results.
    from zsgdp.gpu.transformers_client import TransformersClient
    from zsgdp.pipeline import parse_document

    with tempfile.TemporaryDirectory() as tmp:
        tmp_path = Path(tmp)
        src = tmp_path / "report.md"
        # Malformed table (header has 2 columns; data row has 3) forces the
        # repair loop to plan a table_vlm_repair task.
        src.write_text(
            "# Report\n\n| A | B |\n| --- | --- |\n| 1 | 2 | 3 |\n",
            encoding="utf-8",
        )
        out = tmp_path / "out"
        try:
            parsed = parse_document(src, out)
        except Exception as exc:
            return SmokeResult(
                name="gpu_repair",
                status="error",
                elapsed_seconds=time.perf_counter() - started,
                detail={"exception": str(exc)},
            )

        repair = parsed.provenance.get("repair", {})
        gpu_escalation = repair.get("gpu_escalation") or {}
        task_count = int(gpu_escalation.get("task_count") or 0)
        iterations = parsed.provenance.get("repair_iterations") or []
        # We can confirm:
        #  * Dry-run plan ran (task_count >= 1 for the malformed table)
        #  * The repair loop iterated at least once
        #  * The TransformersClient class is importable for live execution
        can_execute = TransformersClient is not None
        passed = task_count >= 1 and len(iterations) >= 1 and can_execute
        return SmokeResult(
            name="gpu_repair",
            status="pass" if passed else "fail",
            elapsed_seconds=time.perf_counter() - started,
            detail={
                "dry_run_task_count": task_count,
                "repair_iterations": len(iterations),
                "transformers_client_importable": can_execute,
                "note": "This smoke verifies wiring only. To verify model invocation "
                "end-to-end, set repair.execute_gpu_escalations=true in config "
                "and run zsgdp run-gpu-tasks --execute against a parsed dir.",
            },
        )


def smoke_marker() -> SmokeResult:
    started = time.perf_counter()
    if shutil.which("marker_single") is None and shutil.which("marker") is None:
|
| 315 |
+
return SmokeResult(
|
| 316 |
+
name="marker",
|
| 317 |
+
status="skip",
|
| 318 |
+
elapsed_seconds=time.perf_counter() - started,
|
| 319 |
+
skip_reason="neither `marker_single` nor `marker` found on PATH",
|
| 320 |
+
install_hint="python -m pip install marker-pdf",
|
| 321 |
+
)
|
| 322 |
+
|
| 323 |
+
# Marker is heavy enough that even a probe call can take 30+s on first
|
| 324 |
+
# invocation (model load). We confirm the registry adapter reports
|
| 325 |
+
# available, but don't run a full parse here — surface that as a manual
|
| 326 |
+
# follow-up via the smoke checklist.
|
| 327 |
+
from zsgdp.parsers.registry import get_parser
|
| 328 |
+
|
| 329 |
+
try:
|
| 330 |
+
adapter = get_parser("marker")
|
| 331 |
+
except KeyError as exc:
|
| 332 |
+
return SmokeResult(
|
| 333 |
+
name="marker",
|
| 334 |
+
status="error",
|
| 335 |
+
elapsed_seconds=time.perf_counter() - started,
|
| 336 |
+
detail={"exception": str(exc)},
|
| 337 |
+
)
|
| 338 |
+
available = bool(adapter.available())
|
| 339 |
+
return SmokeResult(
|
| 340 |
+
name="marker",
|
| 341 |
+
status="pass" if available else "fail",
|
| 342 |
+
elapsed_seconds=time.perf_counter() - started,
|
| 343 |
+
detail={
|
| 344 |
+
"adapter_reports_available": available,
|
| 345 |
+
"note": "End-to-end Marker parse is intentionally not run here "
|
| 346 |
+
"(cold-load is heavy). See docs/space_smoke.md Smoke 5 "
|
| 347 |
+
"for the manual upload-and-parse procedure.",
|
| 348 |
+
},
|
| 349 |
+
)
|
| 350 |
+
|
| 351 |
+
|
| 352 |
+
SMOKE_REGISTRY: dict[str, Callable[[], SmokeResult]] = {
|
| 353 |
+
"lexical": smoke_lexical,
|
| 354 |
+
"ablation": smoke_ablation,
|
| 355 |
+
"embedding": smoke_embedding,
|
| 356 |
+
"gpu_repair": smoke_gpu_repair,
|
| 357 |
+
"marker": smoke_marker,
|
| 358 |
+
}
|
| 359 |
+
|
| 360 |
+
|
| 361 |
+
# --- Driver ------------------------------------------------------------------
|
| 362 |
+
|
| 363 |
+
|
| 364 |
+
def run_smokes(names: list[str] | None = None) -> SmokeReport:
|
| 365 |
+
selected = names or list(SMOKE_REGISTRY)
|
| 366 |
+
report = SmokeReport()
|
| 367 |
+
for name in selected:
|
| 368 |
+
smoke = SMOKE_REGISTRY.get(name)
|
| 369 |
+
if smoke is None:
|
| 370 |
+
report.smokes.append(
|
| 371 |
+
SmokeResult(
|
| 372 |
+
name=name,
|
| 373 |
+
status="error",
|
| 374 |
+
detail={"exception": f"unknown smoke: {name}"},
|
| 375 |
+
)
|
| 376 |
+
)
|
| 377 |
+
continue
|
| 378 |
+
try:
|
| 379 |
+
result = smoke()
|
| 380 |
+
except Exception as exc:
|
| 381 |
+
result = SmokeResult(
|
| 382 |
+
name=name,
|
| 383 |
+
status="error",
|
| 384 |
+
detail={"exception": f"{type(exc).__name__}: {exc}"},
|
| 385 |
+
)
|
| 386 |
+
report.smokes.append(result)
|
| 387 |
+
return report
|
| 388 |
+
|
| 389 |
+
|
| 390 |
+
def format_text_summary(report: SmokeReport, *, strict: bool = False) -> str:
|
| 391 |
+
lines: list[str] = []
|
| 392 |
+
for item in report.smokes:
|
| 393 |
+
marker = {
|
| 394 |
+
"pass": "ok",
|
| 395 |
+
"fail": "FAIL",
|
| 396 |
+
"skip": "skip",
|
| 397 |
+
"error": "ERROR",
|
| 398 |
+
}.get(item.status, item.status.upper())
|
| 399 |
+
line = f" [{marker}] {item.name} ({item.elapsed_seconds:.2f}s)"
|
| 400 |
+
if item.status == "skip":
|
| 401 |
+
line += f" reason={item.skip_reason}"
|
| 402 |
+
elif item.status == "fail":
|
| 403 |
+
line += f" detail={json.dumps(item.detail, default=str)}"
|
| 404 |
+
elif item.status == "error":
|
| 405 |
+
line += f" detail={json.dumps(item.detail, default=str)}"
|
| 406 |
+
lines.append(line)
|
| 407 |
+
|
| 408 |
+
summary = report.to_dict()["summary"]
|
| 409 |
+
overall = "PASS" if (report.passed and (not strict or summary["skipped"] == 0)) else "FAIL"
|
| 410 |
+
lines.append(
|
| 411 |
+
f"smoke: {overall} passed={summary['passed']} failed={summary['failed']} "
|
| 412 |
+
f"errored={summary['errored']} skipped={summary['skipped']}"
|
| 413 |
+
)
|
| 414 |
+
return "\n".join(lines)
|
| 415 |
+
|
| 416 |
+
|
| 417 |
+
def main(argv: list[str] | None = None) -> int:
|
| 418 |
+
parser = argparse.ArgumentParser(
|
| 419 |
+
prog="run_space_smoke",
|
| 420 |
+
description="Run zsgdp Space-side smoke validations.",
|
| 421 |
+
)
|
| 422 |
+
parser.add_argument(
|
| 423 |
+
"--smoke",
|
| 424 |
+
action="append",
|
| 425 |
+
dest="smokes",
|
| 426 |
+
choices=list(SMOKE_REGISTRY),
|
| 427 |
+
help="Smoke to run. Repeat to run multiple. Default: all registered smokes.",
|
| 428 |
+
)
|
| 429 |
+
parser.add_argument("--output", help="Optional JSON report path.")
|
| 430 |
+
parser.add_argument(
|
| 431 |
+
"--strict",
|
| 432 |
+
action="store_true",
|
| 433 |
+
help="Treat skipped smokes as failures (useful in CI when all deps must be present).",
|
| 434 |
+
)
|
| 435 |
+
args = parser.parse_args(argv)
|
| 436 |
+
|
| 437 |
+
report = run_smokes(args.smokes)
|
| 438 |
+
print(format_text_summary(report, strict=args.strict))
|
| 439 |
+
|
| 440 |
+
if args.output:
|
| 441 |
+
Path(args.output).write_text(
|
| 442 |
+
json.dumps(report.to_dict(), indent=2, ensure_ascii=False) + "\n",
|
| 443 |
+
encoding="utf-8",
|
| 444 |
+
)
|
| 445 |
+
|
| 446 |
+
summary = report.to_dict()["summary"]
|
| 447 |
+
if summary["failed"] or summary["errored"]:
|
| 448 |
+
return 1
|
| 449 |
+
if args.strict and summary["skipped"]:
|
| 450 |
+
return 1
|
| 451 |
+
return 0
|
| 452 |
+
|
| 453 |
+
|
| 454 |
+
if __name__ == "__main__":
|
| 455 |
+
raise SystemExit(main())
|
tests/__init__.py
ADDED
@@ -0,0 +1 @@
"""Test package."""
tests/regression/README.md
ADDED
@@ -0,0 +1,97 @@
# Regression fixtures

Each fixture is a `(<name>.input.<ext>, <name>.expected.json)` pair under
`fixtures/`. The runner in `test_regression.py` parses every input through
`parse_document` and compares the resulting `ParsedDocument` against the
snapshot in `<name>.expected.json` with explicit tolerances.

## Fixture file shape

`<name>.expected.json` has these keys (all optional except `name`):

```json
{
  "name": "human-readable identifier",
  "config": "configs/docling.yaml",
  "selected_parsers": ["text"],
  "tolerances": {
    "quality_score_min": 0.85,
    "element_count_range": [3, 6],
    "table_count": 1,
    "figure_count": 0,
    "chunk_count_min": 1,
    "blocking_failures": false,
    "must_contain_markdown": ["# Report", "Apples grow"],
    "must_not_contain_markdown": ["TODO", "FIXME"]
  }
}
```

Tolerance keys (all optional):

- `quality_score_min` (float): assert `parsed.quality_report.score >= value`.
- `quality_score_max` (float): assert `parsed.quality_report.score <= value`.
- `element_count` (int) or `element_count_range` ([min, max]).
- `table_count` (int) or `table_count_range`.
- `figure_count` (int) or `figure_count_range`.
- `chunk_count_min` (int): assert at least N chunks.
- `chunk_count_max` (int): assert at most N chunks.
- `blocking_failures` (bool): assert `quality_report.has_blocking_failures` matches.
- `must_contain_markdown` (list[str]): each string must appear in
  `parsed.to_markdown()`.
- `must_not_contain_markdown` (list[str]): each string must NOT appear.
- `must_contain_quality_metrics` (list[str]): each metric key must appear in
  `quality_report.metrics`.
- `parser_disagreement_rate_max` (float): assert disagreement <= value.
- `repair_resolution_rate_min` (float): assert resolution >= value.

Missing keys are not asserted (no false failures from over-specification).

## Adding a fixture

1. Drop the input document under `fixtures/`. PDFs, markdown, HTML, and txt
   all work via the standard pipeline.
2. Run a one-off `parse_document` against it locally and inspect the output
   (a sketch follows this list).
3. Hand-write `<name>.expected.json` with the constraints you want to lock
   down. Prefer ranges over exact counts where reasonable variance exists.
4. Run `python3.11 -m unittest tests.test_regression`. It auto-discovers.
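
A minimal sketch for step 2 (the fixture path and scratch directory are
placeholders; the fields printed are the ones the tolerances assert on):

```python
from pathlib import Path

from zsgdp.pipeline import parse_document

# Placeholder paths -- point these at your new fixture and any scratch dir.
parsed = parse_document(
    Path("tests/regression/fixtures/my_doc.input.md"),
    Path("/tmp/zsgdp-fixture-out"),
)

# Counts and flags you may want to lock down in <name>.expected.json.
print(len(parsed.elements), len(parsed.tables), len(parsed.figures), len(parsed.chunks))
print(parsed.quality_report.score, parsed.quality_report.has_blocking_failures)
```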

## Performance baselines (opt-in)

A fixture may include a `performance` block with throughput floors:

```json
{
  "performance": {
    "repeats": 2,
    "max_elapsed_seconds": 2.0,
    "min_pages_per_second": 0.5,
    "always_enforce": false
  }
}
```

Keys:

- `repeats` (int, default 2): number of warm parses to time. The median
  elapsed time is compared against the floor, so a single cold-import
  outlier does not trip it.
- `max_elapsed_seconds`: the median parse time must stay under this.
- `min_pages_per_second`: the median pages/sec must meet or beat this.
- `always_enforce` (bool, default false): when true, perf is always checked.

Otherwise perf is gated on `ZSGDP_REGRESSION_PERF=1` so slow CI runners
don't produce noise. Floors should be **catastrophic-regression guards** — set
them ~50–100x slacker than your local median, not tight perf bars. The
point is to catch "parsing a tiny markdown doc now takes 30 seconds,"
not to track 5% perf shifts.

To set a baseline for a new fixture: parse it 5 times locally, take the
median, and multiply by ~10–80x for the `max_elapsed_seconds` floor, as in
the sketch below.
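
A minimal sketch of that procedure (the fixture path is a placeholder, and
the 80x multiplier is just one point in the recommended range; the timing
loop mirrors `_measure_parse` in `test_regression.py`):

```python
import statistics
import tempfile
import time
from pathlib import Path

from zsgdp.pipeline import parse_document

samples = []
for _ in range(5):
    # Fresh output dir per run, as the regression runner does.
    with tempfile.TemporaryDirectory() as tmp:
        started = time.perf_counter()
        parse_document(Path("tests/regression/fixtures/my_doc.input.md"), Path(tmp) / "out")
        samples.append(time.perf_counter() - started)

# Catastrophic-regression floor: roughly 80x the warm median.
print(f"max_elapsed_seconds floor: {statistics.median(samples) * 80:.2f}")
```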

## When a regression fires

The failure message points at the specific tolerance that broke. Don't blindly
loosen the tolerance — investigate whether the regression is real first
(parser-version bump, repair-loop drift, chunk planner change). If the new
behavior is intentional and better, regenerate the snapshot.
tests/regression/__init__.py
ADDED
File without changes
tests/regression/fixtures/markdown_basic.expected.json
ADDED
@@ -0,0 +1,31 @@
{
  "name": "markdown_basic",
  "tolerances": {
    "quality_score_min": 0.9,
    "blocking_failures": false,
    "element_count_range": [4, 8],
    "table_count": 1,
    "figure_count": 0,
    "chunk_count_min": 4,
    "must_contain_markdown": [
      "# Quarterly Report",
      "Apples grow on trees in the orchard",
      "| Region | Q1 | Q2 |",
      "Submarines navigate beneath the ocean"
    ],
    "must_not_contain_markdown": ["TODO", "FIXME"],
    "must_contain_quality_metrics": [
      "document_text_coverage",
      "parser_disagreement_rate",
      "repair_resolution_rate"
    ],
    "parser_disagreement_rate_max": 0.5,
    "repair_resolution_rate_min": 0.5
  },
  "performance": {
    "_comment": "Floors are catastrophic-regression guards, not tight perf bars. Median of 2 warm runs (cold-import outlier dropped) was ~6ms locally; the floor is 80x that to absorb slow CI. Enable with ZSGDP_REGRESSION_PERF=1 or set always_enforce: true.",
    "repeats": 2,
    "max_elapsed_seconds": 2.0,
    "min_pages_per_second": 0.5
  }
}
tests/regression/fixtures/markdown_basic.input.md
ADDED
@@ -0,0 +1,14 @@
# Quarterly Report

Apples grow on trees in the orchard during the autumn harvest season.

## Revenue

| Region | Q1 | Q2 |
| --- | --- | --- |
| North America | 10 | 12 |
| Europe | 8 | 9 |

## Outlook

Submarines navigate beneath the ocean using sonar pulses across waters.
tests/regression/test_regression.py
ADDED
@@ -0,0 +1,255 @@
"""Snapshot regression tests against fixtures in this directory.

Discovery: every <name>.expected.json under fixtures/ pairs with a sibling
<name>.input.<ext>. The runner parses the input, then asserts each tolerance
in the expected file. Tolerance keys are documented in the README.md beside
this file.

Performance baselines are opt-in per fixture via a `performance` block in
the expected file. They run only when ZSGDP_REGRESSION_PERF=1 (or when the
performance block has `always_enforce: true`) so a slow CI runner does not
fail on transient noise. When enabled, the parse is repeated (`repeats`,
default 2) and the median elapsed time is compared against the floor.
"""

from __future__ import annotations

import json
import os
import statistics
import tempfile
import time
import unittest
import unittest.mock
from pathlib import Path
from typing import Any

from zsgdp.pipeline import parse_document

FIXTURE_DIR = Path(__file__).parent / "fixtures"


def _discover_fixtures() -> list[tuple[str, Path, Path]]:
    pairs: list[tuple[str, Path, Path]] = []
    if not FIXTURE_DIR.exists():
        return pairs
    for expected in sorted(FIXTURE_DIR.glob("*.expected.json")):
        name = expected.name[: -len(".expected.json")]
        candidates = sorted(FIXTURE_DIR.glob(f"{name}.input.*"))
        if not candidates:
            continue
        pairs.append((name, candidates[0], expected))
    return pairs


def _check_int_or_range(actual: int, exact: Any, range_value: Any, label: str) -> str | None:
    if exact is not None and int(exact) != actual:
        return f"{label}: expected {exact}, got {actual}"
    if isinstance(range_value, (list, tuple)) and len(range_value) == 2:
        lo, hi = int(range_value[0]), int(range_value[1])
        if not (lo <= actual <= hi):
            return f"{label}: expected in [{lo}, {hi}], got {actual}"
    return None


def _evaluate(parsed, tolerances: dict[str, Any]) -> list[str]:
    failures: list[str] = []
    score = float(parsed.quality_report.score)
    if "quality_score_min" in tolerances and score < float(tolerances["quality_score_min"]):
        failures.append(f"quality_score: {score:.3f} < {tolerances['quality_score_min']}")
    if "quality_score_max" in tolerances and score > float(tolerances["quality_score_max"]):
        failures.append(f"quality_score: {score:.3f} > {tolerances['quality_score_max']}")

    for label, count, exact_key, range_key in (
        ("element_count", len(parsed.elements), "element_count", "element_count_range"),
        ("table_count", len(parsed.tables), "table_count", "table_count_range"),
        ("figure_count", len(parsed.figures), "figure_count", "figure_count_range"),
    ):
        message = _check_int_or_range(count, tolerances.get(exact_key), tolerances.get(range_key), label)
        if message:
            failures.append(message)

    chunk_count = len(parsed.chunks)
    if "chunk_count_min" in tolerances and chunk_count < int(tolerances["chunk_count_min"]):
        failures.append(f"chunk_count: {chunk_count} < {tolerances['chunk_count_min']}")
    if "chunk_count_max" in tolerances and chunk_count > int(tolerances["chunk_count_max"]):
        failures.append(f"chunk_count: {chunk_count} > {tolerances['chunk_count_max']}")

    if "blocking_failures" in tolerances:
        actual = parsed.quality_report.has_blocking_failures
        expected = bool(tolerances["blocking_failures"])
        if actual != expected:
            failures.append(f"blocking_failures: expected {expected}, got {actual}")

    md = parsed.to_markdown()
    for needle in tolerances.get("must_contain_markdown", []) or []:
        if str(needle) not in md:
            failures.append(f"must_contain_markdown: {needle!r} not found")
    for needle in tolerances.get("must_not_contain_markdown", []) or []:
        if str(needle) in md:
            failures.append(f"must_not_contain_markdown: {needle!r} present")

    metrics = parsed.quality_report.metrics
    for key in tolerances.get("must_contain_quality_metrics", []) or []:
        if key not in metrics:
            failures.append(f"must_contain_quality_metrics: {key!r} missing")

    if "parser_disagreement_rate_max" in tolerances:
        rate = float(metrics.get("parser_disagreement_rate", 0.0))
        if rate > float(tolerances["parser_disagreement_rate_max"]):
            failures.append(
                f"parser_disagreement_rate: {rate:.3f} > {tolerances['parser_disagreement_rate_max']}"
            )
    if "repair_resolution_rate_min" in tolerances:
        rate = float(metrics.get("repair_resolution_rate", 1.0))
        if rate < float(tolerances["repair_resolution_rate_min"]):
            failures.append(
                f"repair_resolution_rate: {rate:.3f} < {tolerances['repair_resolution_rate_min']}"
            )

    return failures


def _perf_enforcement_enabled(performance: dict[str, Any]) -> bool:
    if performance.get("always_enforce"):
        return True
    return os.environ.get("ZSGDP_REGRESSION_PERF", "").strip().lower() in {"1", "true", "yes"}


def _measure_parse(input_path: Path, *, config_path: Path | None, selected_parsers, repeats: int) -> tuple[Any, list[float]]:
    """Parse the input N times, returning (last_parsed, list_of_elapsed_seconds).

    Uses a fresh temp output directory for each run so disk caching effects
    are roughly equal across runs. The last parsed document is returned for
    tolerance evaluation; per-run elapsed times feed the perf assertion.
    """

    elapsed: list[float] = []
    parsed = None
    for _ in range(max(1, repeats)):
        with tempfile.TemporaryDirectory() as tmp:
            started = time.perf_counter()
            parsed = parse_document(
                input_path,
                Path(tmp) / "out",
                config_path=config_path,
                selected_parsers=selected_parsers,
            )
            elapsed.append(time.perf_counter() - started)
    return parsed, elapsed


def _evaluate_performance(parsed, performance: dict[str, Any], elapsed_seconds: list[float]) -> list[str]:
    failures: list[str] = []
    if not elapsed_seconds:
        return failures

    median_elapsed = statistics.median(elapsed_seconds)
    page_count = max(len(parsed.pages), 1)
    median_pages_per_second = page_count / median_elapsed if median_elapsed > 0 else float("inf")

    max_elapsed = performance.get("max_elapsed_seconds")
    if max_elapsed is not None and median_elapsed > float(max_elapsed):
        failures.append(
            f"performance.max_elapsed_seconds: median {median_elapsed:.2f}s > {max_elapsed}s "
            f"(runs={len(elapsed_seconds)})"
        )

    min_pps = performance.get("min_pages_per_second")
    if min_pps is not None and median_pages_per_second < float(min_pps):
        failures.append(
            f"performance.min_pages_per_second: median {median_pages_per_second:.2f} < {min_pps} "
            f"(runs={len(elapsed_seconds)})"
        )

    return failures


class RegressionFixturesTest(unittest.TestCase):
    def test_regression_fixtures_match_snapshots(self):
        fixtures = _discover_fixtures()
        if not fixtures:
            self.skipTest("No regression fixtures present.")

        all_failures: list[str] = []
        for name, input_path, expected_path in fixtures:
            with self.subTest(fixture=name):
                expected = json.loads(expected_path.read_text(encoding="utf-8"))
                tolerances = expected.get("tolerances") or {}
                performance = expected.get("performance") or {}
                config_rel = expected.get("config")
                config_path = Path(config_rel) if config_rel else None
                if config_path and not config_path.is_absolute():
                    config_path = Path(__file__).resolve().parents[2] / config_path
                selected_parsers = expected.get("selected_parsers")

                perf_enabled = bool(performance) and _perf_enforcement_enabled(performance)
                repeats = int(performance.get("repeats", 2)) if perf_enabled else 1

                parsed, elapsed = _measure_parse(
                    input_path,
                    config_path=config_path,
                    selected_parsers=selected_parsers,
                    repeats=repeats,
                )

                failures = _evaluate(parsed, tolerances)
                if perf_enabled:
                    failures.extend(_evaluate_performance(parsed, performance, elapsed))
                if failures:
                    all_failures.append(f"[{name}] " + "; ".join(failures))

        if all_failures:
            self.fail("\n".join(all_failures))


class PerformanceEvaluatorTests(unittest.TestCase):
    """Unit tests for the perf-evaluation helpers, separate from fixture discovery."""

    def test_max_elapsed_floor_fires_when_too_slow(self):
        from types import SimpleNamespace

        parsed = SimpleNamespace(pages=[{"page_num": 1}])
        failures = _evaluate_performance(parsed, {"max_elapsed_seconds": 0.1}, [0.5, 0.5])
        self.assertEqual(len(failures), 1)
        self.assertIn("max_elapsed_seconds", failures[0])

    def test_min_pages_per_second_fires_when_too_slow(self):
        from types import SimpleNamespace

        parsed = SimpleNamespace(pages=[{"page_num": 1}])
        # 1 page in 10s => 0.1 pps, floor 1.0 => fail.
        failures = _evaluate_performance(parsed, {"min_pages_per_second": 1.0}, [10.0, 10.0])
        self.assertEqual(len(failures), 1)
        self.assertIn("min_pages_per_second", failures[0])

    def test_passing_floors_yield_no_failures(self):
        from types import SimpleNamespace

        parsed = SimpleNamespace(pages=[{"page_num": 1}, {"page_num": 2}])
        # 2 pages in 0.5s => 4 pps; floor 1.0 pps and max 2s.
        failures = _evaluate_performance(
            parsed,
            {"max_elapsed_seconds": 2.0, "min_pages_per_second": 1.0},
            [0.5, 0.5, 0.5],
        )
        self.assertEqual(failures, [])

    def test_median_strips_cold_outlier(self):
        from types import SimpleNamespace

        parsed = SimpleNamespace(pages=[{"page_num": 1}])
        # First run cold (5s), next two warm (0.1s). Median = 0.1s; floor 1s passes.
        failures = _evaluate_performance(parsed, {"max_elapsed_seconds": 1.0}, [5.0, 0.1, 0.1])
        self.assertEqual(failures, [])

    def test_perf_enforcement_gating(self):
        with unittest.mock.patch.dict("os.environ", {"ZSGDP_REGRESSION_PERF": "0"}, clear=False):
            self.assertFalse(_perf_enforcement_enabled({"max_elapsed_seconds": 1.0}))
            self.assertTrue(_perf_enforcement_enabled({"always_enforce": True}))

        with unittest.mock.patch.dict("os.environ", {"ZSGDP_REGRESSION_PERF": "1"}, clear=False):
            self.assertTrue(_perf_enforcement_enabled({"max_elapsed_seconds": 1.0}))


if __name__ == "__main__":
    unittest.main()
tests/test_ablation_runner.py
ADDED
@@ -0,0 +1,133 @@
"""Tests for parser-contribution metrics and the ablation runner."""

from __future__ import annotations

import json
import tempfile
import unittest
from pathlib import Path

from zsgdp.benchmarks.ablation_runner import ABLATION_METRIC_KEYS, run_parser_ablations
from zsgdp.benchmarks.parser_quality import run_parser_benchmark


class TestParserContribution(unittest.TestCase):
    def test_contribution_counts_appear_in_summary(self):
        with tempfile.TemporaryDirectory() as tmp:
            tmp = Path(tmp)
            src = tmp / "in"
            src.mkdir()
            (src / "doc.md").write_text("# Doc\n\nA paragraph.\n", encoding="utf-8")

            summary = run_parser_benchmark(src, tmp / "out", dataset_name="custom_folder")

            doc = summary["documents"][0]
            self.assertIn("parser_contribution_counts", doc)
            self.assertIn("parser_contribution_fractions", doc)
            self.assertGreater(sum(doc["parser_contribution_counts"].values()), 0)
            # The sum of fractions should be ~1.0 across parsers.
            total_fraction = sum(doc["parser_contribution_fractions"].values())
            self.assertAlmostEqual(total_fraction, 1.0, places=6)

            top_summary = summary["parser_contribution_summary"]
            self.assertGreater(top_summary["total"], 0)
            self.assertEqual(set(top_summary["counts"]), set(top_summary["fractions"]))

    def test_text_parser_dominates_markdown_doc(self):
        with tempfile.TemporaryDirectory() as tmp:
            tmp = Path(tmp)
            src = tmp / "in"
            src.mkdir()
            (src / "doc.md").write_text("# Doc\n\nPara one.\n\nPara two.\n", encoding="utf-8")

            summary = run_parser_benchmark(src, tmp / "out", dataset_name="custom_folder")

            top_counts = summary["parser_contribution_summary"]["counts"]
            self.assertIn("text", top_counts)
            text_count = top_counts["text"]
            other_count = sum(value for parser, value in top_counts.items() if parser != "text")
            self.assertGreaterEqual(text_count, other_count)


class TestRunParserAblations(unittest.TestCase):
    def test_two_arms_plus_merged(self):
        with tempfile.TemporaryDirectory() as tmp:
            tmp = Path(tmp)
            src = tmp / "in"
            src.mkdir()
            (src / "doc.md").write_text("# Doc\n\nPara one.\n\nPara two.\n", encoding="utf-8")
            out = tmp / "out"

            comparison = run_parser_ablations(
                src,
                out,
                parsers=["text", "pymupdf"],
                dataset_name="custom_folder",
            )

            self.assertEqual(comparison["arm_count"], 3)
            arms = sorted(row["arm"] for row in comparison["rows"])
            self.assertEqual(arms, ["merged", "pymupdf", "text"])
            self.assertTrue((out / "arm_text").exists())
            self.assertTrue((out / "arm_pymupdf").exists())
            self.assertTrue((out / "arm_merged").exists())
            self.assertTrue((out / "ablation_comparison.csv").exists())
            self.assertTrue((out / "ablation_summary.json").exists())

            # Each arm record carries the canonical metric keys (subset of those present).
            for row in comparison["rows"]:
                self.assertIn("mean_quality_score", row)

    def test_no_merged_when_disabled(self):
        with tempfile.TemporaryDirectory() as tmp:
            tmp = Path(tmp)
            src = tmp / "in"
            src.mkdir()
            (src / "doc.md").write_text("# Doc\n\nPara.\n", encoding="utf-8")

            comparison = run_parser_ablations(
                src,
                tmp / "out",
                parsers=["text", "pymupdf"],
                dataset_name="custom_folder",
                include_merged=False,
            )
            self.assertEqual(comparison["arm_count"], 2)
            self.assertNotIn("merged", {row["arm"] for row in comparison["rows"]})

    def test_single_parser_ablation_skips_merged_arm(self):
        with tempfile.TemporaryDirectory() as tmp:
            tmp = Path(tmp)
            src = tmp / "in"
            src.mkdir()
            (src / "doc.md").write_text("# Doc\n\nPara.\n", encoding="utf-8")

            comparison = run_parser_ablations(
                src,
                tmp / "out",
                parsers=["text"],
                dataset_name="custom_folder",
            )
            # Single parser + include_merged defaults true, but len(parsers) == 1
            # so merged would be redundant and is skipped.
            self.assertEqual(comparison["arm_count"], 1)
            self.assertEqual(comparison["rows"][0]["arm"], "text")

    def test_empty_parsers_raises(self):
        with self.assertRaises(ValueError):
            run_parser_ablations(".", "./out", parsers=[])

    def test_metric_keys_constant_matches_summary_shape(self):
        with tempfile.TemporaryDirectory() as tmp:
            tmp = Path(tmp)
            src = tmp / "in"
            src.mkdir()
            (src / "doc.md").write_text("# Doc\n\nPara.\n", encoding="utf-8")

            summary = run_parser_benchmark(src, tmp / "out", dataset_name="custom_folder")
            for key in ABLATION_METRIC_KEYS:
                self.assertIn(key, summary, f"benchmark summary missing key {key}")


if __name__ == "__main__":
    unittest.main()
tests/test_app.py
ADDED
@@ -0,0 +1,141 @@
import tempfile
import unittest
from pathlib import Path
from unittest.mock import patch

try:
    import app as space_app
except RuntimeError as exc:
    space_app = None
    APP_IMPORT_ERROR = str(exc)
else:
    APP_IMPORT_ERROR = ""


class _UploadedFile:
    def __init__(self, name: str):
        self.name = name


class AppTests(unittest.TestCase):
    def test_parse_uploaded_document_returns_artifact_validation(self):
        if space_app is None:
            self.skipTest(APP_IMPORT_ERROR)

        with tempfile.TemporaryDirectory() as tmp:
            input_path = Path(tmp) / "sample.md"
            input_path.write_text("# Report\n\nHello from the Space UI.\n", encoding="utf-8")

            outputs = space_app.parse_uploaded_document(_UploadedFile(str(input_path)), "Default lightweight")

            self.assertEqual(len(outputs), 11)
            summary = outputs[1]
            artifact_validation = outputs[8]
            archive_path = outputs[9]
            individual_files = outputs[10]
            self.assertTrue(summary["artifact_manifest_valid"])
            self.assertTrue(artifact_validation["valid"])
            self.assertTrue(Path(archive_path).exists())
            # Per-artifact downloads.
            self.assertIsInstance(individual_files, list)
            self.assertGreater(len(individual_files), 0)
            names = [Path(p).name for p in individual_files]
            # Core artifacts every parse should produce.
            for required in ("parsed_document.json", "document.md", "chunks.jsonl", "artifact_manifest.json"):
                self.assertIn(required, names)
            # Each path actually exists on disk so Gradio can serve it.
            for path in individual_files:
                self.assertTrue(Path(path).exists(), f"missing: {path}")
            # The archive zip is a separate artifact and must NOT appear in the
            # per-artifact list (zip is the bundled-everything view).
            self.assertNotIn(Path(archive_path).name, names)
            # Summary records the per-artifact count.
            self.assertEqual(summary["individual_artifact_count"], len(individual_files))


class UploadGuardTests(unittest.TestCase):
    def test_oversized_upload_rejected_with_clear_message(self):
        if space_app is None:
            self.skipTest(APP_IMPORT_ERROR)

        with tempfile.TemporaryDirectory() as tmp:
            input_path = Path(tmp) / "huge.md"
            input_path.write_text("# Big\n\n" + "x" * 4096, encoding="utf-8")

            with patch.object(space_app, "MAX_UPLOAD_BYTES", 1024):
                outputs = space_app.parse_uploaded_document(
                    _UploadedFile(str(input_path)), "Default lightweight"
                )

            summary = outputs[1]
            self.assertTrue(summary.get("rejected"))
            self.assertIn("MB", summary["error"])

    def test_high_page_count_rejected(self):
        if space_app is None:
            self.skipTest(APP_IMPORT_ERROR)

        with tempfile.TemporaryDirectory() as tmp:
            input_path = Path(tmp) / "doc.md"
            input_path.write_text("# Doc\n\nSomething small.\n", encoding="utf-8")

            class _FakeProfile:
                page_count = 1000

            with patch.object(space_app, "MAX_PAGE_COUNT", 50), patch.object(
                space_app, "profile_document", return_value=_FakeProfile()
            ):
                outputs = space_app.parse_uploaded_document(
                    _UploadedFile(str(input_path)), "Default lightweight"
                )

            summary = outputs[1]
            self.assertTrue(summary.get("rejected"))
            self.assertIn("pages", summary["error"])

    def test_missing_upload_path_rejected(self):
        if space_app is None:
            self.skipTest(APP_IMPORT_ERROR)

        outputs = space_app.parse_uploaded_document(
            _UploadedFile("/tmp/zsgdp-does-not-exist.md"), "Default lightweight"
        )
        summary = outputs[1]
        self.assertTrue(summary.get("rejected"))
        self.assertIn("missing", summary["error"].lower())

    def test_error_paths_return_full_tuple_width(self):
        # Drift guard: every return path (success + error) must yield 11 outputs
        # so the Gradio click handler doesn't error on shape mismatch.
        if space_app is None:
            self.skipTest(APP_IMPORT_ERROR)

        # No upload at all.
        outputs = space_app.parse_uploaded_document(None, "Default lightweight")
        self.assertEqual(len(outputs), 11)
        self.assertEqual(outputs[10], [])

        # Missing-file rejection.
        outputs = space_app.parse_uploaded_document(
            _UploadedFile("/tmp/zsgdp-does-not-exist-xyz.md"), "Default lightweight"
        )
        self.assertEqual(len(outputs), 11)
        self.assertEqual(outputs[10], [])

    def test_normal_upload_passes_guards(self):
        if space_app is None:
            self.skipTest(APP_IMPORT_ERROR)

        with tempfile.TemporaryDirectory() as tmp:
            input_path = Path(tmp) / "ok.md"
            input_path.write_text("# OK\n\nA normal document.\n", encoding="utf-8")
            outputs = space_app.parse_uploaded_document(
                _UploadedFile(str(input_path)), "Default lightweight"
            )

            summary = outputs[1]
            self.assertNotIn("rejected", summary)


if __name__ == "__main__":
    unittest.main()
tests/test_artifacts.py
ADDED
@@ -0,0 +1,82 @@
import json
import tempfile
import unittest
from pathlib import Path

from zsgdp.artifacts import MANIFEST_SCHEMA_VERSION, validate_artifact_manifest
from zsgdp.cli import main
from zsgdp.pipeline import parse_document
from zsgdp.schema import SCHEMA_VERSION


class ArtifactManifestTests(unittest.TestCase):
    def test_parse_writes_valid_artifact_manifest(self):
        with tempfile.TemporaryDirectory() as tmp:
            tmp_path = Path(tmp)
            input_path = tmp_path / "sample.md"
            output_dir = tmp_path / "out"
            input_path.write_text("# Report\n\nHello world.\n", encoding="utf-8")

            parsed = parse_document(input_path, output_dir)
            manifest = json.loads((output_dir / "artifact_manifest.json").read_text(encoding="utf-8"))
            validation = validate_artifact_manifest(output_dir)

            self.assertEqual(manifest["doc_id"], parsed.doc_id)
            self.assertEqual(manifest["counts"]["chunks"], len(parsed.chunks))
            self.assertTrue(any(record["path"] == "parsed_document.json" for record in manifest["files"]))
            self.assertTrue(validation["valid"])
            self.assertEqual(validation["checked_count"], manifest["artifact_count"])

    def test_manifest_records_schema_versions(self):
        with tempfile.TemporaryDirectory() as tmp:
            tmp_path = Path(tmp)
            input_path = tmp_path / "sample.md"
            output_dir = tmp_path / "out"
            input_path.write_text("# Report\n\nHello.\n", encoding="utf-8")

            parsed = parse_document(input_path, output_dir)
            manifest = json.loads((output_dir / "artifact_manifest.json").read_text(encoding="utf-8"))

            # Manifest format version is its own integer; parsed-document
            # schema version is a string echoed from the dataclass.
            self.assertEqual(manifest["schema_version"], MANIFEST_SCHEMA_VERSION)
            self.assertEqual(manifest["parsed_document_schema_version"], SCHEMA_VERSION)
            self.assertEqual(parsed.schema_version, SCHEMA_VERSION)

            # Validation echoes both versions so callers can gate on them.
            validation = validate_artifact_manifest(output_dir)
            self.assertEqual(validation["manifest_schema_version"], MANIFEST_SCHEMA_VERSION)
            self.assertEqual(validation["parsed_document_schema_version"], SCHEMA_VERSION)

    def test_validate_artifact_manifest_detects_checksum_mismatch(self):
        with tempfile.TemporaryDirectory() as tmp:
            tmp_path = Path(tmp)
            input_path = tmp_path / "sample.md"
            output_dir = tmp_path / "out"
            input_path.write_text("# Report\n\nHello world.\n", encoding="utf-8")
            parse_document(input_path, output_dir)

            (output_dir / "document.md").write_text("tampered\n", encoding="utf-8")
            validation = validate_artifact_manifest(output_dir)

            self.assertFalse(validation["valid"])
            self.assertTrue(any(error == "SHA-256 mismatch: document.md" for error in validation["errors"]))

    def test_validate_artifacts_cli_writes_report(self):
        with tempfile.TemporaryDirectory() as tmp:
            tmp_path = Path(tmp)
            input_path = tmp_path / "sample.md"
            output_dir = tmp_path / "out"
            report_path = tmp_path / "validation.json"
            input_path.write_text("# Report\n\nHello world.\n", encoding="utf-8")
            parse_document(input_path, output_dir)

            code = main(["validate-artifacts", "--parsed", str(output_dir), "--output", str(report_path)])

            self.assertEqual(code, 0)
            self.assertTrue(report_path.exists())
            self.assertTrue(json.loads(report_path.read_text(encoding="utf-8"))["valid"])


if __name__ == "__main__":
    unittest.main()
tests/test_benchmark.py
ADDED
@@ -0,0 +1,55 @@
import tempfile
import unittest
from pathlib import Path

from zsgdp.benchmarks.parser_quality import run_parser_benchmark
from zsgdp.cli import main


class BenchmarkTests(unittest.TestCase):
    def test_run_parser_benchmark_writes_results(self):
        with tempfile.TemporaryDirectory() as tmp:
            tmp_path = Path(tmp)
            docs = tmp_path / "docs"
            out = tmp_path / "bench"
            docs.mkdir()
            (docs / "one.md").write_text("# One\n\nHello world", encoding="utf-8")

            summary = run_parser_benchmark(docs, out)

            self.assertEqual(summary["document_count"], 1)
            self.assertIn("fixed_token_baseline", summary["documents"][0]["chunk_strategy_counts"])
            self.assertTrue(summary["chunk_strategy_leaderboard"])
            self.assertIn("structure_quality", summary)
            self.assertIn("chunking_quality", summary)
            self.assertIn("throughput", summary)
            self.assertIn("ablation_plan", summary)
            self.assertTrue((out / "results.json").exists())
            self.assertTrue((out / "leaderboard.csv").exists())
            self.assertTrue((out / "parser_runs.csv").exists())
            self.assertTrue((out / "chunk_runs.csv").exists())
            self.assertTrue((out / "structure_runs.csv").exists())
            self.assertTrue((out / "chunk_quality.csv").exists())
            self.assertTrue((out / "throughput_runs.csv").exists())
            self.assertTrue((out / "ablations.json").exists())

    def test_benchmark_cli(self):
        with tempfile.TemporaryDirectory() as tmp:
            tmp_path = Path(tmp)
            docs = tmp_path / "docs"
            out = tmp_path / "bench"
            docs.mkdir()
            (docs / "one.md").write_text("# One\n\nHello world", encoding="utf-8")

            code = main(["benchmark", "--input", str(docs), "--output", str(out), "--parsers", "text"])

            self.assertEqual(code, 0)
            self.assertTrue((out / "leaderboard.csv").exists())
            self.assertTrue((out / "chunk_runs.csv").exists())
            self.assertTrue((out / "structure_runs.csv").exists())
            self.assertTrue((out / "chunk_quality.csv").exists())
            self.assertTrue((out / "throughput_runs.csv").exists())


if __name__ == "__main__":
    unittest.main()
tests/test_chunking.py
ADDED
@@ -0,0 +1,286 @@
import unittest

from zsgdp.chunking import build_agentic_chunks
from zsgdp.config import load_config
from zsgdp.schema import DocumentProfile, Element, FigureObject, PageProfile, ParsedDocument, QualityReport, TableObject
from zsgdp.verify import verify_chunks


class ChunkingTests(unittest.TestCase):
    def test_agentic_chunking_builds_parent_child_chunks(self):
        profile = DocumentProfile(
            doc_id="d1",
            source_path="sample.md",
            file_type="markdown",
            page_count=1,
            extension=".md",
            pages=[PageProfile(page_num=1, digital_text_chars=120, digital_text_quality=1.0)],
        )
        parsed = ParsedDocument(
            doc_id="d1",
            source_path="sample.md",
            file_type="markdown",
            quality_report=QualityReport(score=0.95),
        )
        parsed.elements.extend(
            [
                Element("e1", "d1", 1, "title", markdown="# Report", reading_order=1, source_parser="text"),
                Element("e2", "d1", 1, "paragraph", text=" ".join(["alpha"] * 80), reading_order=2, source_parser="text"),
            ]
        )

        chunks = build_agentic_chunks(parsed, profile, load_config())

        self.assertTrue(any(chunk.content_type == "parent" for chunk in chunks))
        self.assertTrue(any(chunk.parent_chunk_id for chunk in chunks))
        self.assertEqual(parsed.provenance["chunking"]["plan"]["target_tokens"], 512)

    def test_chunk_readiness_adds_metrics(self):
        profile = DocumentProfile(
            doc_id="d1",
            source_path="sample.md",
            file_type="markdown",
            page_count=1,
            extension=".md",
            pages=[PageProfile(page_num=1, digital_text_chars=120, digital_text_quality=1.0)],
        )
        parsed = ParsedDocument(
            doc_id="d1",
            source_path="sample.md",
            file_type="markdown",
            quality_report=QualityReport(score=0.95),
        )
        parsed.elements.append(
            Element("e1", "d1", 1, "paragraph", text=" ".join(["alpha"] * 80), reading_order=1, source_parser="text")
        )
        parsed.chunks = build_agentic_chunks(parsed, profile, load_config())

        report = verify_chunks(parsed, load_config())

        self.assertEqual(report.metrics["chunk_count"], len(parsed.chunks))
        self.assertIn("fixed_token_baseline", report.metrics["chunk_strategy_counts"])
        self.assertIn("recursive_structure", report.metrics["chunk_strategy_counts"])

    def test_fixed_token_baseline_chunks_are_emitted_with_provenance(self):
        profile = DocumentProfile(
            doc_id="d1",
            source_path="sample.md",
            file_type="markdown",
            page_count=2,
            extension=".md",
            pages=[
                PageProfile(page_num=1, digital_text_chars=120, digital_text_quality=1.0),
                PageProfile(page_num=2, digital_text_chars=120, digital_text_quality=1.0),
            ],
        )
        parsed = ParsedDocument(
            doc_id="d1",
            source_path="sample.md",
            file_type="markdown",
            quality_report=QualityReport(score=0.95),
        )
        parsed.elements.extend(
            [
                Element("e1", "d1", 1, "paragraph", text=" ".join(["alpha"] * 18), reading_order=1, source_parser="text"),
                Element("e2", "d1", 2, "paragraph", text=" ".join(["beta"] * 18), reading_order=1, source_parser="text"),
            ]
        )
        config = load_config(overrides={"chunking": {"target_tokens": 10, "overlap_ratio": 0.2}})

        chunks = build_agentic_chunks(parsed, profile, config)
        baseline_chunks = [chunk for chunk in chunks if chunk.strategy == "fixed_token_baseline"]

        self.assertGreaterEqual(len(baseline_chunks), 4)
        self.assertEqual(baseline_chunks[0].element_ids, ["e1"])
        self.assertEqual(baseline_chunks[-1].page_end, 2)
        self.assertEqual(parsed.provenance["chunking"]["fixed_token_baseline_count"], len(baseline_chunks))

    def test_figure_without_caption_still_gets_visual_chunk(self):
        profile = DocumentProfile(
            doc_id="d1",
            source_path="sample.pdf",
            file_type="pdf",
            page_count=1,
            extension=".pdf",
            pages=[PageProfile(page_num=1, digital_text_chars=20, digital_text_quality=1.0)],
        )
        parsed = ParsedDocument(
            doc_id="d1",
            source_path="sample.pdf",
            file_type="pdf",
            quality_report=QualityReport(score=0.90),
        )
        parsed.elements.append(Element("e1", "d1", 1, "paragraph", text="hello world", reading_order=1, source_parser="pymupdf"))
        parsed.figures.append(
            FigureObject(
                figure_id="f1",
                page_num=1,
                image_path="/tmp/figure.png",
                confidence=0.5,
                source_parser="pymupdf",
            )
        )

        parsed.chunks = build_agentic_chunks(parsed, profile, load_config())
        report = verify_chunks(parsed, load_config())

        self.assertTrue(any(chunk.figure_ids == ["f1"] for chunk in parsed.chunks))
        self.assertEqual(report.metrics["figure_chunk_coverage"], 1.0)

    def test_table_chunk_keeps_multimodal_metadata(self):
        profile = DocumentProfile(
            doc_id="d1",
            source_path="sample.pdf",
            file_type="pdf",
            page_count=1,
            extension=".pdf",
            pages=[PageProfile(page_num=1, digital_text_chars=20, digital_text_quality=1.0)],
        )
        parsed = ParsedDocument(
            doc_id="d1",
            source_path="sample.pdf",
            file_type="pdf",
            quality_report=QualityReport(score=0.90),
        )
        parsed.elements.append(Element("e1", "d1", 1, "paragraph", text="hello world", reading_order=1, source_parser="pymupdf"))
        parsed.tables.append(
            TableObject(
                table_id="t1",
                page_nums=[1],
                bbox=[(1.0, 2.0, 3.0, 4.0)],
                markdown="| A | B |\n| --- | --- |\n| 1 | 2 |",
                natural_language_rendering="Table with columns A, B. Rows: 1: B=2.",
                confidence=0.82,
                source_parser="pymupdf",
                provenance={"crop_path": "/tmp/table.png", "source_parsers": ["pymupdf", "docling"]},
            )
        )

        parsed.chunks = build_agentic_chunks(parsed, profile, load_config())
        table_chunk = next(chunk for chunk in parsed.chunks if chunk.strategy == "table_object")

        self.assertEqual(table_chunk.text, "Table with columns A, B. Rows: 1: B=2.")
        self.assertEqual(table_chunk.metadata["markdown"], "| A | B |\n| --- | --- |\n| 1 | 2 |")
self.assertEqual(table_chunk.metadata["markdown"], "| A | B |\n| --- | --- |\n| 1 | 2 |")
|
| 164 |
+
self.assertEqual(table_chunk.metadata["bbox"], [(1.0, 2.0, 3.0, 4.0)])
|
| 165 |
+
self.assertEqual(table_chunk.metadata["crop_path"], "/tmp/table.png")
|
| 166 |
+
self.assertEqual(table_chunk.metadata["source_parsers"], ["pymupdf", "docling"])
|
| 167 |
+
|
| 168 |
+
def test_vision_guided_chunking_exports_visual_regions(self):
|
| 169 |
+
profile = DocumentProfile(
|
| 170 |
+
doc_id="d1",
|
| 171 |
+
source_path="sample.pdf",
|
| 172 |
+
file_type="pdf",
|
| 173 |
+
page_count=1,
|
| 174 |
+
extension=".pdf",
|
| 175 |
+
pages=[PageProfile(page_num=1, digital_text_chars=20, digital_text_quality=1.0)],
|
| 176 |
+
)
|
| 177 |
+
parsed = ParsedDocument(
|
| 178 |
+
doc_id="d1",
|
| 179 |
+
source_path="sample.pdf",
|
| 180 |
+
file_type="pdf",
|
| 181 |
+
quality_report=QualityReport(score=0.90),
|
| 182 |
+
)
|
| 183 |
+
parsed.elements.append(Element("e1", "d1", 1, "paragraph", text="hello world", reading_order=1, source_parser="pymupdf"))
|
| 184 |
+
parsed.tables.append(TableObject(table_id="t1", page_nums=[1], bbox=[(1.0, 2.0, 3.0, 4.0)], markdown="| A | B |\n| --- | --- |\n| 1 | 2 |"))
|
| 185 |
+
parsed.figures.append(FigureObject(figure_id="f1", page_num=1, bbox=(5.0, 6.0, 7.0, 8.0), source_parser="pymupdf"))
|
| 186 |
+
config = load_config(overrides={"chunking": {"vision_guided": True}})
|
| 187 |
+
|
| 188 |
+
parsed.chunks = build_agentic_chunks(parsed, profile, config)
|
| 189 |
+
|
| 190 |
+
visual_chunks = [chunk for chunk in parsed.chunks if chunk.content_type in {"table", "figure"}]
|
| 191 |
+
self.assertTrue(all(chunk.requires_visual_context for chunk in visual_chunks))
|
| 192 |
+
self.assertEqual(len(parsed.provenance["chunking"]["vision_regions"]), 2)
|
| 193 |
+
self.assertEqual(parsed.provenance["chunking"]["vision_regions"][0]["region_id"], "t1")
|
| 194 |
+
|
| 195 |
+
def test_advanced_chunking_flags_emit_strategy_chunks(self):
|
| 196 |
+
profile = DocumentProfile(
|
| 197 |
+
doc_id="d1",
|
| 198 |
+
source_path="sample.pdf",
|
| 199 |
+
file_type="pdf",
|
| 200 |
+
page_count=2,
|
| 201 |
+
extension=".pdf",
|
| 202 |
+
pages=[
|
| 203 |
+
PageProfile(page_num=1, digital_text_chars=200, digital_text_quality=1.0),
|
| 204 |
+
PageProfile(page_num=2, digital_text_chars=200, digital_text_quality=1.0),
|
| 205 |
+
],
|
| 206 |
+
)
|
| 207 |
+
parsed = ParsedDocument(
|
| 208 |
+
doc_id="d1",
|
| 209 |
+
source_path="sample.pdf",
|
| 210 |
+
file_type="pdf",
|
| 211 |
+
quality_report=QualityReport(score=0.92),
|
| 212 |
+
)
|
| 213 |
+
parsed.elements.extend(
|
| 214 |
+
[
|
| 215 |
+
Element("e1", "d1", 1, "heading", markdown="## Revenue", reading_order=1, source_parser="pymupdf"),
|
| 216 |
+
Element(
|
| 217 |
+
"e2",
|
| 218 |
+
"d1",
|
| 219 |
+
1,
|
| 220 |
+
"paragraph",
|
| 221 |
+
text="Revenue increased by 12 percent in Q1. Gross margin improved due to pricing.",
|
| 222 |
+
reading_order=2,
|
| 223 |
+
source_parser="pymupdf",
|
| 224 |
+
),
|
| 225 |
+
Element("e3", "d1", 2, "heading", markdown="## Safety", reading_order=1, source_parser="pymupdf"),
|
| 226 |
+
Element(
|
| 227 |
+
"e4",
|
| 228 |
+
"d1",
|
| 229 |
+
2,
|
| 230 |
+
"paragraph",
|
| 231 |
+
text="Safety inspections found three unresolved risks. Corrective actions are due in June.",
|
| 232 |
+
reading_order=2,
|
| 233 |
+
source_parser="pymupdf",
|
| 234 |
+
),
|
| 235 |
+
]
|
| 236 |
+
)
|
| 237 |
+
parsed.tables.append(
|
| 238 |
+
TableObject(
|
| 239 |
+
table_id="t1",
|
| 240 |
+
page_nums=[1],
|
| 241 |
+
markdown="| Metric | Value |\n| --- | --- |\n| Revenue | 12% |",
|
| 242 |
+
natural_language_rendering="Table t1 reports revenue growth of 12 percent.",
|
| 243 |
+
source_parser="pymupdf",
|
| 244 |
+
)
|
| 245 |
+
)
|
| 246 |
+
parsed.figures.append(
|
| 247 |
+
FigureObject(
|
| 248 |
+
figure_id="f1",
|
| 249 |
+
page_num=2,
|
| 250 |
+
caption="Risk trend chart shows open safety findings.",
|
| 251 |
+
source_parser="pymupdf",
|
| 252 |
+
)
|
| 253 |
+
)
|
| 254 |
+
config = load_config(
|
| 255 |
+
overrides={
|
| 256 |
+
"chunking": {
|
| 257 |
+
"contextual_retrieval": True,
|
| 258 |
+
"semantic_chunking": True,
|
| 259 |
+
"late_chunking": True,
|
| 260 |
+
"vision_guided": True,
|
| 261 |
+
"agentic_proposition_chunking": True,
|
| 262 |
+
}
|
| 263 |
+
}
|
| 264 |
+
)
|
| 265 |
+
|
| 266 |
+
parsed.chunks = build_agentic_chunks(parsed, profile, config)
|
| 267 |
+
strategies = {chunk.strategy for chunk in parsed.chunks}
|
| 268 |
+
|
| 269 |
+
self.assertIn("semantic", strategies)
|
| 270 |
+
self.assertIn("late", strategies)
|
| 271 |
+
self.assertIn("contextual_retrieval", strategies)
|
| 272 |
+
self.assertIn("vision_guided", strategies)
|
| 273 |
+
self.assertIn("agentic_proposition", strategies)
|
| 274 |
+
self.assertGreater(parsed.provenance["chunking"]["semantic_chunk_count"], 0)
|
| 275 |
+
self.assertGreater(parsed.provenance["chunking"]["late_chunk_count"], 0)
|
| 276 |
+
self.assertGreater(parsed.provenance["chunking"]["contextual_retrieval_chunk_count"], 0)
|
| 277 |
+
semantic_chunk = next(chunk for chunk in parsed.chunks if chunk.strategy == "semantic")
|
| 278 |
+
self.assertEqual(semantic_chunk.metadata["execution_mode"], "lexical_similarity_proxy")
|
| 279 |
+
contextual_chunk = next(chunk for chunk in parsed.chunks if chunk.strategy == "contextual_retrieval")
|
| 280 |
+
self.assertIn("source_chunk_id", contextual_chunk.metadata)
|
| 281 |
+
late_chunk = next(chunk for chunk in parsed.chunks if chunk.strategy == "late")
|
| 282 |
+
self.assertTrue(late_chunk.metadata["requires_token_level_embeddings"])
|
| 283 |
+
|
| 284 |
+
|
| 285 |
+
if __name__ == "__main__":
|
| 286 |
+
unittest.main()
|
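The fixed-token baseline test above pins down the window arithmetic: target_tokens=10 with overlap_ratio=0.2 gives a stride of 8, so each 18-token paragraph yields at least two overlapping windows. Here is a minimal standalone sketch of that arithmetic; fixed_token_windows is a hypothetical helper for illustration, not the zsgdp.chunking implementation.

# Hypothetical sketch of a fixed-token sliding window, matching the arithmetic
# the test above relies on: stride = target - round(target * overlap_ratio).
def fixed_token_windows(tokens: list[str], target_tokens: int, overlap_ratio: float) -> list[list[str]]:
    stride = max(1, target_tokens - int(target_tokens * overlap_ratio))
    windows = []
    start = 0
    while True:
        windows.append(tokens[start : start + target_tokens])
        if start + target_tokens >= len(tokens):
            break
        start += stride
    return windows


if __name__ == "__main__":
    paragraph = ["alpha"] * 18
    windows = fixed_token_windows(paragraph, target_tokens=10, overlap_ratio=0.2)
    assert len(windows) == 2  # two per element -> at least four chunks across two elements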
tests/test_cli_help.py
ADDED
@@ -0,0 +1,91 @@
"""Tests guarding CLI help text — examples must render and stay clean."""

from __future__ import annotations

import io
import unittest
from contextlib import redirect_stdout

from zsgdp.cli import _epilog, main


def _capture_help(argv: list[str]) -> str:
    """Run `zsgdp <argv> --help` and return captured stdout. SystemExit is normal."""

    buffer = io.StringIO()
    with redirect_stdout(buffer):
        try:
            main(argv + ["--help"])
        except SystemExit:
            pass
    return buffer.getvalue()


class EpilogFormatterTests(unittest.TestCase):
    def test_epilog_dedents_indented_source_string(self):
        rendered = _epilog(
            """
            zsgdp parse --input ./a --output ./b
            zsgdp parse --input ./c --output ./d
            """
        )
        # No double-indentation; first non-blank line begins with two spaces only.
        lines = rendered.splitlines()
        self.assertEqual(lines[0], "Examples:")
        self.assertTrue(lines[1].startswith("  zsgdp parse"))
        # No source-indent leak.
        self.assertNotIn("    zsgdp", rendered)

    def test_epilog_preserves_blank_lines_as_separators(self):
        rendered = _epilog(
            """
            line one

            line two
            """
        )
        self.assertIn("\n\n", rendered)


class SubcommandHelpTests(unittest.TestCase):
    def test_top_level_help_lists_examples_section(self):
        text = _capture_help([])
        self.assertIn("Examples:", text)
        self.assertIn("zsgdp parse", text)
        self.assertIn("docs/space_smoke.md", text)

    def test_parse_help_has_examples(self):
        text = _capture_help(["parse"])
        self.assertIn("Examples:", text)
        self.assertIn("zsgdp parse --input", text)
        self.assertIn("--config configs/docling.yaml", text)

    def test_benchmark_help_covers_three_dataset_modes(self):
        text = _capture_help(["benchmark"])
        self.assertIn("Examples:", text)
        self.assertIn("--dataset omnidocbench", text)
        self.assertIn("--dataset doclaynet", text)

    def test_benchmark_ablate_shows_merged_arm_pattern(self):
        text = _capture_help(["benchmark-ablate"])
        self.assertIn("--parser docling --parser pymupdf", text)
        self.assertIn("--no-merged", text)

    def test_run_gpu_tasks_documents_dry_run_vs_execute(self):
        text = _capture_help(["run-gpu-tasks"])
        self.assertIn("Dry-run", text)
        self.assertIn("--execute", text)

    def test_combine_benchmarks_shows_label_pairing(self):
        text = _capture_help(["combine-benchmarks"])
        self.assertIn("--label omnidocbench", text)
        self.assertIn("--label doclaynet", text)

    def test_preflight_help_documents_skip_flags(self):
        text = _capture_help(["preflight"])
        self.assertIn("--benchmark", text)
        self.assertIn("--skip-unit", text)


if __name__ == "__main__":
    unittest.main()
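The epilog tests above only constrain the observable output: an "Examples:" header, a uniform two-space indent, and blank lines preserved. A minimal sketch of a formatter that would satisfy them, assuming a hypothetical epilog_sketch helper built on textwrap.dedent; the real zsgdp.cli._epilog may differ.

# Sketch only: dedent the indented source string, then re-indent every
# non-blank line by exactly two spaces under an "Examples:" header.
from textwrap import dedent


def epilog_sketch(examples: str) -> str:
    body = dedent(examples).strip("\n")
    lines = [f"  {line}" if line.strip() else "" for line in body.splitlines()]
    return "Examples:\n" + "\n".join(lines)


rendered = epilog_sketch(
    """
    zsgdp parse --input ./a --output ./b

    zsgdp parse --input ./c --output ./d
    """
)
assert rendered.splitlines()[0] == "Examples:"
assert rendered.splitlines()[1].startswith("  zsgdp parse")
assert "\n\n" in rendered  # blank separator survives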
tests/test_conflict_detection.py
ADDED
@@ -0,0 +1,89 @@
import tempfile
import unittest
from pathlib import Path

from zsgdp.export import export_parsed_document
from zsgdp.merge.conflict_detection import build_candidate_conflict_report, detect_candidate_conflicts
from zsgdp.merge.merge_candidates import merge_candidates
from zsgdp.schema import DocumentProfile, Element, PageProfile, ParseCandidate, TableObject


class ConflictDetectionTests(unittest.TestCase):
    def test_conflict_report_flags_reading_order_and_table_structure(self):
        candidates = [_candidate("docling", ["Alpha", "Beta", "Gamma"], 3), _candidate("pymupdf", ["Gamma", "Beta", "Alpha"], 2)]

        report = build_candidate_conflict_report(candidates)
        issues = detect_candidate_conflicts(candidates)

        conflict_types = {conflict["type"] for conflict in report["conflicts"]}
        self.assertIn("reading_order_disagreement", conflict_types)
        self.assertIn("table_structure_disagreement", conflict_types)
        self.assertTrue(issues)
        self.assertTrue(all(issue.issue_type == "parser_disagreement" for issue in issues))

    def test_merge_stores_and_exports_conflict_report(self):
        profile = DocumentProfile(
            doc_id="d1",
            source_path="sample.pdf",
            file_type="pdf",
            page_count=1,
            extension=".pdf",
            pages=[PageProfile(page_num=1, digital_text_chars=30)],
        )
        parsed = merge_candidates(
            [_candidate("docling", ["Alpha", "Beta", "Gamma"], 3), _candidate("pymupdf", ["Gamma", "Beta", "Alpha"], 2)],
            profile,
        )

        with tempfile.TemporaryDirectory() as tmp:
            output_dir = Path(tmp) / "out"
            export_parsed_document(parsed, output_dir)

            self.assertTrue((output_dir / "conflict_report.json").exists())

        self.assertIn("conflict_report", parsed.provenance)
        self.assertGreater(parsed.provenance["conflict_report"]["conflict_count"], 0)


def _candidate(parser_name: str, ordered_text: list[str], table_columns: int) -> ParseCandidate:
    elements = [
        Element(
            element_id=f"{parser_name}_e{index}",
            doc_id="d1",
            page_num=1,
            type="paragraph",
            text=text,
            reading_order=index,
            confidence=0.8,
            source_parser=parser_name,
        )
        for index, text in enumerate(ordered_text, start=1)
    ]
    return ParseCandidate(
        parser_name=parser_name,
        doc_id="d1",
        source_path="sample.pdf",
        file_type="pdf",
        pages=[{"page_num": 1, "source_parser": parser_name}],
        elements=elements,
        tables=[
            TableObject(
                table_id=f"{parser_name}_t1",
                page_nums=[1],
                markdown=_table_markdown(table_columns),
                confidence=0.8,
                source_parser=parser_name,
            )
        ],
        confidence=0.8,
    )


def _table_markdown(columns: int) -> str:
    if columns == 3:
        return "| Region | Q1 | Q2 |\n| --- | --- | --- |\n| NA | 10 | 12 |"
    return "| Region | Q1 |\n| --- | --- |\n| NA | 10 |"


if __name__ == "__main__":
    unittest.main()
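The table-structure conflict in these tests boils down to candidates disagreeing on column count. One plausible way to detect that from each candidate's markdown header row is sketched below; column_count and detect_table_disagreement are illustrative helpers, not the project's API.

# Sketch: count header cells between pipes and flag any mismatch across
# candidates as a structural disagreement.
def column_count(table_markdown: str) -> int:
    header = table_markdown.splitlines()[0]
    return len([cell for cell in header.split("|") if cell.strip()])


def detect_table_disagreement(markdowns: list[str]) -> bool:
    return len({column_count(markdown) for markdown in markdowns}) > 1


assert column_count("| Region | Q1 | Q2 |\n| --- | --- | --- |\n| NA | 10 | 12 |") == 3
assert detect_table_disagreement(
    [
        "| Region | Q1 | Q2 |\n| --- | --- | --- |\n| NA | 10 | 12 |",
        "| Region | Q1 |\n| --- | --- |\n| NA | 10 |",
    ]
)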
tests/test_cross_dataset.py
ADDED
@@ -0,0 +1,123 @@
"""Tests for cross-dataset benchmark comparison."""

from __future__ import annotations

import json
import tempfile
import unittest
from pathlib import Path

from zsgdp.benchmarks.cross_dataset import (
    combine_benchmark_summaries,
    write_cross_dataset_outputs,
)


def _summary(dataset_name: str, *, layout_f1: float, leaderboard: list[dict] | None = None) -> dict:
    return {
        "dataset_name": dataset_name,
        "dataset_root": f"/tmp/{dataset_name}",
        "document_count": 5,
        "mean_quality_score": 0.9,
        "mean_layout_f1": layout_f1,
        "mean_retrieval_recall_at_5": 0.7,
        "mean_table_structure_score": 0.6,
        "mean_formula_cer": 0.2,
        "per_parser_gt_leaderboard": leaderboard or [],
    }


class TestCombineBenchmarkSummaries(unittest.TestCase):
    def test_two_runs_produce_two_rows(self):
        runs = [
            ("docs_a", _summary("docs_a", layout_f1=0.5)),
            ("docs_b", _summary("docs_b", layout_f1=0.8)),
        ]
        comparison = combine_benchmark_summaries(runs)
        self.assertEqual(comparison["run_count"], 2)
        self.assertEqual(comparison["labels"], ["docs_a", "docs_b"])
        self.assertEqual([row["label"] for row in comparison["dataset_summary"]], ["docs_a", "docs_b"])
        layouts = {row["label"]: row["mean_layout_f1"] for row in comparison["dataset_summary"]}
        self.assertEqual(layouts, {"docs_a": 0.5, "docs_b": 0.8})

    def test_parser_matrix_aligns_parsers_across_runs(self):
        leaderboard_a = [
            {"parser": "docling", "mean_layout_class_aware_f1": 0.9, "document_count": 3},
            {"parser": "pymupdf", "mean_layout_class_aware_f1": 0.4, "document_count": 3},
        ]
        leaderboard_b = [
            {"parser": "docling", "mean_layout_class_aware_f1": 0.7, "document_count": 5},
            # marker only appears in run B.
            {"parser": "marker", "mean_layout_class_aware_f1": 0.6, "document_count": 5},
        ]
        runs = [
            ("a", _summary("a", layout_f1=0.5, leaderboard=leaderboard_a)),
            ("b", _summary("b", layout_f1=0.7, leaderboard=leaderboard_b)),
        ]
        comparison = combine_benchmark_summaries(runs)

        matrix = comparison["parser_matrix"]
        parsers = sorted(row["parser"] for row in matrix)
        self.assertEqual(parsers, ["docling", "marker", "pymupdf"])

        by_parser = {row["parser"]: row for row in matrix}
        # Docling appears in both runs.
        self.assertEqual(by_parser["docling"]["a__mean_layout_class_aware_f1"], 0.9)
        self.assertEqual(by_parser["docling"]["b__mean_layout_class_aware_f1"], 0.7)
        # Marker missing in run A -> None, present in B.
        self.assertIsNone(by_parser["marker"]["a__mean_layout_class_aware_f1"])
        self.assertEqual(by_parser["marker"]["b__mean_layout_class_aware_f1"], 0.6)
        # PyMuPDF missing in run B -> None.
        self.assertIsNone(by_parser["pymupdf"]["b__mean_layout_class_aware_f1"])

    def test_duplicate_labels_raise(self):
        with self.assertRaises(ValueError):
            combine_benchmark_summaries(
                [
                    ("same", _summary("a", layout_f1=0.5)),
                    ("same", _summary("b", layout_f1=0.7)),
                ]
            )

    def test_summary_loaded_from_path(self):
        with tempfile.TemporaryDirectory() as tmp:
            tmp = Path(tmp)
            (tmp / "results.json").write_text(json.dumps(_summary("from_path", layout_f1=0.42)))

            comparison = combine_benchmark_summaries([("a", tmp)])
            self.assertEqual(comparison["dataset_summary"][0]["mean_layout_f1"], 0.42)

    def test_missing_metric_yields_none_not_zero(self):
        # A summary missing mean_formula_cer (e.g. from older code) preserves None.
        sparse_summary = {"dataset_name": "old_run", "document_count": 1}
        comparison = combine_benchmark_summaries([("old", sparse_summary)])
        row = comparison["dataset_summary"][0]
        self.assertEqual(row["document_count"], 1)
        self.assertIsNone(row["mean_layout_f1"])
        self.assertIsNone(row["mean_formula_cer"])


class TestWriteCrossDatasetOutputs(unittest.TestCase):
    def test_writes_json_and_csvs(self):
        leaderboard = [{"parser": "docling", "mean_layout_class_aware_f1": 0.9, "document_count": 3}]
        comparison = combine_benchmark_summaries(
            [("a", _summary("a", layout_f1=0.5, leaderboard=leaderboard))]
        )
        with tempfile.TemporaryDirectory() as tmp:
            tmp = Path(tmp)
            write_cross_dataset_outputs(comparison, tmp)

            self.assertTrue((tmp / "cross_dataset_comparison.json").exists())
            self.assertTrue((tmp / "dataset_summary.csv").exists())
            self.assertTrue((tmp / "parser_matrix.csv").exists())

            ds_csv = (tmp / "dataset_summary.csv").read_text()
            self.assertIn("mean_layout_f1", ds_csv.splitlines()[0])
            self.assertIn("a", ds_csv.splitlines()[1])

            matrix_csv = (tmp / "parser_matrix.csv").read_text()
            self.assertIn("a__mean_layout_class_aware_f1", matrix_csv.splitlines()[0])


if __name__ == "__main__":
    unittest.main()
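The parser-matrix tests fix two behaviours: rows are keyed by the union of parsers across runs, and a run that lacks a parser contributes None rather than 0. Below is a self-contained sketch of that alignment; align_parser_matrix is a hypothetical name, not the cross_dataset module's function.

# Sketch: build one row per parser, with a label-prefixed column per run and
# None wherever a run's leaderboard has no entry for that parser.
def align_parser_matrix(runs: list[tuple[str, list[dict]]]) -> list[dict]:
    parsers = sorted({row["parser"] for _, leaderboard in runs for row in leaderboard})
    matrix = []
    for parser in parsers:
        entry: dict = {"parser": parser}
        for label, leaderboard in runs:
            by_parser = {row["parser"]: row for row in leaderboard}
            entry[f"{label}__mean_layout_class_aware_f1"] = by_parser.get(parser, {}).get(
                "mean_layout_class_aware_f1"
            )
        matrix.append(entry)
    return matrix


matrix = align_parser_matrix(
    [
        ("a", [{"parser": "docling", "mean_layout_class_aware_f1": 0.9}]),
        ("b", [{"parser": "marker", "mean_layout_class_aware_f1": 0.6}]),
    ]
)
assert matrix[1]["a__mean_layout_class_aware_f1"] is None  # marker absent from run A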
tests/test_datasets.py
ADDED
@@ -0,0 +1,152 @@
"""Dataset loader tests."""

from __future__ import annotations

import json
import tempfile
import unittest
from pathlib import Path

from zsgdp.benchmarks.datasets import (
    DatasetDocument,
    get_dataset_loader,
    iter_dataset,
    list_dataset_loaders,
    register_dataset_loader,
)


class TestDatasetRegistry(unittest.TestCase):
    def test_built_in_loaders_registered(self):
        loaders = list_dataset_loaders()
        self.assertIn("custom_folder", loaders)
        self.assertIn("omnidocbench", loaders)
        self.assertIn("doclaynet", loaders)

    def test_custom_alias_resolves_to_custom_folder(self):
        loader_default = get_dataset_loader("default")
        loader_alias = get_dataset_loader("custom")
        loader_canonical = get_dataset_loader("custom_folder")
        self.assertIs(loader_default, loader_canonical)
        self.assertIs(loader_alias, loader_canonical)

    def test_unknown_loader_raises(self):
        with self.assertRaises(KeyError):
            get_dataset_loader("not_a_real_dataset")


class TestCustomFolderLoader(unittest.TestCase):
    def test_yields_files_with_no_ground_truth(self):
        with tempfile.TemporaryDirectory() as tmp:
            root = Path(tmp)
            (root / "a.md").write_text("# A\n")
            (root / "b.md").write_text("# B\n")
            (root / "subdir").mkdir()
            (root / "subdir" / "ignored.md").write_text("# nope\n")

            documents = list(iter_dataset("custom_folder", root))

            ids = sorted(document.doc_id for document in documents)
            self.assertEqual(ids, ["a", "b"])
            for document in documents:
                self.assertIsNone(document.ground_truth)
                self.assertEqual(document.dataset_id, "custom_folder")
                self.assertTrue(document.path.exists())

    def test_missing_root_raises(self):
        with self.assertRaises(FileNotFoundError):
            list(iter_dataset("custom_folder", "/tmp/this-path-should-not-exist-zsgdp"))


class TestOmniDocBenchLoader(unittest.TestCase):
    def test_pairs_pdf_with_sibling_json(self):
        with tempfile.TemporaryDirectory() as tmp:
            root = Path(tmp)
            (root / "doc1.pdf").write_bytes(b"%PDF-1.4\n%%EOF\n")
            (root / "doc1.json").write_text(json.dumps({"reading_order": ["e1", "e2"]}))
            (root / "doc2.pdf").write_bytes(b"%PDF-1.4\n%%EOF\n")  # no GT

            documents = list(iter_dataset("omnidocbench", root))

            by_id = {document.doc_id: document for document in documents}
            self.assertEqual(set(by_id), {"doc1", "doc2"})

            self.assertIsNotNone(by_id["doc1"].ground_truth)
            self.assertEqual(by_id["doc1"].ground_truth["reading_order"], ["e1", "e2"])
            self.assertTrue(by_id["doc1"].metadata["has_ground_truth"])

            self.assertIsNone(by_id["doc2"].ground_truth)
            self.assertFalse(by_id["doc2"].metadata["has_ground_truth"])

    def test_no_pdfs_raises(self):
        with tempfile.TemporaryDirectory() as tmp:
            with self.assertRaises(FileNotFoundError):
                list(iter_dataset("omnidocbench", tmp))


class TestDocLayNetLoader(unittest.TestCase):
    def test_yields_one_document_per_image_with_filtered_annotations(self):
        with tempfile.TemporaryDirectory() as tmp:
            root = Path(tmp)
            (root / "page1.png").write_bytes(b"\x89PNG\r\n\x1a\n")
            (root / "page2.png").write_bytes(b"\x89PNG\r\n\x1a\n")
            (root / "annotations.json").write_text(
                json.dumps(
                    {
                        "images": [
                            {"id": 1, "file_name": "page1.png", "width": 800, "height": 1100},
                            {"id": 2, "file_name": "page2.png", "width": 800, "height": 1100},
                        ],
                        "annotations": [
                            {"id": 10, "image_id": 1, "category_id": 1, "bbox": [0, 0, 100, 50]},
                            {"id": 11, "image_id": 1, "category_id": 2, "bbox": [0, 60, 100, 50]},
                            {"id": 12, "image_id": 2, "category_id": 1, "bbox": [0, 0, 100, 50]},
                        ],
                        "categories": [
                            {"id": 1, "name": "Title"},
                            {"id": 2, "name": "Text"},
                        ],
                    }
                )
            )

            documents = list(iter_dataset("doclaynet", root))

            by_id = {document.doc_id: document for document in documents}
            self.assertEqual(set(by_id), {"page1.png", "page2.png"})

            self.assertEqual(len(by_id["page1.png"].ground_truth["annotations"]), 2)
            self.assertEqual(len(by_id["page2.png"].ground_truth["annotations"]), 1)
            self.assertEqual(by_id["page1.png"].ground_truth["categories"][1]["name"], "Title")

    def test_missing_annotations_raises(self):
        with tempfile.TemporaryDirectory() as tmp:
            root = Path(tmp)
            (root / "page1.png").write_bytes(b"\x89PNG\r\n\x1a\n")
            with self.assertRaises(FileNotFoundError):
                list(iter_dataset("doclaynet", root))


class TestRegisterDatasetLoader(unittest.TestCase):
    def test_register_and_use_custom_loader(self):
        marker = []

        def fake_loader(root: Path):
            marker.append(root)
            yield DatasetDocument(dataset_id="fake", doc_id="x", path=root)

        register_dataset_loader("zsgdp_test_fake", fake_loader)
        try:
            documents = list(iter_dataset("zsgdp_test_fake", Path("/tmp/whatever")))
        finally:
            from zsgdp.benchmarks.datasets import _LOADERS

            _LOADERS.pop("zsgdp_test_fake", None)

        self.assertEqual(len(documents), 1)
        self.assertEqual(documents[0].dataset_id, "fake")
        self.assertEqual(marker, [Path("/tmp/whatever")])


if __name__ == "__main__":
    unittest.main()
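All of these loaders share one contract: a callable that takes a dataset root and yields one document record per input, raising FileNotFoundError when the root is unusable. Here is a sketch of an OmniDocBench-style PDF-plus-sibling-JSON loader under that contract; PairedDoc stands in for zsgdp's DatasetDocument and is an assumption of this sketch.

# Sketch of the loader contract: pair each *.pdf with a same-stem *.json
# ground-truth file when one exists, otherwise yield ground_truth=None.
import json
from dataclasses import dataclass
from pathlib import Path
from typing import Iterator, Optional


@dataclass
class PairedDoc:
    doc_id: str
    path: Path
    ground_truth: Optional[dict]


def pdf_with_sibling_json(root: Path) -> Iterator[PairedDoc]:
    pdfs = sorted(root.glob("*.pdf"))
    if not pdfs:
        raise FileNotFoundError(f"no PDFs under {root}")
    for pdf in pdfs:
        sibling = pdf.with_suffix(".json")
        gt = json.loads(sibling.read_text()) if sibling.exists() else None
        yield PairedDoc(doc_id=pdf.stem, path=pdf, ground_truth=gt)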
tests/test_deployment.py
ADDED
@@ -0,0 +1,43 @@
import json
import tempfile
import unittest
from pathlib import Path

from zsgdp.cli import main
from zsgdp.deployment import check_huggingface_space


class DeploymentReadinessTests(unittest.TestCase):
    def test_space_check_accepts_current_project(self):
        report = check_huggingface_space(Path.cwd())

        self.assertTrue(report["valid"])
        self.assertEqual(report["target"], "huggingface_spaces")
        self.assertEqual(report["space_name"], "zeroshotGPU")
        self.assertEqual(report["gpu_models_target"], "zeroshotGPU")
        self.assertEqual(report["failure_count"], 0)
        self.assertTrue(any(check["status"] == "warn" for check in report["checks"]))

    def test_space_check_cli_writes_report(self):
        with tempfile.TemporaryDirectory() as tmp:
            output_path = Path(tmp) / "space_report.json"

            code = main(["space-check", "--root", str(Path.cwd()), "--output", str(output_path)])

            self.assertEqual(code, 0)
            self.assertTrue(output_path.exists())
            self.assertTrue(json.loads(output_path.read_text(encoding="utf-8"))["valid"])

    def test_space_check_reports_missing_files(self):
        with tempfile.TemporaryDirectory() as tmp:
            root = Path(tmp)

            report = check_huggingface_space(root)

            self.assertFalse(report["valid"])
            self.assertGreater(report["failure_count"], 0)
            self.assertTrue(any(check["id"] == "required_file" and check["status"] == "fail" for check in report["checks"]))


if __name__ == "__main__":
    unittest.main()
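The readiness report read by these tests exposes checks, failure_count, and valid, with missing files surfacing as required_file failures. An illustrative stdlib-only sketch of that shape follows; the REQUIRED_FILES set and the helper name are assumptions, not the project's actual check list.

# Sketch: each required file becomes one check entry; any "fail" flips valid.
from pathlib import Path

REQUIRED_FILES = ["app.py", "requirements.txt", "README.md"]  # assumed set


def required_file_checks(root: Path) -> dict:
    checks = [
        {"id": "required_file", "path": name, "status": "pass" if (root / name).exists() else "fail"}
        for name in REQUIRED_FILES
    ]
    failures = sum(1 for check in checks if check["status"] == "fail")
    return {"checks": checks, "failure_count": failures, "valid": failures == 0}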
tests/test_docling_parser.py
ADDED
@@ -0,0 +1,39 @@
import unittest

from zsgdp.parsers.docling_parser import _export_markdown, normalize_docling_markdown
from zsgdp.schema import DocumentProfile, PageProfile


class FakeDoclingDocument:
    def export_to_markdown(self):
        return "# Report\n\n| A | B |\n| --- | --- |\n| 1 | 2 |"


class DoclingParserTests(unittest.TestCase):
    def test_export_markdown_uses_docling_method(self):
        self.assertEqual(_export_markdown(FakeDoclingDocument()), "# Report\n\n| A | B |\n| --- | --- |\n| 1 | 2 |")

    def test_normalize_docling_markdown_emits_schema(self):
        profile = DocumentProfile(
            doc_id="d1",
            source_path="sample.pdf",
            file_type="pdf",
            page_count=1,
            extension=".pdf",
            pages=[PageProfile(page_num=1, digital_text_chars=20)],
        )

        candidate = normalize_docling_markdown(
            markdown="# Report\n\n| A | B |\n| --- | --- |\n| 1 | 2 |",
            profile=profile,
            source_path="sample.pdf",
        )

        self.assertEqual(candidate.parser_name, "docling")
        self.assertEqual(len(candidate.elements), 2)
        self.assertEqual(len(candidate.tables), 1)
        self.assertEqual(candidate.pages[0]["source_parser"], "docling")


if __name__ == "__main__":
    unittest.main()
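Why does one heading plus one pipe table normalize to two elements and one table object? A toy sketch of the block-splitting idea, with split_markdown_blocks as a hypothetical helper rather than the docling_parser internals:

# Sketch: split exported markdown on blank lines; blocks that start with "|"
# are tables, everything else is a text element. One title block plus one
# table block gives two elements overall, one of which is also a table.
def split_markdown_blocks(markdown: str) -> tuple[list[str], list[str]]:
    texts, tables = [], []
    for block in [b.strip() for b in markdown.split("\n\n") if b.strip()]:
        (tables if block.startswith("|") else texts).append(block)
    return texts, tables


texts, tables = split_markdown_blocks("# Report\n\n| A | B |\n| --- | --- |\n| 1 | 2 |")
assert texts == ["# Report"] and len(tables) == 1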
tests/test_embedding_retriever.py
ADDED
@@ -0,0 +1,190 @@
"""Tests for the embedding-based retriever and the build_retriever factory."""

from __future__ import annotations

import math
import tempfile
import unittest
from pathlib import Path
from unittest.mock import patch

from zsgdp.benchmarks.embedding_retriever import (
    EmbeddingRetriever,
    build_retriever,
)
from zsgdp.benchmarks.parser_quality import run_parser_benchmark
from zsgdp.benchmarks.retrieval import LexicalRetriever, run_retrieval_for_document
from zsgdp.schema import Chunk, ParsedDocument, QualityReport


def _chunk(chunk_id: str, text: str) -> Chunk:
    return Chunk(
        chunk_id=chunk_id,
        doc_id="d1",
        page_start=1,
        page_end=1,
        section_path=[],
        content_type="prose",
        text=text,
        token_count=len(text.split()),
    )


def _hashing_embedder(dim: int = 32):
    """Deterministic toy embedder: tokens hashed into a fixed-dim vector.

    Uses a process-stable hash (hashlib.md5) instead of builtins.hash(), which
    is randomized per Python process and would make ranking non-deterministic
    across test runs.
    """

    import hashlib

    def stable_hash(token: str) -> int:
        return int.from_bytes(hashlib.md5(token.encode("utf-8")).digest()[:8], "big")

    def encode(texts):
        out = []
        for text in texts:
            vector = [0.0] * dim
            for token in text.lower().split():
                vector[stable_hash(token) % dim] += 1.0
            out.append(vector)
        return out

    return encode


class TestEmbeddingRetriever(unittest.TestCase):
    def test_finds_distinctive_chunk_with_injected_embedder(self):
        chunks = [
            _chunk("c1", "Apples grow on trees in the orchard."),
            _chunk("c2", "Cars drive on highways across the country."),
            _chunk("c3", "Boats sail on rivers and oceans."),
        ]
        retriever = EmbeddingRetriever(embedder=_hashing_embedder())
        retriever.index(chunks)

        ranking = retriever.query("apples orchard", top_k=3)
        self.assertEqual(ranking[0], "c1")

    def test_empty_index_returns_empty(self):
        retriever = EmbeddingRetriever(embedder=_hashing_embedder())
        self.assertEqual(retriever.query("anything", top_k=3), [])

    def test_zero_norm_vector_skipped(self):
        retriever = EmbeddingRetriever(embedder=lambda texts: [[0.0, 0.0, 0.0]] * len(texts))
        retriever.index([_chunk("c1", "anything")])
        # Query embedder also returns zero vector, normalization fails -> empty.
        self.assertEqual(retriever.query("anything", top_k=3), [])

    def test_embedder_returning_wrong_count_raises(self):
        bad = lambda texts: [[1.0]]  # always returns one vector
        retriever = EmbeddingRetriever(embedder=bad)
        with self.assertRaises(RuntimeError):
            retriever.index([_chunk("c1", "a"), _chunk("c2", "b")])

    def test_lazy_load_path_raises_if_sentence_transformers_missing(self):
        retriever = EmbeddingRetriever(model_id="fake/model")
        # Force the import to fail by patching builtins.__import__.
        import builtins

        real_import = builtins.__import__

        def fake_import(name, *args, **kwargs):
            if name == "sentence_transformers":
                raise ImportError("not installed")
            return real_import(name, *args, **kwargs)

        with patch("builtins.__import__", side_effect=fake_import):
            with self.assertRaises(RuntimeError) as ctx:
                retriever.index([_chunk("c1", "anything")])
        self.assertIn("sentence-transformers", str(ctx.exception))


class TestBuildRetriever(unittest.TestCase):
    def test_default_returns_lexical(self):
        retriever = build_retriever({})
        self.assertIsInstance(retriever, LexicalRetriever)

    def test_explicit_lexical_backend(self):
        retriever = build_retriever({"benchmarks": {"retriever": {"backend": "lexical"}}})
        self.assertIsInstance(retriever, LexicalRetriever)

    def test_embedding_backend_uses_gpu_models_embedding_default(self):
        config = {
            "benchmarks": {"retriever": {"backend": "embedding"}},
            "gpu": {"models": {"embedding": {"model_id": "custom/model", "task": "retrieval.query"}}},
        }
        retriever = build_retriever(config)
        self.assertIsInstance(retriever, EmbeddingRetriever)
        self.assertEqual(retriever._model_id, "custom/model")
        self.assertEqual(retriever._task, "retrieval.query")

    def test_explicit_model_id_overrides_gpu_default(self):
        config = {
            "benchmarks": {"retriever": {"backend": "embedding", "model_id": "explicit/model"}},
            "gpu": {"models": {"embedding": {"model_id": "ignored/model"}}},
        }
        retriever = build_retriever(config)
        self.assertEqual(retriever._model_id, "explicit/model")

    def test_unknown_backend_raises(self):
        with self.assertRaises(ValueError):
            build_retriever({"benchmarks": {"retriever": {"backend": "magic"}}})


class TestRunRetrievalWithEmbedding(unittest.TestCase):
    def test_run_retrieval_for_document_accepts_embedding_retriever(self):
        parsed = ParsedDocument(
            doc_id="d1",
            source_path="/tmp/d1.md",
            file_type="markdown",
            chunks=[
                _chunk("c1", "Apples grow on trees in the orchard during autumn."),
                _chunk("c2", "Submarines navigate beneath the ocean using sonar."),
            ],
            quality_report=QualityReport(),
        )
        retriever = EmbeddingRetriever(embedder=_hashing_embedder())
        run = run_retrieval_for_document(parsed, retriever=retriever)
        self.assertTrue(run["evaluated"])
        self.assertGreater(run["query_count"], 0)
        for result in run["results"]:
            truth = result["truths"][0]
            self.assertEqual(result["retrieved"][0], truth)


class TestBenchmarkOptInToEmbeddingBackend(unittest.TestCase):
    def test_benchmark_uses_embedding_when_config_says_so(self):
        # Patch build_retriever to return an EmbeddingRetriever with our toy embedder
        # so the benchmark exercises the opt-in code path without loading a real model.
        toy = EmbeddingRetriever(embedder=_hashing_embedder())

        with tempfile.TemporaryDirectory() as tmp:
            tmp = Path(tmp)
            src = tmp / "in"
            src.mkdir()
            (src / "doc.md").write_text(
                "# Doc\n\n"
                "Apples grow on trees in the orchard during autumn season.\n\n"
                "Submarines navigate beneath the ocean using sonar pulses across waters.\n",
                encoding="utf-8",
            )

            with patch("zsgdp.benchmarks.parser_quality.load_config") as load_config:
                load_config.return_value = {
                    "benchmarks": {"retriever": {"backend": "embedding"}},
                }
                with patch(
                    "zsgdp.benchmarks.embedding_retriever.build_retriever",
                    return_value=toy,
                ) as build_call:
                    summary = run_parser_benchmark(src, tmp / "out", dataset_name="custom_folder")

            self.assertGreaterEqual(build_call.call_count, 1)
            self.assertTrue(summary["documents"][0]["retrieval_evaluated"])


if __name__ == "__main__":
    unittest.main()
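Under the hood, a retriever of this shape ranks indexed chunks by cosine similarity and skips any vector it cannot normalize, which is exactly what the zero-norm test exercises. A minimal sketch of that ranking step, with cosine_rank as an illustrative helper rather than zsgdp's implementation:

# Sketch: cosine similarity over pre-embedded chunks; a zero-norm query or
# chunk vector contributes nothing, mirroring the tests above.
import math


def cosine_rank(query_vec: list[float], indexed: dict[str, list[float]], top_k: int) -> list[str]:
    def norm(vector: list[float]) -> float:
        return math.sqrt(sum(value * value for value in vector))

    query_norm = norm(query_vec)
    if query_norm == 0.0:
        return []  # an unembeddable query ranks nothing
    scores = {}
    for chunk_id, vector in indexed.items():
        vector_norm = norm(vector)
        if vector_norm == 0.0:
            continue  # skip degenerate chunk vectors
        dot = sum(a * b for a, b in zip(query_vec, vector))
        scores[chunk_id] = dot / (query_norm * vector_norm)
    return sorted(scores, key=scores.get, reverse=True)[:top_k]


assert cosine_rank([1.0, 0.0], {"c1": [1.0, 0.0], "c2": [0.0, 1.0]}, top_k=1) == ["c1"]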
tests/test_env_loading.py
ADDED
@@ -0,0 +1,110 @@
"""Tests for .env loading and HF_TOKEN resolution."""

from __future__ import annotations

import os
import tempfile
import unittest
from pathlib import Path
from unittest.mock import patch

from zsgdp.config import hf_token, load_env_file


class LoadEnvFileTests(unittest.TestCase):
    def test_loads_simple_key_value(self):
        with tempfile.TemporaryDirectory() as tmp:
            env = Path(tmp) / ".env"
            env.write_text("HF_TOKEN=hf_test_value_123\nOTHER=foo\n", encoding="utf-8")

            with patch.dict("os.environ", {}, clear=False):
                os.environ.pop("HF_TOKEN", None)
                os.environ.pop("OTHER", None)
                applied = load_env_file(env)

                self.assertEqual(applied["HF_TOKEN"], "hf_test_value_123")
                self.assertEqual(applied["OTHER"], "foo")

    def test_skips_comments_and_blank_lines(self):
        with tempfile.TemporaryDirectory() as tmp:
            env = Path(tmp) / ".env"
            env.write_text(
                "# top comment\n\nFOO=bar\n # indented\n\nBAZ=qux\n",
                encoding="utf-8",
            )
            with patch.dict("os.environ", {}, clear=False):
                os.environ.pop("FOO", None)
                os.environ.pop("BAZ", None)
                applied = load_env_file(env)

                self.assertEqual(set(applied), {"FOO", "BAZ"})

    def test_quoted_values_unquoted(self):
        with tempfile.TemporaryDirectory() as tmp:
            env = Path(tmp) / ".env"
            env.write_text('A="quoted value"\nB=\'single\'\nC=plain\n', encoding="utf-8")
            with patch.dict("os.environ", {}, clear=False):
                for key in ("A", "B", "C"):
                    os.environ.pop(key, None)
                applied = load_env_file(env)

                self.assertEqual(applied["A"], "quoted value")
                self.assertEqual(applied["B"], "single")
                self.assertEqual(applied["C"], "plain")

    def test_export_prefix_stripped(self):
        with tempfile.TemporaryDirectory() as tmp:
            env = Path(tmp) / ".env"
            env.write_text("export FOO=bar\n", encoding="utf-8")
            with patch.dict("os.environ", {}, clear=False):
                os.environ.pop("FOO", None)
                applied = load_env_file(env)

                self.assertEqual(applied["FOO"], "bar")

    def test_existing_env_wins_unless_override(self):
        with tempfile.TemporaryDirectory() as tmp:
            env = Path(tmp) / ".env"
            env.write_text("FOO=from_file\n", encoding="utf-8")

            with patch.dict("os.environ", {"FOO": "from_env"}, clear=False):
                applied = load_env_file(env)
                # Default behaviour: don't override.
                self.assertNotIn("FOO", applied)
                self.assertEqual(os.environ["FOO"], "from_env")

                # With override=True, file wins.
                applied = load_env_file(env, override=True)
                self.assertEqual(applied["FOO"], "from_file")
                self.assertEqual(os.environ["FOO"], "from_file")

    def test_missing_file_returns_empty_no_error(self):
        self.assertEqual(load_env_file(Path("/tmp/zsgdp_does_not_exist.env")), {})


class HFTokenResolverTests(unittest.TestCase):
    def test_picks_up_hf_token(self):
        with patch.dict(
            "os.environ",
            {"HF_TOKEN": "primary", "HUGGING_FACE_HUB_TOKEN": "secondary"},
            clear=False,
        ):
            self.assertEqual(hf_token(), "primary")

    def test_falls_through_alternative_names(self):
        with patch.dict("os.environ", {}, clear=True):
            os.environ["HUGGINGFACE_TOKEN"] = "fallback"
            self.assertEqual(hf_token(), "fallback")

    def test_recognises_hf_access_token_alias(self):
        with patch.dict("os.environ", {}, clear=True):
            os.environ["HF_ACCESS_TOKEN"] = "from_alias"
            self.assertEqual(hf_token(), "from_alias")

    def test_returns_none_when_unset(self):
        with patch.dict("os.environ", {}, clear=True):
            self.assertIsNone(hf_token())


if __name__ == "__main__":
    unittest.main()
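The parsing rules these tests pin down (skip comments and blanks, strip an optional export prefix, unquote values) fit in a few lines. A hedged sketch follows, with parse_env_line as a hypothetical helper; zsgdp.config.load_env_file may handle more cases.

# Sketch of per-line .env parsing consistent with the test expectations.
def parse_env_line(line: str) -> tuple[str, str] | None:
    stripped = line.strip()
    if not stripped or stripped.startswith("#") or "=" not in stripped:
        return None  # blank line, comment, or malformed entry
    if stripped.startswith("export "):
        stripped = stripped[len("export "):]
    key, _, value = stripped.partition("=")
    value = value.strip()
    if len(value) >= 2 and value[0] == value[-1] and value[0] in {"'", '"'}:
        value = value[1:-1]  # drop matching single or double quotes
    return key.strip(), value


assert parse_env_line("export FOO=bar") == ("FOO", "bar")
assert parse_env_line('A="quoted value"') == ("A", "quoted value")
assert parse_env_line(" # indented comment") is None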
tests/test_external_parser_adapters.py
ADDED
@@ -0,0 +1,69 @@
import unittest
from unittest.mock import patch

from zsgdp.config import load_config
from zsgdp.normalize.normalize_unstructured import normalize_unstructured_parts
from zsgdp.parsers.external import MinerUParser, OlmOCRParser, PaddleOCRParser
from zsgdp.schema import DocumentProfile, PageProfile


class ExternalParserAdapterTests(unittest.TestCase):
    def test_command_backed_parsers_normalize_markdown(self):
        cases = [
            (MinerUParser, "mineru"),
            (OlmOCRParser, "olmocr"),
            (PaddleOCRParser, "paddleocr"),
        ]
        profile = _profile()

        for parser_class, parser_name in cases:
            with self.subTest(parser=parser_name), patch.object(parser_class, "available", return_value=True), patch(
                "zsgdp.parsers.external.run_external_parser_to_markdown",
                return_value="# Report\n\n| A | B |\n| --- | --- |\n| 1 | 2 |",
            ):
                candidate = parser_class().parse("sample.pdf", profile, load_config())

            self.assertEqual(candidate.parser_name, parser_name)
            self.assertEqual(candidate.elements[0].source_parser, parser_name)
            self.assertEqual(len(candidate.tables), 1)
            self.assertEqual(candidate.provenance["requested_pages"], [1])

    def test_unstructured_normalizer_preserves_page_and_title_metadata(self):
        class Metadata:
            page_number = 2

        class Title:
            category = "Title"
            metadata = Metadata()

            def __str__(self):
                return "Executive Summary"

        class Narrative:
            category = "NarrativeText"
            metadata = Metadata()

            def __str__(self):
                return "The document parser keeps provenance."

        candidate = normalize_unstructured_parts(parts=[Title(), Narrative()], profile=_profile(), source_path="sample.pdf")

        self.assertEqual(candidate.parser_name, "unstructured")
        self.assertEqual(candidate.elements[0].page_num, 2)
        self.assertEqual(candidate.elements[0].type, "title")
        self.assertEqual(candidate.elements[0].markdown, "# Executive Summary")


def _profile():
    return DocumentProfile(
        doc_id="d1",
        source_path="sample.pdf",
        file_type="pdf",
        page_count=1,
        extension=".pdf",
        pages=[PageProfile(page_num=1, digital_text_chars=20)],
    )


if __name__ == "__main__":
    unittest.main()
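The mocked run_external_parser_to_markdown stands in for the command-backed pattern these adapters share: shell out to an external parser CLI, capture markdown on stdout, then hand it to the shared normalizer. A minimal sketch of that step, assuming an illustrative run_tool_to_markdown helper and using echo only as a stand-in command:

# Sketch of the shell-out step. check=True surfaces a non-zero exit as
# CalledProcessError instead of silently normalizing an error message
# as if it were document text.
import subprocess


def run_tool_to_markdown(command: list[str]) -> str:
    completed = subprocess.run(command, capture_output=True, text=True, check=True)
    return completed.stdout


if __name__ == "__main__":
    markdown = run_tool_to_markdown(["echo", "# Report"])
    assert markdown.startswith("# Report")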
tests/test_gpu_runner.py
ADDED
|
@@ -0,0 +1,185 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import json
import tempfile
import unittest
from pathlib import Path
from unittest.mock import patch

from zsgdp.cli import main
from zsgdp.config import load_config
from zsgdp.gpu.batching import batch_gpu_tasks
from zsgdp.gpu.runner import dry_run_gpu_tasks, load_gpu_tasks, run_gpu_task_manifest
from zsgdp.gpu.worker import GPUWorker
from zsgdp.utils import write_jsonl


class GPURunnerTests(unittest.TestCase):
    def test_batch_gpu_tasks_groups_by_task_type_and_batch_size(self):
        tasks = [
            {"task_id": "a", "task_type": "figure_description", "priority": 1},
            {"task_id": "b", "task_type": "figure_description", "priority": 2},
            {"task_id": "c", "task_type": "table_vlm_repair", "priority": 3},
        ]

        batches = batch_gpu_tasks(tasks, max_batch_size=1)

        self.assertEqual(len(batches), 3)
        self.assertEqual(batches[0]["task_count"], 1)
        self.assertEqual({batch["task_type"] for batch in batches}, {"figure_description", "table_vlm_repair"})

    def test_worker_reports_missing_image_path(self):
        worker = GPUWorker(load_config())

        result = worker.run(
            {
                "task_id": "gt1",
                "task_type": "figure_description",
                "doc_id": "d1",
                "page_nums": [1],
                "image_path": "/tmp/does-not-exist.png",
            }
        )

        self.assertEqual(result["status"], "blocked_missing_inputs")
        self.assertIn("image_path", result["readiness"]["missing_inputs"])

    def test_run_gpu_task_manifest_writes_report(self):
        with tempfile.TemporaryDirectory() as tmp:
            tmp_path = Path(tmp)
            image_path = tmp_path / "figure.png"
            image_path.write_bytes(b"fake")
            tasks_path = tmp_path / "gpu_tasks.jsonl"
            report_path = tmp_path / "report.json"
            write_jsonl(
                tasks_path,
                [
                    {
                        "task_id": "gt1",
                        "task_type": "figure_description",
                        "doc_id": "d1",
                        "page_nums": [1],
                        "image_path": str(image_path),
                        "priority": 60,
                    }
                ],
            )

            report = run_gpu_task_manifest(tmp_path, config=load_config(), output_path=report_path)

            self.assertEqual(report["task_count"], 1)
            self.assertEqual(report["ready_count"], 1)
            self.assertTrue(report_path.exists())
            self.assertEqual(json.loads(report_path.read_text(encoding="utf-8"))["batch_count"], 1)

    def test_dry_run_gpu_tasks_accepts_in_memory_tasks(self):
        with tempfile.TemporaryDirectory() as tmp:
            image_path = Path(tmp) / "figure.png"
            image_path.write_bytes(b"fake")

            report = dry_run_gpu_tasks(
                [
                    {
                        "task_id": "gt1",
                        "task_type": "figure_description",
                        "doc_id": "d1",
                        "page_nums": [1],
                        "image_path": str(image_path),
                        "priority": 60,
                    }
                ],
                config=load_config(),
            )

            self.assertEqual(report["ready_count"], 1)
            self.assertEqual(report["blocked_count"], 0)

    def test_execute_gpu_tasks_dispatches_transformers_client(self):
        with tempfile.TemporaryDirectory() as tmp:
            image_path = Path(tmp) / "figure.png"
            image_path.write_bytes(b"fake")
            task = {
                "task_id": "gt1",
                "task_type": "figure_description",
                "doc_id": "d1",
                "page_nums": [1],
                "image_path": str(image_path),
                "priority": 60,
                "backend": "transformers",
                "model_role": "vlm",
                "model_id": "local-test-model",
            }

            with patch("zsgdp.gpu.worker.TransformersClient") as client_class:
                client_class.return_value.execute_task.return_value = {"status": "executed", "text": "Figure description."}
                report = dry_run_gpu_tasks([task], config=load_config(), dry_run=False)

            self.assertFalse(report["dry_run"])
            self.assertEqual(report["executed_count"], 1)
            self.assertEqual(report["failed_count"], 0)
            self.assertEqual(report["batches"][0]["status"], "execute_complete")
            client_class.return_value.execute_task.assert_called_once()

    def test_load_gpu_tasks_accepts_file_path(self):
        with tempfile.TemporaryDirectory() as tmp:
            tasks_path = Path(tmp) / "tasks.jsonl"
            write_jsonl(tasks_path, [{"task_id": "gt1"}])

            self.assertEqual(load_gpu_tasks(tasks_path)[0]["task_id"], "gt1")

    def test_run_gpu_tasks_cli(self):
        with tempfile.TemporaryDirectory() as tmp:
            tmp_path = Path(tmp)
            tasks_path = tmp_path / "gpu_tasks.jsonl"
            report_path = tmp_path / "report.json"
            write_jsonl(
                tasks_path,
                [
                    {
                        "task_id": "gt1",
                        "task_type": "figure_description",
                        "doc_id": "d1",
                        "page_nums": [1],
                        "image_path": str(tmp_path / "missing.png"),
                        "priority": 60,
                    }
                ],
            )

            code = main(["run-gpu-tasks", "--input", str(tasks_path), "--output", str(report_path)])

            self.assertEqual(code, 0)
            self.assertTrue(report_path.exists())

    def test_run_gpu_tasks_cli_execute(self):
        with tempfile.TemporaryDirectory() as tmp:
            tmp_path = Path(tmp)
            image_path = tmp_path / "figure.png"
            image_path.write_bytes(b"fake")
            tasks_path = tmp_path / "gpu_tasks.jsonl"
            report_path = tmp_path / "report.json"
            write_jsonl(
                tasks_path,
                [
                    {
                        "task_id": "gt1",
                        "task_type": "figure_description",
                        "doc_id": "d1",
                        "page_nums": [1],
                        "image_path": str(image_path),
                        "priority": 60,
                        "backend": "transformers",
                        "model_role": "vlm",
                        "model_id": "local-test-model",
                    }
                ],
            )

            with patch("zsgdp.gpu.worker.TransformersClient") as client_class:
                client_class.return_value.execute_task.return_value = {"status": "executed", "text": "done"}
                code = main(["run-gpu-tasks", "--input", str(tasks_path), "--output", str(report_path), "--execute"])

            self.assertEqual(code, 0)
            self.assertEqual(json.loads(report_path.read_text(encoding="utf-8"))["executed_count"], 1)


if __name__ == "__main__":
    unittest.main()
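These runner tests double as a usage reference. A minimal sketch of a CPU-side readiness check, assuming only the dry_run_gpu_tasks signature and task fields exercised above (the ids and image path are hypothetical):

from zsgdp.config import load_config
from zsgdp.gpu.runner import dry_run_gpu_tasks

# Validate a single figure-description task for missing inputs without
# loading any model; blocked tasks are counted rather than executed.
report = dry_run_gpu_tasks(
    [
        {
            "task_id": "demo1",
            "task_type": "figure_description",
            "doc_id": "demo-doc",
            "page_nums": [1],
            "image_path": "outputs/demo-doc/figure.png",  # hypothetical path
            "priority": 60,
        }
    ],
    config=load_config(),
)
print(report["ready_count"], report["blocked_count"])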
tests/test_gpu_runtime.py
ADDED
@@ -0,0 +1,47 @@
import unittest
from unittest.mock import patch

from zsgdp.config import load_config
from zsgdp.gpu import GPUModelConfig, collect_gpu_runtime_status


class GPURuntimeTests(unittest.TestCase):
    def test_model_config_reads_gpu_section(self):
        config = load_config(overrides={"gpu": {"backend": "vllm", "provider": "huggingface_spaces", "space_name": "zeroshotGPU", "max_batch_size": 8}})

        model_config = GPUModelConfig.from_config(config)

        self.assertEqual(model_config.backend, "vllm")
        self.assertEqual(model_config.provider, "huggingface_spaces")
        self.assertEqual(model_config.space_name, "zeroshotGPU")
        self.assertEqual(model_config.max_batch_size, 8)

    def test_collect_runtime_detects_space_environment(self):
        config = load_config()

        with patch.dict("os.environ", {"SPACE_ID": "user/zeroshotGPU", "SPACE_HARDWARE": "l4x1"}, clear=False):
            status = collect_gpu_runtime_status(config).to_dict()

        self.assertEqual(status["provider"], "huggingface_spaces")
        self.assertEqual(status["space_name"], "zeroshotGPU")
        self.assertEqual(status["gpu_models_target"], "zeroshotGPU")
        self.assertTrue(status["running_on_huggingface_space"])
        self.assertEqual(status["space_id"], "user/zeroshotGPU")
        self.assertEqual(status["hardware"], "l4x1")
        self.assertIn(status["device"], {"cpu", "cuda", "mps"})
        self.assertIn("torch_available", status)
        self.assertEqual(status["configured_models"]["vlm"]["model_id"], "Qwen/Qwen2.5-VL-3B-Instruct")
        self.assertEqual(status["configured_models"]["embedding"]["model_id"], "jinaai/jina-embeddings-v3")

    def test_collect_runtime_reports_local_note(self):
        config = load_config()

        with patch.dict("os.environ", {"SPACE_ID": "", "SPACE_HOST": "", "SPACE_HARDWARE": ""}, clear=False):
            status = collect_gpu_runtime_status(config)

        self.assertFalse(status.running_on_huggingface_space)
        self.assertTrue(any("local run" in note for note in status.notes))


if __name__ == "__main__":
    unittest.main()
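For quick diagnostics, the same status object these tests pin down can be printed locally. A sketch assuming only the collect_gpu_runtime_status call and the dict keys asserted above:

from zsgdp.config import load_config
from zsgdp.gpu import collect_gpu_runtime_status

# Summarise where we are running and which models the config resolves per role.
status = collect_gpu_runtime_status(load_config()).to_dict()
print(status["running_on_huggingface_space"], status["device"])
print(status["configured_models"]["vlm"]["model_id"])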
tests/test_gpu_tasks.py
ADDED
@@ -0,0 +1,99 @@
import unittest

from zsgdp.config import load_config
from zsgdp.gpu import plan_gpu_tasks
from zsgdp.routing import RouteDecision
from zsgdp.routing.budgets import Budget
from zsgdp.schema import DocumentProfile, FigureObject, PageProfile, ParsedDocument, TableObject


class GPUTaskTests(unittest.TestCase):
    def test_plan_gpu_tasks_includes_route_ocr_table_and_figure(self):
        config = load_config(overrides={"chunking": {"vision_guided": True}})
        profile = DocumentProfile(
            doc_id="d1",
            source_path="sample.pdf",
            file_type="pdf",
            page_count=1,
            extension=".pdf",
            pages=[
                PageProfile(page_num=1, scanned_score=0.8, digital_text_chars=0, digital_text_quality=0.0),
            ],
        )
        parsed = ParsedDocument(
            doc_id="d1",
            source_path="sample.pdf",
            file_type="pdf",
            pages=[
                {
                    "page_num": 1,
                    "parser_pages": [
                        {"rendered_page": {"image_path": "/tmp/page.png"}},
                    ],
                }
            ],
        )
        parsed.tables.append(
            TableObject(
                table_id="t1",
                page_nums=[1],
                bbox=[(1.0, 2.0, 3.0, 4.0)],
                markdown="| A | B |\n| --- | --- |\n| 1 | 2 |",
                provenance={"crop_path": "/tmp/table.png"},
            )
        )
        parsed.figures.append(FigureObject(figure_id="f1", page_num=1, image_path="/tmp/figure.png"))
        routes = [
            RouteDecision(
                page_id=1,
                experts=["pymupdf", "vlm_figure_repair"],
                reason="figure-heavy page",
                budget=Budget(),
                labels=["figure_heavy"],
            )
        ]

        tasks = plan_gpu_tasks(profile, parsed, config, routes)

        task_types = [task["task_type"] for task in tasks]
        self.assertIn("vlm_route_repair", task_types)
        self.assertIn("ocr_page", task_types)
        self.assertIn("table_vlm_repair", task_types)
        self.assertIn("figure_description", task_types)
        self.assertEqual(tasks[0]["task_type"], "vlm_route_repair")
        self.assertTrue(all(task["provider"] == "huggingface_spaces" for task in tasks))
        self.assertTrue(all(task["space_name"] == "zeroshotGPU" for task in tasks))
        self.assertTrue(all(task["model_id"] for task in tasks))
        self.assertEqual(_task_by_type(tasks, "ocr_page")["model_role"], "ocr")
        self.assertEqual(_task_by_type(tasks, "table_vlm_repair")["model_role"], "table")
        self.assertEqual(_task_by_type(tasks, "figure_description")["model_role"], "vlm")
        self.assertEqual(_task_by_type(tasks, "figure_description")["model_id"], "Qwen/Qwen2.5-VL-3B-Instruct")

    def test_plan_gpu_tasks_respects_max_vlm_calls(self):
        config = load_config(overrides={"gpu": {"max_vlm_calls_per_doc": 1}, "chunking": {"vision_guided": True}})
        profile = DocumentProfile(
            doc_id="d1",
            source_path="sample.pdf",
            file_type="pdf",
            page_count=1,
            extension=".pdf",
            pages=[PageProfile(page_num=1, scanned_score=0.8)],
        )
        parsed = ParsedDocument(doc_id="d1", source_path="sample.pdf", file_type="pdf")
        parsed.figures.append(FigureObject(figure_id="f1", page_num=1, image_path="/tmp/figure.png"))

        tasks = plan_gpu_tasks(profile, parsed, config)

        self.assertEqual(len(tasks), 1)
        self.assertEqual(tasks[0]["task_type"], "ocr_page")


def _task_by_type(tasks, task_type):
    for task in tasks:
        if task["task_type"] == task_type:
            return task
    raise AssertionError(f"Missing task type: {task_type}")


if __name__ == "__main__":
    unittest.main()
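A minimal planning sketch built from the same constructors these tests use. With a scanned page and no figures or tables, the planner should emit only the OCR task; that expectation is an assumption extrapolated from the max-VLM-calls test above, not a documented guarantee:

from zsgdp.config import load_config
from zsgdp.gpu import plan_gpu_tasks
from zsgdp.schema import DocumentProfile, PageProfile, ParsedDocument

# A mostly-scanned single-page document with nothing parsed yet.
config = load_config()
profile = DocumentProfile(
    doc_id="demo",
    source_path="demo.pdf",
    file_type="pdf",
    page_count=1,
    extension=".pdf",
    pages=[PageProfile(page_num=1, scanned_score=0.9)],
)
parsed = ParsedDocument(doc_id="demo", source_path="demo.pdf", file_type="pdf")
for task in plan_gpu_tasks(profile, parsed, config):
    print(task["task_type"], task["model_role"], task["model_id"])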
tests/test_layout_f1.py
ADDED
@@ -0,0 +1,190 @@
"""Tests for layout F1 metric and ground-truth adapters."""

from __future__ import annotations

import unittest

from zsgdp.benchmarks.ground_truth import (
    canonical_category,
    doclaynet_layout_truths,
    omnidocbench_layout_truths,
    parsed_layout_predictions,
)
from zsgdp.schema import Element, FigureObject, ParsedDocument, QualityReport, TableObject
from zsgdp.verify.layout_f1 import compute_layout_f1


def _item(bbox, category="paragraph", page_num=1):
    return {"bbox": bbox, "category": category, "page_num": page_num}


class TestComputeLayoutF1(unittest.TestCase):
    def test_perfect_match_yields_f1_1(self):
        predictions = [_item((0, 0, 100, 50)), _item((0, 60, 100, 110), "table")]
        truths = [_item((0, 0, 100, 50)), _item((0, 60, 100, 110), "table")]
        result = compute_layout_f1(predictions, truths)
        self.assertEqual(result["class_aware"]["f1"], 1.0)
        self.assertEqual(result["class_agnostic"]["f1"], 1.0)
        self.assertEqual(result["class_aware"]["tp"], 2)

    def test_zero_match_yields_f1_0(self):
        predictions = [_item((0, 0, 50, 50))]
        truths = [_item((1000, 1000, 1100, 1100))]
        result = compute_layout_f1(predictions, truths)
        self.assertEqual(result["class_aware"]["f1"], 0.0)
        self.assertEqual(result["class_aware"]["fp"], 1)
        self.assertEqual(result["class_aware"]["fn"], 1)

    def test_iou_below_threshold_misses(self):
        # 50% overlap by area in one axis only -> IoU < 0.5
        predictions = [_item((0, 0, 100, 100))]
        truths = [_item((60, 0, 160, 100))]
        result = compute_layout_f1(predictions, truths, iou_threshold=0.5)
        self.assertEqual(result["class_aware"]["tp"], 0)

    def test_class_aware_vs_agnostic(self):
        # Same bbox, different category -> agnostic matches, aware doesn't.
        predictions = [_item((0, 0, 100, 100), "paragraph")]
        truths = [_item((0, 0, 100, 100), "title")]
        result = compute_layout_f1(predictions, truths)
        self.assertEqual(result["class_aware"]["tp"], 0)
        self.assertEqual(result["class_agnostic"]["tp"], 1)

    def test_per_category_breakdown(self):
        predictions = [_item((0, 0, 100, 100), "title"), _item((0, 200, 100, 300), "table")]
        truths = [_item((0, 0, 100, 100), "title")]
        result = compute_layout_f1(predictions, truths)
        per_category = result["per_category"]
        self.assertEqual(per_category["title"]["tp"], 1)
        self.assertEqual(per_category["table"]["fp"], 1)

    def test_empty_inputs_are_vacuously_correct(self):
        self.assertEqual(compute_layout_f1([], [])["class_aware"]["f1"], 1.0)

    def test_predictions_only_yields_zero(self):
        result = compute_layout_f1([_item((0, 0, 10, 10))], [])
        self.assertEqual(result["class_aware"]["fp"], 1)
        self.assertEqual(result["class_aware"]["f1"], 0.0)

    def test_page_num_must_match(self):
        predictions = [_item((0, 0, 100, 100), "table", page_num=1)]
        truths = [_item((0, 0, 100, 100), "table", page_num=2)]
        result = compute_layout_f1(predictions, truths)
        self.assertEqual(result["class_aware"]["tp"], 0)


class TestDocLayNetAdapter(unittest.TestCase):
    def test_xywh_converted_and_categories_normalized(self):
        ground_truth = {
            "image": {"id": 5, "file_name": "p.png", "page_no": 5},
            "annotations": [
                {"image_id": 5, "category_id": 1, "bbox": [10, 20, 50, 60]},
                {"image_id": 5, "category_id": 2, "bbox": [100, 0, 40, 30]},
            ],
            "categories": {1: {"name": "Title"}, 2: {"name": "Section-header"}},
        }
        truths = doclaynet_layout_truths(ground_truth)
        self.assertEqual(len(truths), 2)
        self.assertEqual(truths[0]["bbox"], (10.0, 20.0, 60.0, 80.0))
        self.assertEqual(truths[0]["category"], "title")
        self.assertEqual(truths[0]["page_num"], 5)
        self.assertEqual(truths[1]["category"], "heading")

    def test_invalid_annotations_dropped(self):
        ground_truth = {
            "image": {"id": 1, "file_name": "p.png"},
            "annotations": [
                {"image_id": 1, "category_id": 1, "bbox": [0, 0, 0, 0]},
                {"image_id": 1, "category_id": 1},
            ],
            "categories": {1: {"name": "Text"}},
        }
        self.assertEqual(doclaynet_layout_truths(ground_truth), [])


class TestOmniDocBenchAdapter(unittest.TestCase):
    def test_picks_layout_dets_first(self):
        ground_truth = {
            "layout_dets": [
                {"bbox": [0, 0, 100, 50], "category": "title", "page_num": 1},
                {"bbox": [0, 100, 100, 150], "category": "Table", "page": 1},
            ]
        }
        truths = omnidocbench_layout_truths(ground_truth)
        self.assertEqual(len(truths), 2)
        self.assertEqual(truths[0]["category"], "title")
        self.assertEqual(truths[1]["category"], "table")

    def test_pages_nested_records(self):
        ground_truth = {
            "pages": [
                {"page_num": 1, "elements": [{"bbox": [0, 0, 10, 10], "category": "paragraph"}]},
                {"page_num": 2, "elements": [{"bbox": [0, 0, 10, 10], "category": "table"}]},
            ]
        }
        truths = omnidocbench_layout_truths(ground_truth)
        self.assertEqual(len(truths), 2)
        self.assertEqual(truths[0]["page_num"], 1)
        self.assertEqual(truths[1]["page_num"], 2)

    def test_unknown_shape_returns_empty(self):
        self.assertEqual(omnidocbench_layout_truths({"weird": "shape"}), [])
        self.assertEqual(omnidocbench_layout_truths(None), [])


class TestParsedPredictions(unittest.TestCase):
    def test_extracts_bboxes_from_elements_tables_figures(self):
        parsed = ParsedDocument(
            doc_id="d1",
            source_path="/tmp/d1.pdf",
            file_type="pdf",
            elements=[
                Element(
                    element_id="e1",
                    doc_id="d1",
                    page_num=1,
                    type="title",
                    text="Title",
                    bbox=(0.0, 0.0, 100.0, 30.0),
                ),
                Element(
                    element_id="e2",
                    doc_id="d1",
                    page_num=1,
                    type="paragraph",
                    text="No bbox",
                ),
            ],
            tables=[
                TableObject(
                    table_id="t1",
                    page_nums=[1],
                    bbox=[(0.0, 100.0, 200.0, 200.0)],
                )
            ],
            figures=[
                FigureObject(
                    figure_id="f1",
                    page_num=2,
                    bbox=(50.0, 50.0, 150.0, 250.0),
                )
            ],
            quality_report=QualityReport(),
        )
        predictions = parsed_layout_predictions(parsed)
        categories = sorted(prediction["category"] for prediction in predictions)
        self.assertEqual(categories, ["figure", "table", "title"])
        self.assertEqual(len(predictions), 3)


class TestCanonicalCategory(unittest.TestCase):
    def test_canonical_mapping(self):
        self.assertEqual(canonical_category("Picture"), "figure")
        self.assertEqual(canonical_category("Section-header"), "heading")
        self.assertEqual(canonical_category("Page-footer"), "footer")
        self.assertEqual(canonical_category("formula"), "formula")
        self.assertEqual(canonical_category("Mystery"), "mystery")


if __name__ == "__main__":
    unittest.main()
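The metric itself is easy to probe directly. A sketch using compute_layout_f1 with the same item shape as _item above; one matched title and one missed table give precision 1.0 and recall 0.5:

from zsgdp.verify.layout_f1 import compute_layout_f1

predictions = [{"bbox": (0, 0, 100, 50), "category": "title", "page_num": 1}]
truths = [
    {"bbox": (0, 0, 100, 50), "category": "title", "page_num": 1},
    {"bbox": (0, 60, 100, 110), "category": "table", "page_num": 1},
]
result = compute_layout_f1(predictions, truths, iou_threshold=0.5)
# tp=1, fp=0, fn=1 -> precision 1.0, recall 0.5, F1 = 2/3.
print(result["class_aware"]["f1"], result["class_aware"]["fn"])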
tests/test_logging.py
ADDED
@@ -0,0 +1,125 @@
"""Tests for the logging configuration and structured log emission."""

from __future__ import annotations

import io
import json
import logging
import tempfile
import unittest
from pathlib import Path
from unittest.mock import patch

from zsgdp.logging_config import configure_logging, get_logger
from zsgdp.pipeline import parse_document


class ConfigureLoggingTests(unittest.TestCase):
    def setUp(self) -> None:
        # Reset between tests so each one configures cleanly.
        root = logging.getLogger("zsgdp")
        for handler in list(root.handlers):
            root.removeHandler(handler)

    def test_idempotent_configuration(self):
        stream = io.StringIO()
        configure_logging(level="INFO", json_format=False, stream=stream)
        configure_logging(level="INFO", json_format=False, stream=stream)
        root = logging.getLogger("zsgdp")
        # Idempotent: still exactly one handler attached.
        self.assertEqual(len(root.handlers), 1)

    def test_text_format_emits_human_readable(self):
        stream = io.StringIO()
        configure_logging(level="INFO", json_format=False, stream=stream)
        get_logger("zsgdp.test").info("hello", extra={"doc_id": "d1"})
        output = stream.getvalue()
        self.assertIn("INFO", output)
        self.assertIn("zsgdp.test", output)
        self.assertIn("hello", output)

    def test_json_format_emits_one_line_records(self):
        stream = io.StringIO()
        configure_logging(level="INFO", json_format=True, stream=stream)
        get_logger("zsgdp.test").info("event", extra={"doc_id": "abc", "count": 3})
        output = stream.getvalue().strip()
        record = json.loads(output)
        self.assertEqual(record["level"], "INFO")
        self.assertEqual(record["logger"], "zsgdp.test")
        self.assertEqual(record["message"], "event")
        self.assertEqual(record["doc_id"], "abc")
        self.assertEqual(record["count"], 3)

    def test_default_level_is_warning(self):
        stream = io.StringIO()
        with patch.dict("os.environ", {"ZSGDP_LOG_LEVEL": "", "ZSGDP_LOG_JSON": ""}, clear=False):
            configure_logging(stream=stream)
        get_logger("zsgdp.test").info("hidden_at_default_level")
        self.assertNotIn("hidden_at_default_level", stream.getvalue())
        get_logger("zsgdp.test").warning("visible_at_default_level")
        self.assertIn("visible_at_default_level", stream.getvalue())

    def test_get_logger_namespacing(self):
        self.assertEqual(get_logger().name, "zsgdp")
        self.assertEqual(get_logger("zsgdp.foo").name, "zsgdp.foo")
        # Bare names get prefixed.
        self.assertEqual(get_logger("foo").name, "zsgdp.foo")


class PipelineLogEmissionTests(unittest.TestCase):
    def test_parse_emits_start_and_end_records(self):
        # Reset handlers so assertLogs works against the named logger.
        root = logging.getLogger("zsgdp")
        for handler in list(root.handlers):
            root.removeHandler(handler)
        root.setLevel(logging.DEBUG)
        root.propagate = True

        with tempfile.TemporaryDirectory() as tmp:
            input_path = Path(tmp) / "doc.md"
            input_path.write_text("# Doc\n\nHello.\n", encoding="utf-8")

            with self.assertLogs("zsgdp.pipeline", level="INFO") as captured:
                parse_document(input_path, Path(tmp) / "out")

        messages = [record.message for record in captured.records]
        self.assertIn("parse_start", messages)
        self.assertIn("parse_end", messages)

        # Find a parse_end record and assert structured fields are present.
        parse_end = next(record for record in captured.records if record.message == "parse_end")
        self.assertTrue(hasattr(parse_end, "doc_id"))
        self.assertTrue(hasattr(parse_end, "elapsed_seconds"))
        self.assertTrue(hasattr(parse_end, "quality_score"))
        self.assertTrue(hasattr(parse_end, "element_count"))


class RepairLogEmissionTests(unittest.TestCase):
    def test_repair_emits_iteration_record(self):
        root = logging.getLogger("zsgdp")
        for handler in list(root.handlers):
            root.removeHandler(handler)
        root.setLevel(logging.DEBUG)
        root.propagate = True

        with tempfile.TemporaryDirectory() as tmp:
            input_path = Path(tmp) / "report.md"
            # Malformed table forces a repair iteration.
            input_path.write_text(
                "# Report\n\n| A | B |\n| --- | --- |\n| 1 | 2 | 3 |\n",
                encoding="utf-8",
            )

            with self.assertLogs("zsgdp.repair.controller", level="INFO") as captured:
                parse_document(input_path, Path(tmp) / "out")

        repair_records = [r for r in captured.records if r.message == "repair_iteration"]
        self.assertGreaterEqual(len(repair_records), 1)
        # Each record carries the iteration index.
        for record in repair_records:
            self.assertTrue(hasattr(record, "iteration"))
            self.assertTrue(hasattr(record, "status"))


if __name__ == "__main__":
    unittest.main()
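The configuration surface these tests pin down is small. A sketch of enabling JSON logs programmatically, assuming only the configure_logging and get_logger signatures exercised above:

import sys

from zsgdp.logging_config import configure_logging, get_logger

# One JSON record per line; extra fields become top-level keys in the record.
configure_logging(level="INFO", json_format=True, stream=sys.stderr)
get_logger("zsgdp.demo").info("parse_start", extra={"doc_id": "demo", "page_count": 3})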
tests/test_markdown_normalizer.py
ADDED
@@ -0,0 +1,63 @@
import unittest

from zsgdp.normalize.markdown import markdown_to_blocks, normalize_markdown_candidate, normalize_markdown_table


class MarkdownNormalizerTests(unittest.TestCase):
    def test_markdown_to_blocks_preserves_pages_tables_and_images(self):
        markdown = """# Report

Intro paragraph.

| Region | Q1 |
| --- | --- |
| NA | 10 |

<!-- page:2 -->

## Figure Section

![Figure](chart.png)
"""

        candidate = normalize_markdown_candidate(
            markdown=markdown,
            doc_id="d1",
            source_path="sample.md",
            file_type="markdown",
            parser_name="test",
        )

        self.assertEqual([page["page_num"] for page in candidate.pages], [1, 2])
        self.assertEqual(len(candidate.tables), 1)
        self.assertEqual(candidate.tables[0].page_nums, [1])
        self.assertEqual(len(candidate.figures), 1)
        self.assertEqual(candidate.figures[0].page_num, 2)
        self.assertEqual(candidate.figures[0].image_path, "chart.png")

    def test_normalize_markdown_table_repairs_separator(self):
        table = "| A | B |\n| --- | --- |\n| 1 | 2 |"

        self.assertEqual(normalize_markdown_table(table), "| A | B |\n| --- | --- |\n| 1 | 2 |")

    def test_normalize_plain_aligned_table(self):
        table = "Region  Q1  Q2\nNorth America  10  12\nEurope  8  7"

        self.assertEqual(
            normalize_markdown_table(table),
            "| Region | Q1 | Q2 |\n| --- | --- | --- |\n| North America | 10 | 12 |\n| Europe | 8 | 7 |",
        )

    def test_markdown_to_blocks_detects_plain_aligned_table(self):
        blocks = markdown_to_blocks("# Report\n\nRegion  Q1  Q2\nNorth America  10  12\nEurope  8  7")

        self.assertEqual(blocks[1].block_type, "table")

    def test_markdown_to_blocks_classifies_caption(self):
        blocks = markdown_to_blocks("Figure 1 Revenue trend")

        self.assertEqual(blocks[0].block_type, "caption")


if __name__ == "__main__":
    unittest.main()
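A quick illustration of the plain-table repair these tests pin down, using only normalize_markdown_table (column separators shown as two spaces, matching the aligned-table test above):

from zsgdp.normalize.markdown import normalize_markdown_table

# Whitespace-aligned text becomes a pipe table with a separator row.
plain = "Region  Q1  Q2\nNorth America  10  12\nEurope  8  7"
print(normalize_markdown_table(plain))
# | Region | Q1 | Q2 |
# | --- | --- | --- |
# | North America | 10 | 12 |
# | Europe | 8 | 7 |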
tests/test_marker_parser.py
ADDED
@@ -0,0 +1,73 @@
import tempfile
import unittest
from pathlib import Path
from unittest.mock import patch

from zsgdp.config import load_config
from zsgdp.parsers.external import MarkerParser, _read_external_markdown, _read_marker_markdown, normalize_marker_markdown
from zsgdp.schema import DocumentProfile, PageProfile


class MarkerParserTests(unittest.TestCase):
    def test_normalize_marker_markdown_emits_common_schema(self):
        profile = DocumentProfile(
            doc_id="d1",
            source_path="sample.pdf",
            file_type="pdf",
            page_count=1,
            extension=".pdf",
            pages=[PageProfile(page_num=1, digital_text_chars=20)],
        )

        candidate = normalize_marker_markdown(
            # Placeholder image reference; the figure assertion below expects one figure.
            markdown="# Report\n\n| A | B |\n| --- | --- |\n| 1 | 2 |\n\n![figure](figure.png)\n",
            profile=profile,
            source_path="sample.pdf",
        )

        self.assertEqual(candidate.parser_name, "marker")
        self.assertEqual(len(candidate.tables), 1)
        self.assertEqual(len(candidate.figures), 1)
        self.assertEqual(candidate.pages[0]["source_parser"], "marker")

    def test_marker_parser_runs_markdown_through_normalizer(self):
        profile = DocumentProfile(
            doc_id="d1",
            source_path="sample.pdf",
            file_type="pdf",
            page_count=1,
            extension=".pdf",
            pages=[PageProfile(page_num=1, digital_text_chars=20)],
        )

        with patch.object(MarkerParser, "available", return_value=True), patch(
            "zsgdp.parsers.external.run_marker_to_markdown",
            return_value="# Report\n\nBody.",
        ):
            candidate = MarkerParser().parse("sample.pdf", profile, load_config())

        self.assertEqual(candidate.parser_name, "marker")
        self.assertEqual(candidate.elements[0].source_parser, "marker")
        self.assertEqual(candidate.provenance["requested_pages"], [1])

    def test_read_marker_markdown_prefers_markdown_file(self):
        with tempfile.TemporaryDirectory() as tmp:
            root = Path(tmp)
            nested = root / "sample"
            nested.mkdir()
            (nested / "other.md").write_text("# Other", encoding="utf-8")
            (nested / "markdown.md").write_text("# Preferred", encoding="utf-8")

            markdown = _read_marker_markdown(root)

        self.assertEqual(markdown, "# Preferred")

    def test_read_external_markdown_falls_back_to_stdout(self):
        with tempfile.TemporaryDirectory() as tmp:
            markdown = _read_external_markdown(Path(tmp), parser_name="mineru", stdout="# From stdout")

        self.assertEqual(markdown, "# From stdout")


if __name__ == "__main__":
    unittest.main()
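A sketch of running Marker output through the shared normalizer, mirroring the profile construction above (the markdown body is illustrative):

from zsgdp.parsers.external import normalize_marker_markdown
from zsgdp.schema import DocumentProfile, PageProfile

profile = DocumentProfile(
    doc_id="demo",
    source_path="demo.pdf",
    file_type="pdf",
    page_count=1,
    extension=".pdf",
    pages=[PageProfile(page_num=1, digital_text_chars=20)],
)
candidate = normalize_marker_markdown(
    markdown="# Title\n\nBody text.",
    profile=profile,
    source_path="demo.pdf",
)
print(candidate.parser_name, len(candidate.elements))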
tests/test_merge.py
ADDED
@@ -0,0 +1,134 @@
import unittest

from zsgdp.merge.dedupe import dedupe_elements, dedupe_tables
from zsgdp.schema import Element, TableObject


class MergeDedupeTests(unittest.TestCase):
    def test_merges_docling_heading_with_pymupdf_bbox(self):
        elements = [
            Element(
                element_id="docling_p1_e1",
                doc_id="d1",
                page_num=1,
                type="heading",
                text="## Revenue Summary",
                markdown="## Revenue Summary",
                reading_order=1,
                confidence=0.88,
                source_parser="docling",
            ),
            Element(
                element_id="pymupdf_p1_e1",
                doc_id="d1",
                page_num=1,
                type="paragraph",
                text="Revenue Summary",
                bbox=(72.0, 100.0, 200.0, 124.0),
                reading_order=1,
                confidence=0.86,
                source_parser="pymupdf",
            ),
        ]

        deduped = dedupe_elements(elements)

        self.assertEqual(len(deduped), 1)
        self.assertEqual(deduped[0].source_parser, "docling")
        self.assertEqual(deduped[0].bbox, (72.0, 100.0, 200.0, 124.0))
        self.assertEqual(deduped[0].provenance["bbox_source_parser"], "pymupdf")

    def test_drops_paragraph_duplicate_of_structured_table(self):
        elements = [
            Element(
                element_id="docling_p1_e1",
                doc_id="d1",
                page_num=1,
                type="paragraph",
                text="Region Q1 Q2 North America 10 12 Europe 8 7",
                reading_order=1,
                confidence=0.88,
                source_parser="docling",
            ),
            Element(
                element_id="pymupdf_p1_e1",
                doc_id="d1",
                page_num=1,
                type="table",
                markdown="| Region | Q1 | Q2 |\n| --- | --- | --- |\n| North America | 10 | 12 |\n| Europe | 8 | 7 |",
                reading_order=1,
                confidence=0.72,
                source_parser="pymupdf",
            ),
        ]

        deduped = dedupe_elements(elements)

        self.assertEqual(len(deduped), 1)
        self.assertEqual(deduped[0].type, "table")

    def test_merges_duplicate_table_elements_and_keeps_better_grid(self):
        elements = [
            Element(
                element_id="docling_p1_e3",
                doc_id="d1",
                page_num=1,
                type="table",
                markdown="| Region | Q1 | Q2 North America | 10 | 12 Europe | 8 | 7 |\n| --- | --- | --- | --- | --- | --- | --- |",
                reading_order=3,
                confidence=0.88,
                source_parser="docling",
            ),
            Element(
                element_id="pymupdf_p1_e3",
                doc_id="d1",
                page_num=1,
                type="table",
                bbox=(72.0, 144.0, 237.0, 186.0),
                markdown="| Region | Q1 | Q2 |\n| --- | --- | --- |\n| North America | 10 | 12 |\n| Europe | 8 | 7 |",
                reading_order=3,
                confidence=0.72,
                source_parser="pymupdf",
            ),
        ]

        deduped = dedupe_elements(elements)

        self.assertEqual(len(deduped), 1)
        self.assertEqual(deduped[0].source_parser, "pymupdf")
        self.assertEqual(deduped[0].confidence, 0.88)
        self.assertIn("| North America | 10 | 12 |", deduped[0].markdown or "")
        self.assertEqual(deduped[0].bbox, (72.0, 144.0, 237.0, 186.0))

    def test_merges_duplicate_tables_and_keeps_better_grid_assets(self):
        tables = [
            TableObject(
                table_id="docling_t1",
                page_nums=[1],
                markdown="| Region | Q1 | Q2 North America | 10 | 12 Europe | 8 | 7 |\n| --- | --- | --- | --- | --- | --- | --- |",
                confidence=0.84,
                source_parser="docling",
            ),
            TableObject(
                table_id="pymupdf_t1",
                page_nums=[1],
                bbox=[(72.0, 144.0, 237.0, 186.0)],
                markdown="| Region | Q1 | Q2 |\n| --- | --- | --- |\n| North America | 10 | 12 |\n| Europe | 8 | 7 |",
                confidence=0.72,
                source_parser="pymupdf",
                provenance={"crop_path": "/tmp/table.png"},
            ),
        ]

        deduped = dedupe_tables(tables)

        self.assertEqual(len(deduped), 1)
        self.assertEqual(deduped[0].source_parser, "pymupdf")
        self.assertEqual(deduped[0].confidence, 0.84)
        self.assertEqual(deduped[0].bbox, [(72.0, 144.0, 237.0, 186.0)])
        self.assertEqual(deduped[0].provenance["crop_path"], "/tmp/table.png")
        self.assertEqual(deduped[0].provenance["source_parsers"], ["pymupdf", "docling"])


if __name__ == "__main__":
    unittest.main()
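The merge behaviour is easiest to see on a two-element example. A sketch of the heading/bbox merge, reusing the Element fields from the first test above (ids and text are illustrative):

from zsgdp.merge.dedupe import dedupe_elements
from zsgdp.schema import Element

# docling wins on structure (heading type); the bbox is borrowed from pymupdf.
docling = Element(element_id="docling_p1_e1", doc_id="d1", page_num=1, type="heading",
                  text="## Summary", markdown="## Summary", reading_order=1,
                  confidence=0.9, source_parser="docling")
pymupdf = Element(element_id="pymupdf_p1_e1", doc_id="d1", page_num=1, type="paragraph",
                  text="Summary", bbox=(72.0, 100.0, 200.0, 124.0), reading_order=1,
                  confidence=0.8, source_parser="pymupdf")
merged = dedupe_elements([docling, pymupdf])
print(len(merged), merged[0].bbox)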
tests/test_parser_disagreement.py
ADDED
@@ -0,0 +1,177 @@
"""Tests for parser-disagreement and repair-success metrics."""

from __future__ import annotations

import tempfile
import unittest
from pathlib import Path

from zsgdp.merge.conflict_detection import build_candidate_conflict_report
from zsgdp.pipeline import parse_document
from zsgdp.schema import DocumentProfile, Element, ParseCandidate, PageProfile, TableObject
from zsgdp.verify.parser_disagreement import compute_parser_disagreement
from zsgdp.verify.repair_success import compute_repair_success


def _profile() -> DocumentProfile:
    return DocumentProfile(
        doc_id="d1",
        source_path="/tmp/d1.md",
        file_type="markdown",
        page_count=1,
        extension=".md",
        pages=[PageProfile(page_num=1, digital_text_chars=400, digital_text_quality=0.9)],
    )


def _candidate(name: str, *, text: str, table_count: int = 0) -> ParseCandidate:
    elements = [
        Element(
            element_id=f"{name}_e1",
            doc_id="d1",
            page_num=1,
            type="paragraph",
            text=text,
            reading_order=1,
            source_parser=name,
        )
    ]
    tables: list[TableObject] = []
    for index in range(table_count):
        tables.append(
            TableObject(
                table_id=f"{name}_t{index + 1}",
                page_nums=[1],
                markdown="| A | B |\n| --- | --- |\n| 1 | 2 |",
                source_parser=name,
            )
        )
    return ParseCandidate(
        parser_name=name,
        doc_id="d1",
        source_path="/tmp/d1.md",
        file_type="markdown",
        elements=elements,
        tables=tables,
        figures=[],
        pages=[{"page_num": 1, "source_parser": name}],
        confidence=0.8,
    )


class TestParserDisagreement(unittest.TestCase):
    def test_disagreement_rate_uses_pair_count_denominator(self):
        candidates = [
            _candidate("docling", text="A" * 800, table_count=4),
            _candidate("pymupdf", text="A" * 100, table_count=0),
        ]
        report = build_candidate_conflict_report(candidates)
        parser_metrics = {
            "docling": {"parser": "docling", "failed": False},
            "pymupdf": {"parser": "pymupdf", "failed": False},
        }

        result = compute_parser_disagreement(report, parser_metrics)

        self.assertEqual(result["candidate_count"], 2)
        self.assertEqual(result["parser_pair_count"], 1)
        self.assertGreater(result["conflict_count"], 0)
        self.assertGreater(result["disagreement_rate"], 0.0)
        self.assertIn("text_coverage_gap", result["disagreement_by_type"])
        self.assertIn("docling|pymupdf", result["disagreement_by_parser_pair"])

    def test_disagreement_rate_zero_when_single_parser(self):
        result = compute_parser_disagreement(
            {"conflicts": []},
            {"docling": {"parser": "docling", "failed": False}},
        )

        self.assertEqual(result["candidate_count"], 1)
        self.assertEqual(result["parser_pair_count"], 0)
        self.assertEqual(result["disagreement_rate"], 0.0)

    def test_failed_parsers_excluded_from_pair_count(self):
        result = compute_parser_disagreement(
            {"conflicts": []},
            {
                "docling": {"parser": "docling", "failed": False},
                "marker": {"parser": "marker", "failed": True, "error": "boom"},
                "pymupdf": {"parser": "pymupdf", "failed": False},
            },
        )

        self.assertEqual(result["candidate_count"], 2)
        self.assertEqual(result["parser_pair_count"], 1)


class TestRepairSuccess(unittest.TestCase):
    def test_resolution_rate_when_blocking_issue_resolved(self):
        pre = {"score": 0.5, "issues": [{"issue_type": "invalid_table", "blocking": True, "page_num": 1, "region_id": "t1"}]}
        post = {"score": 0.9, "issues": []}
        history = [{"iteration": 1, "before_score": 0.5, "after_score": 0.9, "actions": [{"action": "repair_table"}]}]

        result = compute_repair_success(pre, post, history)

        self.assertEqual(result["pre_repair_blocking_count"], 1)
        self.assertEqual(result["post_repair_blocking_count"], 0)
        self.assertEqual(result["resolved_blocking_count"], 1)
        self.assertEqual(result["repair_resolution_rate"], 1.0)
        self.assertEqual(result["repair_regression_rate"], 0.0)
        self.assertEqual(result["iteration_count"], 1)
        self.assertAlmostEqual(result["score_delta"], 0.4, places=6)

    def test_regression_rate_counts_new_blocking_issues(self):
        pre = {"score": 0.7, "issues": [{"issue_type": "invalid_table", "blocking": True, "region_id": "t1"}]}
        post = {
            "score": 0.6,
            "issues": [
                {"issue_type": "invalid_table", "blocking": True, "region_id": "t1"},
                {"issue_type": "missing_text_coverage", "blocking": True, "page_num": 2},
            ],
        }
        history = [{"iteration": 1, "before_score": 0.7, "after_score": 0.6, "actions": []}]

        result = compute_repair_success(pre, post, history)

        self.assertEqual(result["resolved_blocking_count"], 0)
        self.assertEqual(result["regressed_blocking_count"], 1)
        self.assertEqual(result["repair_regression_rate"], 1.0)
        self.assertEqual(result["repair_resolution_rate"], 0.0)

    def test_vacuous_success_when_no_pre_repair_blocking_issues(self):
        result = compute_repair_success(
            {"score": 1.0, "issues": []},
            {"score": 1.0, "issues": []},
            [],
        )

        self.assertEqual(result["repair_resolution_rate"], 1.0)
        self.assertEqual(result["repair_regression_rate"], 0.0)
        self.assertEqual(result["iteration_count"], 0)


class TestRepairSuccessIntegration(unittest.TestCase):
    def test_pipeline_records_resolution_for_iterative_table_repair(self):
        with tempfile.TemporaryDirectory() as tmp:
            input_path = Path(tmp) / "report.md"
            input_path.write_text(
                "# Report\n\n| A | B |\n| --- | --- |\n| 1 | 2 | 3 |\n",
                encoding="utf-8",
            )

            parsed = parse_document(input_path, Path(tmp) / "out")

        metrics = parsed.quality_report.metrics
        self.assertIn("repair_resolution_rate", metrics)
        self.assertIn("repair_regression_rate", metrics)
        self.assertIn("parser_disagreement_rate", metrics)

        success = parsed.provenance["repair_success"]
        self.assertGreaterEqual(success["pre_repair_issue_count"], 1)
        self.assertGreaterEqual(success["resolved_any_count"], 1)
        self.assertGreaterEqual(success["repair_resolution_rate_any"], 0.0)
        self.assertGreater(success["iteration_count"], 0)


if __name__ == "__main__":
    unittest.main()
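A worked example of the repair-success arithmetic, feeding compute_repair_success the same report shape the unit tests above use:

from zsgdp.verify.repair_success import compute_repair_success

pre = {"score": 0.5, "issues": [{"issue_type": "invalid_table", "blocking": True, "region_id": "t1"}]}
post = {"score": 0.9, "issues": []}
history = [{"iteration": 1, "before_score": 0.5, "after_score": 0.9, "actions": [{"action": "repair_table"}]}]

result = compute_repair_success(pre, post, history)
# One blocking issue before, none after: resolution rate 1.0, score delta 0.4.
print(result["repair_resolution_rate"], result["score_delta"])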