# adelevett's picture
# Upload 2 files
# 8ac770e verified
# ---------------------------------------------------------------------------
# Force-upgrade transformers to >=5.1.0 before any other import.
#
# Why: PP-DocLayoutV3's custom model classes (PPDocLayoutV3ImageProcessor,
# PPDocLayoutV3ForObjectDetection) were added to the transformers library in
# version 5.1.0. docling-ibm-models caps transformers<5.0.0 (conservative
# pinning), so pip resolves transformers ~4.x at build time. We upgrade it
# here at runtime, before any docling/transformers import, so the correct
# classes are available. docling-ibm-models' usage (AutoModel, pipeline API)
# remains compatible with transformers 5.x.
# ---------------------------------------------------------------------------
import subprocess
import sys
# Run pip with the interpreter that is executing this script so the upgrade
# lands in the same environment. check=True makes a non-zero pip exit status
# raise CalledProcessError instead of being ignored.
_TRANSFORMERS_UPGRADE_CMD = [
    sys.executable,
    "-m",
    "pip",
    "install",
    "transformers>=5.1.0",
    "--quiet",
]
subprocess.run(_TRANSFORMERS_UPGRADE_CMD, check=True)
# `spaces` MUST be imported before any package that touches CUDA (torch,
# transformers, docling …). ZeroGPU intercepts the CUDA initialisation; if
# anything else triggers it first the import raises RuntimeError.
import spaces # noqa: E402
# ---------------------------------------------------------------------------
# Plugin registration
# ---------------------------------------------------------------------------
# docling-pp-doc-layout requires Python >=3.12 on PyPI, but the code itself
# is compatible with Python 3.10 (all annotations are guarded by
# `from __future__ import annotations`). Instead of installing the package,
# we bundle the source directly and register the model with docling's factory
# by monkey-patching BaseFactory.load_from_plugins so that every new
# LayoutFactory instance automatically includes PPDocLayoutV3Model.
from docling.models.factories.base_factory import BaseFactory
from docling.models.factories.layout_factory import LayoutFactory
from docling_pp_doc_layout.model import PPDocLayoutV3Model
# Keep a handle on the unpatched method so the wrapper can delegate to it.
_orig_load = BaseFactory.load_from_plugins


def _load_with_pp_doc_layout(
    self, plugin_name=None, allow_external_plugins=False
):
    """Load plugins as usual, then add PPDocLayoutV3Model to layout factories.

    Replacement for ``BaseFactory.load_from_plugins``. It first delegates to
    the original method with the same arguments. When ``self`` is a
    ``LayoutFactory`` it additionally registers ``PPDocLayoutV3Model``; other
    factory types are left as the original method produced them.
    """
    _orig_load(
        self,
        plugin_name=plugin_name,
        allow_external_plugins=allow_external_plugins,
    )

    # Only layout factories should receive the extra model.
    if not isinstance(self, LayoutFactory):
        return

    try:
        self.register(
            PPDocLayoutV3Model,
            "docling-pp-doc-layout",
            "docling_pp_doc_layout.model",
        )
    except ValueError:
        # Already registered on a previous factory creation; nothing to do.
        return


# Install the wrapper on the base class so every factory goes through it.
BaseFactory.load_from_plugins = _load_with_pp_doc_layout
# ---------------------------------------------------------------------------
import gradio as gr
from docling.datamodel.base_models import InputFormat
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling_pp_doc_layout.options import PPDocLayoutV3Options
# Module-level setup. Only the option objects and the converter are created
# here; the pipeline itself is built lazily on the first convert() call.
# That call is made inside the @spaces.GPU-decorated function, so
# decide_device() resolves "cuda:0" once the H200 has been allocated.
_layout_options = PPDocLayoutV3Options(
    batch_size=2,
    confidence_threshold=0.5,
)
pipeline_options = PdfPipelineOptions(layout_options=_layout_options)

# Route PDF inputs through the pipeline options configured above.
_pdf_format_option = PdfFormatOption(pipeline_options=pipeline_options)
converter = DocumentConverter(
    format_options={InputFormat.PDF: _pdf_format_option},
)
@spaces.GPU(duration=120)
def infer_layout(file_path: str | None):
    """Convert an uploaded document and summarise the items it contains.

    Args:
        file_path: Path of the document to convert. A falsy value (``None``
            or an empty string) is treated as "nothing uploaded".

    Returns:
        A ``(data, json_path)`` tuple.

        * On success, ``data`` is a list with one ``{"type", "content"}``
          dict per item yielded by ``result.document.iterate_items()``, and
          ``json_path`` is the name of a temporary ``.json`` file holding the
          same list.
        * When no file was given, ``data`` is ``{"error": ...}`` and
          ``json_path`` is ``None``.
        * When anything raises during conversion or serialisation, ``data``
          is ``{"runtime_exception": <message>}`` and ``json_path`` is
          ``None``.
    """
    # Function-scope imports: these modules are only needed here.
    import json
    import tempfile

    if not file_path:
        return {"error": "No file uploaded"}, None

    try:
        result = converter.convert(file_path)
        structured_data = [
            {
                "type": type(item).__name__,
                # Items without a ``text`` attribute get a placeholder.
                "content": getattr(item, "text", "No text mapping"),
            }
            for item, _level in result.document.iterate_items()
        ]

        # Serialise before creating the file: if the data is not JSON
        # serialisable, no orphaned temp file is left on disk.
        payload = json.dumps(structured_data, ensure_ascii=False, indent=2)

        # delete=False keeps the file after closing so it can be served as a
        # download; the context manager guarantees the handle is closed even
        # if the write fails.
        with tempfile.NamedTemporaryFile(
            mode="w", suffix=".json", delete=False, encoding="utf-8"
        ) as tmp:
            tmp.write(payload)
        return structured_data, tmp.name
    except Exception as e:
        # UI boundary: report the failure to the caller instead of raising.
        return {"runtime_exception": str(e)}, None
# Build the Gradio UI. Components are created in display order, so the
# statement order below is significant.
with gr.Blocks(title="PP-DocLayoutV3 Empirical Parser") as interface:
    gr.Markdown(
        "## Layout Detection Inference\n"
        "Upload a PDF to parse structural components through the "
        "PaddlePaddle PP-DocLayoutV3 model."
    )
    with gr.Row():
        # Upload widget restricted to .pdf files, next to the JSON result view.
        pdf_input = gr.File(label="Source Document", file_types=[".pdf"])
        json_output = gr.JSON(label="Structured Extraction Matrix")
    # Hidden until a run produces a JSON file to download.
    download_btn = gr.DownloadButton(label="Download JSON", visible=False)
    execute_btn = gr.Button("Run Layout Detection")

    def run_and_reveal(file_path):
        """Run infer_layout and update the JSON view and download button.

        Returns the data from infer_layout together with a DownloadButton
        update that points at the generated file and is visible only when
        infer_layout returned a file path (i.e. path is not None).
        """
        data, path = infer_layout(file_path)
        return data, gr.DownloadButton(value=path, visible=path is not None)

    # Clicking the button feeds the uploaded file into run_and_reveal and
    # routes its two return values to the JSON view and the download button.
    execute_btn.click(
        fn=run_and_reveal,
        inputs=pdf_input,
        outputs=[json_output, download_btn],
    )
# Start the Gradio app only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    interface.launch()