Spaces:

chunking-ai
/

pdf-playground

Paused

App Files Files Community

taprosoft committed on Feb 25

Commit

77fbded

0 Parent(s):

feat: initial commit

Browse files

Files changed (9) hide show

README.md +13 -0
app.py +200 -0
backends/__init__.py +11 -0
backends/docling.py +47 -0
backends/marker.py +24 -0
backends/mineru.py +56 -0
backends/unstructured.py +68 -0
header.html +47 -0
utils.py +29 -0

README.md ADDED Viewed

	@@ -0,0 +1,13 @@

+---
+title: DoclingConverter
+emoji: 🐢
+colorFrom: blue
+colorTo: red
+sdk: gradio
+sdk_version: 5.7.1
+app_file: app.py
+pinned: false
+short_description: Convert documents to Markdown or JSON with metadata
+---
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py ADDED Viewed

	@@ -0,0 +1,200 @@

+import time
+from pathlib import Path
+import gradio as gr
+import pymupdf4llm
+from gradio_pdf import PDF
+from backends import (
+    convert_docling,
+    convert_marker,
+    convert_mineru,
+    convert_unstructured,
+)
+from utils import remove_images_from_markdown, trim_pages
+TRIMMED_PDF_PATH = Path("/tmp/gradio/trim")
+TRIMMED_PDF_PATH.mkdir(exist_ok=True)
+def convert_document(path, method, enabled=True):
+    print("Processing file", path, "with method", method, "enabled", enabled)
+    if not enabled:
+        return "", "", []
+    # benchmarking
+    start = time.time()
+    path = trim_pages(path, output_path=TRIMMED_PDF_PATH)
+    file_name = Path(path).stem
+    debug_image_paths = []
+    text = "unknown method"
+    if method == "Docling":
+        text, debug_image_paths = convert_docling(path, file_name)
+    elif method == "Marker":
+        text, debug_image_paths = convert_marker(path, file_name)
+    elif method == "Unstructured":
+        text, debug_image_paths = convert_unstructured(path, file_name)
+    elif method == "PyMuPDF":
+        text = pymupdf4llm.to_markdown(
+            path,
+            embed_images=True,
+        )
+    elif method == "MinerU":
+        text, debug_image_paths = convert_mineru(path, file_name)
+    end = time.time()
+    print(f"Conversion with {method} took {end - start} seconds")
+    return text, remove_images_from_markdown(text), debug_image_paths
+def show_tabs(selected_methods):
+    visible_tabs = []
+    for method in supported_methods:
+        visible_tabs.append(gr.update(visible=method in selected_methods))
+    return visible_tabs
+# Math delimiters handed to every gr.Markdown output below; by their
+# "display" flags, "$$…$$" is presumably block math and "$…$" inline math.
+latex_delimiters = [
+    {"left": "$$", "right": "$$", "display": True},
+    {"left": "$", "right": "$", "display": False},
+]
+# startup test (also for loading models the first time)
+start_startup = time.time()
+# NOTE(review): machine-specific absolute path; only the commented-out
+# warm-up sequence below refers to it.
+test_pdf_path = "/home/tadashi/MinerU/examples/complex_layout.pdf"
+# Backend names. This order drives the order of the output tabs and of the
+# chained conversion steps built further down.
+supported_methods = ["Docling", "Marker", "Unstructured", "MinerU", "PyMuPDF"]
+# print("Warm-up sequence")
+# for method in supported_methods:
+#     for _ in range(1):
+#         convert_document(test_pdf_path, method)
+# print("Start up time", time.time() - start_startup, "seconds")
+# Build the whole Gradio UI and its event wiring, then launch the app.
+with gr.Blocks(
+    theme=gr.themes.Ocean(),
+) as demo:
+    # Static page banner; the path is resolved against the working directory.
+    with open("header.html", "r") as file:
+        header = file.read()
+    gr.HTML(header)
+    # Flat list holding three output components per method, appended in the
+    # order [markdown_render, markdown_text, debug_images].
+    output_components = []
+    # One top-level result tab per method, same order as supported_methods.
+    output_tabs = []
+    # The "Debug visualizations" sub-tab of every method.
+    visualization_sub_tabs = []
+    first_method = supported_methods[0]
+    num_methods = len(supported_methods)
+    # Top row: upload and status on the left, conversion controls on the right.
+    with gr.Row():
+        with gr.Column(variant="panel", scale=5):
+            input_file = gr.File(
+                label="Upload PDF document",
+                file_types=[
+                    ".pdf",
+                ],
+            )
+            # Receives the "Processing (i / n)" messages from the click chain.
+            progress_status = gr.Markdown("", show_label=False, container=False)
+        with gr.Column(variant="panel", scale=5):
+            with gr.Row():
+                methods = gr.Dropdown(
+                    supported_methods,
+                    label="Conversion methods",
+                    value=first_method,
+                    multiselect=True,
+                )
+            with gr.Row():
+                visual_checkbox = gr.Checkbox(
+                    label="Enable debug visualizations", value=True
+                )
+            with gr.Row():
+                convert_btn = gr.Button("Convert", variant="primary", scale=2)
+                clear_btn = gr.ClearButton(value="Clear", scale=1)
+    # Bottom row: PDF preview on the left, per-method results on the right.
+    with gr.Row():
+        with gr.Column(variant="panel", scale=5):
+            pdf_preview = PDF(
+                label="PDF preview",
+                interactive=False,
+                visible=True,
+                height=800,
+            )
+        with gr.Column(variant="panel", scale=5):
+            with gr.Tabs():
+                for method in supported_methods:
+                    # Tabs start hidden; show_tabs reveals the selected ones
+                    # when "Convert" is clicked.
+                    with gr.Tab(method, visible=False) as output_tab:
+                        with gr.Tabs():
+                            with gr.Tab("Markdown rendering"):
+                                markdown_render = gr.Markdown(
+                                    label="Markdown rendering",
+                                    height=900,
+                                    show_copy_button=True,
+                                    line_breaks=True,
+                                    latex_delimiters=latex_delimiters,
+                                )
+                            with gr.Tab("Debug visualizations") as visual_sub_tab:
+                                debug_images = gr.Gallery(
+                                    show_label=False,
+                                    container=False,
+                                    interactive=False,
+                                )
+                            with gr.Tab("Raw text"):
+                                markdown_text = gr.TextArea(
+                                    lines=45, show_label=False, container=False
+                                )
+                    # Order matches convert_document's return tuple:
+                    # (text, text without images, debug image paths).
+                    output_components.extend(
+                        [markdown_render, markdown_text, debug_images]
+                    )
+                    output_tabs.append(output_tab)
+                    visualization_sub_tabs.append(visual_sub_tab)
+    # Mirror the uploaded file into the preview component.
+    input_file.change(fn=lambda x: x, inputs=input_file, outputs=pdf_preview)
+    # Step 1 of the click chain: reveal the tabs of the selected methods.
+    click_event = convert_btn.click(
+        fn=show_tabs,
+        inputs=[methods],
+        outputs=output_tabs,
+    )
+    # Steps 2..n: for each method, post a progress message and then run the
+    # conversion. Every step is appended with .then(), so the methods are
+    # chained one after another on a single event chain.
+    for idx, method in enumerate(supported_methods):
+        # Default arguments freeze the current idx/method for each closure;
+        # without them every callback would see the loop's final values.
+        def progress_message(idx=idx, method=method):
+            return f"Processing ({idx + 1} / {num_methods}) **{method}**...\n\n"
+        def process_method(input_file, selected_methods, method=method):
+            # Unselected methods still run this step, but convert_document
+            # returns empty outputs for them.
+            return convert_document(
+                input_file, method=method, enabled=method in selected_methods
+            )
+        click_event = click_event.then(
+            fn=lambda idx=idx, method=method: progress_message(idx, method),
+            outputs=[progress_status],
+        ).then(
+            fn=lambda input_file, methods, method=method: process_method(
+                input_file, methods, method
+            ),
+            inputs=[input_file, methods],
+            # The three components that belong to this method.
+            outputs=output_components[idx * 3 : (idx + 1) * 3],
+        )
+    # Final step of the chain: report completion.
+    click_event.then(
+        lambda: "All tasks completed.",
+        outputs=[progress_status],
+    )
+    # Components handed to the "Clear" button.
+    clear_btn.add(
+        [
+            input_file,
+            pdf_preview,
+        ]
+        + output_components
+    )
+    # Show or hide every "Debug visualizations" sub-tab with the checkbox.
+    visual_checkbox.change(
+        fn=lambda state: [gr.update(visible=state)] * len(visualization_sub_tabs),
+        inputs=visual_checkbox,
+        outputs=visualization_sub_tabs,
+    )
+    demo.launch(show_error=True)

backends/__init__.py ADDED Viewed

	@@ -0,0 +1,11 @@

+from .docling import convert_docling
+from .marker import convert_marker
+from .mineru import convert_mineru
+from .unstructured import convert_unstructured
+__all__ = [
+    "convert_docling",
+    "convert_marker",
+    "convert_mineru",
+    "convert_unstructured",
+]

backends/docling.py ADDED Viewed

	@@ -0,0 +1,47 @@

+from pathlib import Path
+from docling.datamodel.base_models import InputFormat
+from docling.datamodel.pipeline_options import (
+    AcceleratorDevice,
+    AcceleratorOptions,
+    PdfPipelineOptions,
+)
+from docling.datamodel.settings import settings
+from docling.document_converter import DocumentConverter, PdfFormatOption
+from docling_core.types.doc import ImageRefMode
+# Directory configured below as Docling's debug output location.
+DOCLING_DEBUG_PATH = Path("/tmp/docling")
+# Docling settings
+accelerator_options = AcceleratorOptions(num_threads=8, device=AcceleratorDevice.AUTO)
+pipeline_options = PdfPipelineOptions()
+pipeline_options.accelerator_options = accelerator_options
+pipeline_options.do_ocr = True
+pipeline_options.do_table_structure = True
+pipeline_options.generate_picture_images = True
+pipeline_options.images_scale = 2.0
+# debug visualization settings
+# NOTE: `settings` is Docling's module-level settings object, so these
+# assignments take effect for the whole process, not just this converter.
+settings.debug.debug_output_path = str(DOCLING_DEBUG_PATH)
+settings.debug.visualize_layout = True
+settings.debug.visualize_tables = True
+# Docling init
+# Built once at import time and reused by every convert_docling() call.
+docling_converter = DocumentConverter(
+    format_options={
+        InputFormat.PDF: PdfFormatOption(
+            pipeline_options=pipeline_options,
+        )
+    }
+)
+def convert_docling(path: str, file_name: str):
+    result = docling_converter.convert(path)
+    text = result.document.export_to_markdown(image_mode=ImageRefMode.EMBEDDED)
+    debug_image_dir = DOCLING_DEBUG_PATH / f"debug_{file_name}"
+    debug_image_paths = [
+        path for path in debug_image_dir.iterdir() if path.suffix == ".png"
+    ]
+    return text, debug_image_paths

backends/marker.py ADDED Viewed

	@@ -0,0 +1,24 @@

+from pathlib import Path
+from marker.converters.pdf import PdfConverter
+from marker.models import create_model_dict
+from marker.output import text_from_rendered
+# Marker init
+# Built once at import time and reused by every convert_marker() call.
+# "debug_pdf_images" is switched on because convert_marker() collects the
+# "pdf_page" images from the debug directory reported in the render metadata.
+marker_converter = PdfConverter(
+    artifact_dict=create_model_dict(),
+    config={
+        "debug_pdf_images": True,
+    },
+)
+def convert_marker(path: str, file_name: str):
+    rendered = marker_converter(path)
+    text, _, images = text_from_rendered(rendered)
+    debug_image_dir = Path(rendered.metadata.get("debug_data_path"))
+    debug_image_paths = [
+        path for path in debug_image_dir.iterdir() if "pdf_page" in path.stem
+    ]
+    return text, debug_image_paths

backends/mineru.py ADDED Viewed

	@@ -0,0 +1,56 @@

+from pathlib import Path
+import pymupdf
+from magic_pdf.data.data_reader_writer import FileBasedDataReader
+from magic_pdf.tools.common import do_parse, prepare_env
+# Root directory for MinerU output; convert_mineru() creates one
+# sub-directory per document underneath it.
+MINERU_DEBUG_PATH = Path("/tmp/mineru")
+MINERU_DEBUG_PATH.mkdir(exist_ok=True)
+def read_fn(path):
+    disk_rw = FileBasedDataReader(MINERU_DEBUG_PATH)
+    return disk_rw.read(path)
+def do_process_mineru(input_path, output_dir):
+    """Run MinerU's ``do_parse`` on one PDF.
+
+    Args:
+        input_path: Location of the PDF; its stem becomes the document name.
+        output_dir: Directory passed to ``prepare_env`` and ``do_parse``.
+
+    Returns:
+        A tuple ``(local_md_dir, file_name)``: the second value returned by
+        ``prepare_env`` and the PDF's stem.
+    """
+    file_name = Path(input_path).stem
+    output_dir = Path(output_dir)
+    pdf_data = read_fn(input_path)
+    parse_method = "auto"
+    # local_image_dir is not used afterwards; only local_md_dir is returned.
+    # NOTE(review): presumably prepare_env also creates both directories,
+    # confirm against magic_pdf.tools.common.
+    local_image_dir, local_md_dir = prepare_env(output_dir, file_name, parse_method)
+    do_parse(
+        output_dir,
+        file_name,
+        pdf_data,
+        [],
+        parse_method,
+        debug_able=False,
+        f_dump_orig_pdf=False,
+        formula_enable=False,
+        table_enable=True,
+    )
+    return local_md_dir, file_name
+def convert_mineru(path: str, file_name: str):
+    debug_image_paths = []
+    output_path = MINERU_DEBUG_PATH / file_name
+    output_path.mkdir(exist_ok=True)
+    local_md_dir, _ = do_process_mineru(path, output_path)
+    local_md_dir = Path(local_md_dir)
+    with open(local_md_dir / f"{file_name}.md", "r") as file:
+        text = file.read()
+    debug_pdf = str(local_md_dir / (file_name + "_layout.pdf"))
+    doc = pymupdf.open(debug_pdf)  # open document
+    for page in doc:  # iterate through the pages
+        pix = page.get_pixmap()  # render page to an image
+        page_debug_path = str(output_path / ("page-%i.png" % page.number))
+        debug_image_paths.append(page_debug_path)
+        pix.save(page_debug_path)  # store image as a PNG
+    return text, debug_image_paths

backends/unstructured.py ADDED Viewed

	@@ -0,0 +1,68 @@

+import functools
+from pathlib import Path
+from matplotlib import font_manager
+from unstructured.partition.pdf import partition_pdf
+from unstructured.partition.pdf_image.analysis import bbox_visualisation
+# Directory passed to partition_pdf() as the analysis image output location.
+UNSTRUCTURED_DEBUG_PATH = Path("/tmp/unstructured")
+def convert_elements_to_markdown(elements):
+    lines = []
+    for e in elements:
+        if e.category == "Title":
+            line = f"\n# {e.text}\n"
+        elif e.category == "ListItem":
+            line = f"- {e.text}"
+        elif e.category == "Table":
+            line = f"\n{e.metadata.text_as_html}\n"
+        elif e.category == "UncategorizedText":
+            line = ""
+        else:
+            line = e.text
+        lines.append(line)
+    md = "\n".join(lines)
+    return md
+@functools.lru_cache(maxsize=None)
+def get_font():
+    preferred_fonts = ["Arial.ttf", "DejaVuSans.ttf"]
+    available_fonts = font_manager.findSystemFonts()
+    if not available_fonts:
+        raise ValueError("No fonts available")
+    for font in preferred_fonts:
+        for available_font in available_fonts:
+            if font in available_font:
+                return available_font
+    return available_fonts[0]
+# monkey patch
+# Replace unstructured's bbox_visualisation.get_font with the local get_font()
+# above, which falls back to any available system font.
+bbox_visualisation.get_font = get_font
+def convert_unstructured(path: str, file_name: str):
+    elements = partition_pdf(
+        filename=path,
+        # mandatory to use ``hi_res`` strategy
+        strategy="hi_res",
+        infer_table_structure=True,
+        # extract_images_in_pdf=True,
+        # extract_image_block_types=["Image", "Table"],
+        # extract_image_block_to_payload=False,
+        analysis=True,
+        analyzed_image_output_dir_path=UNSTRUCTURED_DEBUG_PATH,
+    )
+    text = convert_elements_to_markdown(elements)
+    debug_image_dir = UNSTRUCTURED_DEBUG_PATH / "analysis" / file_name / "bboxes"
+    debug_image_paths = [
+        path for path in debug_image_dir.iterdir() if "od_model" in path.stem
+    ]
+    return text, debug_image_paths

header.html ADDED Viewed

	@@ -0,0 +1,47 @@

+<html>
+  <head>
+  <link rel="stylesheet" href="https://use.fontawesome.com/releases/v5.15.4/css/all.css">
+  </head>
+<body>
+  <div style="
+      display: flex;
+      flex-direction: column;
+      justify-content: center;
+      align-items: center;
+      text-align: center;
+      background: #059669;
+      padding: 18px;
+      gap: 18px;
+      border-radius: 8px;
+    ">
+    <div style="
+        display: flex;
+        flex-direction: column;
+        align-items: center;
+        gap: 12px;
+      ">
+      <div style="display: flex; flex-direction: column; gap: 8px">
+        <h1 style="
+            font-size: 48px;
+            color: #fafafa;
+            margin: 0;
+            font-family: 'Trebuchet MS', 'Lucida Sans Unicode',
+              'Lucida Grande', 'Lucida Sans', Arial, sans-serif;
+          ">
+          PDF Parsers Playground
+        </h1>
+      </div>
+    </div>
+    <p style="
+        margin: 0;
+        line-height: 1.6rem;
+        font-size: 16px;
+        color: #fafafa;
+        opacity: 0.8;
+      ">
+      A playground for quick and easy experimentation with many popular open-source PDF parsers.<br>
+    </p>
+  </div>
+</body></html>

utils.py ADDED Viewed

	@@ -0,0 +1,29 @@

+import re
+from pathlib import Path
+from shutil import copy2
+import pymupdf
+def remove_images_from_markdown(markdown_text):
+    # remove <image> and ![image](path) from markdown
+    markdown_text = re.sub(r"<img[^>]*>", "", markdown_text)
+    markdown_text = re.sub(r"!\[[^\]]*\]\([^)]*\)", "", markdown_text)
+    return markdown_text
+def trim_pages(pdf_path, output_path, trim_pages=5):
+    doc = pymupdf.open(pdf_path)
+    parent_dir_name = Path(pdf_path).parent.name
+    output_file_path = Path(output_path) / f"{parent_dir_name}.pdf"
+    num_pages = len(doc)
+    if num_pages > trim_pages:
+        to_select = list(range(trim_pages))
+        doc.select(to_select)
+        doc.ez_save(output_file_path)
+        print("Trimmed pdf to with pages", to_select, "path", output_file_path)
+    else:
+        copy2(pdf_path, str(output_file_path))
+    return str(output_file_path)