from pathlib import Path

import gradio as gr
import pymupdf
from ultralytics import YOLO
from huggingface_hub import hf_hub_download

SAMPLES = Path(__file__).parent / "samples"
IMAGE_SAMPLES = [
    SAMPLES / "image1.png",
    SAMPLES / "image2.png",
    SAMPLES / "image3.png",
    SAMPLES / "image4.png",
]

# Fine-tuned document-layout checkpoints hosted on the Hugging Face Hub.
AVAILABLE_MODELS = {
    "yolo11n": ("Armaggheddon/yolo11-document-layout", "yolo11n_doc_layout.pt"),
    "yolo11s": ("Armaggheddon/yolo11-document-layout", "yolo11s_doc_layout.pt"),
    "yolo11m": ("Armaggheddon/yolo11-document-layout", "yolo11m_doc_layout.pt"),
}

current_model = "yolo11n"
model = None


def load_model(selected_model):
    """Download and load the selected checkpoint, reusing the loaded model when possible."""
    global model, current_model
    if model is None or current_model != selected_model:
        repo_id, filename = AVAILABLE_MODELS[selected_model]
        model_path = hf_hub_download(repo_id=repo_id, filename=filename)
        model = YOLO(model_path)
        current_model = selected_model


def model_runner(image, conf=0.25, iou=0.45):
    """Run layout detection on a single image and return the annotated result."""
    result = model.predict(
        source=image, save=False, verbose=False, conf=conf, iou=iou, imgsz=1280
    )
    return result[0].plot()


def process_input(selected_model, pdf_input, image_input, conf=0.25, iou=0.45):
    if pdf_input is None and image_input is None:
        raise gr.Error("Please upload a PDF or an image file.")

    load_model(selected_model)

    pages = []
    if pdf_input is not None and pdf_input.endswith(".pdf"):
        doc = pymupdf.open(pdf_input)
        for page in doc:
            # 200 dpi keeps an A4 page above ~1400 px wide, plenty for imgsz=1280.
            pix = page.get_pixmap(dpi=200)
            pil_img = pix.pil_image()
            pages.append(model_runner(pil_img, conf, iou))
    elif image_input is not None and image_input.endswith((".png", ".jpg", ".jpeg")):
        pages.append(model_runner(image_input, conf, iou))
    else:
        raise gr.Error(
            "Unsupported file type. Please upload a PDF or an image file with a "
            ".pdf, .png, .jpg or .jpeg extension."
        )

    # Gallery items as (image, caption) pairs, one per processed page.
    return [(page, f"Page {i + 1}") for i, page in enumerate(pages)]


with gr.Blocks() as demo:
    gr.Markdown("# YOLO11 Document Layout 🔎📄")
    gr.Markdown(
        """
        Detects layout elements in documents (PDFs or images) using YOLO11 models and the Ultralytics library.
        Upload a PDF or an image, select a model size, and click "Run" to see the detected layout elements.
        - Fine-tuned models available at [Armaggheddon/yolo11-document-layout](https://huggingface.co/Armaggheddon/yolo11-document-layout)
        - More details in the [GitHub repository](https://github.com/Armaggheddon/yolo11_doc_layout)
        """
    )

    with gr.Row():
        with gr.Column():
            pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"], file_count="single")
            image_input = gr.Image(label="Upload Image", type="filepath")
            clear_button = gr.Button("Clear")
            run_button = gr.Button("Run", variant="primary")
        with gr.Column():
            outputs = gr.Gallery(label="Output Image")
            with gr.Group():
                model_name = gr.Dropdown(
                    list(AVAILABLE_MODELS.keys()),
                    value="yolo11n",
                    label="Model size",
                )
                conf = gr.Slider(0, 1, value=0.25, step=0.01, label="Confidence threshold")
                iou = gr.Slider(0, 1, value=0.45, step=0.01, label="IOU threshold")

    examples = gr.Examples(
        examples=[[str(p), "yolo11n"] for p in IMAGE_SAMPLES],
        inputs=[image_input, model_name],
        cache_examples=False,
        fn=process_input,
        outputs=outputs,
    )

    run_button.click(
        fn=process_input,
        inputs=[model_name, pdf_input, image_input, conf, iou],
        outputs=outputs,
    )
    clear_button.click(
        fn=lambda: (None, None, None),
        inputs=[],
        outputs=[pdf_input, image_input, outputs],
    )

demo.launch()