Spaces:

salverz
/

llm-document-parser

Running

File size: 5,129 Bytes

3f7e152

import os
from pathlib import Path
from docling.datamodel.document import ConversionResult
from huggingface_hub import snapshot_download

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import EasyOcrOptions, OcrMacOptions, PdfPipeline, PdfPipelineOptions, PipelineOptions, RapidOcrOptions, TesseractOcrOptions
from docling.document_converter import DocumentConverter, ImageFormatOption, PdfFormatOption
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend, PyPdfiumPageBackend
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
from docling.pipeline.simple_pipeline import SimplePipeline


# TODO: REFACTOR LOAD OCR MODEL TO JUST EITHER USE SERVER MODELS OR MOBILE MODELS
def load_rapid_ocr_model(det_model: str, rec_model: str, cls_model: str) -> DocumentConverter:
    """
    Build a DocumentConverter whose OCR stage uses RapidOCR models from the Hugging Face Hub.

    A snapshot of the "SWHL/RapidOCR" repository is fetched with
    ``snapshot_download`` and each model argument is joined onto the returned
    snapshot directory, so the arguments are expected to be paths relative to
    the repository root.

    Args:
        det_model (str): Detection model path, relative to the snapshot root.
        rec_model (str): Recognition model path, relative to the snapshot root.
        cls_model (str): Classification model path, relative to the snapshot root.
    Returns:
        DocumentConverter: Converter with the RapidOCR options registered for
        both PDF and image inputs.
    """
    print("Downloading RapidOCR models")
    snapshot_dir = snapshot_download(repo_id="SWHL/RapidOCR")

    ocr_options = RapidOcrOptions(
        det_model_path=os.path.join(snapshot_dir, det_model),
        rec_model_path=os.path.join(snapshot_dir, rec_model),
        cls_model_path=os.path.join(snapshot_dir, cls_model),
    )
    pipeline_options = PdfPipelineOptions(ocr_options=ocr_options)

    return DocumentConverter(
        format_options={
            # Register PDF explicitly, like the other loaders in this module do;
            # otherwise PDFs would be converted without the RapidOCR options
            # configured above.
            InputFormat.PDF: PdfFormatOption(
                pipeline_cls=StandardPdfPipeline,
                backend=PyPdfiumDocumentBackend,
                pipeline_options=pipeline_options,
            ),
            InputFormat.IMAGE: ImageFormatOption(
                pipeline_options=pipeline_options
            ),
        }
    )

def load_ocr_mac_model() -> DocumentConverter:
    """
    Build a DocumentConverter that runs OCR through ``OcrMacOptions``.

    The OCR options are created with ``framework='vision'`` (presumably Apple's
    Vision framework, i.e. macOS only -- confirm against the docling docs).
    The converter accepts PDF and image inputs only, and both go through the
    standard PDF pipeline with the pypdfium2 backend.

    Returns:
        DocumentConverter: Converter configured with the OCR Mac options.
    """
    supported_formats = (InputFormat.PDF, InputFormat.IMAGE)
    mac_pipeline_options = PdfPipelineOptions(
        ocr_options=OcrMacOptions(framework='vision')
    )

    return DocumentConverter(
        allowed_formats=list(supported_formats),
        # One distinct option object per format, all sharing the same pipeline options.
        format_options={
            fmt: PdfFormatOption(
                pipeline_cls=StandardPdfPipeline,
                backend=PyPdfiumDocumentBackend,
                pipeline_options=mac_pipeline_options,
            )
            for fmt in supported_formats
        },
    )

def load_tesseract_model(tessdata_path: str) -> DocumentConverter:
    """
    Build a DocumentConverter that runs OCR through ``TesseractOcrOptions``.

    Side effect: sets the ``TESSDATA_PREFIX`` environment variable for the whole
    process to ``tessdata_path``, overwriting any previous value.

    Args:
        tessdata_path (str): Path to the Tesseract data directory. Path-like
            objects (e.g. ``pathlib.Path``) are accepted as well.
    Returns:
        DocumentConverter: Converter restricted to PDF and image inputs, both
        handled by the standard PDF pipeline with the pypdfium2 backend.
    """
    # os.environ only accepts str values; os.fspath leaves str untouched and
    # converts path-like objects, so passing a Path no longer raises TypeError.
    os.environ["TESSDATA_PREFIX"] = os.fspath(tessdata_path)

    ocr_options = TesseractOcrOptions()

    pipeline_options = PdfPipelineOptions(
        ocr_options=ocr_options
    )

    doc_converter = DocumentConverter(
        allowed_formats=[
            InputFormat.PDF,
            InputFormat.IMAGE
        ],
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_cls=StandardPdfPipeline, backend=PyPdfiumDocumentBackend, pipeline_options=pipeline_options
            ),
            InputFormat.IMAGE: PdfFormatOption(
                pipeline_cls=StandardPdfPipeline, backend=PyPdfiumDocumentBackend, pipeline_options=pipeline_options
            )
        }
    )

    return doc_converter

def load_easy_ocr_model() -> DocumentConverter:
    """
    Build a DocumentConverter that runs OCR through default ``EasyOcrOptions``.

    The converter accepts PDF and image inputs only, and both go through the
    standard PDF pipeline with the pypdfium2 backend.

    Returns:
        DocumentConverter: Converter configured with the EasyOCR options.
    """
    accepted_formats = (InputFormat.PDF, InputFormat.IMAGE)
    easyocr_pipeline_options = PdfPipelineOptions(ocr_options=EasyOcrOptions())

    # Each format gets its own option object; the pipeline options are shared.
    per_format_options = {
        fmt: PdfFormatOption(
            pipeline_cls=StandardPdfPipeline,
            backend=PyPdfiumDocumentBackend,
            pipeline_options=easyocr_pipeline_options,
        )
        for fmt in accepted_formats
    }

    return DocumentConverter(
        allowed_formats=list(accepted_formats),
        format_options=per_format_options,
    )

def image_to_text(document_converter: DocumentConverter, file_path: Path) -> ConversionResult:
    """
    Run the given document converter on a single file.
    Args:
        document_converter (DocumentConverter): Converter whose ``convert`` method is called.
        file_path (Path): Location of the file to convert.
    Returns:
        ConversionResult: The value returned by ``document_converter.convert(file_path)``.
    """
    return document_converter.convert(file_path)