llm-document-parser / llm_document_parser /convert_doc_docling.py
salverz's picture
Update file structure
4b3616f
import os
from pathlib import Path
from docling.datamodel.document import ConversionResult
from huggingface_hub import snapshot_download
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import EasyOcrOptions, OcrMacOptions, PdfPipeline, PdfPipelineOptions, PipelineOptions, RapidOcrOptions, TesseractOcrOptions
from docling.document_converter import DocumentConverter, ImageFormatOption, PdfFormatOption
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend, PyPdfiumPageBackend
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
from docling.pipeline.simple_pipeline import SimplePipeline
# TODO: Refactor OCR model loading to use either the server models or the mobile models.
def load_rapid_ocr_model(det_model: str, rec_model: str, cls_model: str) -> DocumentConverter:
    """
    Build a DocumentConverter whose image pipeline is configured with RapidOCR.

    The "SWHL/RapidOCR" repository is fetched from the Hugging Face Hub and
    each model argument is joined onto the resulting download directory.

    Args:
        det_model (str): Detection model path, joined onto the download directory.
        rec_model (str): Recognition model path, joined onto the download directory.
        cls_model (str): Classification model path, joined onto the download directory.

    Returns:
        DocumentConverter: Converter with RapidOCR options set for image input.
    """
    print("Downloading RapidOCR models")
    snapshot_dir = snapshot_download(repo_id="SWHL/RapidOCR")

    # Resolve the three model files against the downloaded snapshot.
    det_path, rec_path, cls_path = (
        os.path.join(snapshot_dir, model_file)
        for model_file in (det_model, rec_model, cls_model)
    )

    image_option = ImageFormatOption(
        pipeline_options=PdfPipelineOptions(
            ocr_options=RapidOcrOptions(
                det_model_path=det_path,
                rec_model_path=rec_path,
                cls_model_path=cls_path,
            )
        )
    )

    # Only the image format gets a custom option here.
    return DocumentConverter(format_options={InputFormat.IMAGE: image_option})
def load_ocr_mac_model() -> DocumentConverter:
    """
    Build a DocumentConverter configured with the OCR Mac engine.

    The OCR options use the 'vision' framework, and the same pipeline options
    are applied to both PDF and image input.

    Returns:
        DocumentConverter: Converter restricted to PDF and image formats.
    """
    mac_pipeline = PdfPipelineOptions(
        ocr_options=OcrMacOptions(framework='vision')
    )
    supported = [InputFormat.PDF, InputFormat.IMAGE]

    return DocumentConverter(
        allowed_formats=supported,
        # One separate format option per input format, all sharing the pipeline options.
        format_options={
            fmt: PdfFormatOption(
                pipeline_cls=StandardPdfPipeline,
                backend=PyPdfiumDocumentBackend,
                pipeline_options=mac_pipeline,
            )
            for fmt in supported
        },
    )
def load_tesseract_model(tessdata_path: str) -> DocumentConverter:
    """
    Build a DocumentConverter configured with Tesseract OCR.

    Sets the TESSDATA_PREFIX environment variable for the current process
    before constructing the converter.

    Args:
        tessdata_path (str): Value written to TESSDATA_PREFIX (the Tesseract data directory).

    Returns:
        DocumentConverter: Converter restricted to PDF and image formats.
    """
    # Must be exported before the OCR options/converter are created.
    os.environ["TESSDATA_PREFIX"] = tessdata_path

    tesseract_pipeline = PdfPipelineOptions(ocr_options=TesseractOcrOptions())
    supported = [InputFormat.PDF, InputFormat.IMAGE]

    return DocumentConverter(
        allowed_formats=supported,
        # One separate format option per input format, all sharing the pipeline options.
        format_options={
            fmt: PdfFormatOption(
                pipeline_cls=StandardPdfPipeline,
                backend=PyPdfiumDocumentBackend,
                pipeline_options=tesseract_pipeline,
            )
            for fmt in supported
        },
    )
def load_easy_ocr_model() -> DocumentConverter:
    """
    Build a DocumentConverter configured with EasyOCR.

    EasyOCR is used with its default options, and the same pipeline options
    are applied to both PDF and image input.

    Returns:
        DocumentConverter: Converter restricted to PDF and image formats.
    """
    easyocr_pipeline = PdfPipelineOptions(ocr_options=EasyOcrOptions())
    supported = [InputFormat.PDF, InputFormat.IMAGE]

    return DocumentConverter(
        allowed_formats=supported,
        # One separate format option per input format, all sharing the pipeline options.
        format_options={
            fmt: PdfFormatOption(
                pipeline_cls=StandardPdfPipeline,
                backend=PyPdfiumDocumentBackend,
                pipeline_options=easyocr_pipeline,
            )
            for fmt in supported
        },
    )
def image_to_text(document_converter: DocumentConverter, file_path: Path) -> ConversionResult:
    """
    Run the given document converter on a single file.

    Args:
        document_converter (DocumentConverter): Converter whose ``convert`` method is called.
        file_path (Path): Path to the image file to convert.

    Returns:
        ConversionResult: Whatever ``document_converter.convert`` returns for the file.
    """
    # Pure delegation: no pre- or post-processing is applied to the result.
    return document_converter.convert(file_path)