Spaces:
Running
Running
import os | |
from pathlib import Path | |
from docling.datamodel.document import ConversionResult | |
from huggingface_hub import snapshot_download | |
from docling.datamodel.base_models import InputFormat | |
from docling.datamodel.pipeline_options import EasyOcrOptions, OcrMacOptions, PdfPipeline, PdfPipelineOptions, PipelineOptions, RapidOcrOptions, TesseractOcrOptions | |
from docling.document_converter import DocumentConverter, ImageFormatOption, PdfFormatOption | |
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend, PyPdfiumPageBackend | |
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline | |
from docling.pipeline.simple_pipeline import SimplePipeline | |
# TODO: Refactor the OCR model loading to use either the server models or the mobile models only.
def load_rapid_ocr_model(det_model: str, rec_model: str, cls_model: str) -> DocumentConverter:
    """
    Build a DocumentConverter whose image pipeline runs RapidOCR.

    The "SWHL/RapidOCR" repository is fetched from the Hugging Face Hub and
    each model argument is joined onto the local snapshot directory.

    Args:
        det_model (str): Detection model file, joined onto the snapshot directory.
        rec_model (str): Recognition model file, joined onto the snapshot directory.
        cls_model (str): Classification model file, joined onto the snapshot directory.
    Returns:
        DocumentConverter: Converter with RapidOCR options set for image input.
    """
    print("Downloading RapidOCR models")
    snapshot_dir = snapshot_download(repo_id="SWHL/RapidOCR")

    # Resolve every model file against the downloaded snapshot directory.
    model_paths = {
        "det_model_path": os.path.join(snapshot_dir, det_model),
        "rec_model_path": os.path.join(snapshot_dir, rec_model),
        "cls_model_path": os.path.join(snapshot_dir, cls_model),
    }
    image_option = ImageFormatOption(
        pipeline_options=PdfPipelineOptions(
            ocr_options=RapidOcrOptions(**model_paths)
        )
    )

    # Only the image format option is overridden here; no allowed_formats
    # restriction is passed to the converter.
    return DocumentConverter(format_options={InputFormat.IMAGE: image_option})
def load_ocr_mac_model() -> DocumentConverter:
    """
    Build a DocumentConverter that performs OCR through OcrMacOptions.

    Returns:
        DocumentConverter: Converter restricted to PDF and image input, with
        both formats routed through StandardPdfPipeline on the PyPdfium
        document backend.
    """
    supported_formats = (InputFormat.PDF, InputFormat.IMAGE)
    mac_pipeline_options = PdfPipelineOptions(
        ocr_options=OcrMacOptions(framework='vision')
    )

    # One separately constructed, identically configured option per format.
    per_format_options = {
        input_format: PdfFormatOption(
            pipeline_cls=StandardPdfPipeline,
            backend=PyPdfiumDocumentBackend,
            pipeline_options=mac_pipeline_options,
        )
        for input_format in supported_formats
    }

    return DocumentConverter(
        allowed_formats=list(supported_formats),
        format_options=per_format_options,
    )
def load_tesseract_model(tessdata_path: "str | os.PathLike[str]") -> DocumentConverter:
    """
    Build a DocumentConverter that performs OCR with Tesseract.

    Side effect: sets the process-wide ``TESSDATA_PREFIX`` environment
    variable to ``tessdata_path`` before the converter is created.

    Args:
        tessdata_path: Path to the Tesseract data directory. Either a ``str``
            or a path-like object such as ``pathlib.Path`` is accepted.
    Returns:
        DocumentConverter: Converter restricted to PDF and image input, with
        both formats routed through StandardPdfPipeline on the PyPdfium
        document backend.
    Raises:
        TypeError: If ``tessdata_path`` is neither a ``str`` nor a path-like
            object that yields a ``str``.
    """
    # os.environ only accepts str values; os.fspath leaves a str untouched and
    # converts path-like objects, so callers may pass a pathlib.Path.
    os.environ["TESSDATA_PREFIX"] = os.fspath(tessdata_path)

    ocr_options = TesseractOcrOptions()
    pipeline_options = PdfPipelineOptions(
        ocr_options=ocr_options
    )
    doc_converter = DocumentConverter(
        allowed_formats=[
            InputFormat.PDF,
            InputFormat.IMAGE,
        ],
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_cls=StandardPdfPipeline,
                backend=PyPdfiumDocumentBackend,
                pipeline_options=pipeline_options,
            ),
            InputFormat.IMAGE: PdfFormatOption(
                pipeline_cls=StandardPdfPipeline,
                backend=PyPdfiumDocumentBackend,
                pipeline_options=pipeline_options,
            ),
        },
    )
    return doc_converter
def load_easy_ocr_model() -> DocumentConverter:
    """
    Build a DocumentConverter that performs OCR with default EasyOcrOptions.

    Returns:
        DocumentConverter: Converter restricted to PDF and image input, with
        both formats routed through StandardPdfPipeline on the PyPdfium
        document backend.
    """
    easyocr_pipeline_options = PdfPipelineOptions(ocr_options=EasyOcrOptions())

    # Fill the per-format map in PDF-then-image order, one new option each.
    options_by_format = {}
    for input_format in (InputFormat.PDF, InputFormat.IMAGE):
        options_by_format[input_format] = PdfFormatOption(
            pipeline_cls=StandardPdfPipeline,
            backend=PyPdfiumDocumentBackend,
            pipeline_options=easyocr_pipeline_options,
        )

    return DocumentConverter(
        allowed_formats=[InputFormat.PDF, InputFormat.IMAGE],
        format_options=options_by_format,
    )
def image_to_text(document_converter: DocumentConverter, file_path: Path) -> ConversionResult:
    """
    Run the given document converter on one file and return its result.

    Args:
        document_converter (DocumentConverter): Converter whose ``convert``
            method is called.
        file_path (Path): Location of the file handed to the converter.
    Returns:
        ConversionResult: Whatever ``document_converter.convert`` returns for
        ``file_path``.
    """
    return document_converter.convert(file_path)