Spaces:

salverz
/

llm-document-parser

Running

App Files Files Community

llm-document-parser / llm_document_parser /convert_doc_docling.py

salverz

Update file structure

4b3616f about 1 month ago

raw

history blame contribute delete

5.13 kB

	import os
	from pathlib import Path
	from docling.datamodel.document import ConversionResult
	from huggingface_hub import snapshot_download

	from docling.datamodel.base_models import InputFormat
	from docling.datamodel.pipeline_options import EasyOcrOptions, OcrMacOptions, PdfPipeline, PdfPipelineOptions, PipelineOptions, RapidOcrOptions, TesseractOcrOptions
	from docling.document_converter import DocumentConverter, ImageFormatOption, PdfFormatOption
	from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend, PyPdfiumPageBackend
	from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
	from docling.pipeline.simple_pipeline import SimplePipeline


	# TODO: Refactor OCR model loading to use either the server models or the mobile models.
	def load_rapid_ocr_model(det_model: str, rec_model: str, cls_model: str) -> DocumentConverter:
	    """
	    Build a document converter whose image inputs are OCR'd with RapidOCR.

	    A snapshot of the "SWHL/RapidOCR" repository is fetched through
	    huggingface_hub, and each model argument is joined onto the directory
	    of that snapshot to locate the model file.

	    Args:
	        det_model (str): Path of the detection model, joined onto the snapshot directory.
	        rec_model (str): Path of the recognition model, joined onto the snapshot directory.
	        cls_model (str): Path of the classification model, joined onto the snapshot directory.
	    Returns:
	        DocumentConverter: Converter with RapidOCR options registered for InputFormat.IMAGE.
	    """
	    print("Downloading RapidOCR models")
	    snapshot_dir = snapshot_download(repo_id="SWHL/RapidOCR")

	    # Resolve the three model files against the downloaded snapshot.
	    rapid_ocr = RapidOcrOptions(
	        det_model_path=os.path.join(snapshot_dir, det_model),
	        rec_model_path=os.path.join(snapshot_dir, rec_model),
	        cls_model_path=os.path.join(snapshot_dir, cls_model),
	    )

	    # Only the image format gets custom options; nothing else is overridden here.
	    image_option = ImageFormatOption(
	        pipeline_options=PdfPipelineOptions(ocr_options=rapid_ocr)
	    )
	    return DocumentConverter(format_options={InputFormat.IMAGE: image_option})

	def load_ocr_mac_model() -> DocumentConverter:
	    """
	    Build a document converter that uses the OCR Mac engine.

	    The OCR options are created with framework='vision'. PDF and image
	    inputs are both routed through StandardPdfPipeline with the
	    PyPdfiumDocumentBackend, sharing one set of pipeline options.

	    Returns:
	        DocumentConverter: Converter restricted to PDF and image inputs.
	    """
	    supported_formats = (InputFormat.PDF, InputFormat.IMAGE)
	    shared_pipeline = PdfPipelineOptions(
	        ocr_options=OcrMacOptions(framework='vision')
	    )

	    # One separate format option per input format, all configured alike.
	    per_format = {
	        fmt: PdfFormatOption(
	            pipeline_cls=StandardPdfPipeline,
	            backend=PyPdfiumDocumentBackend,
	            pipeline_options=shared_pipeline,
	        )
	        for fmt in supported_formats
	    }

	    return DocumentConverter(
	        allowed_formats=list(supported_formats),
	        format_options=per_format,
	    )

	def load_tesseract_model(tessdata_path: str) -> DocumentConverter:
	    """
	    Build a document converter that uses Tesseract OCR.

	    The TESSDATA_PREFIX environment variable of the current process is
	    set to ``tessdata_path`` before the OCR options are created. PDF and
	    image inputs are both routed through StandardPdfPipeline with the
	    PyPdfiumDocumentBackend.

	    Args:
	        tessdata_path (str): Path to the Tesseract data directory.
	    Returns:
	        DocumentConverter: Converter restricted to PDF and image inputs.
	    """
	    # Must happen first: the environment is updated before any OCR option exists.
	    os.environ["TESSDATA_PREFIX"] = tessdata_path

	    tesseract_pipeline = PdfPipelineOptions(ocr_options=TesseractOcrOptions())

	    def make_format_option() -> PdfFormatOption:
	        # Each input format receives its own option object with identical settings.
	        return PdfFormatOption(
	            pipeline_cls=StandardPdfPipeline,
	            backend=PyPdfiumDocumentBackend,
	            pipeline_options=tesseract_pipeline,
	        )

	    return DocumentConverter(
	        allowed_formats=[InputFormat.PDF, InputFormat.IMAGE],
	        format_options={
	            InputFormat.PDF: make_format_option(),
	            InputFormat.IMAGE: make_format_option(),
	        },
	    )

	def load_easy_ocr_model() -> DocumentConverter:
	    """
	    Build a document converter that uses EasyOCR with its default options.

	    PDF and image inputs are both routed through StandardPdfPipeline with
	    the PyPdfiumDocumentBackend, sharing one set of pipeline options.

	    Returns:
	        DocumentConverter: Converter restricted to PDF and image inputs.
	    """
	    easy_ocr_pipeline = PdfPipelineOptions(ocr_options=EasyOcrOptions())

	    # Settings common to both formats; unpacked into two separate option objects.
	    option_kwargs = {
	        "pipeline_cls": StandardPdfPipeline,
	        "backend": PyPdfiumDocumentBackend,
	        "pipeline_options": easy_ocr_pipeline,
	    }
	    pdf_option = PdfFormatOption(**option_kwargs)
	    image_option = PdfFormatOption(**option_kwargs)

	    return DocumentConverter(
	        allowed_formats=[InputFormat.PDF, InputFormat.IMAGE],
	        format_options={
	            InputFormat.PDF: pdf_option,
	            InputFormat.IMAGE: image_option,
	        },
	    )

	def image_to_text(document_converter: DocumentConverter, file_path: Path) -> ConversionResult:
	    """
	    Run the given document converter on a single file.

	    Args:
	        document_converter (DocumentConverter): Converter whose ``convert`` method is called.
	        file_path (Path): Path to the image file to convert.
	    Returns:
	        ConversionResult: Whatever ``document_converter.convert`` returns for the file.
	    """
	    # Thin pass-through: no pre- or post-processing is applied here.
	    return document_converter.convert(file_path)