Spaces:
Paused
Paused
| from pathlib import Path | |
| from typing import Dict, List, Optional, Any, Union | |
| import json | |
| import os | |
| import shutil | |
| from src.parsers.parser_interface import DocumentParser | |
| from src.parsers.parser_registry import ParserRegistry | |
| from docling.document_converter import DocumentConverter, PdfFormatOption | |
| from docling.datamodel.base_models import InputFormat | |
| from docling.datamodel.pipeline_options import ( | |
| AcceleratorDevice, | |
| AcceleratorOptions, | |
| PdfPipelineOptions, | |
| ) | |
| from docling.models.tesseract_ocr_model import TesseractOcrOptions | |
| from docling.models.tesseract_ocr_cli_model import TesseractCliOcrOptions | |
| class DoclingParser(DocumentParser): | |
| """Parser implementation using Docling.""" | |
| def get_name(cls) -> str: | |
| return "Docling" | |
| def get_supported_ocr_methods(cls) -> List[Dict[str, Any]]: | |
| return [ | |
| { | |
| "id": "no_ocr", | |
| "name": "No OCR", | |
| "default_params": {} | |
| }, | |
| { | |
| "id": "easyocr", | |
| "name": "EasyOCR", | |
| "default_params": {"languages": ["en"]} | |
| }, | |
| { | |
| "id": "easyocr_cpu", | |
| "name": "EasyOCR (CPU only)", | |
| "default_params": {"languages": ["en"], "use_gpu": False} | |
| }, | |
| { | |
| "id": "tesseract", | |
| "name": "Tesseract", | |
| "default_params": {} | |
| }, | |
| { | |
| "id": "tesseract_cli", | |
| "name": "Tesseract CLI", | |
| "default_params": {} | |
| }, | |
| { | |
| "id": "full_force_ocr", | |
| "name": "Full Force OCR", | |
| "default_params": {} | |
| } | |
| ] | |
| def parse(self, file_path: Union[str, Path], ocr_method: Optional[str] = None, **kwargs) -> str: | |
| """Parse a document using Docling.""" | |
| # Special case for full force OCR | |
| if ocr_method == "full_force_ocr": | |
| return self._apply_full_force_ocr(file_path) | |
| # Regular Docling parsing | |
| pipeline_options = PdfPipelineOptions() | |
| pipeline_options.do_table_structure = True | |
| pipeline_options.table_structure_options.do_cell_matching = True | |
| # Configure OCR based on the method | |
| if ocr_method == "no_ocr": | |
| pipeline_options.do_ocr = False | |
| elif ocr_method == "easyocr": | |
| pipeline_options.do_ocr = True | |
| pipeline_options.ocr_options.lang = kwargs.get("languages", ["en"]) | |
| pipeline_options.accelerator_options = AcceleratorOptions( | |
| num_threads=4, device=AcceleratorDevice.AUTO | |
| ) | |
| elif ocr_method == "easyocr_cpu": | |
| pipeline_options.do_ocr = True | |
| pipeline_options.ocr_options.lang = kwargs.get("languages", ["en"]) | |
| pipeline_options.ocr_options.use_gpu = False | |
| elif ocr_method == "tesseract": | |
| pipeline_options.do_ocr = True | |
| pipeline_options.ocr_options = TesseractOcrOptions() | |
| elif ocr_method == "tesseract_cli": | |
| pipeline_options.do_ocr = True | |
| pipeline_options.ocr_options = TesseractCliOcrOptions() | |
| # Create the converter | |
| converter = DocumentConverter( | |
| format_options={ | |
| InputFormat.PDF: PdfFormatOption( | |
| pipeline_options=pipeline_options | |
| ) | |
| } | |
| ) | |
| # Convert the document | |
| result = converter.convert(Path(file_path)) | |
| doc = result.document | |
| # Return the content in the requested format | |
| output_format = kwargs.get("output_format", "markdown") | |
| if output_format.lower() == "json": | |
| return json.dumps(doc.export_to_dict(), ensure_ascii=False, indent=2) | |
| elif output_format.lower() == "text": | |
| return doc.export_to_text() | |
| elif output_format.lower() == "document_tags": | |
| return doc.export_to_document_tokens() | |
| else: | |
| return doc.export_to_markdown() | |
| def _apply_full_force_ocr(self, file_path: Union[str, Path]) -> str: | |
| """Apply full force OCR to a document.""" | |
| input_doc = Path(file_path) | |
| file_extension = input_doc.suffix.lower() | |
| # Debug information | |
| print(f"Applying full force OCR to file: {input_doc} (type: {file_extension})") | |
| # Basic pipeline setup | |
| pipeline_options = PdfPipelineOptions() | |
| pipeline_options.do_ocr = True | |
| pipeline_options.do_table_structure = True | |
| pipeline_options.table_structure_options.do_cell_matching = True | |
| # Find tesseract executable | |
| tesseract_path = shutil.which("tesseract") or "/usr/bin/tesseract" | |
| print(f"Using tesseract at: {tesseract_path}") | |
| # Configure OCR options | |
| ocr_options = TesseractCliOcrOptions(force_full_page_ocr=True) # Using standard options instead of CLI | |
| pipeline_options.ocr_options = ocr_options | |
| # Set up format options based on file type | |
| format_options = { | |
| InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options) | |
| } | |
| # Handle image files | |
| if file_extension in ['.jpg', '.jpeg', '.png', '.tiff', '.tif', '.bmp']: | |
| print(f"Processing as image file: {file_extension}") | |
| format_options[InputFormat.IMAGE] = PdfFormatOption(pipeline_options=pipeline_options) | |
| # Try full force OCR with standard options | |
| try: | |
| converter = DocumentConverter(format_options=format_options) | |
| result = converter.convert(input_doc) | |
| return result.document.export_to_markdown() | |
| except Exception as e: | |
| print(f"Error with standard OCR: {e}") | |
| print(f"Attempting fallback to tesseract_cli OCR...") | |
| return self.parse(file_path, ocr_method="tesseract_cli") | |
| # Register the parser with the registry | |
| ParserRegistry.register(DoclingParser) |