Spaces:
Paused
Paused
full force ocr
Browse files
- src/parsers/docling_parser.py +18 -22
src/parsers/docling_parser.py
CHANGED
|
@@ -131,43 +131,39 @@ class DoclingParser(DocumentParser):
|
|
| 131 |
# Debug information
|
| 132 |
print(f"Applying full force OCR to file: {input_doc} (type: {file_extension})")
|
| 133 |
|
| 134 |
-
#
|
| 135 |
pipeline_options = PdfPipelineOptions()
|
| 136 |
pipeline_options.do_ocr = True
|
| 137 |
pipeline_options.do_table_structure = True
|
| 138 |
pipeline_options.table_structure_options.do_cell_matching = True
|
| 139 |
|
| 140 |
-
#
|
| 141 |
-
|
| 142 |
-
|
| 143 |
|
| 144 |
-
#
|
| 145 |
-
|
|
|
|
| 146 |
|
| 147 |
-
#
|
| 148 |
-
format_options
|
| 149 |
-
pipeline_options=pipeline_options
|
| 150 |
-
|
| 151 |
|
| 152 |
# Handle image files
|
| 153 |
if file_extension in ['.jpg', '.jpeg', '.png', '.tiff', '.tif', '.bmp']:
|
| 154 |
print(f"Processing as image file: {file_extension}")
|
| 155 |
-
format_options[InputFormat.IMAGE] = PdfFormatOption(
|
| 156 |
-
pipeline_options=pipeline_options,
|
| 157 |
-
)
|
| 158 |
-
|
| 159 |
-
# Create converter with appropriate format options
|
| 160 |
-
converter = DocumentConverter(format_options=format_options)
|
| 161 |
|
|
|
|
| 162 |
try:
|
| 163 |
-
|
| 164 |
result = converter.convert(input_doc)
|
| 165 |
-
|
| 166 |
-
return doc.export_to_markdown()
|
| 167 |
except Exception as e:
|
| 168 |
-
print(f"Error
|
| 169 |
-
print(f"
|
| 170 |
-
return
|
| 171 |
|
| 172 |
|
| 173 |
# Register the parser with the registry
|
|
|
|
| 131 |
# Debug information
|
| 132 |
print(f"Applying full force OCR to file: {input_doc} (type: {file_extension})")
|
| 133 |
|
| 134 |
+
# Basic pipeline setup
|
| 135 |
pipeline_options = PdfPipelineOptions()
|
| 136 |
pipeline_options.do_ocr = True
|
| 137 |
pipeline_options.do_table_structure = True
|
| 138 |
pipeline_options.table_structure_options.do_cell_matching = True
|
| 139 |
|
| 140 |
+
# Find tesseract executable
|
| 141 |
+
tesseract_path = shutil.which("tesseract") or "/usr/bin/tesseract"
|
| 142 |
+
print(f"Using tesseract at: {tesseract_path}")
|
| 143 |
|
| 144 |
+
# Configure OCR options
|
| 145 |
+
ocr_options = TesseractOcrOptions(force_full_page_ocr=True) # Using standard options instead of CLI
|
| 146 |
+
pipeline_options.ocr_options = ocr_options
|
| 147 |
|
| 148 |
+
# Set up format options based on file type
|
| 149 |
+
format_options = {
|
| 150 |
+
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
|
| 151 |
+
}
|
| 152 |
|
| 153 |
# Handle image files
|
| 154 |
if file_extension in ['.jpg', '.jpeg', '.png', '.tiff', '.tif', '.bmp']:
|
| 155 |
print(f"Processing as image file: {file_extension}")
|
| 156 |
+
format_options[InputFormat.IMAGE] = PdfFormatOption(pipeline_options=pipeline_options)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 157 |
|
| 158 |
+
# Try full force OCR with standard options
|
| 159 |
try:
|
| 160 |
+
converter = DocumentConverter(format_options=format_options)
|
| 161 |
result = converter.convert(input_doc)
|
| 162 |
+
return result.document.export_to_markdown()
|
|
|
|
| 163 |
except Exception as e:
|
| 164 |
+
print(f"Error with standard OCR: {e}")
|
| 165 |
+
print(f"Attempting fallback to tesseract_cli OCR...")
|
| 166 |
+
return self.parse(file_path, ocr_method="tesseract_cli")
|
| 167 |
|
| 168 |
|
| 169 |
# Register the parser with the registry
|