Spaces:
Paused
Paused
Clean up the full-force OCR configuration
Browse files
src/parsers/docling_parser.py
CHANGED
|
@@ -137,47 +137,21 @@ class DoclingParser(DocumentParser):
|
|
| 137 |
pipeline_options.do_table_structure = True
|
| 138 |
pipeline_options.table_structure_options.do_cell_matching = True
|
| 139 |
|
| 140 |
-
#
|
| 141 |
-
|
| 142 |
-
tesseract_paths = [
|
| 143 |
-
"tesseract", # Default PATH
|
| 144 |
-
"/usr/bin/tesseract", # Common Linux location
|
| 145 |
-
"/app/tesseract/tesseract", # Possible custom location in Hugging Face
|
| 146 |
-
"/opt/conda/bin/tesseract", # Possible Conda env in Hugging Face
|
| 147 |
-
r"C:\Program Files\Tesseract-OCR\tesseract.exe" # Windows location
|
| 148 |
-
]
|
| 149 |
-
|
| 150 |
-
for path in tesseract_paths:
|
| 151 |
-
if shutil.which(path) or (os.path.isfile(path) and os.access(path, os.X_OK)):
|
| 152 |
-
tesseract_cmd = path
|
| 153 |
-
print(f"Found tesseract at: {tesseract_cmd}")
|
| 154 |
-
break
|
| 155 |
-
|
| 156 |
-
if not tesseract_cmd:
|
| 157 |
-
print("Warning: Tesseract executable not found. Using default configuration.")
|
| 158 |
-
tesseract_cmd = "tesseract" # Use default as fallback
|
| 159 |
-
|
| 160 |
-
# Configure OCR options with explicit tesseract path
|
| 161 |
-
ocr_options = TesseractCliOcrOptions(
|
| 162 |
-
force_full_page_ocr=True,
|
| 163 |
-
tesseract_cmd=tesseract_cmd
|
| 164 |
-
)
|
| 165 |
pipeline_options.ocr_options = ocr_options
|
| 166 |
|
| 167 |
# Set up format options for both PDF and image formats
|
| 168 |
format_options = {}
|
| 169 |
|
| 170 |
-
#
|
| 171 |
format_options[InputFormat.PDF] = PdfFormatOption(
|
| 172 |
pipeline_options=pipeline_options,
|
| 173 |
)
|
| 174 |
|
| 175 |
-
#
|
| 176 |
if file_extension in ['.jpg', '.jpeg', '.png', '.tiff', '.tif', '.bmp']:
|
| 177 |
-
# For image files, we'll use the same pipeline options
|
| 178 |
-
# but we need to specify the input format as IMAGE
|
| 179 |
print(f"Processing as image file: {file_extension}")
|
| 180 |
-
# Note: InputFormat.IMAGE is used for image files in Docling
|
| 181 |
format_options[InputFormat.IMAGE] = PdfFormatOption(
|
| 182 |
pipeline_options=pipeline_options,
|
| 183 |
)
|
|
@@ -191,17 +165,9 @@ class DoclingParser(DocumentParser):
|
|
| 191 |
doc = result.document
|
| 192 |
return doc.export_to_markdown()
|
| 193 |
except Exception as e:
|
| 194 |
-
|
| 195 |
-
print(f"Error during full force OCR: {e}")
|
| 196 |
print(f"File type: {file_extension}, File exists: {input_doc.exists()}")
|
| 197 |
-
|
| 198 |
-
# Try fallback to regular OCR if full force fails
|
| 199 |
-
try:
|
| 200 |
-
print("Attempting fallback to regular tesseract_cli OCR...")
|
| 201 |
-
return self.parse(file_path, ocr_method="tesseract_cli")
|
| 202 |
-
except Exception as fallback_error:
|
| 203 |
-
print(f"Fallback OCR also failed: {fallback_error}")
|
| 204 |
-
return f"OCR failed for {input_doc}. Error: {str(e)}"
|
| 205 |
|
| 206 |
|
| 207 |
# Register the parser with the registry
|
|
|
|
| 137 |
pipeline_options.do_table_structure = True
|
| 138 |
pipeline_options.table_structure_options.do_cell_matching = True
|
| 139 |
|
| 140 |
+
# Configure OCR options: build TesseractCliOcrOptions directly, without an explicit tesseract_cmd, to avoid the text column issue
|
| 141 |
+
ocr_options = TesseractCliOcrOptions(force_full_page_ocr=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 142 |
pipeline_options.ocr_options = ocr_options
|
| 143 |
|
| 144 |
# Set up format options for both PDF and image formats
|
| 145 |
format_options = {}
|
| 146 |
|
| 147 |
+
# PDF format option
|
| 148 |
format_options[InputFormat.PDF] = PdfFormatOption(
|
| 149 |
pipeline_options=pipeline_options,
|
| 150 |
)
|
| 151 |
|
| 152 |
+
# Handle image files
|
| 153 |
if file_extension in ['.jpg', '.jpeg', '.png', '.tiff', '.tif', '.bmp']:
|
|
|
|
|
|
|
| 154 |
print(f"Processing as image file: {file_extension}")
|
|
|
|
| 155 |
format_options[InputFormat.IMAGE] = PdfFormatOption(
|
| 156 |
pipeline_options=pipeline_options,
|
| 157 |
)
|
|
|
|
| 165 |
doc = result.document
|
| 166 |
return doc.export_to_markdown()
|
| 167 |
except Exception as e:
|
| 168 |
+
print(f"Error during OCR processing: {e}")
|
|
|
|
| 169 |
print(f"File type: {file_extension}, File exists: {input_doc.exists()}")
|
| 170 |
+
return f"OCR failed for {input_doc}. Error: {str(e)}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 171 |
|
| 172 |
|
| 173 |
# Register the parser with the registry
|