AnseMin committed on
Commit
f1d63ad
·
1 Parent(s): 99c8f7d

full force ocr

Browse files
Files changed (1) hide show
  1. src/parsers/docling_parser.py +18 -22
src/parsers/docling_parser.py CHANGED
@@ -131,43 +131,39 @@ class DoclingParser(DocumentParser):
131
  # Debug information
132
  print(f"Applying full force OCR to file: {input_doc} (type: {file_extension})")
133
 
134
- # Set up pipeline options
135
  pipeline_options = PdfPipelineOptions()
136
  pipeline_options.do_ocr = True
137
  pipeline_options.do_table_structure = True
138
  pipeline_options.table_structure_options.do_cell_matching = True
139
 
140
- # Configure OCR options - using TesseractCliOcrOptions directly without the text column issue
141
- ocr_options = TesseractCliOcrOptions(force_full_page_ocr=True)
142
- pipeline_options.ocr_options = ocr_options
143
 
144
- # Set up format options for both PDF and image formats
145
- format_options = {}
 
146
 
147
- # PDF format option
148
- format_options[InputFormat.PDF] = PdfFormatOption(
149
- pipeline_options=pipeline_options,
150
- )
151
 
152
  # Handle image files
153
  if file_extension in ['.jpg', '.jpeg', '.png', '.tiff', '.tif', '.bmp']:
154
  print(f"Processing as image file: {file_extension}")
155
- format_options[InputFormat.IMAGE] = PdfFormatOption(
156
- pipeline_options=pipeline_options,
157
- )
158
-
159
- # Create converter with appropriate format options
160
- converter = DocumentConverter(format_options=format_options)
161
 
 
162
  try:
163
- # Convert the document
164
  result = converter.convert(input_doc)
165
- doc = result.document
166
- return doc.export_to_markdown()
167
  except Exception as e:
168
- print(f"Error during OCR processing: {e}")
169
- print(f"File type: {file_extension}, File exists: {input_doc.exists()}")
170
- return f"OCR failed for {input_doc}. Error: {str(e)}"
171
 
172
 
173
  # Register the parser with the registry
 
131
  # Debug information
132
  print(f"Applying full force OCR to file: {input_doc} (type: {file_extension})")
133
 
134
+ # Basic pipeline setup
135
  pipeline_options = PdfPipelineOptions()
136
  pipeline_options.do_ocr = True
137
  pipeline_options.do_table_structure = True
138
  pipeline_options.table_structure_options.do_cell_matching = True
139
 
140
+ # Find tesseract executable
141
+ tesseract_path = shutil.which("tesseract") or "/usr/bin/tesseract"
142
+ print(f"Using tesseract at: {tesseract_path}")
143
 
144
+ # Configure OCR options
145
+ ocr_options = TesseractOcrOptions(force_full_page_ocr=True) # Using standard options instead of CLI
146
+ pipeline_options.ocr_options = ocr_options
147
 
148
+ # Set up format options based on file type
149
+ format_options = {
150
+ InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
151
+ }
152
 
153
  # Handle image files
154
  if file_extension in ['.jpg', '.jpeg', '.png', '.tiff', '.tif', '.bmp']:
155
  print(f"Processing as image file: {file_extension}")
156
+ format_options[InputFormat.IMAGE] = PdfFormatOption(pipeline_options=pipeline_options)
 
 
 
 
 
157
 
158
+ # Try full force OCR with standard options
159
  try:
160
+ converter = DocumentConverter(format_options=format_options)
161
  result = converter.convert(input_doc)
162
+ return result.document.export_to_markdown()
 
163
  except Exception as e:
164
+ print(f"Error with standard OCR: {e}")
165
+ print(f"Attempting fallback to tesseract_cli OCR...")
166
+ return self.parse(file_path, ocr_method="tesseract_cli")
167
 
168
 
169
  # Register the parser with the registry