pszemraj committed on
Commit
6585180
·
1 Parent(s): 23f1fc4

workaround for spellcheck fail

Browse files
Files changed (1) hide show
  1. pdf2text.py +23 -20
pdf2text.py CHANGED
@@ -213,26 +213,29 @@ def eval_and_replace(text: str, match_token: str = "- ") -> str:
213
  str: text with replaced tokens
214
  """
215
 
216
- if match_token not in text:
217
- return text
218
- else:
219
- while True:
220
- full_before_text = text.split(match_token, maxsplit=1)[0]
221
- before_text = [
222
- char for char in full_before_text.split()[-1] if char.isalpha()
223
- ]
224
- before_text = "".join(before_text)
225
- full_after_text = text.split(match_token, maxsplit=1)[-1]
226
- after_text = [char for char in full_after_text.split()[0] if char.isalpha()]
227
- after_text = "".join(after_text)
228
- full_text = before_text + after_text
229
- if check_word_spelling(full_text):
230
- text = full_before_text + full_after_text
231
- else:
232
- text = full_before_text + " " + full_after_text
233
- if match_token not in text:
234
- break
235
- return text
 
 
 
236
 
237
 
238
  def cleantxt_ocr(ugly_text, lower=False, lang: str = "en") -> str:
 
213
  str: text with replaced tokens
214
  """
215
 
216
+ try:
217
+ if match_token not in text:
218
+ return text
219
+ else:
220
+ while True:
221
+ full_before_text = text.split(match_token, maxsplit=1)[0]
222
+ before_text = [
223
+ char for char in full_before_text.split()[-1] if char.isalpha()
224
+ ]
225
+ before_text = "".join(before_text)
226
+ full_after_text = text.split(match_token, maxsplit=1)[-1]
227
+ after_text = [char for char in full_after_text.split()[0] if char.isalpha()]
228
+ after_text = "".join(after_text)
229
+ full_text = before_text + after_text
230
+ if check_word_spelling(full_text):
231
+ text = full_before_text + full_after_text
232
+ else:
233
+ text = full_before_text + " " + full_after_text
234
+ if match_token not in text:
235
+ break
236
+ except Exception as e:
237
+ logging.error(f"Error spell-checking OCR output, returning default text:\t{e}")
238
+ return text
239
 
240
 
241
  def cleantxt_ocr(ugly_text, lower=False, lang: str = "en") -> str: