pszemraj committed on
Commit
89d61d5
•
1 Parent(s): e38879f

💄 improve formatting

Browse files

Signed-off-by: peter szemraj <peterszemraj@gmail.com>

Files changed (1) hide show
  1. pdf2text.py +8 -7
pdf2text.py CHANGED
@@ -556,7 +556,7 @@ def postprocess(text: str) -> str:
556
  return eval_and_replace(proc)
557
 
558
 
559
- def result2text(result) -> str:
560
  """Convert OCR result to text"""
561
 
562
  full_doc = []
@@ -570,8 +570,7 @@ def result2text(result) -> str:
570
  text += word.value + " "
571
  full_doc.append(text)
572
 
573
- full_text = "\n".join(full_doc)
574
- return full_text
575
 
576
 
577
  import warnings
@@ -603,8 +602,10 @@ def convert_PDF_to_Text(
603
  logging.info(f"running OCR on {len(doc)} pages")
604
  result = ocr_model(doc)
605
  raw_text = result2text(result)
606
- proc_text = format_ocr_out(raw_text)
607
- output_text = postprocess(proc_text)
 
 
608
 
609
  fn_rt = time.perf_counter() - st
610
 
@@ -614,8 +615,8 @@ def convert_PDF_to_Text(
614
  "num_pages": len(doc),
615
  "runtime": round(fn_rt, 2),
616
  "date": str(date.today()),
617
- "converted_text": output_text,
618
- "length": len(output_text),
619
  }
620
 
621
  return results_dict
 
556
  return eval_and_replace(proc)
557
 
558
 
559
+ def result2text(result, as_text=False) -> str or list:
560
  """Convert OCR result to text"""
561
 
562
  full_doc = []
 
570
  text += word.value + " "
571
  full_doc.append(text)
572
 
573
+ return "\n".join(full_doc) if as_text else full_doc
 
574
 
575
 
576
  import warnings
 
602
  logging.info(f"running OCR on {len(doc)} pages")
603
  result = ocr_model(doc)
604
  raw_text = result2text(result)
605
+ proc_text = [format_ocr_out(r) for r in raw_text]
606
+ fin_text = [postprocess(t) for t in proc_text]
607
+
608
+ ocr_results = "\n\n".join(fin_text)
609
 
610
  fn_rt = time.perf_counter() - st
611
 
 
615
  "num_pages": len(doc),
616
  "runtime": round(fn_rt, 2),
617
  "date": str(date.today()),
618
+ "converted_text": ocr_results,
619
+ "length": len(ocr_results),
620
  }
621
 
622
  return results_dict