pszemraj committed on
Commit
2bd35a0
1 Parent(s): ccaf8ca
Files changed (2) hide show
  1. app.py +6 -1
  2. pdf2text.py +7 -8
app.py CHANGED
@@ -95,7 +95,12 @@ if __name__ == "__main__":
95
  logging.info(f"Using GPU status: {use_GPU}")
96
  logging.info("Loading OCR model")
97
  with contextlib.redirect_stdout(None):
98
- ocr_model = ocr_predictor('db_resnet50', 'crnn_mobilenet_v3_large', pretrained=True, assume_straight_pages=True)
 
 
 
 
 
99
 
100
  # define pdf bytes as None
101
  pdf_obj = _here / "example_file.pdf"
 
95
  logging.info(f"Using GPU status: {use_GPU}")
96
  logging.info("Loading OCR model")
97
  with contextlib.redirect_stdout(None):
98
+ ocr_model = ocr_predictor(
99
+ "db_resnet50",
100
+ "crnn_mobilenet_v3_large",
101
+ pretrained=True,
102
+ assume_straight_pages=True,
103
+ )
104
 
105
  # define pdf bytes as None
106
  pdf_obj = _here / "example_file.pdf"
pdf2text.py CHANGED
@@ -32,6 +32,8 @@ from tqdm.auto import tqdm
32
 
33
  from doctr.io import DocumentFile
34
  from doctr.models import ocr_predictor
 
 
35
  def fast_scandir(dirname):
36
  # return all subfolders in a given filepath
37
 
@@ -421,7 +423,6 @@ def download_URL(url: str, file=None, dlpath=None, verbose=False):
421
  """
422
 
423
 
424
-
425
  # need to run only once to load model into memory
426
 
427
  custom_replace_list = {
@@ -554,6 +555,7 @@ def postprocess(text: str) -> str:
554
 
555
  return eval_and_replace(proc)
556
 
 
557
  def result2text(result) -> str:
558
  """Convert OCR result to text"""
559
 
@@ -568,11 +570,10 @@ def result2text(result) -> str:
568
  text += word.value + " "
569
  full_doc.append(text)
570
 
571
-
572
-
573
  full_text = "\n".join(full_doc)
574
  return full_text
575
 
 
576
  import warnings
577
  from datetime import date
578
  from os.path import join
@@ -593,7 +594,9 @@ def convert_PDF_to_Text(
593
  doc = DocumentFile.from_pdf(PDF_file)
594
 
595
  if len(doc) > max_pages:
596
- logging.warning(f"PDF has {len(doc)} pages, which is more than {max_pages}.. truncating")
 
 
597
  doc = doc[:max_pages]
598
 
599
  # Analyze
@@ -603,14 +606,10 @@ def convert_PDF_to_Text(
603
  proc_text = format_ocr_out(raw_text)
604
  output_text = postprocess(proc_text)
605
 
606
-
607
  fn_rt = time.perf_counter() - st
608
 
609
-
610
-
611
  logging.info("OCR complete")
612
 
613
-
614
  results_dict = {
615
  "num_pages": len(doc),
616
  "runtime": round(fn_rt, 2),
 
32
 
33
  from doctr.io import DocumentFile
34
  from doctr.models import ocr_predictor
35
+
36
+
37
  def fast_scandir(dirname):
38
  # return all subfolders in a given filepath
39
 
 
423
  """
424
 
425
 
 
426
  # need to run only once to load model into memory
427
 
428
  custom_replace_list = {
 
555
 
556
  return eval_and_replace(proc)
557
 
558
+
559
  def result2text(result) -> str:
560
  """Convert OCR result to text"""
561
 
 
570
  text += word.value + " "
571
  full_doc.append(text)
572
 
 
 
573
  full_text = "\n".join(full_doc)
574
  return full_text
575
 
576
+
577
  import warnings
578
  from datetime import date
579
  from os.path import join
 
594
  doc = DocumentFile.from_pdf(PDF_file)
595
 
596
  if len(doc) > max_pages:
597
+ logging.warning(
598
+ f"PDF has {len(doc)} pages, which is more than {max_pages}.. truncating"
599
+ )
600
  doc = doc[:max_pages]
601
 
602
  # Analyze
 
606
  proc_text = format_ocr_out(raw_text)
607
  output_text = postprocess(proc_text)
608
 
 
609
  fn_rt = time.perf_counter() - st
610
 
 
 
611
  logging.info("OCR complete")
612
 
 
613
  results_dict = {
614
  "num_pages": len(doc),
615
  "runtime": round(fn_rt, 2),