pdf-ocr

Sleeping

pszemraj committed on Oct 4, 2022

Commit

2bd35a0

•

1 Parent(s): ccaf8ca

format

Files changed (2) hide show

app.py CHANGED Viewed

@@ -95,7 +95,12 @@ if __name__ == "__main__":
     logging.info(f"Using GPU status: {use_GPU}")
     logging.info("Loading OCR model")
     with contextlib.redirect_stdout(None):
-        ocr_model = ocr_predictor('db_resnet50', 'crnn_mobilenet_v3_large', pretrained=True, assume_straight_pages=True)
     # define pdf bytes as None
     pdf_obj = _here / "example_file.pdf"

     logging.info(f"Using GPU status: {use_GPU}")
     logging.info("Loading OCR model")
     with contextlib.redirect_stdout(None):
+        ocr_model = ocr_predictor(
+            "db_resnet50",
+            "crnn_mobilenet_v3_large",
+            pretrained=True,
+            assume_straight_pages=True,
+        )
     # define pdf bytes as None
     pdf_obj = _here / "example_file.pdf"

pdf2text.py CHANGED Viewed

@@ -32,6 +32,8 @@ from tqdm.auto import tqdm
 from doctr.io import DocumentFile
 from doctr.models import ocr_predictor
 def fast_scandir(dirname):
     # return all subfolders in a given filepath
@@ -421,7 +423,6 @@ def download_URL(url: str, file=None, dlpath=None, verbose=False):
 """
 # need to run only once to load model into memory
 custom_replace_list = {
@@ -554,6 +555,7 @@ def postprocess(text: str) -> str:
     return eval_and_replace(proc)
 def result2text(result) -> str:
     """Convert OCR result to text"""
@@ -568,11 +570,10 @@ def result2text(result) -> str:
                     text += word.value + " "
         full_doc.append(text)
     full_text = "\n".join(full_doc)
     return full_text
 import warnings
 from datetime import date
 from os.path import join
@@ -593,7 +594,9 @@ def convert_PDF_to_Text(
     doc = DocumentFile.from_pdf(PDF_file)
     if len(doc) > max_pages:
-        logging.warning(f"PDF has {len(doc)} pages, which is more than {max_pages}.. truncating")
         doc = doc[:max_pages]
     # Analyze
@@ -603,14 +606,10 @@ def convert_PDF_to_Text(
     proc_text = format_ocr_out(raw_text)
     output_text = postprocess(proc_text)
     fn_rt = time.perf_counter() - st
     logging.info("OCR complete")
     results_dict = {
         "num_pages": len(doc),
         "runtime": round(fn_rt, 2),

 from doctr.io import DocumentFile
 from doctr.models import ocr_predictor
 def fast_scandir(dirname):
     # return all subfolders in a given filepath
 """
 # need to run only once to load model into memory
 custom_replace_list = {
     return eval_and_replace(proc)
 def result2text(result) -> str:
     """Convert OCR result to text"""
                     text += word.value + " "
         full_doc.append(text)
     full_text = "\n".join(full_doc)
     return full_text
 import warnings
 from datetime import date
 from os.path import join
     doc = DocumentFile.from_pdf(PDF_file)
     if len(doc) > max_pages:
+        logging.warning(
+            f"PDF has {len(doc)} pages, which is more than {max_pages}.. truncating"
+        )
         doc = doc[:max_pages]
     # Analyze
     proc_text = format_ocr_out(raw_text)
     output_text = postprocess(proc_text)
     fn_rt = time.perf_counter() - st
     logging.info("OCR complete")
     results_dict = {
         "num_pages": len(doc),
         "runtime": round(fn_rt, 2),