pdf-ocr

Sleeping

App Files Community

pszemraj committed on Oct 4, 2022

Commit

57c06b4

1 Parent(s): 4f80413

⚡️ add warning for truncation

Browse files

Signed-off-by: peter szemraj <peterszemraj@gmail.com>

Files changed (2) hide show

app.py +17 -15
pdf2text.py +3 -1

app.py CHANGED Viewed

@@ -55,7 +55,7 @@ def load_uploaded_file(file_obj, temp_dir: Path = None):
         return None
-def convert_PDF(pdf_obj, language: str = "en"):
     """
     convert_PDF - convert a PDF file to text
@@ -76,15 +76,18 @@ def convert_PDF(pdf_obj, language: str = "en"):
     conversion_stats = convert_PDF_to_Text(
         file_path,
         ocr_model=ocr_model,
-        max_pages=20,
     )
     converted_txt = conversion_stats["converted_text"]
     num_pages = conversion_stats["num_pages"]
     # if alt_lang: # TODO: fix this
     rt = round((time.perf_counter() - st) / 60, 2)
     print(f"Runtime: {rt} minutes")
     html = ""
     html += f"<p>Runtime: {rt} minutes on CPU for {num_pages} pages</p>"
     return converted_txt, html
@@ -125,20 +128,14 @@ if __name__ == "__main__":
             gr.Markdown("Upload your own file:")
             gr.Markdown("_If no file is uploaded, a sample PDF will be used_")
-            pdf_obj = gr.Textbox(
-                lines=1,
-                label="VM file path",
-                placeholder="When the file is uploaded, the path will appear here",
-                value=pdf_obj,
-            )
-            with gr.Row():
-                uploaded_file = gr.File(
                     label="Upload a PDF file",
                     file_count="single",
                     type="file",
                     value= _here / "example_file.pdf",
                 )
-                load_file_button = gr.Button("Load Uploaded File")
             gr.Markdown("---")
@@ -150,10 +147,15 @@ if __name__ == "__main__":
             OCR_text = gr.Textbox(
                 label="OCR Result", placeholder="The OCR text will appear here"
             )
-        load_file_button.click(
-            fn=load_uploaded_file, inputs=uploaded_file, outputs=[pdf_obj]
-        )
         convert_button.click(
             fn=convert_PDF, inputs=[uploaded_file], outputs=[OCR_text, out_placeholder]

         return None
+def convert_PDF(pdf_obj, language: str = "en", max_pages=20,):
     """
     convert_PDF - convert a PDF file to text
     conversion_stats = convert_PDF_to_Text(
         file_path,
         ocr_model=ocr_model,
+        max_pages=max_pages,
     )
     converted_txt = conversion_stats["converted_text"]
     num_pages = conversion_stats["num_pages"]
+    was_truncated = conversion_stats["truncated"]
     # if alt_lang: # TODO: fix this
     rt = round((time.perf_counter() - st) / 60, 2)
     print(f"Runtime: {rt} minutes")
     html = ""
+    if was_truncated:
+        html += f"<p>WARNING - PDF was truncated to {max_pages} pages</p>"
     html += f"<p>Runtime: {rt} minutes on CPU for {num_pages} pages</p>"
     return converted_txt, html
             gr.Markdown("Upload your own file:")
             gr.Markdown("_If no file is uploaded, a sample PDF will be used_")
+            uploaded_file = gr.File(
                     label="Upload a PDF file",
                     file_count="single",
                     type="file",
                     value= _here / "example_file.pdf",
                 )
+                # load_file_button = gr.Button("Load Uploaded File")
             gr.Markdown("---")
             OCR_text = gr.Textbox(
                 label="OCR Result", placeholder="The OCR text will appear here"
             )
+            text_file = gr.File(
+                label="Download Text File",
+                file_count="single",
+                type="file",
+                interactive=False,
+            )
+        # load_file_button.click(
+        #     fn=load_uploaded_file, inputs=uploaded_file, outputs=[pdf_obj]
+        # )
         convert_button.click(
             fn=convert_PDF, inputs=[uploaded_file], outputs=[OCR_text, out_placeholder]

pdf2text.py CHANGED Viewed

@@ -591,12 +591,13 @@ def convert_PDF_to_Text(
     ocr_model = ocr_predictor(pretrained=True) if ocr_model is None else ocr_model
     logging.info(f"starting OCR on {PDF_file.name}")
     doc = DocumentFile.from_pdf(PDF_file)
     if len(doc) > max_pages:
         logging.warning(
             f"PDF has {len(doc)} pages, which is more than {max_pages}.. truncating"
         )
         doc = doc[:max_pages]
     # Analyze
     logging.info(f"running OCR on {len(doc)} pages")
@@ -616,6 +617,7 @@ def convert_PDF_to_Text(
         "runtime": round(fn_rt, 2),
         "date": str(date.today()),
         "converted_text": ocr_results,
         "length": len(ocr_results),
     }

     ocr_model = ocr_predictor(pretrained=True) if ocr_model is None else ocr_model
     logging.info(f"starting OCR on {PDF_file.name}")
     doc = DocumentFile.from_pdf(PDF_file)
+    truncated = False
     if len(doc) > max_pages:
         logging.warning(
             f"PDF has {len(doc)} pages, which is more than {max_pages}.. truncating"
         )
         doc = doc[:max_pages]
+        truncated = True
     # Analyze
     logging.info(f"running OCR on {len(doc)} pages")
         "runtime": round(fn_rt, 2),
         "date": str(date.today()),
         "converted_text": ocr_results,
+        "truncated": truncated,
         "length": len(ocr_results),
     }