pdf-ocr

Sleeping

App Files Community

pszemraj committed on Oct 4, 2022

Commit

2205c39

•

1 Parent(s): 5040391

💄 general ease of use

Browse files

Signed-off-by: peter szemraj <peterszemraj@gmail.com>

Files changed (1) hide show

app.py +11 -9

app.py CHANGED Viewed

@@ -72,7 +72,9 @@ def convert_PDF(pdf_obj, language: str = "en", max_pages=20,):
     if isinstance(pdf_obj, list):
         pdf_obj = pdf_obj[0]
     file_path = Path(pdf_obj.name)
     conversion_stats = convert_PDF_to_Text(
         file_path,
         ocr_model=ocr_model,
@@ -90,7 +92,11 @@ def convert_PDF(pdf_obj, language: str = "en", max_pages=20,):
         html += f"<p>WARNING - PDF was truncated to {max_pages} pages</p>"
     html += f"<p>Runtime: {rt} minutes on CPU for {num_pages} pages</p>"
-    return converted_txt, html
 if __name__ == "__main__":
@@ -125,7 +131,7 @@ if __name__ == "__main__":
         with gr.Column():
             gr.Markdown("## Load Inputs")
-            gr.Markdown("Upload your own file:")
             gr.Markdown("_If no file is uploaded, a sample PDF will be used_")
@@ -135,13 +141,12 @@ if __name__ == "__main__":
                     type="file",
                     value= _here / "example_file.pdf",
                 )
-                # load_file_button = gr.Button("Load Uploaded File")
             gr.Markdown("---")
         with gr.Column():
             gr.Markdown("## Convert PDF to Text")
-            convert_button = gr.Button("Convert PDF!")
             out_placeholder = gr.HTML("<p><em>Output will appear below:</em></p>")
             gr.Markdown("### Output")
             OCR_text = gr.Textbox(
@@ -153,11 +158,8 @@ if __name__ == "__main__":
                 type="file",
                 interactive=False,
             )
-        # load_file_button.click(
-        #     fn=load_uploaded_file, inputs=uploaded_file, outputs=[pdf_obj]
-        # )
         convert_button.click(
-            fn=convert_PDF, inputs=[uploaded_file], outputs=[OCR_text, out_placeholder]
         )
     demo.launch(enable_queue=True)

     if isinstance(pdf_obj, list):
         pdf_obj = pdf_obj[0]
     file_path = Path(pdf_obj.name)
+    if not file_path.suffix == ".pdf":
+        logging.error(f"File {file_path} is not a PDF file")
+        return "File is not a PDF file", None, None
     conversion_stats = convert_PDF_to_Text(
         file_path,
         ocr_model=ocr_model,
         html += f"<p>WARNING - PDF was truncated to {max_pages} pages</p>"
     html += f"<p>Runtime: {rt} minutes on CPU for {num_pages} pages</p>"
+    _output_name = f"RESULT_{file_path.stem}_OCR.txt"
+    with open(_output_name, "w", encoding="utf-8", errors="ignore") as f:
+        f.write(converted_txt)
+    return converted_txt, html, _output_name
 if __name__ == "__main__":
         with gr.Column():
             gr.Markdown("## Load Inputs")
+            gr.Markdown("Upload your own file & replace the default")
             gr.Markdown("_If no file is uploaded, a sample PDF will be used_")
                     type="file",
                     value= _here / "example_file.pdf",
                 )
             gr.Markdown("---")
         with gr.Column():
             gr.Markdown("## Convert PDF to Text")
+            convert_button = gr.Button("Convert PDF!", variant="primary")
             out_placeholder = gr.HTML("<p><em>Output will appear below:</em></p>")
             gr.Markdown("### Output")
             OCR_text = gr.Textbox(
                 type="file",
                 interactive=False,
             )
         convert_button.click(
+            fn=convert_PDF, inputs=[uploaded_file], outputs=[OCR_text, out_placeholder, text_file]
         )
     demo.launch(enable_queue=True)