pszemraj committed on
Commit
2205c39
•
1 Parent(s): 5040391

💄 general ease of use

Browse files

Signed-off-by: peter szemraj <peterszemraj@gmail.com>

Files changed (1) hide show
  1. app.py +11 -9
app.py CHANGED
@@ -72,7 +72,9 @@ def convert_PDF(pdf_obj, language: str = "en", max_pages=20,):
72
  if isinstance(pdf_obj, list):
73
  pdf_obj = pdf_obj[0]
74
  file_path = Path(pdf_obj.name)
75
-
 
 
76
  conversion_stats = convert_PDF_to_Text(
77
  file_path,
78
  ocr_model=ocr_model,
@@ -90,7 +92,11 @@ def convert_PDF(pdf_obj, language: str = "en", max_pages=20,):
90
  html += f"<p>WARNING - PDF was truncated to {max_pages} pages</p>"
91
  html += f"<p>Runtime: {rt} minutes on CPU for {num_pages} pages</p>"
92
 
93
- return converted_txt, html
 
 
 
 
94
 
95
 
96
  if __name__ == "__main__":
@@ -125,7 +131,7 @@ if __name__ == "__main__":
125
  with gr.Column():
126
 
127
  gr.Markdown("## Load Inputs")
128
- gr.Markdown("Upload your own file:")
129
  gr.Markdown("_If no file is uploaded, a sample PDF will be used_")
130
 
131
 
@@ -135,13 +141,12 @@ if __name__ == "__main__":
135
  type="file",
136
  value= _here / "example_file.pdf",
137
  )
138
- # load_file_button = gr.Button("Load Uploaded File")
139
 
140
  gr.Markdown("---")
141
 
142
  with gr.Column():
143
  gr.Markdown("## Convert PDF to Text")
144
- convert_button = gr.Button("Convert PDF!")
145
  out_placeholder = gr.HTML("<p><em>Output will appear below:</em></p>")
146
  gr.Markdown("### Output")
147
  OCR_text = gr.Textbox(
@@ -153,11 +158,8 @@ if __name__ == "__main__":
153
  type="file",
154
  interactive=False,
155
  )
156
- # load_file_button.click(
157
- # fn=load_uploaded_file, inputs=uploaded_file, outputs=[pdf_obj]
158
- # )
159
 
160
  convert_button.click(
161
- fn=convert_PDF, inputs=[uploaded_file], outputs=[OCR_text, out_placeholder]
162
  )
163
  demo.launch(enable_queue=True)
 
72
  if isinstance(pdf_obj, list):
73
  pdf_obj = pdf_obj[0]
74
  file_path = Path(pdf_obj.name)
75
+ if not file_path.suffix == ".pdf":
76
+ logging.error(f"File {file_path} is not a PDF file")
77
+ return "File is not a PDF file", None, None
78
  conversion_stats = convert_PDF_to_Text(
79
  file_path,
80
  ocr_model=ocr_model,
 
92
  html += f"<p>WARNING - PDF was truncated to {max_pages} pages</p>"
93
  html += f"<p>Runtime: {rt} minutes on CPU for {num_pages} pages</p>"
94
 
95
+ _output_name = f"RESULT_{file_path.stem}_OCR.txt"
96
+ with open(_output_name, "w", encoding="utf-8", errors="ignore") as f:
97
+ f.write(converted_txt)
98
+
99
+ return converted_txt, html, _output_name
100
 
101
 
102
  if __name__ == "__main__":
 
131
  with gr.Column():
132
 
133
  gr.Markdown("## Load Inputs")
134
+ gr.Markdown("Upload your own file & replace the default")
135
  gr.Markdown("_If no file is uploaded, a sample PDF will be used_")
136
 
137
 
 
141
  type="file",
142
  value= _here / "example_file.pdf",
143
  )
 
144
 
145
  gr.Markdown("---")
146
 
147
  with gr.Column():
148
  gr.Markdown("## Convert PDF to Text")
149
+ convert_button = gr.Button("Convert PDF!", variant="primary")
150
  out_placeholder = gr.HTML("<p><em>Output will appear below:</em></p>")
151
  gr.Markdown("### Output")
152
  OCR_text = gr.Textbox(
 
158
  type="file",
159
  interactive=False,
160
  )
 
 
 
161
 
162
  convert_button.click(
163
+ fn=convert_PDF, inputs=[uploaded_file], outputs=[OCR_text, out_placeholder, text_file]
164
  )
165
  demo.launch(enable_queue=True)