pszemraj committed on
Commit
57c06b4
1 Parent(s): 4f80413

⚡️ add warning for truncation

Browse files

Signed-off-by: peter szemraj <peterszemraj@gmail.com>

Files changed (2) hide show
  1. app.py +17 -15
  2. pdf2text.py +3 -1
app.py CHANGED
@@ -55,7 +55,7 @@ def load_uploaded_file(file_obj, temp_dir: Path = None):
55
  return None
56
 
57
 
58
- def convert_PDF(pdf_obj, language: str = "en"):
59
  """
60
  convert_PDF - convert a PDF file to text
61
 
@@ -76,15 +76,18 @@ def convert_PDF(pdf_obj, language: str = "en"):
76
  conversion_stats = convert_PDF_to_Text(
77
  file_path,
78
  ocr_model=ocr_model,
79
- max_pages=20,
80
  )
81
  converted_txt = conversion_stats["converted_text"]
82
  num_pages = conversion_stats["num_pages"]
 
83
  # if alt_lang: # TODO: fix this
84
 
85
  rt = round((time.perf_counter() - st) / 60, 2)
86
  print(f"Runtime: {rt} minutes")
87
  html = ""
 
 
88
  html += f"<p>Runtime: {rt} minutes on CPU for {num_pages} pages</p>"
89
 
90
  return converted_txt, html
@@ -125,20 +128,14 @@ if __name__ == "__main__":
125
  gr.Markdown("Upload your own file:")
126
  gr.Markdown("_If no file is uploaded, a sample PDF will be used_")
127
 
128
- pdf_obj = gr.Textbox(
129
- lines=1,
130
- label="VM file path",
131
- placeholder="When the file is uploaded, the path will appear here",
132
- value=pdf_obj,
133
- )
134
- with gr.Row():
135
- uploaded_file = gr.File(
136
  label="Upload a PDF file",
137
  file_count="single",
138
  type="file",
139
  value= _here / "example_file.pdf",
140
  )
141
- load_file_button = gr.Button("Load Uploaded File")
142
 
143
  gr.Markdown("---")
144
 
@@ -150,10 +147,15 @@ if __name__ == "__main__":
150
  OCR_text = gr.Textbox(
151
  label="OCR Result", placeholder="The OCR text will appear here"
152
  )
153
-
154
- load_file_button.click(
155
- fn=load_uploaded_file, inputs=uploaded_file, outputs=[pdf_obj]
156
- )
 
 
 
 
 
157
 
158
  convert_button.click(
159
  fn=convert_PDF, inputs=[uploaded_file], outputs=[OCR_text, out_placeholder]
 
55
  return None
56
 
57
 
58
+ def convert_PDF(pdf_obj, language: str = "en", max_pages=20,):
59
  """
60
  convert_PDF - convert a PDF file to text
61
 
 
76
  conversion_stats = convert_PDF_to_Text(
77
  file_path,
78
  ocr_model=ocr_model,
79
+ max_pages=max_pages,
80
  )
81
  converted_txt = conversion_stats["converted_text"]
82
  num_pages = conversion_stats["num_pages"]
83
+ was_truncated = conversion_stats["truncated"]
84
  # if alt_lang: # TODO: fix this
85
 
86
  rt = round((time.perf_counter() - st) / 60, 2)
87
  print(f"Runtime: {rt} minutes")
88
  html = ""
89
+ if was_truncated:
90
+ html += f"<p>WARNING - PDF was truncated to {max_pages} pages</p>"
91
  html += f"<p>Runtime: {rt} minutes on CPU for {num_pages} pages</p>"
92
 
93
  return converted_txt, html
 
128
  gr.Markdown("Upload your own file:")
129
  gr.Markdown("_If no file is uploaded, a sample PDF will be used_")
130
 
131
+
132
+ uploaded_file = gr.File(
 
 
 
 
 
 
133
  label="Upload a PDF file",
134
  file_count="single",
135
  type="file",
136
  value= _here / "example_file.pdf",
137
  )
138
+ # load_file_button = gr.Button("Load Uploaded File")
139
 
140
  gr.Markdown("---")
141
 
 
147
  OCR_text = gr.Textbox(
148
  label="OCR Result", placeholder="The OCR text will appear here"
149
  )
150
+ text_file = gr.File(
151
+ label="Download Text File",
152
+ file_count="single",
153
+ type="file",
154
+ interactive=False,
155
+ )
156
+ # load_file_button.click(
157
+ # fn=load_uploaded_file, inputs=uploaded_file, outputs=[pdf_obj]
158
+ # )
159
 
160
  convert_button.click(
161
  fn=convert_PDF, inputs=[uploaded_file], outputs=[OCR_text, out_placeholder]
pdf2text.py CHANGED
@@ -591,12 +591,13 @@ def convert_PDF_to_Text(
591
  ocr_model = ocr_predictor(pretrained=True) if ocr_model is None else ocr_model
592
  logging.info(f"starting OCR on {PDF_file.name}")
593
  doc = DocumentFile.from_pdf(PDF_file)
594
-
595
  if len(doc) > max_pages:
596
  logging.warning(
597
  f"PDF has {len(doc)} pages, which is more than {max_pages}.. truncating"
598
  )
599
  doc = doc[:max_pages]
 
600
 
601
  # Analyze
602
  logging.info(f"running OCR on {len(doc)} pages")
@@ -616,6 +617,7 @@ def convert_PDF_to_Text(
616
  "runtime": round(fn_rt, 2),
617
  "date": str(date.today()),
618
  "converted_text": ocr_results,
 
619
  "length": len(ocr_results),
620
  }
621
 
 
591
  ocr_model = ocr_predictor(pretrained=True) if ocr_model is None else ocr_model
592
  logging.info(f"starting OCR on {PDF_file.name}")
593
  doc = DocumentFile.from_pdf(PDF_file)
594
+ truncated = False
595
  if len(doc) > max_pages:
596
  logging.warning(
597
  f"PDF has {len(doc)} pages, which is more than {max_pages}.. truncating"
598
  )
599
  doc = doc[:max_pages]
600
+ truncated = True
601
 
602
  # Analyze
603
  logging.info(f"running OCR on {len(doc)} pages")
 
617
  "runtime": round(fn_rt, 2),
618
  "date": str(date.today()),
619
  "converted_text": ocr_results,
620
+ "truncated": truncated,
621
  "length": len(ocr_results),
622
  }
623