seanpedrickcase committed on
Commit
3518b67
·
1 Parent(s): 3187788

Corrected large image reduction code

Browse files
Files changed (3) hide show
  1. app.py +2 -5
  2. tools/file_conversion.py +5 -5
  3. tools/redaction_review.py +2 -0
app.py CHANGED
@@ -371,11 +371,8 @@ with app:
371
  clear_all_redactions_on_page_btn.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base, clear_all_page_redactions], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
372
  then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
373
 
374
- #annotation_button_get.click(get_boxes_json, annotator, json_boxes)
375
  annotation_button_apply.click(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output], scroll_to_output=True)
376
 
377
- do_not_save_pdf_state
378
-
379
  # Page controls at bottom
380
  annotate_current_page_bottom.submit(
381
  modify_existing_page_redactions, inputs = [annotator, annotate_current_page_bottom, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
@@ -392,10 +389,10 @@ with app:
392
  then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
393
  then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
394
 
395
- # Review side bar controls
396
  recogniser_entity_dropdown.select(update_entities_df, inputs=[recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs=[recogniser_entity_dataframe])
397
 
398
- recogniser_entity_dataframe.select(df_select_callback, inputs=[recogniser_entity_dataframe], outputs=annotate_current_page).\
399
  then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
400
  then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
401
 
 
371
  clear_all_redactions_on_page_btn.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base, clear_all_page_redactions], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
372
  then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
373
 
 
374
  annotation_button_apply.click(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output], scroll_to_output=True)
375
 
 
 
376
  # Page controls at bottom
377
  annotate_current_page_bottom.submit(
378
  modify_existing_page_redactions, inputs = [annotator, annotate_current_page_bottom, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
 
389
  then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
390
  then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
391
 
392
+ # Review table controls
393
  recogniser_entity_dropdown.select(update_entities_df, inputs=[recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs=[recogniser_entity_dataframe])
394
 
395
+ recogniser_entity_dataframe.select(df_select_callback, inputs=[recogniser_entity_dataframe], outputs=[annotate_current_page]).\
396
  then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
397
  then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
398
 
tools/file_conversion.py CHANGED
@@ -1,7 +1,6 @@
1
  from pdf2image import convert_from_path, pdfinfo_from_path
2
  from tools.helper_functions import get_file_path_end, output_folder, tesseract_ocr_option, text_ocr_option, textract_option, read_file, get_or_create_env_var
3
  from PIL import Image, ImageFile
4
- ImageFile.LOAD_TRUNCATED_IMAGES = True
5
  import os
6
  import re
7
  import time
@@ -16,6 +15,7 @@ from typing import List, Optional
16
  from concurrent.futures import ThreadPoolExecutor, as_completed
17
 
18
  image_dpi = 300.0
 
19
  Image.MAX_IMAGE_PIXELS = None
20
 
21
  def is_pdf_or_image(filename):
@@ -75,7 +75,7 @@ def process_single_page(pdf_path: str, page_num: int, image_dpi: float, output_d
75
  image.save(out_path, format="PNG")
76
 
77
  # Check file size and resize if necessary
78
- max_size = 5 * 1024 * 1024 # 5 MB in bytes # 5
79
  file_size = os.path.getsize(out_path)
80
 
81
  # Resize images if they are too big
@@ -83,7 +83,7 @@ def process_single_page(pdf_path: str, page_num: int, image_dpi: float, output_d
83
  # Start with the original image size
84
  width, height = image.size
85
 
86
- print(f"Image size before {new_width}x{new_height}, original file_size: {file_size}")
87
 
88
  while file_size > max_size:
89
  # Reduce the size by a factor (e.g., 50% of the current size)
@@ -107,9 +107,9 @@ def process_single_page(pdf_path: str, page_num: int, image_dpi: float, output_d
107
  print(f"Error processing page {page_num + 1}: {e}")
108
  return page_num, None
109
 
110
- def convert_pdf_to_images(pdf_path: str, prepare_for_review:bool=False, page_min: int = 0, image_dpi: float = 200, num_threads: int = 8, output_dir: str = '/input'):
111
 
112
- # If preparing for review, just load the first page
113
  if prepare_for_review == True:
114
  page_count = pdfinfo_from_path(pdf_path)['Pages'] #1
115
  else:
 
1
  from pdf2image import convert_from_path, pdfinfo_from_path
2
  from tools.helper_functions import get_file_path_end, output_folder, tesseract_ocr_option, text_ocr_option, textract_option, read_file, get_or_create_env_var
3
  from PIL import Image, ImageFile
 
4
  import os
5
  import re
6
  import time
 
15
  from concurrent.futures import ThreadPoolExecutor, as_completed
16
 
17
  image_dpi = 300.0
18
+ ImageFile.LOAD_TRUNCATED_IMAGES = True
19
  Image.MAX_IMAGE_PIXELS = None
20
 
21
  def is_pdf_or_image(filename):
 
75
  image.save(out_path, format="PNG")
76
 
77
  # Check file size and resize if necessary
78
+ max_size = 4.5 * 1024 * 1024 # 4.5 MB in bytes
79
  file_size = os.path.getsize(out_path)
80
 
81
  # Resize images if they are too big
 
83
  # Start with the original image size
84
  width, height = image.size
85
 
86
+ print(f"Image size before {width}x{height}, original file_size: {file_size}")
87
 
88
  while file_size > max_size:
89
  # Reduce the size by a factor (e.g., 50% of the current size)
 
107
  print(f"Error processing page {page_num + 1}: {e}")
108
  return page_num, None
109
 
110
+ def convert_pdf_to_images(pdf_path: str, prepare_for_review:bool=False, page_min: int = 0, image_dpi: float = image_dpi, num_threads: int = 8, output_dir: str = '/input'):
111
 
112
+ # If preparing for review, just load the first page (not used)
113
  if prepare_for_review == True:
114
  page_count = pdfinfo_from_path(pdf_path)['Pages'] #1
115
  else:
tools/redaction_review.py CHANGED
@@ -15,6 +15,8 @@ from fitz import Document
15
  from PIL import ImageDraw, Image
16
  from collections import defaultdict
17
 
 
 
18
  def decrease_page(number:int):
19
  '''
20
  Decrease page number for review redactions page.
 
15
  from PIL import ImageDraw, Image
16
  from collections import defaultdict
17
 
18
+ Image.MAX_IMAGE_PIXELS = None
19
+
20
  def decrease_page(number:int):
21
  '''
22
  Decrease page number for review redactions page.