Commit
·
3518b67
1
Parent(s):
3187788
Corrected large image reduction code
Browse files
- app.py +2 -5
- tools/file_conversion.py +5 -5
- tools/redaction_review.py +2 -0
app.py
CHANGED
@@ -371,11 +371,8 @@ with app:
|
|
371 |
clear_all_redactions_on_page_btn.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base, clear_all_page_redactions], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
|
372 |
then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
|
373 |
|
374 |
-
#annotation_button_get.click(get_boxes_json, annotator, json_boxes)
|
375 |
annotation_button_apply.click(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output], scroll_to_output=True)
|
376 |
|
377 |
-
do_not_save_pdf_state
|
378 |
-
|
379 |
# Page controls at bottom
|
380 |
annotate_current_page_bottom.submit(
|
381 |
modify_existing_page_redactions, inputs = [annotator, annotate_current_page_bottom, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
|
@@ -392,10 +389,10 @@ with app:
|
|
392 |
then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
|
393 |
then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
|
394 |
|
395 |
-
# Review
|
396 |
recogniser_entity_dropdown.select(update_entities_df, inputs=[recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs=[recogniser_entity_dataframe])
|
397 |
|
398 |
-
recogniser_entity_dataframe.select(df_select_callback, inputs=[recogniser_entity_dataframe], outputs=annotate_current_page).\
|
399 |
then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
|
400 |
then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
|
401 |
|
|
|
371 |
clear_all_redactions_on_page_btn.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base, clear_all_page_redactions], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
|
372 |
then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
|
373 |
|
|
|
374 |
annotation_button_apply.click(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output], scroll_to_output=True)
|
375 |
|
|
|
|
|
376 |
# Page controls at bottom
|
377 |
annotate_current_page_bottom.submit(
|
378 |
modify_existing_page_redactions, inputs = [annotator, annotate_current_page_bottom, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
|
|
|
389 |
then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
|
390 |
then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
|
391 |
|
392 |
+
# Review table controls
|
393 |
recogniser_entity_dropdown.select(update_entities_df, inputs=[recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs=[recogniser_entity_dataframe])
|
394 |
|
395 |
+
recogniser_entity_dataframe.select(df_select_callback, inputs=[recogniser_entity_dataframe], outputs=[annotate_current_page]).\
|
396 |
then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
|
397 |
then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
|
398 |
|
tools/file_conversion.py
CHANGED
@@ -1,7 +1,6 @@
|
|
1 |
from pdf2image import convert_from_path, pdfinfo_from_path
|
2 |
from tools.helper_functions import get_file_path_end, output_folder, tesseract_ocr_option, text_ocr_option, textract_option, read_file, get_or_create_env_var
|
3 |
from PIL import Image, ImageFile
|
4 |
-
ImageFile.LOAD_TRUNCATED_IMAGES = True
|
5 |
import os
|
6 |
import re
|
7 |
import time
|
@@ -16,6 +15,7 @@ from typing import List, Optional
|
|
16 |
from concurrent.futures import ThreadPoolExecutor, as_completed
|
17 |
|
18 |
image_dpi = 300.0
|
|
|
19 |
Image.MAX_IMAGE_PIXELS = None
|
20 |
|
21 |
def is_pdf_or_image(filename):
|
@@ -75,7 +75,7 @@ def process_single_page(pdf_path: str, page_num: int, image_dpi: float, output_d
|
|
75 |
image.save(out_path, format="PNG")
|
76 |
|
77 |
# Check file size and resize if necessary
|
78 |
-
max_size = 5 * 1024 * 1024 # 5 MB in bytes # 5
|
79 |
file_size = os.path.getsize(out_path)
|
80 |
|
81 |
# Resize images if they are too big
|
@@ -83,7 +83,7 @@ def process_single_page(pdf_path: str, page_num: int, image_dpi: float, output_d
|
|
83 |
# Start with the original image size
|
84 |
width, height = image.size
|
85 |
|
86 |
-
print(f"Image size before {
|
87 |
|
88 |
while file_size > max_size:
|
89 |
# Reduce the size by a factor (e.g., 50% of the current size)
|
@@ -107,9 +107,9 @@ def process_single_page(pdf_path: str, page_num: int, image_dpi: float, output_d
|
|
107 |
print(f"Error processing page {page_num + 1}: {e}")
|
108 |
return page_num, None
|
109 |
|
110 |
-
def convert_pdf_to_images(pdf_path: str, prepare_for_review:bool=False, page_min: int = 0, image_dpi: float =
|
111 |
|
112 |
-
# If preparing for review, just load the first page
|
113 |
if prepare_for_review == True:
|
114 |
page_count = pdfinfo_from_path(pdf_path)['Pages'] #1
|
115 |
else:
|
|
|
1 |
from pdf2image import convert_from_path, pdfinfo_from_path
|
2 |
from tools.helper_functions import get_file_path_end, output_folder, tesseract_ocr_option, text_ocr_option, textract_option, read_file, get_or_create_env_var
|
3 |
from PIL import Image, ImageFile
|
|
|
4 |
import os
|
5 |
import re
|
6 |
import time
|
|
|
15 |
from concurrent.futures import ThreadPoolExecutor, as_completed
|
16 |
|
17 |
image_dpi = 300.0
|
18 |
+
ImageFile.LOAD_TRUNCATED_IMAGES = True
|
19 |
Image.MAX_IMAGE_PIXELS = None
|
20 |
|
21 |
def is_pdf_or_image(filename):
|
|
|
75 |
image.save(out_path, format="PNG")
|
76 |
|
77 |
# Check file size and resize if necessary
|
78 |
+
max_size = 4.5 * 1024 * 1024 # 4.5 MB in bytes (reduced from 5 MB)
|
79 |
file_size = os.path.getsize(out_path)
|
80 |
|
81 |
# Resize images if they are too big
|
|
|
83 |
# Start with the original image size
|
84 |
width, height = image.size
|
85 |
|
86 |
+
print(f"Image size before {width}x{height}, original file_size: {file_size}")
|
87 |
|
88 |
while file_size > max_size:
|
89 |
# Reduce the size by a factor (e.g., 50% of the current size)
|
|
|
107 |
print(f"Error processing page {page_num + 1}: {e}")
|
108 |
return page_num, None
|
109 |
|
110 |
+
def convert_pdf_to_images(pdf_path: str, prepare_for_review:bool=False, page_min: int = 0, image_dpi: float = image_dpi, num_threads: int = 8, output_dir: str = '/input'):
|
111 |
|
112 |
+
# If preparing for review, just load the first page (not used)
|
113 |
if prepare_for_review == True:
|
114 |
page_count = pdfinfo_from_path(pdf_path)['Pages'] #1
|
115 |
else:
|
tools/redaction_review.py
CHANGED
@@ -15,6 +15,8 @@ from fitz import Document
|
|
15 |
from PIL import ImageDraw, Image
|
16 |
from collections import defaultdict
|
17 |
|
|
|
|
|
18 |
def decrease_page(number:int):
|
19 |
'''
|
20 |
Decrease page number for review redactions page.
|
|
|
15 |
from PIL import ImageDraw, Image
|
16 |
from collections import defaultdict
|
17 |
|
18 |
+
Image.MAX_IMAGE_PIXELS = None
|
19 |
+
|
20 |
def decrease_page(number:int):
|
21 |
'''
|
22 |
Decrease page number for review redactions page.
|