Commit · c1dca16
1 Parent(s): 4852fb5
Fixed Textract coordinate transformation issue. Made feedback buttons display optional in config.py
Browse files
- tools/config.py +4 -0
- tools/file_redaction.py +21 -11
- tools/helper_functions.py +9 -4
tools/config.py
CHANGED
|
@@ -776,6 +776,10 @@ except Exception as e:
|
|
| 776 |
# Get some environment variables and Launch the Gradio app
|
| 777 |
COGNITO_AUTH = convert_string_to_boolean(get_or_create_env_var("COGNITO_AUTH", "False"))
|
| 778 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 779 |
|
| 780 |
# Link to user guide - ensure it is a valid URL
|
| 781 |
def validate_safe_url(url_candidate: str, allowed_domains: list = None) -> str:
|
|
|
|
| 776 |
# Get some environment variables and Launch the Gradio app
|
| 777 |
COGNITO_AUTH = convert_string_to_boolean(get_or_create_env_var("COGNITO_AUTH", "False"))
|
| 778 |
|
| 779 |
+
SHOW_FEEDBACK_BUTTONS = convert_string_to_boolean(
|
| 780 |
+
get_or_create_env_var("SHOW_FEEDBACK_BUTTONS", "False")
|
| 781 |
+
)
|
| 782 |
+
|
| 783 |
|
| 784 |
# Link to user guide - ensure it is a valid URL
|
| 785 |
def validate_safe_url(url_candidate: str, allowed_domains: list = None) -> str:
|
tools/file_redaction.py
CHANGED
|
@@ -1131,7 +1131,7 @@ def choose_and_run_redactor(
|
|
| 1131 |
|
| 1132 |
print(
|
| 1133 |
"Current page number",
|
| 1134 |
-
(page_min + current_loop_page)
|
| 1135 |
"is the last page processed.",
|
| 1136 |
)
|
| 1137 |
latest_file_completed += 1
|
|
@@ -2896,11 +2896,11 @@ def merge_img_bboxes(
|
|
| 2896 |
if page_signature_recogniser_results or page_handwriting_recogniser_results:
|
| 2897 |
|
| 2898 |
if "Extract handwriting" in handwrite_signature_checkbox:
|
| 2899 |
-
print("Extracting handwriting in merge_img_bboxes function")
|
| 2900 |
merged_bboxes.extend(copy.deepcopy(page_handwriting_recogniser_results))
|
| 2901 |
|
| 2902 |
if "Extract signatures" in handwrite_signature_checkbox:
|
| 2903 |
-
print("Extracting signatures in merge_img_bboxes function")
|
| 2904 |
merged_bboxes.extend(copy.deepcopy(page_signature_recogniser_results))
|
| 2905 |
|
| 2906 |
# Reconstruct bounding boxes for substrings of interest
|
|
@@ -3230,6 +3230,8 @@ def redact_image_pdf(
|
|
| 3230 |
textract_json_file_path, log_files_output_paths, page_sizes_df
|
| 3231 |
)
|
| 3232 |
)
|
|
|
|
|
|
|
| 3233 |
original_textract_data = textract_data.copy()
|
| 3234 |
|
| 3235 |
if textract_client_not_found and is_missing:
|
|
@@ -3504,6 +3506,7 @@ def redact_image_pdf(
|
|
| 3504 |
# Check if page exists in existing textract data. If not, send to service to analyse
|
| 3505 |
if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
|
| 3506 |
text_blocks = list()
|
|
|
|
| 3507 |
|
| 3508 |
if not textract_data:
|
| 3509 |
try:
|
|
@@ -3624,23 +3627,30 @@ def redact_image_pdf(
|
|
| 3624 |
if page["page_no"] == reported_page_number
|
| 3625 |
)
|
| 3626 |
|
| 3627 |
-
# Check if
|
| 3628 |
-
|
| 3629 |
-
|
| 3630 |
-
|
| 3631 |
-
|
| 3632 |
-
|
| 3633 |
|
| 3634 |
if use_mediabox_for_textract:
|
| 3635 |
# Whole-document Textract: use mediabox dimensions
|
| 3636 |
textract_page_width = pymupdf_page.mediabox.width
|
| 3637 |
textract_page_height = pymupdf_page.mediabox.height
|
| 3638 |
-
|
|
|
|
|
|
|
| 3639 |
else:
|
| 3640 |
# Individual image Textract: use image dimensions (current behavior)
|
| 3641 |
textract_page_width = page_width
|
| 3642 |
textract_page_height = page_height
|
| 3643 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3644 |
|
| 3645 |
(
|
| 3646 |
page_line_level_ocr_results,
|
|
|
|
| 1131 |
|
| 1132 |
print(
|
| 1133 |
"Current page number",
|
| 1134 |
+
(page_min + current_loop_page),
|
| 1135 |
"is the last page processed.",
|
| 1136 |
)
|
| 1137 |
latest_file_completed += 1
|
|
|
|
| 2896 |
if page_signature_recogniser_results or page_handwriting_recogniser_results:
|
| 2897 |
|
| 2898 |
if "Extract handwriting" in handwrite_signature_checkbox:
|
| 2899 |
+
# print("Extracting handwriting in merge_img_bboxes function")
|
| 2900 |
merged_bboxes.extend(copy.deepcopy(page_handwriting_recogniser_results))
|
| 2901 |
|
| 2902 |
if "Extract signatures" in handwrite_signature_checkbox:
|
| 2903 |
+
# print("Extracting signatures in merge_img_bboxes function")
|
| 2904 |
merged_bboxes.extend(copy.deepcopy(page_signature_recogniser_results))
|
| 2905 |
|
| 2906 |
# Reconstruct bounding boxes for substrings of interest
|
|
|
|
| 3230 |
textract_json_file_path, log_files_output_paths, page_sizes_df
|
| 3231 |
)
|
| 3232 |
)
|
| 3233 |
+
if textract_data:
|
| 3234 |
+
textract_output_found = True
|
| 3235 |
original_textract_data = textract_data.copy()
|
| 3236 |
|
| 3237 |
if textract_client_not_found and is_missing:
|
|
|
|
| 3506 |
# Check if page exists in existing textract data. If not, send to service to analyse
|
| 3507 |
if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
|
| 3508 |
text_blocks = list()
|
| 3509 |
+
page_exists = False
|
| 3510 |
|
| 3511 |
if not textract_data:
|
| 3512 |
try:
|
|
|
|
| 3627 |
if page["page_no"] == reported_page_number
|
| 3628 |
)
|
| 3629 |
|
| 3630 |
+
# Check if existing Textract output for this page
|
| 3631 |
+
|
| 3632 |
+
if textract_output_found and page_exists:
|
| 3633 |
+
use_mediabox_for_textract = True
|
| 3634 |
+
else:
|
| 3635 |
+
use_mediabox_for_textract = False
|
| 3636 |
|
| 3637 |
if use_mediabox_for_textract:
|
| 3638 |
# Whole-document Textract: use mediabox dimensions
|
| 3639 |
textract_page_width = pymupdf_page.mediabox.width
|
| 3640 |
textract_page_height = pymupdf_page.mediabox.height
|
| 3641 |
+
print(
|
| 3642 |
+
f"Using mediabox dimensions for Textract: {textract_page_width}x{textract_page_height}"
|
| 3643 |
+
)
|
| 3644 |
else:
|
| 3645 |
# Individual image Textract: use image dimensions (current behavior)
|
| 3646 |
textract_page_width = page_width
|
| 3647 |
textract_page_height = page_height
|
| 3648 |
+
print(
|
| 3649 |
+
f"Using image dimensions for Textract: {textract_page_width}x{textract_page_height}"
|
| 3650 |
+
)
|
| 3651 |
+
|
| 3652 |
+
# textract_page_width = page_width
|
| 3653 |
+
# textract_page_height = page_height
|
| 3654 |
|
| 3655 |
(
|
| 3656 |
page_line_level_ocr_results,
|
tools/helper_functions.py
CHANGED
|
@@ -26,6 +26,7 @@ from tools.config import (
|
|
| 26 |
OUTPUT_FOLDER,
|
| 27 |
SELECTABLE_TEXT_EXTRACT_OPTION,
|
| 28 |
SESSION_OUTPUT_FOLDER,
|
|
|
|
| 29 |
TESSERACT_TEXT_EXTRACT_OPTION,
|
| 30 |
TEXTRACT_JOBS_LOCAL_LOC,
|
| 31 |
TEXTRACT_JOBS_S3_LOC,
|
|
@@ -456,14 +457,18 @@ def add_folder_to_path(folder_path: str):
|
|
| 456 |
|
| 457 |
# Upon running a process, the feedback buttons are revealed
|
| 458 |
def reveal_feedback_buttons():
|
|
|
|
|
|
|
|
|
|
|
|
|
| 459 |
return (
|
| 460 |
gr.Radio(
|
| 461 |
-
visible=
|
| 462 |
label="Please give some feedback about the results of the redaction. A reminder that the app is only expected to identify about 80% of personally identifiable information in a given (typed) document.",
|
| 463 |
),
|
| 464 |
-
gr.Textbox(visible=
|
| 465 |
-
gr.Button(visible=
|
| 466 |
-
gr.Markdown(visible=
|
| 467 |
)
|
| 468 |
|
| 469 |
|
|
|
|
| 26 |
OUTPUT_FOLDER,
|
| 27 |
SELECTABLE_TEXT_EXTRACT_OPTION,
|
| 28 |
SESSION_OUTPUT_FOLDER,
|
| 29 |
+
SHOW_FEEDBACK_BUTTONS,
|
| 30 |
TESSERACT_TEXT_EXTRACT_OPTION,
|
| 31 |
TEXTRACT_JOBS_LOCAL_LOC,
|
| 32 |
TEXTRACT_JOBS_S3_LOC,
|
|
|
|
| 457 |
|
| 458 |
# Upon running a process, the feedback buttons are revealed
|
| 459 |
def reveal_feedback_buttons():
|
| 460 |
+
if SHOW_FEEDBACK_BUTTONS:
|
| 461 |
+
is_visible = True
|
| 462 |
+
else:
|
| 463 |
+
is_visible = False
|
| 464 |
return (
|
| 465 |
gr.Radio(
|
| 466 |
+
visible=is_visible,
|
| 467 |
label="Please give some feedback about the results of the redaction. A reminder that the app is only expected to identify about 80% of personally identifiable information in a given (typed) document.",
|
| 468 |
),
|
| 469 |
+
gr.Textbox(visible=is_visible),
|
| 470 |
+
gr.Button(visible=is_visible),
|
| 471 |
+
gr.Markdown(visible=is_visible),
|
| 472 |
)
|
| 473 |
|
| 474 |
|