Spaces:

seanpedrickcase
/

document_redaction

Running

App Files Files Community

seanpedrickcase committed 21 days ago

Commit

c1dca16

1 Parent(s): 4852fb5

Fixed Textract coordinate transformation issue. Made feedback buttons display optional in config.py

Browse files

Files changed (3) hide show

tools/config.py +4 -0
tools/file_redaction.py +21 -11
tools/helper_functions.py +9 -4

tools/config.py CHANGED Viewed

@@ -776,6 +776,10 @@ except Exception as e:
 # Get some environment variables and Launch the Gradio app
 COGNITO_AUTH = convert_string_to_boolean(get_or_create_env_var("COGNITO_AUTH", "False"))
 # Link to user guide - ensure it is a valid URL
 def validate_safe_url(url_candidate: str, allowed_domains: list = None) -> str:

 # Get some environment variables and Launch the Gradio app
 COGNITO_AUTH = convert_string_to_boolean(get_or_create_env_var("COGNITO_AUTH", "False"))
+SHOW_FEEDBACK_BUTTONS = convert_string_to_boolean(
+    get_or_create_env_var("SHOW_FEEDBACK_BUTTONS", "False")
+)
 # Link to user guide - ensure it is a valid URL
 def validate_safe_url(url_candidate: str, allowed_domains: list = None) -> str:

tools/file_redaction.py CHANGED Viewed

@@ -1131,7 +1131,7 @@ def choose_and_run_redactor(
         print(
             "Current page number",
-            (page_min + current_loop_page) - 1,
             "is the last page processed.",
         )
         latest_file_completed += 1
@@ -2896,11 +2896,11 @@ def merge_img_bboxes(
     if page_signature_recogniser_results or page_handwriting_recogniser_results:
         if "Extract handwriting" in handwrite_signature_checkbox:
-            print("Extracting handwriting in merge_img_bboxes function")
             merged_bboxes.extend(copy.deepcopy(page_handwriting_recogniser_results))
         if "Extract signatures" in handwrite_signature_checkbox:
-            print("Extracting signatures in merge_img_bboxes function")
             merged_bboxes.extend(copy.deepcopy(page_signature_recogniser_results))
     # Reconstruct bounding boxes for substrings of interest
@@ -3230,6 +3230,8 @@ def redact_image_pdf(
                     textract_json_file_path, log_files_output_paths, page_sizes_df
                 )
             )
         original_textract_data = textract_data.copy()
         if textract_client_not_found and is_missing:
@@ -3504,6 +3506,7 @@ def redact_image_pdf(
             # Check if page exists in existing textract data. If not, send to service to analyse
             if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
                 text_blocks = list()
                 if not textract_data:
                     try:
@@ -3624,23 +3627,30 @@ def redact_image_pdf(
                             if page["page_no"] == reported_page_number
                         )
-                # Check if this is whole-document Textract output (already converted to mediabox space)
-                # by checking if the JSON structure indicates it came from restructure_textract_output
-                # or if textract_output_found is True (indicating pre-existing whole-document output)
-                use_mediabox_for_textract = textract_output_found or (
-                    "pages" in textract_data and len(textract_data.get("pages", [])) > 0
-                )
                 if use_mediabox_for_textract:
                     # Whole-document Textract: use mediabox dimensions
                     textract_page_width = pymupdf_page.mediabox.width
                     textract_page_height = pymupdf_page.mediabox.height
-                    # print(f"Using mediabox dimensions for whole-document Textract: {textract_page_width}x{textract_page_height}")
                 else:
                     # Individual image Textract: use image dimensions (current behavior)
                     textract_page_width = page_width
                     textract_page_height = page_height
-                    # print(f"Using image dimensions for individual image Textract: {textract_page_width}x{textract_page_height}")
                 (
                     page_line_level_ocr_results,

         print(
             "Current page number",
+            (page_min + current_loop_page),
             "is the last page processed.",
         )
         latest_file_completed += 1
     if page_signature_recogniser_results or page_handwriting_recogniser_results:
         if "Extract handwriting" in handwrite_signature_checkbox:
+            # print("Extracting handwriting in merge_img_bboxes function")
             merged_bboxes.extend(copy.deepcopy(page_handwriting_recogniser_results))
         if "Extract signatures" in handwrite_signature_checkbox:
+            # print("Extracting signatures in merge_img_bboxes function")
             merged_bboxes.extend(copy.deepcopy(page_signature_recogniser_results))
     # Reconstruct bounding boxes for substrings of interest
                     textract_json_file_path, log_files_output_paths, page_sizes_df
                 )
             )
+            if textract_data:
+                textract_output_found = True
         original_textract_data = textract_data.copy()
         if textract_client_not_found and is_missing:
             # Check if page exists in existing textract data. If not, send to service to analyse
             if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
                 text_blocks = list()
+                page_exists = False
                 if not textract_data:
                     try:
                             if page["page_no"] == reported_page_number
                         )
+                # Check if existing Textract output for this page
+                if textract_output_found and page_exists:
+                    use_mediabox_for_textract = True
+                else:
+                    use_mediabox_for_textract = False
                 if use_mediabox_for_textract:
                     # Whole-document Textract: use mediabox dimensions
                     textract_page_width = pymupdf_page.mediabox.width
                     textract_page_height = pymupdf_page.mediabox.height
+                    print(
+                        f"Using mediabox dimensions for Textract: {textract_page_width}x{textract_page_height}"
+                    )
                 else:
                     # Individual image Textract: use image dimensions (current behavior)
                     textract_page_width = page_width
                     textract_page_height = page_height
+                    print(
+                        f"Using image dimensions for Textract: {textract_page_width}x{textract_page_height}"
+                    )
+                # textract_page_width = page_width
+                # textract_page_height = page_height
                 (
                     page_line_level_ocr_results,

tools/helper_functions.py CHANGED Viewed

@@ -26,6 +26,7 @@ from tools.config import (
     OUTPUT_FOLDER,
     SELECTABLE_TEXT_EXTRACT_OPTION,
     SESSION_OUTPUT_FOLDER,
     TESSERACT_TEXT_EXTRACT_OPTION,
     TEXTRACT_JOBS_LOCAL_LOC,
     TEXTRACT_JOBS_S3_LOC,
@@ -456,14 +457,18 @@ def add_folder_to_path(folder_path: str):
 # Upon running a process, the feedback buttons are revealed
 def reveal_feedback_buttons():
     return (
         gr.Radio(
-            visible=True,
             label="Please give some feedback about the results of the redaction. A reminder that the app is only expected to identify about 80% of personally identifiable information in a given (typed) document.",
         ),
-        gr.Textbox(visible=True),
-        gr.Button(visible=True),
-        gr.Markdown(visible=True),
     )

     OUTPUT_FOLDER,
     SELECTABLE_TEXT_EXTRACT_OPTION,
     SESSION_OUTPUT_FOLDER,
+    SHOW_FEEDBACK_BUTTONS,
     TESSERACT_TEXT_EXTRACT_OPTION,
     TEXTRACT_JOBS_LOCAL_LOC,
     TEXTRACT_JOBS_S3_LOC,
 # Upon running a process, the feedback buttons are revealed
 def reveal_feedback_buttons():
+    if SHOW_FEEDBACK_BUTTONS:
+        is_visible = True
+    else:
+        is_visible = False
     return (
         gr.Radio(
+            visible=is_visible,
             label="Please give some feedback about the results of the redaction. A reminder that the app is only expected to identify about 80% of personally identifiable information in a given (typed) document.",
         ),
+        gr.Textbox(visible=is_visible),
+        gr.Button(visible=is_visible),
+        gr.Markdown(visible=is_visible),
     )