seanpedrickcase committed on
Commit
c1dca16
·
1 Parent(s): 4852fb5

Fixed Textract coordinate transformation issue. Made feedback buttons display optional in config.py

Browse files
tools/config.py CHANGED
@@ -776,6 +776,10 @@ except Exception as e:
776
  # Get some environment variables and Launch the Gradio app
777
  COGNITO_AUTH = convert_string_to_boolean(get_or_create_env_var("COGNITO_AUTH", "False"))
778
 
 
 
 
 
779
 
780
  # Link to user guide - ensure it is a valid URL
781
  def validate_safe_url(url_candidate: str, allowed_domains: list = None) -> str:
 
776
  # Get some environment variables and Launch the Gradio app
777
  COGNITO_AUTH = convert_string_to_boolean(get_or_create_env_var("COGNITO_AUTH", "False"))
778
 
779
+ SHOW_FEEDBACK_BUTTONS = convert_string_to_boolean(
780
+ get_or_create_env_var("SHOW_FEEDBACK_BUTTONS", "False")
781
+ )
782
+
783
 
784
  # Link to user guide - ensure it is a valid URL
785
  def validate_safe_url(url_candidate: str, allowed_domains: list = None) -> str:
tools/file_redaction.py CHANGED
@@ -1131,7 +1131,7 @@ def choose_and_run_redactor(
1131
 
1132
  print(
1133
  "Current page number",
1134
- (page_min + current_loop_page) - 1,
1135
  "is the last page processed.",
1136
  )
1137
  latest_file_completed += 1
@@ -2896,11 +2896,11 @@ def merge_img_bboxes(
2896
  if page_signature_recogniser_results or page_handwriting_recogniser_results:
2897
 
2898
  if "Extract handwriting" in handwrite_signature_checkbox:
2899
- print("Extracting handwriting in merge_img_bboxes function")
2900
  merged_bboxes.extend(copy.deepcopy(page_handwriting_recogniser_results))
2901
 
2902
  if "Extract signatures" in handwrite_signature_checkbox:
2903
- print("Extracting signatures in merge_img_bboxes function")
2904
  merged_bboxes.extend(copy.deepcopy(page_signature_recogniser_results))
2905
 
2906
  # Reconstruct bounding boxes for substrings of interest
@@ -3230,6 +3230,8 @@ def redact_image_pdf(
3230
  textract_json_file_path, log_files_output_paths, page_sizes_df
3231
  )
3232
  )
 
 
3233
  original_textract_data = textract_data.copy()
3234
 
3235
  if textract_client_not_found and is_missing:
@@ -3504,6 +3506,7 @@ def redact_image_pdf(
3504
  # Check if page exists in existing textract data. If not, send to service to analyse
3505
  if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
3506
  text_blocks = list()
 
3507
 
3508
  if not textract_data:
3509
  try:
@@ -3624,23 +3627,30 @@ def redact_image_pdf(
3624
  if page["page_no"] == reported_page_number
3625
  )
3626
 
3627
- # Check if this is whole-document Textract output (already converted to mediabox space)
3628
- # by checking if the JSON structure indicates it came from restructure_textract_output
3629
- # or if textract_output_found is True (indicating pre-existing whole-document output)
3630
- use_mediabox_for_textract = textract_output_found or (
3631
- "pages" in textract_data and len(textract_data.get("pages", [])) > 0
3632
- )
3633
 
3634
  if use_mediabox_for_textract:
3635
  # Whole-document Textract: use mediabox dimensions
3636
  textract_page_width = pymupdf_page.mediabox.width
3637
  textract_page_height = pymupdf_page.mediabox.height
3638
- # print(f"Using mediabox dimensions for whole-document Textract: {textract_page_width}x{textract_page_height}")
 
 
3639
  else:
3640
  # Individual image Textract: use image dimensions (current behavior)
3641
  textract_page_width = page_width
3642
  textract_page_height = page_height
3643
- # print(f"Using image dimensions for individual image Textract: {textract_page_width}x{textract_page_height}")
 
 
 
 
 
3644
 
3645
  (
3646
  page_line_level_ocr_results,
 
1131
 
1132
  print(
1133
  "Current page number",
1134
+ (page_min + current_loop_page),
1135
  "is the last page processed.",
1136
  )
1137
  latest_file_completed += 1
 
2896
  if page_signature_recogniser_results or page_handwriting_recogniser_results:
2897
 
2898
  if "Extract handwriting" in handwrite_signature_checkbox:
2899
+ # print("Extracting handwriting in merge_img_bboxes function")
2900
  merged_bboxes.extend(copy.deepcopy(page_handwriting_recogniser_results))
2901
 
2902
  if "Extract signatures" in handwrite_signature_checkbox:
2903
+ # print("Extracting signatures in merge_img_bboxes function")
2904
  merged_bboxes.extend(copy.deepcopy(page_signature_recogniser_results))
2905
 
2906
  # Reconstruct bounding boxes for substrings of interest
 
3230
  textract_json_file_path, log_files_output_paths, page_sizes_df
3231
  )
3232
  )
3233
+ if textract_data:
3234
+ textract_output_found = True
3235
  original_textract_data = textract_data.copy()
3236
 
3237
  if textract_client_not_found and is_missing:
 
3506
  # Check if page exists in existing textract data. If not, send to service to analyse
3507
  if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
3508
  text_blocks = list()
3509
+ page_exists = False
3510
 
3511
  if not textract_data:
3512
  try:
 
3627
  if page["page_no"] == reported_page_number
3628
  )
3629
 
3630
+ # Check if existing Textract output for this page
3631
+
3632
+ if textract_output_found and page_exists:
3633
+ use_mediabox_for_textract = True
3634
+ else:
3635
+ use_mediabox_for_textract = False
3636
 
3637
  if use_mediabox_for_textract:
3638
  # Whole-document Textract: use mediabox dimensions
3639
  textract_page_width = pymupdf_page.mediabox.width
3640
  textract_page_height = pymupdf_page.mediabox.height
3641
+ print(
3642
+ f"Using mediabox dimensions for Textract: {textract_page_width}x{textract_page_height}"
3643
+ )
3644
  else:
3645
  # Individual image Textract: use image dimensions (current behavior)
3646
  textract_page_width = page_width
3647
  textract_page_height = page_height
3648
+ print(
3649
+ f"Using image dimensions for Textract: {textract_page_width}x{textract_page_height}"
3650
+ )
3651
+
3652
+ # textract_page_width = page_width
3653
+ # textract_page_height = page_height
3654
 
3655
  (
3656
  page_line_level_ocr_results,
tools/helper_functions.py CHANGED
@@ -26,6 +26,7 @@ from tools.config import (
26
  OUTPUT_FOLDER,
27
  SELECTABLE_TEXT_EXTRACT_OPTION,
28
  SESSION_OUTPUT_FOLDER,
 
29
  TESSERACT_TEXT_EXTRACT_OPTION,
30
  TEXTRACT_JOBS_LOCAL_LOC,
31
  TEXTRACT_JOBS_S3_LOC,
@@ -456,14 +457,18 @@ def add_folder_to_path(folder_path: str):
456
 
457
  # Upon running a process, the feedback buttons are revealed
458
  def reveal_feedback_buttons():
 
 
 
 
459
  return (
460
  gr.Radio(
461
- visible=True,
462
  label="Please give some feedback about the results of the redaction. A reminder that the app is only expected to identify about 80% of personally identifiable information in a given (typed) document.",
463
  ),
464
- gr.Textbox(visible=True),
465
- gr.Button(visible=True),
466
- gr.Markdown(visible=True),
467
  )
468
 
469
 
 
26
  OUTPUT_FOLDER,
27
  SELECTABLE_TEXT_EXTRACT_OPTION,
28
  SESSION_OUTPUT_FOLDER,
29
+ SHOW_FEEDBACK_BUTTONS,
30
  TESSERACT_TEXT_EXTRACT_OPTION,
31
  TEXTRACT_JOBS_LOCAL_LOC,
32
  TEXTRACT_JOBS_S3_LOC,
 
457
 
458
  # Upon running a process, the feedback buttons are revealed
459
  def reveal_feedback_buttons():
460
+ if SHOW_FEEDBACK_BUTTONS:
461
+ is_visible = True
462
+ else:
463
+ is_visible = False
464
  return (
465
  gr.Radio(
466
+ visible=is_visible,
467
  label="Please give some feedback about the results of the redaction. A reminder that the app is only expected to identify about 80% of personally identifiable information in a given (typed) document.",
468
  ),
469
+ gr.Textbox(visible=is_visible),
470
+ gr.Button(visible=is_visible),
471
+ gr.Markdown(visible=is_visible),
472
  )
473
 
474