seanpedrickcase committed
Commit a03496e
1 Parent(s): 59ff822

The side review bar is mostly there, and a couple of bugs are fixed. Identified text can now be returned in the initial review files. Still working on retaining found text throughout the review process.

app.py CHANGED
@@ -8,6 +8,7 @@ import gradio as gr
 import pandas as pd
 from datetime import datetime
 from gradio_image_annotation import image_annotator
+ from gradio_image_annotation.image_annotator import AnnotatedImageData
 
 from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
 from tools.aws_functions import upload_file_to_s3, download_file_from_s3, RUN_AWS_FUNCTIONS, bucket_name
@@ -80,6 +81,8 @@ with app:
 output_file_list_state = gr.State([])
 text_output_file_list_state = gr.State([])
 log_files_output_list_state = gr.State([])
+
+ review_file_state = gr.State(pd.DataFrame())
 
 # Logging state
 log_file_name = 'log.csv'
@@ -113,7 +116,7 @@ with app:
 
 
 ## Annotator zoom value
- annotator_zoom_number = gr.Number(label = "Current annotator zoom level", value=100, precision=0, visible=False)
+ annotator_zoom_number = gr.Number(label = "Current annotator zoom level", value=80, precision=0, visible=False)
 zoom_true_bool = gr.State(True)
 zoom_false_bool = gr.State(False)
 
@@ -204,9 +207,9 @@ with app:
 annotate_zoom_in = gr.Button("Zoom in")
 annotate_zoom_out = gr.Button("Zoom out")
 with gr.Row():
- clear_all_redactions_on_page_btn = gr.Button("Clear all redactions on page")
-
- annotation_button_apply = gr.Button("Apply revised redactions", variant="primary")
+ clear_all_redactions_on_page_btn = gr.Button("Clear all redactions on page", visible=False)
+ annotation_button_apply = gr.Button("Apply revised redactions", variant="primary")
 
 
 with gr.Row():
 
@@ -233,10 +236,8 @@ with app:
 )
 
 with gr.Column(scale=1):
- recogniser_entity_dropdown = gr.Dropdown(value="ALL", allow_custom_value=True)
- recogniser_entity_dataframe = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]}), col_count=2, type="pandas")
-
-
+ recogniser_entity_dropdown = gr.Dropdown(label="Redaction category", value="ALL", allow_custom_value=True)
+ recogniser_entity_dataframe = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]}), col_count=2, type="pandas", label="Search results. Click to go to page")
 
 with gr.Row():
 annotation_last_page_button_bottom = gr.Button("Previous page", scale = 3)
@@ -245,6 +246,9 @@ with app:
 annotation_next_page_button_bottom = gr.Button("Next page", scale = 3)
 
 
+
+
+
 # TEXT / TABULAR DATA TAB
 with gr.Tab(label="Open text or Excel/csv files"):
 gr.Markdown(
@@ -304,7 +308,7 @@ with app:
 
 in_redact_entities = gr.Dropdown(value=chosen_redact_entities, choices=full_entity_list, multiselect=True, label="Entities to redact - local PII identification model (click close to down arrow for full list)")
 
- handwrite_signature_checkbox = gr.CheckboxGroup(label="AWS Textract settings", choices=["Redact all identified handwriting", "Redact all identified signatures"], value=["Redact all identified handwriting"])
+ handwrite_signature_checkbox = gr.CheckboxGroup(label="AWS Textract settings", choices=["Redact all identified handwriting", "Redact all identified signatures"], value=["Redact all identified handwriting", "Redact all identified signatures"])
 #with gr.Row():
 in_redact_language = gr.Dropdown(value = "en", choices = ["en"], label="Redaction language (only English currently supported)", multiselect=False, visible=False)
 
@@ -319,7 +323,7 @@ with app:
 in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list])
 
 document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator]).\
- then(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, first_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state], api_name="prepare_doc").\
+ then(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, first_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state], api_name="prepare_doc").\
 then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
 outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number], api_name="redact_doc").\
 then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
@@ -339,49 +343,56 @@ with app:
 
 # Upload previous files for modifying redactions
 upload_previous_review_file_btn.click(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list]).\
- then(fn = prepare_image_or_pdf, inputs=[output_review_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state, prepare_for_review_bool], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state]).\
+ then(fn = prepare_image_or_pdf, inputs=[output_review_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state, prepare_for_review_bool], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state]).\
 then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
 
 # Page controls at top
 annotate_current_page.submit(
- modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
+ modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
 then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
 
+
+
 annotation_last_page_button.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
+ then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
 then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
 annotation_next_page_button.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
+ then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
 then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
 
 # Zoom in and out on annotator
- annotate_zoom_in.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
+ annotate_zoom_in.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
 then(update_zoom, inputs=[annotator_zoom_number, annotate_current_page, zoom_true_bool], outputs=[annotator_zoom_number, annotate_current_page])
 
- annotate_zoom_out.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
+ annotate_zoom_out.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
 then(update_zoom, inputs=[annotator_zoom_number, annotate_current_page, zoom_false_bool], outputs=[annotator_zoom_number, annotate_current_page])
 
 annotator_zoom_number.change(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
 
- clear_all_redactions_on_page_btn.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, clear_all_page_redactions], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
+ clear_all_redactions_on_page_btn.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base, clear_all_page_redactions], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
 then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
 
 #annotation_button_get.click(get_boxes_json, annotator, json_boxes)
- annotation_button_apply.click(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output], scroll_to_output=True)
+ annotation_button_apply.click(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output], scroll_to_output=True)
 
 # Page controls at bottom
 annotate_current_page_bottom.submit(
- modify_existing_page_redactions, inputs = [annotator, annotate_current_page_bottom, annotate_previous_page, all_image_annotations_state], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page]).\
+ modify_existing_page_redactions, inputs = [annotator, annotate_current_page_bottom, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
 then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
 
 annotation_last_page_button_bottom.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
+ then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
 then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
+
 annotation_next_page_button_bottom.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
+ then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
 then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
 
 # Review side bar controls
 recogniser_entity_dropdown.select(update_entities_df, inputs=[recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs=[recogniser_entity_dataframe])
 
 recogniser_entity_dataframe.select(df_select_callback, inputs=[recogniser_entity_dataframe], outputs=annotate_current_page).\
- then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
+ then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
 then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
 
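The wiring above follows Gradio's event-chaining pattern: each control's .click() or .submit() returns an event that can be extended with .then(), and gr.State values such as review_file_state are threaded through as outputs of one step and inputs of the next. A minimal sketch of that pattern, with hypothetical prepare/apply stand-ins rather than this app's real handlers:

import gradio as gr
import pandas as pd

# Hypothetical stand-ins for prepare_image_or_pdf / apply_redactions,
# just to show a DataFrame travelling through gr.State between steps.
def prepare(current_df):
    # First handler builds a review table and stores it in state
    return pd.DataFrame({"page": [1], "label": ["NAME"], "text": ["John"]})

def apply(review_df):
    # A later handler reads the same state back
    return f"Applying {len(review_df)} redaction rows"

with gr.Blocks() as demo:
    review_file_state = gr.State(pd.DataFrame())  # mirrors review_file_state above
    status = gr.Textbox()
    btn = gr.Button("Redact")

    # .click().then() runs the steps in order; state is output of one, input of the next
    btn.click(fn=prepare, inputs=[review_file_state], outputs=[review_file_state]).\
        then(fn=apply, inputs=[review_file_state], outputs=[status])

demo.launch()

Because the state object appears in both places, a review table produced while preparing the document is still available when the user later applies revised redactions.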
tools/file_conversion.py CHANGED
@@ -399,6 +399,7 @@ def prepare_image_or_pdf(
 converted_file_paths = []
 image_file_paths = []
 pymupdf_doc = []
+ review_file_csv = pd.DataFrame()
 
 if not file_paths:
 file_paths = []
@@ -424,7 +425,7 @@
 final_out_message = '\n'.join(out_message)
 else:
 final_out_message = out_message
- return final_out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object
+ return final_out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv
 
 #in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]
 
@@ -457,7 +458,7 @@
 if not file_path:
 out_message = "Please select a file."
 print(out_message)
- return out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object
+ return out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv
 
 file_extension = os.path.splitext(file_path)[1].lower()
 
@@ -478,7 +479,7 @@
 
 all_annotations_object.append(annotation)
 
- print("all_annotations_object:", all_annotations_object)
+ #print("all_annotations_object:", all_annotations_object)
 
 
 elif is_pdf_or_image(file_path): # Alternatively, if it's an image
@@ -597,13 +598,13 @@ def prepare_image_or_pdf(
 if is_pdf_or_image(file_path) == False:
 out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
 print(out_message)
- return out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object
+ return out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv
 
 elif in_redact_method == text_ocr_option:
 if is_pdf(file_path) == False:
 out_message = "Please upload a PDF file for text analysis."
 print(out_message)
- return out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object
+ return out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv
 
 
 converted_file_paths.append(converted_file_path)
@@ -624,7 +625,7 @@
 
 #print("all_annotations_object at end:", all_annotations_object)
 
- return out_message_out, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object
+ return out_message_out, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv
 
 def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str], image_dpi:float=image_dpi):
 file_path_without_ext = get_file_path_end(in_file_path)
@@ -650,7 +651,7 @@ def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str],
 return out_message, out_file_paths
 
 
- def convert_review_json_to_pandas_df(data:List[dict]) -> pd.DataFrame:
+ def convert_review_json_to_pandas_df(data:List[dict], text_join_data:pd.DataFrame=pd.DataFrame()) -> pd.DataFrame:
 # Flatten the data
 flattened_data = []
 
@@ -670,16 +671,40 @@ def convert_review_json_to_pandas_df(data:List[dict]) -> pd.DataFrame:
 
 # Check if 'boxes' is in the entry, if not, add an empty list
 if 'boxes' not in entry:
- entry['boxes'] = []
+ entry['boxes'] = []
 
 for box in entry["boxes"]:
- data_to_add = {"image": image_path, "page": reported_number, **box}
+ if 'text' not in box:
+ data_to_add = {"image": image_path, "page": reported_number, **box} # "text": entry['text'],
+ else:
+ data_to_add = {"image": image_path, "page": reported_number, "text": entry['text'], **box}
 #print("data_to_add:", data_to_add)
 flattened_data.append(data_to_add)
 
 # Convert to a DataFrame
 df = pd.DataFrame(flattened_data)
 
+ # Join on additional text data from decision output results if included
+ if not text_join_data.empty:
+ #print("text_join_data:", text_join_data)
+ #print("df:", df)
+ text_join_data['page'] = text_join_data['page'].astype(str)
+ df['page'] = df['page'].astype(str)
+ text_join_data = text_join_data[['xmin', 'ymin', 'xmax', 'ymax', 'label', 'page', 'text']]
+ text_join_data[['xmin', 'ymin', 'xmax', 'ymax']] = text_join_data[['xmin', 'ymin', 'xmax', 'ymax']].astype(float).round(0)
+ df[['xmin1', 'ymin1', 'xmax1', 'ymax1']] = df[['xmin', 'ymin', 'xmax', 'ymax']].astype(float).round(0)
+
+ df = df.merge(text_join_data, left_on = ['xmin1', 'ymin1', 'xmax1', 'ymax1', 'label', 'page'], right_on = ['xmin', 'ymin', 'xmax', 'ymax', 'label', 'page'], how = "left", suffixes=("", "_y"))
+
+ df = df.drop(['xmin1', 'ymin1', 'xmax1', 'ymax1', 'xmin_y', 'ymin_y', 'xmax_y', 'ymax_y'], axis=1, errors="ignore")
+
+ df = df[["image", "page", "label", "color", "xmin", "ymin", "xmax", "ymax", "text"]]
+
+ if 'text' not in df.columns:
+ df['text'] = ''
+
+ df = df.sort_values(['page', 'ymin', 'xmin', 'label'])
+
 return df
 
 def convert_pandas_df_to_review_json(df: pd.DataFrame) -> List[dict]:
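The new join in convert_review_json_to_pandas_df recovers the identified text for each review box by matching on label, page and coordinates rounded to the nearest whole number, since float coordinates produced at different pipeline stages rarely compare exactly equal. A stripped-down sketch of that technique on made-up data (column names follow the diff; the values are purely illustrative):

import pandas as pd

# Review boxes parsed from the annotation JSON (no text yet)
boxes = pd.DataFrame({
    "image": ["p1.png"], "page": ["1"], "label": ["NAME"], "color": [(0, 0, 0)],
    "xmin": [10.2], "ymin": [20.1], "xmax": [110.4], "ymax": [35.0],
})

# Decision-process output that still carries the identified text
decisions = pd.DataFrame({
    "page": ["1"], "label": ["NAME"],
    "xmin": [10.0], "ymin": [20.0], "xmax": [110.0], "ymax": [35.0],
    "text": ["John Smith"],
})

# Round both sides so near-identical float coordinates line up as join keys
decisions[["xmin", "ymin", "xmax", "ymax"]] = decisions[["xmin", "ymin", "xmax", "ymax"]].astype(float).round(0)
boxes[["xmin1", "ymin1", "xmax1", "ymax1"]] = boxes[["xmin", "ymin", "xmax", "ymax"]].astype(float).round(0)

merged = boxes.merge(
    decisions,
    left_on=["xmin1", "ymin1", "xmax1", "ymax1", "label", "page"],
    right_on=["xmin", "ymin", "xmax", "ymax", "label", "page"],
    how="left", suffixes=("", "_y"),
).drop(columns=["xmin1", "ymin1", "xmax1", "ymax1", "xmin_y", "ymin_y", "xmax_y", "ymax_y"], errors="ignore")

print(merged[["image", "page", "label", "xmin", "ymin", "xmax", "ymax", "text"]])

Rounding to whole pixels is a pragmatic tolerance: boxes that agree to within half a unit on every edge are treated as the same redaction, so the OCR text survives the round trip through the annotation JSON.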
tools/file_redaction.py CHANGED
@@ -40,6 +40,11 @@ print(f'The value of page_break_value is {page_break_value}')
 max_time_value = get_or_create_env_var('max_time_value', '999999')
 print(f'The value of max_time_value is {max_time_value}')
 
 def sum_numbers_before_seconds(string:str):
 """Extracts numbers that precede the word 'seconds' from a string and adds them up.
 
@@ -396,7 +401,7 @@ def choose_and_run_redactor(file_paths:List[str],
 # Convert json to csv and also save this
 #print("annotations_all_pages:", annotations_all_pages)
 
- review_df = convert_review_json_to_pandas_df(annotations_all_pages)
 
 out_review_file_file_path = out_image_file_path + '_review_file.csv'
 review_df.to_csv(out_review_file_file_path, index=None)
@@ -452,7 +457,7 @@
 
 return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number
 
- def convert_pikepdf_coords_to_pymupdf(pymupdf_page, pikepdf_bbox):
 '''
 Convert annotations from pikepdf to pymupdf format, handling the mediabox larger than rect.
 '''
@@ -474,7 +479,10 @@ def convert_pikepdf_coords_to_pymupdf(pymupdf_page, pikepdf_bbox):
 x_diff_ratio = media_reference_x_diff / reference_box_width
 
 # Extract the annotation rectangle field
- rect_field = pikepdf_bbox["/Rect"]
 rect_coordinates = [float(coord) for coord in rect_field] # Convert to floats
 
 # Unpack coordinates
@@ -487,7 +495,7 @@ def convert_pikepdf_coords_to_pymupdf(pymupdf_page, pikepdf_bbox):
 
 return new_x1, new_y1, new_x2, new_y2
 
- def convert_pikepdf_to_image_coords(pymupdf_page, annot, image:Image):
 '''
 Convert annotations from pikepdf coordinates to image coordinates.
 '''
@@ -504,7 +512,10 @@ def convert_pikepdf_to_image_coords(pymupdf_page, annot, image:Image):
 scale_height = image_page_height / rect_height
 
 # Extract the /Rect field
- rect_field = annot["/Rect"]
 
 # Convert the extracted /Rect field to a list of floats
 rect_coordinates = [float(coord) for coord in rect_field]
@@ -518,9 +529,30 @@ def convert_pikepdf_to_image_coords(pymupdf_page, annot, image:Image):
 
 return x1_image, new_y1_image, x2_image, new_y2_image
 
- def convert_image_coords_to_pymupdf(pymupdf_page, annot:CustomImageRecognizerResult, image:Image):
 '''
- Converts an image with redaction coordinates from a CustomImageRecognizerResult to pymupdf coordinates.
 '''
 
 rect_height = pymupdf_page.rect.height
@@ -533,14 +565,29 @@ def convert_image_coords_to_pymupdf(pymupdf_page, annot:CustomImageRecognizerRes
 scale_height = rect_height / image_page_height
 
 # Calculate scaled coordinates
- x1 = (annot.left * scale_width)# + page_x_adjust
- new_y1 = (annot.top * scale_height)# - page_y_adjust # Flip Y0 (since it starts from bottom)
- x2 = ((annot.left + annot.width) * scale_width)# + page_x_adjust # Calculate x1
- new_y2 = ((annot.top + annot.height) * scale_height)# - page_y_adjust # Calculate y1 correctly
 
- return x1, new_y1, x2, new_y2
 
 
 def convert_gradio_annotation_coords_to_pymupdf(pymupdf_page:Page, annot:dict, image:Image):
 '''
@@ -577,7 +624,7 @@ def move_page_info(file_path: str) -> str:
 
 return new_file_path
 
- def redact_page_with_pymupdf(page:Page, annotations_on_page:dict, image:Image.Image=None, custom_colours:bool=False, redact_whole_page:bool=False):
 
 mediabox_height = page.mediabox[3] - page.mediabox[1]
 mediabox_width = page.mediabox[2] - page.mediabox[0]
@@ -599,10 +646,10 @@ def redact_page_with_pymupdf(page:Page, annotations_on_page:dict, image:Image.Im
 image = Image.open(image_path)
 
 # Check if this is an object used in the Gradio Annotation component
- if isinstance (annotations_on_page, dict):
- annotations_on_page = annotations_on_page["boxes"]
 
- for annot in annotations_on_page:
 # Check if an Image recogniser result, or a Gradio annotation object
 if (isinstance(annot, CustomImageRecognizerResult)) | isinstance(annot, dict):
 
@@ -611,12 +658,16 @@ def redact_page_with_pymupdf(page:Page, annotations_on_page:dict, image:Image.Im
 # Should already be in correct format if img_annotator_box is an input
 if isinstance(annot, dict):
 img_annotation_box = annot
-
 pymupdf_x1, pymupdf_y1, pymupdf_x2, pymupdf_y2 = convert_gradio_annotation_coords_to_pymupdf(page, annot, image)
 
 x1 = pymupdf_x1
 x2 = pymupdf_x2
 
 # Else should be CustomImageRecognizerResult
 else:
 pymupdf_x1, pymupdf_y1, pymupdf_x2, pymupdf_y2 = convert_image_coords_to_pymupdf(page, annot, image)
@@ -633,12 +684,19 @@ def redact_page_with_pymupdf(page:Page, annotations_on_page:dict, image:Image.Im
 img_annotation_box["label"] = annot.entity_type
 except:
 img_annotation_box["label"] = "Redaction"
 
 rect = Rect(x1, pymupdf_y1, x2, pymupdf_y2) # Create the PyMuPDF Rect
 
 # Else it should be a pikepdf annotation object
- else:
- pymupdf_x1, pymupdf_y1, pymupdf_x2, pymupdf_y2 = convert_pikepdf_coords_to_pymupdf(page, annot)
 
 x1 = pymupdf_x1
 x2 = pymupdf_x2
@@ -650,6 +708,8 @@ def redact_page_with_pymupdf(page:Page, annotations_on_page:dict, image:Image.Im
 if image:
 img_width, img_height = image.size
 
 x1, image_y1, x2, image_y2 = convert_pymupdf_to_image_coords(page, x1, pymupdf_y1, x2, pymupdf_y2, image)
 
 img_annotation_box["xmin"] = x1 #* (img_width / rect_width) # Use adjusted x1
@@ -662,6 +722,10 @@ def redact_page_with_pymupdf(page:Page, annotations_on_page:dict, image:Image.Im
 img_annotation_box["label"] = str(annot["/T"])
 else:
 img_annotation_box["label"] = "REDACTION"
 
 # Convert to a PyMuPDF Rect object
 #rect = Rect(rect_coordinates)
@@ -672,29 +736,6 @@ def redact_page_with_pymupdf(page:Page, annotations_on_page:dict, image:Image.Im
 
 # If whole page is to be redacted, do that here
 if redact_whole_page == True:
- # # Small border to page that remains white
- # border = 5
- # # Define the coordinates for the Rect
- # whole_page_x1, whole_page_y1 = 0 + border, 0 + border # Bottom-left corner
- # whole_page_x2, whole_page_y2 = rect_width - border, rect_height - border # Top-right corner
-
- # whole_page_image_x1, whole_page_image_y1, whole_page_image_x2, whole_page_image_y2 = convert_pymupdf_to_image_coords(page, whole_page_x1, whole_page_y1, whole_page_x2, whole_page_y2, image)
-
- # # Create new image annotation element based on whole page coordinates
- # whole_page_rect = Rect(whole_page_x1, whole_page_y1, whole_page_x2, whole_page_y2)
-
- # # Write whole page annotation to annotation boxes
- # whole_page_img_annotation_box = {}
- # whole_page_img_annotation_box["xmin"] = whole_page_image_x1
- # whole_page_img_annotation_box["ymin"] = whole_page_image_y1
- # whole_page_img_annotation_box["xmax"] = whole_page_image_x2
- # whole_page_img_annotation_box["ymax"] = whole_page_image_y2
- # whole_page_img_annotation_box["color"] = (0,0,0)
- # whole_page_img_annotation_box["label"] = "Whole page"
-
- # redact_single_box(page, whole_page_rect, whole_page_img_annotation_box, custom_colours)
-
- # all_image_annotation_boxes.append(whole_page_img_annotation_box)
 
 whole_page_img_annotation_box = redact_whole_pymupdf_page(rect_height, rect_width, image, page, custom_colours, border = 5)
 all_image_annotation_boxes.append(whole_page_img_annotation_box)
@@ -712,14 +753,7 @@ def redact_page_with_pymupdf(page:Page, annotations_on_page:dict, image:Image.Im
 
 return page, out_annotation_boxes
 
- def bounding_boxes_overlap(box1, box2):
- """Check if two bounding boxes overlap."""
- return (box1[0] < box2[2] and box2[0] < box1[2] and
- box1[1] < box2[3] and box2[1] < box1[3])
 
- from collections import defaultdict
- from typing import List, Dict
- import copy
 
 def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_results=[], handwriting_recogniser_results=[], handwrite_signature_checkbox: List[str]=["Redact all identified handwriting", "Redact all identified signatures"], horizontal_threshold:int=50, vertical_threshold:int=12):
 
@@ -822,117 +856,6 @@ def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_result
 unique_bboxes = list({(bbox.left, bbox.top, bbox.width, bbox.height): bbox for bbox in all_bboxes}.values())
 return unique_bboxes
 
-
- # def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_results=[], handwriting_recogniser_results=[], handwrite_signature_checkbox: List[str]=["Redact all identified handwriting", "Redact all identified signatures"], horizontal_threshold:int=50, vertical_threshold:int=12):
- # merged_bboxes = []
- # grouped_bboxes = defaultdict(list)
-
- # # Process signature and handwriting results
- # if signature_recogniser_results or handwriting_recogniser_results:
- # if "Redact all identified handwriting" in handwrite_signature_checkbox:
- # #print("Handwriting boxes exist at merge:", handwriting_recogniser_results)
- # merged_bboxes.extend(handwriting_recogniser_results)
-
- # if "Redact all identified signatures" in handwrite_signature_checkbox:
- # #print("Signature boxes exist at merge:", signature_recogniser_results)
- # merged_bboxes.extend(signature_recogniser_results)
-
-
- # # Reconstruct bounding boxes for substrings of interest
- # reconstructed_bboxes = []
- # for bbox in bboxes:
- # #print("bbox:", bbox)
- # bbox_box = (bbox.left, bbox.top, bbox.left + bbox.width, bbox.top + bbox.height)
- # for line_text, line_info in combined_results.items():
- # line_box = line_info['bounding_box']
- # if bounding_boxes_overlap(bbox_box, line_box):
- # if bbox.text in line_text:
- # start_char = line_text.index(bbox.text)
- # end_char = start_char + len(bbox.text)
-
- # relevant_words = []
- # current_char = 0
- # for word in line_info['words']:
- # word_end = current_char + len(word['text'])
- # if current_char <= start_char < word_end or current_char < end_char <= word_end or (start_char <= current_char and word_end <= end_char):
- # relevant_words.append(word)
- # if word_end >= end_char:
- # break
- # current_char = word_end
- # if not word['text'].endswith(' '):
- # current_char += 1 # +1 for space if the word doesn't already end with a space
-
- # if relevant_words:
- # #print("Relevant words:", relevant_words)
- # left = min(word['bounding_box'][0] for word in relevant_words)
- # top = min(word['bounding_box'][1] for word in relevant_words)
- # right = max(word['bounding_box'][2] for word in relevant_words)
- # bottom = max(word['bounding_box'][3] for word in relevant_words)
-
- # # Combine the text of all relevant words
- # combined_text = " ".join(word['text'] for word in relevant_words)
-
- # # Calculate new dimensions for the merged box
- # reconstructed_bbox = CustomImageRecognizerResult(
- # bbox.entity_type,
- # bbox.start,
- # bbox.end,
- # bbox.score,
- # left,
- # top,
- # right - left, # width
- # bottom - top, # height
- # combined_text
- # )
- # # Add both the original and the merged bounding box
- # reconstructed_bboxes.append(bbox) # Retain the original bbox
- # reconstructed_bboxes.append(reconstructed_bbox) # Add the merged bbox
- # break
- # else:
- # # If the bbox text is not found in any line in combined_results, keep the original bbox
- # reconstructed_bboxes.append(bbox)
-
- # # Group reconstructed bboxes by approximate vertical proximity
- # for box in reconstructed_bboxes:
- # grouped_bboxes[round(box.top / vertical_threshold)].append(box)
-
- # # Merge within each group
- # for _, group in grouped_bboxes.items():
- # group.sort(key=lambda box: box.left)
-
- # merged_box = group[0]
- # for next_box in group[1:]:
- # if next_box.left - (merged_box.left + merged_box.width) <= horizontal_threshold:
- # # Calculate new dimensions for the merged box
- # if merged_box.text == next_box.text:
- # new_text = merged_box.text
- # else:
- # new_text = merged_box.text + " " + next_box.text
-
- # if merged_box.text == next_box.text:
- # new_text = merged_box.text
- # new_entity_type = merged_box.entity_type # Keep the original entity type
- # else:
- # new_text = merged_box.text + " " + next_box.text
- # new_entity_type = merged_box.entity_type + " - " + next_box.entity_type # Concatenate entity types
-
- # new_left = min(merged_box.left, next_box.left)
- # new_top = min(merged_box.top, next_box.top)
- # new_width = max(merged_box.left + merged_box.width, next_box.left + next_box.width) - new_left
- # new_height = max(merged_box.top + merged_box.height, next_box.top + next_box.height) - new_top
- # merged_box = CustomImageRecognizerResult(
- # new_entity_type, merged_box.start, merged_box.end, merged_box.score, new_left, new_top, new_width, new_height, new_text
- # )
- # else:
- # merged_bboxes.append(merged_box)
- # merged_box = next_box
-
- # merged_bboxes.append(merged_box)
-
- # #print("bboxes:", bboxes)
-
- # return merged_bboxes
-
  def redact_image_pdf(file_path:str,
937
  prepared_pdf_file_paths:List[str],
938
  language:str,
@@ -1279,17 +1202,21 @@ def redact_image_pdf(file_path:str,
1279
 
1280
  # Convert decision process to table
1281
  decision_process_table = pd.DataFrame([{
1282
- 'page': reported_page_number,
1283
- 'entity_type': result.entity_type,
 
 
 
 
1284
  'start': result.start,
1285
  'end': result.end,
1286
  'score': result.score,
1287
- 'left': result.left,
1288
- 'top': result.top,
1289
- 'width': result.width,
1290
- 'height': result.height,
1291
- 'text': result.text
1292
- } for result in merged_redaction_bboxes])
1293
 
1294
  all_decision_process_table = pd.concat([all_decision_process_table, decision_process_table])
1295
 
@@ -1323,7 +1250,7 @@ def redact_image_pdf(file_path:str,
1323
  pymupdf_doc = images
1324
 
1325
  # Check if the image already exists in annotations_all_pages
1326
- print("annotations_all_pages:", annotations_all_pages)
1327
  existing_index = next((index for index, ann in enumerate(annotations_all_pages) if ann["image"] == image_annotations["image"]), None)
1328
  if existing_index is not None:
1329
  # Replace the existing annotation
@@ -1346,7 +1273,7 @@ def redact_image_pdf(file_path:str,
1346
  pymupdf_doc = images
1347
 
1348
  # Check if the image already exists in annotations_all_pages
1349
- print("annotations_all_pages:", annotations_all_pages)
1350
  existing_index = next((index for index, ann in enumerate(annotations_all_pages) if ann["image"] == image_annotations["image"]), None)
1351
  if existing_index is not None:
1352
  # Replace the existing annotation
@@ -1595,105 +1522,25 @@ def merge_text_bounding_boxes(analyser_results, characters: List[LTChar], combin
1595
 
1596
  return analysed_bounding_boxes
1597
 
1598
-
1599
- # def merge_text_bounding_boxes(analyser_results, characters:List[LTChar], combine_pixel_dist:int=20, vertical_padding:int=0):
1600
- # '''
1601
- # Merge identified bounding boxes containing PII that are very close to one another
1602
- # '''
1603
- # analysed_bounding_boxes = []
1604
- # if len(analyser_results) > 0 and len(characters) > 0:
1605
- # # Extract bounding box coordinates for sorting
1606
- # bounding_boxes = []
1607
- # text_out = []
1608
- # for result in analyser_results:
1609
- # char_boxes = [char.bbox for char in characters[result.start:result.end] if isinstance(char, LTChar)]
1610
- # char_text = [char._text for char in characters[result.start:result.end] if isinstance(char, LTChar)]
1611
- # if char_boxes:
1612
- # # Calculate the bounding box that encompasses all characters
1613
- # left = min(box[0] for box in char_boxes)
1614
- # bottom = min(box[1] for box in char_boxes)
1615
- # right = max(box[2] for box in char_boxes)
1616
- # top = max(box[3] for box in char_boxes) + vertical_padding
1617
- # bounding_boxes.append((bottom, left, result, [left, bottom, right, top], char_text)) # (y, x, result, bbox, text)
1618
-
1619
- # char_text = "".join(char_text)
1620
-
1621
- # # Sort the results by y-coordinate and then by x-coordinate
1622
- # bounding_boxes.sort()
1623
-
1624
- # merged_bounding_boxes = []
1625
- # current_box = None
1626
- # current_y = None
1627
- # current_result = None
1628
- # current_text = []
1629
-
1630
- # for y, x, result, char_box, text in bounding_boxes:
1631
- # #print(f"Considering result: {result}")
1632
- # #print(f"Character box: {char_box}")
1633
-
1634
- # if current_y is None or current_box is None:
1635
- # current_box = char_box
1636
- # current_y = char_box[1]
1637
- # current_result = result
1638
- # current_text = list(text)
1639
- # #print(f"Starting new box: {current_box}")
1640
- # else:
1641
- # vertical_diff_bboxes = abs(char_box[1] - current_y)
1642
- # horizontal_diff_bboxes = abs(char_box[0] - current_box[2])
1643
-
1644
- # if (
1645
- # vertical_diff_bboxes <= 5 and horizontal_diff_bboxes <= combine_pixel_dist
1646
- # ):
1647
- # #print("box is being extended")
1648
- # current_box[2] = char_box[2] # Extend the current box horizontally
1649
- # current_box[3] = max(current_box[3], char_box[3]) # Ensure the top is the highest
1650
- # current_result.end = max(current_result.end, result.end) # Extend the text range
1651
- # try:
1652
- # current_result.entity_type = current_result.entity_type + " - " + result.entity_type
1653
- # except Exception as e:
1654
- # print("Unable to combine result entity types:")
1655
- # print(e)
1656
- # # Add a space if current_text is not empty
1657
- # if current_text:
1658
- # current_text.append(" ") # Add space between texts
1659
- # current_text.extend(text)
1660
-
1661
- # #print(f"Latest merged box: {current_box[-1]}")
1662
- # else:
1663
- # merged_bounding_boxes.append(
1664
- # {"text":"".join(current_text),"boundingBox": current_box, "result": current_result})
1665
-
1666
- # # Reset current_box and current_y after appending
1667
- # current_box = char_box
1668
- # current_y = char_box[1]
1669
- # current_result = result
1670
- # current_text = list(text)
1671
-
1672
- # # After finishing with the current result, add the last box for this result
1673
- # if current_box:
1674
- # merged_bounding_boxes.append({"text":"".join(current_text), "boundingBox": current_box, "result": current_result})
1675
-
1676
- # if not merged_bounding_boxes:
1677
- # analysed_bounding_boxes.extend(
1678
- # {"text":text, "boundingBox": char.bbox, "result": result}
1679
- # for result in analyser_results
1680
- # for char in characters[result.start:result.end]
1681
- # if isinstance(char, LTChar)
1682
- # )
1683
- # else:
1684
- # analysed_bounding_boxes.extend(merged_bounding_boxes)
1685
-
1686
- # return analysed_bounding_boxes
1687
-
1688
-
1689
 def create_text_redaction_process_results(analyser_results, analysed_bounding_boxes, page_num):
     decision_process_table = pd.DataFrame()
 
     if len(analyser_results) > 0:
         # Create summary df of annotations to be made
         analysed_bounding_boxes_df_new = pd.DataFrame(analysed_bounding_boxes)
         analysed_bounding_boxes_df_text = analysed_bounding_boxes_df_new['result'].astype(str).str.split(",",expand=True).replace(".*: ", "", regex=True)
-        analysed_bounding_boxes_df_text.columns = ["type", "start", "end", "score"]
         analysed_bounding_boxes_df_new = pd.concat([analysed_bounding_boxes_df_new, analysed_bounding_boxes_df_text], axis = 1)
         analysed_bounding_boxes_df_new['page'] = page_num + 1
         decision_process_table = pd.concat([decision_process_table, analysed_bounding_boxes_df_new], axis = 0).drop('result', axis=1)
@@ -1702,8 +1549,8 @@ def create_text_redaction_process_results(analyser_results, analysed_bounding_bo
 
     return decision_process_table
 
-def create_annotations_for_bounding_boxes(analysed_bounding_boxes):
-    annotations_on_page = []
     for analysed_bounding_box in analysed_bounding_boxes:
         bounding_box = analysed_bounding_box["boundingBox"]
         annotation = Dictionary(
@@ -1721,8 +1568,8 @@ def create_annotations_for_bounding_boxes(analysed_bounding_boxes):
             S=Name.S # Border style: solid
         )
     )
-    annotations_on_page.append(annotation)
-    return annotations_on_page
 
 def redact_text_pdf(
     filename: str, # Path to the PDF file to be redacted
@@ -1840,13 +1687,17 @@ def redact_text_pdf(
 
     if page_min <= page_no < page_max:
 
         for page_layout in extract_pages(filename, page_numbers = [page_no], maxpages=1):
 
             page_analyser_results = []
             page_analysed_bounding_boxes = []
 
             characters = []
-            annotations_on_page = []
             decision_process_table_on_page = pd.DataFrame()
             page_text_outputs = pd.DataFrame()
 
@@ -1900,8 +1751,7 @@ def redact_text_pdf(
             )
             all_text_line_results.append((i, text_line_analyser_result))
 
-            print("all_text_line_results:", all_text_line_results)
-
             elif pii_identification_method == "AWS Comprehend":
 
                 # First use the local Spacy model to pick up custom entities that AWS Comprehend can't search for.
@@ -2006,17 +1856,24 @@ def redact_text_pdf(
             text_container_analyser_results.extend(text_line_analyser_result)
             text_container_analysed_bounding_boxes.extend(text_line_bounding_boxes)
 
-            print("text_container_analyser_results:", text_container_analyser_results)
 
             page_analysed_bounding_boxes.extend(text_line_bounding_boxes) # Add this line
 
-            print("page_analysed_bounding_boxes:", page_analysed_bounding_boxes)
 
             # Annotate redactions on page
-            annotations_on_page = create_annotations_for_bounding_boxes(page_analysed_bounding_boxes)
 
-            print("annotations_on_page:", annotations_on_page)
 
             # Make pymupdf page redactions
             #print("redact_whole_page_list:", redact_whole_page_list)
@@ -2025,7 +1882,9 @@ def redact_text_pdf(
             else: redact_whole_page = False
             else: redact_whole_page = False
 
-            pymupdf_page, image_annotations = redact_page_with_pymupdf(pymupdf_page, annotations_on_page, image, redact_whole_page=redact_whole_page)
 
             #print("Did redact_page_with_pymupdf function")
             reported_page_no = page_no + 1
@@ -2037,6 +1896,7 @@ def redact_text_pdf(
 
             if not decision_process_table_on_page.empty:
                 all_decision_process_table = pd.concat([all_decision_process_table, decision_process_table_on_page])
 
             if not page_text_outputs.empty:
                 page_text_outputs = page_text_outputs.sort_values(["top", "left"], ascending=[False, False]).reset_index(drop=True)
 
 max_time_value = get_or_create_env_var('max_time_value', '999999')
 print(f'The value of max_time_value is {max_time_value}')
 
+def bounding_boxes_overlap(box1, box2):
+    """Check if two bounding boxes overlap."""
+    return (box1[0] < box2[2] and box2[0] < box1[2] and
+            box1[1] < box2[3] and box2[1] < box1[3])
+
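
A quick illustration of the overlap test above, assuming boxes are (xmin, ymin, xmax, ymax) tuples: the strict inequalities mean boxes that merely touch at an edge are not treated as overlapping.

    assert bounding_boxes_overlap((0, 0, 10, 10), (5, 5, 15, 15))       # interiors intersect
    assert not bounding_boxes_overlap((0, 0, 10, 10), (10, 0, 20, 10))  # shared edge only
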
 def sum_numbers_before_seconds(string:str):
     """Extracts numbers that precede the word 'seconds' from a string and adds them up.
 
     # Convert json to csv and also save this
     #print("annotations_all_pages:", annotations_all_pages)
 
+    review_df = convert_review_json_to_pandas_df(annotations_all_pages, all_decision_process_table)
 
     out_review_file_file_path = out_image_file_path + '_review_file.csv'
     review_df.to_csv(out_review_file_file_path, index=None)
 
     return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number
 
+def convert_pikepdf_coords_to_pymupdf(pymupdf_page, pikepdf_bbox, type="pikepdf_annot"):
     '''
     Convert annotations from pikepdf to pymupdf format, handling the case where the mediabox is larger than the rect.
     '''
 
     x_diff_ratio = media_reference_x_diff / reference_box_width
 
     # Extract the annotation rectangle field
+    if type=="pikepdf_annot":
+        rect_field = pikepdf_bbox["/Rect"]
+    else:
+        rect_field = pikepdf_bbox
     rect_coordinates = [float(coord) for coord in rect_field] # Convert to floats
 
     # Unpack coordinates
 
     return new_x1, new_y1, new_x2, new_y2
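
The new type flag lets the same converter take either a full pikepdf annotation or a bare rectangle. A minimal sketch of the two call shapes (coordinate values and the "bbox" type string are illustrative, not from the commit):

    # pikepdf annotation object: the /Rect field is read out of the mapping
    x1, y1, x2, y2 = convert_pikepdf_coords_to_pymupdf(pymupdf_page, annot, type="pikepdf_annot")
    # bare [x0, y0, x1, y1] rectangle: pass any other type value
    x1, y1, x2, y2 = convert_pikepdf_coords_to_pymupdf(pymupdf_page, [72, 700, 200, 715], type="bbox")
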
 
+def convert_pikepdf_to_image_coords(pymupdf_page, annot, image:Image, type="pikepdf_annot"):
     '''
     Convert annotations from pikepdf coordinates to image coordinates.
     '''
 
     scale_height = image_page_height / rect_height
 
     # Extract the /Rect field
+    if type=="pikepdf_annot":
+        rect_field = annot["/Rect"]
+    else:
+        rect_field = annot
 
     # Convert the extracted /Rect field to a list of floats
     rect_coordinates = [float(coord) for coord in rect_field]
 
     return x1_image, new_y1_image, x2_image, new_y2_image
 
+def convert_pikepdf_decision_output_to_image_coords(pymupdf_page, pikepdf_decision_output_data:List, image):
+    if isinstance(image, str):
+        image_path = image
+        image = Image.open(image_path)
+
+    # Loop through each item in the data
+    for item in pikepdf_decision_output_data:
+        # Extract the bounding box
+        bounding_box = item['boundingBox']
+
+        # Create a pikepdf_bbox dictionary to match the expected input
+        pikepdf_bbox = {"/Rect": bounding_box}
+
+        # Call the conversion function
+        new_x1, new_y1, new_x2, new_y2 = convert_pikepdf_to_image_coords(pymupdf_page, pikepdf_bbox, image, type="pikepdf_annot")
+
+        # Update the original object with the new bounding box values
+        item['boundingBox'] = [new_x1, new_y1, new_x2, new_y2]
+
+    return pikepdf_decision_output_data
+
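
A minimal usage sketch for the helper above (entry values are hypothetical): each item's 'boundingBox' is rewritten in place from PDF coordinates to image pixel coordinates.

    decisions = [{"text": "John Smith", "boundingBox": [72, 700, 200, 715], "result": None}]
    decisions = convert_pikepdf_decision_output_to_image_coords(pymupdf_page, decisions, image)
    # decisions[0]["boundingBox"] now holds image-space coordinates
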
+def convert_image_coords_to_pymupdf(pymupdf_page, annot, image:Image, type="image_recognizer"):
     '''
+    Convert redaction coordinates in image space, from either a CustomImageRecognizerResult or a pikepdf-derived object, to pymupdf page coordinates.
     '''
 
     rect_height = pymupdf_page.rect.height
 
     scale_height = rect_height / image_page_height
 
     # Calculate scaled coordinates
+    if type == "image_recognizer":
+        x1 = (annot.left * scale_width)# + page_x_adjust
+        new_y1 = (annot.top * scale_height)# - page_y_adjust
+        x2 = ((annot.left + annot.width) * scale_width)# + page_x_adjust # Calculate x2
+        new_y2 = ((annot.top + annot.height) * scale_height)# - page_y_adjust # Calculate y2
+    # Else assume it is a pikepdf-derived object
+    else:
+        rect_field = annot["/Rect"]
+        rect_coordinates = [float(coord) for coord in rect_field] # Convert to floats
 
+        # Unpack coordinates
+        x1, y1, x2, y2 = rect_coordinates
+
+        #print("scale_width:", scale_width)
+        #print("scale_height:", scale_height)
 
+        # Scale all four coordinates in one step, so x1 is not
+        # overwritten before it is used to compute x2
+        x1, new_y1, x2, new_y2 = (x1 * scale_width,
+                                  y1 * scale_height,
+                                  x2 * scale_width,
+                                  y2 * scale_height)
 
+    return x1, new_y1, x2, new_y2
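
A quick sanity check on the scaling above, with hypothetical numbers: a 595 x 842 pt page rendered as a 1190 x 1684 px image gives scale factors of 0.5 in both directions.

    scale_width, scale_height = 595 / 1190, 842 / 1684   # both 0.5
    # An image-space box (100, 200, 300, 250) then maps to the pymupdf rect (50.0, 100.0, 150.0, 125.0)
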
 
 def convert_gradio_annotation_coords_to_pymupdf(pymupdf_page:Page, annot:dict, image:Image):
     '''
 
     return new_file_path
 
+def redact_page_with_pymupdf(page:Page, page_annotations:dict, image=None, custom_colours:bool=False, redact_whole_page:bool=False, convert_coords:bool=True):
 
     mediabox_height = page.mediabox[3] - page.mediabox[1]
     mediabox_width = page.mediabox[2] - page.mediabox[0]
 
     image = Image.open(image_path)
 
     # Check if this is an object used in the Gradio Annotation component
+    if isinstance(page_annotations, dict):
+        page_annotations = page_annotations["boxes"]
 
652
+ for annot in page_annotations:
653
  # Check if an Image recogniser result, or a Gradio annotation object
654
  if (isinstance(annot, CustomImageRecognizerResult)) | isinstance(annot, dict):
655
 
 
658
  # Should already be in correct format if img_annotator_box is an input
659
  if isinstance(annot, dict):
660
  img_annotation_box = annot
 
661
  pymupdf_x1, pymupdf_y1, pymupdf_x2, pymupdf_y2 = convert_gradio_annotation_coords_to_pymupdf(page, annot, image)
662
 
663
  x1 = pymupdf_x1
664
  x2 = pymupdf_x2
665
 
666
+ # if hasattr(annot, 'text') and annot.text:
667
+ # img_annotation_box["text"] = annot.text
668
+ # else:
669
+ # img_annotation_box["text"] = ""
670
+
671
  # Else should be CustomImageRecognizerResult
672
  else:
673
  pymupdf_x1, pymupdf_y1, pymupdf_x2, pymupdf_y2 = convert_image_coords_to_pymupdf(page, annot, image)
 
684
  img_annotation_box["label"] = annot.entity_type
685
  except:
686
  img_annotation_box["label"] = "Redaction"
687
+ # if hasattr(annot, 'text') and annot.text:
688
+ # img_annotation_box["text"] = annot.text
689
+ # else:
690
+ # img_annotation_box["text"] = ""
691
 
692
  rect = Rect(x1, pymupdf_y1, x2, pymupdf_y2) # Create the PyMuPDF Rect
693
 
694
  # Else it should be a pikepdf annotation object
695
+ else:
696
+ if convert_coords == True:
697
+ pymupdf_x1, pymupdf_y1, pymupdf_x2, pymupdf_y2 = convert_pikepdf_coords_to_pymupdf(page, annot)
698
+ else:
699
+ pymupdf_x1, pymupdf_y1, pymupdf_x2, pymupdf_y2 = convert_image_coords_to_pymupdf(page, annot, image, type="pikepdf_image_coords")
700
 
701
  x1 = pymupdf_x1
702
  x2 = pymupdf_x2
 
708
  if image:
709
  img_width, img_height = image.size
710
 
711
+ print("annot:", annot)
712
+
713
  x1, image_y1, x2, image_y2 = convert_pymupdf_to_image_coords(page, x1, pymupdf_y1, x2, pymupdf_y2, image)
714
 
715
  img_annotation_box["xmin"] = x1 #* (img_width / rect_width) # Use adjusted x1
 
722
  img_annotation_box["label"] = str(annot["/T"])
723
  else:
724
  img_annotation_box["label"] = "REDACTION"
725
+ # if hasattr(annot, 'text') and annot.text:
726
+ # img_annotation_box["text"] = annot.text
727
+ # else:
728
+ # img_annotation_box["text"] = ""
729
 
730
  # Convert to a PyMuPDF Rect object
731
  #rect = Rect(rect_coordinates)
 
736
 
737
  # If whole page is to be redacted, do that here
738
  if redact_whole_page == True:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
739
 
740
  whole_page_img_annotation_box = redact_whole_pymupdf_page(rect_height, rect_width, image, page, custom_colours, border = 5)
741
  all_image_annotation_boxes.append(whole_page_img_annotation_box)
 
753
 
754
  return page, out_annotation_boxes
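
For orientation, a sketch of the two annotation shapes this function now accepts (field values hypothetical): the Gradio annotation component supplies a dict wrapping a 'boxes' list, which the isinstance check above unwraps, while the text-redaction path supplies pikepdf annotation objects carrying a '/Rect'.

    gradio_annotations = {"image": "page_1.png",
                          "boxes": [{"xmin": 50, "ymin": 100, "xmax": 150, "ymax": 125,
                                     "label": "PERSON"}]}
    page, boxes = redact_page_with_pymupdf(page, gradio_annotations, image=image)
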
 
 def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_results=[], handwriting_recogniser_results=[], handwrite_signature_checkbox: List[str]=["Redact all identified handwriting", "Redact all identified signatures"], horizontal_threshold:int=50, vertical_threshold:int=12):
 
     unique_bboxes = list({(bbox.left, bbox.top, bbox.width, bbox.height): bbox for bbox in all_bboxes}.values())
     return unique_bboxes
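
The deduplication line above keys a dict comprehension on the box geometry, so later boxes silently replace earlier ones with identical coordinates. A standalone sketch of the idiom, with a namedtuple standing in for the real bbox class:

    from collections import namedtuple

    Box = namedtuple("Box", "left top width height")
    all_bboxes = [Box(0, 0, 10, 5), Box(0, 0, 10, 5), Box(20, 0, 10, 5)]
    unique_bboxes = list({(b.left, b.top, b.width, b.height): b for b in all_bboxes}.values())
    assert len(unique_bboxes) == 2
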
 
 def redact_image_pdf(file_path:str,
                      prepared_pdf_file_paths:List[str],
                      language:str,
 
     # Convert decision process to table
     decision_process_table = pd.DataFrame([{
+        'text': result.text,
+        'xmin': result.left,
+        'ymin': result.top,
+        'xmax': result.left + result.width,
+        'ymax': result.top + result.height,
+        'label': result.entity_type,
         'start': result.start,
         'end': result.end,
         'score': result.score,
+        'page': reported_page_number
+    } for result in merged_redaction_bboxes]) #'left': result.left,
+                                              #'top': result.top,
+                                              #'width': result.width,
+                                              #'height': result.height,
 
     all_decision_process_table = pd.concat([all_decision_process_table, decision_process_table])
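
With the extra fields added here, each decision-table row now records the matched text, its pixel-space box, its label and its page in one place. A sketch of a single resulting row (values hypothetical):

    # {'text': 'John Smith', 'xmin': 100, 'ymin': 200, 'xmax': 300, 'ymax': 250,
    #  'label': 'PERSON', 'start': 0, 'end': 10, 'score': 0.85, 'page': 1}
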
 
     pymupdf_doc = images
 
     # Check if the image already exists in annotations_all_pages
+    #print("annotations_all_pages:", annotations_all_pages)
     existing_index = next((index for index, ann in enumerate(annotations_all_pages) if ann["image"] == image_annotations["image"]), None)
     if existing_index is not None:
         # Replace the existing annotation
 
     pymupdf_doc = images
 
     # Check if the image already exists in annotations_all_pages
+    #print("annotations_all_pages:", annotations_all_pages)
     existing_index = next((index for index, ann in enumerate(annotations_all_pages) if ann["image"] == image_annotations["image"]), None)
     if existing_index is not None:
         # Replace the existing annotation
 
     return analysed_bounding_boxes
 
 def create_text_redaction_process_results(analyser_results, analysed_bounding_boxes, page_num):
     decision_process_table = pd.DataFrame()
 
     if len(analyser_results) > 0:
         # Create summary df of annotations to be made
         analysed_bounding_boxes_df_new = pd.DataFrame(analysed_bounding_boxes)
+
+        # Remove brackets and split the string into four separate columns
+        #print("analysed_bounding_boxes_df_new:", analysed_bounding_boxes_df_new['boundingBox'])
+        # analysed_bounding_boxes_df_new[['xmin', 'ymin', 'xmax', 'ymax']] = analysed_bounding_boxes_df_new['boundingBox'].str.strip('[]').str.split(',', expand=True)
+
+        # Split the boundingBox list into four separate columns
+        analysed_bounding_boxes_df_new[['xmin', 'ymin', 'xmax', 'ymax']] = analysed_bounding_boxes_df_new['boundingBox'].apply(pd.Series)
+
+        # Convert the new columns to floats
+        analysed_bounding_boxes_df_new[['xmin', 'ymin', 'xmax', 'ymax']] = analysed_bounding_boxes_df_new[['xmin', 'ymin', 'xmax', 'ymax']].astype(float)
+
         analysed_bounding_boxes_df_text = analysed_bounding_boxes_df_new['result'].astype(str).str.split(",",expand=True).replace(".*: ", "", regex=True)
+        analysed_bounding_boxes_df_text.columns = ["label", "start", "end", "score"]
         analysed_bounding_boxes_df_new = pd.concat([analysed_bounding_boxes_df_new, analysed_bounding_boxes_df_text], axis = 1)
         analysed_bounding_boxes_df_new['page'] = page_num + 1
         decision_process_table = pd.concat([decision_process_table, analysed_bounding_boxes_df_new], axis = 0).drop('result', axis=1)
 
     return decision_process_table
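
A standalone sketch of the list-column expansion used above: .apply(pd.Series) turns each four-element boundingBox list into four aligned columns.

    import pandas as pd

    df = pd.DataFrame({"boundingBox": [[72.0, 700.0, 200.0, 715.0]]})
    df[["xmin", "ymin", "xmax", "ymax"]] = df["boundingBox"].apply(pd.Series)
    # A faster equivalent on large frames:
    # pd.DataFrame(df["boundingBox"].tolist(), columns=["xmin", "ymin", "xmax", "ymax"])
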
 
+def create_pikepdf_annotations_for_bounding_boxes(analysed_bounding_boxes):
+    pikepdf_annotations_on_page = []
     for analysed_bounding_box in analysed_bounding_boxes:
         bounding_box = analysed_bounding_box["boundingBox"]
         annotation = Dictionary(
 
             S=Name.S # Border style: solid
         )
     )
+    pikepdf_annotations_on_page.append(annotation)
+    return pikepdf_annotations_on_page
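
A minimal input/output sketch for the renamed helper (field values hypothetical): each entry needs a 'boundingBox' already in page coordinates, and the return value is a list of pikepdf Dictionary annotations ready to attach to a page's /Annots array.

    boxes = [{"text": "John Smith", "boundingBox": [72, 700, 200, 715], "result": None}]
    pikepdf_annotations = create_pikepdf_annotations_for_bounding_boxes(boxes)
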
 
 def redact_text_pdf(
     filename: str, # Path to the PDF file to be redacted
 
     if page_min <= page_no < page_max:
 
+        if isinstance(image, str):
+            image_path = image
+            image = Image.open(image_path)
+
         for page_layout in extract_pages(filename, page_numbers = [page_no], maxpages=1):
 
             page_analyser_results = []
             page_analysed_bounding_boxes = []
 
             characters = []
+            pikepdf_annotations_on_page = []
             decision_process_table_on_page = pd.DataFrame()
             page_text_outputs = pd.DataFrame()
 
             )
             all_text_line_results.append((i, text_line_analyser_result))
 
+
             elif pii_identification_method == "AWS Comprehend":
 
                 # First use the local Spacy model to pick up custom entities that AWS Comprehend can't search for.
 
             text_container_analyser_results.extend(text_line_analyser_result)
             text_container_analysed_bounding_boxes.extend(text_line_bounding_boxes)
 
+            #print("text_container_analyser_results:", text_container_analyser_results)
 
+            page_analyser_results.extend(text_container_analyser_results) # Add this line
             page_analysed_bounding_boxes.extend(text_line_bounding_boxes) # Add this line
 
+            #print("page_analyser_results:", page_analyser_results)
+            #print("page_analysed_bounding_boxes:", page_analysed_bounding_boxes)
+            #print("image:", image)
+
+            page_analysed_bounding_boxes = convert_pikepdf_decision_output_to_image_coords(pymupdf_page, page_analysed_bounding_boxes, image)
+
+            #print("page_analysed_bounding_boxes_out_converted:", page_analysed_bounding_boxes)
 
             # Annotate redactions on page
+            pikepdf_annotations_on_page = create_pikepdf_annotations_for_bounding_boxes(page_analysed_bounding_boxes)
 
+            #print("pikepdf_annotations_on_page:", pikepdf_annotations_on_page)
 
             # Make pymupdf page redactions
             #print("redact_whole_page_list:", redact_whole_page_list)
 
             else: redact_whole_page = False
             else: redact_whole_page = False
 
+            pymupdf_page, image_annotations = redact_page_with_pymupdf(pymupdf_page, pikepdf_annotations_on_page, image, redact_whole_page=redact_whole_page, convert_coords=False)
+
+            #print("image_annotations:", image_annotations)
 
             #print("Did redact_page_with_pymupdf function")
             reported_page_no = page_no + 1
 
             if not decision_process_table_on_page.empty:
                 all_decision_process_table = pd.concat([all_decision_process_table, decision_process_table_on_page])
+                #print("all_decision_process_table:", all_decision_process_table)
 
             if not page_text_outputs.empty:
                 page_text_outputs = page_text_outputs.sort_values(["top", "left"], ascending=[False, False]).reset_index(drop=True)
tools/redaction_review.py CHANGED
@@ -68,6 +68,7 @@ def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, re
     #print("review_dataframe['label']", review_dataframe["label"])
     recogniser_entities = review_dataframe["label"].unique().tolist()
     recogniser_entities.append("ALL")
 
     #print("recogniser_entities:", recogniser_entities)
 
@@ -187,7 +188,7 @@ def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, re
 
     return out_image_annotator, number_reported, number_reported, page_num_reported, recogniser_entities_drop, recogniser_dataframe_out, recogniser_dataframe_gr
 
-def modify_existing_page_redactions(image_annotated:AnnotatedImageData, current_page:int, previous_page:int, all_image_annotations:List[AnnotatedImageData], clear_all:bool=False):
     '''
     Overwrite current image annotations with modifications
     '''
@@ -198,6 +199,8 @@ def modify_existing_page_redactions(image_annotated:AnnotatedImageData, current_
     #If no previous page or is 0, i.e. first time run, then rewrite current page
     #if not previous_page:
     #    previous_page = current_page
 
     image_annotated['image'] = all_image_annotations[previous_page - 1]["image"]
 
@@ -206,9 +209,26 @@ def modify_existing_page_redactions(image_annotated:AnnotatedImageData, current_
     else:
         all_image_annotations[previous_page - 1]["boxes"] = []
 
-    return all_image_annotations, current_page, current_page
 
-def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], doc:Document, all_image_annotations:List[AnnotatedImageData], current_page:int, progress=gr.Progress(track_tqdm=True)):
     '''
     Apply modified redactions to a pymupdf and export review files
     '''
@@ -302,7 +322,7 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], d
     output_files.append(out_pdf_file_path)
 
     try:
-        # print("Saving annotations to JSON")
 
         out_annotation_file_path = output_folder + file_base + '_review_file.json'
         with open(out_annotation_file_path, 'w') as f:
@@ -311,14 +331,16 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], d
 
     print("Saving annotations to CSV review file")
 
     # Convert json to csv and also save this
-    review_df = convert_review_json_to_pandas_df(all_image_annotations)
     out_review_file_file_path = output_folder + file_base + '_review_file.csv'
     review_df.to_csv(out_review_file_file_path, index=None)
     output_files.append(out_review_file_file_path)
 
 except Exception as e:
-    print("Could not save annotations to json file:", e)
 
 return doc, all_image_annotations, output_files, output_log_files
 
     #print("review_dataframe['label']", review_dataframe["label"])
     recogniser_entities = review_dataframe["label"].unique().tolist()
     recogniser_entities.append("ALL")
+    recogniser_entities = sorted(recogniser_entities)
 
     #print("recogniser_entities:", recogniser_entities)
 
 
     return out_image_annotator, number_reported, number_reported, page_num_reported, recogniser_entities_drop, recogniser_dataframe_out, recogniser_dataframe_gr
 
+def modify_existing_page_redactions(image_annotated:AnnotatedImageData, current_page:int, previous_page:int, all_image_annotations:List[AnnotatedImageData], recogniser_entities_drop=gr.Dropdown(value="ALL", allow_custom_value=True), recogniser_dataframe=gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]})), clear_all:bool=False):
     '''
     Overwrite current image annotations with modifications
     '''
 
     #If no previous page or is 0, i.e. first time run, then rewrite current page
     #if not previous_page:
     #    previous_page = current_page
+
+    #print("image_annotated:", image_annotated)
 
     image_annotated['image'] = all_image_annotations[previous_page - 1]["image"]
 
     else:
         all_image_annotations[previous_page - 1]["boxes"] = []
 
+    #print("all_image_annotations:", all_image_annotations)
+
+    # Rewrite all_image_annotations search dataframe with latest updates
+    try:
+        review_dataframe = convert_review_json_to_pandas_df(all_image_annotations)[["page", "label"]]
+        #print("review_dataframe['label']", review_dataframe["label"])
+        recogniser_entities = review_dataframe["label"].unique().tolist()
+        recogniser_entities.append("ALL")
+        recogniser_entities = sorted(recogniser_entities)
 
+        recogniser_dataframe_out = gr.Dataframe(review_dataframe)
+        #recogniser_dataframe_gr = gr.Dataframe(review_dataframe)
+        recogniser_entities_drop = gr.Dropdown(value=recogniser_entities_drop, choices=recogniser_entities, allow_custom_value=True, interactive=True)
+    except Exception as e:
+        print("Could not extract recogniser information:", e)
+        recogniser_dataframe_out = recogniser_dataframe
+
+    return all_image_annotations, current_page, current_page, recogniser_entities_drop, recogniser_dataframe_out
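
A condensed sketch of what the new try block achieves on each page change (annotation data hypothetical): the entity list for the side review bar is recomputed from every page's boxes, with "ALL" appended and the result sorted before it feeds the dropdown choices.

    annotations = [{"image": "page_1.png", "boxes": [{"label": "PERSON"}, {"label": "EMAIL"}]}]
    labels = sorted({box["label"] for page in annotations for box in page["boxes"]} | {"ALL"})
    # -> ['ALL', 'EMAIL', 'PERSON']
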
+
+def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], doc:Document, all_image_annotations:List[AnnotatedImageData], current_page:int, review_file_state, progress=gr.Progress(track_tqdm=True)):
     '''
     Apply modified redactions to a pymupdf and export review files
     '''
 
     output_files.append(out_pdf_file_path)
 
     try:
+        print("Saving annotations to JSON")
 
         out_annotation_file_path = output_folder + file_base + '_review_file.json'
         with open(out_annotation_file_path, 'w') as f:
 
         print("Saving annotations to CSV review file")
 
+        print("review_file_state:", review_file_state)
+
         # Convert json to csv and also save this
+        review_df = convert_review_json_to_pandas_df(all_image_annotations, review_file_state)
         out_review_file_file_path = output_folder + file_base + '_review_file.csv'
         review_df.to_csv(out_review_file_file_path, index=None)
         output_files.append(out_review_file_file_path)
 
     except Exception as e:
+        print("Could not save annotations to json or csv file:", e)
 
     return doc, all_image_annotations, output_files, output_log_files
346