Spaces:

seanpedrickcase
/

document_redaction

Running

App Files Files Community

seanpedrickcase committed on Dec 24, 2024

Commit

1d772de

1 Parent(s): a770956

Refactor redaction functionality and enhance UI components: Added support for custom recognizers and whole page redaction options. Updated file handling to include new dropdowns for entity selection and improved dataframes for entity management. Enhanced the annotator with better state management and UI responsiveness. Cleaned up redundant code and improved overall performance in the redaction process.

Browse files

Files changed (6) hide show

app.py +75 -42
tools/file_conversion.py +216 -47
tools/file_redaction.py +83 -109
tools/helper_functions.py +0 -2
tools/load_spacy_model_custom_recognisers.py +3 -1
tools/redaction_review.py +81 -34

app.py CHANGED Viewed

@@ -13,7 +13,7 @@ from tools.helper_functions import ensure_output_folder_exists, add_folder_to_pa
 from tools.aws_functions import upload_file_to_s3, download_file_from_s3, RUN_AWS_FUNCTIONS, bucket_name
 from tools.file_redaction import choose_and_run_redactor
 from tools.file_conversion import prepare_image_or_pdf, get_input_file_names
-from tools.redaction_review import apply_redactions, modify_existing_page_redactions, decrease_page, increase_page, update_annotator, update_zoom
 from tools.data_anonymise import anonymise_data_files
 from tools.auth import authenticate_user
 from tools.load_spacy_model_custom_recognisers import custom_entities
@@ -34,7 +34,7 @@ full_comprehend_entity_list = ['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBI
 chosen_comprehend_entities.extend(custom_entities)
 full_comprehend_entity_list.extend(custom_entities)
-chosen_redact_entities = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE"]
 full_entity_list = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", 'CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'IBAN_CODE', 'IP_ADDRESS', 'NRP', 'LOCATION', 'MEDICAL_LICENSE', 'URL', 'UK_NHS', 'CUSTOM']
@@ -67,8 +67,6 @@ with app:
     all_line_level_ocr_results_df_state = gr.State(pd.DataFrame())
     all_decision_process_table_state = gr.State(pd.DataFrame())
     session_hash_state = gr.State()
     s3_output_folder_state = gr.State()
@@ -101,7 +99,12 @@ with app:
     doc_full_file_name_textbox = gr.Textbox(label = "doc_full_file_name_textbox", value="", visible=False)
     doc_file_name_no_extension_textbox = gr.Textbox(label = "doc_full_file_name_textbox", value="", visible=False)
     doc_file_name_with_extension_textbox = gr.Textbox(label = "doc_file_name_with_extension_textbox", value="", visible=False)
-    data_file_name_textbox = gr.Textbox(label = "data_file_name_textbox", value="", visible=False)
     estimated_time_taken_number = gr.Number(label = "estimated_time_taken_number", value=0.0, precision=1, visible=False) # This keeps track of the time taken to redact files for logging purposes.
     annotate_previous_page = gr.Number(value=0, label="Previous page", precision=0, visible=False) # Keeps track of the last page that the annotator was on
@@ -124,12 +127,12 @@ with app:
     default_deny_list_file_name = "default_deny_list.csv"
     default_deny_list_loc = output_folder + "/" + default_deny_list_file_name
-    in_deny_list_state = gr.State(pd.DataFrame())
     in_deny_list_text_in = gr.Textbox(value="Deny list", visible=False)
     fully_redacted_list_file_name = "default_fully_redacted_list.csv"
     fully_redacted_list_loc = output_folder + "/" + fully_redacted_list_file_name
-    in_fully_redacted_list_state = gr.State(pd.DataFrame())
     in_fully_redacted_text_in = gr.Textbox(value="Fully redacted page list", visible=False)
     # S3 settings for default allow list load
@@ -137,6 +140,9 @@ with app:
     s3_default_allow_list_file = gr.Textbox(label = "Default allow list file", value=default_allow_list_file_name, visible=False)
     default_allow_list_output_folder_location = gr.Textbox(label = "Output default allow list location", value=default_allow_list_loc, visible=False)
     ###
     # UI DESIGN
     ###
@@ -146,7 +152,9 @@ with app:
     Redact personally identifiable information (PII) from documents (pdf, images), open text, or tabular data (xlsx/csv/parquet). Documents/images can be redacted using 'Quick' image analysis that works fine for typed text, but not handwriting/signatures. On the Redaction settings tab, choose 'Complex image analysis' OCR using AWS Textract (if you are using AWS) to redact these more complex elements (this service has a cost). Additionally you can choose the method for PII identification. 'Local' gives quick, lower quality results, AWS Comprehend gives better results but has a cost.
-    Review suggested redactions on the 'Review redactions' tab using a point and click visual interface. See the 'Redaction settings' tab to choose which pages to redact, the type of information to redact (e.g. people, places), or terms to exclude from redaction. Please see the [User Guide](https://github.com/seanpedrick-case/doc_redaction/blob/main/README.md) for a walkthrough on how to use this and all other features in the app. The app accepts a maximum file size of 100mb. Please consider giving feedback for the quality of the answers underneath the redact buttons when the option appears, this will help to improve the app in future.
     NOTE: In testing the app seems to find about 60% of personal information on a given (typed) page of text. It is essential that all outputs are checked **by a human** to ensure that all personal information has been removed.""")
@@ -183,7 +191,7 @@ with app:
     # Object annotation
     with gr.Tab("Review redactions", id="tab_object_annotation"):
-        with gr.Accordion(label = "Review previous redactions", open=True):
             output_review_files = gr.File(label="Review output files", file_count='multiple')
             upload_previous_review_file_btn = gr.Button("Review previously created redaction file (upload original PDF and ...review_file.csv)")
@@ -200,17 +208,35 @@ with app:
         annotation_button_apply = gr.Button("Apply revised redactions", variant="primary")
-        annotator = image_annotator(
-            label="Modify redaction boxes",
-            label_list=["Redaction"],
-            label_colors=[(0, 0, 0)],
-            show_label=False,
-            sources=None,#["upload"],
-            show_clear_button=False,
-            show_share_button=False,
-            show_remove_button=False,
-            interactive=False
-        )
         with gr.Row():
             annotation_last_page_button_bottom = gr.Button("Previous page", scale = 3)
@@ -290,36 +316,41 @@ with app:
     ###
     # PDF/IMAGE REDACTION
     ###
-    in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox])
     document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator]).\
     then(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, first_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state], api_name="prepare_doc").\
     then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
                     outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number], api_name="redact_doc").\
-                    then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page])
     # If the app has completed a batch of pages, it will run this until the end of all pages in the document
     current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
                     outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number]).\
-                    then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page])
     # If a file has been completed, the function will continue onto the next document
-    latest_file_completed_text.change(fn=update_annotator, inputs=[all_image_annotations_state, page_min, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page]).\
                     then(fn=reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title])
     ###
     # REVIEW PDF REDACTIONS
     ###
     # Page controls at top
     annotate_current_page.submit(
         modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
-        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page])
     annotation_last_page_button.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
-        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page])
     annotation_next_page_button.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
-        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page])
     # Zoom in and out on annotator
     annotate_zoom_in.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
@@ -328,10 +359,10 @@ with app:
     annotate_zoom_out.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
         then(update_zoom, inputs=[annotator_zoom_number, annotate_current_page, zoom_false_bool], outputs=[annotator_zoom_number, annotate_current_page])
-    annotator_zoom_number.change(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page])
     clear_all_redactions_on_page_btn.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, clear_all_page_redactions], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
-        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page])
     #annotation_button_get.click(get_boxes_json, annotator, json_boxes)
     annotation_button_apply.click(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output], scroll_to_output=True)
@@ -339,23 +370,26 @@ with app:
     # Page controls at bottom
     annotate_current_page_bottom.submit(
         modify_existing_page_redactions, inputs = [annotator, annotate_current_page_bottom, annotate_previous_page, all_image_annotations_state], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page]).\
-        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page])
     annotation_last_page_button_bottom.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
-        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page])
     annotation_next_page_button_bottom.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
-        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page])
-    # Upload previous files for modifying redactions
-    upload_previous_review_file_btn.click(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox]).\
-        then(fn = prepare_image_or_pdf, inputs=[output_review_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state, prepare_for_review_bool], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state]).\
-        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page])
     ###
     # TABULAR DATA REDACTION
     ###
     in_data_files.upload(fn=put_columns_in_df, inputs=[in_data_files], outputs=[in_colnames, in_excel_sheets]).\
-                  then(fn=get_input_file_names, inputs=[in_data_files], outputs=[data_file_name_textbox])
     tabular_data_redact_btn.click(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, first_loop_state], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state], api_name="redact_data")
@@ -370,7 +404,6 @@ with app:
     in_allow_list.change(fn=custom_regex_load, inputs=[in_allow_list], outputs=[in_allow_list_text, in_allow_list_state])
     in_deny_list.change(fn=custom_regex_load, inputs=[in_deny_list, in_deny_list_text_in], outputs=[in_deny_list_text, in_deny_list_state])
     in_fully_redacted_list.change(fn=custom_regex_load, inputs=[in_fully_redacted_list, in_fully_redacted_text_in], outputs=[in_fully_redacted_list_text, in_fully_redacted_list_state])
     ###
@@ -403,14 +436,14 @@ with app:
     # User submitted feedback for data redactions
     data_callback = CSVLogger_custom(dataset_file_name=log_file_name)
-    data_callback.setup([data_feedback_radio, data_further_details_text, data_file_name_textbox], feedback_logs_folder)
-    data_submit_feedback_btn.click(lambda *args: data_callback.flag(list(args)), [data_feedback_radio, data_further_details_text, data_file_name_textbox], None, preprocess=False).\
     then(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[data_further_details_text])
     # Log processing time/token usage when making a query
     usage_callback = CSVLogger_custom(dataset_file_name=log_file_name)
-    usage_callback.setup([session_hash_textbox, doc_file_name_no_extension_textbox, data_file_name_textbox, estimated_time_taken_number, textract_metadata_textbox, pii_identification_method_drop, comprehend_query_number], usage_logs_folder)
-    latest_file_completed_text.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, doc_file_name_no_extension_textbox, data_file_name_textbox, estimated_time_taken_number, textract_metadata_textbox, pii_identification_method_drop, comprehend_query_number], None, preprocess=False).\
     then(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
 # Get some environment variables and Launch the Gradio app

 from tools.aws_functions import upload_file_to_s3, download_file_from_s3, RUN_AWS_FUNCTIONS, bucket_name
 from tools.file_redaction import choose_and_run_redactor
 from tools.file_conversion import prepare_image_or_pdf, get_input_file_names
+from tools.redaction_review import apply_redactions, modify_existing_page_redactions, decrease_page, increase_page, update_annotator, update_zoom, update_entities_df, df_select_callback
 from tools.data_anonymise import anonymise_data_files
 from tools.auth import authenticate_user
 from tools.load_spacy_model_custom_recognisers import custom_entities
 chosen_comprehend_entities.extend(custom_entities)
 full_comprehend_entity_list.extend(custom_entities)
+chosen_redact_entities = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", "CUSTOM"]
 full_entity_list = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", 'CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'IBAN_CODE', 'IP_ADDRESS', 'NRP', 'LOCATION', 'MEDICAL_LICENSE', 'URL', 'UK_NHS', 'CUSTOM']
     all_line_level_ocr_results_df_state = gr.State(pd.DataFrame())
     all_decision_process_table_state = gr.State(pd.DataFrame())
     session_hash_state = gr.State()
     s3_output_folder_state = gr.State()
     doc_full_file_name_textbox = gr.Textbox(label = "doc_full_file_name_textbox", value="", visible=False)
     doc_file_name_no_extension_textbox = gr.Textbox(label = "doc_full_file_name_textbox", value="", visible=False)
     doc_file_name_with_extension_textbox = gr.Textbox(label = "doc_file_name_with_extension_textbox", value="", visible=False)
+    doc_file_name_textbox_list = gr.Dropdown(label = "doc_file_name_textbox_list", value="", allow_custom_value=True,visible=False)
+    data_full_file_name_textbox = gr.Textbox(label = "data_full_file_name_textbox", value="", visible=False)
+    data_file_name_no_extension_textbox = gr.Textbox(label = "data_full_file_name_textbox", value="", visible=False)
+    data_file_name_with_extension_textbox = gr.Textbox(label = "data_file_name_with_extension_textbox", value="", visible=False)
+    data_file_name_textbox_list = gr.Dropdown(label = "data_file_name_textbox_list", value="", allow_custom_value=True,visible=False)
     estimated_time_taken_number = gr.Number(label = "estimated_time_taken_number", value=0.0, precision=1, visible=False) # This keeps track of the time taken to redact files for logging purposes.
     annotate_previous_page = gr.Number(value=0, label="Previous page", precision=0, visible=False) # Keeps track of the last page that the annotator was on
     default_deny_list_file_name = "default_deny_list.csv"
     default_deny_list_loc = output_folder + "/" + default_deny_list_file_name
+    in_deny_list_state = gr.State([])
     in_deny_list_text_in = gr.Textbox(value="Deny list", visible=False)
     fully_redacted_list_file_name = "default_fully_redacted_list.csv"
     fully_redacted_list_loc = output_folder + "/" + fully_redacted_list_file_name
+    in_fully_redacted_list_state = gr.State([])
     in_fully_redacted_text_in = gr.Textbox(value="Fully redacted page list", visible=False)
     # S3 settings for default allow list load
     s3_default_allow_list_file = gr.Textbox(label = "Default allow list file", value=default_allow_list_file_name, visible=False)
     default_allow_list_output_folder_location = gr.Textbox(label = "Output default allow list location", value=default_allow_list_loc, visible=False)
+    # Base dataframe for recognisers that is not modified subsequent to load
+    recogniser_entity_dataframe_base = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]}), col_count=2, type="pandas", visible=False)
     ###
     # UI DESIGN
     ###
     Redact personally identifiable information (PII) from documents (pdf, images), open text, or tabular data (xlsx/csv/parquet). Documents/images can be redacted using 'Quick' image analysis that works fine for typed text, but not handwriting/signatures. On the Redaction settings tab, choose 'Complex image analysis' OCR using AWS Textract (if you are using AWS) to redact these more complex elements (this service has a cost). Additionally you can choose the method for PII identification. 'Local' gives quick, lower quality results, AWS Comprehend gives better results but has a cost.
+    Review suggested redactions on the 'Review redactions' tab using a point and click visual interface. Upload a pdf alone to start from scratch, or upload the original pdf alongside a '...review_file.csv' to continue a previous redaction/review task.
+    See the 'Redaction settings' tab to choose which pages to redact, the type of information to redact (e.g. people, places), or terms to exclude from redaction. Please see the [User Guide](https://github.com/seanpedrick-case/doc_redaction/blob/main/README.md) for a walkthrough on how to use this and all other features in the app. The app accepts a maximum file size of 100mb. Please consider giving feedback for the quality of the answers underneath the redact buttons when the option appears, this will help to improve the app in future.
     NOTE: In testing the app seems to find about 60% of personal information on a given (typed) page of text. It is essential that all outputs are checked **by a human** to ensure that all personal information has been removed.""")
     # Object annotation
     with gr.Tab("Review redactions", id="tab_object_annotation"):
+        with gr.Accordion(label = "Review redaction file", open=True):
             output_review_files = gr.File(label="Review output files", file_count='multiple')
             upload_previous_review_file_btn = gr.Button("Review previously created redaction file (upload original PDF and ...review_file.csv)")
         annotation_button_apply = gr.Button("Apply revised redactions", variant="primary")
+        with gr.Row():
+            with gr.Column(scale=4):
+                zoom_str = str(annotator_zoom_number) + '%'
+                annotator = image_annotator(
+                    label="Modify redaction boxes",
+                    label_list=["Redaction"],
+                    label_colors=[(0, 0, 0)],
+                    show_label=False,
+                    height=zoom_str,
+                    width=zoom_str,
+                    box_min_size=1,
+                    box_selected_thickness=2,
+                    handle_size=4,
+                    sources=None,#["upload"],
+                    show_clear_button=False,
+                    show_share_button=False,
+                    show_remove_button=False,
+                    handles_cursor=True,
+                    interactive=False
+                )
+            with gr.Column(scale=1):
+                recogniser_entity_dropdown = gr.Dropdown(value="ALL", allow_custom_value=True)
+                recogniser_entity_dataframe = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]}), col_count=2, type="pandas")
         with gr.Row():
             annotation_last_page_button_bottom = gr.Button("Previous page", scale = 3)
     ###
     # PDF/IMAGE REDACTION
     ###
+    in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list])
     document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator]).\
     then(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, first_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state], api_name="prepare_doc").\
     then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
                     outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number], api_name="redact_doc").\
+                    then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
     # If the app has completed a batch of pages, it will run this until the end of all pages in the document
     current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
                     outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number]).\
+                    then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
     # If a file has been completed, the function will continue onto the next document
+    latest_file_completed_text.change(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
                     then(fn=reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title])
     ###
     # REVIEW PDF REDACTIONS
     ###
+    # Upload previous files for modifying redactions
+    upload_previous_review_file_btn.click(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list]).\
+        then(fn = prepare_image_or_pdf, inputs=[output_review_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state, prepare_for_review_bool], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state]).\
+        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
     # Page controls at top
     annotate_current_page.submit(
         modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
+        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
     annotation_last_page_button.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
+        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
     annotation_next_page_button.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
+        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
     # Zoom in and out on annotator
     annotate_zoom_in.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
     annotate_zoom_out.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
         then(update_zoom, inputs=[annotator_zoom_number, annotate_current_page, zoom_false_bool], outputs=[annotator_zoom_number, annotate_current_page])
+    annotator_zoom_number.change(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
     clear_all_redactions_on_page_btn.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, clear_all_page_redactions], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
+        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
     #annotation_button_get.click(get_boxes_json, annotator, json_boxes)
     annotation_button_apply.click(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output], scroll_to_output=True)
     # Page controls at bottom
     annotate_current_page_bottom.submit(
         modify_existing_page_redactions, inputs = [annotator, annotate_current_page_bottom, annotate_previous_page, all_image_annotations_state], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page]).\
+        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
     annotation_last_page_button_bottom.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
+        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
     annotation_next_page_button_bottom.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
+        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
+    # Review side bar controls
+    recogniser_entity_dropdown.select(update_entities_df, inputs=[recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs=[recogniser_entity_dataframe])
+    recogniser_entity_dataframe.select(df_select_callback, inputs=[recogniser_entity_dataframe], outputs=annotate_current_page).\
+    then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
+        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
     ###
     # TABULAR DATA REDACTION
     ###
     # get_input_file_names returns four values (joined file names, name with extension, full file name, list of names),
     # so exactly four output components are wired here, in the same order as the document upload chain.
     in_data_files.upload(fn=put_columns_in_df, inputs=[in_data_files], outputs=[in_colnames, in_excel_sheets]).\
         then(fn=get_input_file_names, inputs=[in_data_files], outputs=[data_file_name_no_extension_textbox, data_file_name_with_extension_textbox, data_full_file_name_textbox, data_file_name_textbox_list])
     tabular_data_redact_btn.click(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, first_loop_state], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state], api_name="redact_data")
     in_allow_list.change(fn=custom_regex_load, inputs=[in_allow_list], outputs=[in_allow_list_text, in_allow_list_state])
     in_deny_list.change(fn=custom_regex_load, inputs=[in_deny_list, in_deny_list_text_in], outputs=[in_deny_list_text, in_deny_list_state])
     in_fully_redacted_list.change(fn=custom_regex_load, inputs=[in_fully_redacted_list, in_fully_redacted_text_in], outputs=[in_fully_redacted_list_text, in_fully_redacted_list_state])
     ###
     # User submitted feedback for data redactions
     data_callback = CSVLogger_custom(dataset_file_name=log_file_name)
+    data_callback.setup([data_feedback_radio, data_further_details_text, data_full_file_name_textbox], feedback_logs_folder)
+    data_submit_feedback_btn.click(lambda *args: data_callback.flag(list(args)), [data_feedback_radio, data_further_details_text, data_full_file_name_textbox], None, preprocess=False).\
     then(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[data_further_details_text])
     # Log processing time/token usage when making a query
     usage_callback = CSVLogger_custom(dataset_file_name=log_file_name)
+    usage_callback.setup([session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, estimated_time_taken_number, textract_metadata_textbox, pii_identification_method_drop, comprehend_query_number], usage_logs_folder)
+    latest_file_completed_text.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, estimated_time_taken_number, textract_metadata_textbox, pii_identification_method_drop, comprehend_query_number], None, preprocess=False).\
     then(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
 # Get some environment variables and Launch the Gradio app

tools/file_conversion.py CHANGED Viewed

@@ -8,6 +8,8 @@ import time
 import json
 import pymupdf
 import pandas as pd
 from tqdm import tqdm
 from gradio import Progress
 from typing import List, Optional
@@ -58,10 +60,10 @@ def process_single_page(pdf_path: str, page_num: int, image_dpi: float, output_d
         os.makedirs(os.path.dirname(out_path), exist_ok=True)
         if os.path.exists(out_path):
-            print(f"Loading existing image for page {page_num + 1}")
             image = Image.open(out_path)
         else:
-            print(f"Converting page {page_num + 1}")
             image_l = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1,
                                         dpi=image_dpi, use_cropbox=True, use_pdftocairo=False)
             image = image_l[0]
@@ -181,7 +183,7 @@ def process_file(file_path:str, prepare_for_review:bool=False):
     return img_object
-def get_input_file_names(file_input):
     '''
     Get list of input files to report to logs.
     '''
@@ -210,14 +212,123 @@ def get_input_file_names(file_input):
         file_extension = os.path.splitext(file_path)[1].lower()
         # Check if the file is an image type
-        if file_extension in ['.jpg', '.jpeg', '.png', '.pdf', '.xlsx', '.csv', '.parquet']:
             all_relevant_files.append(file_path_without_ext)
             file_name_with_extension = file_path_without_ext + file_extension
             full_file_name = file_path
     all_relevant_files_str = ", ".join(all_relevant_files)
-    return all_relevant_files_str, file_name_with_extension, full_file_name
 def prepare_image_or_pdf(
     file_paths: List[str],
@@ -230,6 +341,7 @@ def prepare_image_or_pdf(
     current_loop_page_number:int=0,
     all_annotations_object:List = [],
     prepare_for_review:bool = False,
     progress: Progress = Progress(track_tqdm=True)
 ) -> tuple[List[str], List[str]]:
     """
@@ -241,15 +353,16 @@ def prepare_image_or_pdf(
     Args:
         file_paths (List[str]): List of file paths to process.
         in_redact_method (str): The redaction method to use.
-        in_allow_list (Optional[List[List[str]]]): List of allowed terms for redaction.
-        latest_file_completed (int): Index of the last completed file.
-        out_message (List[str]): List to store output messages.
-        first_loop_state (bool): Flag indicating if this is the first iteration.
-        number_of_pages (int): integer indicating the number of pages in the document
-        current_loop_page_number (int): Current number of loop
-        all_annotations_object(List of annotation objects): All annotations for current document
-        prepare_for_review(bool): Is this preparation step preparing pdfs and json files to review current redactions?
-        progress (Progress): Progress tracker for the operation.
     Returns:
@@ -259,6 +372,9 @@ def prepare_image_or_pdf(
     tic = time.perf_counter()
     json_from_csv = False
     # If this is the first time around, set variables to 0/blank
     if first_loop_state==True:
         print("first_loop_state is True")
@@ -329,6 +445,9 @@ def prepare_image_or_pdf(
     # Loop through files to load in
     for file in file_paths_loop:
         if isinstance(file, str):
             file_path = file
         else:
@@ -342,15 +461,45 @@ def prepare_image_or_pdf(
         file_extension = os.path.splitext(file_path)[1].lower()
         # Check if the file is an image type and the user selected text ocr option
-        if file_extension in ['.jpg', '.jpeg', '.png'] and in_redact_method == text_ocr_option:
             in_redact_method = tesseract_ocr_option
-        if file_extension in ['.csv']:
             review_file_csv = read_file(file)
             all_annotations_object = convert_pandas_df_to_review_json(review_file_csv)
             json_from_csv = True
         # If the file name ends with redactions.json, assume it is an annotations object and overwrite the current variable
         if (file_extension in ['.json']) | (json_from_csv == True):
@@ -376,7 +525,7 @@ def prepare_image_or_pdf(
             # If you have an annotations object from the above code
             if all_annotations_object:
-                #print("out_annotations_object found:", all_annotations_object)
                 # Get list of page numbers
                 image_file_paths_pages = [
@@ -388,9 +537,27 @@ def prepare_image_or_pdf(
                 # If PDF pages have been converted to image files, replace the current image paths in the json to this.
                 if image_file_paths:
-                    for i, annotation in enumerate(all_annotations_object):
-                        annotation_page_number = int(re.search(r'_(\d+)\.png$', annotation["image"]).group(1))
                         #print("Annotation page number:", annotation_page_number)
                         # Check if the annotation page number exists in the image file paths pages
@@ -402,19 +569,30 @@ def prepare_image_or_pdf(
                         else:
                             print("Page", annotation_page_number, "image file not found.")
-                    #print("all_annotations_object:", all_annotations_object)
                 # Write the response to a JSON file in output folder
                 out_folder = output_folder + file_path_without_ext + ".json"
                 with open(out_folder, 'w') as json_file:
                     json.dump(all_annotations_object, json_file, indent=4)  # indent=4 makes the JSON file pretty-printed
                 continue
-        # Must be a pdf or image at this point
         else:
-            # Convert pdf/image file to correct format for redaction
             if in_redact_method == tesseract_ocr_option or in_redact_method == textract_option:
                 if is_pdf_or_image(file_path) == False:
                     out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
@@ -425,25 +603,11 @@ def prepare_image_or_pdf(
                 if is_pdf(file_path) == False:
                     out_message = "Please upload a PDF file for text analysis."
                     print(out_message)
-                    return out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object
-            converted_file_path = file_path # Pikepdf works with the basic unconverted pdf file
-            image_file_path = process_file(file_path, prepare_for_review)
-        converted_file_paths.append(converted_file_path)
-        image_file_paths.extend(image_file_path)
-        # If a pdf, load as a pymupdf document
-        if is_pdf(file_path):
-            pymupdf_doc = pymupdf.open(file_path)
-        elif is_pdf_or_image(file_path):  # Alternatively, if it's an image
-            # Convert image to a pymupdf document
-            pymupdf_doc = pymupdf.open()  # Create a new empty document
-            img = Image.open(file_path)  # Open the image file
-            rect = pymupdf.Rect(0, 0, img.width, img.height)  # Create a rectangle for the image
-            page = pymupdf_doc.new_page(width=img.width, height=img.height)  # Add a new page
-            page.insert_image(rect, filename=file_path)  # Insert the image into the page
         toc = time.perf_counter()
         out_time = f"File '{file_path_without_ext}' prepared in {toc - tic:0.1f} seconds."
@@ -453,11 +617,12 @@ def prepare_image_or_pdf(
         out_message.append(out_time)
         out_message_out = '\n'.join(out_message)
-    if prepare_for_review == False:
-        number_of_pages = len(image_file_paths)
-    else:
-        number_of_pages = len(all_annotations_object)
     return out_message_out, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object
@@ -498,13 +663,17 @@ def convert_review_json_to_pandas_df(data:List[dict]) -> pd.DataFrame:
         match = re.search(r'_(\d+)\.png$', image_path)
         if match:
             number = match.group(1)  # Extract the number
-            print(number)  # Output: 0
             reported_number = int(number) + 1
         else:
             print("No number found before .png")
         for box in entry["boxes"]:
-            data_to_add = {"image": image_path, "page":reported_number, **box}
             #print("data_to_add:", data_to_add)
             flattened_data.append(data_to_add)

 import json
 import pymupdf
 import pandas as pd
+from pymupdf import Rect
+from fitz import Page
 from tqdm import tqdm
 from gradio import Progress
 from typing import List, Optional
         os.makedirs(os.path.dirname(out_path), exist_ok=True)
         if os.path.exists(out_path):
+            #print(f"Loading existing image for page {page_num + 1}")
             image = Image.open(out_path)
         else:
+            #print(f"Converting page {page_num + 1}")
             image_l = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1,
                                         dpi=image_dpi, use_cropbox=True, use_pdftocairo=False)
             image = image_l[0]
     return img_object
+def get_input_file_names(file_input:List[str]):
     '''
     Get list of input files to report to logs.
     '''
         file_extension = os.path.splitext(file_path)[1].lower()
         # Check if the file is an image type
+        if (file_extension in ['.jpg', '.jpeg', '.png', '.pdf', '.xlsx', '.csv', '.parquet']) & ("review_file" not in file_path_without_ext):
             all_relevant_files.append(file_path_without_ext)
             file_name_with_extension = file_path_without_ext + file_extension
             full_file_name = file_path
     all_relevant_files_str = ", ".join(all_relevant_files)
+    #print("all_relevant_files_str in input_file_names", all_relevant_files_str)
+    #print("all_relevant_files in input_file_names", all_relevant_files)
+    return all_relevant_files_str, file_name_with_extension, full_file_name, all_relevant_files
def convert_color_to_range_0_1(color):
    '''
    Scale every component of a colour by 1/255 and return the result as a tuple.
    '''
    scaled_channels = [channel / 255 for channel in color]
    return tuple(scaled_channels)
def redact_single_box(pymupdf_page:Page, pymupdf_rect:Rect, img_annotation_box:dict, custom_colours:bool=False):
    '''
    Add a redaction annotation for one box on a pymupdf page and draw a filled rectangle over it.

    The redaction annotation covers a vertically shrunken copy of the box so that
    text on adjacent lines is not deleted; the filled rectangle covers the full box.

    Args:
        pymupdf_page (Page): Page that receives the redaction annotation and the drawn shape.
        pymupdf_rect (Rect): Box to cover, indexed as (x1, y1, x2, y2).
        img_annotation_box (dict): Annotation box. Its "color" entry is read only when custom_colours is True.
        custom_colours (bool, optional): If True, fill with the box's own colour instead of black. Defaults to False.
    '''
    pymupdf_x1 = pymupdf_rect[0]
    pymupdf_y1 = pymupdf_rect[1]
    pymupdf_x2 = pymupdf_rect[2]
    pymupdf_y2 = pymupdf_rect[3]

    # Area that text is actually removed from: inset by 2 units from both horizontal edges of the box
    redact_bottom_y = pymupdf_y1 + 2
    redact_top_y = pymupdf_y2 - 2

    # If the inset leaves less than 1 unit of height, fall back to a 2-unit band centred on the box
    if (redact_top_y - redact_bottom_y) < 1:
        middle_y = (pymupdf_y1 + pymupdf_y2) / 2
        redact_bottom_y = middle_y - 1
        redact_top_y = middle_y + 1

    # Slightly smaller than the outside box, so that text on adjacent lines is not deleted
    rect_small_pixel_height = Rect(pymupdf_x1, redact_bottom_y, pymupdf_x2, redact_top_y)
    pymupdf_page.add_redact_annot(rect_small_pixel_height)

    # Work out the fill colour: black unless custom colours were requested
    out_colour = (0, 0, 0)
    if custom_colours == True:
        box_colour = img_annotation_box["color"]
        # Any channel above 1 means the colour is on a 0-255 scale. Checking every channel
        # (not just the first) catches colours such as (0, 255, 0).
        if max(box_colour) > 1:
            out_colour = convert_color_to_range_0_1(box_colour)
        else:
            out_colour = box_colour

    # Draw the filled box over the whole rect
    shape = pymupdf_page.new_shape()
    shape.draw_rect(pymupdf_rect)
    shape.finish(color=out_colour, fill=out_colour)
    shape.commit()
def convert_pymupdf_to_image_coords(pymupdf_page, x1, y1, x2, y2, image: Image):
    '''
    Converts coordinates from pymupdf page space to pixel coordinates on the page image.

    Each coordinate is multiplied by (image size / page rect size) on its axis.
    The earlier version also multiplied and divided by the mediabox size; those
    factors cancel, so the mediabox is no longer read.

    Args:
        pymupdf_page: Page whose rect.width and rect.height define the source coordinate space.
        x1, y1, x2, y2: Box coordinates in page space.
        image: Either an object exposing a (width, height) `size` attribute, or a
            file path string to an image, which is opened only to read its size.

    Returns:
        tuple: (x1, y1, x2, y2) scaled to image pixel coordinates.
    '''
    # Callers in this module keep page images as file paths, so accept a path as well as an opened image
    if isinstance(image, str):
        with Image.open(image) as opened_image:
            image_page_width, image_page_height = opened_image.size
    else:
        image_page_width, image_page_height = image.size

    page_rect = pymupdf_page.rect
    scale_width = image_page_width / page_rect.width
    scale_height = image_page_height / page_rect.height

    x1_image = x1 * scale_width
    y1_image = y1 * scale_height
    x2_image = x2 * scale_width
    y2_image = y2 * scale_height

    return x1_image, y1_image, x2_image, y2_image
def redact_whole_pymupdf_page(rect_height, rect_width, image, page, custom_colours, border = 5):
    '''
    Redact an entire pymupdf page, leaving a small unredacted border, and return the matching image annotation box.

    Args:
        rect_height: Height of the page rect in page coordinates.
        rect_width: Width of the page rect in page coordinates.
        image: Page image, passed to convert_pymupdf_to_image_coords to size the annotation box.
        page: pymupdf page to redact.
        custom_colours: Passed through to redact_single_box. The box colour set here is black.
        border (optional): Margin left unredacted on every side, in page units. Defaults to 5.

    Returns:
        dict: Annotation box with "xmin", "ymin", "xmax", "ymax" in image coordinates,
        plus "color" (0,0,0) and "label" "Whole page".
    '''
    # Page-space corners of the redaction, inset by the border on every side.
    # pymupdf page coordinates start at the top-left, so (x1, y1) is the top-left corner.
    whole_page_x1, whole_page_y1 = 0 + border, 0 + border
    whole_page_x2, whole_page_y2 = rect_width - border, rect_height - border  # Bottom-right corner

    whole_page_image_x1, whole_page_image_y1, whole_page_image_x2, whole_page_image_y2 = convert_pymupdf_to_image_coords(page, whole_page_x1, whole_page_y1, whole_page_x2, whole_page_y2, image)

    # Rect used to redact the page itself
    whole_page_rect = Rect(whole_page_x1, whole_page_y1, whole_page_x2, whole_page_y2)

    # Annotation box describing the same area in image coordinates
    whole_page_img_annotation_box = {
        "xmin": whole_page_image_x1,
        "ymin": whole_page_image_y1,
        "xmax": whole_page_image_x2,
        "ymax": whole_page_image_y2,
        "color": (0,0,0),
        "label": "Whole page",
    }

    redact_single_box(page, whole_page_rect, whole_page_img_annotation_box, custom_colours)

    return whole_page_img_annotation_box
 def prepare_image_or_pdf(
     file_paths: List[str],
     current_loop_page_number:int=0,
     all_annotations_object:List = [],
     prepare_for_review:bool = False,
+    in_fully_redacted_list:List[int]=[],
     progress: Progress = Progress(track_tqdm=True)
 ) -> tuple[List[str], List[str]]:
     """
     Args:
         file_paths (List[str]): List of file paths to process.
         in_redact_method (str): The redaction method to use.
+        in_allow_list (optional, Optional[List[List[str]]]): List of allowed terms for redaction.
+        latest_file_completed (optional, int): Index of the last completed file.
+        out_message (optional, List[str]): List to store output messages.
+        first_loop_state (optional, bool): Flag indicating if this is the first iteration.
+        number_of_pages (optional, int): integer indicating the number of pages in the document
+        current_loop_page_number (optional, int): Current number of loop
+        all_annotations_object(optional, List of annotation objects): All annotations for current document
+        prepare_for_review(optional, bool): Is this preparation step preparing pdfs and json files to review current redactions?
+        in_fully_redacted_list(optional, List of int): A list of pages to fully redact
+        progress (optional, Progress): Progress tracker for the operation.
     Returns:
     tic = time.perf_counter()
     json_from_csv = False
+    if isinstance(in_fully_redacted_list, pd.DataFrame):
+        in_fully_redacted_list = in_fully_redacted_list.iloc[:,0].tolist()
     # If this is the first time around, set variables to 0/blank
     if first_loop_state==True:
         print("first_loop_state is True")
     # Loop through files to load in
     for file in file_paths_loop:
+        converted_file_path = []
+        image_file_path = []
         if isinstance(file, str):
             file_path = file
         else:
         file_extension = os.path.splitext(file_path)[1].lower()
+        # If a pdf, load as a pymupdf document
+        if is_pdf(file_path):
+            pymupdf_doc = pymupdf.open(file_path)
+            converted_file_path = file_path
+            image_file_paths = process_file(file_path, prepare_for_review)
+            # Create base version of the annotation object that doesn't have any annotations in it
+            if not all_annotations_object:
+                all_annotations_object = []
+                for image_path in image_file_paths:
+                    annotation = {}
+                    annotation["image"] = image_path
+                    all_annotations_object.append(annotation)
+                #print("all_annotations_object:", all_annotations_object)
+        elif is_pdf_or_image(file_path):  # Alternatively, if it's an image
+            # Convert image to a pymupdf document
+            pymupdf_doc = pymupdf.open()  # Create a new empty document
+            img = Image.open(file_path)  # Open the image file
+            rect = pymupdf.Rect(0, 0, img.width, img.height)  # Create a rectangle for the image
+            page = pymupdf_doc.new_page(width=img.width, height=img.height)  # Add a new page
+            page.insert_image(rect, filename=file_path)  # Insert the image into the page
         # Check if the file is an image type and the user selected text ocr option
+        elif file_extension in ['.jpg', '.jpeg', '.png'] and in_redact_method == text_ocr_option:
             in_redact_method = tesseract_ocr_option
+        elif file_extension in ['.csv']:
             review_file_csv = read_file(file)
             all_annotations_object = convert_pandas_df_to_review_json(review_file_csv)
             json_from_csv = True
+            print("Converted CSV review file to json")
         # If the file name ends with redactions.json, assume it is an annotations object and overwrite the current variable
         if (file_extension in ['.json']) | (json_from_csv == True):
             # If you have an annotations object from the above code
             if all_annotations_object:
+                #print("out_annotations_object before reloading images:", all_annotations_object)
                 # Get list of page numbers
                 image_file_paths_pages = [
                 # If PDF pages have been converted to image files, replace the current image paths in the json to this.
                 if image_file_paths:
+                    #print("Image file paths found")
+                    #print("Image_file_paths:", image_file_paths)
+                    #for i, annotation in enumerate(all_annotations_object):
+                    for i, image_file_path in enumerate(image_file_paths):
+                        if i < len(all_annotations_object):
+                            annotation = all_annotations_object[i]
+                        else:
+                            annotation = {}
+                            all_annotations_object.append(annotation)
+                        #print("annotation:", annotation, "for page:", str(i))
+                        if not annotation:
+                            annotation = {"image":"", "boxes": []}
+                            annotation_page_number = int(re.search(r'_(\d+)\.png$', image_file_path).group(1))
+                        else:
+                            annotation_page_number = int(re.search(r'_(\d+)\.png$', annotation["image"]).group(1))
                         #print("Annotation page number:", annotation_page_number)
                         # Check if the annotation page number exists in the image file paths pages
                         else:
                             print("Page", annotation_page_number, "image file not found.")
+                        all_annotations_object[i] = annotation
+                    #print("all_annotations_object at end of json/csv load part:", all_annotations_object)
+                # Get list of pages that are to be fully redacted and redact them
+                if in_fully_redacted_list:
+                    print("Redacting whole pages")
+                    for i, image in enumerate(image_file_paths):
+                        page = pymupdf_doc.load_page(i)
+                        rect_height = page.rect.height
+                        rect_width = page.rect.width
+                        whole_page_img_annotation_box = redact_whole_pymupdf_page(rect_height, rect_width, image, page, custom_colours = False, border = 5)
+                        all_annotations_object.append(whole_page_img_annotation_box)
                 # Write the response to a JSON file in output folder
                 out_folder = output_folder + file_path_without_ext + ".json"
                 with open(out_folder, 'w') as json_file:
                     json.dump(all_annotations_object, json_file, indent=4)  # indent=4 makes the JSON file pretty-printed
                 continue
+        # Must be something else, return with error message
         else:
             if in_redact_method == tesseract_ocr_option or in_redact_method == textract_option:
                 if is_pdf_or_image(file_path) == False:
                     out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
                 if is_pdf(file_path) == False:
                     out_message = "Please upload a PDF file for text analysis."
                     print(out_message)
+                    return out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object
+        converted_file_paths.append(converted_file_path)
+        image_file_paths.extend(image_file_path)
         toc = time.perf_counter()
         out_time = f"File '{file_path_without_ext}' prepared in {toc - tic:0.1f} seconds."
         out_message.append(out_time)
         out_message_out = '\n'.join(out_message)
+    #if prepare_for_review == False:
+    number_of_pages = len(image_file_paths)
+    #else:
+    #    number_of_pages = len(all_annotations_object)
+    #print("all_annotations_object at end:", all_annotations_object)
     return out_message_out, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object
         match = re.search(r'_(\d+)\.png$', image_path)
         if match:
             number = match.group(1)  # Extract the number
+            #print(number)  # Output: 0
             reported_number = int(number) + 1
         else:
             print("No number found before .png")
+        # Check if 'boxes' is in the entry, if not, add an empty list
+        if 'boxes' not in entry:
+            entry['boxes'] = []
         for box in entry["boxes"]:
+            data_to_add = {"image": image_path, "page": reported_number, **box}
             #print("data_to_add:", data_to_add)
             flattened_data.append(data_to_add)

tools/file_redaction.py CHANGED Viewed

@@ -18,7 +18,7 @@ from pdfminer.layout import LTTextContainer, LTChar, LTTextLine, LTTextLineHoriz
 from pikepdf import Pdf, Dictionary, Name
 import pymupdf
 from pymupdf import Rect
-from fitz import Document, Page
 import gradio as gr
 from gradio import Progress
 from collections import defaultdict  # For efficient grouping
@@ -26,7 +26,7 @@ from collections import defaultdict  # For efficient grouping
 from presidio_analyzer import RecognizerResult
 from tools.aws_functions import RUN_AWS_FUNCTIONS
 from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult
-from tools.file_conversion import process_file, image_dpi, convert_review_json_to_pandas_df
 from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities, custom_recogniser, custom_word_list_recogniser
 from tools.helper_functions import get_file_path_end, output_folder, clean_unicode_text, get_or_create_env_var, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
 from tools.file_conversion import process_file, is_pdf, is_pdf_or_image
@@ -69,8 +69,8 @@ def choose_and_run_redactor(file_paths:List[str],
  chosen_redact_comprehend_entities:List[str],
  in_redact_method:str,
  in_allow_list:List[List[str]]=None,
- in_deny_list:List[List[str]]=None,
- in_fully_redacted_list:List[List[str]]=None,
  latest_file_completed:int=0,
  out_message:list=[],
  out_file_paths:list=[],
@@ -102,8 +102,8 @@ def choose_and_run_redactor(file_paths:List[str],
     - chosen_redact_comprehend_entities (List[str]): A list of entity types to redact from files, chosen from the official list from AWS Comprehend service
     - in_redact_method (str): The method to use for redaction.
     - in_allow_list (List[List[str]], optional): A list of allowed terms for redaction. Defaults to None.
-    - in_deny_list (List[List[str]], optional): A list of allowed terms for redaction. Defaults to None.
-    - in_fully_redacted_list (List[List[str]], optional): A list of allowed terms for redaction. Defaults to None.
     - latest_file_completed (int, optional): The index of the last completed file. Defaults to 0.
     - out_message (list, optional): A list to store output messages. Defaults to an empty list.
     - out_file_paths (list, optional): A list to store paths to the output files. Defaults to an empty list.
@@ -131,6 +131,15 @@ def choose_and_run_redactor(file_paths:List[str],
     tic = time.perf_counter()
     all_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []
     # If this is the first time around, set variables to 0/blank
     if first_loop_state==True:
@@ -296,7 +305,9 @@ def choose_and_run_redactor(file_paths:List[str],
              pii_identification_method,
              comprehend_query_number,
              comprehend_client,
-             textract_client)
             # Save Textract request metadata (if exists)
             if new_request_metadata:
@@ -330,7 +341,9 @@ def choose_and_run_redactor(file_paths:List[str],
             pymupdf_doc,
             pii_identification_method,
             comprehend_query_number,
-            comprehend_client)
         else:
             out_message = "No redaction method selected"
@@ -378,14 +391,19 @@ def choose_and_run_redactor(file_paths:List[str],
                     json.dump(annotations_all_pages, f)
                 log_files_output_paths.append(out_annotation_file_path)
-                print("Saving annotations to CSV")
                 # Convert json to csv and also save this
                 review_df = convert_review_json_to_pandas_df(annotations_all_pages)
                 out_review_file_file_path = out_image_file_path + '_review_file.csv'
                 review_df.to_csv(out_review_file_file_path, index=None)
                 out_file_paths.append(out_review_file_file_path)
             except Exception as e:
                 print("Could not save annotations to json file:", e)
@@ -522,42 +540,7 @@ def convert_image_coords_to_pymupdf(pymupdf_page, annot:CustomImageRecognizerRes
     return x1, new_y1, x2, new_y2
-def convert_pymupdf_to_image_coords(pymupdf_page, x1, y1, x2, y2, image: Image):
-    '''
-    Converts coordinates from pymupdf format to image coordinates,
-    accounting for mediabox dimensions.
-    '''
-    rect_height = pymupdf_page.rect.height
-    rect_width = pymupdf_page.rect.width
-    # Get mediabox dimensions
-    mediabox = pymupdf_page.mediabox
-    mediabox_width = mediabox.width
-    mediabox_height = mediabox.height
-    image_page_width, image_page_height = image.size
-    # Calculate scaling factors using mediabox dimensions
-    scale_width = image_page_width / mediabox_width
-    scale_height = image_page_height / mediabox_height
-    #print("scale_width:", scale_width)
-    #print("scale_height:", scale_height)
-    rect_to_mediabox_x_scale = mediabox_width / rect_width
-    rect_to_mediabox_y_scale = mediabox_height / rect_height
-    #print("rect_to_mediabox_x_scale:", rect_to_mediabox_x_scale)
-    #print("rect_to_mediabox_y_scale:", rect_to_mediabox_y_scale)
-    # Adjust coordinates based on scaling factors
-    x1_image = (x1 * scale_width) * rect_to_mediabox_x_scale
-    y1_image = (y1 * scale_height) * rect_to_mediabox_y_scale
-    x2_image = (x2 * scale_width) * rect_to_mediabox_x_scale
-    y2_image = (y2 * scale_height) * rect_to_mediabox_y_scale
-    return x1_image, y1_image, x2_image, y2_image
 def convert_gradio_annotation_coords_to_pymupdf(pymupdf_page:Page, annot:dict, image:Image):
     '''
@@ -594,49 +577,6 @@ def move_page_info(file_path: str) -> str:
     return new_file_path
-def convert_color_to_range_0_1(color):
-    return tuple(component / 255 for component in color)
-def redact_single_box(pymupdf_page:Page, pymupdf_rect:Rect, img_annotation_box:dict, custom_colours:bool=False):
-    pymupdf_x1 = pymupdf_rect[0]
-    pymupdf_y1 = pymupdf_rect[1]
-    pymupdf_x2 = pymupdf_rect[2]
-    pymupdf_y2 = pymupdf_rect[3]
-    # Calculate area to actually remove text from the pdf (different from black box size)
-    redact_bottom_y = pymupdf_y1 + 2
-    redact_top_y = pymupdf_y2 - 2
-    # Calculate the middle y value and set a small height if default values are too close together
-    if (redact_top_y - redact_bottom_y) < 1:
-        middle_y = (pymupdf_y1 + pymupdf_y2) / 2
-        redact_bottom_y = middle_y - 1
-        redact_top_y = middle_y + 1
-    #print("Rect:", rect)
-    rect_small_pixel_height = Rect(pymupdf_x1, redact_bottom_y, pymupdf_x2, redact_top_y)  # Slightly smaller than outside box
-    # Add the annotation to the middle of the character line, so that it doesn't delete text from adjacent lines
-    #page.add_redact_annot(rect)#rect_small_pixel_height)
-    pymupdf_page.add_redact_annot(rect_small_pixel_height)
-    # Set up drawing a black box over the whole rect
-    shape = pymupdf_page.new_shape()
-    shape.draw_rect(pymupdf_rect)
-    if custom_colours == True:
-        if img_annotation_box["color"][0] > 1:
-            out_colour = convert_color_to_range_0_1(img_annotation_box["color"])
-        else:
-            out_colour = img_annotation_box["color"]
-    else:
-        out_colour = (0,0,0)
-    shape.finish(color=out_colour, fill=out_colour)  # Black fill for the rectangle
-    #shape.finish(color=(0, 0, 0))  # Black fill for the rectangle
-    shape.commit()
 def redact_page_with_pymupdf(page:Page, annotations_on_page:dict, image:Image.Image=None, custom_colours:bool=False, redact_whole_page:bool=False):
     mediabox_height = page.mediabox[3] - page.mediabox[1]
@@ -732,28 +672,31 @@ def redact_page_with_pymupdf(page:Page, annotations_on_page:dict, image:Image.Im
     # If whole page is to be redacted, do that here
     if redact_whole_page == True:
-        # Small border to page that remains white
-        border = 5
-        # Define the coordinates for the Rect
-        whole_page_x1, whole_page_y1 = 0 + border, 0 + border  # Bottom-left corner
-        whole_page_x2, whole_page_y2 = rect_width - border, rect_height - border  # Top-right corner
-        whole_page_image_x1, whole_page_image_y1, whole_page_image_x2, whole_page_image_y2 = convert_pymupdf_to_image_coords(page, whole_page_x1, whole_page_y1, whole_page_x2, whole_page_y2, image)
-        # Create new image annotation element based on whole page coordinates
-        whole_page_rect = Rect(whole_page_x1, whole_page_y1, whole_page_x2, whole_page_y2)
-        # Write whole page annotation to annotation boxes
-        whole_page_img_annotation_box = {}
-        whole_page_img_annotation_box["xmin"] = whole_page_image_x1
-        whole_page_img_annotation_box["ymin"] = whole_page_image_y1
-        whole_page_img_annotation_box["xmax"] = whole_page_image_x2
-        whole_page_img_annotation_box["ymax"] = whole_page_image_y2
-        whole_page_img_annotation_box["color"] = (0,0,0)
-        whole_page_img_annotation_box["label"] = "Whole page"
-        redact_single_box(page, whole_page_rect, whole_page_img_annotation_box, custom_colours)
         all_image_annotation_boxes.append(whole_page_img_annotation_box)
     out_annotation_boxes = {
@@ -1058,11 +1001,20 @@ def redact_image_pdf(file_path:str,
     comprehend_query_number_new = 0
     # Update custom word list analyser object with any new words that have been added to the custom deny list
-    if custom_recogniser_word_list:
         nlp_analyser.registry.remove_recognizer("CUSTOM")
         new_custom_recogniser = custom_word_list_recogniser(custom_recogniser_word_list)
         nlp_analyser.registry.add_recognizer(new_custom_recogniser)
     image_analyser = CustomImageAnalyzerEngine(nlp_analyser)
@@ -1315,9 +1267,15 @@ def redact_image_pdf(file_path:str,
                 image_annotations = {"image": file_path, "boxes": all_image_annotations_boxes}
-            ## Apply annotations with pymupdf
             else:
-                pymupdf_page, image_annotations = redact_page_with_pymupdf(pymupdf_page, merged_redaction_bboxes, image)
             # Convert decision process to table
             decision_process_table = pd.DataFrame([{
@@ -1811,11 +1769,20 @@ def redact_text_pdf(
         return pymupdf_doc, all_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number
     # Update custom word list analyser object with any new words that have been added to the custom deny list
-    if custom_recogniser_word_list:
         nlp_analyser.registry.remove_recognizer("CUSTOM")
         new_custom_recogniser = custom_word_list_recogniser(custom_recogniser_word_list)
         nlp_analyser.registry.add_recognizer(new_custom_recogniser)
     tic = time.perf_counter()
     # Open with Pikepdf to get text lines
@@ -1903,6 +1870,7 @@ def redact_text_pdf(
                         for i, text_line in enumerate(line_level_text_results_list):
                             if chosen_redact_entities:
                                 if pii_identification_method == "Local":
                                     # Process immediately for local analysis
                                     text_line_analyser_result = nlp_analyser.analyze(
                                         text=text_line.text,
@@ -2024,7 +1992,13 @@ def redact_text_pdf(
                 annotations_on_page = create_annotations_for_bounding_boxes(page_analysed_bounding_boxes)
                 # Make pymupdf page redactions
-                pymupdf_page, image_annotations = redact_page_with_pymupdf(pymupdf_page, annotations_on_page, image)
                 #print("Did redact_page_with_pymupdf function")
                 reported_page_no = page_no + 1

 from pikepdf import Pdf, Dictionary, Name
 import pymupdf
 from pymupdf import Rect
+from fitz import Page
 import gradio as gr
 from gradio import Progress
 from collections import defaultdict  # For efficient grouping
 from presidio_analyzer import RecognizerResult
 from tools.aws_functions import RUN_AWS_FUNCTIONS
 from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult
+from tools.file_conversion import process_file, image_dpi, convert_review_json_to_pandas_df, redact_whole_pymupdf_page, redact_single_box, convert_pymupdf_to_image_coords
 from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities, custom_recogniser, custom_word_list_recogniser
 from tools.helper_functions import get_file_path_end, output_folder, clean_unicode_text, get_or_create_env_var, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
 from tools.file_conversion import process_file, is_pdf, is_pdf_or_image
  chosen_redact_comprehend_entities:List[str],
  in_redact_method:str,
  in_allow_list:List[List[str]]=None,
+ custom_recogniser_word_list:List[str]=None,
+ redact_whole_page_list:List[str]=None,
  latest_file_completed:int=0,
  out_message:list=[],
  out_file_paths:list=[],
     - chosen_redact_comprehend_entities (List[str]): A list of entity types to redact from files, chosen from the official list from AWS Comprehend service
     - in_redact_method (str): The method to use for redaction.
     - in_allow_list (List[List[str]], optional): A list of allowed terms for redaction. Defaults to None.
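+    - custom_recogniser_word_list (List[str], optional): Custom deny-list terms that should always be redacted; used to rebuild the "CUSTOM" recogniser. A pandas DataFrame is also accepted, in which case its first column is used and the terms are sorted longest first. Defaults to None.
+    - redact_whole_page_list (List[str], optional): Page numbers whose whole page should be redacted. A pandas DataFrame is also accepted, in which case its first column is used. Defaults to None.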
     - latest_file_completed (int, optional): The index of the last completed file. Defaults to 0.
     - out_message (list, optional): A list to store output messages. Defaults to an empty list.
     - out_file_paths (list, optional): A list to store paths to the output files. Defaults to an empty list.
     tic = time.perf_counter()
     all_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []
+    if isinstance(custom_recogniser_word_list, pd.DataFrame):
+        custom_recogniser_word_list = custom_recogniser_word_list.iloc[:,0].tolist()
+        # Sort the strings in order from the longest string to the shortest
+        custom_recogniser_word_list = sorted(custom_recogniser_word_list, key=len, reverse=True)
+    if isinstance(redact_whole_page_list, pd.DataFrame):
+        redact_whole_page_list = redact_whole_page_list.iloc[:,0].tolist()
     # If this is the first time around, set variables to 0/blank
     if first_loop_state==True:
              pii_identification_method,
              comprehend_query_number,
              comprehend_client,
+             textract_client,
+             custom_recogniser_word_list,
+             redact_whole_page_list)
             # Save Textract request metadata (if exists)
             if new_request_metadata:
             pymupdf_doc,
             pii_identification_method,
             comprehend_query_number,
+            comprehend_client,
+            custom_recogniser_word_list,
+            redact_whole_page_list)
         else:
             out_message = "No redaction method selected"
                     json.dump(annotations_all_pages, f)
                 log_files_output_paths.append(out_annotation_file_path)
+                #print("Saving annotations to CSV")
                 # Convert json to csv and also save this
+                #print("annotations_all_pages:", annotations_all_pages)
                 review_df = convert_review_json_to_pandas_df(annotations_all_pages)
                 out_review_file_file_path = out_image_file_path + '_review_file.csv'
                 review_df.to_csv(out_review_file_file_path, index=None)
                 out_file_paths.append(out_review_file_file_path)
+                print("Saved review file to csv")
             except Exception as e:
                 print("Could not save annotations to json file:", e)
     return x1, new_y1, x2, new_y2
 def convert_gradio_annotation_coords_to_pymupdf(pymupdf_page:Page, annot:dict, image:Image):
     '''
     return new_file_path
 def redact_page_with_pymupdf(page:Page, annotations_on_page:dict, image:Image.Image=None, custom_colours:bool=False, redact_whole_page:bool=False):
     mediabox_height = page.mediabox[3] - page.mediabox[1]
     # If whole page is to be redacted, do that here
     if redact_whole_page == True:
+        # # Small border to page that remains white
+        # border = 5
+        # # Define the coordinates for the Rect
+        # whole_page_x1, whole_page_y1 = 0 + border, 0 + border  # Bottom-left corner
+        # whole_page_x2, whole_page_y2 = rect_width - border, rect_height - border  # Top-right corner
+        # whole_page_image_x1, whole_page_image_y1, whole_page_image_x2, whole_page_image_y2 = convert_pymupdf_to_image_coords(page, whole_page_x1, whole_page_y1, whole_page_x2, whole_page_y2, image)
+        # # Create new image annotation element based on whole page coordinates
+        # whole_page_rect = Rect(whole_page_x1, whole_page_y1, whole_page_x2, whole_page_y2)
+        # # Write whole page annotation to annotation boxes
+        # whole_page_img_annotation_box = {}
+        # whole_page_img_annotation_box["xmin"] = whole_page_image_x1
+        # whole_page_img_annotation_box["ymin"] = whole_page_image_y1
+        # whole_page_img_annotation_box["xmax"] = whole_page_image_x2
+        # whole_page_img_annotation_box["ymax"] = whole_page_image_y2
+        # whole_page_img_annotation_box["color"] = (0,0,0)
+        # whole_page_img_annotation_box["label"] = "Whole page"
+        # redact_single_box(page, whole_page_rect, whole_page_img_annotation_box, custom_colours)
+        # all_image_annotation_boxes.append(whole_page_img_annotation_box)
+        whole_page_img_annotation_box = redact_whole_pymupdf_page(rect_height, rect_width, image, page, custom_colours, border = 5)
         all_image_annotation_boxes.append(whole_page_img_annotation_box)
     out_annotation_boxes = {
     comprehend_query_number_new = 0
     # Update custom word list analyser object with any new words that have been added to the custom deny list
+    #print("custom_recogniser_word_list:", custom_recogniser_word_list)
+    if custom_recogniser_word_list:
         nlp_analyser.registry.remove_recognizer("CUSTOM")
         new_custom_recogniser = custom_word_list_recogniser(custom_recogniser_word_list)
+        #print("new_custom_recogniser:", new_custom_recogniser)
         nlp_analyser.registry.add_recognizer(new_custom_recogniser)
+        # List all elements currently in the nlp_analyser registry
+        #print("Current recognizers in nlp_analyser registry:")
+        for recognizer_name in nlp_analyser.registry.recognizers:
+            print(recognizer_name)
     image_analyser = CustomImageAnalyzerEngine(nlp_analyser)
                 image_annotations = {"image": file_path, "boxes": all_image_annotations_boxes}
+            ## Apply annotations with pymupdf
             else:
+                #print("redact_whole_page_list:", redact_whole_page_list)
+                if redact_whole_page_list:
+                    if current_loop_page in redact_whole_page_list: redact_whole_page = True
+                    else: redact_whole_page = False
+                else: redact_whole_page = False
+                pymupdf_page, image_annotations = redact_page_with_pymupdf(pymupdf_page, merged_redaction_bboxes, image, redact_whole_page=redact_whole_page)
             # Convert decision process to table
             decision_process_table = pd.DataFrame([{
         return pymupdf_doc, all_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number
     # Update custom word list analyser object with any new words that have been added to the custom deny list
+    #print("custom_recogniser_word_list:", custom_recogniser_word_list)
+    if custom_recogniser_word_list:
         nlp_analyser.registry.remove_recognizer("CUSTOM")
         new_custom_recogniser = custom_word_list_recogniser(custom_recogniser_word_list)
+        #print("new_custom_recogniser:", new_custom_recogniser)
         nlp_analyser.registry.add_recognizer(new_custom_recogniser)
+        # List all elements currently in the nlp_analyser registry
+        #print("Current recognizers in nlp_analyser registry:")
+        #for recognizer_name in nlp_analyser.registry.recognizers:
+        #    print(recognizer_name)
+        #print("Custom recogniser:", nlp_analyser.registry.)
     tic = time.perf_counter()
     # Open with Pikepdf to get text lines
                         for i, text_line in enumerate(line_level_text_results_list):
                             if chosen_redact_entities:
                                 if pii_identification_method == "Local":
                                     # Process immediately for local analysis
                                     text_line_analyser_result = nlp_analyser.analyze(
                                         text=text_line.text,
                 annotations_on_page = create_annotations_for_bounding_boxes(page_analysed_bounding_boxes)
                 # Make pymupdf page redactions
+                #print("redact_whole_page_list:", redact_whole_page_list)
+                if redact_whole_page_list:
+                    if current_loop_page in redact_whole_page_list: redact_whole_page = True
+                    else: redact_whole_page = False
+                else: redact_whole_page = False
+                pymupdf_page, image_annotations = redact_page_with_pymupdf(pymupdf_page, annotations_on_page, image, redact_whole_page=redact_whole_page)
                 #print("Did redact_page_with_pymupdf function")
                 reported_page_no = page_no + 1

tools/helper_functions.py CHANGED Viewed

@@ -112,8 +112,6 @@ def custom_regex_load(in_file:List[str], file_type:str = "Allow list"):
     custom_regex = pd.DataFrame()
     if in_file:
-        print("File type:", file_type)
         file_list = [string.name for string in in_file]
         regex_file_names = [string for string in file_list if "csv" in string.lower()]

     custom_regex = pd.DataFrame()
     if in_file:
         file_list = [string.name for string in in_file]
         regex_file_names = [string for string in file_list if "csv" in string.lower()]

tools/load_spacy_model_custom_recognisers.py CHANGED Viewed

@@ -28,8 +28,10 @@ except:
 def custom_word_list_recogniser(custom_list:List[str]=[]):
     custom_regex = '\\b' + '\\b|\\b'.join(rf"{re.escape(term)}" for term in custom_list) + '\\b'
     custom_pattern = Pattern(name="custom_pattern",regex=custom_regex, score = 1)
     custom_recogniser = PatternRecognizer(supported_entity="CUSTOM", name="CUSTOM", patterns = [custom_pattern],
-    global_regex_flags=re.DOTALL | re.MULTILINE)
     return custom_recogniser

 def custom_word_list_recogniser(custom_list:List[str]=None):
     '''
     Build the "CUSTOM" pattern recogniser from a list of literal terms.

     Every term is regex-escaped and has to match as a whole token: it may not be directly
     preceded or followed by a word character. Matching is case-insensitive.

     Args:
         custom_list (List[str], optional): Literal terms to detect. None, an empty list and
             blank strings contribute no terms. Defaults to None.

     Returns:
         PatternRecognizer: Recogniser whose supported_entity and name are both "CUSTOM",
             holding a single pattern with score 1 and the DOTALL, MULTILINE and IGNORECASE flags.
     '''
     # None default instead of a shared mutable list; blank entries would only add an
     # alternative that matches the empty string.
     terms = [term for term in (custom_list or []) if term]

     if terms:
         # Lookarounds instead of '\b': '\b' demands a word/non-word transition, so a term that
         # starts or ends with punctuation or a symbol (e.g. "Mr." followed by a space) could
         # never match. For terms with word characters at both ends the two forms are equivalent.
         custom_regex = '|'.join(r'(?<!\w)' + re.escape(term) + r'(?!\w)' for term in terms)
     else:
         # Same pattern the previous implementation produced for an empty list.
         custom_regex = '\\b\\b'

     custom_pattern = Pattern(name="custom_pattern", regex=custom_regex, score=1)

     custom_recogniser = PatternRecognizer(supported_entity="CUSTOM", name="CUSTOM", patterns=[custom_pattern],
         global_regex_flags=re.DOTALL | re.MULTILINE | re.IGNORECASE)

     return custom_recogniser

tools/redaction_review.py CHANGED Viewed

@@ -49,30 +49,66 @@ def update_zoom(current_zoom_level:int, annotate_current_page:int, decrease:bool
     return current_zoom_level, annotate_current_page
-def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, zoom:int=100):
     '''
     Update a gradio_image_annotation object with new annotation data
     '''
     zoom_str = str(zoom) + '%'
     if not image_annotator_object:
         out_image_annotator = image_annotator(
-        label="Modify redaction boxes",
         #label_list=["Redaction"],
         #label_colors=[(0, 0, 0)],
         height=zoom_str,
         width=zoom_str,
-        show_label=False,
-        sources=None,
         show_clear_button=False,
         show_share_button=False,
         show_remove_button=False,
-        interactive=False)
-        number_reported = gr.Number(label = "Page (press enter to change)", value=1, precision=0)
-        return out_image_annotator, number_reported, number_reported, page_num_reported
     #print("page_num at start of update_annotator function:", page_num)
@@ -95,6 +131,28 @@ def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, zo
         page_num_reported = page_max_reported
     out_image_annotator = image_annotator(
         value = image_annotator_object[page_num_reported - 1],
         boxes_alpha=0.1,
@@ -117,7 +175,7 @@ def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, zo
     number_reported = gr.Number(label = "Page (press enter to change)", value=page_num_reported, precision=0)
-    return out_image_annotator, number_reported, number_reported, page_num_reported
 def modify_existing_page_redactions(image_annotated:AnnotatedImageData, current_page:int, previous_page:int, all_image_annotations:List[AnnotatedImageData], clear_all:bool=False):
     '''
@@ -149,6 +207,8 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], d
     output_files = []
     output_log_files = []
     image_annotated['image'] = all_image_annotations[current_page - 1]["image"]
     all_image_annotations[current_page - 1] = image_annotated
@@ -252,32 +312,19 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], d
     return doc, all_image_annotations, output_files, output_log_files
-def crop(annotations:AnnotatedImageData):
-    if annotations["boxes"]:
-        box = annotations["boxes"][0]
-        return annotations["image"][
-            box["ymin"]:box["ymax"],
-            box["xmin"]:box["xmax"]
-        ]
-    return None
 def get_boxes_json(annotations:AnnotatedImageData):
     return annotations["boxes"]
-    # Group the DataFrame by the 'image' column
-    grouped = df.groupby('image')
-    # Create a list to hold the JSON data
-    json_data = []
-    # Iterate over each group
-    for image_path, group in grouped:
-        # Convert each group to a list of box dictionaries
-        boxes = group.drop(columns='image').to_dict(orient='records')
-        # Append the structured data to the json_data list
-        json_data.append({
-            "image": image_path,
-            "boxes": boxes
-        })
-    return json_data

     return current_zoom_level, annotate_current_page
+def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, recogniser_entities_drop=gr.Dropdown(value="ALL", allow_custom_value=True), recogniser_dataframe_gr=gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]})), zoom:int=80):
     '''
     Update a gradio_image_annotation object with new annotation data
     '''
+    recogniser_entities = []
+    recogniser_dataframe = pd.DataFrame()
+    #recogniser_entities_drop = gr.Dropdown(value="ALL", allow_custom_value=True)
+    #recogniser_dataframe_gr = gr.Dataframe(pd.DataFrame(data={"page":[""], "label":[""]}))
+    #print("recogniser_dataframe_gr", recogniser_dataframe_gr)
+    #print("recogniser_dataframe_gr shape", recogniser_dataframe_gr.shape)
+    #print("recogniser_dataframe_gr.iloc[0,0]:",  recogniser_dataframe_gr.iloc[0,0])
+    if recogniser_dataframe_gr.iloc[0,0] == "":
+        try:
+            review_dataframe = convert_review_json_to_pandas_df(image_annotator_object)[["page", "label"]]
+            #print("review_dataframe['label']", review_dataframe["label"])
+            recogniser_entities = review_dataframe["label"].unique().tolist()
+            recogniser_entities.append("ALL")
+            #print("recogniser_entities:", recogniser_entities)
+            recogniser_dataframe_out = gr.Dataframe(review_dataframe)
+            recogniser_dataframe_gr = gr.Dataframe(review_dataframe)
+            recogniser_entities_drop = gr.Dropdown(value=recogniser_entities[0], choices=recogniser_entities, allow_custom_value=True, interactive=True)
+        except Exception as e:
+            print("Could not extract recogniser information:", e)
+    else:
+        review_dataframe = update_entities_df(recogniser_entities_drop, recogniser_dataframe_gr)
+        recogniser_dataframe_out = gr.Dataframe(review_dataframe)
     zoom_str = str(zoom) + '%'
     if not image_annotator_object:
+        page_num_reported = 1
         out_image_annotator = image_annotator(
+        image_annotator_object[page_num_reported - 1],
+        boxes_alpha=0.1,
+        box_thickness=1,
         #label_list=["Redaction"],
         #label_colors=[(0, 0, 0)],
+        show_label=False,
         height=zoom_str,
         width=zoom_str,
+        box_min_size=1,
+        box_selected_thickness=2,
+        handle_size=4,
+        sources=None,#["upload"],
         show_clear_button=False,
         show_share_button=False,
         show_remove_button=False,
+        handles_cursor=True,
+        interactive=True
+    )
+        number_reported = gr.Number(label = "Page (press enter to change)", value=page_num_reported, precision=0)
+        return out_image_annotator, number_reported, number_reported, page_num_reported, recogniser_entities_drop, recogniser_dataframe_out, recogniser_dataframe_gr
     #print("page_num at start of update_annotator function:", page_num)
         page_num_reported = page_max_reported
+    # Remove duplicate elements that are blank
+    def remove_duplicate_images_with_blank_boxes(data: List[AnnotatedImageData]) -> List[AnnotatedImageData]:
+        seen_images = set()
+        filtered_data = []
+        for item in data:
+            # Check if 'image' is unique
+            if item['image'] not in seen_images:
+                filtered_data.append(item)
+                seen_images.add(item['image'])
+            # Duplicate image: keep it only if it actually has boxes (blank duplicates are dropped)
+            elif item['boxes']:
+                filtered_data.append(item)
+        return filtered_data
+    image_annotator_object = remove_duplicate_images_with_blank_boxes(image_annotator_object)
+    #print("image_annotator_object in update_annotator:", image_annotator_object)
+    #print("image_annotator_object[page_num_reported - 1]:", image_annotator_object[page_num_reported - 1])
     out_image_annotator = image_annotator(
         value = image_annotator_object[page_num_reported - 1],
         boxes_alpha=0.1,
     number_reported = gr.Number(label = "Page (press enter to change)", value=page_num_reported, precision=0)
+    return out_image_annotator, number_reported, number_reported, page_num_reported, recogniser_entities_drop, recogniser_dataframe_out, recogniser_dataframe_gr
 def modify_existing_page_redactions(image_annotated:AnnotatedImageData, current_page:int, previous_page:int, all_image_annotations:List[AnnotatedImageData], clear_all:bool=False):
     '''
     output_files = []
     output_log_files = []
+    #print("File paths in apply_redactions:", file_paths)
     image_annotated['image'] = all_image_annotations[current_page - 1]["image"]
     all_image_annotations[current_page - 1] = image_annotated
     return doc, all_image_annotations, output_files, output_log_files
 def get_boxes_json(annotations:AnnotatedImageData):
     '''
     Return whatever is stored under the "boxes" key of an annotation object.
     '''
     boxes = annotations["boxes"]
     return boxes
def update_entities_df(choice:str, df:pd.DataFrame):
    '''
    Filter a review table down to the rows whose "label" equals the chosen entity.

    Args:
        choice (str): Entity label to keep. "ALL" disables filtering; a cleared selection
            (None or an empty string) is treated the same way instead of producing an empty table.
        df (pd.DataFrame): Table with a "label" column.

    Returns:
        pd.DataFrame: The input frame itself when no filtering applies, otherwise the
            matching rows with all columns.
    '''
    # Guard clause: nothing to filter on, hand the table back untouched.
    if choice is None or choice == "" or choice == "ALL":
        return df

    return df.loc[df["label"] == choice, :]
def df_select_callback(df: pd.DataFrame, evt: gr.SelectData):
    '''
    Return the first cell of the row carried by a dataframe select event.

    The dataframe argument is accepted but not read; only the event's row_value is used.
    Its first entry is expected to be the page number.
    '''
    selected_row = evt.row_value
    return selected_row[0]