seanpedrickcase committed on
Commit
1d772de
·
1 Parent(s): a770956

Refactor redaction functionality and enhance UI components: Added support for custom recognizers and whole page redaction options. Updated file handling to include new dropdowns for entity selection and improved dataframes for entity management. Enhanced the annotator with better state management and UI responsiveness. Cleaned up redundant code and improved overall performance in the redaction process.

Browse files
app.py CHANGED
@@ -13,7 +13,7 @@ from tools.helper_functions import ensure_output_folder_exists, add_folder_to_pa
13
  from tools.aws_functions import upload_file_to_s3, download_file_from_s3, RUN_AWS_FUNCTIONS, bucket_name
14
  from tools.file_redaction import choose_and_run_redactor
15
  from tools.file_conversion import prepare_image_or_pdf, get_input_file_names
16
- from tools.redaction_review import apply_redactions, modify_existing_page_redactions, decrease_page, increase_page, update_annotator, update_zoom
17
  from tools.data_anonymise import anonymise_data_files
18
  from tools.auth import authenticate_user
19
  from tools.load_spacy_model_custom_recognisers import custom_entities
@@ -34,7 +34,7 @@ full_comprehend_entity_list = ['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBI
34
  chosen_comprehend_entities.extend(custom_entities)
35
  full_comprehend_entity_list.extend(custom_entities)
36
 
37
- chosen_redact_entities = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE"]
38
 
39
  full_entity_list = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", 'CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'IBAN_CODE', 'IP_ADDRESS', 'NRP', 'LOCATION', 'MEDICAL_LICENSE', 'URL', 'UK_NHS', 'CUSTOM']
40
 
@@ -67,8 +67,6 @@ with app:
67
  all_line_level_ocr_results_df_state = gr.State(pd.DataFrame())
68
  all_decision_process_table_state = gr.State(pd.DataFrame())
69
 
70
-
71
-
72
  session_hash_state = gr.State()
73
  s3_output_folder_state = gr.State()
74
 
@@ -101,7 +99,12 @@ with app:
101
  doc_full_file_name_textbox = gr.Textbox(label = "doc_full_file_name_textbox", value="", visible=False)
102
  doc_file_name_no_extension_textbox = gr.Textbox(label = "doc_full_file_name_textbox", value="", visible=False)
103
  doc_file_name_with_extension_textbox = gr.Textbox(label = "doc_file_name_with_extension_textbox", value="", visible=False)
104
- data_file_name_textbox = gr.Textbox(label = "data_file_name_textbox", value="", visible=False)
 
 
 
 
 
105
 
106
  estimated_time_taken_number = gr.Number(label = "estimated_time_taken_number", value=0.0, precision=1, visible=False) # This keeps track of the time taken to redact files for logging purposes.
107
  annotate_previous_page = gr.Number(value=0, label="Previous page", precision=0, visible=False) # Keeps track of the last page that the annotator was on
@@ -124,12 +127,12 @@ with app:
124
 
125
  default_deny_list_file_name = "default_deny_list.csv"
126
  default_deny_list_loc = output_folder + "/" + default_deny_list_file_name
127
- in_deny_list_state = gr.State(pd.DataFrame())
128
  in_deny_list_text_in = gr.Textbox(value="Deny list", visible=False)
129
 
130
  fully_redacted_list_file_name = "default_fully_redacted_list.csv"
131
  fully_redacted_list_loc = output_folder + "/" + fully_redacted_list_file_name
132
- in_fully_redacted_list_state = gr.State(pd.DataFrame())
133
  in_fully_redacted_text_in = gr.Textbox(value="Fully redacted page list", visible=False)
134
 
135
  # S3 settings for default allow list load
@@ -137,6 +140,9 @@ with app:
137
  s3_default_allow_list_file = gr.Textbox(label = "Default allow list file", value=default_allow_list_file_name, visible=False)
138
  default_allow_list_output_folder_location = gr.Textbox(label = "Output default allow list location", value=default_allow_list_loc, visible=False)
139
 
 
 
 
140
  ###
141
  # UI DESIGN
142
  ###
@@ -146,7 +152,9 @@ with app:
146
 
147
  Redact personally identifiable information (PII) from documents (pdf, images), open text, or tabular data (xlsx/csv/parquet). Documents/images can be redacted using 'Quick' image analysis that works fine for typed text, but not handwriting/signatures. On the Redaction settings tab, choose 'Complex image analysis' OCR using AWS Textract (if you are using AWS) to redact these more complex elements (this service has a cost). Addtionally you can choose the method for PII identification. 'Local' gives quick, lower quality results, AWS Comprehend gives better results but has a cost.
148
 
149
- Review suggested redactions on the 'Review redactions' tab using a point and click visual interface. See the 'Redaction settings' tab to choose which pages to redact, the type of information to redact (e.g. people, places), or terms to exclude from redaction. Please see the [User Guide](https://github.com/seanpedrick-case/doc_redaction/blob/main/README.md) for a walkthrough on how to use this and all other features in the app. The app accepts a maximum file size of 100mb. Please consider giving feedback for the quality of the answers underneath the redact buttons when the option appears, this will help to improve the app in future.
 
 
150
 
151
  NOTE: In testing the app seems to find about 60% of personal information on a given (typed) page of text. It is essential that all outputs are checked **by a human** to ensure that all personal information has been removed.""")
152
 
@@ -183,7 +191,7 @@ with app:
183
  # Object annotation
184
  with gr.Tab("Review redactions", id="tab_object_annotation"):
185
 
186
- with gr.Accordion(label = "Review previous redactions", open=True):
187
  output_review_files = gr.File(label="Review output files", file_count='multiple')
188
  upload_previous_review_file_btn = gr.Button("Review previously created redaction file (upload original PDF and ...review_file.csv)")
189
 
@@ -200,17 +208,35 @@ with app:
200
 
201
  annotation_button_apply = gr.Button("Apply revised redactions", variant="primary")
202
 
203
- annotator = image_annotator(
204
- label="Modify redaction boxes",
205
- label_list=["Redaction"],
206
- label_colors=[(0, 0, 0)],
207
- show_label=False,
208
- sources=None,#["upload"],
209
- show_clear_button=False,
210
- show_share_button=False,
211
- show_remove_button=False,
212
- interactive=False
213
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
214
 
215
  with gr.Row():
216
  annotation_last_page_button_bottom = gr.Button("Previous page", scale = 3)
@@ -290,36 +316,41 @@ with app:
290
  ###
291
  # PDF/IMAGE REDACTION
292
  ###
293
- in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox])
294
 
295
  document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator]).\
296
  then(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, first_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state], api_name="prepare_doc").\
297
  then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
298
  outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number], api_name="redact_doc").\
299
- then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page])
300
 
301
  # If the app has completed a batch of pages, it will run this until the end of all pages in the document
302
  current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
303
  outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number]).\
304
- then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page])
305
 
306
  # If a file has been completed, the function will continue onto the next document
307
- latest_file_completed_text.change(fn=update_annotator, inputs=[all_image_annotations_state, page_min, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page]).\
308
  then(fn=reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title])
309
 
310
  ###
311
  # REVIEW PDF REDACTIONS
312
  ###
313
 
 
 
 
 
 
314
  # Page controls at top
315
  annotate_current_page.submit(
316
  modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
317
- then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page])
318
 
319
  annotation_last_page_button.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
320
- then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page])
321
  annotation_next_page_button.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
322
- then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page])
323
 
324
  # Zoom in and out on annotator
325
  annotate_zoom_in.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
@@ -328,10 +359,10 @@ with app:
328
  annotate_zoom_out.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
329
  then(update_zoom, inputs=[annotator_zoom_number, annotate_current_page, zoom_false_bool], outputs=[annotator_zoom_number, annotate_current_page])
330
 
331
- annotator_zoom_number.change(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page])
332
 
333
  clear_all_redactions_on_page_btn.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, clear_all_page_redactions], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
334
- then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page])
335
 
336
  #annotation_button_get.click(get_boxes_json, annotator, json_boxes)
337
  annotation_button_apply.click(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output], scroll_to_output=True)
@@ -339,23 +370,26 @@ with app:
339
  # Page controls at bottom
340
  annotate_current_page_bottom.submit(
341
  modify_existing_page_redactions, inputs = [annotator, annotate_current_page_bottom, annotate_previous_page, all_image_annotations_state], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page]).\
342
- then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page])
343
 
344
  annotation_last_page_button_bottom.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
345
- then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page])
346
  annotation_next_page_button_bottom.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
347
- then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page])
 
 
 
 
 
 
 
348
 
349
- # Upload previous files for modifying redactions
350
- upload_previous_review_file_btn.click(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox]).\
351
- then(fn = prepare_image_or_pdf, inputs=[output_review_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state, prepare_for_review_bool], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state]).\
352
- then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page])
353
 
354
  ###
355
  # TABULAR DATA REDACTION
356
  ###
357
  in_data_files.upload(fn=put_columns_in_df, inputs=[in_data_files], outputs=[in_colnames, in_excel_sheets]).\
358
- then(fn=get_input_file_names, inputs=[in_data_files], outputs=[data_file_name_textbox])
359
 
360
  tabular_data_redact_btn.click(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, first_loop_state], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state], api_name="redact_data")
361
 
@@ -370,7 +404,6 @@ with app:
370
  in_allow_list.change(fn=custom_regex_load, inputs=[in_allow_list], outputs=[in_allow_list_text, in_allow_list_state])
371
  in_deny_list.change(fn=custom_regex_load, inputs=[in_deny_list, in_deny_list_text_in], outputs=[in_deny_list_text, in_deny_list_state])
372
  in_fully_redacted_list.change(fn=custom_regex_load, inputs=[in_fully_redacted_list, in_fully_redacted_text_in], outputs=[in_fully_redacted_list_text, in_fully_redacted_list_state])
373
-
374
 
375
 
376
  ###
@@ -403,14 +436,14 @@ with app:
403
 
404
  # User submitted feedback for data redactions
405
  data_callback = CSVLogger_custom(dataset_file_name=log_file_name)
406
- data_callback.setup([data_feedback_radio, data_further_details_text, data_file_name_textbox], feedback_logs_folder)
407
- data_submit_feedback_btn.click(lambda *args: data_callback.flag(list(args)), [data_feedback_radio, data_further_details_text, data_file_name_textbox], None, preprocess=False).\
408
  then(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[data_further_details_text])
409
 
410
  # Log processing time/token usage when making a query
411
  usage_callback = CSVLogger_custom(dataset_file_name=log_file_name)
412
- usage_callback.setup([session_hash_textbox, doc_file_name_no_extension_textbox, data_file_name_textbox, estimated_time_taken_number, textract_metadata_textbox, pii_identification_method_drop, comprehend_query_number], usage_logs_folder)
413
- latest_file_completed_text.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, doc_file_name_no_extension_textbox, data_file_name_textbox, estimated_time_taken_number, textract_metadata_textbox, pii_identification_method_drop, comprehend_query_number], None, preprocess=False).\
414
  then(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
415
 
416
  # Get some environment variables and Launch the Gradio app
 
13
  from tools.aws_functions import upload_file_to_s3, download_file_from_s3, RUN_AWS_FUNCTIONS, bucket_name
14
  from tools.file_redaction import choose_and_run_redactor
15
  from tools.file_conversion import prepare_image_or_pdf, get_input_file_names
16
+ from tools.redaction_review import apply_redactions, modify_existing_page_redactions, decrease_page, increase_page, update_annotator, update_zoom, update_entities_df, df_select_callback
17
  from tools.data_anonymise import anonymise_data_files
18
  from tools.auth import authenticate_user
19
  from tools.load_spacy_model_custom_recognisers import custom_entities
 
34
  chosen_comprehend_entities.extend(custom_entities)
35
  full_comprehend_entity_list.extend(custom_entities)
36
 
37
+ chosen_redact_entities = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", "CUSTOM"]
38
 
39
  full_entity_list = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", 'CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'IBAN_CODE', 'IP_ADDRESS', 'NRP', 'LOCATION', 'MEDICAL_LICENSE', 'URL', 'UK_NHS', 'CUSTOM']
40
 
 
67
  all_line_level_ocr_results_df_state = gr.State(pd.DataFrame())
68
  all_decision_process_table_state = gr.State(pd.DataFrame())
69
 
 
 
70
  session_hash_state = gr.State()
71
  s3_output_folder_state = gr.State()
72
 
 
99
  doc_full_file_name_textbox = gr.Textbox(label = "doc_full_file_name_textbox", value="", visible=False)
100
  doc_file_name_no_extension_textbox = gr.Textbox(label = "doc_full_file_name_textbox", value="", visible=False)
101
  doc_file_name_with_extension_textbox = gr.Textbox(label = "doc_file_name_with_extension_textbox", value="", visible=False)
102
+ doc_file_name_textbox_list = gr.Dropdown(label = "doc_file_name_textbox_list", value="", allow_custom_value=True,visible=False)
103
+
104
+ data_full_file_name_textbox = gr.Textbox(label = "data_full_file_name_textbox", value="", visible=False)
105
+ data_file_name_no_extension_textbox = gr.Textbox(label = "data_full_file_name_textbox", value="", visible=False)
106
+ data_file_name_with_extension_textbox = gr.Textbox(label = "data_file_name_with_extension_textbox", value="", visible=False)
107
+ data_file_name_textbox_list = gr.Dropdown(label = "data_file_name_textbox_list", value="", allow_custom_value=True,visible=False)
108
 
109
  estimated_time_taken_number = gr.Number(label = "estimated_time_taken_number", value=0.0, precision=1, visible=False) # This keeps track of the time taken to redact files for logging purposes.
110
  annotate_previous_page = gr.Number(value=0, label="Previous page", precision=0, visible=False) # Keeps track of the last page that the annotator was on
 
127
 
128
  default_deny_list_file_name = "default_deny_list.csv"
129
  default_deny_list_loc = output_folder + "/" + default_deny_list_file_name
130
+ in_deny_list_state = gr.State([])
131
  in_deny_list_text_in = gr.Textbox(value="Deny list", visible=False)
132
 
133
  fully_redacted_list_file_name = "default_fully_redacted_list.csv"
134
  fully_redacted_list_loc = output_folder + "/" + fully_redacted_list_file_name
135
+ in_fully_redacted_list_state = gr.State([])
136
  in_fully_redacted_text_in = gr.Textbox(value="Fully redacted page list", visible=False)
137
 
138
  # S3 settings for default allow list load
 
140
  s3_default_allow_list_file = gr.Textbox(label = "Default allow list file", value=default_allow_list_file_name, visible=False)
141
  default_allow_list_output_folder_location = gr.Textbox(label = "Output default allow list location", value=default_allow_list_loc, visible=False)
142
 
143
+ # Base dataframe for recognisers that is not modified subsequent to load
144
+ recogniser_entity_dataframe_base = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]}), col_count=2, type="pandas", visible=False)
145
+
146
  ###
147
  # UI DESIGN
148
  ###
 
152
 
153
  Redact personally identifiable information (PII) from documents (pdf, images), open text, or tabular data (xlsx/csv/parquet). Documents/images can be redacted using 'Quick' image analysis that works fine for typed text, but not handwriting/signatures. On the Redaction settings tab, choose 'Complex image analysis' OCR using AWS Textract (if you are using AWS) to redact these more complex elements (this service has a cost). Addtionally you can choose the method for PII identification. 'Local' gives quick, lower quality results, AWS Comprehend gives better results but has a cost.
154
 
155
+ Review suggested redactions on the 'Review redactions' tab using a point and click visual interface. Upload a pdf alone to start from scratch, or upload the original pdf alongside a '...redaction_file.csv' to continue a previous redaction/review task.
156
+
157
+ See the 'Redaction settings' tab to choose which pages to redact, the type of information to redact (e.g. people, places), or terms to exclude from redaction. Please see the [User Guide](https://github.com/seanpedrick-case/doc_redaction/blob/main/README.md) for a walkthrough on how to use this and all other features in the app. The app accepts a maximum file size of 100mb. Please consider giving feedback for the quality of the answers underneath the redact buttons when the option appears, this will help to improve the app in future.
158
 
159
  NOTE: In testing the app seems to find about 60% of personal information on a given (typed) page of text. It is essential that all outputs are checked **by a human** to ensure that all personal information has been removed.""")
160
 
 
191
  # Object annotation
192
  with gr.Tab("Review redactions", id="tab_object_annotation"):
193
 
194
+ with gr.Accordion(label = "Review redaction file", open=True):
195
  output_review_files = gr.File(label="Review output files", file_count='multiple')
196
  upload_previous_review_file_btn = gr.Button("Review previously created redaction file (upload original PDF and ...review_file.csv)")
197
 
 
208
 
209
  annotation_button_apply = gr.Button("Apply revised redactions", variant="primary")
210
 
211
+ with gr.Row():
212
+
213
+ with gr.Column(scale=4):
214
+
215
+ zoom_str = str(annotator_zoom_number) + '%'
216
+
217
+ annotator = image_annotator(
218
+ label="Modify redaction boxes",
219
+ label_list=["Redaction"],
220
+ label_colors=[(0, 0, 0)],
221
+ show_label=False,
222
+ height=zoom_str,
223
+ width=zoom_str,
224
+ box_min_size=1,
225
+ box_selected_thickness=2,
226
+ handle_size=4,
227
+ sources=None,#["upload"],
228
+ show_clear_button=False,
229
+ show_share_button=False,
230
+ show_remove_button=False,
231
+ handles_cursor=True,
232
+ interactive=False
233
+ )
234
+
235
+ with gr.Column(scale=1):
236
+ recogniser_entity_dropdown = gr.Dropdown(value="ALL", allow_custom_value=True)
237
+ recogniser_entity_dataframe = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]}), col_count=2, type="pandas")
238
+
239
+
240
 
241
  with gr.Row():
242
  annotation_last_page_button_bottom = gr.Button("Previous page", scale = 3)
 
316
  ###
317
  # PDF/IMAGE REDACTION
318
  ###
319
+ in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list])
320
 
321
  document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator]).\
322
  then(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, first_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state], api_name="prepare_doc").\
323
  then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
324
  outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number], api_name="redact_doc").\
325
+ then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
326
 
327
  # If the app has completed a batch of pages, it will run this until the end of all pages in the document
328
  current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
329
  outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number]).\
330
+ then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
331
 
332
  # If a file has been completed, the function will continue onto the next document
333
+ latest_file_completed_text.change(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
334
  then(fn=reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title])
335
 
336
  ###
337
  # REVIEW PDF REDACTIONS
338
  ###
339
 
340
+ # Upload previous files for modifying redactions
341
+ upload_previous_review_file_btn.click(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list]).\
342
+ then(fn = prepare_image_or_pdf, inputs=[output_review_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state, prepare_for_review_bool], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state]).\
343
+ then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
344
+
345
  # Page controls at top
346
  annotate_current_page.submit(
347
  modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
348
+ then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
349
 
350
  annotation_last_page_button.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
351
+ then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
352
  annotation_next_page_button.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
353
+ then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
354
 
355
  # Zoom in and out on annotator
356
  annotate_zoom_in.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
 
359
  annotate_zoom_out.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
360
  then(update_zoom, inputs=[annotator_zoom_number, annotate_current_page, zoom_false_bool], outputs=[annotator_zoom_number, annotate_current_page])
361
 
362
+ annotator_zoom_number.change(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
363
 
364
  clear_all_redactions_on_page_btn.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, clear_all_page_redactions], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
365
+ then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
366
 
367
  #annotation_button_get.click(get_boxes_json, annotator, json_boxes)
368
  annotation_button_apply.click(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output], scroll_to_output=True)
 
370
  # Page controls at bottom
371
  annotate_current_page_bottom.submit(
372
  modify_existing_page_redactions, inputs = [annotator, annotate_current_page_bottom, annotate_previous_page, all_image_annotations_state], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page]).\
373
+ then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
374
 
375
  annotation_last_page_button_bottom.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
376
+ then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
377
  annotation_next_page_button_bottom.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
378
+ then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
379
+
380
+ # Review side bar controls
381
+ recogniser_entity_dropdown.select(update_entities_df, inputs=[recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs=[recogniser_entity_dataframe])
382
+
383
+ recogniser_entity_dataframe.select(df_select_callback, inputs=[recogniser_entity_dataframe], outputs=annotate_current_page).\
384
+ then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
385
+ then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
386
 
 
 
 
 
387
 
388
  ###
389
  # TABULAR DATA REDACTION
390
  ###
391
  in_data_files.upload(fn=put_columns_in_df, inputs=[in_data_files], outputs=[in_colnames, in_excel_sheets]).\
392
+ then(fn=get_input_file_names, inputs=[in_data_files], outputs=[data_full_file_name_textbox, data_file_name_no_extension_textbox, data_file_name_with_extension_textbox, data_full_file_name_textbox, data_file_name_textbox_list])
393
 
394
  tabular_data_redact_btn.click(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, first_loop_state], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state], api_name="redact_data")
395
 
 
404
  in_allow_list.change(fn=custom_regex_load, inputs=[in_allow_list], outputs=[in_allow_list_text, in_allow_list_state])
405
  in_deny_list.change(fn=custom_regex_load, inputs=[in_deny_list, in_deny_list_text_in], outputs=[in_deny_list_text, in_deny_list_state])
406
  in_fully_redacted_list.change(fn=custom_regex_load, inputs=[in_fully_redacted_list, in_fully_redacted_text_in], outputs=[in_fully_redacted_list_text, in_fully_redacted_list_state])
 
407
 
408
 
409
  ###
 
436
 
437
  # User submitted feedback for data redactions
438
  data_callback = CSVLogger_custom(dataset_file_name=log_file_name)
439
+ data_callback.setup([data_feedback_radio, data_further_details_text, data_full_file_name_textbox], feedback_logs_folder)
440
+ data_submit_feedback_btn.click(lambda *args: data_callback.flag(list(args)), [data_feedback_radio, data_further_details_text, data_full_file_name_textbox], None, preprocess=False).\
441
  then(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[data_further_details_text])
442
 
443
  # Log processing time/token usage when making a query
444
  usage_callback = CSVLogger_custom(dataset_file_name=log_file_name)
445
+ usage_callback.setup([session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, estimated_time_taken_number, textract_metadata_textbox, pii_identification_method_drop, comprehend_query_number], usage_logs_folder)
446
+ latest_file_completed_text.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, estimated_time_taken_number, textract_metadata_textbox, pii_identification_method_drop, comprehend_query_number], None, preprocess=False).\
447
  then(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
448
 
449
  # Get some environment variables and Launch the Gradio app
tools/file_conversion.py CHANGED
@@ -8,6 +8,8 @@ import time
8
  import json
9
  import pymupdf
10
  import pandas as pd
 
 
11
  from tqdm import tqdm
12
  from gradio import Progress
13
  from typing import List, Optional
@@ -58,10 +60,10 @@ def process_single_page(pdf_path: str, page_num: int, image_dpi: float, output_d
58
  os.makedirs(os.path.dirname(out_path), exist_ok=True)
59
 
60
  if os.path.exists(out_path):
61
- print(f"Loading existing image for page {page_num + 1}")
62
  image = Image.open(out_path)
63
  else:
64
- print(f"Converting page {page_num + 1}")
65
  image_l = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1,
66
  dpi=image_dpi, use_cropbox=True, use_pdftocairo=False)
67
  image = image_l[0]
@@ -181,7 +183,7 @@ def process_file(file_path:str, prepare_for_review:bool=False):
181
 
182
  return img_object
183
 
184
- def get_input_file_names(file_input):
185
  '''
186
  Get list of input files to report to logs.
187
  '''
@@ -210,14 +212,123 @@ def get_input_file_names(file_input):
210
  file_extension = os.path.splitext(file_path)[1].lower()
211
 
212
  # Check if the file is an image type
213
- if file_extension in ['.jpg', '.jpeg', '.png', '.pdf', '.xlsx', '.csv', '.parquet']:
214
  all_relevant_files.append(file_path_without_ext)
215
  file_name_with_extension = file_path_without_ext + file_extension
216
  full_file_name = file_path
217
 
218
  all_relevant_files_str = ", ".join(all_relevant_files)
219
 
220
- return all_relevant_files_str, file_name_with_extension, full_file_name
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
221
 
222
  def prepare_image_or_pdf(
223
  file_paths: List[str],
@@ -230,6 +341,7 @@ def prepare_image_or_pdf(
230
  current_loop_page_number:int=0,
231
  all_annotations_object:List = [],
232
  prepare_for_review:bool = False,
 
233
  progress: Progress = Progress(track_tqdm=True)
234
  ) -> tuple[List[str], List[str]]:
235
  """
@@ -241,15 +353,16 @@ def prepare_image_or_pdf(
241
  Args:
242
  file_paths (List[str]): List of file paths to process.
243
  in_redact_method (str): The redaction method to use.
244
- in_allow_list (Optional[List[List[str]]]): List of allowed terms for redaction.
245
- latest_file_completed (int): Index of the last completed file.
246
- out_message (List[str]): List to store output messages.
247
- first_loop_state (bool): Flag indicating if this is the first iteration.
248
- number_of_pages (int): integer indicating the number of pages in the document
249
- current_loop_page_number (int): Current number of loop
250
- all_annotations_object(List of annotation objects): All annotations for current document
251
- prepare_for_review(bool): Is this preparation step preparing pdfs and json files to review current redactions?
252
- progress (Progress): Progress tracker for the operation.
 
253
 
254
 
255
  Returns:
@@ -259,6 +372,9 @@ def prepare_image_or_pdf(
259
  tic = time.perf_counter()
260
  json_from_csv = False
261
 
 
 
 
262
  # If this is the first time around, set variables to 0/blank
263
  if first_loop_state==True:
264
  print("first_loop_state is True")
@@ -329,6 +445,9 @@ def prepare_image_or_pdf(
329
 
330
  # Loop through files to load in
331
  for file in file_paths_loop:
 
 
 
332
  if isinstance(file, str):
333
  file_path = file
334
  else:
@@ -342,15 +461,45 @@ def prepare_image_or_pdf(
342
 
343
  file_extension = os.path.splitext(file_path)[1].lower()
344
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
345
 
346
  # Check if the file is an image type and the user selected text ocr option
347
- if file_extension in ['.jpg', '.jpeg', '.png'] and in_redact_method == text_ocr_option:
348
  in_redact_method = tesseract_ocr_option
349
 
350
- if file_extension in ['.csv']:
351
  review_file_csv = read_file(file)
352
  all_annotations_object = convert_pandas_df_to_review_json(review_file_csv)
353
  json_from_csv = True
 
354
 
355
  # If the file name ends with redactions.json, assume it is an annotations object, overwrite the current variable
356
  if (file_extension in ['.json']) | (json_from_csv == True):
@@ -376,7 +525,7 @@ def prepare_image_or_pdf(
376
 
377
  # If you have an annotations object from the above code
378
  if all_annotations_object:
379
- #print("out_annotations_object found:", all_annotations_object)
380
 
381
  # Get list of page numbers
382
  image_file_paths_pages = [
@@ -388,9 +537,27 @@ def prepare_image_or_pdf(
388
 
389
  # If PDF pages have been converted to image files, replace the current image paths in the json to this.
390
  if image_file_paths:
 
 
 
 
 
 
391
 
392
- for i, annotation in enumerate(all_annotations_object):
393
- annotation_page_number = int(re.search(r'_(\d+)\.png$', annotation["image"]).group(1))
 
 
 
 
 
 
 
 
 
 
 
 
394
  #print("Annotation page number:", annotation_page_number)
395
 
396
  # Check if the annotation page number exists in the image file paths pages
@@ -402,19 +569,30 @@ def prepare_image_or_pdf(
402
  else:
403
  print("Page", annotation_page_number, "image file not found.")
404
 
405
- #print("all_annotations_object:", all_annotations_object)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
406
 
407
  # Write the response to a JSON file in output folder
408
  out_folder = output_folder + file_path_without_ext + ".json"
409
  with open(out_folder, 'w') as json_file:
410
  json.dump(all_annotations_object, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
411
  continue
412
-
413
 
414
- # Must be a pdf or image at this point
415
  else:
416
-
417
- # Convert pdf/image file to correct format for redaction
418
  if in_redact_method == tesseract_ocr_option or in_redact_method == textract_option:
419
  if is_pdf_or_image(file_path) == False:
420
  out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
@@ -425,25 +603,11 @@ def prepare_image_or_pdf(
425
  if is_pdf(file_path) == False:
426
  out_message = "Please upload a PDF file for text analysis."
427
  print(out_message)
428
- return out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object
429
-
430
- converted_file_path = file_path # Pikepdf works with the basic unconverted pdf file
431
- image_file_path = process_file(file_path, prepare_for_review)
432
 
433
- converted_file_paths.append(converted_file_path)
434
- image_file_paths.extend(image_file_path)
435
-
436
- # If a pdf, load as a pymupdf document
437
- if is_pdf(file_path):
438
- pymupdf_doc = pymupdf.open(file_path)
439
 
440
- elif is_pdf_or_image(file_path): # Alternatively, if it's an image
441
- # Convert image to a pymupdf document
442
- pymupdf_doc = pymupdf.open() # Create a new empty document
443
- img = Image.open(file_path) # Open the image file
444
- rect = pymupdf.Rect(0, 0, img.width, img.height) # Create a rectangle for the image
445
- page = pymupdf_doc.new_page(width=img.width, height=img.height) # Add a new page
446
- page.insert_image(rect, filename=file_path) # Insert the image into the page
447
 
448
  toc = time.perf_counter()
449
  out_time = f"File '{file_path_without_ext}' prepared in {toc - tic:0.1f} seconds."
@@ -453,11 +617,12 @@ def prepare_image_or_pdf(
453
  out_message.append(out_time)
454
  out_message_out = '\n'.join(out_message)
455
 
456
- if prepare_for_review == False:
457
- number_of_pages = len(image_file_paths)
458
- else:
459
- number_of_pages = len(all_annotations_object)
460
 
 
461
 
462
  return out_message_out, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object
463
 
@@ -498,13 +663,17 @@ def convert_review_json_to_pandas_df(data:List[dict]) -> pd.DataFrame:
498
  match = re.search(r'_(\d+)\.png$', image_path)
499
  if match:
500
  number = match.group(1) # Extract the number
501
- print(number) # Output: 0
502
  reported_number = int(number) + 1
503
  else:
504
  print("No number found before .png")
505
 
 
 
 
 
506
  for box in entry["boxes"]:
507
- data_to_add = {"image": image_path, "page":reported_number, **box}
508
  #print("data_to_add:", data_to_add)
509
  flattened_data.append(data_to_add)
510
 
 
8
  import json
9
  import pymupdf
10
  import pandas as pd
11
+ from pymupdf import Rect
12
+ from fitz import Page
13
  from tqdm import tqdm
14
  from gradio import Progress
15
  from typing import List, Optional
 
60
  os.makedirs(os.path.dirname(out_path), exist_ok=True)
61
 
62
  if os.path.exists(out_path):
63
+ #print(f"Loading existing image for page {page_num + 1}")
64
  image = Image.open(out_path)
65
  else:
66
+ #print(f"Converting page {page_num + 1}")
67
  image_l = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1,
68
  dpi=image_dpi, use_cropbox=True, use_pdftocairo=False)
69
  image = image_l[0]
 
183
 
184
  return img_object
185
 
186
+ def get_input_file_names(file_input:List[str]):
187
  '''
188
  Get list of input files to report to logs.
189
  '''
 
212
  file_extension = os.path.splitext(file_path)[1].lower()
213
 
214
  # Check if the file is an image type
215
+ if (file_extension in ['.jpg', '.jpeg', '.png', '.pdf', '.xlsx', '.csv', '.parquet']) & ("review_file" not in file_path_without_ext):
216
  all_relevant_files.append(file_path_without_ext)
217
  file_name_with_extension = file_path_without_ext + file_extension
218
  full_file_name = file_path
219
 
220
  all_relevant_files_str = ", ".join(all_relevant_files)
221
 
222
+ #print("all_relevant_files_str in input_file_names", all_relevant_files_str)
223
+ #print("all_relevant_files in input_file_names", all_relevant_files)
224
+
225
+ return all_relevant_files_str, file_name_with_extension, full_file_name, all_relevant_files
226
+
227
def convert_color_to_range_0_1(color):
    """Scale a colour with 0-255 channel values down to the 0-1 range.

    Args:
        color: An iterable of numeric channel values (e.g. an RGB tuple).

    Returns:
        tuple: The same channels, each divided by 255.
    """
    scaled_channels = [channel / 255 for channel in color]
    return tuple(scaled_channels)
229
+
230
def redact_single_box(pymupdf_page:Page, pymupdf_rect:Rect, img_annotation_box:dict, custom_colours:bool=False):
    """Apply one redaction box to a pymupdf page.

    A redaction annotation is added over a vertically shrunken band of the box
    (so that text on adjacent lines is not deleted), then a filled rectangle is
    drawn over the full box area.

    Args:
        pymupdf_page: The pymupdf page to redact.
        pymupdf_rect: The box to redact, in pymupdf page coordinates.
        img_annotation_box: Annotation dict; its "color" entry is used when
            custom_colours is True.
        custom_colours: If True, fill with the annotation's colour (converted
            to the 0-1 range when given as 0-255 values); otherwise fill black.
    """
    x1, y1, x2, y2 = pymupdf_rect[0], pymupdf_rect[1], pymupdf_rect[2], pymupdf_rect[3]

    # Shrink the vertical extent used for text removal so adjacent lines survive.
    band_bottom = y1 + 2
    band_top = y2 - 2

    # For very short boxes the shrunken band collapses; centre a 2-unit band instead.
    if (band_top - band_bottom) < 1:
        mid_y = (y1 + y2) / 2
        band_bottom, band_top = mid_y - 1, mid_y + 1

    # Redact only the middle band of the character line.
    pymupdf_page.add_redact_annot(Rect(x1, band_bottom, x2, band_top))

    # Draw a filled box over the whole rect.
    shape = pymupdf_page.new_shape()
    shape.draw_rect(pymupdf_rect)

    if custom_colours:
        box_colour = img_annotation_box["color"]
        # Channel values above 1 are assumed to be on the 0-255 scale.
        out_colour = convert_color_to_range_0_1(box_colour) if box_colour[0] > 1 else box_colour
    else:
        out_colour = (0, 0, 0)

    shape.finish(color=out_colour, fill=out_colour)
    shape.commit()
269
+
270
def convert_pymupdf_to_image_coords(pymupdf_page, x1, y1, x2, y2, image: Image):
    """Translate a box from pymupdf page coordinates into image pixel coordinates.

    Scaling goes via the page mediabox: pixels-per-mediabox-unit from the image
    size, then a rect-to-mediabox correction.
    NOTE(review): assumes page.rect and page.mediabox may differ (e.g. cropped
    pages) — verify against the documents this is used on.

    Args:
        pymupdf_page: The pymupdf page the coordinates belong to.
        x1, y1, x2, y2: Box corners in pymupdf page coordinates.
        image: PIL image rendered from the page, providing the pixel size.

    Returns:
        tuple: (x1, y1, x2, y2) in image pixel coordinates.
    """
    page_rect = pymupdf_page.rect
    mediabox = pymupdf_page.mediabox
    img_width, img_height = image.size

    # Pixels per mediabox unit.
    scale_w = img_width / mediabox.width
    scale_h = img_height / mediabox.height

    # Correction from page-rect units to mediabox units.
    rect_to_media_x = mediabox.width / page_rect.width
    rect_to_media_y = mediabox.height / page_rect.height

    # Same grouping as the original computation to keep results bit-identical.
    x1_image = (x1 * scale_w) * rect_to_media_x
    y1_image = (y1 * scale_h) * rect_to_media_y
    x2_image = (x2 * scale_w) * rect_to_media_x
    y2_image = (y2 * scale_h) * rect_to_media_y

    return x1_image, y1_image, x2_image, y2_image
306
+
307
+
308
def redact_whole_pymupdf_page(rect_height, rect_width, image, page, custom_colours, border = 5):
    """Redact an entire pymupdf page, leaving a small unredacted margin.

    Fix: the original body re-assigned ``border = 5`` immediately after the
    parameter list, silently ignoring the caller-supplied value; that shadowing
    line is removed so the ``border`` argument now takes effect (default 5
    keeps existing behaviour).

    Args:
        rect_height: Height of the page rect in pymupdf coordinates.
        rect_width: Width of the page rect in pymupdf coordinates.
        image: PIL image of the page, used to derive image-space coordinates.
        page: The pymupdf page to redact.
        custom_colours: Passed through to redact_single_box for fill colour.
        border: Width of the margin around the page edge left unredacted.

    Returns:
        dict: An image-coordinate annotation box covering the redacted area,
        with keys xmin/ymin/xmax/ymax/color/label.
    """
    # Coordinates of the redacted region, inset by the border on every side.
    whole_page_x1, whole_page_y1 = 0 + border, 0 + border
    whole_page_x2, whole_page_y2 = rect_width - border, rect_height - border

    whole_page_image_x1, whole_page_image_y1, whole_page_image_x2, whole_page_image_y2 = convert_pymupdf_to_image_coords(page, whole_page_x1, whole_page_y1, whole_page_x2, whole_page_y2, image)

    # Rect in pymupdf coordinates used for the actual redaction.
    whole_page_rect = Rect(whole_page_x1, whole_page_y1, whole_page_x2, whole_page_y2)

    # Annotation box in image coordinates, recorded alongside other redactions.
    whole_page_img_annotation_box = {
        "xmin": whole_page_image_x1,
        "ymin": whole_page_image_y1,
        "xmax": whole_page_image_x2,
        "ymax": whole_page_image_y2,
        "color": (0, 0, 0),
        "label": "Whole page",
    }

    redact_single_box(page, whole_page_rect, whole_page_img_annotation_box, custom_colours)

    return whole_page_img_annotation_box
332
 
333
  def prepare_image_or_pdf(
334
  file_paths: List[str],
 
341
  current_loop_page_number:int=0,
342
  all_annotations_object:List = [],
343
  prepare_for_review:bool = False,
344
+ in_fully_redacted_list:List[int]=[],
345
  progress: Progress = Progress(track_tqdm=True)
346
  ) -> tuple[List[str], List[str]]:
347
  """
 
353
  Args:
354
  file_paths (List[str]): List of file paths to process.
355
  in_redact_method (str): The redaction method to use.
356
+ in_allow_list (optional, Optional[List[List[str]]]): List of allowed terms for redaction.
357
+ latest_file_completed (optional, int): Index of the last completed file.
358
+ out_message (optional, List[str]): List to store output messages.
359
+ first_loop_state (optional, bool): Flag indicating if this is the first iteration.
360
+ number_of_pages (optional, int): integer indicating the number of pages in the document
361
+ current_loop_page_number (optional, int): Current number of loop
362
+ all_annotations_object(optional, List of annotation objects): All annotations for current document
363
+ prepare_for_review(optional, bool): Is this preparation step preparing pdfs and json files to review current redactions?
364
+ in_fully_redacted_list(optional, List of int): A list of pages to fully redact
365
+ progress (optional, Progress): Progress tracker for the operation.
366
 
367
 
368
  Returns:
 
372
  tic = time.perf_counter()
373
  json_from_csv = False
374
 
375
+ if isinstance(in_fully_redacted_list, pd.DataFrame):
376
+ in_fully_redacted_list = in_fully_redacted_list.iloc[:,0].tolist()
377
+
378
  # If this is the first time around, set variables to 0/blank
379
  if first_loop_state==True:
380
  print("first_loop_state is True")
 
445
 
446
  # Loop through files to load in
447
  for file in file_paths_loop:
448
+ converted_file_path = []
449
+ image_file_path = []
450
+
451
  if isinstance(file, str):
452
  file_path = file
453
  else:
 
461
 
462
  file_extension = os.path.splitext(file_path)[1].lower()
463
 
464
+ # If a pdf, load as a pymupdf document
465
+ if is_pdf(file_path):
466
+ pymupdf_doc = pymupdf.open(file_path)
467
+
468
+ converted_file_path = file_path
469
+ image_file_paths = process_file(file_path, prepare_for_review)
470
+
471
+ # Create base version of the annotation object that doesn't have any annotations in it
472
+ if not all_annotations_object:
473
+ all_annotations_object = []
474
+
475
+ for image_path in image_file_paths:
476
+ annotation = {}
477
+ annotation["image"] = image_path
478
+
479
+ all_annotations_object.append(annotation)
480
+
481
+ #print("all_annotations_object:", all_annotations_object)
482
+
483
+
484
+ elif is_pdf_or_image(file_path): # Alternatively, if it's an image
485
+ # Convert image to a pymupdf document
486
+ pymupdf_doc = pymupdf.open() # Create a new empty document
487
+
488
+ img = Image.open(file_path) # Open the image file
489
+ rect = pymupdf.Rect(0, 0, img.width, img.height) # Create a rectangle for the image
490
+ page = pymupdf_doc.new_page(width=img.width, height=img.height) # Add a new page
491
+ page.insert_image(rect, filename=file_path) # Insert the image into the page
492
+
493
 
494
  # Check if the file is an image type and the user selected text ocr option
495
+ elif file_extension in ['.jpg', '.jpeg', '.png'] and in_redact_method == text_ocr_option:
496
  in_redact_method = tesseract_ocr_option
497
 
498
+ elif file_extension in ['.csv']:
499
  review_file_csv = read_file(file)
500
  all_annotations_object = convert_pandas_df_to_review_json(review_file_csv)
501
  json_from_csv = True
502
+ print("Converted CSV review file to json")
503
 
504
  # If the file name ends with redactions.json, assume it is an annotations object, overwrite the current variable
505
  if (file_extension in ['.json']) | (json_from_csv == True):
 
525
 
526
  # If you have an annotations object from the above code
527
  if all_annotations_object:
528
+ #print("out_annotations_object before reloading images:", all_annotations_object)
529
 
530
  # Get list of page numbers
531
  image_file_paths_pages = [
 
537
 
538
  # If PDF pages have been converted to image files, replace the current image paths in the json to this.
539
  if image_file_paths:
540
+ #print("Image file paths found")
541
+
542
+ #print("Image_file_paths:", image_file_paths)
543
+
544
+ #for i, annotation in enumerate(all_annotations_object):
545
+ for i, image_file_path in enumerate(image_file_paths):
546
 
547
+ if i < len(all_annotations_object):
548
+ annotation = all_annotations_object[i]
549
+ else:
550
+ annotation = {}
551
+ all_annotations_object.append(annotation)
552
+
553
+ #print("annotation:", annotation, "for page:", str(i))
554
+
555
+ if not annotation:
556
+ annotation = {"image":"", "boxes": []}
557
+ annotation_page_number = int(re.search(r'_(\d+)\.png$', image_file_path).group(1))
558
+
559
+ else:
560
+ annotation_page_number = int(re.search(r'_(\d+)\.png$', annotation["image"]).group(1))
561
  #print("Annotation page number:", annotation_page_number)
562
 
563
  # Check if the annotation page number exists in the image file paths pages
 
569
  else:
570
  print("Page", annotation_page_number, "image file not found.")
571
 
572
+ all_annotations_object[i] = annotation
573
+
574
+ #print("all_annotations_object at end of json/csv load part:", all_annotations_object)
575
+
576
+ # Get list of pages that are to be fully redacted and redact them
577
+ if in_fully_redacted_list:
578
+ print("Redacting whole pages")
579
+
580
+ for i, image in enumerate(image_file_paths):
581
+ page = pymupdf_doc.load_page(i)
582
+ rect_height = page.rect.height
583
+ rect_width = page.rect.width
584
+ whole_page_img_annotation_box = redact_whole_pymupdf_page(rect_height, rect_width, image, page, custom_colours = False, border = 5)
585
+
586
+ all_annotations_object.append(whole_page_img_annotation_box)
587
 
588
  # Write the response to a JSON file in output folder
589
  out_folder = output_folder + file_path_without_ext + ".json"
590
  with open(out_folder, 'w') as json_file:
591
  json.dump(all_annotations_object, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
592
  continue
 
593
 
594
+ # Must be something else, return with error message
595
  else:
 
 
596
  if in_redact_method == tesseract_ocr_option or in_redact_method == textract_option:
597
  if is_pdf_or_image(file_path) == False:
598
  out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
 
603
  if is_pdf(file_path) == False:
604
  out_message = "Please upload a PDF file for text analysis."
605
  print(out_message)
606
+ return out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object
 
 
 
607
 
 
 
 
 
 
 
608
 
609
+ converted_file_paths.append(converted_file_path)
610
+ image_file_paths.extend(image_file_path)
 
 
 
 
 
611
 
612
  toc = time.perf_counter()
613
  out_time = f"File '{file_path_without_ext}' prepared in {toc - tic:0.1f} seconds."
 
617
  out_message.append(out_time)
618
  out_message_out = '\n'.join(out_message)
619
 
620
+ #if prepare_for_review == False:
621
+ number_of_pages = len(image_file_paths)
622
+ #else:
623
+ # number_of_pages = len(all_annotations_object)
624
 
625
+ #print("all_annotations_object at end:", all_annotations_object)
626
 
627
  return out_message_out, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object
628
 
 
663
  match = re.search(r'_(\d+)\.png$', image_path)
664
  if match:
665
  number = match.group(1) # Extract the number
666
+ #print(number) # Output: 0
667
  reported_number = int(number) + 1
668
  else:
669
  print("No number found before .png")
670
 
671
+ # Check if 'boxes' is in the entry, if not, add an empty list
672
+ if 'boxes' not in entry:
673
+ entry['boxes'] = []
674
+
675
  for box in entry["boxes"]:
676
+ data_to_add = {"image": image_path, "page": reported_number, **box}
677
  #print("data_to_add:", data_to_add)
678
  flattened_data.append(data_to_add)
679
 
tools/file_redaction.py CHANGED
@@ -18,7 +18,7 @@ from pdfminer.layout import LTTextContainer, LTChar, LTTextLine, LTTextLineHoriz
18
  from pikepdf import Pdf, Dictionary, Name
19
  import pymupdf
20
  from pymupdf import Rect
21
- from fitz import Document, Page
22
  import gradio as gr
23
  from gradio import Progress
24
  from collections import defaultdict # For efficient grouping
@@ -26,7 +26,7 @@ from collections import defaultdict # For efficient grouping
26
  from presidio_analyzer import RecognizerResult
27
  from tools.aws_functions import RUN_AWS_FUNCTIONS
28
  from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult
29
- from tools.file_conversion import process_file, image_dpi, convert_review_json_to_pandas_df
30
  from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities, custom_recogniser, custom_word_list_recogniser
31
  from tools.helper_functions import get_file_path_end, output_folder, clean_unicode_text, get_or_create_env_var, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
32
  from tools.file_conversion import process_file, is_pdf, is_pdf_or_image
@@ -69,8 +69,8 @@ def choose_and_run_redactor(file_paths:List[str],
69
  chosen_redact_comprehend_entities:List[str],
70
  in_redact_method:str,
71
  in_allow_list:List[List[str]]=None,
72
- in_deny_list:List[List[str]]=None,
73
- in_fully_redacted_list:List[List[str]]=None,
74
  latest_file_completed:int=0,
75
  out_message:list=[],
76
  out_file_paths:list=[],
@@ -102,8 +102,8 @@ def choose_and_run_redactor(file_paths:List[str],
102
  - chosen_redact_comprehend_entities (List[str]): A list of entity types to redact from files, chosen from the official list from AWS Comprehend service
103
  - in_redact_method (str): The method to use for redaction.
104
  - in_allow_list (List[List[str]], optional): A list of allowed terms for redaction. Defaults to None.
105
- - in_deny_list (List[List[str]], optional): A list of allowed terms for redaction. Defaults to None.
106
- - in_fully_redacted_list (List[List[str]], optional): A list of allowed terms for redaction. Defaults to None.
107
  - latest_file_completed (int, optional): The index of the last completed file. Defaults to 0.
108
  - out_message (list, optional): A list to store output messages. Defaults to an empty list.
109
  - out_file_paths (list, optional): A list to store paths to the output files. Defaults to an empty list.
@@ -131,6 +131,15 @@ def choose_and_run_redactor(file_paths:List[str],
131
  tic = time.perf_counter()
132
  all_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []
133
 
 
 
 
 
 
 
 
 
 
134
 
135
  # If this is the first time around, set variables to 0/blank
136
  if first_loop_state==True:
@@ -296,7 +305,9 @@ def choose_and_run_redactor(file_paths:List[str],
296
  pii_identification_method,
297
  comprehend_query_number,
298
  comprehend_client,
299
- textract_client)
 
 
300
 
301
  # Save Textract request metadata (if exists)
302
  if new_request_metadata:
@@ -330,7 +341,9 @@ def choose_and_run_redactor(file_paths:List[str],
330
  pymupdf_doc,
331
  pii_identification_method,
332
  comprehend_query_number,
333
- comprehend_client)
 
 
334
 
335
  else:
336
  out_message = "No redaction method selected"
@@ -378,14 +391,19 @@ def choose_and_run_redactor(file_paths:List[str],
378
  json.dump(annotations_all_pages, f)
379
  log_files_output_paths.append(out_annotation_file_path)
380
 
381
- print("Saving annotations to CSV")
382
 
383
  # Convert json to csv and also save this
 
 
384
  review_df = convert_review_json_to_pandas_df(annotations_all_pages)
 
385
  out_review_file_file_path = out_image_file_path + '_review_file.csv'
386
  review_df.to_csv(out_review_file_file_path, index=None)
387
  out_file_paths.append(out_review_file_file_path)
388
 
 
 
389
  except Exception as e:
390
  print("Could not save annotations to json file:", e)
391
 
@@ -522,42 +540,7 @@ def convert_image_coords_to_pymupdf(pymupdf_page, annot:CustomImageRecognizerRes
522
 
523
  return x1, new_y1, x2, new_y2
524
 
525
- def convert_pymupdf_to_image_coords(pymupdf_page, x1, y1, x2, y2, image: Image):
526
- '''
527
- Converts coordinates from pymupdf format to image coordinates,
528
- accounting for mediabox dimensions.
529
- '''
530
-
531
- rect_height = pymupdf_page.rect.height
532
- rect_width = pymupdf_page.rect.width
533
-
534
- # Get mediabox dimensions
535
- mediabox = pymupdf_page.mediabox
536
- mediabox_width = mediabox.width
537
- mediabox_height = mediabox.height
538
-
539
- image_page_width, image_page_height = image.size
540
-
541
- # Calculate scaling factors using mediabox dimensions
542
- scale_width = image_page_width / mediabox_width
543
- scale_height = image_page_height / mediabox_height
544
 
545
- #print("scale_width:", scale_width)
546
- #print("scale_height:", scale_height)
547
-
548
- rect_to_mediabox_x_scale = mediabox_width / rect_width
549
- rect_to_mediabox_y_scale = mediabox_height / rect_height
550
-
551
- #print("rect_to_mediabox_x_scale:", rect_to_mediabox_x_scale)
552
- #print("rect_to_mediabox_y_scale:", rect_to_mediabox_y_scale)
553
-
554
- # Adjust coordinates based on scaling factors
555
- x1_image = (x1 * scale_width) * rect_to_mediabox_x_scale
556
- y1_image = (y1 * scale_height) * rect_to_mediabox_y_scale
557
- x2_image = (x2 * scale_width) * rect_to_mediabox_x_scale
558
- y2_image = (y2 * scale_height) * rect_to_mediabox_y_scale
559
-
560
- return x1_image, y1_image, x2_image, y2_image
561
 
562
  def convert_gradio_annotation_coords_to_pymupdf(pymupdf_page:Page, annot:dict, image:Image):
563
  '''
@@ -594,49 +577,6 @@ def move_page_info(file_path: str) -> str:
594
 
595
  return new_file_path
596
 
597
- def convert_color_to_range_0_1(color):
598
- return tuple(component / 255 for component in color)
599
-
600
- def redact_single_box(pymupdf_page:Page, pymupdf_rect:Rect, img_annotation_box:dict, custom_colours:bool=False):
601
- pymupdf_x1 = pymupdf_rect[0]
602
- pymupdf_y1 = pymupdf_rect[1]
603
- pymupdf_x2 = pymupdf_rect[2]
604
- pymupdf_y2 = pymupdf_rect[3]
605
-
606
- # Calculate area to actually remove text from the pdf (different from black box size)
607
- redact_bottom_y = pymupdf_y1 + 2
608
- redact_top_y = pymupdf_y2 - 2
609
-
610
- # Calculate the middle y value and set a small height if default values are too close together
611
- if (redact_top_y - redact_bottom_y) < 1:
612
- middle_y = (pymupdf_y1 + pymupdf_y2) / 2
613
- redact_bottom_y = middle_y - 1
614
- redact_top_y = middle_y + 1
615
-
616
- #print("Rect:", rect)
617
-
618
- rect_small_pixel_height = Rect(pymupdf_x1, redact_bottom_y, pymupdf_x2, redact_top_y) # Slightly smaller than outside box
619
-
620
- # Add the annotation to the middle of the character line, so that it doesn't delete text from adjacent lines
621
- #page.add_redact_annot(rect)#rect_small_pixel_height)
622
- pymupdf_page.add_redact_annot(rect_small_pixel_height)
623
-
624
- # Set up drawing a black box over the whole rect
625
- shape = pymupdf_page.new_shape()
626
- shape.draw_rect(pymupdf_rect)
627
-
628
- if custom_colours == True:
629
- if img_annotation_box["color"][0] > 1:
630
- out_colour = convert_color_to_range_0_1(img_annotation_box["color"])
631
- else:
632
- out_colour = img_annotation_box["color"]
633
- else:
634
- out_colour = (0,0,0)
635
-
636
- shape.finish(color=out_colour, fill=out_colour) # Black fill for the rectangle
637
- #shape.finish(color=(0, 0, 0)) # Black fill for the rectangle
638
- shape.commit()
639
-
640
  def redact_page_with_pymupdf(page:Page, annotations_on_page:dict, image:Image.Image=None, custom_colours:bool=False, redact_whole_page:bool=False):
641
 
642
  mediabox_height = page.mediabox[3] - page.mediabox[1]
@@ -732,28 +672,31 @@ def redact_page_with_pymupdf(page:Page, annotations_on_page:dict, image:Image.Im
732
 
733
  # If whole page is to be redacted, do that here
734
  if redact_whole_page == True:
735
- # Small border to page that remains white
736
- border = 5
737
- # Define the coordinates for the Rect
738
- whole_page_x1, whole_page_y1 = 0 + border, 0 + border # Bottom-left corner
739
- whole_page_x2, whole_page_y2 = rect_width - border, rect_height - border # Top-right corner
 
 
740
 
741
- whole_page_image_x1, whole_page_image_y1, whole_page_image_x2, whole_page_image_y2 = convert_pymupdf_to_image_coords(page, whole_page_x1, whole_page_y1, whole_page_x2, whole_page_y2, image)
 
742
 
743
- # Create new image annotation element based on whole page coordinates
744
- whole_page_rect = Rect(whole_page_x1, whole_page_y1, whole_page_x2, whole_page_y2)
 
 
 
 
 
 
745
 
746
- # Write whole page annotation to annotation boxes
747
- whole_page_img_annotation_box = {}
748
- whole_page_img_annotation_box["xmin"] = whole_page_image_x1
749
- whole_page_img_annotation_box["ymin"] = whole_page_image_y1
750
- whole_page_img_annotation_box["xmax"] = whole_page_image_x2
751
- whole_page_img_annotation_box["ymax"] = whole_page_image_y2
752
- whole_page_img_annotation_box["color"] = (0,0,0)
753
- whole_page_img_annotation_box["label"] = "Whole page"
754
 
755
- redact_single_box(page, whole_page_rect, whole_page_img_annotation_box, custom_colours)
756
 
 
757
  all_image_annotation_boxes.append(whole_page_img_annotation_box)
758
 
759
  out_annotation_boxes = {
@@ -1058,11 +1001,20 @@ def redact_image_pdf(file_path:str,
1058
  comprehend_query_number_new = 0
1059
 
1060
  # Update custom word list analyser object with any new words that have been added to the custom deny list
1061
- if custom_recogniser_word_list:
 
1062
  nlp_analyser.registry.remove_recognizer("CUSTOM")
1063
  new_custom_recogniser = custom_word_list_recogniser(custom_recogniser_word_list)
 
1064
  nlp_analyser.registry.add_recognizer(new_custom_recogniser)
1065
 
 
 
 
 
 
 
 
1066
 
1067
  image_analyser = CustomImageAnalyzerEngine(nlp_analyser)
1068
 
@@ -1315,9 +1267,15 @@ def redact_image_pdf(file_path:str,
1315
 
1316
  image_annotations = {"image": file_path, "boxes": all_image_annotations_boxes}
1317
 
1318
- ## Apply annotations with pymupdf
1319
  else:
1320
- pymupdf_page, image_annotations = redact_page_with_pymupdf(pymupdf_page, merged_redaction_bboxes, image)
 
 
 
 
 
 
1321
 
1322
  # Convert decision process to table
1323
  decision_process_table = pd.DataFrame([{
@@ -1811,11 +1769,20 @@ def redact_text_pdf(
1811
  return pymupdf_doc, all_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number
1812
 
1813
  # Update custom word list analyser object with any new words that have been added to the custom deny list
1814
- if custom_recogniser_word_list:
 
1815
  nlp_analyser.registry.remove_recognizer("CUSTOM")
1816
  new_custom_recogniser = custom_word_list_recogniser(custom_recogniser_word_list)
 
1817
  nlp_analyser.registry.add_recognizer(new_custom_recogniser)
1818
 
 
 
 
 
 
 
 
1819
  tic = time.perf_counter()
1820
 
1821
  # Open with Pikepdf to get text lines
@@ -1903,6 +1870,7 @@ def redact_text_pdf(
1903
  for i, text_line in enumerate(line_level_text_results_list):
1904
  if chosen_redact_entities:
1905
  if pii_identification_method == "Local":
 
1906
  # Process immediately for local analysis
1907
  text_line_analyser_result = nlp_analyser.analyze(
1908
  text=text_line.text,
@@ -2024,7 +1992,13 @@ def redact_text_pdf(
2024
  annotations_on_page = create_annotations_for_bounding_boxes(page_analysed_bounding_boxes)
2025
 
2026
  # Make pymupdf page redactions
2027
- pymupdf_page, image_annotations = redact_page_with_pymupdf(pymupdf_page, annotations_on_page, image)
 
 
 
 
 
 
2028
 
2029
  #print("Did redact_page_with_pymupdf function")
2030
  reported_page_no = page_no + 1
 
18
  from pikepdf import Pdf, Dictionary, Name
19
  import pymupdf
20
  from pymupdf import Rect
21
+ from fitz import Page
22
  import gradio as gr
23
  from gradio import Progress
24
  from collections import defaultdict # For efficient grouping
 
26
  from presidio_analyzer import RecognizerResult
27
  from tools.aws_functions import RUN_AWS_FUNCTIONS
28
  from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult
29
+ from tools.file_conversion import process_file, image_dpi, convert_review_json_to_pandas_df, redact_whole_pymupdf_page, redact_single_box, convert_pymupdf_to_image_coords
30
  from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities, custom_recogniser, custom_word_list_recogniser
31
  from tools.helper_functions import get_file_path_end, output_folder, clean_unicode_text, get_or_create_env_var, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
32
  from tools.file_conversion import process_file, is_pdf, is_pdf_or_image
 
69
  chosen_redact_comprehend_entities:List[str],
70
  in_redact_method:str,
71
  in_allow_list:List[List[str]]=None,
72
+ custom_recogniser_word_list:List[str]=None,
73
+ redact_whole_page_list:List[str]=None,
74
  latest_file_completed:int=0,
75
  out_message:list=[],
76
  out_file_paths:list=[],
 
102
  - chosen_redact_comprehend_entities (List[str]): A list of entity types to redact from files, chosen from the official list from AWS Comprehend service
103
  - in_redact_method (str): The method to use for redaction.
104
  - in_allow_list (List[List[str]], optional): A list of allowed terms for redaction. Defaults to None.
105
+ - custom_recogniser_word_list (List[List[str]], optional): A list of allowed terms for redaction. Defaults to None.
106
+ - redact_whole_page_list (List[List[str]], optional): A list of allowed terms for redaction. Defaults to None.
107
  - latest_file_completed (int, optional): The index of the last completed file. Defaults to 0.
108
  - out_message (list, optional): A list to store output messages. Defaults to an empty list.
109
  - out_file_paths (list, optional): A list to store paths to the output files. Defaults to an empty list.
 
131
  tic = time.perf_counter()
132
  all_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []
133
 
134
+ if isinstance(custom_recogniser_word_list, pd.DataFrame):
135
+ custom_recogniser_word_list = custom_recogniser_word_list.iloc[:,0].tolist()
136
+
137
+ # Sort the strings in order from the longest string to the shortest
138
+ custom_recogniser_word_list = sorted(custom_recogniser_word_list, key=len, reverse=True)
139
+
140
+ if isinstance(redact_whole_page_list, pd.DataFrame):
141
+ redact_whole_page_list = redact_whole_page_list.iloc[:,0].tolist()
142
+
143
 
144
  # If this is the first time around, set variables to 0/blank
145
  if first_loop_state==True:
 
305
  pii_identification_method,
306
  comprehend_query_number,
307
  comprehend_client,
308
+ textract_client,
309
+ custom_recogniser_word_list,
310
+ redact_whole_page_list)
311
 
312
  # Save Textract request metadata (if exists)
313
  if new_request_metadata:
 
341
  pymupdf_doc,
342
  pii_identification_method,
343
  comprehend_query_number,
344
+ comprehend_client,
345
+ custom_recogniser_word_list,
346
+ redact_whole_page_list)
347
 
348
  else:
349
  out_message = "No redaction method selected"
 
391
  json.dump(annotations_all_pages, f)
392
  log_files_output_paths.append(out_annotation_file_path)
393
 
394
+ #print("Saving annotations to CSV")
395
 
396
  # Convert json to csv and also save this
397
+ #print("annotations_all_pages:", annotations_all_pages)
398
+
399
  review_df = convert_review_json_to_pandas_df(annotations_all_pages)
400
+
401
  out_review_file_file_path = out_image_file_path + '_review_file.csv'
402
  review_df.to_csv(out_review_file_file_path, index=None)
403
  out_file_paths.append(out_review_file_file_path)
404
 
405
+ print("Saved review file to csv")
406
+
407
  except Exception as e:
408
  print("Could not save annotations to json file:", e)
409
 
 
540
 
541
  return x1, new_y1, x2, new_y2
542
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
543
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
544
 
545
  def convert_gradio_annotation_coords_to_pymupdf(pymupdf_page:Page, annot:dict, image:Image):
546
  '''
 
577
 
578
  return new_file_path
579
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
580
  def redact_page_with_pymupdf(page:Page, annotations_on_page:dict, image:Image.Image=None, custom_colours:bool=False, redact_whole_page:bool=False):
581
 
582
  mediabox_height = page.mediabox[3] - page.mediabox[1]
 
672
 
673
  # If whole page is to be redacted, do that here
674
  if redact_whole_page == True:
675
+ # # Small border to page that remains white
676
+ # border = 5
677
+ # # Define the coordinates for the Rect
678
+ # whole_page_x1, whole_page_y1 = 0 + border, 0 + border # Bottom-left corner
679
+ # whole_page_x2, whole_page_y2 = rect_width - border, rect_height - border # Top-right corner
680
+
681
+ # whole_page_image_x1, whole_page_image_y1, whole_page_image_x2, whole_page_image_y2 = convert_pymupdf_to_image_coords(page, whole_page_x1, whole_page_y1, whole_page_x2, whole_page_y2, image)
682
 
683
+ # # Create new image annotation element based on whole page coordinates
684
+ # whole_page_rect = Rect(whole_page_x1, whole_page_y1, whole_page_x2, whole_page_y2)
685
 
686
+ # # Write whole page annotation to annotation boxes
687
+ # whole_page_img_annotation_box = {}
688
+ # whole_page_img_annotation_box["xmin"] = whole_page_image_x1
689
+ # whole_page_img_annotation_box["ymin"] = whole_page_image_y1
690
+ # whole_page_img_annotation_box["xmax"] = whole_page_image_x2
691
+ # whole_page_img_annotation_box["ymax"] = whole_page_image_y2
692
+ # whole_page_img_annotation_box["color"] = (0,0,0)
693
+ # whole_page_img_annotation_box["label"] = "Whole page"
694
 
695
+ # redact_single_box(page, whole_page_rect, whole_page_img_annotation_box, custom_colours)
 
 
 
 
 
 
 
696
 
697
+ # all_image_annotation_boxes.append(whole_page_img_annotation_box)
698
 
699
+ whole_page_img_annotation_box = redact_whole_pymupdf_page(rect_height, rect_width, image, page, custom_colours, border = 5)
700
  all_image_annotation_boxes.append(whole_page_img_annotation_box)
701
 
702
  out_annotation_boxes = {
 
1001
  comprehend_query_number_new = 0
1002
 
1003
  # Update custom word list analyser object with any new words that have been added to the custom deny list
1004
+ #print("custom_recogniser_word_list:", custom_recogniser_word_list)
1005
+ if custom_recogniser_word_list:
1006
  nlp_analyser.registry.remove_recognizer("CUSTOM")
1007
  new_custom_recogniser = custom_word_list_recogniser(custom_recogniser_word_list)
1008
+ #print("new_custom_recogniser:", new_custom_recogniser)
1009
  nlp_analyser.registry.add_recognizer(new_custom_recogniser)
1010
 
1011
+ # List all elements currently in the nlp_analyser registry
1012
+ #print("Current recognizers in nlp_analyser registry:")
1013
+ for recognizer_name in nlp_analyser.registry.recognizers:
1014
+ print(recognizer_name)
1015
+
1016
+
1017
+
1018
 
1019
  image_analyser = CustomImageAnalyzerEngine(nlp_analyser)
1020
 
 
1267
 
1268
  image_annotations = {"image": file_path, "boxes": all_image_annotations_boxes}
1269
 
1270
+ ## Apply annotations with pymupdf
1271
  else:
1272
+ #print("redact_whole_page_list:", redact_whole_page_list)
1273
+ if redact_whole_page_list:
1274
+ if current_loop_page in redact_whole_page_list: redact_whole_page = True
1275
+ else: redact_whole_page = False
1276
+ else: redact_whole_page = False
1277
+
1278
+ pymupdf_page, image_annotations = redact_page_with_pymupdf(pymupdf_page, merged_redaction_bboxes, image, redact_whole_page=redact_whole_page)
1279
 
1280
  # Convert decision process to table
1281
  decision_process_table = pd.DataFrame([{
 
1769
  return pymupdf_doc, all_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number
1770
 
1771
  # Update custom word list analyser object with any new words that have been added to the custom deny list
1772
+ #print("custom_recogniser_word_list:", custom_recogniser_word_list)
1773
+ if custom_recogniser_word_list:
1774
  nlp_analyser.registry.remove_recognizer("CUSTOM")
1775
  new_custom_recogniser = custom_word_list_recogniser(custom_recogniser_word_list)
1776
+ #print("new_custom_recogniser:", new_custom_recogniser)
1777
  nlp_analyser.registry.add_recognizer(new_custom_recogniser)
1778
 
1779
+ # List all elements currently in the nlp_analyser registry
1780
+ #print("Current recognizers in nlp_analyser registry:")
1781
+ #for recognizer_name in nlp_analyser.registry.recognizers:
1782
+ # print(recognizer_name)
1783
+
1784
+ #print("Custom recogniser:", nlp_analyser.registry.)
1785
+
1786
  tic = time.perf_counter()
1787
 
1788
  # Open with Pikepdf to get text lines
 
1870
  for i, text_line in enumerate(line_level_text_results_list):
1871
  if chosen_redact_entities:
1872
  if pii_identification_method == "Local":
1873
+
1874
  # Process immediately for local analysis
1875
  text_line_analyser_result = nlp_analyser.analyze(
1876
  text=text_line.text,
 
1992
  annotations_on_page = create_annotations_for_bounding_boxes(page_analysed_bounding_boxes)
1993
 
1994
  # Make pymupdf page redactions
1995
+ #print("redact_whole_page_list:", redact_whole_page_list)
1996
+ if redact_whole_page_list:
1997
+ if current_loop_page in redact_whole_page_list: redact_whole_page = True
1998
+ else: redact_whole_page = False
1999
+ else: redact_whole_page = False
2000
+
2001
+ pymupdf_page, image_annotations = redact_page_with_pymupdf(pymupdf_page, annotations_on_page, image, redact_whole_page=redact_whole_page)
2002
 
2003
  #print("Did redact_page_with_pymupdf function")
2004
  reported_page_no = page_no + 1
tools/helper_functions.py CHANGED
@@ -112,8 +112,6 @@ def custom_regex_load(in_file:List[str], file_type:str = "Allow list"):
112
  custom_regex = pd.DataFrame()
113
 
114
  if in_file:
115
- print("File type:", file_type)
116
-
117
  file_list = [string.name for string in in_file]
118
 
119
  regex_file_names = [string for string in file_list if "csv" in string.lower()]
 
112
  custom_regex = pd.DataFrame()
113
 
114
  if in_file:
 
 
115
  file_list = [string.name for string in in_file]
116
 
117
  regex_file_names = [string for string in file_list if "csv" in string.lower()]
tools/load_spacy_model_custom_recognisers.py CHANGED
@@ -28,8 +28,10 @@ except:
28
  def custom_word_list_recogniser(custom_list:List[str]=[]):
29
  custom_regex = '\\b' + '\\b|\\b'.join(rf"{re.escape(term)}" for term in custom_list) + '\\b'
30
  custom_pattern = Pattern(name="custom_pattern",regex=custom_regex, score = 1)
 
 
31
  custom_recogniser = PatternRecognizer(supported_entity="CUSTOM", name="CUSTOM", patterns = [custom_pattern],
32
- global_regex_flags=re.DOTALL | re.MULTILINE)
33
 
34
  return custom_recogniser
35
 
 
28
  def custom_word_list_recogniser(custom_list:List[str]=[]):
29
  custom_regex = '\\b' + '\\b|\\b'.join(rf"{re.escape(term)}" for term in custom_list) + '\\b'
30
  custom_pattern = Pattern(name="custom_pattern",regex=custom_regex, score = 1)
31
+
32
+ #print("custom_pattern:", custom_pattern)
33
  custom_recogniser = PatternRecognizer(supported_entity="CUSTOM", name="CUSTOM", patterns = [custom_pattern],
34
+ global_regex_flags=re.DOTALL | re.MULTILINE | re.IGNORECASE)
35
 
36
  return custom_recogniser
37
 
tools/redaction_review.py CHANGED
@@ -49,30 +49,66 @@ def update_zoom(current_zoom_level:int, annotate_current_page:int, decrease:bool
49
 
50
  return current_zoom_level, annotate_current_page
51
 
52
- def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, zoom:int=100):
53
  '''
54
  Update a gradio_image_annotation object with new annotation data
55
  '''
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
 
57
  zoom_str = str(zoom) + '%'
58
 
59
  if not image_annotator_object:
 
 
60
  out_image_annotator = image_annotator(
61
- label="Modify redaction boxes",
 
 
62
  #label_list=["Redaction"],
63
  #label_colors=[(0, 0, 0)],
 
64
  height=zoom_str,
65
  width=zoom_str,
66
- show_label=False,
67
- sources=None,
 
 
68
  show_clear_button=False,
69
  show_share_button=False,
70
  show_remove_button=False,
71
- interactive=False)
72
-
73
- number_reported = gr.Number(label = "Page (press enter to change)", value=1, precision=0)
 
74
 
75
- return out_image_annotator, number_reported, number_reported, page_num_reported
76
 
77
  #print("page_num at start of update_annotator function:", page_num)
78
 
@@ -95,6 +131,28 @@ def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, zo
95
  page_num_reported = page_max_reported
96
 
97
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
  out_image_annotator = image_annotator(
99
  value = image_annotator_object[page_num_reported - 1],
100
  boxes_alpha=0.1,
@@ -117,7 +175,7 @@ def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, zo
117
 
118
  number_reported = gr.Number(label = "Page (press enter to change)", value=page_num_reported, precision=0)
119
 
120
- return out_image_annotator, number_reported, number_reported, page_num_reported
121
 
122
  def modify_existing_page_redactions(image_annotated:AnnotatedImageData, current_page:int, previous_page:int, all_image_annotations:List[AnnotatedImageData], clear_all:bool=False):
123
  '''
@@ -149,6 +207,8 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], d
149
  output_files = []
150
  output_log_files = []
151
 
 
 
152
  image_annotated['image'] = all_image_annotations[current_page - 1]["image"]
153
 
154
  all_image_annotations[current_page - 1] = image_annotated
@@ -252,32 +312,19 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], d
252
 
253
  return doc, all_image_annotations, output_files, output_log_files
254
 
255
- def crop(annotations:AnnotatedImageData):
256
- if annotations["boxes"]:
257
- box = annotations["boxes"][0]
258
- return annotations["image"][
259
- box["ymin"]:box["ymax"],
260
- box["xmin"]:box["xmax"]
261
- ]
262
- return None
263
-
264
  def get_boxes_json(annotations:AnnotatedImageData):
265
  return annotations["boxes"]
266
- # Group the DataFrame by the 'image' column
267
- grouped = df.groupby('image')
268
 
269
- # Create a list to hold the JSON data
270
- json_data = []
271
-
272
- # Iterate over each group
273
- for image_path, group in grouped:
274
- # Convert each group to a list of box dictionaries
275
- boxes = group.drop(columns='image').to_dict(orient='records')
276
-
277
- # Append the structured data to the json_data list
278
- json_data.append({
279
- "image": image_path,
280
- "boxes": boxes
281
- })
282
 
283
- return json_data
 
49
 
50
  return current_zoom_level, annotate_current_page
51
 
52
+ def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, recogniser_entities_drop=gr.Dropdown(value="ALL", allow_custom_value=True), recogniser_dataframe_gr=gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]})), zoom:int=80):
53
  '''
54
  Update a gradio_image_annotation object with new annotation data
55
  '''
56
+ recogniser_entities = []
57
+ recogniser_dataframe = pd.DataFrame()
58
+ #recogniser_entities_drop = gr.Dropdown(value="ALL", allow_custom_value=True)
59
+ #recogniser_dataframe_gr = gr.Dataframe(pd.DataFrame(data={"page":[""], "label":[""]}))
60
+
61
+ #print("recogniser_dataframe_gr", recogniser_dataframe_gr)
62
+ #print("recogniser_dataframe_gr shape", recogniser_dataframe_gr.shape)
63
+ #print("recogniser_dataframe_gr.iloc[0,0]:", recogniser_dataframe_gr.iloc[0,0])
64
+
65
+ if recogniser_dataframe_gr.iloc[0,0] == "":
66
+ try:
67
+ review_dataframe = convert_review_json_to_pandas_df(image_annotator_object)[["page", "label"]]
68
+ #print("review_dataframe['label']", review_dataframe["label"])
69
+ recogniser_entities = review_dataframe["label"].unique().tolist()
70
+ recogniser_entities.append("ALL")
71
+
72
+ #print("recogniser_entities:", recogniser_entities)
73
+
74
+ recogniser_dataframe_out = gr.Dataframe(review_dataframe)
75
+ recogniser_dataframe_gr = gr.Dataframe(review_dataframe)
76
+ recogniser_entities_drop = gr.Dropdown(value=recogniser_entities[0], choices=recogniser_entities, allow_custom_value=True, interactive=True)
77
+ except Exception as e:
78
+ print("Could not extract recogniser information:", e)
79
+
80
+ else:
81
+ review_dataframe = update_entities_df(recogniser_entities_drop, recogniser_dataframe_gr)
82
+ recogniser_dataframe_out = gr.Dataframe(review_dataframe)
83
+
84
 
85
  zoom_str = str(zoom) + '%'
86
 
87
  if not image_annotator_object:
88
+ page_num_reported = 1
89
+
90
  out_image_annotator = image_annotator(
91
+ image_annotator_object[page_num_reported - 1],
92
+ boxes_alpha=0.1,
93
+ box_thickness=1,
94
  #label_list=["Redaction"],
95
  #label_colors=[(0, 0, 0)],
96
+ show_label=False,
97
  height=zoom_str,
98
  width=zoom_str,
99
+ box_min_size=1,
100
+ box_selected_thickness=2,
101
+ handle_size=4,
102
+ sources=None,#["upload"],
103
  show_clear_button=False,
104
  show_share_button=False,
105
  show_remove_button=False,
106
+ handles_cursor=True,
107
+ interactive=True
108
+ )
109
+ number_reported = gr.Number(label = "Page (press enter to change)", value=page_num_reported, precision=0)
110
 
111
+ return out_image_annotator, number_reported, number_reported, page_num_reported, recogniser_entities_drop, recogniser_dataframe_out, recogniser_dataframe_gr
112
 
113
  #print("page_num at start of update_annotator function:", page_num)
114
 
 
131
  page_num_reported = page_max_reported
132
 
133
 
134
+
135
+ # Remove duplicate elements that are blank
136
+ def remove_duplicate_images_with_blank_boxes(data: List[AnnotatedImageData]) -> List[AnnotatedImageData]:
137
+ seen_images = set()
138
+ filtered_data = []
139
+
140
+ for item in data:
141
+ # Check if 'image' is unique
142
+ if item['image'] not in seen_images:
143
+ filtered_data.append(item)
144
+ seen_images.add(item['image'])
145
+ # If 'boxes' is empty but 'image' is unique, keep the entry
146
+ elif item['boxes']:
147
+ filtered_data.append(item)
148
+
149
+ return filtered_data
150
+
151
+ image_annotator_object = remove_duplicate_images_with_blank_boxes(image_annotator_object)
152
+
153
+ #print("image_annotator_object in update_annotator:", image_annotator_object)
154
+ #print("image_annotator_object[page_num_reported - 1]:", image_annotator_object[page_num_reported - 1])
155
+
156
  out_image_annotator = image_annotator(
157
  value = image_annotator_object[page_num_reported - 1],
158
  boxes_alpha=0.1,
 
175
 
176
  number_reported = gr.Number(label = "Page (press enter to change)", value=page_num_reported, precision=0)
177
 
178
+ return out_image_annotator, number_reported, number_reported, page_num_reported, recogniser_entities_drop, recogniser_dataframe_out, recogniser_dataframe_gr
179
 
180
  def modify_existing_page_redactions(image_annotated:AnnotatedImageData, current_page:int, previous_page:int, all_image_annotations:List[AnnotatedImageData], clear_all:bool=False):
181
  '''
 
207
  output_files = []
208
  output_log_files = []
209
 
210
+ #print("File paths in apply_redactions:", file_paths)
211
+
212
  image_annotated['image'] = all_image_annotations[current_page - 1]["image"]
213
 
214
  all_image_annotations[current_page - 1] = image_annotated
 
312
 
313
  return doc, all_image_annotations, output_files, output_log_files
314
 
 
 
 
 
 
 
 
 
 
315
  def get_boxes_json(annotations:AnnotatedImageData):
316
  return annotations["boxes"]
 
 
317
 
318
+ def update_entities_df(choice:str, df:pd.DataFrame):
319
+ if choice=="ALL":
320
+ return df
321
+ else:
322
+ return df.loc[df["label"]==choice,:]
323
+
324
+ def df_select_callback(df: pd.DataFrame, evt: gr.SelectData):
325
+ #print("index", evt.index)
326
+ #print("value", evt.value)
327
+ #print("row_value", evt.row_value)
328
+ row_value_page = evt.row_value[0] # This is the page number value
329
+ return row_value_page
 
330