Commit
·
1d772de
1
Parent(s):
a770956
Refactor redaction functionality and enhance UI components: add support for custom recognizers and whole-page redaction options; update file handling to include new dropdowns for entity selection and improved dataframes for entity management; enhance the annotator with better state management and UI responsiveness; clean up redundant code and improve overall performance of the redaction process.
Browse files
- app.py +75 -42
- tools/file_conversion.py +216 -47
- tools/file_redaction.py +83 -109
- tools/helper_functions.py +0 -2
- tools/load_spacy_model_custom_recognisers.py +3 -1
- tools/redaction_review.py +81 -34
app.py
CHANGED
@@ -13,7 +13,7 @@ from tools.helper_functions import ensure_output_folder_exists, add_folder_to_pa
|
|
13 |
from tools.aws_functions import upload_file_to_s3, download_file_from_s3, RUN_AWS_FUNCTIONS, bucket_name
|
14 |
from tools.file_redaction import choose_and_run_redactor
|
15 |
from tools.file_conversion import prepare_image_or_pdf, get_input_file_names
|
16 |
-
from tools.redaction_review import apply_redactions, modify_existing_page_redactions, decrease_page, increase_page, update_annotator, update_zoom
|
17 |
from tools.data_anonymise import anonymise_data_files
|
18 |
from tools.auth import authenticate_user
|
19 |
from tools.load_spacy_model_custom_recognisers import custom_entities
|
@@ -34,7 +34,7 @@ full_comprehend_entity_list = ['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBI
|
|
34 |
chosen_comprehend_entities.extend(custom_entities)
|
35 |
full_comprehend_entity_list.extend(custom_entities)
|
36 |
|
37 |
-
chosen_redact_entities = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE"]
|
38 |
|
39 |
full_entity_list = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", 'CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'IBAN_CODE', 'IP_ADDRESS', 'NRP', 'LOCATION', 'MEDICAL_LICENSE', 'URL', 'UK_NHS', 'CUSTOM']
|
40 |
|
@@ -67,8 +67,6 @@ with app:
|
|
67 |
all_line_level_ocr_results_df_state = gr.State(pd.DataFrame())
|
68 |
all_decision_process_table_state = gr.State(pd.DataFrame())
|
69 |
|
70 |
-
|
71 |
-
|
72 |
session_hash_state = gr.State()
|
73 |
s3_output_folder_state = gr.State()
|
74 |
|
@@ -101,7 +99,12 @@ with app:
|
|
101 |
doc_full_file_name_textbox = gr.Textbox(label = "doc_full_file_name_textbox", value="", visible=False)
|
102 |
doc_file_name_no_extension_textbox = gr.Textbox(label = "doc_full_file_name_textbox", value="", visible=False)
|
103 |
doc_file_name_with_extension_textbox = gr.Textbox(label = "doc_file_name_with_extension_textbox", value="", visible=False)
|
104 |
-
|
|
|
|
|
|
|
|
|
|
|
105 |
|
106 |
estimated_time_taken_number = gr.Number(label = "estimated_time_taken_number", value=0.0, precision=1, visible=False) # This keeps track of the time taken to redact files for logging purposes.
|
107 |
annotate_previous_page = gr.Number(value=0, label="Previous page", precision=0, visible=False) # Keeps track of the last page that the annotator was on
|
@@ -124,12 +127,12 @@ with app:
|
|
124 |
|
125 |
default_deny_list_file_name = "default_deny_list.csv"
|
126 |
default_deny_list_loc = output_folder + "/" + default_deny_list_file_name
|
127 |
-
in_deny_list_state = gr.State(
|
128 |
in_deny_list_text_in = gr.Textbox(value="Deny list", visible=False)
|
129 |
|
130 |
fully_redacted_list_file_name = "default_fully_redacted_list.csv"
|
131 |
fully_redacted_list_loc = output_folder + "/" + fully_redacted_list_file_name
|
132 |
-
in_fully_redacted_list_state = gr.State(
|
133 |
in_fully_redacted_text_in = gr.Textbox(value="Fully redacted page list", visible=False)
|
134 |
|
135 |
# S3 settings for default allow list load
|
@@ -137,6 +140,9 @@ with app:
|
|
137 |
s3_default_allow_list_file = gr.Textbox(label = "Default allow list file", value=default_allow_list_file_name, visible=False)
|
138 |
default_allow_list_output_folder_location = gr.Textbox(label = "Output default allow list location", value=default_allow_list_loc, visible=False)
|
139 |
|
|
|
|
|
|
|
140 |
###
|
141 |
# UI DESIGN
|
142 |
###
|
@@ -146,7 +152,9 @@ with app:
|
|
146 |
|
147 |
Redact personally identifiable information (PII) from documents (pdf, images), open text, or tabular data (xlsx/csv/parquet). Documents/images can be redacted using 'Quick' image analysis that works fine for typed text, but not handwriting/signatures. On the Redaction settings tab, choose 'Complex image analysis' OCR using AWS Textract (if you are using AWS) to redact these more complex elements (this service has a cost). Addtionally you can choose the method for PII identification. 'Local' gives quick, lower quality results, AWS Comprehend gives better results but has a cost.
|
148 |
|
149 |
-
Review suggested redactions on the 'Review redactions' tab using a point and click visual interface.
|
|
|
|
|
150 |
|
151 |
NOTE: In testing the app seems to find about 60% of personal information on a given (typed) page of text. It is essential that all outputs are checked **by a human** to ensure that all personal information has been removed.""")
|
152 |
|
@@ -183,7 +191,7 @@ with app:
|
|
183 |
# Object annotation
|
184 |
with gr.Tab("Review redactions", id="tab_object_annotation"):
|
185 |
|
186 |
-
with gr.Accordion(label = "Review
|
187 |
output_review_files = gr.File(label="Review output files", file_count='multiple')
|
188 |
upload_previous_review_file_btn = gr.Button("Review previously created redaction file (upload original PDF and ...review_file.csv)")
|
189 |
|
@@ -200,17 +208,35 @@ with app:
|
|
200 |
|
201 |
annotation_button_apply = gr.Button("Apply revised redactions", variant="primary")
|
202 |
|
203 |
-
|
204 |
-
|
205 |
-
|
206 |
-
|
207 |
-
|
208 |
-
|
209 |
-
|
210 |
-
|
211 |
-
|
212 |
-
|
213 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
214 |
|
215 |
with gr.Row():
|
216 |
annotation_last_page_button_bottom = gr.Button("Previous page", scale = 3)
|
@@ -290,36 +316,41 @@ with app:
|
|
290 |
###
|
291 |
# PDF/IMAGE REDACTION
|
292 |
###
|
293 |
-
in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox])
|
294 |
|
295 |
document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator]).\
|
296 |
then(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, first_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state], api_name="prepare_doc").\
|
297 |
then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
|
298 |
outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number], api_name="redact_doc").\
|
299 |
-
then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page])
|
300 |
|
301 |
# If the app has completed a batch of pages, it will run this until the end of all pages in the document
|
302 |
current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
|
303 |
outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number]).\
|
304 |
-
then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page])
|
305 |
|
306 |
# If a file has been completed, the function will continue onto the next document
|
307 |
-
latest_file_completed_text.change(fn=update_annotator, inputs=[all_image_annotations_state, page_min, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page]).\
|
308 |
then(fn=reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title])
|
309 |
|
310 |
###
|
311 |
# REVIEW PDF REDACTIONS
|
312 |
###
|
313 |
|
|
|
|
|
|
|
|
|
|
|
314 |
# Page controls at top
|
315 |
annotate_current_page.submit(
|
316 |
modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
|
317 |
-
then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page])
|
318 |
|
319 |
annotation_last_page_button.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
|
320 |
-
then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page])
|
321 |
annotation_next_page_button.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
|
322 |
-
then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page])
|
323 |
|
324 |
# Zoom in and out on annotator
|
325 |
annotate_zoom_in.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
|
@@ -328,10 +359,10 @@ with app:
|
|
328 |
annotate_zoom_out.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
|
329 |
then(update_zoom, inputs=[annotator_zoom_number, annotate_current_page, zoom_false_bool], outputs=[annotator_zoom_number, annotate_current_page])
|
330 |
|
331 |
-
annotator_zoom_number.change(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page])
|
332 |
|
333 |
clear_all_redactions_on_page_btn.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, clear_all_page_redactions], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
|
334 |
-
then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page])
|
335 |
|
336 |
#annotation_button_get.click(get_boxes_json, annotator, json_boxes)
|
337 |
annotation_button_apply.click(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output], scroll_to_output=True)
|
@@ -339,23 +370,26 @@ with app:
|
|
339 |
# Page controls at bottom
|
340 |
annotate_current_page_bottom.submit(
|
341 |
modify_existing_page_redactions, inputs = [annotator, annotate_current_page_bottom, annotate_previous_page, all_image_annotations_state], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page]).\
|
342 |
-
then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page])
|
343 |
|
344 |
annotation_last_page_button_bottom.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
|
345 |
-
then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page])
|
346 |
annotation_next_page_button_bottom.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
|
347 |
-
then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
348 |
|
349 |
-
# Upload previous files for modifying redactions
|
350 |
-
upload_previous_review_file_btn.click(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox]).\
|
351 |
-
then(fn = prepare_image_or_pdf, inputs=[output_review_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state, prepare_for_review_bool], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state]).\
|
352 |
-
then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page])
|
353 |
|
354 |
###
|
355 |
# TABULAR DATA REDACTION
|
356 |
###
|
357 |
in_data_files.upload(fn=put_columns_in_df, inputs=[in_data_files], outputs=[in_colnames, in_excel_sheets]).\
|
358 |
-
then(fn=get_input_file_names, inputs=[in_data_files], outputs=[
|
359 |
|
360 |
tabular_data_redact_btn.click(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, first_loop_state], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state], api_name="redact_data")
|
361 |
|
@@ -370,7 +404,6 @@ with app:
|
|
370 |
in_allow_list.change(fn=custom_regex_load, inputs=[in_allow_list], outputs=[in_allow_list_text, in_allow_list_state])
|
371 |
in_deny_list.change(fn=custom_regex_load, inputs=[in_deny_list, in_deny_list_text_in], outputs=[in_deny_list_text, in_deny_list_state])
|
372 |
in_fully_redacted_list.change(fn=custom_regex_load, inputs=[in_fully_redacted_list, in_fully_redacted_text_in], outputs=[in_fully_redacted_list_text, in_fully_redacted_list_state])
|
373 |
-
|
374 |
|
375 |
|
376 |
###
|
@@ -403,14 +436,14 @@ with app:
|
|
403 |
|
404 |
# User submitted feedback for data redactions
|
405 |
data_callback = CSVLogger_custom(dataset_file_name=log_file_name)
|
406 |
-
data_callback.setup([data_feedback_radio, data_further_details_text,
|
407 |
-
data_submit_feedback_btn.click(lambda *args: data_callback.flag(list(args)), [data_feedback_radio, data_further_details_text,
|
408 |
then(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[data_further_details_text])
|
409 |
|
410 |
# Log processing time/token usage when making a query
|
411 |
usage_callback = CSVLogger_custom(dataset_file_name=log_file_name)
|
412 |
-
usage_callback.setup([session_hash_textbox, doc_file_name_no_extension_textbox,
|
413 |
-
latest_file_completed_text.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, doc_file_name_no_extension_textbox,
|
414 |
then(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
|
415 |
|
416 |
# Get some environment variables and Launch the Gradio app
|
|
|
13 |
from tools.aws_functions import upload_file_to_s3, download_file_from_s3, RUN_AWS_FUNCTIONS, bucket_name
|
14 |
from tools.file_redaction import choose_and_run_redactor
|
15 |
from tools.file_conversion import prepare_image_or_pdf, get_input_file_names
|
16 |
+
from tools.redaction_review import apply_redactions, modify_existing_page_redactions, decrease_page, increase_page, update_annotator, update_zoom, update_entities_df, df_select_callback
|
17 |
from tools.data_anonymise import anonymise_data_files
|
18 |
from tools.auth import authenticate_user
|
19 |
from tools.load_spacy_model_custom_recognisers import custom_entities
|
|
|
34 |
chosen_comprehend_entities.extend(custom_entities)
|
35 |
full_comprehend_entity_list.extend(custom_entities)
|
36 |
|
37 |
+
chosen_redact_entities = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", "CUSTOM"]
|
38 |
|
39 |
full_entity_list = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", 'CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'IBAN_CODE', 'IP_ADDRESS', 'NRP', 'LOCATION', 'MEDICAL_LICENSE', 'URL', 'UK_NHS', 'CUSTOM']
|
40 |
|
|
|
67 |
all_line_level_ocr_results_df_state = gr.State(pd.DataFrame())
|
68 |
all_decision_process_table_state = gr.State(pd.DataFrame())
|
69 |
|
|
|
|
|
70 |
session_hash_state = gr.State()
|
71 |
s3_output_folder_state = gr.State()
|
72 |
|
|
|
99 |
doc_full_file_name_textbox = gr.Textbox(label = "doc_full_file_name_textbox", value="", visible=False)
|
100 |
doc_file_name_no_extension_textbox = gr.Textbox(label = "doc_full_file_name_textbox", value="", visible=False)
|
101 |
doc_file_name_with_extension_textbox = gr.Textbox(label = "doc_file_name_with_extension_textbox", value="", visible=False)
|
102 |
+
doc_file_name_textbox_list = gr.Dropdown(label = "doc_file_name_textbox_list", value="", allow_custom_value=True,visible=False)
|
103 |
+
|
104 |
+
data_full_file_name_textbox = gr.Textbox(label = "data_full_file_name_textbox", value="", visible=False)
|
105 |
+
data_file_name_no_extension_textbox = gr.Textbox(label = "data_full_file_name_textbox", value="", visible=False)
|
106 |
+
data_file_name_with_extension_textbox = gr.Textbox(label = "data_file_name_with_extension_textbox", value="", visible=False)
|
107 |
+
data_file_name_textbox_list = gr.Dropdown(label = "data_file_name_textbox_list", value="", allow_custom_value=True,visible=False)
|
108 |
|
109 |
estimated_time_taken_number = gr.Number(label = "estimated_time_taken_number", value=0.0, precision=1, visible=False) # This keeps track of the time taken to redact files for logging purposes.
|
110 |
annotate_previous_page = gr.Number(value=0, label="Previous page", precision=0, visible=False) # Keeps track of the last page that the annotator was on
|
|
|
127 |
|
128 |
default_deny_list_file_name = "default_deny_list.csv"
|
129 |
default_deny_list_loc = output_folder + "/" + default_deny_list_file_name
|
130 |
+
in_deny_list_state = gr.State([])
|
131 |
in_deny_list_text_in = gr.Textbox(value="Deny list", visible=False)
|
132 |
|
133 |
fully_redacted_list_file_name = "default_fully_redacted_list.csv"
|
134 |
fully_redacted_list_loc = output_folder + "/" + fully_redacted_list_file_name
|
135 |
+
in_fully_redacted_list_state = gr.State([])
|
136 |
in_fully_redacted_text_in = gr.Textbox(value="Fully redacted page list", visible=False)
|
137 |
|
138 |
# S3 settings for default allow list load
|
|
|
140 |
s3_default_allow_list_file = gr.Textbox(label = "Default allow list file", value=default_allow_list_file_name, visible=False)
|
141 |
default_allow_list_output_folder_location = gr.Textbox(label = "Output default allow list location", value=default_allow_list_loc, visible=False)
|
142 |
|
143 |
+
# Base dataframe for recognisers that is not modified subsequent to load
|
144 |
+
recogniser_entity_dataframe_base = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]}), col_count=2, type="pandas", visible=False)
|
145 |
+
|
146 |
###
|
147 |
# UI DESIGN
|
148 |
###
|
|
|
152 |
|
153 |
Redact personally identifiable information (PII) from documents (pdf, images), open text, or tabular data (xlsx/csv/parquet). Documents/images can be redacted using 'Quick' image analysis that works fine for typed text, but not handwriting/signatures. On the Redaction settings tab, choose 'Complex image analysis' OCR using AWS Textract (if you are using AWS) to redact these more complex elements (this service has a cost). Addtionally you can choose the method for PII identification. 'Local' gives quick, lower quality results, AWS Comprehend gives better results but has a cost.
|
154 |
|
155 |
+
Review suggested redactions on the 'Review redactions' tab using a point and click visual interface. Upload a pdf alone to start from scratch, or upload the original pdf alongside a '...redaction_file.csv' to continue a previous redaction/review task.
|
156 |
+
|
157 |
+
See the 'Redaction settings' tab to choose which pages to redact, the type of information to redact (e.g. people, places), or terms to exclude from redaction. Please see the [User Guide](https://github.com/seanpedrick-case/doc_redaction/blob/main/README.md) for a walkthrough on how to use this and all other features in the app. The app accepts a maximum file size of 100mb. Please consider giving feedback for the quality of the answers underneath the redact buttons when the option appears, this will help to improve the app in future.
|
158 |
|
159 |
NOTE: In testing the app seems to find about 60% of personal information on a given (typed) page of text. It is essential that all outputs are checked **by a human** to ensure that all personal information has been removed.""")
|
160 |
|
|
|
191 |
# Object annotation
|
192 |
with gr.Tab("Review redactions", id="tab_object_annotation"):
|
193 |
|
194 |
+
with gr.Accordion(label = "Review redaction file", open=True):
|
195 |
output_review_files = gr.File(label="Review output files", file_count='multiple')
|
196 |
upload_previous_review_file_btn = gr.Button("Review previously created redaction file (upload original PDF and ...review_file.csv)")
|
197 |
|
|
|
208 |
|
209 |
annotation_button_apply = gr.Button("Apply revised redactions", variant="primary")
|
210 |
|
211 |
+
with gr.Row():
|
212 |
+
|
213 |
+
with gr.Column(scale=4):
|
214 |
+
|
215 |
+
zoom_str = str(annotator_zoom_number) + '%'
|
216 |
+
|
217 |
+
annotator = image_annotator(
|
218 |
+
label="Modify redaction boxes",
|
219 |
+
label_list=["Redaction"],
|
220 |
+
label_colors=[(0, 0, 0)],
|
221 |
+
show_label=False,
|
222 |
+
height=zoom_str,
|
223 |
+
width=zoom_str,
|
224 |
+
box_min_size=1,
|
225 |
+
box_selected_thickness=2,
|
226 |
+
handle_size=4,
|
227 |
+
sources=None,#["upload"],
|
228 |
+
show_clear_button=False,
|
229 |
+
show_share_button=False,
|
230 |
+
show_remove_button=False,
|
231 |
+
handles_cursor=True,
|
232 |
+
interactive=False
|
233 |
+
)
|
234 |
+
|
235 |
+
with gr.Column(scale=1):
|
236 |
+
recogniser_entity_dropdown = gr.Dropdown(value="ALL", allow_custom_value=True)
|
237 |
+
recogniser_entity_dataframe = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]}), col_count=2, type="pandas")
|
238 |
+
|
239 |
+
|
240 |
|
241 |
with gr.Row():
|
242 |
annotation_last_page_button_bottom = gr.Button("Previous page", scale = 3)
|
|
|
316 |
###
|
317 |
# PDF/IMAGE REDACTION
|
318 |
###
|
319 |
+
in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list])
|
320 |
|
321 |
document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator]).\
|
322 |
then(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, first_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state], api_name="prepare_doc").\
|
323 |
then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
|
324 |
outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number], api_name="redact_doc").\
|
325 |
+
then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
|
326 |
|
327 |
# If the app has completed a batch of pages, it will run this until the end of all pages in the document
|
328 |
current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
|
329 |
outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number]).\
|
330 |
+
then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
|
331 |
|
332 |
# If a file has been completed, the function will continue onto the next document
|
333 |
+
latest_file_completed_text.change(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
|
334 |
then(fn=reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title])
|
335 |
|
336 |
###
|
337 |
# REVIEW PDF REDACTIONS
|
338 |
###
|
339 |
|
340 |
+
# Upload previous files for modifying redactions
|
341 |
+
upload_previous_review_file_btn.click(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list]).\
|
342 |
+
then(fn = prepare_image_or_pdf, inputs=[output_review_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state, prepare_for_review_bool], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state]).\
|
343 |
+
then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
|
344 |
+
|
345 |
# Page controls at top
|
346 |
annotate_current_page.submit(
|
347 |
modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
|
348 |
+
then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
|
349 |
|
350 |
annotation_last_page_button.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
|
351 |
+
then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
|
352 |
annotation_next_page_button.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
|
353 |
+
then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
|
354 |
|
355 |
# Zoom in and out on annotator
|
356 |
annotate_zoom_in.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
|
|
|
359 |
annotate_zoom_out.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
|
360 |
then(update_zoom, inputs=[annotator_zoom_number, annotate_current_page, zoom_false_bool], outputs=[annotator_zoom_number, annotate_current_page])
|
361 |
|
362 |
+
annotator_zoom_number.change(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
|
363 |
|
364 |
clear_all_redactions_on_page_btn.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, clear_all_page_redactions], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
|
365 |
+
then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
|
366 |
|
367 |
#annotation_button_get.click(get_boxes_json, annotator, json_boxes)
|
368 |
annotation_button_apply.click(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output], scroll_to_output=True)
|
|
|
370 |
# Page controls at bottom
|
371 |
annotate_current_page_bottom.submit(
|
372 |
modify_existing_page_redactions, inputs = [annotator, annotate_current_page_bottom, annotate_previous_page, all_image_annotations_state], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page]).\
|
373 |
+
then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
|
374 |
|
375 |
annotation_last_page_button_bottom.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
|
376 |
+
then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
|
377 |
annotation_next_page_button_bottom.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
|
378 |
+
then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
|
379 |
+
|
380 |
+
# Review side bar controls
|
381 |
+
recogniser_entity_dropdown.select(update_entities_df, inputs=[recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs=[recogniser_entity_dataframe])
|
382 |
+
|
383 |
+
recogniser_entity_dataframe.select(df_select_callback, inputs=[recogniser_entity_dataframe], outputs=annotate_current_page).\
|
384 |
+
then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
|
385 |
+
then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
|
386 |
|
|
|
|
|
|
|
|
|
387 |
|
388 |
###
|
389 |
# TABULAR DATA REDACTION
|
390 |
###
|
391 |
in_data_files.upload(fn=put_columns_in_df, inputs=[in_data_files], outputs=[in_colnames, in_excel_sheets]).\
|
392 |
+
then(fn=get_input_file_names, inputs=[in_data_files], outputs=[data_full_file_name_textbox, data_file_name_no_extension_textbox, data_file_name_with_extension_textbox, data_full_file_name_textbox, data_file_name_textbox_list])
|
393 |
|
394 |
tabular_data_redact_btn.click(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, first_loop_state], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state], api_name="redact_data")
|
395 |
|
|
|
404 |
in_allow_list.change(fn=custom_regex_load, inputs=[in_allow_list], outputs=[in_allow_list_text, in_allow_list_state])
|
405 |
in_deny_list.change(fn=custom_regex_load, inputs=[in_deny_list, in_deny_list_text_in], outputs=[in_deny_list_text, in_deny_list_state])
|
406 |
in_fully_redacted_list.change(fn=custom_regex_load, inputs=[in_fully_redacted_list, in_fully_redacted_text_in], outputs=[in_fully_redacted_list_text, in_fully_redacted_list_state])
|
|
|
407 |
|
408 |
|
409 |
###
|
|
|
436 |
|
437 |
# User submitted feedback for data redactions
|
438 |
data_callback = CSVLogger_custom(dataset_file_name=log_file_name)
|
439 |
+
data_callback.setup([data_feedback_radio, data_further_details_text, data_full_file_name_textbox], feedback_logs_folder)
|
440 |
+
data_submit_feedback_btn.click(lambda *args: data_callback.flag(list(args)), [data_feedback_radio, data_further_details_text, data_full_file_name_textbox], None, preprocess=False).\
|
441 |
then(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[data_further_details_text])
|
442 |
|
443 |
# Log processing time/token usage when making a query
|
444 |
usage_callback = CSVLogger_custom(dataset_file_name=log_file_name)
|
445 |
+
usage_callback.setup([session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, estimated_time_taken_number, textract_metadata_textbox, pii_identification_method_drop, comprehend_query_number], usage_logs_folder)
|
446 |
+
latest_file_completed_text.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, estimated_time_taken_number, textract_metadata_textbox, pii_identification_method_drop, comprehend_query_number], None, preprocess=False).\
|
447 |
then(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
|
448 |
|
449 |
# Get some environment variables and Launch the Gradio app
|
tools/file_conversion.py
CHANGED
@@ -8,6 +8,8 @@ import time
|
|
8 |
import json
|
9 |
import pymupdf
|
10 |
import pandas as pd
|
|
|
|
|
11 |
from tqdm import tqdm
|
12 |
from gradio import Progress
|
13 |
from typing import List, Optional
|
@@ -58,10 +60,10 @@ def process_single_page(pdf_path: str, page_num: int, image_dpi: float, output_d
|
|
58 |
os.makedirs(os.path.dirname(out_path), exist_ok=True)
|
59 |
|
60 |
if os.path.exists(out_path):
|
61 |
-
print(f"Loading existing image for page {page_num + 1}")
|
62 |
image = Image.open(out_path)
|
63 |
else:
|
64 |
-
print(f"Converting page {page_num + 1}")
|
65 |
image_l = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1,
|
66 |
dpi=image_dpi, use_cropbox=True, use_pdftocairo=False)
|
67 |
image = image_l[0]
|
@@ -181,7 +183,7 @@ def process_file(file_path:str, prepare_for_review:bool=False):
|
|
181 |
|
182 |
return img_object
|
183 |
|
184 |
-
def get_input_file_names(file_input):
|
185 |
'''
|
186 |
Get list of input files to report to logs.
|
187 |
'''
|
@@ -210,14 +212,123 @@ def get_input_file_names(file_input):
|
|
210 |
file_extension = os.path.splitext(file_path)[1].lower()
|
211 |
|
212 |
# Check if the file is an image type
|
213 |
-
if file_extension in ['.jpg', '.jpeg', '.png', '.pdf', '.xlsx', '.csv', '.parquet']:
|
214 |
all_relevant_files.append(file_path_without_ext)
|
215 |
file_name_with_extension = file_path_without_ext + file_extension
|
216 |
full_file_name = file_path
|
217 |
|
218 |
all_relevant_files_str = ", ".join(all_relevant_files)
|
219 |
|
220 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
221 |
|
222 |
def prepare_image_or_pdf(
|
223 |
file_paths: List[str],
|
@@ -230,6 +341,7 @@ def prepare_image_or_pdf(
|
|
230 |
current_loop_page_number:int=0,
|
231 |
all_annotations_object:List = [],
|
232 |
prepare_for_review:bool = False,
|
|
|
233 |
progress: Progress = Progress(track_tqdm=True)
|
234 |
) -> tuple[List[str], List[str]]:
|
235 |
"""
|
@@ -241,15 +353,16 @@ def prepare_image_or_pdf(
|
|
241 |
Args:
|
242 |
file_paths (List[str]): List of file paths to process.
|
243 |
in_redact_method (str): The redaction method to use.
|
244 |
-
in_allow_list (Optional[List[List[str]]]): List of allowed terms for redaction.
|
245 |
-
latest_file_completed (int): Index of the last completed file.
|
246 |
-
out_message (List[str]): List to store output messages.
|
247 |
-
first_loop_state (bool): Flag indicating if this is the first iteration.
|
248 |
-
number_of_pages (int): integer indicating the number of pages in the document
|
249 |
-
current_loop_page_number (int): Current number of loop
|
250 |
-
all_annotations_object(List of annotation objects): All annotations for current document
|
251 |
-
prepare_for_review(bool): Is this preparation step preparing pdfs and json files to review current redactions?
|
252 |
-
|
|
|
253 |
|
254 |
|
255 |
Returns:
|
@@ -259,6 +372,9 @@ def prepare_image_or_pdf(
|
|
259 |
tic = time.perf_counter()
|
260 |
json_from_csv = False
|
261 |
|
|
|
|
|
|
|
262 |
# If this is the first time around, set variables to 0/blank
|
263 |
if first_loop_state==True:
|
264 |
print("first_loop_state is True")
|
@@ -329,6 +445,9 @@ def prepare_image_or_pdf(
|
|
329 |
|
330 |
# Loop through files to load in
|
331 |
for file in file_paths_loop:
|
|
|
|
|
|
|
332 |
if isinstance(file, str):
|
333 |
file_path = file
|
334 |
else:
|
@@ -342,15 +461,45 @@ def prepare_image_or_pdf(
|
|
342 |
|
343 |
file_extension = os.path.splitext(file_path)[1].lower()
|
344 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
345 |
|
346 |
# Check if the file is an image type and the user selected text ocr option
|
347 |
-
|
348 |
in_redact_method = tesseract_ocr_option
|
349 |
|
350 |
-
|
351 |
review_file_csv = read_file(file)
|
352 |
all_annotations_object = convert_pandas_df_to_review_json(review_file_csv)
|
353 |
json_from_csv = True
|
|
|
354 |
|
355 |
# If the file name ends with redactions.json, assume it is an annotations object and overwrite the current variable
|
356 |
if (file_extension in ['.json']) | (json_from_csv == True):
|
@@ -376,7 +525,7 @@ def prepare_image_or_pdf(
|
|
376 |
|
377 |
# If you have an annotations object from the above code
|
378 |
if all_annotations_object:
|
379 |
-
#print("out_annotations_object
|
380 |
|
381 |
# Get list of page numbers
|
382 |
image_file_paths_pages = [
|
@@ -388,9 +537,27 @@ def prepare_image_or_pdf(
|
|
388 |
|
389 |
# If PDF pages have been converted to image files, replace the current image paths in the json to this.
|
390 |
if image_file_paths:
|
|
|
|
|
|
|
|
|
|
|
|
|
391 |
|
392 |
-
|
393 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
394 |
#print("Annotation page number:", annotation_page_number)
|
395 |
|
396 |
# Check if the annotation page number exists in the image file paths pages
|
@@ -402,19 +569,30 @@ def prepare_image_or_pdf(
|
|
402 |
else:
|
403 |
print("Page", annotation_page_number, "image file not found.")
|
404 |
|
405 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
406 |
|
407 |
# Write the response to a JSON file in output folder
|
408 |
out_folder = output_folder + file_path_without_ext + ".json"
|
409 |
with open(out_folder, 'w') as json_file:
|
410 |
json.dump(all_annotations_object, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
|
411 |
continue
|
412 |
-
|
413 |
|
414 |
-
# Must be
|
415 |
else:
|
416 |
-
|
417 |
-
# Convert pdf/image file to correct format for redaction
|
418 |
if in_redact_method == tesseract_ocr_option or in_redact_method == textract_option:
|
419 |
if is_pdf_or_image(file_path) == False:
|
420 |
out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
|
@@ -425,25 +603,11 @@ def prepare_image_or_pdf(
|
|
425 |
if is_pdf(file_path) == False:
|
426 |
out_message = "Please upload a PDF file for text analysis."
|
427 |
print(out_message)
|
428 |
-
return out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object
|
429 |
-
|
430 |
-
converted_file_path = file_path # Pikepdf works with the basic unconverted pdf file
|
431 |
-
image_file_path = process_file(file_path, prepare_for_review)
|
432 |
|
433 |
-
converted_file_paths.append(converted_file_path)
|
434 |
-
image_file_paths.extend(image_file_path)
|
435 |
-
|
436 |
-
# If a pdf, load as a pymupdf document
|
437 |
-
if is_pdf(file_path):
|
438 |
-
pymupdf_doc = pymupdf.open(file_path)
|
439 |
|
440 |
-
|
441 |
-
|
442 |
-
pymupdf_doc = pymupdf.open() # Create a new empty document
|
443 |
-
img = Image.open(file_path) # Open the image file
|
444 |
-
rect = pymupdf.Rect(0, 0, img.width, img.height) # Create a rectangle for the image
|
445 |
-
page = pymupdf_doc.new_page(width=img.width, height=img.height) # Add a new page
|
446 |
-
page.insert_image(rect, filename=file_path) # Insert the image into the page
|
447 |
|
448 |
toc = time.perf_counter()
|
449 |
out_time = f"File '{file_path_without_ext}' prepared in {toc - tic:0.1f} seconds."
|
@@ -453,11 +617,12 @@ def prepare_image_or_pdf(
|
|
453 |
out_message.append(out_time)
|
454 |
out_message_out = '\n'.join(out_message)
|
455 |
|
456 |
-
if prepare_for_review == False:
|
457 |
-
|
458 |
-
else:
|
459 |
-
|
460 |
|
|
|
461 |
|
462 |
return out_message_out, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object
|
463 |
|
@@ -498,13 +663,17 @@ def convert_review_json_to_pandas_df(data:List[dict]) -> pd.DataFrame:
|
|
498 |
match = re.search(r'_(\d+)\.png$', image_path)
|
499 |
if match:
|
500 |
number = match.group(1) # Extract the number
|
501 |
-
print(number) # Output: 0
|
502 |
reported_number = int(number) + 1
|
503 |
else:
|
504 |
print("No number found before .png")
|
505 |
|
|
|
|
|
|
|
|
|
506 |
for box in entry["boxes"]:
|
507 |
-
data_to_add = {"image": image_path, "page":reported_number, **box}
|
508 |
#print("data_to_add:", data_to_add)
|
509 |
flattened_data.append(data_to_add)
|
510 |
|
|
|
8 |
import json
|
9 |
import pymupdf
|
10 |
import pandas as pd
|
11 |
+
from pymupdf import Rect
|
12 |
+
from fitz import Page
|
13 |
from tqdm import tqdm
|
14 |
from gradio import Progress
|
15 |
from typing import List, Optional
|
|
|
60 |
os.makedirs(os.path.dirname(out_path), exist_ok=True)
|
61 |
|
62 |
if os.path.exists(out_path):
|
63 |
+
#print(f"Loading existing image for page {page_num + 1}")
|
64 |
image = Image.open(out_path)
|
65 |
else:
|
66 |
+
#print(f"Converting page {page_num + 1}")
|
67 |
image_l = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1,
|
68 |
dpi=image_dpi, use_cropbox=True, use_pdftocairo=False)
|
69 |
image = image_l[0]
|
|
|
183 |
|
184 |
return img_object
|
185 |
|
186 |
+
def get_input_file_names(file_input:List[str]):
|
187 |
'''
|
188 |
Get list of input files to report to logs.
|
189 |
'''
|
|
|
212 |
file_extension = os.path.splitext(file_path)[1].lower()
|
213 |
|
214 |
# Check if the file is a supported input type (image, PDF or tabular data) and is not a review file
|
215 |
+
if (file_extension in ['.jpg', '.jpeg', '.png', '.pdf', '.xlsx', '.csv', '.parquet']) & ("review_file" not in file_path_without_ext):
|
216 |
all_relevant_files.append(file_path_without_ext)
|
217 |
file_name_with_extension = file_path_without_ext + file_extension
|
218 |
full_file_name = file_path
|
219 |
|
220 |
all_relevant_files_str = ", ".join(all_relevant_files)
|
221 |
|
222 |
+
#print("all_relevant_files_str in input_file_names", all_relevant_files_str)
|
223 |
+
#print("all_relevant_files in input_file_names", all_relevant_files)
|
224 |
+
|
225 |
+
return all_relevant_files_str, file_name_with_extension, full_file_name, all_relevant_files
|
226 |
+
|
227 |
+
def convert_color_to_range_0_1(color):
    '''
    Scale every component of a 0-255 colour down to the 0-1 range.

    Args:
        color: Iterable of numeric colour components.

    Returns:
        tuple: Each component divided by 255, in the original order.
    '''
    scaled_components = [channel / 255 for channel in color]
    return tuple(scaled_components)
|
229 |
+
|
230 |
+
def redact_single_box(pymupdf_page:Page, pymupdf_rect:Rect, img_annotation_box:dict, custom_colours:bool=False):
    '''
    Add a redaction annotation and draw a filled box over one rectangle on a page.

    Two shapes are involved: a thin redaction annotation through the vertical
    middle of the rectangle, and a filled box covering the whole rectangle.
    This function only adds the redaction annotation; it does not call
    apply_redactions itself.

    Args:
        pymupdf_page (Page): Page to annotate and draw on.
        pymupdf_rect (Rect): Box to cover, indexed as (x1, y1, x2, y2).
        img_annotation_box (dict): Annotation box; its "color" entry is only
            read when custom_colours is set.
        custom_colours (bool): If True, fill with the box's own colour instead
            of black.

    Returns:
        None. The page is modified in place.
    '''
    pymupdf_x1 = pymupdf_rect[0]
    pymupdf_y1 = pymupdf_rect[1]
    pymupdf_x2 = pymupdf_rect[2]
    pymupdf_y2 = pymupdf_rect[3]

    # Area used to actually remove text from the pdf: 2 units shorter at the
    # top and bottom than the drawn box.
    redact_bottom_y = pymupdf_y1 + 2
    redact_top_y = pymupdf_y2 - 2

    # If the shrunken area is under 1 unit tall (or inverted), fall back to a
    # 2-unit band centred on the middle of the box.
    if (redact_top_y - redact_bottom_y) < 1:
        middle_y = (pymupdf_y1 + pymupdf_y2) / 2
        redact_bottom_y = middle_y - 1
        redact_top_y = middle_y + 1

    rect_small_pixel_height = Rect(pymupdf_x1, redact_bottom_y, pymupdf_x2, redact_top_y)  # Slightly smaller than outside box

    # Add the annotation to the middle of the character line, so that it doesn't delete text from adjacent lines
    pymupdf_page.add_redact_annot(rect_small_pixel_height)

    # Set up drawing a filled box over the whole rect
    shape = pymupdf_page.new_shape()
    shape.draw_rect(pymupdf_rect)

    # Default to black; only use the box's own colour when asked to and when
    # the box actually carries one.
    out_colour = (0, 0, 0)
    if custom_colours:
        box_colour = img_annotation_box.get("color")
        if box_colour:
            # Treat the colour as 0-255 if ANY component exceeds 1. Checking
            # only the first component would leave e.g. (0, 255, 0) unscaled.
            if any(component > 1 for component in box_colour):
                out_colour = convert_color_to_range_0_1(box_colour)
            else:
                out_colour = box_colour

    shape.finish(color=out_colour, fill=out_colour)  # Same colour for outline and fill
    shape.commit()
|
269 |
+
|
270 |
+
def convert_pymupdf_to_image_coords(pymupdf_page, x1, y1, x2, y2, image: "Image.Image | str"):
    '''
    Converts coordinates from pymupdf format to image coordinates,
    accounting for mediabox dimensions.

    Args:
        pymupdf_page: Page object exposing `rect` and `mediabox`, each with
            `width` and `height`.
        x1, y1, x2, y2: Box coordinates in the page's `rect` space.
        image: Either an object exposing `size` as (width, height), or a
            path to an image file, which is opened just to read its size.

    Returns:
        tuple: (x1, y1, x2, y2) scaled into the image's pixel space.
    '''

    rect_height = pymupdf_page.rect.height
    rect_width = pymupdf_page.rect.width

    # Get mediabox dimensions
    mediabox = pymupdf_page.mediabox
    mediabox_width = mediabox.width
    mediabox_height = mediabox.height

    # prepare_image_or_pdf hands over entries of image_file_paths (file paths),
    # which have no `.size`, so accept a path as well as an image object.
    if isinstance(image, (str, os.PathLike)):
        with Image.open(image) as opened_image:
            image_page_width, image_page_height = opened_image.size
    else:
        image_page_width, image_page_height = image.size

    # Calculate scaling factors using mediabox dimensions
    scale_width = image_page_width / mediabox_width
    scale_height = image_page_height / mediabox_height

    # Ratio between the page rect and the mediabox, applied on top of the
    # mediabox-to-image scale.
    rect_to_mediabox_x_scale = mediabox_width / rect_width
    rect_to_mediabox_y_scale = mediabox_height / rect_height

    # Adjust coordinates based on scaling factors
    x1_image = (x1 * scale_width) * rect_to_mediabox_x_scale
    y1_image = (y1 * scale_height) * rect_to_mediabox_y_scale
    x2_image = (x2 * scale_width) * rect_to_mediabox_x_scale
    y2_image = (y2 * scale_height) * rect_to_mediabox_y_scale

    return x1_image, y1_image, x2_image, y2_image
|
306 |
+
|
307 |
+
|
308 |
+
def redact_whole_pymupdf_page(rect_height, rect_width, image, page, custom_colours, border = 5):
    '''
    Redact an entire page, leaving a small unredacted border around the edge.

    Args:
        rect_height: Height of the page rect.
        rect_width: Width of the page rect.
        image: Page image, forwarded to convert_pymupdf_to_image_coords to
            work out the box position in image coordinates.
        page: pymupdf page to redact.
        custom_colours: Forwarded to redact_single_box. The box built here is
            always black, so the result is black either way.
        border: Margin left unredacted on every side of the page. Defaults
            to 5.

    Returns:
        dict: Annotation box for the whole-page redaction in image
        coordinates, with keys "xmin", "ymin", "xmax", "ymax", "color" and
        "label".
    '''
    # Inset the box by `border` on every side. The parameter is honoured
    # rather than being overwritten with a hard-coded 5.
    whole_page_x1, whole_page_y1 = border, border
    whole_page_x2, whole_page_y2 = rect_width - border, rect_height - border

    whole_page_image_x1, whole_page_image_y1, whole_page_image_x2, whole_page_image_y2 = convert_pymupdf_to_image_coords(page, whole_page_x1, whole_page_y1, whole_page_x2, whole_page_y2, image)

    # Rect in page coordinates, used for the redaction itself
    whole_page_rect = Rect(whole_page_x1, whole_page_y1, whole_page_x2, whole_page_y2)

    # Matching annotation box in image coordinates
    whole_page_img_annotation_box = {
        "xmin": whole_page_image_x1,
        "ymin": whole_page_image_y1,
        "xmax": whole_page_image_x2,
        "ymax": whole_page_image_y2,
        "color": (0,0,0),
        "label": "Whole page",
    }

    redact_single_box(page, whole_page_rect, whole_page_img_annotation_box, custom_colours)

    return whole_page_img_annotation_box
|
332 |
|
333 |
def prepare_image_or_pdf(
|
334 |
file_paths: List[str],
|
|
|
341 |
current_loop_page_number:int=0,
|
342 |
all_annotations_object:List = [],
|
343 |
prepare_for_review:bool = False,
|
344 |
+
in_fully_redacted_list:List[int]=[],
|
345 |
progress: Progress = Progress(track_tqdm=True)
|
346 |
) -> tuple[List[str], List[str]]:
|
347 |
"""
|
|
|
353 |
Args:
|
354 |
file_paths (List[str]): List of file paths to process.
|
355 |
in_redact_method (str): The redaction method to use.
|
356 |
+
in_allow_list (optional, Optional[List[List[str]]]): List of allowed terms for redaction.
|
357 |
+
latest_file_completed (optional, int): Index of the last completed file.
|
358 |
+
out_message (optional, List[str]): List to store output messages.
|
359 |
+
first_loop_state (optional, bool): Flag indicating if this is the first iteration.
|
360 |
+
number_of_pages (optional, int): integer indicating the number of pages in the document
|
361 |
+
current_loop_page_number (optional, int): Current number of loop
|
362 |
+
all_annotations_object(optional, List of annotation objects): All annotations for current document
|
363 |
+
prepare_for_review(optional, bool): Is this preparation step preparing pdfs and json files to review current redactions?
|
364 |
+
in_fully_redacted_list(optional, List of int): A list of pages to fully redact
|
365 |
+
progress (optional, Progress): Progress tracker for the operation.
|
366 |
|
367 |
|
368 |
Returns:
|
|
|
372 |
tic = time.perf_counter()
|
373 |
json_from_csv = False
|
374 |
|
375 |
+
if isinstance(in_fully_redacted_list, pd.DataFrame):
|
376 |
+
in_fully_redacted_list = in_fully_redacted_list.iloc[:,0].tolist()
|
377 |
+
|
378 |
# If this is the first time around, set variables to 0/blank
|
379 |
if first_loop_state==True:
|
380 |
print("first_loop_state is True")
|
|
|
445 |
|
446 |
# Loop through files to load in
|
447 |
for file in file_paths_loop:
|
448 |
+
converted_file_path = []
|
449 |
+
image_file_path = []
|
450 |
+
|
451 |
if isinstance(file, str):
|
452 |
file_path = file
|
453 |
else:
|
|
|
461 |
|
462 |
file_extension = os.path.splitext(file_path)[1].lower()
|
463 |
|
464 |
+
# If a pdf, load as a pymupdf document
|
465 |
+
if is_pdf(file_path):
|
466 |
+
pymupdf_doc = pymupdf.open(file_path)
|
467 |
+
|
468 |
+
converted_file_path = file_path
|
469 |
+
image_file_paths = process_file(file_path, prepare_for_review)
|
470 |
+
|
471 |
+
# Create base version of the annotation object that doesn't have any annotations in it
|
472 |
+
if not all_annotations_object:
|
473 |
+
all_annotations_object = []
|
474 |
+
|
475 |
+
for image_path in image_file_paths:
|
476 |
+
annotation = {}
|
477 |
+
annotation["image"] = image_path
|
478 |
+
|
479 |
+
all_annotations_object.append(annotation)
|
480 |
+
|
481 |
+
#print("all_annotations_object:", all_annotations_object)
|
482 |
+
|
483 |
+
|
484 |
+
elif is_pdf_or_image(file_path): # Alternatively, if it's an image
|
485 |
+
# Convert image to a pymupdf document
|
486 |
+
pymupdf_doc = pymupdf.open() # Create a new empty document
|
487 |
+
|
488 |
+
img = Image.open(file_path) # Open the image file
|
489 |
+
rect = pymupdf.Rect(0, 0, img.width, img.height) # Create a rectangle for the image
|
490 |
+
page = pymupdf_doc.new_page(width=img.width, height=img.height) # Add a new page
|
491 |
+
page.insert_image(rect, filename=file_path) # Insert the image into the page
|
492 |
+
|
493 |
|
494 |
# Check if the file is an image type and the user selected text ocr option
|
495 |
+
elif file_extension in ['.jpg', '.jpeg', '.png'] and in_redact_method == text_ocr_option:
|
496 |
in_redact_method = tesseract_ocr_option
|
497 |
|
498 |
+
elif file_extension in ['.csv']:
|
499 |
review_file_csv = read_file(file)
|
500 |
all_annotations_object = convert_pandas_df_to_review_json(review_file_csv)
|
501 |
json_from_csv = True
|
502 |
+
print("Converted CSV review file to json")
|
503 |
|
504 |
# If the file name ends with redactions.json, assume it is an annotations object and overwrite the current variable
|
505 |
if (file_extension in ['.json']) | (json_from_csv == True):
|
|
|
525 |
|
526 |
# If you have an annotations object from the above code
|
527 |
if all_annotations_object:
|
528 |
+
#print("out_annotations_object before reloading images:", all_annotations_object)
|
529 |
|
530 |
# Get list of page numbers
|
531 |
image_file_paths_pages = [
|
|
|
537 |
|
538 |
# If PDF pages have been converted to image files, replace the current image paths in the json to this.
|
539 |
if image_file_paths:
|
540 |
+
#print("Image file paths found")
|
541 |
+
|
542 |
+
#print("Image_file_paths:", image_file_paths)
|
543 |
+
|
544 |
+
#for i, annotation in enumerate(all_annotations_object):
|
545 |
+
for i, image_file_path in enumerate(image_file_paths):
|
546 |
|
547 |
+
if i < len(all_annotations_object):
|
548 |
+
annotation = all_annotations_object[i]
|
549 |
+
else:
|
550 |
+
annotation = {}
|
551 |
+
all_annotations_object.append(annotation)
|
552 |
+
|
553 |
+
#print("annotation:", annotation, "for page:", str(i))
|
554 |
+
|
555 |
+
if not annotation:
|
556 |
+
annotation = {"image":"", "boxes": []}
|
557 |
+
annotation_page_number = int(re.search(r'_(\d+)\.png$', image_file_path).group(1))
|
558 |
+
|
559 |
+
else:
|
560 |
+
annotation_page_number = int(re.search(r'_(\d+)\.png$', annotation["image"]).group(1))
|
561 |
#print("Annotation page number:", annotation_page_number)
|
562 |
|
563 |
# Check if the annotation page number exists in the image file paths pages
|
|
|
569 |
else:
|
570 |
print("Page", annotation_page_number, "image file not found.")
|
571 |
|
572 |
+
all_annotations_object[i] = annotation
|
573 |
+
|
574 |
+
#print("all_annotations_object at end of json/csv load part:", all_annotations_object)
|
575 |
+
|
576 |
+
# Get list of pages that are to be fully redacted and redact them
|
577 |
+
if in_fully_redacted_list:
|
578 |
+
print("Redacting whole pages")
|
579 |
+
|
580 |
+
for i, image in enumerate(image_file_paths):
|
581 |
+
page = pymupdf_doc.load_page(i)
|
582 |
+
rect_height = page.rect.height
|
583 |
+
rect_width = page.rect.width
|
584 |
+
whole_page_img_annotation_box = redact_whole_pymupdf_page(rect_height, rect_width, image, page, custom_colours = False, border = 5)
|
585 |
+
|
586 |
+
all_annotations_object.append(whole_page_img_annotation_box)
|
587 |
|
588 |
# Write the response to a JSON file in output folder
|
589 |
out_folder = output_folder + file_path_without_ext + ".json"
|
590 |
with open(out_folder, 'w') as json_file:
|
591 |
json.dump(all_annotations_object, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
|
592 |
continue
|
|
|
593 |
|
594 |
+
# Must be something else, return with error message
|
595 |
else:
|
|
|
|
|
596 |
if in_redact_method == tesseract_ocr_option or in_redact_method == textract_option:
|
597 |
if is_pdf_or_image(file_path) == False:
|
598 |
out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
|
|
|
603 |
if is_pdf(file_path) == False:
|
604 |
out_message = "Please upload a PDF file for text analysis."
|
605 |
print(out_message)
|
606 |
+
return out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object
|
|
|
|
|
|
|
607 |
|
|
|
|
|
|
|
|
|
|
|
|
|
608 |
|
609 |
+
converted_file_paths.append(converted_file_path)
|
610 |
+
image_file_paths.extend(image_file_path)
|
|
|
|
|
|
|
|
|
|
|
611 |
|
612 |
toc = time.perf_counter()
|
613 |
out_time = f"File '{file_path_without_ext}' prepared in {toc - tic:0.1f} seconds."
|
|
|
617 |
out_message.append(out_time)
|
618 |
out_message_out = '\n'.join(out_message)
|
619 |
|
620 |
+
#if prepare_for_review == False:
|
621 |
+
number_of_pages = len(image_file_paths)
|
622 |
+
#else:
|
623 |
+
# number_of_pages = len(all_annotations_object)
|
624 |
|
625 |
+
#print("all_annotations_object at end:", all_annotations_object)
|
626 |
|
627 |
return out_message_out, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object
|
628 |
|
|
|
663 |
match = re.search(r'_(\d+)\.png$', image_path)
|
664 |
if match:
|
665 |
number = match.group(1) # Extract the number
|
666 |
+
#print(number) # Output: 0
|
667 |
reported_number = int(number) + 1
|
668 |
else:
|
669 |
print("No number found before .png")
|
670 |
|
671 |
+
# Check if 'boxes' is in the entry, if not, add an empty list
|
672 |
+
if 'boxes' not in entry:
|
673 |
+
entry['boxes'] = []
|
674 |
+
|
675 |
for box in entry["boxes"]:
|
676 |
+
data_to_add = {"image": image_path, "page": reported_number, **box}
|
677 |
#print("data_to_add:", data_to_add)
|
678 |
flattened_data.append(data_to_add)
|
679 |
|
tools/file_redaction.py
CHANGED
@@ -18,7 +18,7 @@ from pdfminer.layout import LTTextContainer, LTChar, LTTextLine, LTTextLineHoriz
|
|
18 |
from pikepdf import Pdf, Dictionary, Name
|
19 |
import pymupdf
|
20 |
from pymupdf import Rect
|
21 |
-
from fitz import
|
22 |
import gradio as gr
|
23 |
from gradio import Progress
|
24 |
from collections import defaultdict # For efficient grouping
|
@@ -26,7 +26,7 @@ from collections import defaultdict # For efficient grouping
|
|
26 |
from presidio_analyzer import RecognizerResult
|
27 |
from tools.aws_functions import RUN_AWS_FUNCTIONS
|
28 |
from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult
|
29 |
-
from tools.file_conversion import process_file, image_dpi, convert_review_json_to_pandas_df
|
30 |
from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities, custom_recogniser, custom_word_list_recogniser
|
31 |
from tools.helper_functions import get_file_path_end, output_folder, clean_unicode_text, get_or_create_env_var, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
|
32 |
from tools.file_conversion import process_file, is_pdf, is_pdf_or_image
|
@@ -69,8 +69,8 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
69 |
chosen_redact_comprehend_entities:List[str],
|
70 |
in_redact_method:str,
|
71 |
in_allow_list:List[List[str]]=None,
|
72 |
-
|
73 |
-
|
74 |
latest_file_completed:int=0,
|
75 |
out_message:list=[],
|
76 |
out_file_paths:list=[],
|
@@ -102,8 +102,8 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
102 |
- chosen_redact_comprehend_entities (List[str]): A list of entity types to redact from files, chosen from the official list from AWS Comprehend service
|
103 |
- in_redact_method (str): The method to use for redaction.
|
104 |
- in_allow_list (List[List[str]], optional): A list of allowed terms for redaction. Defaults to None.
|
105 |
-
-
|
106 |
-
-
|
107 |
- latest_file_completed (int, optional): The index of the last completed file. Defaults to 0.
|
108 |
- out_message (list, optional): A list to store output messages. Defaults to an empty list.
|
109 |
- out_file_paths (list, optional): A list to store paths to the output files. Defaults to an empty list.
|
@@ -131,6 +131,15 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
131 |
tic = time.perf_counter()
|
132 |
all_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []
|
133 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
134 |
|
135 |
# If this is the first time around, set variables to 0/blank
|
136 |
if first_loop_state==True:
|
@@ -296,7 +305,9 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
296 |
pii_identification_method,
|
297 |
comprehend_query_number,
|
298 |
comprehend_client,
|
299 |
-
textract_client
|
|
|
|
|
300 |
|
301 |
# Save Textract request metadata (if exists)
|
302 |
if new_request_metadata:
|
@@ -330,7 +341,9 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
330 |
pymupdf_doc,
|
331 |
pii_identification_method,
|
332 |
comprehend_query_number,
|
333 |
-
comprehend_client
|
|
|
|
|
334 |
|
335 |
else:
|
336 |
out_message = "No redaction method selected"
|
@@ -378,14 +391,19 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
378 |
json.dump(annotations_all_pages, f)
|
379 |
log_files_output_paths.append(out_annotation_file_path)
|
380 |
|
381 |
-
print("Saving annotations to CSV")
|
382 |
|
383 |
# Convert json to csv and also save this
|
|
|
|
|
384 |
review_df = convert_review_json_to_pandas_df(annotations_all_pages)
|
|
|
385 |
out_review_file_file_path = out_image_file_path + '_review_file.csv'
|
386 |
review_df.to_csv(out_review_file_file_path, index=None)
|
387 |
out_file_paths.append(out_review_file_file_path)
|
388 |
|
|
|
|
|
389 |
except Exception as e:
|
390 |
print("Could not save annotations to json file:", e)
|
391 |
|
@@ -522,42 +540,7 @@ def convert_image_coords_to_pymupdf(pymupdf_page, annot:CustomImageRecognizerRes
|
|
522 |
|
523 |
return x1, new_y1, x2, new_y2
|
524 |
|
525 |
-
def convert_pymupdf_to_image_coords(pymupdf_page, x1, y1, x2, y2, image: Image):
|
526 |
-
'''
|
527 |
-
Converts coordinates from pymupdf format to image coordinates,
|
528 |
-
accounting for mediabox dimensions.
|
529 |
-
'''
|
530 |
-
|
531 |
-
rect_height = pymupdf_page.rect.height
|
532 |
-
rect_width = pymupdf_page.rect.width
|
533 |
-
|
534 |
-
# Get mediabox dimensions
|
535 |
-
mediabox = pymupdf_page.mediabox
|
536 |
-
mediabox_width = mediabox.width
|
537 |
-
mediabox_height = mediabox.height
|
538 |
-
|
539 |
-
image_page_width, image_page_height = image.size
|
540 |
-
|
541 |
-
# Calculate scaling factors using mediabox dimensions
|
542 |
-
scale_width = image_page_width / mediabox_width
|
543 |
-
scale_height = image_page_height / mediabox_height
|
544 |
|
545 |
-
#print("scale_width:", scale_width)
|
546 |
-
#print("scale_height:", scale_height)
|
547 |
-
|
548 |
-
rect_to_mediabox_x_scale = mediabox_width / rect_width
|
549 |
-
rect_to_mediabox_y_scale = mediabox_height / rect_height
|
550 |
-
|
551 |
-
#print("rect_to_mediabox_x_scale:", rect_to_mediabox_x_scale)
|
552 |
-
#print("rect_to_mediabox_y_scale:", rect_to_mediabox_y_scale)
|
553 |
-
|
554 |
-
# Adjust coordinates based on scaling factors
|
555 |
-
x1_image = (x1 * scale_width) * rect_to_mediabox_x_scale
|
556 |
-
y1_image = (y1 * scale_height) * rect_to_mediabox_y_scale
|
557 |
-
x2_image = (x2 * scale_width) * rect_to_mediabox_x_scale
|
558 |
-
y2_image = (y2 * scale_height) * rect_to_mediabox_y_scale
|
559 |
-
|
560 |
-
return x1_image, y1_image, x2_image, y2_image
|
561 |
|
562 |
def convert_gradio_annotation_coords_to_pymupdf(pymupdf_page:Page, annot:dict, image:Image):
|
563 |
'''
|
@@ -594,49 +577,6 @@ def move_page_info(file_path: str) -> str:
|
|
594 |
|
595 |
return new_file_path
|
596 |
|
597 |
-
def convert_color_to_range_0_1(color):
|
598 |
-
return tuple(component / 255 for component in color)
|
599 |
-
|
600 |
-
def redact_single_box(pymupdf_page:Page, pymupdf_rect:Rect, img_annotation_box:dict, custom_colours:bool=False):
|
601 |
-
pymupdf_x1 = pymupdf_rect[0]
|
602 |
-
pymupdf_y1 = pymupdf_rect[1]
|
603 |
-
pymupdf_x2 = pymupdf_rect[2]
|
604 |
-
pymupdf_y2 = pymupdf_rect[3]
|
605 |
-
|
606 |
-
# Calculate area to actually remove text from the pdf (different from black box size)
|
607 |
-
redact_bottom_y = pymupdf_y1 + 2
|
608 |
-
redact_top_y = pymupdf_y2 - 2
|
609 |
-
|
610 |
-
# Calculate the middle y value and set a small height if default values are too close together
|
611 |
-
if (redact_top_y - redact_bottom_y) < 1:
|
612 |
-
middle_y = (pymupdf_y1 + pymupdf_y2) / 2
|
613 |
-
redact_bottom_y = middle_y - 1
|
614 |
-
redact_top_y = middle_y + 1
|
615 |
-
|
616 |
-
#print("Rect:", rect)
|
617 |
-
|
618 |
-
rect_small_pixel_height = Rect(pymupdf_x1, redact_bottom_y, pymupdf_x2, redact_top_y) # Slightly smaller than outside box
|
619 |
-
|
620 |
-
# Add the annotation to the middle of the character line, so that it doesn't delete text from adjacent lines
|
621 |
-
#page.add_redact_annot(rect)#rect_small_pixel_height)
|
622 |
-
pymupdf_page.add_redact_annot(rect_small_pixel_height)
|
623 |
-
|
624 |
-
# Set up drawing a black box over the whole rect
|
625 |
-
shape = pymupdf_page.new_shape()
|
626 |
-
shape.draw_rect(pymupdf_rect)
|
627 |
-
|
628 |
-
if custom_colours == True:
|
629 |
-
if img_annotation_box["color"][0] > 1:
|
630 |
-
out_colour = convert_color_to_range_0_1(img_annotation_box["color"])
|
631 |
-
else:
|
632 |
-
out_colour = img_annotation_box["color"]
|
633 |
-
else:
|
634 |
-
out_colour = (0,0,0)
|
635 |
-
|
636 |
-
shape.finish(color=out_colour, fill=out_colour) # Black fill for the rectangle
|
637 |
-
#shape.finish(color=(0, 0, 0)) # Black fill for the rectangle
|
638 |
-
shape.commit()
|
639 |
-
|
640 |
def redact_page_with_pymupdf(page:Page, annotations_on_page:dict, image:Image.Image=None, custom_colours:bool=False, redact_whole_page:bool=False):
|
641 |
|
642 |
mediabox_height = page.mediabox[3] - page.mediabox[1]
|
@@ -732,28 +672,31 @@ def redact_page_with_pymupdf(page:Page, annotations_on_page:dict, image:Image.Im
|
|
732 |
|
733 |
# If whole page is to be redacted, do that here
|
734 |
if redact_whole_page == True:
|
735 |
-
# Small border to page that remains white
|
736 |
-
border = 5
|
737 |
-
# Define the coordinates for the Rect
|
738 |
-
whole_page_x1, whole_page_y1 = 0 + border, 0 + border # Bottom-left corner
|
739 |
-
whole_page_x2, whole_page_y2 = rect_width - border, rect_height - border # Top-right corner
|
|
|
|
|
740 |
|
741 |
-
|
|
|
742 |
|
743 |
-
#
|
744 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
745 |
|
746 |
-
#
|
747 |
-
whole_page_img_annotation_box = {}
|
748 |
-
whole_page_img_annotation_box["xmin"] = whole_page_image_x1
|
749 |
-
whole_page_img_annotation_box["ymin"] = whole_page_image_y1
|
750 |
-
whole_page_img_annotation_box["xmax"] = whole_page_image_x2
|
751 |
-
whole_page_img_annotation_box["ymax"] = whole_page_image_y2
|
752 |
-
whole_page_img_annotation_box["color"] = (0,0,0)
|
753 |
-
whole_page_img_annotation_box["label"] = "Whole page"
|
754 |
|
755 |
-
|
756 |
|
|
|
757 |
all_image_annotation_boxes.append(whole_page_img_annotation_box)
|
758 |
|
759 |
out_annotation_boxes = {
|
@@ -1058,11 +1001,20 @@ def redact_image_pdf(file_path:str,
|
|
1058 |
comprehend_query_number_new = 0
|
1059 |
|
1060 |
# Update custom word list analyser object with any new words that have been added to the custom deny list
|
1061 |
-
|
|
|
1062 |
nlp_analyser.registry.remove_recognizer("CUSTOM")
|
1063 |
new_custom_recogniser = custom_word_list_recogniser(custom_recogniser_word_list)
|
|
|
1064 |
nlp_analyser.registry.add_recognizer(new_custom_recogniser)
|
1065 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1066 |
|
1067 |
image_analyser = CustomImageAnalyzerEngine(nlp_analyser)
|
1068 |
|
@@ -1315,9 +1267,15 @@ def redact_image_pdf(file_path:str,
|
|
1315 |
|
1316 |
image_annotations = {"image": file_path, "boxes": all_image_annotations_boxes}
|
1317 |
|
1318 |
-
## Apply annotations with pymupdf
|
1319 |
else:
|
1320 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
1321 |
|
1322 |
# Convert decision process to table
|
1323 |
decision_process_table = pd.DataFrame([{
|
@@ -1811,11 +1769,20 @@ def redact_text_pdf(
|
|
1811 |
return pymupdf_doc, all_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number
|
1812 |
|
1813 |
# Update custom word list analyser object with any new words that have been added to the custom deny list
|
1814 |
-
|
|
|
1815 |
nlp_analyser.registry.remove_recognizer("CUSTOM")
|
1816 |
new_custom_recogniser = custom_word_list_recogniser(custom_recogniser_word_list)
|
|
|
1817 |
nlp_analyser.registry.add_recognizer(new_custom_recogniser)
|
1818 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1819 |
tic = time.perf_counter()
|
1820 |
|
1821 |
# Open with Pikepdf to get text lines
|
@@ -1903,6 +1870,7 @@ def redact_text_pdf(
|
|
1903 |
for i, text_line in enumerate(line_level_text_results_list):
|
1904 |
if chosen_redact_entities:
|
1905 |
if pii_identification_method == "Local":
|
|
|
1906 |
# Process immediately for local analysis
|
1907 |
text_line_analyser_result = nlp_analyser.analyze(
|
1908 |
text=text_line.text,
|
@@ -2024,7 +1992,13 @@ def redact_text_pdf(
|
|
2024 |
annotations_on_page = create_annotations_for_bounding_boxes(page_analysed_bounding_boxes)
|
2025 |
|
2026 |
# Make pymupdf page redactions
|
2027 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
2028 |
|
2029 |
#print("Did redact_page_with_pymupdf function")
|
2030 |
reported_page_no = page_no + 1
|
|
|
18 |
from pikepdf import Pdf, Dictionary, Name
|
19 |
import pymupdf
|
20 |
from pymupdf import Rect
|
21 |
+
from fitz import Page
|
22 |
import gradio as gr
|
23 |
from gradio import Progress
|
24 |
from collections import defaultdict # For efficient grouping
|
|
|
26 |
from presidio_analyzer import RecognizerResult
|
27 |
from tools.aws_functions import RUN_AWS_FUNCTIONS
|
28 |
from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult
|
29 |
+
from tools.file_conversion import process_file, image_dpi, convert_review_json_to_pandas_df, redact_whole_pymupdf_page, redact_single_box, convert_pymupdf_to_image_coords
|
30 |
from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities, custom_recogniser, custom_word_list_recogniser
|
31 |
from tools.helper_functions import get_file_path_end, output_folder, clean_unicode_text, get_or_create_env_var, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
|
32 |
from tools.file_conversion import process_file, is_pdf, is_pdf_or_image
|
|
|
69 |
chosen_redact_comprehend_entities:List[str],
|
70 |
in_redact_method:str,
|
71 |
in_allow_list:List[List[str]]=None,
|
72 |
+
custom_recogniser_word_list:List[str]=None,
|
73 |
+
redact_whole_page_list:List[str]=None,
|
74 |
latest_file_completed:int=0,
|
75 |
out_message:list=[],
|
76 |
out_file_paths:list=[],
|
|
|
102 |
- chosen_redact_comprehend_entities (List[str]): A list of entity types to redact from files, chosen from the official list from AWS Comprehend service
|
103 |
- in_redact_method (str): The method to use for redaction.
|
104 |
- in_allow_list (List[List[str]], optional): A list of allowed terms for redaction. Defaults to None.
|
105 |
+
- custom_recogniser_word_list (List[str], optional): A list of custom terms (deny list) that should always be redacted. Defaults to None.
|
106 |
+
- redact_whole_page_list (List[str], optional): A list of pages that should be redacted in their entirety. Defaults to None.
|
107 |
- latest_file_completed (int, optional): The index of the last completed file. Defaults to 0.
|
108 |
- out_message (list, optional): A list to store output messages. Defaults to an empty list.
|
109 |
- out_file_paths (list, optional): A list to store paths to the output files. Defaults to an empty list.
|
|
|
131 |
tic = time.perf_counter()
|
132 |
all_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []
|
133 |
|
134 |
+
if isinstance(custom_recogniser_word_list, pd.DataFrame):
|
135 |
+
custom_recogniser_word_list = custom_recogniser_word_list.iloc[:,0].tolist()
|
136 |
+
|
137 |
+
# Sort the strings in order from the longest string to the shortest
|
138 |
+
custom_recogniser_word_list = sorted(custom_recogniser_word_list, key=len, reverse=True)
|
139 |
+
|
140 |
+
if isinstance(redact_whole_page_list, pd.DataFrame):
|
141 |
+
redact_whole_page_list = redact_whole_page_list.iloc[:,0].tolist()
|
142 |
+
|
143 |
|
144 |
# If this is the first time around, set variables to 0/blank
|
145 |
if first_loop_state==True:
|
|
|
305 |
pii_identification_method,
|
306 |
comprehend_query_number,
|
307 |
comprehend_client,
|
308 |
+
textract_client,
|
309 |
+
custom_recogniser_word_list,
|
310 |
+
redact_whole_page_list)
|
311 |
|
312 |
# Save Textract request metadata (if exists)
|
313 |
if new_request_metadata:
|
|
|
341 |
pymupdf_doc,
|
342 |
pii_identification_method,
|
343 |
comprehend_query_number,
|
344 |
+
comprehend_client,
|
345 |
+
custom_recogniser_word_list,
|
346 |
+
redact_whole_page_list)
|
347 |
|
348 |
else:
|
349 |
out_message = "No redaction method selected"
|
|
|
391 |
json.dump(annotations_all_pages, f)
|
392 |
log_files_output_paths.append(out_annotation_file_path)
|
393 |
|
394 |
+
#print("Saving annotations to CSV")
|
395 |
|
396 |
# Convert json to csv and also save this
|
397 |
+
#print("annotations_all_pages:", annotations_all_pages)
|
398 |
+
|
399 |
review_df = convert_review_json_to_pandas_df(annotations_all_pages)
|
400 |
+
|
401 |
out_review_file_file_path = out_image_file_path + '_review_file.csv'
|
402 |
review_df.to_csv(out_review_file_file_path, index=None)
|
403 |
out_file_paths.append(out_review_file_file_path)
|
404 |
|
405 |
+
print("Saved review file to csv")
|
406 |
+
|
407 |
except Exception as e:
|
408 |
print("Could not save annotations to json file:", e)
|
409 |
|
|
|
540 |
|
541 |
return x1, new_y1, x2, new_y2
|
542 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
543 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
544 |
|
545 |
def convert_gradio_annotation_coords_to_pymupdf(pymupdf_page:Page, annot:dict, image:Image):
|
546 |
'''
|
|
|
577 |
|
578 |
return new_file_path
|
579 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
580 |
def redact_page_with_pymupdf(page:Page, annotations_on_page:dict, image:Image.Image=None, custom_colours:bool=False, redact_whole_page:bool=False):
|
581 |
|
582 |
mediabox_height = page.mediabox[3] - page.mediabox[1]
|
|
|
672 |
|
673 |
# If whole page is to be redacted, do that here
|
674 |
if redact_whole_page == True:
|
675 |
+
# # Small border to page that remains white
|
676 |
+
# border = 5
|
677 |
+
# # Define the coordinates for the Rect
|
678 |
+
# whole_page_x1, whole_page_y1 = 0 + border, 0 + border # Bottom-left corner
|
679 |
+
# whole_page_x2, whole_page_y2 = rect_width - border, rect_height - border # Top-right corner
|
680 |
+
|
681 |
+
# whole_page_image_x1, whole_page_image_y1, whole_page_image_x2, whole_page_image_y2 = convert_pymupdf_to_image_coords(page, whole_page_x1, whole_page_y1, whole_page_x2, whole_page_y2, image)
|
682 |
|
683 |
+
# # Create new image annotation element based on whole page coordinates
|
684 |
+
# whole_page_rect = Rect(whole_page_x1, whole_page_y1, whole_page_x2, whole_page_y2)
|
685 |
|
686 |
+
# # Write whole page annotation to annotation boxes
|
687 |
+
# whole_page_img_annotation_box = {}
|
688 |
+
# whole_page_img_annotation_box["xmin"] = whole_page_image_x1
|
689 |
+
# whole_page_img_annotation_box["ymin"] = whole_page_image_y1
|
690 |
+
# whole_page_img_annotation_box["xmax"] = whole_page_image_x2
|
691 |
+
# whole_page_img_annotation_box["ymax"] = whole_page_image_y2
|
692 |
+
# whole_page_img_annotation_box["color"] = (0,0,0)
|
693 |
+
# whole_page_img_annotation_box["label"] = "Whole page"
|
694 |
|
695 |
+
# redact_single_box(page, whole_page_rect, whole_page_img_annotation_box, custom_colours)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
696 |
|
697 |
+
# all_image_annotation_boxes.append(whole_page_img_annotation_box)
|
698 |
|
699 |
+
whole_page_img_annotation_box = redact_whole_pymupdf_page(rect_height, rect_width, image, page, custom_colours, border = 5)
|
700 |
all_image_annotation_boxes.append(whole_page_img_annotation_box)
|
701 |
|
702 |
out_annotation_boxes = {
|
|
|
1001 |
comprehend_query_number_new = 0
|
1002 |
|
1003 |
# Update custom word list analyser object with any new words that have been added to the custom deny list
|
1004 |
+
#print("custom_recogniser_word_list:", custom_recogniser_word_list)
|
1005 |
+
if custom_recogniser_word_list:
|
1006 |
nlp_analyser.registry.remove_recognizer("CUSTOM")
|
1007 |
new_custom_recogniser = custom_word_list_recogniser(custom_recogniser_word_list)
|
1008 |
+
#print("new_custom_recogniser:", new_custom_recogniser)
|
1009 |
nlp_analyser.registry.add_recognizer(new_custom_recogniser)
|
1010 |
|
1011 |
+
# List all elements currently in the nlp_analyser registry
|
1012 |
+
#print("Current recognizers in nlp_analyser registry:")
|
1013 |
+
for recognizer_name in nlp_analyser.registry.recognizers:
|
1014 |
+
print(recognizer_name)
|
1015 |
+
|
1016 |
+
|
1017 |
+
|
1018 |
|
1019 |
image_analyser = CustomImageAnalyzerEngine(nlp_analyser)
|
1020 |
|
|
|
1267 |
|
1268 |
image_annotations = {"image": file_path, "boxes": all_image_annotations_boxes}
|
1269 |
|
1270 |
+
## Apply annotations with pymupdf
|
1271 |
else:
|
1272 |
+
#print("redact_whole_page_list:", redact_whole_page_list)
|
1273 |
+
if redact_whole_page_list:
|
1274 |
+
if current_loop_page in redact_whole_page_list: redact_whole_page = True
|
1275 |
+
else: redact_whole_page = False
|
1276 |
+
else: redact_whole_page = False
|
1277 |
+
|
1278 |
+
pymupdf_page, image_annotations = redact_page_with_pymupdf(pymupdf_page, merged_redaction_bboxes, image, redact_whole_page=redact_whole_page)
|
1279 |
|
1280 |
# Convert decision process to table
|
1281 |
decision_process_table = pd.DataFrame([{
|
|
|
1769 |
return pymupdf_doc, all_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number
|
1770 |
|
1771 |
# Update custom word list analyser object with any new words that have been added to the custom deny list
|
1772 |
+
#print("custom_recogniser_word_list:", custom_recogniser_word_list)
|
1773 |
+
if custom_recogniser_word_list:
|
1774 |
nlp_analyser.registry.remove_recognizer("CUSTOM")
|
1775 |
new_custom_recogniser = custom_word_list_recogniser(custom_recogniser_word_list)
|
1776 |
+
#print("new_custom_recogniser:", new_custom_recogniser)
|
1777 |
nlp_analyser.registry.add_recognizer(new_custom_recogniser)
|
1778 |
|
1779 |
+
# List all elements currently in the nlp_analyser registry
|
1780 |
+
#print("Current recognizers in nlp_analyser registry:")
|
1781 |
+
#for recognizer_name in nlp_analyser.registry.recognizers:
|
1782 |
+
# print(recognizer_name)
|
1783 |
+
|
1784 |
+
#print("Custom recogniser:", nlp_analyser.registry.)
|
1785 |
+
|
1786 |
tic = time.perf_counter()
|
1787 |
|
1788 |
# Open with Pikepdf to get text lines
|
|
|
1870 |
for i, text_line in enumerate(line_level_text_results_list):
|
1871 |
if chosen_redact_entities:
|
1872 |
if pii_identification_method == "Local":
|
1873 |
+
|
1874 |
# Process immediately for local analysis
|
1875 |
text_line_analyser_result = nlp_analyser.analyze(
|
1876 |
text=text_line.text,
|
|
|
1992 |
annotations_on_page = create_annotations_for_bounding_boxes(page_analysed_bounding_boxes)
|
1993 |
|
1994 |
# Make pymupdf page redactions
|
1995 |
+
#print("redact_whole_page_list:", redact_whole_page_list)
|
1996 |
+
if redact_whole_page_list:
|
1997 |
+
if current_loop_page in redact_whole_page_list: redact_whole_page = True
|
1998 |
+
else: redact_whole_page = False
|
1999 |
+
else: redact_whole_page = False
|
2000 |
+
|
2001 |
+
pymupdf_page, image_annotations = redact_page_with_pymupdf(pymupdf_page, annotations_on_page, image, redact_whole_page=redact_whole_page)
|
2002 |
|
2003 |
#print("Did redact_page_with_pymupdf function")
|
2004 |
reported_page_no = page_no + 1
|
tools/helper_functions.py
CHANGED
@@ -112,8 +112,6 @@ def custom_regex_load(in_file:List[str], file_type:str = "Allow list"):
|
|
112 |
custom_regex = pd.DataFrame()
|
113 |
|
114 |
if in_file:
|
115 |
-
print("File type:", file_type)
|
116 |
-
|
117 |
file_list = [string.name for string in in_file]
|
118 |
|
119 |
regex_file_names = [string for string in file_list if "csv" in string.lower()]
|
|
|
112 |
custom_regex = pd.DataFrame()
|
113 |
|
114 |
if in_file:
|
|
|
|
|
115 |
file_list = [string.name for string in in_file]
|
116 |
|
117 |
regex_file_names = [string for string in file_list if "csv" in string.lower()]
|
tools/load_spacy_model_custom_recognisers.py
CHANGED
@@ -28,8 +28,10 @@ except:
|
|
28 |
def custom_word_list_recogniser(custom_list:List[str]=[]):
|
29 |
custom_regex = '\\b' + '\\b|\\b'.join(rf"{re.escape(term)}" for term in custom_list) + '\\b'
|
30 |
custom_pattern = Pattern(name="custom_pattern",regex=custom_regex, score = 1)
|
|
|
|
|
31 |
custom_recogniser = PatternRecognizer(supported_entity="CUSTOM", name="CUSTOM", patterns = [custom_pattern],
|
32 |
-
global_regex_flags=re.DOTALL | re.MULTILINE)
|
33 |
|
34 |
return custom_recogniser
|
35 |
|
|
|
28 |
def custom_word_list_recogniser(custom_list:List[str]=[]):
    '''
    Build the Presidio pattern recogniser for the "CUSTOM" entity from a list of terms.

    Each term is escaped and matched literally as a whole word/phrase, ignoring case.

    Args:
        custom_list (List[str], optional): Terms to detect. Blank and non-string
            entries are ignored. The list passed in is never mutated.

    Returns:
        PatternRecognizer: Recogniser named "CUSTOM" holding one pattern (score 1)
            that covers every usable term.
    '''
    # Skip blank / non-string entries (e.g. NaN from an empty spreadsheet cell): they would
    # either break re.escape or add an alternative that can only match the empty string
    terms = [term for term in (custom_list or []) if isinstance(term, str) and term.strip()]

    # Longest first: regex alternation takes the first alternative that matches, so where
    # one term is a prefix of another the longer term must be tried first
    terms.sort(key=len, reverse=True)

    if terms:
        # (?<!\w) / (?!\w) are equivalent to \b for terms that start and end with a word
        # character, but also work for terms that start or end with punctuation (e.g. "C++"),
        # where \b would require an adjacent word character and so fail to match
        custom_regex = '|'.join(rf"(?<!\w){re.escape(term)}(?!\w)" for term in terms)
    else:
        # No usable terms: use a pattern that can never match
        custom_regex = r"(?!)"

    custom_pattern = Pattern(name="custom_pattern", regex=custom_regex, score = 1)

    custom_recogniser = PatternRecognizer(supported_entity="CUSTOM", name="CUSTOM", patterns = [custom_pattern],
    global_regex_flags=re.DOTALL | re.MULTILINE | re.IGNORECASE)

    return custom_recogniser
|
37 |
|
tools/redaction_review.py
CHANGED
@@ -49,30 +49,66 @@ def update_zoom(current_zoom_level:int, annotate_current_page:int, decrease:bool
|
|
49 |
|
50 |
return current_zoom_level, annotate_current_page
|
51 |
|
52 |
-
def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, zoom:int=
|
53 |
'''
|
54 |
Update a gradio_image_annotation object with new annotation data
|
55 |
'''
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
56 |
|
57 |
zoom_str = str(zoom) + '%'
|
58 |
|
59 |
if not image_annotator_object:
|
|
|
|
|
60 |
out_image_annotator = image_annotator(
|
61 |
-
|
|
|
|
|
62 |
#label_list=["Redaction"],
|
63 |
#label_colors=[(0, 0, 0)],
|
|
|
64 |
height=zoom_str,
|
65 |
width=zoom_str,
|
66 |
-
|
67 |
-
|
|
|
|
|
68 |
show_clear_button=False,
|
69 |
show_share_button=False,
|
70 |
show_remove_button=False,
|
71 |
-
|
72 |
-
|
73 |
-
|
|
|
74 |
|
75 |
-
return out_image_annotator, number_reported, number_reported, page_num_reported
|
76 |
|
77 |
#print("page_num at start of update_annotator function:", page_num)
|
78 |
|
@@ -95,6 +131,28 @@ def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, zo
|
|
95 |
page_num_reported = page_max_reported
|
96 |
|
97 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
98 |
out_image_annotator = image_annotator(
|
99 |
value = image_annotator_object[page_num_reported - 1],
|
100 |
boxes_alpha=0.1,
|
@@ -117,7 +175,7 @@ def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, zo
|
|
117 |
|
118 |
number_reported = gr.Number(label = "Page (press enter to change)", value=page_num_reported, precision=0)
|
119 |
|
120 |
-
return out_image_annotator, number_reported, number_reported, page_num_reported
|
121 |
|
122 |
def modify_existing_page_redactions(image_annotated:AnnotatedImageData, current_page:int, previous_page:int, all_image_annotations:List[AnnotatedImageData], clear_all:bool=False):
|
123 |
'''
|
@@ -149,6 +207,8 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], d
|
|
149 |
output_files = []
|
150 |
output_log_files = []
|
151 |
|
|
|
|
|
152 |
image_annotated['image'] = all_image_annotations[current_page - 1]["image"]
|
153 |
|
154 |
all_image_annotations[current_page - 1] = image_annotated
|
@@ -252,32 +312,19 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], d
|
|
252 |
|
253 |
return doc, all_image_annotations, output_files, output_log_files
|
254 |
|
255 |
-
def crop(annotations:AnnotatedImageData):
|
256 |
-
if annotations["boxes"]:
|
257 |
-
box = annotations["boxes"][0]
|
258 |
-
return annotations["image"][
|
259 |
-
box["ymin"]:box["ymax"],
|
260 |
-
box["xmin"]:box["xmax"]
|
261 |
-
]
|
262 |
-
return None
|
263 |
-
|
264 |
def get_boxes_json(annotations:AnnotatedImageData):
|
265 |
return annotations["boxes"]
|
266 |
-
# Group the DataFrame by the 'image' column
|
267 |
-
grouped = df.groupby('image')
|
268 |
|
269 |
-
|
270 |
-
|
271 |
-
|
272 |
-
|
273 |
-
|
274 |
-
|
275 |
-
|
276 |
-
|
277 |
-
#
|
278 |
-
|
279 |
-
|
280 |
-
|
281 |
-
})
|
282 |
|
283 |
-
return json_data
|
|
|
49 |
|
50 |
return current_zoom_level, annotate_current_page
|
51 |
|
52 |
+
def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, recogniser_entities_drop=gr.Dropdown(value="ALL", allow_custom_value=True), recogniser_dataframe_gr=gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]})), zoom:int=80):
|
53 |
'''
|
54 |
Update a gradio_image_annotation object with new annotation data
|
55 |
'''
|
56 |
+
recogniser_entities = []
|
57 |
+
recogniser_dataframe = pd.DataFrame()
|
58 |
+
#recogniser_entities_drop = gr.Dropdown(value="ALL", allow_custom_value=True)
|
59 |
+
#recogniser_dataframe_gr = gr.Dataframe(pd.DataFrame(data={"page":[""], "label":[""]}))
|
60 |
+
|
61 |
+
#print("recogniser_dataframe_gr", recogniser_dataframe_gr)
|
62 |
+
#print("recogniser_dataframe_gr shape", recogniser_dataframe_gr.shape)
|
63 |
+
#print("recogniser_dataframe_gr.iloc[0,0]:", recogniser_dataframe_gr.iloc[0,0])
|
64 |
+
|
65 |
+
if recogniser_dataframe_gr.iloc[0,0] == "":
|
66 |
+
try:
|
67 |
+
review_dataframe = convert_review_json_to_pandas_df(image_annotator_object)[["page", "label"]]
|
68 |
+
#print("review_dataframe['label']", review_dataframe["label"])
|
69 |
+
recogniser_entities = review_dataframe["label"].unique().tolist()
|
70 |
+
recogniser_entities.append("ALL")
|
71 |
+
|
72 |
+
#print("recogniser_entities:", recogniser_entities)
|
73 |
+
|
74 |
+
recogniser_dataframe_out = gr.Dataframe(review_dataframe)
|
75 |
+
recogniser_dataframe_gr = gr.Dataframe(review_dataframe)
|
76 |
+
recogniser_entities_drop = gr.Dropdown(value=recogniser_entities[0], choices=recogniser_entities, allow_custom_value=True, interactive=True)
|
77 |
+
except Exception as e:
|
78 |
+
print("Could not extract recogniser information:", e)
|
79 |
+
|
80 |
+
else:
|
81 |
+
review_dataframe = update_entities_df(recogniser_entities_drop, recogniser_dataframe_gr)
|
82 |
+
recogniser_dataframe_out = gr.Dataframe(review_dataframe)
|
83 |
+
|
84 |
|
85 |
zoom_str = str(zoom) + '%'
|
86 |
|
87 |
if not image_annotator_object:
|
88 |
+
page_num_reported = 1
|
89 |
+
|
90 |
out_image_annotator = image_annotator(
|
91 |
+
image_annotator_object[page_num_reported - 1],
|
92 |
+
boxes_alpha=0.1,
|
93 |
+
box_thickness=1,
|
94 |
#label_list=["Redaction"],
|
95 |
#label_colors=[(0, 0, 0)],
|
96 |
+
show_label=False,
|
97 |
height=zoom_str,
|
98 |
width=zoom_str,
|
99 |
+
box_min_size=1,
|
100 |
+
box_selected_thickness=2,
|
101 |
+
handle_size=4,
|
102 |
+
sources=None,#["upload"],
|
103 |
show_clear_button=False,
|
104 |
show_share_button=False,
|
105 |
show_remove_button=False,
|
106 |
+
handles_cursor=True,
|
107 |
+
interactive=True
|
108 |
+
)
|
109 |
+
number_reported = gr.Number(label = "Page (press enter to change)", value=page_num_reported, precision=0)
|
110 |
|
111 |
+
return out_image_annotator, number_reported, number_reported, page_num_reported, recogniser_entities_drop, recogniser_dataframe_out, recogniser_dataframe_gr
|
112 |
|
113 |
#print("page_num at start of update_annotator function:", page_num)
|
114 |
|
|
|
131 |
page_num_reported = page_max_reported
|
132 |
|
133 |
|
134 |
+
|
135 |
+
# Remove duplicate elements that are blank
|
136 |
+
def remove_duplicate_images_with_blank_boxes(data: List["AnnotatedImageData"]) -> List["AnnotatedImageData"]:
    '''
    Drop repeated entries for the same image when the repeat carries no boxes.

    The first entry seen for each image is always kept, whether or not it has
    boxes. A later entry for an already-seen image is kept only if it has at
    least one box. Input order is preserved and the input list is not modified.

    Args:
        data: Annotation entries, each a dict with an 'image' key and
            (optionally) a 'boxes' list.

    Returns:
        A new list containing the entries that were kept.
    '''
    seen_images = set()
    filtered_data = []

    for item in data:
        if item['image'] not in seen_images:
            # First time this image appears: keep it even if it has no boxes
            filtered_data.append(item)
            seen_images.add(item['image'])
        # Repeat of an image already kept: only worth keeping if it carries boxes.
        # .get() so that an entry with no 'boxes' key is treated as blank rather than raising
        elif item.get('boxes'):
            filtered_data.append(item)
        # NOTE(review): if the first entry for an image is blank and a later one has
        # boxes, both are kept - confirm whether the blank one should be dropped instead

    return filtered_data
|
150 |
+
|
151 |
+
image_annotator_object = remove_duplicate_images_with_blank_boxes(image_annotator_object)
|
152 |
+
|
153 |
+
#print("image_annotator_object in update_annotator:", image_annotator_object)
|
154 |
+
#print("image_annotator_object[page_num_reported - 1]:", image_annotator_object[page_num_reported - 1])
|
155 |
+
|
156 |
out_image_annotator = image_annotator(
|
157 |
value = image_annotator_object[page_num_reported - 1],
|
158 |
boxes_alpha=0.1,
|
|
|
175 |
|
176 |
number_reported = gr.Number(label = "Page (press enter to change)", value=page_num_reported, precision=0)
|
177 |
|
178 |
+
return out_image_annotator, number_reported, number_reported, page_num_reported, recogniser_entities_drop, recogniser_dataframe_out, recogniser_dataframe_gr
|
179 |
|
180 |
def modify_existing_page_redactions(image_annotated:AnnotatedImageData, current_page:int, previous_page:int, all_image_annotations:List[AnnotatedImageData], clear_all:bool=False):
|
181 |
'''
|
|
|
207 |
output_files = []
|
208 |
output_log_files = []
|
209 |
|
210 |
+
#print("File paths in apply_redactions:", file_paths)
|
211 |
+
|
212 |
image_annotated['image'] = all_image_annotations[current_page - 1]["image"]
|
213 |
|
214 |
all_image_annotations[current_page - 1] = image_annotated
|
|
|
312 |
|
313 |
return doc, all_image_annotations, output_files, output_log_files
|
314 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
315 |
def get_boxes_json(annotations:AnnotatedImageData):
|
316 |
return annotations["boxes"]
|
|
|
|
|
317 |
|
318 |
+
def update_entities_df(choice:str, df:pd.DataFrame):
    '''
    Filter the review table down to the rows whose label matches the chosen entity.

    Args:
        choice (str): Entity label to keep, or "ALL" to keep every row.
        df (pd.DataFrame): Review table; filtering uses its "label" column.

    Returns:
        pd.DataFrame: The table itself when choice is "ALL" or there is no
            "label" column to filter on, otherwise only the matching rows.
    '''
    # "ALL" is the dropdown entry meaning "no filter"
    if choice == "ALL":
        return df

    # Nothing to filter on (e.g. an empty placeholder table): return it as-is rather than raising KeyError
    if df is None or "label" not in df.columns:
        return df

    return df.loc[df["label"] == choice, :]
|
323 |
+
|
324 |
+
def df_select_callback(df: pd.DataFrame, evt: gr.SelectData):
    '''
    Return the page number of the row that was selected in the entity table.

    Args:
        df (pd.DataFrame): The table the selection was made in (not read here).
        evt (gr.SelectData): Selection event; its row_value holds the selected row's cells.

    Returns:
        The first cell of the selected row, which is the page number.
    '''
    selected_row = evt.row_value

    # The first column of the table is the page number
    return selected_row[0]
|
|
|
330 |
|
|