Commit a03496e
Parent: 59ff822

Side review bar is mostly there. A couple of bugs fixed. Can now return identified text in initial review files. Still working on retaining found text throughout the review process.

Files changed:
- app.py (+28 / -17)
- tools/file_conversion.py (+34 / -9)
- tools/file_redaction.py (+134 / -274)
- tools/redaction_review.py (+28 / -6)
app.py
CHANGED

@@ -8,6 +8,7 @@ import gradio as gr
 import pandas as pd
 from datetime import datetime
 from gradio_image_annotation import image_annotator
+from gradio_image_annotation.image_annotator import AnnotatedImageData

 from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
 from tools.aws_functions import upload_file_to_s3, download_file_from_s3, RUN_AWS_FUNCTIONS, bucket_name

@@ -80,6 +81,8 @@ with app:
 output_file_list_state = gr.State([])
 text_output_file_list_state = gr.State([])
 log_files_output_list_state = gr.State([])
+
+review_file_state = gr.State(pd.DataFrame())

 # Logging state
 log_file_name = 'log.csv'

@@ -113,7 +116,7 @@ with app:
 ## Annotator zoom value
-annotator_zoom_number = gr.Number(label = "Current annotator zoom level", value=[…]
+annotator_zoom_number = gr.Number(label = "Current annotator zoom level", value=80, precision=0, visible=False)
 zoom_true_bool = gr.State(True)
 zoom_false_bool = gr.State(False)

@@ -204,9 +207,9 @@ with app:
 annotate_zoom_in = gr.Button("Zoom in")
 annotate_zoom_out = gr.Button("Zoom out")
 with gr.Row():
-clear_all_redactions_on_page_btn = gr.Button("Clear all redactions on page")
-annotation_button_apply = gr.Button("Apply revised redactions", variant="primary")
+clear_all_redactions_on_page_btn = gr.Button("Clear all redactions on page", visible=False)
+annotation_button_apply = gr.Button("Apply revised redactions", variant="primary")

 with gr.Row():

@@ -233,10 +236,8 @@ with app:
 )

 with gr.Column(scale=1):
-recogniser_entity_dropdown = gr.Dropdown(value="ALL", allow_custom_value=True)
-recogniser_entity_dataframe = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]}), col_count=2, type="pandas")
+recogniser_entity_dropdown = gr.Dropdown(label="Redaction category", value="ALL", allow_custom_value=True)
+recogniser_entity_dataframe = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]}), col_count=2, type="pandas", label="Search results. Click to go to page")

 with gr.Row():
 annotation_last_page_button_bottom = gr.Button("Previous page", scale = 3)

@@ -245,6 +246,9 @@ with app:
 annotation_next_page_button_bottom = gr.Button("Next page", scale = 3)

 # TEXT / TABULAR DATA TAB
 with gr.Tab(label="Open text or Excel/csv files"):
 gr.Markdown(

@@ -304,7 +308,7 @@ with app:
 in_redact_entities = gr.Dropdown(value=chosen_redact_entities, choices=full_entity_list, multiselect=True, label="Entities to redact - local PII identification model (click close to down arrow for full list)")

-handwrite_signature_checkbox = gr.CheckboxGroup(label="AWS Textract settings", choices=["Redact all identified handwriting", "Redact all identified signatures"], value=["Redact all identified handwriting"])
+handwrite_signature_checkbox = gr.CheckboxGroup(label="AWS Textract settings", choices=["Redact all identified handwriting", "Redact all identified signatures"], value=["Redact all identified handwriting", "Redact all identified signatures"])
 #with gr.Row():
 in_redact_language = gr.Dropdown(value = "en", choices = ["en"], label="Redaction language (only English currently supported)", multiselect=False, visible=False)

@@ -319,7 +323,7 @@ with app:
 in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list])

 document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator]).\
-then(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, first_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state], api_name="prepare_doc").\
+then(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, first_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state], api_name="prepare_doc").\
 then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
 outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number], api_name="redact_doc").\
 then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])

@@ -339,49 +343,56 @@ with app:
 # Upload previous files for modifying redactions
 upload_previous_review_file_btn.click(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list]).\
-then(fn = prepare_image_or_pdf, inputs=[output_review_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state, prepare_for_review_bool], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state]).\
+then(fn = prepare_image_or_pdf, inputs=[output_review_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state, prepare_for_review_bool], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state]).\
 then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])

 # Page controls at top
 annotate_current_page.submit(
-modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
+modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
 then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])

 annotation_last_page_button.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
+then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
 then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
 annotation_next_page_button.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
+then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
 then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])

 # Zoom in and out on annotator
-annotate_zoom_in.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
+annotate_zoom_in.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
 then(update_zoom, inputs=[annotator_zoom_number, annotate_current_page, zoom_true_bool], outputs=[annotator_zoom_number, annotate_current_page])

-annotate_zoom_out.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
+annotate_zoom_out.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
 then(update_zoom, inputs=[annotator_zoom_number, annotate_current_page, zoom_false_bool], outputs=[annotator_zoom_number, annotate_current_page])

 annotator_zoom_number.change(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])

-clear_all_redactions_on_page_btn.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, clear_all_page_redactions], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
+clear_all_redactions_on_page_btn.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base, clear_all_page_redactions], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
 then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])

 #annotation_button_get.click(get_boxes_json, annotator, json_boxes)
-annotation_button_apply.click(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output], scroll_to_output=True)
+annotation_button_apply.click(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output], scroll_to_output=True)

 # Page controls at bottom
 annotate_current_page_bottom.submit(
-modify_existing_page_redactions, inputs = [annotator, annotate_current_page_bottom, annotate_previous_page, all_image_annotations_state], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page]).\
+modify_existing_page_redactions, inputs = [annotator, annotate_current_page_bottom, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
 then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])

 annotation_last_page_button_bottom.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
+then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
 then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])

 annotation_next_page_button_bottom.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
+then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
 then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])

 # Review side bar controls
 recogniser_entity_dropdown.select(update_entities_df, inputs=[recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs=[recogniser_entity_dataframe])

 recogniser_entity_dataframe.select(df_select_callback, inputs=[recogniser_entity_dataframe], outputs=annotate_current_page).\
-then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
+then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
 then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
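The wiring above repeats one pattern: every navigation, zoom, or review-table event first calls modify_existing_page_redactions to fold the annotator's current boxes back into shared state, then calls update_annotator to redraw the page. Below is a minimal, self-contained sketch of that save-then-render chaining with gr.State; the callback names (save_page_edits, render_page) are illustrative stand-ins, not the app's real functions.

import gradio as gr

def save_page_edits(page, edits_state):
    # Fold the current page's edits back into shared state
    # (stand-in for modify_existing_page_redactions)
    edits_state[page] = edits_state.get(page, 0) + 1
    return edits_state

def render_page(page, edits_state):
    # Rebuild the view for the (possibly new) page from state
    # (stand-in for update_annotator)
    return f"Page {page}: {edits_state.get(page, 0)} saved edit batches"

with gr.Blocks() as demo:
    edits_state = gr.State({})  # survives across events, like review_file_state above
    page = gr.Number(value=1, precision=0, label="Page")
    view = gr.Textbox(label="Annotator view")
    next_btn = gr.Button("Next page")

    # Same save-then-render chain as the page buttons above: each .then() step
    # runs after the previous one finishes and reads the state it wrote.
    next_btn.click(save_page_edits, inputs=[page, edits_state], outputs=[edits_state]).\
        then(lambda p: p + 1, inputs=[page], outputs=[page]).\
        then(render_page, inputs=[page, edits_state], outputs=[view])

demo.launch()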
tools/file_conversion.py
CHANGED

@@ -399,6 +399,7 @@ def prepare_image_or_pdf(
 converted_file_paths = []
 image_file_paths = []
 pymupdf_doc = []
+review_file_csv = pd.DataFrame()

 if not file_paths:
 file_paths = []

@@ -424,7 +425,7 @@ def prepare_image_or_pdf(
 final_out_message = '\n'.join(out_message)
 else:
 final_out_message = out_message
-return final_out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object
+return final_out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv

 #in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]

@@ -457,7 +458,7 @@ def prepare_image_or_pdf(
 if not file_path:
 out_message = "Please select a file."
 print(out_message)
-return out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object
+return out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv

 file_extension = os.path.splitext(file_path)[1].lower()

@@ -478,7 +479,7 @@ def prepare_image_or_pdf(
 all_annotations_object.append(annotation)

-print("all_annotations_object:", all_annotations_object)
+#print("all_annotations_object:", all_annotations_object)

 elif is_pdf_or_image(file_path): # Alternatively, if it's an image

@@ -597,13 +598,13 @@ def prepare_image_or_pdf(
 if is_pdf_or_image(file_path) == False:
 out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
 print(out_message)
-return out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object
+return out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv

 elif in_redact_method == text_ocr_option:
 if is_pdf(file_path) == False:
 out_message = "Please upload a PDF file for text analysis."
 print(out_message)
-return out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object
+return out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv

 converted_file_paths.append(converted_file_path)

@@ -624,7 +625,7 @@ def prepare_image_or_pdf(
 #print("all_annotations_object at end:", all_annotations_object)

-return out_message_out, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object
+return out_message_out, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv

 def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str], image_dpi:float=image_dpi):
 file_path_without_ext = get_file_path_end(in_file_path)

@@ -650,7 +651,7 @@ def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str],
 return out_message, out_file_paths

-def convert_review_json_to_pandas_df(data:List[dict]) -> pd.DataFrame:
+def convert_review_json_to_pandas_df(data:List[dict], text_join_data=pd.DataFrame) -> pd.DataFrame:
 # Flatten the data
 flattened_data = []

@@ -670,16 +671,40 @@ def convert_review_json_to_pandas_df(data:List[dict]) -> pd.DataFrame:
 # Check if 'boxes' is in the entry, if not, add an empty list
 if 'boxes' not in entry:
 entry['boxes'] = []

 for box in entry["boxes"]:
-[…]
+if 'text' not in box:
+data_to_add = {"image": image_path, "page": reported_number, **box} # "text": entry['text'],
+else:
+data_to_add = {"image": image_path, "page": reported_number, "text": entry['text'], **box}
 #print("data_to_add:", data_to_add)
 flattened_data.append(data_to_add)

 # Convert to a DataFrame
 df = pd.DataFrame(flattened_data)

+# Join on additional text data from decision output results if included
+if not text_join_data.empty:
+    #print("text_join_data:", text_join_data)
+    #print("df:", df)
+    text_join_data['page'] = text_join_data['page'].astype(str)
+    df['page'] = df['page'].astype(str)
+    text_join_data = text_join_data[['xmin', 'ymin', 'xmax', 'ymax', 'label', 'page', 'text']]
+    text_join_data[['xmin', 'ymin', 'xmax', 'ymax']] = text_join_data[['xmin', 'ymin', 'xmax', 'ymax']].astype(float).round(0)
+    df[['xmin1', 'ymin1', 'xmax1', 'ymax1']] = df[['xmin', 'ymin', 'xmax', 'ymax']].astype(float).round(0)
+
+    df = df.merge(text_join_data, left_on = ['xmin1', 'ymin1', 'xmax1', 'ymax1', 'label', 'page'], right_on = ['xmin', 'ymin', 'xmax', 'ymax', 'label', 'page'], how = "left", suffixes=("", "_y"))
+
+    df = df.drop(['xmin1', 'ymin1', 'xmax1', 'ymax1', 'xmin_y', 'ymin_y', 'xmax_y', 'ymax_y'], axis=1, errors="ignore")
+
+    df = df[["image", "page", "label", "color", "xmin", "ymin", "xmax", "ymax", "text"]]
+
+if 'text' not in df.columns:
+    df['text'] = ''
+
+df = df.sort_values(['page', 'ymin', 'xmin', 'label'])
+
 return df

 def convert_pandas_df_to_review_json(df: pd.DataFrame) -> List[dict]:
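The new join in convert_review_json_to_pandas_df recovers the OCR'd text for each redaction box by matching on (page, label) plus coordinates rounded to whole numbers, which absorbs small float drift between the annotation and decision-process outputs. A standalone sketch of the same technique on invented toy data (column names mirror the review-file format above):

import pandas as pd

# Redaction boxes from the review JSON (no text yet)
boxes = pd.DataFrame({
    "page": ["1", "1"], "label": ["PERSON", "EMAIL"],
    "xmin": [10.2, 50.7], "ymin": [20.1, 80.3],
    "xmax": [110.4, 150.9], "ymax": [35.2, 95.6],
})
# Decision-process output carrying the identified text
ocr = pd.DataFrame({
    "page": ["1"], "label": ["PERSON"],
    "xmin": [10.4], "ymin": [19.8], "xmax": [110.1], "ymax": [35.0],
    "text": ["Jane Doe"],
})

coords = ["xmin", "ymin", "xmax", "ymax"]
rounded = [c + "_r" for c in coords]

# Round both sides so boxes that differ only by sub-pixel noise still match
boxes[rounded] = boxes[coords].astype(float).round(0)
ocr[rounded] = ocr[coords].astype(float).round(0)

merged = boxes.merge(
    ocr[["page", "label", "text"] + rounded],
    on=["page", "label"] + rounded,
    how="left",
).drop(columns=rounded)

print(merged)  # the PERSON box gains text="Jane Doe"; the EMAIL box stays NaN

A left join keeps every redaction box even when no text was found for it, which matches the commit's goal of carrying identified text into the review file without dropping boxes.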
tools/file_redaction.py
CHANGED
(Note: the new-file side of this diff was not captured by the page; lines lost in the capture are marked […].)

@@ -40,6 +40,11 @@ print(f'The value of page_break_value is {page_break_value}')
 max_time_value = get_or_create_env_var('max_time_value', '999999')
 print(f'The value of max_time_value is {max_time_value}')

+[…]
 def sum_numbers_before_seconds(string:str):
 """Extracts numbers that precede the word 'seconds' from a string and adds them up.

@@ -396,7 +401,7 @@ def choose_and_run_redactor(file_paths:List[str],
 # Convert json to csv and also save this
 #print("annotations_all_pages:", annotations_all_pages)

-review_df = convert_review_json_to_pandas_df(annotations_all_pages)
+[…]

 out_review_file_file_path = out_image_file_path + '_review_file.csv'
 review_df.to_csv(out_review_file_file_path, index=None)

@@ -452,7 +457,7 @@ def choose_and_run_redactor(file_paths:List[str],
 return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number

-def convert_pikepdf_coords_to_pymupdf(pymupdf_page, pikepdf_bbox):
+[…]
 '''
 Convert annotations from pikepdf to pymupdf format, handling the mediabox larger than rect.
 '''

@@ -474,7 +479,10 @@ convert_pikepdf_coords_to_pymupdf(pymupdf_page, pikepdf_bbox)
 x_diff_ratio = media_reference_x_diff / reference_box_width

 # Extract the annotation rectangle field
-[…]
+[…]
 rect_coordinates = [float(coord) for coord in rect_field] # Convert to floats

 # Unpack coordinates

@@ -487,7 +495,7 @@ convert_pikepdf_coords_to_pymupdf(pymupdf_page, pikepdf_bbox)
 return new_x1, new_y1, new_x2, new_y2

-def convert_pikepdf_to_image_coords(pymupdf_page, annot, image:Image):
+[…]
 '''
 Convert annotations from pikepdf coordinates to image coordinates.
 '''

@@ -504,7 +512,10 @@ convert_pikepdf_to_image_coords(pymupdf_page, annot, image:Image)
 scale_height = image_page_height / rect_height

 # Extract the /Rect field
-[…]
+[…]

 # Convert the extracted /Rect field to a list of floats
 rect_coordinates = [float(coord) for coord in rect_field]

@@ -518,9 +529,30 @@
 return x1_image, new_y1_image, x2_image, new_y2_image

-def […]
+[…]
 '''
-Converts an image with redaction coordinates from a CustomImageRecognizerResult to pymupdf coordinates.
+[…]
 '''

 rect_height = pymupdf_page.rect.height

@@ -533,14 +565,29 @@ def convert_image_coords_to_pymupdf(pymupdf_page, annot:CustomImageRecognizerRes
 scale_height = rect_height / image_page_height

 # Calculate scaled coordinates
-[…]
+[…]

 def convert_gradio_annotation_coords_to_pymupdf(pymupdf_page:Page, annot:dict, image:Image):
 '''
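Several of the helpers reworked above translate between three coordinate systems: image pixels and PyMuPDF page points (both top-left origin, so scale-only) and PDF-native /Rect coordinates from pikepdf (bottom-left origin, so scale plus a y-flip). A rough sketch of the two conversions, with hypothetical function names; the real helpers also account for mediabox offsets, which this sketch ignores:

def image_to_pymupdf(box, image_size, page_size):
    """Scale an (x0, y0, x1, y1) box from image pixels to PyMuPDF points.
    Both systems use a top-left origin, so only scaling is needed."""
    (iw, ih), (pw, ph) = image_size, page_size
    sx, sy = pw / iw, ph / ih
    x0, y0, x1, y1 = box
    return (x0 * sx, y0 * sy, x1 * sx, y1 * sy)

def pdf_rect_to_image(rect, page_size, image_size):
    """Convert a bottom-up PDF /Rect (x0, y0, x1, y1) to top-down image pixels:
    scale, then flip the y axis around the page height. y0 and y1 swap because
    flipping reverses which edge is the top."""
    (pw, ph), (iw, ih) = page_size, image_size
    sx, sy = iw / pw, ih / ph
    x0, y0, x1, y1 = rect
    return (x0 * sx, (ph - y1) * sy, x1 * sx, (ph - y0) * sy)

# A box near the top of a 595x842 pt page maps to the top of a 2x-scale image:
print(image_to_pymupdf((100, 50, 200, 100), (1190, 1684), (595, 842)))   # (50.0, 25.0, 100.0, 50.0)
print(pdf_rect_to_image((100, 742, 200, 800), (595, 842), (1190, 1684)))  # (200.0, 84.0, 400.0, 200.0)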
@@ -577,7 +624,7 @@ def move_page_info(file_path: str) -> str:
 return new_file_path

-def redact_page_with_pymupdf(page:Page, […]
+[…]

 mediabox_height = page.mediabox[3] - page.mediabox[1]
 mediabox_width = page.mediabox[2] - page.mediabox[0]

@@ -599,10 +646,10 @@ redact_page_with_pymupdf(page:Page, annotations_on_page:dict, image:Image.Im
 image = Image.open(image_path)

 # Check if this is an object used in the Gradio Annotation component
-if isinstance ([…]
-[…]

-for annot in […]
+[…]
 # Check if an Image recogniser result, or a Gradio annotation object
 if (isinstance(annot, CustomImageRecognizerResult)) | isinstance(annot, dict):

@@ -611,12 +658,16 @@ redact_page_with_pymupdf(page:Page, annotations_on_page:dict, image:Image.Im
 # Should already be in correct format if img_annotator_box is an input
 if isinstance(annot, dict):
 img_annotation_box = annot
-
 pymupdf_x1, pymupdf_y1, pymupdf_x2, pymupdf_y2 = convert_gradio_annotation_coords_to_pymupdf(page, annot, image)

 x1 = pymupdf_x1
 x2 = pymupdf_x2

+[…]
 # Else should be CustomImageRecognizerResult
 else:
 pymupdf_x1, pymupdf_y1, pymupdf_x2, pymupdf_y2 = convert_image_coords_to_pymupdf(page, annot, image)

@@ -633,12 +684,19 @@ redact_page_with_pymupdf(page:Page, annotations_on_page:dict, image:Image.Im
 img_annotation_box["label"] = annot.entity_type
 except:
 img_annotation_box["label"] = "Redaction"
+[…]

 rect = Rect(x1, pymupdf_y1, x2, pymupdf_y2) # Create the PyMuPDF Rect

 # Else it should be a pikepdf annotation object
-else:
-[…]
+[…]

 x1 = pymupdf_x1
 x2 = pymupdf_x2

@@ -650,6 +708,8 @@ redact_page_with_pymupdf(page:Page, annotations_on_page:dict, image:Image.Im
 if image:
 img_width, img_height = image.size

+[…]
 x1, image_y1, x2, image_y2 = convert_pymupdf_to_image_coords(page, x1, pymupdf_y1, x2, pymupdf_y2, image)

 img_annotation_box["xmin"] = x1 #* (img_width / rect_width) # Use adjusted x1

@@ -662,6 +722,10 @@ redact_page_with_pymupdf(page:Page, annotations_on_page:dict, image:Image.Im
 img_annotation_box["label"] = str(annot["/T"])
 else:
 img_annotation_box["label"] = "REDACTION"
+[…]

 # Convert to a PyMuPDF Rect object
 #rect = Rect(rect_coordinates)

@@ -672,29 +736,6 @@ redact_page_with_pymupdf(page:Page, annotations_on_page:dict, image:Image.Im
 # If whole page is to be redacted, do that here
 if redact_whole_page == True:
-# # Small border to page that remains white
-# border = 5
-# # Define the coordinates for the Rect
-# whole_page_x1, whole_page_y1 = 0 + border, 0 + border # Bottom-left corner
-# whole_page_x2, whole_page_y2 = rect_width - border, rect_height - border # Top-right corner
-
-# whole_page_image_x1, whole_page_image_y1, whole_page_image_x2, whole_page_image_y2 = convert_pymupdf_to_image_coords(page, whole_page_x1, whole_page_y1, whole_page_x2, whole_page_y2, image)
-
-# # Create new image annotation element based on whole page coordinates
-# whole_page_rect = Rect(whole_page_x1, whole_page_y1, whole_page_x2, whole_page_y2)
-
-# # Write whole page annotation to annotation boxes
-# whole_page_img_annotation_box = {}
-# whole_page_img_annotation_box["xmin"] = whole_page_image_x1
-# whole_page_img_annotation_box["ymin"] = whole_page_image_y1
-# whole_page_img_annotation_box["xmax"] = whole_page_image_x2
-# whole_page_img_annotation_box["ymax"] = whole_page_image_y2
-# whole_page_img_annotation_box["color"] = (0,0,0)
-# whole_page_img_annotation_box["label"] = "Whole page"
-
-# redact_single_box(page, whole_page_rect, whole_page_img_annotation_box, custom_colours)
-
-# all_image_annotation_boxes.append(whole_page_img_annotation_box)

 whole_page_img_annotation_box = redact_whole_pymupdf_page(rect_height, rect_width, image, page, custom_colours, border = 5)
 all_image_annotation_boxes.append(whole_page_img_annotation_box)

@@ -712,14 +753,7 @@ redact_page_with_pymupdf(page:Page, annotations_on_page:dict, image:Image.Im
 return page, out_annotation_boxes

-def bounding_boxes_overlap(box1, box2):
-    """Check if two bounding boxes overlap."""
-    return (box1[0] < box2[2] and box2[0] < box1[2] and
-            box1[1] < box2[3] and box2[1] < box1[3])

-from collections import defaultdict
-from typing import List, Dict
-import copy

 def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_results=[], handwriting_recogniser_results=[], handwrite_signature_checkbox: List[str]=["Redact all identified handwriting", "Redact all identified signatures"], horizontal_threshold:int=50, vertical_threshold:int=12):
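The helper deleted above is the standard axis-aligned overlap test (two boxes overlap iff they overlap on both axes), and merge_img_bboxes keeps the companion idea of grouping boxes by approximate line and merging near neighbours. A compact, self-contained sketch of both, with illustrative thresholds matching the defaults in the signature above (horizontal 50, vertical 12); this is not the file's implementation:

from typing import List, Tuple

Box = Tuple[float, float, float, float]  # (left, top, right, bottom)

def boxes_overlap(a: Box, b: Box) -> bool:
    # Overlap on the x axis AND the y axis simultaneously
    return a[0] < b[2] and b[0] < a[2] and a[1] < b[3] and b[1] < a[3]

def merge_adjacent(boxes: List[Box], h_gap: float = 50, v_tol: float = 12) -> List[Box]:
    """Merge boxes on roughly the same text line whose horizontal gap is small."""
    merged: List[Box] = []
    # Sort by approximate line (top coordinate bucketed by v_tol), then by left edge
    for box in sorted(boxes, key=lambda b: (round(b[1] / v_tol), b[0])):
        if merged:
            last = merged[-1]
            same_line = abs(box[1] - last[1]) <= v_tol
            close = box[0] - last[2] <= h_gap
            if same_line and close:
                # Expand the previous box to cover both
                merged[-1] = (last[0], min(last[1], box[1]),
                              max(last[2], box[2]), max(last[3], box[3]))
                continue
        merged.append(box)
    return merged

print(boxes_overlap((0, 0, 10, 10), (5, 5, 15, 15)))       # True
print(merge_adjacent([(0, 0, 40, 12), (60, 1, 100, 12)]))  # one merged box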
@@ -822,117 +856,6 @@ def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_result
|
|
822 |
unique_bboxes = list({(bbox.left, bbox.top, bbox.width, bbox.height): bbox for bbox in all_bboxes}.values())
|
823 |
return unique_bboxes
|
824 |
|
825 |
-
|
826 |
-
# def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_results=[], handwriting_recogniser_results=[], handwrite_signature_checkbox: List[str]=["Redact all identified handwriting", "Redact all identified signatures"], horizontal_threshold:int=50, vertical_threshold:int=12):
|
827 |
-
# merged_bboxes = []
|
828 |
-
# grouped_bboxes = defaultdict(list)
|
829 |
-
|
830 |
-
# # Process signature and handwriting results
|
831 |
-
# if signature_recogniser_results or handwriting_recogniser_results:
|
832 |
-
# if "Redact all identified handwriting" in handwrite_signature_checkbox:
|
833 |
-
# #print("Handwriting boxes exist at merge:", handwriting_recogniser_results)
|
834 |
-
# merged_bboxes.extend(handwriting_recogniser_results)
|
835 |
-
|
836 |
-
# if "Redact all identified signatures" in handwrite_signature_checkbox:
|
837 |
-
# #print("Signature boxes exist at merge:", signature_recogniser_results)
|
838 |
-
# merged_bboxes.extend(signature_recogniser_results)
|
839 |
-
|
840 |
-
|
841 |
-
# # Reconstruct bounding boxes for substrings of interest
|
842 |
-
# reconstructed_bboxes = []
|
843 |
-
# for bbox in bboxes:
|
844 |
-
# #print("bbox:", bbox)
|
845 |
-
# bbox_box = (bbox.left, bbox.top, bbox.left + bbox.width, bbox.top + bbox.height)
|
846 |
-
# for line_text, line_info in combined_results.items():
|
847 |
-
# line_box = line_info['bounding_box']
|
848 |
-
# if bounding_boxes_overlap(bbox_box, line_box):
|
849 |
-
# if bbox.text in line_text:
|
850 |
-
# start_char = line_text.index(bbox.text)
|
851 |
-
# end_char = start_char + len(bbox.text)
|
852 |
-
|
853 |
-
# relevant_words = []
|
854 |
-
# current_char = 0
|
855 |
-
# for word in line_info['words']:
|
856 |
-
# word_end = current_char + len(word['text'])
|
857 |
-
# if current_char <= start_char < word_end or current_char < end_char <= word_end or (start_char <= current_char and word_end <= end_char):
|
858 |
-
# relevant_words.append(word)
|
859 |
-
# if word_end >= end_char:
|
860 |
-
# break
|
861 |
-
# current_char = word_end
|
862 |
-
# if not word['text'].endswith(' '):
|
863 |
-
# current_char += 1 # +1 for space if the word doesn't already end with a space
|
864 |
-
|
865 |
-
# if relevant_words:
|
866 |
-
# #print("Relevant words:", relevant_words)
|
867 |
-
# left = min(word['bounding_box'][0] for word in relevant_words)
|
868 |
-
# top = min(word['bounding_box'][1] for word in relevant_words)
|
869 |
-
# right = max(word['bounding_box'][2] for word in relevant_words)
|
870 |
-
# bottom = max(word['bounding_box'][3] for word in relevant_words)
|
871 |
-
|
872 |
-
# # Combine the text of all relevant words
|
873 |
-
# combined_text = " ".join(word['text'] for word in relevant_words)
|
874 |
-
|
875 |
-
# # Calculate new dimensions for the merged box
|
876 |
-
# reconstructed_bbox = CustomImageRecognizerResult(
|
877 |
-
# bbox.entity_type,
|
878 |
-
# bbox.start,
|
879 |
-
# bbox.end,
|
880 |
-
# bbox.score,
|
881 |
-
# left,
|
882 |
-
# top,
|
883 |
-
# right - left, # width
|
884 |
-
# bottom - top, # height
|
885 |
-
# combined_text
|
886 |
-
# )
|
887 |
-
# # Add both the original and the merged bounding box
|
888 |
-
# reconstructed_bboxes.append(bbox) # Retain the original bbox
|
889 |
-
# reconstructed_bboxes.append(reconstructed_bbox) # Add the merged bbox
|
890 |
-
# break
|
891 |
-
# else:
|
892 |
-
# # If the bbox text is not found in any line in combined_results, keep the original bbox
|
893 |
-
# reconstructed_bboxes.append(bbox)
|
894 |
-
|
895 |
-
# # Group reconstructed bboxes by approximate vertical proximity
|
896 |
-
# for box in reconstructed_bboxes:
|
897 |
-
# grouped_bboxes[round(box.top / vertical_threshold)].append(box)
|
898 |
-
|
899 |
-
# # Merge within each group
|
900 |
-
# for _, group in grouped_bboxes.items():
|
901 |
-
# group.sort(key=lambda box: box.left)
|
902 |
-
|
903 |
-
# merged_box = group[0]
|
904 |
-
# for next_box in group[1:]:
|
905 |
-
# if next_box.left - (merged_box.left + merged_box.width) <= horizontal_threshold:
|
906 |
-
# # Calculate new dimensions for the merged box
|
907 |
-
# if merged_box.text == next_box.text:
|
908 |
-
# new_text = merged_box.text
|
909 |
-
# else:
|
910 |
-
# new_text = merged_box.text + " " + next_box.text
|
911 |
-
|
912 |
-
# if merged_box.text == next_box.text:
|
913 |
-
# new_text = merged_box.text
|
914 |
-
# new_entity_type = merged_box.entity_type # Keep the original entity type
|
915 |
-
# else:
|
916 |
-
# new_text = merged_box.text + " " + next_box.text
|
917 |
-
# new_entity_type = merged_box.entity_type + " - " + next_box.entity_type # Concatenate entity types
|
918 |
-
|
919 |
-
# new_left = min(merged_box.left, next_box.left)
|
920 |
-
# new_top = min(merged_box.top, next_box.top)
|
921 |
-
# new_width = max(merged_box.left + merged_box.width, next_box.left + next_box.width) - new_left
|
922 |
-
# new_height = max(merged_box.top + merged_box.height, next_box.top + next_box.height) - new_top
|
923 |
-
# merged_box = CustomImageRecognizerResult(
|
924 |
-
# new_entity_type, merged_box.start, merged_box.end, merged_box.score, new_left, new_top, new_width, new_height, new_text
|
925 |
-
# )
|
926 |
-
# else:
|
927 |
-
# merged_bboxes.append(merged_box)
|
928 |
-
# merged_box = next_box
|
929 |
-
|
930 |
-
# merged_bboxes.append(merged_box)
|
931 |
-
|
932 |
-
# #print("bboxes:", bboxes)
|
933 |
-
|
934 |
-
# return merged_bboxes
|
935 |
-
|
936 |
def redact_image_pdf(file_path:str,
|
937 |
prepared_pdf_file_paths:List[str],
|
938 |
language:str,
|
@@ -1279,17 +1202,21 @@ def redact_image_pdf(file_path:str,
|
|
1279 |
|
1280 |
# Convert decision process to table
|
1281 |
decision_process_table = pd.DataFrame([{
|
1282 |
-
'
|
1283 |
-
'
|
|
|
|
|
|
|
|
|
1284 |
'start': result.start,
|
1285 |
'end': result.end,
|
1286 |
'score': result.score,
|
1287 |
-
'
|
1288 |
-
|
1289 |
-
|
1290 |
-
'
|
1291 |
-
'
|
1292 |
-
|
1293 |
|
1294 |
all_decision_process_table = pd.concat([all_decision_process_table, decision_process_table])
|
1295 |
|
@@ -1323,7 +1250,7 @@ def redact_image_pdf(file_path:str,
|
|
1323 |
pymupdf_doc = images
|
1324 |
|
1325 |
# Check if the image already exists in annotations_all_pages
|
1326 |
-
print("annotations_all_pages:", annotations_all_pages)
|
1327 |
existing_index = next((index for index, ann in enumerate(annotations_all_pages) if ann["image"] == image_annotations["image"]), None)
|
1328 |
if existing_index is not None:
|
1329 |
# Replace the existing annotation
|
@@ -1346,7 +1273,7 @@ def redact_image_pdf(file_path:str,
|
|
1346 |
pymupdf_doc = images
|
1347 |
|
1348 |
# Check if the image already exists in annotations_all_pages
|
1349 |
-
print("annotations_all_pages:", annotations_all_pages)
|
1350 |
existing_index = next((index for index, ann in enumerate(annotations_all_pages) if ann["image"] == image_annotations["image"]), None)
|
1351 |
if existing_index is not None:
|
1352 |
# Replace the existing annotation
|
@@ -1595,105 +1522,25 @@ def merge_text_bounding_boxes(analyser_results, characters: List[LTChar], combin

return analysed_bounding_boxes

-
- # def merge_text_bounding_boxes(analyser_results, characters:List[LTChar], combine_pixel_dist:int=20, vertical_padding:int=0):
- # '''
- # Merge identified bounding boxes containing PII that are very close to one another
- # '''
- # analysed_bounding_boxes = []
- # if len(analyser_results) > 0 and len(characters) > 0:
- # # Extract bounding box coordinates for sorting
- # bounding_boxes = []
- # text_out = []
- # for result in analyser_results:
- # char_boxes = [char.bbox for char in characters[result.start:result.end] if isinstance(char, LTChar)]
- # char_text = [char._text for char in characters[result.start:result.end] if isinstance(char, LTChar)]
- # if char_boxes:
- # # Calculate the bounding box that encompasses all characters
- # left = min(box[0] for box in char_boxes)
- # bottom = min(box[1] for box in char_boxes)
- # right = max(box[2] for box in char_boxes)
- # top = max(box[3] for box in char_boxes) + vertical_padding
- # bounding_boxes.append((bottom, left, result, [left, bottom, right, top], char_text)) # (y, x, result, bbox, text)
-
- # char_text = "".join(char_text)
-
- # # Sort the results by y-coordinate and then by x-coordinate
- # bounding_boxes.sort()
-
- # merged_bounding_boxes = []
- # current_box = None
- # current_y = None
- # current_result = None
- # current_text = []
-
- # for y, x, result, char_box, text in bounding_boxes:
- # #print(f"Considering result: {result}")
- # #print(f"Character box: {char_box}")
-
- # if current_y is None or current_box is None:
- # current_box = char_box
- # current_y = char_box[1]
- # current_result = result
- # current_text = list(text)
- # #print(f"Starting new box: {current_box}")
- # else:
- # vertical_diff_bboxes = abs(char_box[1] - current_y)
- # horizontal_diff_bboxes = abs(char_box[0] - current_box[2])
-
- # if (
- # vertical_diff_bboxes <= 5 and horizontal_diff_bboxes <= combine_pixel_dist
- # ):
- # #print("box is being extended")
- # current_box[2] = char_box[2] # Extend the current box horizontally
- # current_box[3] = max(current_box[3], char_box[3]) # Ensure the top is the highest
- # current_result.end = max(current_result.end, result.end) # Extend the text range
- # try:
- # current_result.entity_type = current_result.entity_type + " - " + result.entity_type
- # except Exception as e:
- # print("Unable to combine result entity types:")
- # print(e)
- # # Add a space if current_text is not empty
- # if current_text:
- # current_text.append(" ") # Add space between texts
- # current_text.extend(text)
-
- # #print(f"Latest merged box: {current_box[-1]}")
- # else:
- # merged_bounding_boxes.append(
- # {"text":"".join(current_text),"boundingBox": current_box, "result": current_result})
-
- # # Reset current_box and current_y after appending
- # current_box = char_box
- # current_y = char_box[1]
- # current_result = result
- # current_text = list(text)
-
- # # After finishing with the current result, add the last box for this result
- # if current_box:
- # merged_bounding_boxes.append({"text":"".join(current_text), "boundingBox": current_box, "result": current_result})
-
- # if not merged_bounding_boxes:
- # analysed_bounding_boxes.extend(
- # {"text":text, "boundingBox": char.bbox, "result": result}
- # for result in analyser_results
- # for char in characters[result.start:result.end]
- # if isinstance(char, LTChar)
- # )
- # else:
- # analysed_bounding_boxes.extend(merged_bounding_boxes)
-
- # return analysed_bounding_boxes
-
-
def create_text_redaction_process_results(analyser_results, analysed_bounding_boxes, page_num):
decision_process_table = pd.DataFrame()

if len(analyser_results) > 0:
# Create summary df of annotations to be made
analysed_bounding_boxes_df_new = pd.DataFrame(analysed_bounding_boxes)
analysed_bounding_boxes_df_text = analysed_bounding_boxes_df_new['result'].astype(str).str.split(",",expand=True).replace(".*: ", "", regex=True)
- analysed_bounding_boxes_df_text.columns = ["
analysed_bounding_boxes_df_new = pd.concat([analysed_bounding_boxes_df_new, analysed_bounding_boxes_df_text], axis = 1)
analysed_bounding_boxes_df_new['page'] = page_num + 1
decision_process_table = pd.concat([decision_process_table, analysed_bounding_boxes_df_new], axis = 0).drop('result', axis=1)
@@ -1702,8 +1549,8 @@ def create_text_redaction_process_results(analyser_results, analysed_bounding_bo

return decision_process_table

- def create_annotations_for_bounding_boxes(analysed_bounding_boxes):
-
for analysed_bounding_box in analysed_bounding_boxes:
bounding_box = analysed_bounding_box["boundingBox"]
annotation = Dictionary(
@@ -1721,8 +1568,8 @@ def create_annotations_for_bounding_boxes(analysed_bounding_boxes):
S=Name.S # Border style: solid
)
)
-
- return
def redact_text_pdf(
filename: str, # Path to the PDF file to be redacted
@@ -1840,13 +1687,17 @@ def redact_text_pdf(

if page_min <= page_no < page_max:

for page_layout in extract_pages(filename, page_numbers = [page_no], maxpages=1):

page_analyser_results = []
page_analysed_bounding_boxes = []

characters = []
-
decision_process_table_on_page = pd.DataFrame()
page_text_outputs = pd.DataFrame()

@@ -1900,8 +1751,7 @@ def redact_text_pdf(
)
all_text_line_results.append((i, text_line_analyser_result))

-
-
elif pii_identification_method == "AWS Comprehend":

# First use the local Spacy model to pick up custom entities that AWS Comprehend can't search for.
@@ -2006,17 +1856,24 @@ def redact_text_pdf(
text_container_analyser_results.extend(text_line_analyser_result)
text_container_analysed_bounding_boxes.extend(text_line_bounding_boxes)

- print("text_container_analyser_results:", text_container_analyser_results)

page_analysed_bounding_boxes.extend(text_line_bounding_boxes) # Add this line

- print("

# Annotate redactions on page
-
- print("

# Make pymupdf page redactions
#print("redact_whole_page_list:", redact_whole_page_list)
@@ -2025,7 +1882,9 @@ def redact_text_pdf(
else: redact_whole_page = False
else: redact_whole_page = False

- pymupdf_page, image_annotations = redact_page_with_pymupdf(pymupdf_page,

#print("Did redact_page_with_pymupdf function")
reported_page_no = page_no + 1
@@ -2037,6 +1896,7 @@ def redact_text_pdf(

if not decision_process_table_on_page.empty:
all_decision_process_table = pd.concat([all_decision_process_table, decision_process_table_on_page])

if not page_text_outputs.empty:
page_text_outputs = page_text_outputs.sort_values(["top", "left"], ascending=[False, False]).reset_index(drop=True)
max_time_value = get_or_create_env_var('max_time_value', '999999')
print(f'The value of max_time_value is {max_time_value}')

+ def bounding_boxes_overlap(box1, box2):
+ """Check if two bounding boxes overlap."""
+ return (box1[0] < box2[2] and box2[0] < box1[2] and
+ box1[1] < box2[3] and box2[1] < box1[3])
+
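A quick sanity check of the overlap test just added (illustrative coordinates only, not part of the commit): boxes are (x0, y0, x1, y1) tuples, and two boxes overlap only when they intersect on both axes.

box_a = (0, 0, 10, 10)
box_b = (5, 5, 15, 15)    # intersects box_a on both axes
box_c = (20, 0, 30, 10)   # no x-overlap with box_a
assert bounding_boxes_overlap(box_a, box_b)
assert not bounding_boxes_overlap(box_a, box_c)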
def sum_numbers_before_seconds(string:str):
"""Extracts numbers that precede the word 'seconds' from a string and adds them up.
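Only the docstring of sum_numbers_before_seconds is visible in this hunk; a regex-based body consistent with it might look like the sketch below (an assumption, not the repo's actual implementation).

import re

def sum_numbers_before_seconds_sketch(string: str) -> float:
    # Total every number that directly precedes the word "seconds"
    return sum(float(m) for m in re.findall(r"([\d.]+)\s*seconds", string))

# sum_numbers_before_seconds_sketch("3.5 seconds, then 2 seconds") == 5.5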
# Convert json to csv and also save this
#print("annotations_all_pages:", annotations_all_pages)

+ review_df = convert_review_json_to_pandas_df(annotations_all_pages, all_decision_process_table)

out_review_file_file_path = out_image_file_path + '_review_file.csv'
review_df.to_csv(out_review_file_file_path, index=None)
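convert_review_json_to_pandas_df is defined elsewhere in the repo, so its exact schema is not visible here; a rough sketch of the flattening it implies, assuming each page entry carries an "image" key plus a list of box dicts as used throughout this commit:

import pandas as pd

def convert_review_json_to_pandas_df_sketch(annotations_all_pages):
    # One row per redaction box, keeping the page number and source image
    rows = []
    for page_no, page in enumerate(annotations_all_pages, start=1):
        for box in page.get("boxes", []):
            rows.append({"image": page["image"], "page": page_no, **box})
    return pd.DataFrame(rows)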
return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number
+ def convert_pikepdf_coords_to_pymupdf(pymupdf_page, pikepdf_bbox, type="pikepdf_annot"):
'''
Convert annotations from pikepdf to pymupdf format, handling the mediabox larger than rect.
'''

x_diff_ratio = media_reference_x_diff / reference_box_width

# Extract the annotation rectangle field
+ if type=="pikepdf_annot":
+ rect_field = pikepdf_bbox["/Rect"]
+ else:
+ rect_field = pikepdf_bbox
rect_coordinates = [float(coord) for coord in rect_field] # Convert to floats

# Unpack coordinates

return new_x1, new_y1, new_x2, new_y2
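The conversion hinges on pikepdf /Rect values living in PDF user space (origin bottom-left, y increasing upwards) while pymupdf rects use a top-left origin. A generic illustration of that y-flip (a sketch only; the function above additionally corrects for a mediabox larger than the page rect, which is elided from this hunk):

def flip_y_to_top_left(y0, y1, page_height):
    # PDF user space measures y from the bottom; pymupdf measures from the top,
    # so each value becomes (page_height - y) and the pair swaps roles.
    return page_height - y1, page_height - y0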
+ def convert_pikepdf_to_image_coords(pymupdf_page, annot, image:Image, type="pikepdf_annot"):
'''
Convert annotations from pikepdf coordinates to image coordinates.
'''

scale_height = image_page_height / rect_height

# Extract the /Rect field
+ if type=="pikepdf_annot":
+ rect_field = annot["/Rect"]
+ else:
+ rect_field = annot

# Convert the extracted /Rect field to a list of floats
rect_coordinates = [float(coord) for coord in rect_field]

return x1_image, new_y1_image, x2_image, new_y2_image
+ def convert_pikepdf_decision_output_to_image_coords(pymupdf_page, pikepdf_decision_output_data:List, image):
+ if isinstance(image, str):
+ image_path = image
+ image = Image.open(image_path)
+
+ # Loop through each item in the data
+ for item in pikepdf_decision_output_data:
+ # Extract the bounding box
+ bounding_box = item['boundingBox']
+
+ # Create a pikepdf_bbox dictionary to match the expected input
+ pikepdf_bbox = {"/Rect": bounding_box}
+
+ # Call the conversion function
+ new_x1, new_y1, new_x2, new_y2 = convert_pikepdf_to_image_coords(pymupdf_page, pikepdf_bbox, image, type="pikepdf_annot")
+
+ # Update the original object with the new bounding box values
+ item['boundingBox'] = [new_x1, new_y1, new_x2, new_y2]
+
+ return pikepdf_decision_output_data
+
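A hedged usage sketch of the helper above (made-up coordinates; assumes a prepared pymupdf_page and the path to its rendered page image):

decision_data = [{"boundingBox": [72.0, 700.5, 210.0, 715.0], "label": "PERSON"}]
decision_data = convert_pikepdf_decision_output_to_image_coords(pymupdf_page, decision_data, "page_1.png")
# Each item's boundingBox is now expressed in image pixels rather than PDF points.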
+ def convert_image_coords_to_pymupdf(pymupdf_page, annot, image:Image, type="image_recognizer"):
'''
+ Converts an image with redaction coordinates from a CustomImageRecognizerResult or pikepdf object with image coordinates to pymupdf coordinates.
'''

rect_height = pymupdf_page.rect.height

scale_height = rect_height / image_page_height

# Calculate scaled coordinates
+ if type == "image_recognizer":
+ x1 = (annot.left * scale_width)# + page_x_adjust
+ new_y1 = (annot.top * scale_height)# - page_y_adjust # Flip Y0 (since it starts from bottom)
+ x2 = ((annot.left + annot.width) * scale_width)# + page_x_adjust # Calculate x1
+ new_y2 = ((annot.top + annot.height) * scale_height)# - page_y_adjust # Calculate y1 correctly
+ # Else assume it is a pikepdf derived object
+ else:
+ rect_field = annot["/Rect"]
+ rect_coordinates = [float(coord) for coord in rect_field] # Convert to floats

+ # Unpack coordinates
+ x1, y1, x2, y2 = rect_coordinates
+
+ #print("scale_width:", scale_width)
+ #print("scale_height:", scale_height)

+ x1 = (x1* scale_width)# + page_x_adjust
+ new_y1 = ((y2 + (y1 - y2))* scale_height)# - page_y_adjust # Calculate y1 correctly
+ x2 = ((x1 + (x2 - x1)) * scale_width)# + page_x_adjust # Calculate x1
+ new_y2 = (y2 * scale_height)# - page_y_adjust # Flip Y0 (since it starts from bottom)
+

+ return x1, new_y1, x2, new_y2
def convert_gradio_annotation_coords_to_pymupdf(pymupdf_page:Page, annot:dict, image:Image):
'''

return new_file_path
+ def redact_page_with_pymupdf(page:Page, page_annotations:dict, image=None, custom_colours:bool=False, redact_whole_page:bool=False, convert_coords:bool=True):

mediabox_height = page.mediabox[3] - page.mediabox[1]
mediabox_width = page.mediabox[2] - page.mediabox[0]

image = Image.open(image_path)

# Check if this is an object used in the Gradio Annotation component
+ if isinstance (page_annotations, dict):
+ page_annotations = page_annotations["boxes"]

+ for annot in page_annotations:
# Check if an Image recogniser result, or a Gradio annotation object
if (isinstance(annot, CustomImageRecognizerResult)) | isinstance(annot, dict):

# Should already be in correct format if img_annotator_box is an input
if isinstance(annot, dict):
img_annotation_box = annot
pymupdf_x1, pymupdf_y1, pymupdf_x2, pymupdf_y2 = convert_gradio_annotation_coords_to_pymupdf(page, annot, image)

x1 = pymupdf_x1
x2 = pymupdf_x2

+ # if hasattr(annot, 'text') and annot.text:
+ # img_annotation_box["text"] = annot.text
+ # else:
+ # img_annotation_box["text"] = ""
+

# Else should be CustomImageRecognizerResult
else:
pymupdf_x1, pymupdf_y1, pymupdf_x2, pymupdf_y2 = convert_image_coords_to_pymupdf(page, annot, image)

img_annotation_box["label"] = annot.entity_type
except:
img_annotation_box["label"] = "Redaction"
+ # if hasattr(annot, 'text') and annot.text:
+ # img_annotation_box["text"] = annot.text
+ # else:
+ # img_annotation_box["text"] = ""

rect = Rect(x1, pymupdf_y1, x2, pymupdf_y2) # Create the PyMuPDF Rect

# Else it should be a pikepdf annotation object
+ else:
+ if convert_coords == True:
+ pymupdf_x1, pymupdf_y1, pymupdf_x2, pymupdf_y2 = convert_pikepdf_coords_to_pymupdf(page, annot)
+ else:
+ pymupdf_x1, pymupdf_y1, pymupdf_x2, pymupdf_y2 = convert_image_coords_to_pymupdf(page, annot, image, type="pikepdf_image_coords")

x1 = pymupdf_x1
x2 = pymupdf_x2

if image:
img_width, img_height = image.size

+ print("annot:", annot)
+
x1, image_y1, x2, image_y2 = convert_pymupdf_to_image_coords(page, x1, pymupdf_y1, x2, pymupdf_y2, image)

img_annotation_box["xmin"] = x1 #* (img_width / rect_width) # Use adjusted x1

img_annotation_box["label"] = str(annot["/T"])
else:
img_annotation_box["label"] = "REDACTION"
+ # if hasattr(annot, 'text') and annot.text:
+ # img_annotation_box["text"] = annot.text
+ # else:
+ # img_annotation_box["text"] = ""

# Convert to a PyMuPDF Rect object
#rect = Rect(rect_coordinates)

# If whole page is to be redacted, do that here
if redact_whole_page == True:

whole_page_img_annotation_box = redact_whole_pymupdf_page(rect_height, rect_width, image, page, custom_colours, border = 5)
all_image_annotation_boxes.append(whole_page_img_annotation_box)

return page, out_annotation_boxes

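The burn-in itself happens in parts of redact_page_with_pymupdf collapsed out of this diff; the standard PyMuPDF pattern it presumably builds on looks like this (a generic sketch, not the commit's exact code):

import fitz  # PyMuPDF

doc = fitz.open("input.pdf")
page = doc[0]
rect = fitz.Rect(72, 700, 210, 715)          # made-up coordinates
page.add_redact_annot(rect, fill=(0, 0, 0))  # queue a black box over the rect
page.apply_redactions()                      # permanently remove the covered content
doc.save("redacted.pdf")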
def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_results=[], handwriting_recogniser_results=[], handwrite_signature_checkbox: List[str]=["Redact all identified handwriting", "Redact all identified signatures"], horizontal_threshold:int=50, vertical_threshold:int=12):
unique_bboxes = list({(bbox.left, bbox.top, bbox.width, bbox.height): bbox for bbox in all_bboxes}.values())
return unique_bboxes

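The dedup one-liner above keys each box by its coordinate tuple, so duplicates simply overwrite one another in the dict; a minimal illustration with a stand-in box type:

from collections import namedtuple

Box = namedtuple("Box", "left top width height")
all_bboxes = [Box(0, 0, 5, 5), Box(0, 0, 5, 5), Box(1, 1, 2, 2)]
unique_bboxes = list({(b.left, b.top, b.width, b.height): b for b in all_bboxes}.values())
assert len(unique_bboxes) == 2  # the duplicate (0, 0, 5, 5) collapses to one entry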
def redact_image_pdf(file_path:str,
prepared_pdf_file_paths:List[str],
language:str,
# Convert decision process to table
decision_process_table = pd.DataFrame([{
+ 'text': result.text,
+ 'xmin': result.left,
+ 'ymin': result.top,
+ 'xmax': result.left + result.width,
+ 'ymax': result.top + result.height,
+ 'label': result.entity_type,
'start': result.start,
'end': result.end,
'score': result.score,
+ 'page': reported_page_number
+
+ } for result in merged_redaction_bboxes]) #'left': result.left,
+ #'top': result.top,
+ #'width': result.width,
+ #'height': result.height,

all_decision_process_table = pd.concat([all_decision_process_table, decision_process_table])
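Each merged redaction box therefore becomes one decision-log row; an illustrative row (fabricated values) would look like:

# {'text': 'John Smith', 'xmin': 102.0, 'ymin': 310.0, 'xmax': 215.0, 'ymax': 328.0,
#  'label': 'PERSON', 'start': 14, 'end': 24, 'score': 0.97, 'page': 3}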
pymupdf_doc = images

# Check if the image already exists in annotations_all_pages
+ #print("annotations_all_pages:", annotations_all_pages)
existing_index = next((index for index, ann in enumerate(annotations_all_pages) if ann["image"] == image_annotations["image"]), None)
if existing_index is not None:
# Replace the existing annotation

pymupdf_doc = images

# Check if the image already exists in annotations_all_pages
+ #print("annotations_all_pages:", annotations_all_pages)
existing_index = next((index for index, ann in enumerate(annotations_all_pages) if ann["image"] == image_annotations["image"]), None)
if existing_index is not None:
# Replace the existing annotation
return analysed_bounding_boxes

def create_text_redaction_process_results(analyser_results, analysed_bounding_boxes, page_num):
decision_process_table = pd.DataFrame()

if len(analyser_results) > 0:
# Create summary df of annotations to be made
analysed_bounding_boxes_df_new = pd.DataFrame(analysed_bounding_boxes)
+
+ # Remove brackets and split the string into four separate columns
+ #print("analysed_bounding_boxes_df_new:", analysed_bounding_boxes_df_new['boundingBox'])
+ # analysed_bounding_boxes_df_new[['xmin', 'ymin', 'xmax', 'ymax']] = analysed_bounding_boxes_df_new['boundingBox'].str.strip('[]').str.split(',', expand=True)
+
+ # Split the boundingBox list into four separate columns
+ analysed_bounding_boxes_df_new[['xmin', 'ymin', 'xmax', 'ymax']] = analysed_bounding_boxes_df_new['boundingBox'].apply(pd.Series)
+
+ # Convert the new columns to floats (if needed)
+ analysed_bounding_boxes_df_new[['xmin', 'ymin', 'xmax', 'ymax']] = analysed_bounding_boxes_df_new[['xmin', 'ymin', 'xmax', 'ymax']].astype(float)
+
analysed_bounding_boxes_df_text = analysed_bounding_boxes_df_new['result'].astype(str).str.split(",",expand=True).replace(".*: ", "", regex=True)
+ analysed_bounding_boxes_df_text.columns = ["label", "start", "end", "score"]
analysed_bounding_boxes_df_new = pd.concat([analysed_bounding_boxes_df_new, analysed_bounding_boxes_df_text], axis = 1)
analysed_bounding_boxes_df_new['page'] = page_num + 1
decision_process_table = pd.concat([decision_process_table, analysed_bounding_boxes_df_new], axis = 0).drop('result', axis=1)
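The split/replace pair works because the 'result' column holds the string form of a recogniser result, which for a Presidio-style RecognizerResult (my assumption here) reads like "type: PERSON, start: 0, end: 10, score: 0.85" — four comma-separated "key: value" pairs:

import pandas as pd

s = pd.Series(["type: PERSON, start: 0, end: 10, score: 0.85"])
parsed = s.astype(str).str.split(",", expand=True).replace(".*: ", "", regex=True)
parsed.columns = ["label", "start", "end", "score"]
# parsed now holds a single row: PERSON, 0, 10, 0.85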
return decision_process_table

+ def create_pikepdf_annotations_for_bounding_boxes(analysed_bounding_boxes):
+ pikepdf_annotations_on_page = []
for analysed_bounding_box in analysed_bounding_boxes:
bounding_box = analysed_bounding_box["boundingBox"]
annotation = Dictionary(

S=Name.S # Border style: solid
)
)
+ pikepdf_annotations_on_page.append(annotation)
+ return pikepdf_annotations_on_page

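Most of the pikepdf Dictionary construction sits in the collapsed middle of the hunk above; a minimal square-annotation dictionary built the same way might look like the sketch below (field values are my assumption, apart from the solid border style visible above):

from pikepdf import Array, Dictionary, Name

annotation = Dictionary(
    Type=Name.Annot,
    Subtype=Name.Square,             # box-style annotation
    Rect=Array([100, 700, 200, 720]),
    BS=Dictionary(W=1, S=Name.S),    # 1pt solid border
)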
def redact_text_pdf(
filename: str, # Path to the PDF file to be redacted
if page_min <= page_no < page_max:

+ if isinstance(image, str):
+ image_path = image
+ image = Image.open(image_path)
+
for page_layout in extract_pages(filename, page_numbers = [page_no], maxpages=1):

page_analyser_results = []
page_analysed_bounding_boxes = []

characters = []
+ pikepdf_annotations_on_page = []
decision_process_table_on_page = pd.DataFrame()
page_text_outputs = pd.DataFrame()

)
all_text_line_results.append((i, text_line_analyser_result))

+

elif pii_identification_method == "AWS Comprehend":

# First use the local Spacy model to pick up custom entities that AWS Comprehend can't search for.

text_container_analyser_results.extend(text_line_analyser_result)
text_container_analysed_bounding_boxes.extend(text_line_bounding_boxes)

+ #print("text_container_analyser_results:", text_container_analyser_results)

+ page_analyser_results.extend(text_container_analyser_results) # Add this line
page_analysed_bounding_boxes.extend(text_line_bounding_boxes) # Add this line

+ #print("page_analyser_results:", page_analyser_results)
+ #print("page_analysed_bounding_boxes:", page_analysed_bounding_boxes)
+ #print("image:", image)
+
+ page_analysed_bounding_boxes = convert_pikepdf_decision_output_to_image_coords(pymupdf_page, page_analysed_bounding_boxes, image)
+
+ #print("page_analysed_bounding_boxes_out_converted:", page_analysed_bounding_boxes)

# Annotate redactions on page
+ pikepdf_annotations_on_page = create_pikepdf_annotations_for_bounding_boxes(page_analysed_bounding_boxes)

+ #print("pikepdf_annotations_on_page:", pikepdf_annotations_on_page)

# Make pymupdf page redactions
#print("redact_whole_page_list:", redact_whole_page_list)

else: redact_whole_page = False
else: redact_whole_page = False

+ pymupdf_page, image_annotations = redact_page_with_pymupdf(pymupdf_page, pikepdf_annotations_on_page, image, redact_whole_page=redact_whole_page, convert_coords=False)
+
+ #print("image_annotations:", image_annotations)

#print("Did redact_page_with_pymupdf function")
reported_page_no = page_no + 1

if not decision_process_table_on_page.empty:
all_decision_process_table = pd.concat([all_decision_process_table, decision_process_table_on_page])
+ #print("all_decision_process_table:", all_decision_process_table)

if not page_text_outputs.empty:
page_text_outputs = page_text_outputs.sort_values(["top", "left"], ascending=[False, False]).reset_index(drop=True)
tools/redaction_review.py
CHANGED
@@ -68,6 +68,7 @@ def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, re

#print("review_dataframe['label']", review_dataframe["label"])
recogniser_entities = review_dataframe["label"].unique().tolist()
recogniser_entities.append("ALL")

#print("recogniser_entities:", recogniser_entities)

@@ -187,7 +188,7 @@ def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, re

return out_image_annotator, number_reported, number_reported, page_num_reported, recogniser_entities_drop, recogniser_dataframe_out, recogniser_dataframe_gr

- def modify_existing_page_redactions(image_annotated:AnnotatedImageData, current_page:int, previous_page:int, all_image_annotations:List[AnnotatedImageData], clear_all:bool=False):
'''
Overwrite current image annotations with modifications
'''
@@ -198,6 +199,8 @@ def modify_existing_page_redactions(image_annotated:AnnotatedImageData, current_

#If no previous page or is 0, i.e. first time run, then rewrite current page
#if not previous_page:
# previous_page = current_page

image_annotated['image'] = all_image_annotations[previous_page - 1]["image"]

@@ -206,9 +209,26 @@ def modify_existing_page_redactions(image_annotated:AnnotatedImageData, current_

else:
all_image_annotations[previous_page - 1]["boxes"] = []

-

-
'''
Apply modified redactions to a pymupdf and export review files
'''
@@ -302,7 +322,7 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], d

output_files.append(out_pdf_file_path)

try:
-

out_annotation_file_path = output_folder + file_base + '_review_file.json'
with open(out_annotation_file_path, 'w') as f:
@@ -311,14 +331,16 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], d

print("Saving annotations to CSV review file")

# Convert json to csv and also save this
- review_df = convert_review_json_to_pandas_df(all_image_annotations)
out_review_file_file_path = output_folder + file_base + '_review_file.csv'
review_df.to_csv(out_review_file_file_path, index=None)
output_files.append(out_review_file_file_path)

except Exception as e:
- print("Could not save annotations to json file:", e)

return doc, all_image_annotations, output_files, output_log_files
#print("review_dataframe['label']", review_dataframe["label"])
recogniser_entities = review_dataframe["label"].unique().tolist()
recogniser_entities.append("ALL")
+ recogniser_entities = sorted(recogniser_entities)

#print("recogniser_entities:", recogniser_entities)

return out_image_annotator, number_reported, number_reported, page_num_reported, recogniser_entities_drop, recogniser_dataframe_out, recogniser_dataframe_gr

+ def modify_existing_page_redactions(image_annotated:AnnotatedImageData, current_page:int, previous_page:int, all_image_annotations:List[AnnotatedImageData], recogniser_entities_drop=gr.Dropdown(value="ALL", allow_custom_value=True), recogniser_dataframe=gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]})), clear_all:bool=False):
'''
Overwrite current image annotations with modifications
'''

#If no previous page or is 0, i.e. first time run, then rewrite current page
#if not previous_page:
# previous_page = current_page
+
+ #print("image_annotated:", image_annotated)

image_annotated['image'] = all_image_annotations[previous_page - 1]["image"]

else:
all_image_annotations[previous_page - 1]["boxes"] = []

+ #print("all_image_annotations:", all_image_annotations)
+
+ # Rewrite all_image_annotations search dataframe with latest updates
+ try:
+ review_dataframe = convert_review_json_to_pandas_df(all_image_annotations)[["page", "label"]]
+ #print("review_dataframe['label']", review_dataframe["label"])
+ recogniser_entities = review_dataframe["label"].unique().tolist()
+ recogniser_entities.append("ALL")
+ recogniser_entities = sorted(recogniser_entities)

+ recogniser_dataframe_out = gr.Dataframe(review_dataframe)
+ #recogniser_dataframe_gr = gr.Dataframe(review_dataframe)
+ recogniser_entities_drop = gr.Dropdown(value=recogniser_entities_drop, choices=recogniser_entities, allow_custom_value=True, interactive=True)
+ except Exception as e:
+ print("Could not extract recogniser information:", e)
+ recogniser_dataframe_out = recogniser_dataframe
+
+ return all_image_annotations, current_page, current_page, recogniser_entities_drop, recogniser_dataframe_out
+
+ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], doc:Document, all_image_annotations:List[AnnotatedImageData], current_page:int, review_file_state, progress=gr.Progress(track_tqdm=True)):
'''
Apply modified redactions to a pymupdf and export review files
'''

output_files.append(out_pdf_file_path)

try:
+ print("Saving annotations to JSON")

out_annotation_file_path = output_folder + file_base + '_review_file.json'
with open(out_annotation_file_path, 'w') as f:

print("Saving annotations to CSV review file")

+ print("review_file_state:", review_file_state)
+
# Convert json to csv and also save this
+ review_df = convert_review_json_to_pandas_df(all_image_annotations, review_file_state)
out_review_file_file_path = output_folder + file_base + '_review_file.csv'
review_df.to_csv(out_review_file_file_path, index=None)
output_files.append(out_review_file_file_path)

except Exception as e:
+ print("Could not save annotations to json or csv file:", e)

return doc, all_image_annotations, output_files, output_log_files
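As in modify_existing_page_redactions above, returning a freshly constructed gr.Dropdown or gr.Dataframe from an event handler is the Gradio idiom for updating a live component's value and choices; a self-contained toy illustration (not from the repo):

import gradio as gr

def refresh_entities(current_value):
    # Returning a new Dropdown updates the existing component in place
    return gr.Dropdown(value=current_value, choices=["ALL", "PERSON", "EMAIL"], allow_custom_value=True, interactive=True)

with gr.Blocks() as demo:
    entities = gr.Dropdown(value="ALL", choices=["ALL"], allow_custom_value=True)
    refresh = gr.Button("Refresh entities")
    refresh.click(refresh_entities, inputs=entities, outputs=entities)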