seanpedrickcase committed on
Commit eea5c07 (parent: 21d060c)

Allowed for time limits on redaction to avoid timeouts. Improved the review interface. Now accepts only one file at a time. Upgraded Gradio version.

app.py CHANGED
@@ -43,29 +43,38 @@ with app:
     ###
     # STATE VARIABLES
     ###
-    prepared_pdf_state = gr.State([])
-    output_image_files_state = gr.State([])
-    output_file_list_state = gr.State([])
-    text_output_file_list_state = gr.State([])
-    log_files_output_list_state = gr.State([])
-    first_loop_state = gr.State(True)
-    second_loop_state = gr.State(False)
+
+    pdf_doc_state = gr.State([])
+    all_image_annotations_state = gr.State([])
+    all_line_level_ocr_results_df_state = gr.State(pd.DataFrame())
+    all_decision_process_table_state = gr.State(pd.DataFrame())
+
+    def reset_state_vars():
+        return [], [], pd.DataFrame(), pd.DataFrame()
 
     in_allow_list_state = gr.State(pd.DataFrame())
 
     session_hash_state = gr.State()
     s3_output_folder_state = gr.State()
 
-    pdf_doc_state = gr.State([])
+    first_loop_state = gr.State(True)
+    second_loop_state = gr.State(False)
+
+    prepared_pdf_state = gr.State([])
     images_pdf_state = gr.State([]) # List of pdf pages converted to PIL images
-    all_image_annotations_state = gr.State([])
+
+    output_image_files_state = gr.State([])
+    output_file_list_state = gr.State([])
+    text_output_file_list_state = gr.State([])
+    log_files_output_list_state = gr.State([])
 
     # Logging state
-    feedback_logs_state = gr.State(feedback_logs_folder + 'log.csv')
+    feedback_logs_state = gr.State(feedback_logs_folder + 'dataset1.csv') #'log.csv')
     feedback_s3_logs_loc_state = gr.State(feedback_logs_folder)
-    access_logs_state = gr.State(access_logs_folder + 'log.csv')
+    access_logs_state = gr.State(access_logs_folder + 'dataset1.csv') #'log.csv')
     access_s3_logs_loc_state = gr.State(access_logs_folder)
-    usage_logs_state = gr.State(usage_logs_folder + 'log.csv')
+    usage_logs_state = gr.State(usage_logs_folder + 'dataset1.csv') #'log.csv')
     usage_s3_logs_loc_state = gr.State(usage_logs_folder)
 
     # Invisible elements effectively used as state variables
@@ -93,21 +102,23 @@ with app:
 
     NOTE: In testing the app seems to find about 60% of personal information on a given (typed) page of text. It is essential that all outputs are checked **by a human** to ensure that all personal information has been removed.
 
-    This app accepts a maximum file size of 50mb. Please consider giving feedback for the quality of the answers underneath the redact buttons when the option appears, this will help to improve the app.
+    This app accepts a maximum file size of 100mb. Please consider giving feedback for the quality of the answers underneath the redact buttons when the option appears, this will help to improve the app.
     """)
 
     # PDF / IMAGES TAB
     with gr.Tab("PDFs/images"):
        with gr.Accordion("Redact document", open = True):
-            in_doc_files = gr.File(label="Choose document/image files (PDF, JPG, PNG)", file_count= "multiple", file_types=['.pdf', '.jpg', '.png', '.json'])
+            in_doc_files = gr.File(label="Choose a document or image file (PDF, JPG, PNG)", file_count= "single", file_types=['.pdf', '.jpg', '.png', '.json'])
            in_redaction_method = gr.Radio(label="Choose document redaction method. AWS Textract has a cost per page so please only use when needed.", value = "Simple text analysis - PDFs with selectable text", choices=["Simple text analysis - PDFs with selectable text", "Quick image analysis - typed text", "Complex image analysis - docs with handwriting/signatures (AWS Textract)"])
            gr.Markdown("""If you only want to redact certain pages, or certain entities (e.g. just email addresses), please go to the redaction settings tab.""")
            document_redact_btn = gr.Button("Redact document(s)", variant="primary")
+            current_loop_page_number = gr.Number(value=0,precision=0, interactive=False, label = "Last redacted page in document", visible=False)
+            page_break_return = gr.Checkbox(value = False, label="Page break reached", visible=False)
 
        with gr.Row():
            output_summary = gr.Textbox(label="Output summary")
            output_file = gr.File(label="Output files")
-            text_documents_done = gr.Number(value=0, label="Number of documents redacted", interactive=False, visible=False)
+            latest_file_completed_text = gr.Number(value=0, label="Number of documents redacted", interactive=False, visible=False)
 
        with gr.Row():
            convert_text_pdf_to_img_btn = gr.Button(value="Convert pdf to image-based pdf to apply redactions", variant="secondary", visible=False)
@@ -122,10 +133,10 @@ with app:
    with gr.Tab("Review redactions", id="tab_object_annotation"):
 
        with gr.Row():
-            annotation_last_page_button = gr.Button("Previous page")
-            annotate_current_page = gr.Number(value=1, label="Current page (select page number then press enter)", precision=0)
-
-            annotation_next_page_button = gr.Button("Next page")
+            annotation_last_page_button = gr.Button("Previous page", scale = 3)
+            annotate_current_page = gr.Number(value=1, label="Page (press enter to change)", precision=0, scale = 2)
+            annotate_max_pages = gr.Number(value=1, label="Total pages", precision=0, interactive=False, scale = 1)
+            annotation_next_page_button = gr.Button("Next page", scale = 3)
 
        annotation_button_apply = gr.Button("Apply revised redactions", variant="primary")
@@ -141,6 +152,12 @@ with app:
            interactive=False
        )
 
+        with gr.Row():
+            annotation_last_page_button_bottom = gr.Button("Previous page", scale = 3)
+            annotate_current_page_bottom = gr.Number(value=1, label="Page (press enter to change)", precision=0, interactive=True, scale = 2)
+            annotate_max_pages_bottom = gr.Number(value=1, label="Total pages", precision=0, interactive=False, scale = 1)
+            annotation_next_page_button_bottom = gr.Button("Next page", scale = 3)
+
        output_review_files = gr.File(label="Review output files")
 
    # TEXT / TABULAR DATA TAB
@@ -169,7 +186,7 @@ with app:
    # Feedback elements are invisible until revealed by redaction action
    data_feedback_title = gr.Markdown(value="## Please give feedback", visible=False)
    data_feedback_radio = gr.Radio(label="Please give some feedback about the results of the redaction. A reminder that the app is only expected to identify about 60% of personally identifiable information in a given (typed) document.",
-        choices=["The results were good", "The results were not good"], visible=False)
+        choices=["The results were good", "The results were not good"], visible=False, show_label=True)
    data_further_details_text = gr.Textbox(label="Please give more detailed feedback about the results:", visible=False)
    data_submit_feedback_btn = gr.Button(value="Submit feedback", visible=False)
@@ -202,36 +219,56 @@ with app:
 
    # If a custom allow list is uploaded
    in_allow_list.upload(fn=custom_regex_load, inputs=[in_allow_list], outputs=[in_allow_list_text, in_allow_list_state])
 
    ###
    # PDF/IMAGE REDACTION
    ###
    in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_textbox, doc_file_name_with_extension_textbox])
 
-    document_redact_btn.click(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, text_documents_done, output_summary, first_loop_state], outputs=[output_summary, prepared_pdf_state, images_pdf_state], api_name="prepare_doc").\
-        then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list_state, text_documents_done, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, pdf_doc_state],
-        outputs=[output_summary, output_file, output_file_list_state, text_documents_done, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state], api_name="redact_doc").\
-        then(fn=update_annotator, inputs=[all_image_annotations_state, page_min], outputs=[annotator, annotate_current_page])
+    document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state]).\
+        then(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, first_loop_state, annotate_max_pages, current_loop_page_number], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state], api_name="prepare_doc").\
+        then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return],
+        outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state], api_name="redact_doc")#.\
+        #then(fn=update_annotator, inputs=[all_image_annotations_state, page_min], outputs=[annotator, annotate_current_page])
 
-    # If the output file count text box changes, keep going with redacting each document until done
-    text_documents_done.change(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, text_documents_done, output_summary, second_loop_state], outputs=[output_summary, prepared_pdf_state, images_pdf_state]).\
-        then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list_state, text_documents_done, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, pdf_doc_state],
-        outputs=[output_summary, output_file, output_file_list_state, text_documents_done, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state]).\
-        then(fn=update_annotator, inputs=[all_image_annotations_state, page_min], outputs=[annotator, annotate_current_page]).\
-        then(fn = reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title])
+    # If the app has completed a batch of pages, it will run this until the end of all pages in the document
+    current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return],
+        outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state])
 
-    annotate_current_page.submit(
-        modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state], outputs = [all_image_annotations_state, annotate_previous_page]).\
-        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page], outputs = [annotator, annotate_current_page])
+    # If a file has been completed, the function will continue onto the next document
+    latest_file_completed_text.change(fn=update_annotator, inputs=[all_image_annotations_state, page_min], outputs=[annotator, annotate_current_page, annotate_current_page_bottom]).\
+        then(fn=reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title])
+    # latest_file_completed_text.change(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, current_loop_page_number], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state]).\
+    # then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return],
+    # outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state]).\
+    #then(fn=update_annotator, inputs=[all_image_annotations_state, page_min], outputs=[annotator, annotate_current_page]).\
+    #then(fn=reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title])
+
+    ### REVIEW REDACTIONS
 
-    annotation_last_page_button.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page]).\
-        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page], outputs = [annotator, annotate_current_page])
-    annotation_next_page_button.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page]).\
-        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page], outputs = [annotator, annotate_current_page])
+    # Page controls at top
+    annotate_current_page.submit(
+        modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
+        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
+
+    annotation_last_page_button.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
+        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
+    annotation_next_page_button.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
+        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
 
    #annotation_button_get.click(get_boxes_json, annotator, json_boxes)
    annotation_button_apply.click(apply_redactions, inputs=[annotator, in_doc_files, pdf_doc_state, all_image_annotations_state, annotate_current_page], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files], scroll_to_output=True)
 
+    # Page controls at bottom
+    annotate_current_page_bottom.submit(
+        modify_existing_page_redactions, inputs = [annotator, annotate_current_page_bottom, annotate_previous_page, all_image_annotations_state], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page]).\
+        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
+
+    annotation_last_page_button_bottom.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
+        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
+    annotation_next_page_button_bottom.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
+        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
+
    ###
    # TABULAR DATA REDACTION
    ###
@@ -281,9 +318,9 @@ print(f'The value of COGNITO_AUTH is {COGNITO_AUTH}')
 
 if __name__ == "__main__":
    if os.environ['COGNITO_AUTH'] == "1":
-        app.queue().launch(show_error=True, auth=authenticate_user, max_file_size='50mb')
+        app.queue().launch(show_error=True, auth=authenticate_user, max_file_size='100mb')
    else:
-        app.queue().launch(show_error=True, inbrowser=True, max_file_size='50mb')
+        app.queue().launch(show_error=True, inbrowser=True, max_file_size='100mb')
 
 
 # AWS options - placeholder for possibility of storing data on s3 and retrieving it in app
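
Note on the new wiring above: choose_and_run_redactor now returns current_loop_page_number and page_break_return, and the .change() listener on the hidden current_loop_page_number re-invokes it, so a long document is redacted in short, event-sized batches instead of one long call that can hit a server timeout. A minimal standalone sketch of this pattern (hypothetical names and batch size, not the app's own code):

import gradio as gr

PAGE_BATCH = 5    # assumed pages handled per event
TOTAL_PAGES = 12  # stand-in for a real document length

def process_batch(last_page):
    # Each event does a bounded amount of work, then hands control back to
    # the browser so no single request outlives the server timeout.
    if last_page >= TOTAL_PAGES:
        return "Redaction complete", last_page  # unchanged value ends the loop
    next_page = min(last_page + PAGE_BATCH, TOTAL_PAGES)
    return f"Processed up to page {next_page} of {TOTAL_PAGES}", next_page

with gr.Blocks() as demo:
    status = gr.Textbox(label="Status")
    last_page = gr.Number(value=0, precision=0, visible=False)
    go = gr.Button("Redact")
    go.click(process_batch, inputs=last_page, outputs=[status, last_page])
    # Every update to last_page re-triggers the worker until it stops changing.
    last_page.change(process_batch, inputs=last_page, outputs=[status, last_page])

demo.launch()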
requirements.txt CHANGED
@@ -7,11 +7,16 @@ presidio_anonymizer==2.2.355
 presidio-image-redactor==0.0.53
 pikepdf==8.15.1
 pandas==2.2.3
-spacy==3.8.2
+spacy==3.7.5
 en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
-gradio==4.44.1
-boto3==1.35.40
+#en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
+gradio==5.4.0
+boto3==1.35.54
 pyarrow==17.0.0
 openpyxl==3.1.2
 Faker==22.2.0
 gradio_image_annotation==0.2.3
+numpy==1.26.4
tools/aws_functions.py CHANGED
@@ -181,23 +181,27 @@ def upload_file_to_s3(local_file_paths:List[str], s3_key:str, s3_bucket:str=buck
     local_file_paths = [local_file_paths]
 
     for file in local_file_paths:
-        try:
-            # Get file name off file path
-            file_name = os.path.basename(file)
-
-            s3_key_full = s3_key + file_name
-            print("S3 key: ", s3_key_full)
-
-            s3_client.upload_file(file, s3_bucket, s3_key_full)
-            out_message = "File " + file_name + " uploaded successfully!"
-            print(out_message)
-
-        except Exception as e:
-            out_message = f"Error uploading file(s): {e}"
-            print(out_message)
-
-        final_out_message.append(out_message)
-        final_out_message_str = '\n'.join(final_out_message)
+        if s3_client:
+            #print(s3_client)
+            try:
+                # Get file name off file path
+                file_name = os.path.basename(file)
+
+                s3_key_full = s3_key + file_name
+                print("S3 key: ", s3_key_full)
+
+                s3_client.upload_file(file, s3_bucket, s3_key_full)
+                out_message = "File " + file_name + " uploaded successfully!"
+                print(out_message)
+
+            except Exception as e:
+                out_message = f"Error uploading file(s): {e}"
+                print(out_message)
+
+            final_out_message.append(out_message)
+            final_out_message_str = '\n'.join(final_out_message)
+
+        else: final_out_message_str = "Could not connect to AWS."
 
     return final_out_message_str
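
The new if s3_client: guard implies the client may be missing at runtime. A plausible setup that would make the guard meaningful (an assumption; the client construction itself is not shown in this diff):

import boto3

try:
    s3_client = boto3.client("s3")
except Exception as e:
    # Without credentials or a region the client may fail to construct;
    # callers such as upload_file_to_s3 then report "Could not connect to AWS."
    print("Could not create S3 client:", e)
    s3_client = None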
 
tools/aws_textract.py CHANGED
@@ -23,7 +23,7 @@ def extract_textract_metadata(response):
     #'NumberOfPages': number_of_pages
     })
 
-def analyse_page_with_textract(pdf_page_bytes, json_file_path):
+def analyse_page_with_textract(pdf_page_bytes, page_no):
     '''
     Analyse page with AWS Textract
     '''
@@ -31,28 +31,22 @@ def analyse_page_with_textract(pdf_page_bytes, json_file_path):
         client = boto3.client('textract')
     except:
         print("Cannot connect to AWS Textract")
-        return "", "", ""
+        return [], "" # Return an empty list and an empty string
 
     print("Analysing page with AWS Textract")
 
-    # Convert the image to bytes using an in-memory buffer
-    #image_buffer = io.BytesIO()
-    #image.save(image_buffer, format='PNG') # Save as PNG, or adjust format if needed
-    #image_bytes = image_buffer.getvalue()
-
-    #response = client.detect_document_text(Document={'Bytes': image_bytes})
     response = client.analyze_document(Document={'Bytes': pdf_page_bytes}, FeatureTypes=["SIGNATURES"])
 
-    text_blocks = response['Blocks']
-    request_metadata = extract_textract_metadata(response) # Metadata comes out as a string
+    # Wrap the response with the page number in the desired format
+    wrapped_response = {
+        'page_no': page_no,
+        'data': response
+    }
 
-    # Write the response to a JSON file
-    with open(json_file_path, 'w') as json_file:
-        json.dump(response, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
+    request_metadata = extract_textract_metadata(response) # Metadata comes out as a string
 
-    print("Response has been written to output:", json_file_path)
-
-    return text_blocks, request_metadata
+    # Return a list containing the wrapped response and the metadata
+    return wrapped_response, request_metadata # Return as a list to match the desired structure
 
 
 def convert_pike_pdf_page_to_bytes(pdf, page_num):
@@ -81,7 +75,7 @@ def convert_pike_pdf_page_to_bytes(pdf, page_num):
     return pdf_bytes
 
 
-def json_to_ocrresult(json_data, page_width, page_height):
+def json_to_ocrresult(json_data, page_width, page_height, page_no):
     '''
     Convert the json response from textract to the OCRResult format used elsewhere in the code. Looks for lines, words, and signatures. Handwriting and signatures are set aside especially for later in case the user wants to override the default behaviour and redact all handwriting/signatures.
     '''
@@ -92,16 +86,27 @@ def json_to_ocrresult(json_data, page_width, page_height):
     signatures = []
     handwriting = []
     ocr_results_with_children = {}
+    text_block={}
 
     i = 1
 
-    for text_block in json_data:
+    # Assuming json_data is structured as a dictionary with a "pages" key
+    #if "pages" in json_data:
+    # Find the specific page data
+    page_json_data = json_data #next((page for page in json_data["pages"] if page["page_no"] == page_no), None)
+
+    if "Blocks" in page_json_data:
+        # Access the data for the specific page
+        text_blocks = page_json_data["Blocks"] # Access the Blocks within the page data
+    # This is a new page
+    elif "page_no" in page_json_data:
+        text_blocks = page_json_data["data"]["Blocks"]
 
-        is_signature = False
-        is_handwriting = False
+    is_signature = False
+    is_handwriting = False
 
+    for text_block in text_blocks:
+
         if (text_block['BlockType'] == 'LINE') | (text_block['BlockType'] == 'SIGNATURE'): # (text_block['BlockType'] == 'WORD') |
 
             # Extract text and bounding box for the line
@@ -124,7 +129,7 @@ def json_to_ocrresult(json_data, page_width, page_height):
             for relationship in text_block['Relationships']:
                 if relationship['Type'] == 'CHILD':
                     for child_id in relationship['Ids']:
-                        child_block = next((block for block in json_data if block['Id'] == child_id), None)
+                        child_block = next((block for block in text_blocks if block['Id'] == child_id), None)
                         if child_block and child_block['BlockType'] == 'WORD':
                             word_text = child_block.get('Text', '')
                             word_bbox = child_block["Geometry"]["BoundingBox"]
@@ -156,9 +161,9 @@ def json_to_ocrresult(json_data, page_width, page_height):
 
             recogniser_result = CustomImageRecognizerResult(entity_type=entity_name, text= word_text, score= confidence, start=0, end=word_end, left=word_left, top=word_top, width=word_width_abs, height=word_height_abs)
 
-            handwriting.append(recogniser_result)
-
-            #print("Handwriting found:", handwriting[-1])
+            if recogniser_result not in handwriting:
+                handwriting.append(recogniser_result)
+                print("Handwriting found:", handwriting[-1])
 
     # If handwriting or signature, add to bounding box
 
@@ -172,13 +177,14 @@ def json_to_ocrresult(json_data, page_width, page_height):
 
             recogniser_result = CustomImageRecognizerResult(entity_type=entity_name, text= line_text, score= confidence, start=0, end=word_end, left=line_left, top=line_top, width=width_abs, height=height_abs)
 
-            signatures.append(recogniser_result)
-            #print("Signature found:", signatures[-1])
+            if recogniser_result not in signatures:
+                signatures.append(recogniser_result)
+                #print("Signature found:", signatures[-1])
 
             words = []
             words.append({
                 'text': line_text,
                 'bounding_box': (line_left, line_top, line_right, line_bottom)
             })
 
             ocr_results_with_children["text_line_" + str(i)] = {
@@ -196,11 +202,17 @@ def json_to_ocrresult(json_data, page_width, page_height):
 
     # If it is signature or handwriting, will overwrite the default behaviour of the PII analyser
     if is_signature_or_handwriting:
-        signature_or_handwriting_recogniser_results.append(recogniser_result)
+        if recogniser_result not in signature_or_handwriting_recogniser_results:
+            signature_or_handwriting_recogniser_results.append(recogniser_result)
 
-        if is_signature: signature_recogniser_results.append(recogniser_result)
-        if is_handwriting: handwriting_recogniser_results.append(recogniser_result)
+        if is_signature:
+            if recogniser_result not in signature_recogniser_results:
+                signature_recogniser_results.append(recogniser_result)
+
+        if is_handwriting:
+            if recogniser_result not in handwriting_recogniser_results:
+                handwriting_recogniser_results.append(recogniser_result)
 
     i += 1
 
     return all_ocr_results, signature_or_handwriting_recogniser_results, signature_recogniser_results, handwriting_recogniser_results, ocr_results_with_children
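
The two changes above work together: each Textract response is now wrapped as {'page_no': ..., 'data': ...}, and json_to_ocrresult accepts either a bare response or a wrapped page. A small self-contained illustration of that branching (sample data, not real Textract output):

raw_response = {"Blocks": [{"BlockType": "LINE", "Id": "abc", "Text": "hello"}]}
wrapped = {"page_no": 1, "data": raw_response}

def get_blocks(page_json_data):
    # Mirrors the branching in json_to_ocrresult: a bare response exposes
    # "Blocks" at the top level; a wrapped page nests it under "data".
    if "Blocks" in page_json_data:
        return page_json_data["Blocks"]
    elif "page_no" in page_json_data:
        return page_json_data["data"]["Blocks"]
    return []

assert get_blocks(raw_response) == get_blocks(wrapped)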
tools/custom_image_analyser_engine.py CHANGED
@@ -9,7 +9,8 @@ import PIL
 from PIL import ImageDraw, ImageFont, Image
 from typing import Optional, Tuple, Union
 from copy import deepcopy
-import string # Import string to get a list of common punctuation characters
+from tools.helper_functions import clean_unicode_text
+#import string # Import string to get a list of common punctuation characters
 
 @dataclass
 class OCRResult:
@@ -445,7 +446,7 @@ class CustomImageAnalyzerEngine:
 
         return [
             OCRResult(
-                text=ocr_result['text'][i],
+                text=clean_unicode_text(ocr_result['text'][i]),
                 left=ocr_result['left'][i],
                 top=ocr_result['top'][i],
                 width=ocr_result['width'][i],
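
clean_unicode_text is imported from tools.helper_functions, whose body is outside this diff. A plausible minimal version (an assumption, not the repository's implementation) that folds OCR output to stable ASCII before downstream matching:

import unicodedata

def clean_unicode_text(text: str) -> str:
    # Hypothetical sketch: decompose ligatures and accents, then drop anything
    # outside ASCII so downstream text matching sees stable characters.
    normalized = unicodedata.normalize("NFKD", text)
    return normalized.encode("ascii", "ignore").decode("ascii")

print(clean_unicode_text("ﬁnancial café"))  # -> "financial cafe"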
tools/file_conversion.py CHANGED
@@ -4,8 +4,10 @@ from PIL import Image, ImageFile
 ImageFile.LOAD_TRUNCATED_IMAGES = True
 
 import os
+import gradio as gr
 import time
 import json
+import pymupdf
 from gradio import Progress
 from typing import List, Optional
 
@@ -62,11 +64,20 @@ def convert_pdf_to_images(pdf_path:str, page_min:int = 0, progress=Progress(trac
 
     # Check if the image already exists
     if os.path.exists(out_path):
-        print(f"Loading existing image from {out_path}.")
-        image = [Image.open(out_path)] # Load the existing image
+        #print(f"Loading existing image from {out_path}.")
+        image = Image.open(out_path) # Load the existing image
     else:
-        image = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1, dpi=300, use_cropbox=True, use_pdftocairo=False)
-        image[0].save(out_path, format="PNG") # Save the new image
+        image_l = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1, dpi=300, use_cropbox=True, use_pdftocairo=False)
+
+        image = image_l[0]
+
+        # Convert to greyscale
+        image = image.convert("L")
+
+        image.save(out_path, format="PNG") # Save the new image
 
     # If no images are returned, break the loop
     if not image:
@@ -76,9 +87,7 @@ def convert_pdf_to_images(pdf_path:str, page_min:int = 0, progress=Progress(trac
     # print("Conversion of page", str(page_num), "to file succeeded.")
     # print("image:", image)
 
-
-
-    images.extend(image)
+    images.append(out_path)
 
     print("PDF has been converted to images.")
     # print("Images:", images)
@@ -104,6 +113,8 @@ def process_file(file_path):
         # Run your function for processing PDF files here
         img_object = convert_pdf_to_images(file_path)
 
+        print("img_object has length", len(img_object), "and contains", img_object)
+
     else:
         print(f"{file_path} is not an image or PDF file.")
         img_object = ['']
@@ -119,9 +130,15 @@ def get_input_file_names(file_input):
 
     #print("file_input:", file_input)
 
-    for file in file_input:
-        file_path = file.name
-        print(file_path)
+    if isinstance(file_input, str):
+        file_input_list = [file_input]
+
+    for file in file_input_list:
+        if isinstance(file, str):
+            file_path = file
+        else:
+            file_path = file.name
+
         file_path_without_ext = get_file_path_end(file_path)
 
         #print("file:", file_path)
@@ -147,6 +164,8 @@ def prepare_image_or_pdf(
     latest_file_completed: int = 0,
     out_message: List[str] = [],
     first_loop_state: bool = False,
+    number_of_pages:int = 1,
+    current_loop_page_number:int=0,
     progress: Progress = Progress(track_tqdm=True)
 ) -> tuple[List[str], List[str]]:
     """
@@ -162,6 +181,7 @@ def prepare_image_or_pdf(
     latest_file_completed (int): Index of the last completed file.
     out_message (List[str]): List to store output messages.
     first_loop_state (bool): Flag indicating if this is the first iteration.
+    number_of_pages (int): integer indicating the number of pages in the document
     progress (Progress): Progress tracker for the operation.
 
     Returns:
@@ -170,47 +190,73 @@ def prepare_image_or_pdf(
 
     tic = time.perf_counter()
 
-    # If out message or converted_file_paths are blank, change to a list so it can be appended to
-    if isinstance(out_message, str):
-        out_message = [out_message]
-
     # If this is the first time around, set variables to 0/blank
     if first_loop_state==True:
+        print("first_loop_state is True")
         latest_file_completed = 0
         out_message = []
-        converted_file_paths = []
-        image_file_paths = []
     else:
         print("Now attempting file:", str(latest_file_completed))
-        converted_file_paths = []
-        image_file_paths = []
+
+    # This is only run when a new page is loaded, so can reset page loop values. If end of last file (99), current loop number set to 999
+    # if latest_file_completed == 99:
+    #     current_loop_page_number = 999
+    #     page_break_return = False
+    # else:
+    #     current_loop_page_number = 0
+    #     page_break_return = False
+
+    # If out message or converted_file_paths are blank, change to a list so it can be appended to
+    if isinstance(out_message, str):
+        out_message = [out_message]
+
+    converted_file_paths = []
+    image_file_paths = []
+    pymupdf_doc = []
 
     if not file_paths:
         file_paths = []
 
-    #converted_file_paths = file_paths
+    if isinstance(file_paths, str):
+        file_path_number = 1
+    else:
+        file_path_number = len(file_paths)
+
+    print("Current_loop_page_number at start of prepare_image_or_pdf function is:", current_loop_page_number)
+    print("Number of file paths:", file_path_number)
+    print("Latest_file_completed:", latest_file_completed)
 
     latest_file_completed = int(latest_file_completed)
 
     # If we have already redacted the last file, return the input out_message and file list to the relevant components
-    if latest_file_completed >= len(file_paths):
+    if latest_file_completed >= file_path_number:
         print("Last file reached, returning files:", str(latest_file_completed))
         if isinstance(out_message, list):
             final_out_message = '\n'.join(out_message)
         else:
             final_out_message = out_message
-        return final_out_message, converted_file_paths, image_file_paths
+        return final_out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc
 
     #in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]
 
     progress(0.1, desc='Preparing file')
 
-    file_paths_loop = [file_paths[int(latest_file_completed)]]
+    if isinstance(file_paths, str):
+        file_paths_list = [file_paths]
+        file_paths_loop = file_paths_list
+    else:
+        file_paths_list = file_paths
+        file_paths_loop = [file_paths_list[int(latest_file_completed)]]
+
     #print("file_paths_loop:", str(file_paths_loop))
 
     #for file in progress.tqdm(file_paths, desc="Preparing files"):
     for file in file_paths_loop:
-        file_path = file.name
+        if isinstance(file, str):
+            file_path = file
+        else:
+            file_path = file.name
         file_path_without_ext = get_file_path_end(file_path)
 
         #print("file:", file_path)
@@ -235,14 +281,14 @@ def prepare_image_or_pdf(
         if not file_path:
             out_message = "No file selected"
             print(out_message)
-            return out_message, converted_file_paths, image_file_paths
+            return out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc
 
         if in_redact_method == "Quick image analysis - typed text" or in_redact_method == "Complex image analysis - docs with handwriting/signatures (AWS Textract)":
             # Analyse and redact image-based pdf or image
             if is_pdf_or_image(file_path) == False:
                 out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
                 print(out_message)
-                return out_message, converted_file_paths, image_file_paths
+                return out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc
 
             converted_file_path = process_file(file_path)
             image_file_path = converted_file_path
@@ -252,7 +298,7 @@ def prepare_image_or_pdf(
             if is_pdf(file_path) == False:
                 out_message = "Please upload a PDF file for text analysis."
                 print(out_message)
-                return out_message, converted_file_paths, image_file_paths
+                return out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc
 
             converted_file_path = file_path # Pikepdf works with the basic unconverted pdf file
             image_file_path = process_file(file_path)
@@ -261,7 +307,20 @@ def prepare_image_or_pdf(
         converted_file_paths.append(converted_file_path)
         image_file_paths.extend(image_file_path)
 
-        #print("file conversion image_file_paths:", image_file_paths)
+        # If a pdf, load as a pymupdf document
+        if is_pdf(file_path):
+            pymupdf_doc = pymupdf.open(file_path)
+            #print("pymupdf_doc:", pymupdf_doc)
+        elif is_pdf_or_image(file_path): # Alternatively, if it's an image
+            # Convert image to a pymupdf document
+            pymupdf_doc = pymupdf.open() # Create a new empty document
+            img = Image.open(file_path) # Open the image file
+            rect = pymupdf.Rect(0, 0, img.width, img.height) # Create a rectangle for the image
+            page = pymupdf_doc.new_page(width=img.width, height=img.height) # Add a new page
+            page.insert_image(rect, filename=file_path) # Insert the image into the page
+            # Ensure to save the document after processing
+            #pymupdf_doc.save(output_path) # Uncomment and specify output_path if needed
+            #pymupdf_doc.close() # Close the PDF document
 
         toc = time.perf_counter()
         out_time = f"File '{file_path_without_ext}' prepared in {toc - tic:0.1f} seconds."
@@ -270,8 +329,12 @@ def prepare_image_or_pdf(
 
     out_message.append(out_time)
     out_message_out = '\n'.join(out_message)
+
+    number_of_pages = len(image_file_paths)
+
+    print("At end of prepare_image_or_pdf function - current_loop_page_number:", current_loop_page_number)
 
-    return out_message_out, converted_file_paths, image_file_paths
+    return out_message_out, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc
 
 def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str]):
     file_path_without_ext = get_file_path_end(in_file_path)
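
The image branch above builds a one-page PDF around an uploaded image so the rest of the pipeline can treat every input as a PyMuPDF document. The same logic as a standalone helper, using only the PyMuPDF calls already shown in the diff (the package imports as "pymupdf" in recent releases, "fitz" in older ones):

import pymupdf
from PIL import Image

def image_to_pdf(image_path: str) -> pymupdf.Document:
    img = Image.open(image_path)
    doc = pymupdf.open()  # new, empty PDF
    page = doc.new_page(width=img.width, height=img.height)
    # Place the image so it fills the page exactly.
    page.insert_image(pymupdf.Rect(0, 0, img.width, img.height), filename=image_path)
    return doc

# Example: doc = image_to_pdf("scan.png"); doc.save("scan.pdf")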
tools/file_redaction.py CHANGED
@@ -4,6 +4,8 @@ import json
4
  import io
5
  import os
6
  import boto3
 
 
7
  from PIL import Image, ImageChops, ImageFile, ImageDraw
8
  ImageFile.LOAD_TRUNCATED_IMAGES = True
9
 
@@ -25,11 +27,20 @@ from collections import defaultdict # For efficient grouping
25
  from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult
26
  from tools.file_conversion import process_file
27
  from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold
28
- from tools.helper_functions import get_file_path_end, output_folder
29
  from tools.file_conversion import process_file, is_pdf, convert_text_pdf_to_img_pdf, is_pdf_or_image
30
  from tools.data_anonymise import generate_decision_process_output
31
  from tools.aws_textract import analyse_page_with_textract, convert_pike_pdf_page_to_bytes, json_to_ocrresult
32
 
 
 
 
 
 
 
 
 
 
33
  def sum_numbers_before_seconds(string:str):
34
  """Extracts numbers that precede the word 'seconds' from a string and adds them up.
35
 
@@ -51,49 +62,130 @@ def sum_numbers_before_seconds(string:str):
51
 
52
  return sum_of_numbers
53
 
54
- def choose_and_run_redactor(file_paths:List[str], prepared_pdf_file_paths:List[str], prepared_pdf_image_paths:List[str], language:str, chosen_redact_entities:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, latest_file_completed:int=0, out_message:list=[], out_file_paths:list=[], log_files_output_paths:list=[], first_loop_state:bool=False, page_min:int=0, page_max:int=999, estimated_time_taken_state:float=0.0, handwrite_signature_checkbox:List[str]=["Redact all identified handwriting", "Redact all identified signatures"], all_request_metadata_str:str = "", all_image_annotations:dict={}, pdf_text=[], progress=gr.Progress(track_tqdm=True)):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
  '''
56
- Based on the type of redaction selected, pass the document file content onto the relevant function and return a redacted document plus processing logs.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  '''
58
-
59
  tic = time.perf_counter()
60
  all_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []
61
 
62
  # If this is the first time around, set variables to 0/blank
63
  if first_loop_state==True:
 
64
  latest_file_completed = 0
 
65
  #out_message = []
66
  out_file_paths = []
67
- pdf_text = []
 
 
 
 
 
 
68
 
69
  # If out message is string or out_file_paths are blank, change to a list so it can be appended to
70
- if isinstance(out_message, str):
71
- out_message = [out_message]
72
 
73
  if not out_file_paths:
74
  out_file_paths = []
75
 
76
  latest_file_completed = int(latest_file_completed)
77
 
78
- #pdf_text = []
 
 
 
 
 
 
 
 
 
 
79
 
80
  # If we have already redacted the last file, return the input out_message and file list to the relevant components
81
- if latest_file_completed >= len(file_paths):
82
- #print("Last file reached")
 
83
  # Set to a very high number so as not to mix up with subsequent file processing by the user
84
  latest_file_completed = 99
85
- final_out_message = '\n'.join(out_message)
86
- #final_out_message = final_out_message + "\n\nGo to to the Redaction settings tab to see redaction logs. Please give feedback on the results below to help improve this app."
 
 
 
 
87
 
88
- estimate_total_processing_time = sum_numbers_before_seconds(final_out_message)
89
  print("Estimated total processing time:", str(estimate_total_processing_time))
90
 
91
- #print("Final all_image_annotations:", all_image_annotations)
92
-
93
- return final_out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimate_total_processing_time, all_request_metadata_str, pdf_text, all_image_annotations
94
 
95
- file_paths_loop = [file_paths[int(latest_file_completed)]]
 
 
 
 
 
 
96
 
 
 
 
97
  if not in_allow_list.empty:
98
  in_allow_list_flat = in_allow_list[0].tolist()
99
  print("In allow list:", in_allow_list_flat)
@@ -101,13 +193,26 @@ def choose_and_run_redactor(file_paths:List[str], prepared_pdf_file_paths:List[s
101
  in_allow_list_flat = []
102
 
103
  progress(0.5, desc="Redacting file")
 
104
 
 
 
 
 
 
 
 
 
105
  for file in file_paths_loop:
106
- #for file in progress.tqdm(file_paths_loop, desc="Redacting files", unit = "files"):
107
- file_path = file.name
 
 
108
 
109
  if file_path:
110
  file_path_without_ext = get_file_path_end(file_path)
 
 
111
  is_a_pdf = is_pdf(file_path) == True
112
  if is_a_pdf == False:
113
  # If user has not submitted a pdf, assume it's an image
@@ -116,7 +221,8 @@ def choose_and_run_redactor(file_paths:List[str], prepared_pdf_file_paths:List[s
116
  else:
117
  out_message = "No file selected"
118
  print(out_message)
119
- return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pdf_text, all_image_annotations
 
120
 
121
  if in_redact_method == "Quick image analysis - typed text" or in_redact_method == "Complex image analysis - docs with handwriting/signatures (AWS Textract)":
122
 
@@ -127,98 +233,130 @@ def choose_and_run_redactor(file_paths:List[str], prepared_pdf_file_paths:List[s
127
  except:
128
  out_message = "Cannot connect to AWS Textract. Please choose another redaction method."
129
  print(out_message)
130
- return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pdf_text, all_image_annotations
131
 
132
  #Analyse and redact image-based pdf or image
133
  if is_pdf_or_image(file_path) == False:
134
  out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
135
- return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pdf_text, all_image_annotations
136
 
137
  print("Redacting file " + file_path_without_ext + " as an image-based file")
138
 
139
- pdf_text, redaction_logs, logging_file_paths, new_request_metadata, all_image_annotations = redact_image_pdf(file_path, prepared_pdf_image_paths, language, chosen_redact_entities, in_allow_list_flat, is_a_pdf, page_min, page_max, in_redact_method, handwrite_signature_checkbox)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
140
 
141
  # Save file
142
  if is_pdf(file_path) == False:
143
  out_image_file_path = output_folder + file_path_without_ext + "_redacted_as_img.pdf"
144
- pdf_text[0].save(out_image_file_path, "PDF" ,resolution=100.0, save_all=True, append_images=pdf_text[1:])
145
 
146
  else:
147
  out_image_file_path = output_folder + file_path_without_ext + "_redacted.pdf"
148
- pdf_text.save(out_image_file_path)
149
 
150
  out_file_paths.append(out_image_file_path)
151
  if logging_file_paths:
152
  log_files_output_paths.extend(logging_file_paths)
153
 
154
- out_message.append("File '" + file_path_without_ext + "' successfully redacted")
155
-
156
 
157
  logs_output_file_name = out_image_file_path + "_decision_process_output.csv"
158
- redaction_logs.to_csv(logs_output_file_name)
159
  log_files_output_paths.append(logs_output_file_name)
160
 
161
- # Save Textract request metadata (if exists)
162
- if new_request_metadata:
163
- print("Request metadata:", new_request_metadata)
164
- all_request_metadata.append(new_request_metadata)
165
 
166
  # Increase latest file completed count unless we are at the last file
167
- if latest_file_completed != len(file_paths):
168
- print("Completed file number:", str(latest_file_completed))
169
- latest_file_completed += 1
170
 
171
- elif in_redact_method == "Simple text analysis - PDFs with selectable text":
172
 
173
- print("file_path for selectable text analysis:", file_path)
174
-
175
- if is_pdf(file_path) == False:
176
- out_message = "Please upload a PDF file for text analysis. If you have an image, select 'Image analysis'."
177
- return out_message, None, None
178
-
179
- # Analyse text-based pdf
180
- print('Redacting file as text-based PDF')
181
- pdf_text, decision_process_logs, page_text_outputs, all_image_annotations = redact_text_pdf(file_path, prepared_pdf_image_paths, language, chosen_redact_entities, in_allow_list_flat, page_min, page_max, "Simple text analysis - PDFs with selectable text")
182
-
183
- out_text_file_path = output_folder + file_path_without_ext + "_text_redacted.pdf"
184
- pdf_text.save(out_text_file_path)
185
- out_file_paths.append(out_text_file_path)
186
 
187
- # Convert message
188
- #convert_message="Converting PDF to image-based PDF to embed redactions."
189
- #print(convert_message)
190
 
191
- # Convert document to image-based document to 'embed' redactions
192
- #img_output_summary, img_output_file_path = convert_text_pdf_to_img_pdf(file_path, [out_text_file_path])
193
- #out_file_paths.extend(img_output_file_path)
194
 
195
- # Write logs to file
196
- decision_logs_output_file_name = out_text_file_path + "_decision_process_output.csv"
197
- decision_process_logs.to_csv(decision_logs_output_file_name)
198
- log_files_output_paths.append(decision_logs_output_file_name)
199
 
200
- all_text_output_file_name = out_text_file_path + "_all_text_output.csv"
201
- page_text_outputs.to_csv(all_text_output_file_name)
202
- log_files_output_paths.append(all_text_output_file_name)
 
 
203
 
204
- out_message_new = "File '" + file_path_without_ext + "' successfully redacted"
205
- out_message.append(out_message_new)
206
 
207
  if latest_file_completed != len(file_paths):
208
- print("Completed file number:", str(latest_file_completed), "more files to do")
209
- latest_file_completed += 1
210
-
211
- else:
212
- out_message = "No redaction method selected"
213
- print(out_message)
214
- return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pdf_text, all_image_annotations
215
 
216
  toc = time.perf_counter()
217
- out_time = f"in {toc - tic:0.1f} seconds."
218
- print(out_time)
219
-
220
- out_message_out = '\n'.join(out_message)
221
- out_message_out = out_message_out + " " + out_time
222
 
223
  # If textract requests made, write to logging file
224
  if all_request_metadata:
@@ -233,8 +371,16 @@ def choose_and_run_redactor(file_paths:List[str], prepared_pdf_file_paths:List[s
233
  if all_request_metadata_file_path not in log_files_output_paths:
234
  log_files_output_paths.append(all_request_metadata_file_path)
235
236
 
237
- return out_message_out, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pdf_text, all_image_annotations
238
 
239
  def convert_pikepdf_coords_to_pymudf(pymupdf_page, annot):
240
  '''
@@ -430,7 +576,6 @@ def redact_page_with_pymupdf(page:Page, annotations_on_page, image = None):#, sc
430
 
431
  if image:
432
  image_x1, image_y1, image_x2, image_y2 = convert_pikepdf_to_image_coords(page, annot, image)
433
-
434
 
435
  img_annotation_box["xmin"] = image_x1
436
  img_annotation_box["ymin"] = image_y1
@@ -455,6 +600,7 @@ def redact_page_with_pymupdf(page:Page, annotations_on_page, image = None):#, sc
455
  rect_single_pixel_height = Rect(x1, middle_y - 2, x2, middle_y + 2) # Small height in middle of word to remove text
456
 
457
  # Add the annotation to the middle of the character line, so that it doesn't delete text from adjacent lines
 
458
  page.add_redact_annot(rect_single_pixel_height)
459
 
460
  # Set up drawing a black box over the whole rect
@@ -468,6 +614,8 @@ def redact_page_with_pymupdf(page:Page, annotations_on_page, image = None):#, sc
468
  "boxes": all_image_annotation_boxes
469
  }
470
 
 
 
471
  page.apply_redactions(images=0, graphics=0)
472
  page.clean_contents()
473
 
@@ -485,16 +633,6 @@ def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_result
485
  merged_bboxes = []
486
  grouped_bboxes = defaultdict(list)
487
 
488
- # Process signature and handwriting results
489
- if signature_recogniser_results or handwriting_recogniser_results:
490
- if "Redact all identified handwriting" in handwrite_signature_checkbox:
491
- #print("Handwriting boxes exist at merge:", handwriting_recogniser_results)
492
- bboxes.extend(handwriting_recogniser_results)
493
-
494
- if "Redact all identified signatures" in handwrite_signature_checkbox:
495
- #print("Signature boxes exist at merge:", signature_recogniser_results)
496
- bboxes.extend(signature_recogniser_results)
497
-
498
  # Reconstruct bounding boxes for substrings of interest
499
  reconstructed_bboxes = []
500
  for bbox in bboxes:
@@ -586,26 +724,53 @@ def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_result
586
  merged_bboxes.append(merged_box)
587
  merged_box = next_box
588
 
589
- merged_bboxes.append(merged_box)
590
 
591
  return merged_bboxes
592
 
593
- def redact_image_pdf(file_path:str, prepared_pdf_file_paths:List[str], language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, is_a_pdf:bool=True, page_min:int=0, page_max:int=999, analysis_type:str="Quick image analysis - typed text", handwrite_signature_checkbox:List[str]=["Redact all identified handwriting", "Redact all identified signatures"], request_metadata:str="", progress=Progress(track_tqdm=True)):
 
594
  '''
595
- Take an path for an image of a document, then run this image through the Presidio ImageAnalyzer and PIL to get a redacted page back. Adapted from Presidio ImageRedactorEngine.
596
  '''
597
- # json_file_path is for AWS Textract outputs
598
- logging_file_paths = []
599
  file_name = get_file_path_end(file_path)
600
  fill = (0, 0, 0) # Fill colour
601
- decision_process_output_str = ""
602
- images = []
603
- all_image_annotations = []
604
- #request_metadata = {}
605
  image_analyser = CustomImageAnalyzerEngine(nlp_analyser)
606
 
607
- # Also open as pymupdf pdf to apply annotations later on
608
- pymupdf_doc = pymupdf.open(file_path)
 
609
 
610
  if not prepared_pdf_file_paths:
611
  out_message = "PDF does not exist as images. Converting pages to image"
@@ -613,71 +778,56 @@ def redact_image_pdf(file_path:str, prepared_pdf_file_paths:List[str], language:
613
 
614
  prepared_pdf_file_paths = process_file(file_path)
615
 
616
- if not isinstance(prepared_pdf_file_paths, list):
617
- print("Converting prepared_pdf_file_paths to list")
618
- prepared_pdf_file_paths = [prepared_pdf_file_paths]
619
-
620
- #print("Image paths:", prepared_pdf_file_paths)
621
  number_of_pages = len(prepared_pdf_file_paths)
622
-
623
  print("Number of pages:", str(number_of_pages))
624
 
625
- out_message = "Redacting pages"
626
- print(out_message)
627
- #progress(0.1, desc=out_message)
628
-
629
  # Check that page_min and page_max are within expected ranges
630
  if page_max > number_of_pages or page_max == 0:
631
  page_max = number_of_pages
632
 
633
- if page_min <= 0:
634
- page_min = 0
635
- else:
636
- page_min = page_min - 1
637
 
638
  print("Page range:", str(page_min + 1), "to", str(page_max))
639
-
640
- #for i in progress.tqdm(range(0,number_of_pages), total=number_of_pages, unit="pages", desc="Redacting pages"):
641
-
642
- all_ocr_results = []
643
- all_decision_process = []
644
- all_line_level_ocr_results_df = pd.DataFrame()
645
- all_decision_process_table = pd.DataFrame()
646
-
647
  if analysis_type == "Quick image analysis - typed text": ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + ".csv"
648
  elif analysis_type == "Complex image analysis - docs with handwriting/signatures (AWS Textract)": ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + "_textract.csv"
649
 
650
- for page_no in progress.tqdm(range(0, number_of_pages), unit="pages", desc="Redacting pages"):
651
- #for page_no in range(0, number_of_pages):
652
  handwriting_or_signature_boxes = []
653
  signature_recogniser_results = []
654
  handwriting_recogniser_results = []
655
-
656
-
657
658
  # Assuming prepared_pdf_file_paths[page_no] is a PIL image object
659
  try:
660
  image = prepared_pdf_file_paths[page_no]#.copy()
661
  #print("image:", image)
662
  except Exception as e:
663
  print("Could not redact page:", reported_page_number, "due to:")
664
- print(e)
665
-
666
  continue
667
 
668
- image_annotations = {"image": image, "boxes": []}
669
-
670
-
671
  pymupdf_page = pymupdf_doc.load_page(page_no)
672
-
673
- #try:
674
- #print("prepared_pdf_file_paths:", prepared_pdf_file_paths)
675
 
676
- if page_no >= page_min and page_no < page_max:
677
-
678
- reported_page_number = str(page_no + 1)
679
 
680
- print("Redacting page", reported_page_number)
681
 
682
  # Need image size to convert textract OCR outputs to the correct sizes
683
  page_width, page_height = image.size
@@ -695,13 +845,6 @@ def redact_image_pdf(file_path:str, prepared_pdf_file_paths:List[str], language:
695
  # Combine OCR results
696
  line_level_ocr_results, line_level_ocr_results_with_children = combine_ocr_results(word_level_ocr_results)
697
 
698
- #print("ocr_results after:", ocr_results)
699
-
700
- # Save ocr_with_children_outputs
701
- ocr_results_with_children_str = str(line_level_ocr_results_with_children)
702
- logs_output_file_name = output_folder + "ocr_with_children.txt"
703
- with open(logs_output_file_name, "w") as f:
704
- f.write(ocr_results_with_children_str)
705
 
706
  # Import results from json and convert
707
  if analysis_type == "Complex image analysis - docs with handwriting/signatures (AWS Textract)":
@@ -711,24 +854,53 @@ def redact_image_pdf(file_path:str, prepared_pdf_file_paths:List[str], language:
711
  image.save(image_buffer, format='PNG') # Save as PNG, or adjust format if needed
712
  pdf_page_as_bytes = image_buffer.getvalue()
713
 
714
- json_file_path = output_folder + file_name + "_page_" + reported_page_number + "_textract.json"
 
715
 
716
  if not os.path.exists(json_file_path):
717
- text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, json_file_path) # Analyse page with Textract
718
  logging_file_paths.append(json_file_path)
719
  request_metadata = request_metadata + "\n" + new_request_metadata
720
  else:
721
  # Open the file and load the JSON data
722
- print("Found existing Textract json results file for this page.")
723
  with open(json_file_path, 'r') as json_file:
724
- text_blocks = json.load(json_file)
725
- text_blocks = text_blocks['Blocks']
726
 
727
- line_level_ocr_results, handwriting_or_signature_boxes, signature_recogniser_results, handwriting_recogniser_results, line_level_ocr_results_with_children = json_to_ocrresult(text_blocks, page_width, page_height)
728
 
729
  # Step 2: Analyze text and identify PII
730
  if chosen_redact_entities:
731
-
732
  redaction_bboxes = image_analyser.analyze_text(
733
  line_level_ocr_results,
734
  line_level_ocr_results_with_children,
@@ -740,6 +912,10 @@ def redact_image_pdf(file_path:str, prepared_pdf_file_paths:List[str], language:
740
  else:
741
  redaction_bboxes = []
742
 
743
  if analysis_type == "Quick image analysis - typed text": interim_results_file_path = output_folder + "interim_analyser_bboxes_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + ".txt"
744
  elif analysis_type == "Complex image analysis - docs with handwriting/signatures (AWS Textract)": interim_results_file_path = output_folder + "interim_analyser_bboxes_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + "_textract.txt"
745
 
@@ -750,12 +926,8 @@ def redact_image_pdf(file_path:str, prepared_pdf_file_paths:List[str], language:
750
 
751
  # Merge close bounding boxes
752
  merged_redaction_bboxes = merge_img_bboxes(redaction_bboxes, line_level_ocr_results_with_children, signature_recogniser_results, handwriting_recogniser_results, handwrite_signature_checkbox)
753
-
754
- # Save image first so that the redactions can be checked after
755
- #image.save(output_folder + "page_as_img_" + file_name + "_pages_" + str(reported_page_number) + ".png")
756
-
757
  # 3. Draw the merged boxes
758
- #if merged_redaction_bboxes:
759
  if is_pdf(file_path) == False:
760
  draw = ImageDraw.Draw(image)
761
 
@@ -790,7 +962,7 @@ def redact_image_pdf(file_path:str, prepared_pdf_file_paths:List[str], language:
790
 
791
  ## Apply annotations with pymupdf
792
  else:
793
- pymupdf_page, image_annotations = redact_page_with_pymupdf(pymupdf_page, merged_redaction_bboxes, image)#, scale)
794
 
795
  # Convert decision process to table
796
  decision_process_table = pd.DataFrame([{
@@ -820,18 +992,46 @@ def redact_image_pdf(file_path:str, prepared_pdf_file_paths:List[str], language:
820
 
821
  all_line_level_ocr_results_df = pd.concat([all_line_level_ocr_results_df, line_level_ocr_results_df])
822
 
823
  if is_pdf(file_path) == False:
824
  images.append(image)
825
  pymupdf_doc = images
826
 
827
- all_image_annotations.append(image_annotations)
828
 
829
- #print("\nall_image_annotations for page", str(page_no), "are:", all_image_annotations)
830
 
831
- all_line_level_ocr_results_df.to_csv(ocr_results_file_path)
832
- logging_file_paths.append(ocr_results_file_path)
833
 
834
- return pymupdf_doc, all_decision_process_table, logging_file_paths, request_metadata, all_image_annotations
 
 
835
 
836
 
837
  ###
@@ -848,7 +1048,30 @@ def get_text_container_characters(text_container:LTTextContainer):
848
 
849
  return characters
850
  return []
851
852
  def analyse_text_container(text_container:OCRResult, language:str, chosen_redact_entities:List[str], score_threshold:float, allow_list:List[str]):
853
  '''
854
  Take text and bounding boxes in OCRResult format and analyze it for PII using spacy and the Microsoft Presidio package.
@@ -856,19 +1079,23 @@ def analyse_text_container(text_container:OCRResult, language:str, chosen_redact
856
 
857
  analyser_results = []
858
 
859
- text_to_analyze = text_container.text
860
- #print("text_to_analyze:", text_to_analyze)
 
861
 
862
  if chosen_redact_entities:
863
- analyser_results = nlp_analyser.analyze(text=text_to_analyze,
 
 
864
  language=language,
865
  entities=chosen_redact_entities,
866
  score_threshold=score_threshold,
867
  return_decision_process=True,
868
- allow_list=allow_list)
869
 
870
  return analyser_results
871
 
 
872
  def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tuple[List[OCRResult], List[LTChar]]:
873
  '''
874
  Create an OCRResult object based on a list of pdfminer LTChar objects.
@@ -881,6 +1108,7 @@ def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tup
881
 
882
  # Initialize variables
883
  full_text = ""
 
884
  overall_bbox = [float('inf'), float('inf'), float('-inf'), float('-inf')] # [x0, y0, x1, y1]
885
  word_bboxes = []
886
 
@@ -894,6 +1122,7 @@ def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tup
894
  if isinstance(char, LTAnno):
895
  # Handle space separately by finalizing the word
896
  full_text += char.get_text() # Adds space or newline
 
897
  if current_word: # Only finalize if there is a current word
898
  word_bboxes.append((current_word, current_word_bbox))
899
  current_word = ""
@@ -918,7 +1147,17 @@ def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tup
918
  continue
919
 
920
  # Concatenate text for LTChar
921
- full_text += char.get_text()
922
 
923
  # Update overall bounding box
924
  x0, y0, x1, y1 = char.bbox
@@ -928,7 +1167,8 @@ def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tup
928
  overall_bbox[3] = max(overall_bbox[3], y1) # y1
929
 
930
  # Update current word
931
- current_word += char.get_text()
 
932
 
933
  # Update current word bounding box
934
  current_word_bbox[0] = min(current_word_bbox[0], x0) # x0
@@ -936,18 +1176,25 @@ def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tup
936
  current_word_bbox[2] = max(current_word_bbox[2], x1) # x1
937
  current_word_bbox[3] = max(current_word_bbox[3], y1) # y1
938
 
939
-
940
  # Finalize the last word if any
941
  if current_word:
942
  word_bboxes.append((current_word, current_word_bbox))
943
 
944
  if full_text:
945
  line_level_results_out.append(OCRResult(full_text, round(overall_bbox[0],2), round(overall_bbox[1], 2), round(overall_bbox[2]-overall_bbox[0],2), round(overall_bbox[3]-overall_bbox[1],2)))
946
-
 
947
 
948
  return line_level_results_out, line_level_characters_out # Return both results and character objects
949
 
950
- def merge_text_bounding_boxes(analyser_results:CustomImageRecognizerResult, characters:List[LTChar], combine_pixel_dist:int, vertical_padding:int=0):
951
  '''
952
  Merge identified bounding boxes containing PII that are very close to one another
953
  '''
@@ -1003,9 +1250,10 @@ def merge_text_bounding_boxes(analyser_results:CustomImageRecognizerResult, char
1003
  current_box[3] = max(current_box[3], char_box[3]) # Ensure the top is the highest
1004
  current_result.end = max(current_result.end, result.end) # Extend the text range
1005
  try:
1006
- current_result.type = current_result.type + " - " + result.type
1007
- except:
1008
- print("Unable to append new result type.")
 
1009
  # Add a space if current_text is not empty
1010
  if current_text:
1011
  current_text.append(" ") # Add space between texts
@@ -1082,52 +1330,98 @@ def create_annotations_for_bounding_boxes(analysed_bounding_boxes):
1082
  annotations_on_page.append(annotation)
1083
  return annotations_on_page
1084
 
1085
- def redact_text_pdf(filename:str, prepared_pdf_image_path:str, language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, page_min:int=0, page_max:int=999, analysis_type:str = "Simple text analysis - PDFs with selectable text", progress=Progress(track_tqdm=True)):
1086
- '''
1087
- Redact chosen entities from a pdf that is made up of multiple pages that are not images.
1088
  '''
1089
- annotations_all_pages = []
1090
- all_image_annotations = []
1091
- page_text_outputs_all_pages = pd.DataFrame()
1092
- decision_process_table_all_pages = pd.DataFrame()
1093
 
1094
- combine_pixel_dist = 20 # Horizontal distance between PII bounding boxes under/equal they are combined into one
1095
 
1096
  # Open with Pikepdf to get text lines
1097
  pikepdf_pdf = Pdf.open(filename)
1098
  number_of_pages = len(pikepdf_pdf.pages)
1099
-
1100
- # Also open pdf with pymupdf to be able to annotate later while retaining text
1101
- pymupdf_doc = pymupdf.open(filename)
1102
 
1103
- page_num = 0
1104
-
1105
  # Check that page_min and page_max are within expected ranges
1106
  if page_max > number_of_pages or page_max == 0:
1107
  page_max = number_of_pages
1108
- #else:
1109
- # page_max = page_max - 1
1110
 
1111
  if page_min <= 0: page_min = 0
1112
  else: page_min = page_min - 1
1113
 
1114
  print("Page range is",str(page_min + 1), "to", str(page_max))
1115
 
1116
  #for page_no in range(0, number_of_pages):
1117
- for page_no in progress.tqdm(range(0, number_of_pages), unit="pages", desc="Redacting pages"):
1118
-
1119
- #print("prepared_pdf_image_path:", prepared_pdf_image_path)
1120
- #print("prepared_pdf_image_path[page_no]:", prepared_pdf_image_path[page_no])
1121
- image = prepared_pdf_image_path[page_no]
1122
 
1123
- image_annotations = {"image": image, "boxes": []}
 
1124
1125
  pymupdf_page = pymupdf_doc.load_page(page_no)
1126
 
1127
- print("Page number is:", str(page_no + 1))
 
 
1128
 
1129
  if page_min <= page_no < page_max:
1130
 
 
 
1131
  for page_layout in extract_pages(filename, page_numbers = [page_no], maxpages=1):
1132
 
1133
  page_analyser_results = []
@@ -1139,18 +1433,18 @@ def redact_text_pdf(filename:str, prepared_pdf_image_path:str, language:str, cho
1139
  page_text_outputs = pd.DataFrame()
1140
 
1141
  if analysis_type == "Simple text analysis - PDFs with selectable text":
1142
- for text_container in page_layout:
1143
 
1144
  text_container_analyser_results = []
1145
  text_container_analysed_bounding_boxes = []
 
1146
 
1147
- characters = get_text_container_characters(text_container)
 
1148
 
1149
  # Create dataframe for all the text on the page
1150
  line_level_text_results_list, line_characters = create_text_bounding_boxes_from_characters(characters)
1151
 
1152
- #print("line_characters:", line_characters)
1153
-
1154
  # Create page_text_outputs (OCR format outputs)
1155
  if line_level_text_results_list:
1156
  # Convert to DataFrame and add to ongoing logging table
@@ -1167,60 +1461,79 @@ def redact_text_pdf(filename:str, prepared_pdf_image_path:str, language:str, cho
1167
 
1168
  # Analyse each line of text in turn for PII and add to list
1169
  for i, text_line in enumerate(line_level_text_results_list):
 
1170
  text_line_analyser_result = []
1171
  text_line_bounding_boxes = []
1172
 
1173
- #print("text_line:", text_line.text)
1174
-
1175
  text_line_analyser_result = analyse_text_container(text_line, language, chosen_redact_entities, score_threshold, allow_list)
1176
 
1177
  # Merge bounding boxes for the line if multiple found close together
1178
  if text_line_analyser_result:
 
 
 
1179
  # Merge bounding boxes if very close together
1180
- #print("text_line_bounding_boxes:", text_line_bounding_boxes)
1181
- #print("line_characters:")
1182
- #print(line_characters[i])
1183
- #print("".join(char._text for char in line_characters[i]))
1184
- text_line_bounding_boxes = merge_text_bounding_boxes(text_line_analyser_result, line_characters[i], combine_pixel_dist, vertical_padding = 0)
1185
 
1186
  text_container_analyser_results.extend(text_line_analyser_result)
1187
  text_container_analysed_bounding_boxes.extend(text_line_bounding_boxes)
1188
-
1189
- #print("\n FINAL text_container_analyser_results:", text_container_analyser_results)
1190
-
1191
-
1192
  page_analyser_results.extend(text_container_analyser_results)
1193
  page_analysed_bounding_boxes.extend(text_container_analysed_bounding_boxes)
1194
 
1195
  # Annotate redactions on page
1196
  annotations_on_page = create_annotations_for_bounding_boxes(page_analysed_bounding_boxes)
1197
-
1198
-
1199
- # Make page annotations
1200
- #page.Annots = pdf.make_indirect(annotations_on_page)
1201
- #if annotations_on_page:
1202
 
1203
- # Make pymupdf redactions
1204
  pymupdf_page, image_annotations = redact_page_with_pymupdf(pymupdf_page, annotations_on_page, image)
1205
 
1206
- annotations_all_pages.extend([annotations_on_page])
1207
 
1208
  print("For page number:", page_no, "there are", len(image_annotations["boxes"]), "annotations")
1209
 
1210
  # Write logs
1211
  # Create decision process table
1212
- decision_process_table_on_page = create_text_redaction_process_results(page_analyser_results, page_analysed_bounding_boxes, page_num)
1213
 
1214
  if not decision_process_table_on_page.empty:
1215
- decision_process_table_all_pages = pd.concat([decision_process_table_all_pages, decision_process_table_on_page])
1216
 
1217
  if not page_text_outputs.empty:
1218
  page_text_outputs = page_text_outputs.sort_values(["top", "left"], ascending=[False, False]).reset_index(drop=True)
1219
- #page_text_outputs.to_csv("text_page_text_outputs.csv")
1220
- page_text_outputs_all_pages = pd.concat([page_text_outputs_all_pages, page_text_outputs])
1221
 
1222
- all_image_annotations.append(image_annotations)
1223
 
1224
- #print("all_image_annotations:", all_image_annotations)
1225
 
1226
- return pymupdf_doc, decision_process_table_all_pages, page_text_outputs_all_pages, all_image_annotations
 
4
  import io
5
  import os
6
  import boto3
7
+
8
+ from tqdm import tqdm
9
  from PIL import Image, ImageChops, ImageFile, ImageDraw
10
  ImageFile.LOAD_TRUNCATED_IMAGES = True
11
 
 
27
  from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult
28
  from tools.file_conversion import process_file
29
  from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold
30
+ from tools.helper_functions import get_file_path_end, output_folder, clean_unicode_text, get_or_create_env_var
31
  from tools.file_conversion import process_file, is_pdf, convert_text_pdf_to_img_pdf, is_pdf_or_image
32
  from tools.data_anonymise import generate_decision_process_output
33
  from tools.aws_textract import analyse_page_with_textract, convert_pike_pdf_page_to_bytes, json_to_ocrresult
34
 
35
+ # Number of pages to loop through before breaking. Currently set very high, since the loop now breaks on elapsed time (e.g. every 105 seconds) rather than on the number of pages redacted.
36
+
37
+ page_break_value = get_or_create_env_var('page_break_value', '500')
38
+ print(f'The value of page_break_value is {page_break_value}')
39
+
40
+ max_time_value = get_or_create_env_var('max_time_value', '105')
41
+ print(f'The value of max_time_value is {max_time_value}')
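A minimal sketch of what a helper like get_or_create_env_var could look like, assuming it simply reads the variable and registers the default when unset (the real implementation lives in tools.helper_functions):

import os

def get_or_create_env_var(var_name: str, default_value: str) -> str:
    # Return the environment variable if it is already set; otherwise
    # register the default so later lookups see a consistent value.
    value = os.environ.get(var_name)
    if value is None:
        os.environ[var_name] = default_value
        value = default_value
    return value

With that behaviour, exporting max_time_value=60 before launch would lower the time budget to 60 seconds.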
42
+
43
+
44
  def sum_numbers_before_seconds(string:str):
45
  """Extracts numbers that precede the word 'seconds' from a string and adds them up.
46
 
 
62
 
63
  return sum_of_numbers
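The body of sum_numbers_before_seconds is collapsed in this view. A minimal sketch of the behaviour the docstring describes, assuming messages of the form "Redacted in 3.5 seconds.":

import re

def sum_numbers_before_seconds(string: str) -> float:
    # Pull out every number that appears directly before the word
    # "seconds" and add them together.
    matches = re.findall(r'(\d+(?:\.\d+)?)\s*seconds', string)
    return sum(float(m) for m in matches)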
64
 
65
+
66
+ def choose_and_run_redactor(file_paths:List[str],
67
+ prepared_pdf_file_paths:List[str],
68
+ prepared_pdf_image_paths:List[str],
69
+ language:str,
70
+ chosen_redact_entities:List[str],
71
+ in_redact_method:str,
72
+ in_allow_list:List[List[str]]=None,
73
+ latest_file_completed:int=0,
74
+ out_message:list=[],
75
+ out_file_paths:list=[],
76
+ log_files_output_paths:list=[],
77
+ first_loop_state:bool=False,
78
+ page_min:int=0,
79
+ page_max:int=999,
80
+ estimated_time_taken_state:float=0.0,
81
+ handwrite_signature_checkbox:List[str]=["Redact all identified handwriting", "Redact all identified signatures"],
82
+ all_request_metadata_str:str = "",
83
+ annotations_all_pages:dict={},
84
+ all_line_level_ocr_results_df=[],
85
+ all_decision_process_table=[],
86
+ pymupdf_doc=[],
87
+ current_loop_page:int=0,
88
+ page_break_return:bool=False,
89
+ progress=gr.Progress(track_tqdm=True)):
90
  '''
91
+ This function orchestrates the redaction process based on the specified method and parameters. It takes the following inputs:
92
+
93
+ - file_paths (List[str]): A list of paths to the files to be redacted.
94
+ - prepared_pdf_file_paths (List[str]): A list of paths to the PDF files prepared for redaction.
95
+ - prepared_pdf_image_paths (List[str]): A list of paths to the PDF files converted to images for redaction.
96
+ - language (str): The language of the text in the files.
97
+ - chosen_redact_entities (List[str]): A list of entity types to redact from the files.
98
+ - in_redact_method (str): The method to use for redaction.
99
+ - in_allow_list (List[List[str]], optional): A list of terms that should be exempt from redaction. Defaults to None.
100
+ - latest_file_completed (int, optional): The index of the last completed file. Defaults to 0.
101
+ - out_message (list, optional): A list to store output messages. Defaults to an empty list.
102
+ - out_file_paths (list, optional): A list to store paths to the output files. Defaults to an empty list.
103
+ - log_files_output_paths (list, optional): A list to store paths to the log files. Defaults to an empty list.
104
+ - first_loop_state (bool, optional): A flag indicating if this is the first iteration. Defaults to False.
105
+ - page_min (int, optional): The minimum page number to start redaction from. Defaults to 0.
106
+ - page_max (int, optional): The maximum page number to end redaction at. Defaults to 999.
107
+ - estimated_time_taken_state (float, optional): The estimated time taken for the redaction process. Defaults to 0.0.
108
+ - handwrite_signature_checkbox (List[str], optional): A list of options for redacting handwriting and signatures. Defaults to ["Redact all identified handwriting", "Redact all identified signatures"].
109
+ - all_request_metadata_str (str, optional): A string containing all request metadata. Defaults to an empty string.
110
+ - annotations_all_pages (dict, optional): A dictionary containing all image annotations. Defaults to an empty dictionary.
111
+ - all_line_level_ocr_results_df (optional): A DataFrame containing all line-level OCR results. Defaults to an empty DataFrame.
112
+ - all_decision_process_table (optional): A DataFrame containing all decision process tables. Defaults to an empty DataFrame.
113
+ - pymupdf_doc (optional): A list containing the PDF document object. Defaults to an empty list.
114
+ - current_loop_page (int, optional): The current page being processed in the loop. Defaults to 0.
115
+ - page_break_return (bool, optional): A flag indicating if the function should return after a page break. Defaults to False.
116
+ - progress (gr.Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
117
+
118
+ The function returns a redacted document along with processing logs.
119
  '''
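The resume contract above can be illustrated with a simplified stand-in (hypothetical; the real function also returns Gradio components carrying the app's hidden state, and the wiring happens through chained Gradio events rather than a plain loop):

def run_to_completion(redact_one_chunk, number_of_pages: int) -> int:
    # redact_one_chunk(current_page) -> (next_page, page_break_reached);
    # keep re-invoking until no page break is reported or all pages are done.
    current_page, page_break = 0, True
    while page_break and current_page < number_of_pages:
        current_page, page_break = redact_one_chunk(current_page)
    return current_page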
120
+ combined_out_message = ""
121
  tic = time.perf_counter()
122
  all_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []
123
 
124
  # If this is the first time around, set variables to 0/blank
125
  if first_loop_state==True:
126
+ print("First_loop_state is True")
127
  latest_file_completed = 0
128
+ current_loop_page = 0
129
  #out_message = []
130
  out_file_paths = []
131
+ estimate_total_processing_time = 0
132
+ estimated_time_taken_state = 0
133
+
134
+ # If not the first time around, and the current page loop has been set to a huge number (been through all pages), reset current page to 0
135
+ elif (first_loop_state == False) & (current_loop_page == 999):
136
+ current_loop_page = 0
137
+
138
 
139
  # If out message is string or out_file_paths are blank, change to a list so it can be appended to
140
+ #if isinstance(out_message, str):
141
+ # out_message = [out_message]
142
 
143
  if not out_file_paths:
144
  out_file_paths = []
145
 
146
  latest_file_completed = int(latest_file_completed)
147
 
148
+ number_of_pages = len(prepared_pdf_image_paths)
149
+
150
+ if isinstance(file_paths,str):
151
+ number_of_files = 1
152
+ else:
153
+ number_of_files = len(file_paths)
154
+
155
+
156
+ print("\nIn choose_and_run_redactor function, latest_file_completed is:", latest_file_completed)
157
+ print("current_loop_page is:", current_loop_page)
158
+
159
 
160
  # If we have already redacted the last file, return the input out_message and file list to the relevant components
161
+ if latest_file_completed >= number_of_files:
162
+
163
+ print("latest_file_completed is equal to or greater than the number of files")
164
  # Set to a very high number so as not to mix up with subsequent file processing by the user
165
  latest_file_completed = 99
166
+ current_loop_page = 0
167
+
168
+ if isinstance(out_message, list):
169
+ combined_out_message = '\n'.join(out_message)
170
+ else:
171
+ combined_out_message = out_message
172
 
173
+ estimate_total_processing_time = sum_numbers_before_seconds(combined_out_message)
174
  print("Estimated total processing time:", str(estimate_total_processing_time))
175
 
176
+ return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table
 
 
177
 
178
+ # If we have reached the last page, return message
179
+ if current_loop_page >= number_of_pages:
180
+ print("current_loop_page:", current_loop_page, "is equal to or greater than number of pages in document:", number_of_pages)
181
+
182
+ # Set to a very high number so as not to mix up with subsequent file processing by the user
183
+ current_loop_page = 999
184
+ combined_out_message = out_message
185
 
186
+ return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = False, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table
187
+
188
+ # Create allow list
189
  if not in_allow_list.empty:
190
  in_allow_list_flat = in_allow_list[0].tolist()
191
  print("In allow list:", in_allow_list_flat)
 
193
  in_allow_list_flat = []
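For reference, the allow list arrives as a single-column pandas DataFrame, so the flattening above behaves like this:

import pandas as pd

in_allow_list = pd.DataFrame(["Jane Doe", "ACME Ltd"])  # example terms
assert in_allow_list[0].tolist() == ["Jane Doe", "ACME Ltd"]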
194
 
195
  progress(0.5, desc="Redacting file")
196
+
197
 
198
+ if isinstance(file_paths, str):
199
+ file_paths_list = [file_paths]
200
+ file_paths_loop = file_paths_list
201
+ else:
202
+ file_paths_list = file_paths
203
+ file_paths_loop = [file_paths_list[int(latest_file_completed)]]
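A worked example of the selection above: only the file at index latest_file_completed is processed on each pass, so successive invocations advance one file at a time:

file_paths = ["a.pdf", "b.pdf", "c.pdf"]  # example upload
latest_file_completed = 1
file_paths_loop = [file_paths[int(latest_file_completed)]]
assert file_paths_loop == ["b.pdf"]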
204
+
205
+
206
  for file in file_paths_loop:
207
+ if isinstance(file, str):
208
+ file_path = file
209
+ else:
210
+ file_path = file.name
211
 
212
  if file_path:
213
  file_path_without_ext = get_file_path_end(file_path)
214
+ print("Redacting file:", file_path_without_ext)
215
+
216
  is_a_pdf = is_pdf(file_path) == True
217
  if is_a_pdf == False:
218
  # If user has not submitted a pdf, assume it's an image
 
221
  else:
222
  out_message = "No file selected"
223
  print(out_message)
224
+
225
+ return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table
226
 
227
  if in_redact_method == "Quick image analysis - typed text" or in_redact_method == "Complex image analysis - docs with handwriting/signatures (AWS Textract)":
228
 
 
233
  except:
234
  out_message = "Cannot connect to AWS Textract. Please choose another redaction method."
235
  print(out_message)
236
+ return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table
237
 
238
  #Analyse and redact image-based pdf or image
239
  if is_pdf_or_image(file_path) == False:
240
  out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
241
+ return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table
242
 
243
  print("Redacting file " + file_path_without_ext + " as an image-based file")
244
 
245
+ pymupdf_doc, all_decision_process_table, logging_file_paths, new_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df = redact_image_pdf(file_path, prepared_pdf_image_paths, language, chosen_redact_entities, in_allow_list_flat, is_a_pdf, page_min, page_max, in_redact_method, handwrite_signature_checkbox, "", current_loop_page, page_break_return, prepared_pdf_image_paths, annotations_all_pages, all_line_level_ocr_results_df, all_decision_process_table, pymupdf_doc)
246
+
247
+ # Save Textract request metadata (if exists)
248
+ if new_request_metadata:
249
+ print("Request metadata:", new_request_metadata)
250
+ all_request_metadata.append(new_request_metadata)
251
+
252
+ elif in_redact_method == "Simple text analysis - PDFs with selectable text":
253
+
254
+ logging_file_paths = ""
255
+
256
+ if is_pdf(file_path) == False:
257
+ out_message = "Please upload a PDF file for text analysis. If you have an image, select 'Image analysis'."
258
+ return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table
259
+
260
+ # Analyse text-based pdf
261
+ print('Redacting file as text-based PDF')
262
+
263
+ pymupdf_doc, all_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return = redact_text_pdf(file_path, prepared_pdf_image_paths, language, chosen_redact_entities, in_allow_list_flat, page_min, page_max, "Simple text analysis - PDFs with selectable text", current_loop_page, page_break_return, annotations_all_pages, all_line_level_ocr_results_df, all_decision_process_table, pymupdf_doc)
264
+
265
+ else:
266
+ out_message = "No redaction method selected"
267
+ print(out_message)
268
+ return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table
269
+
270
+ # If we are at the last page, save the redacted document and logs to file
271
+ if current_loop_page >= number_of_pages:
272
+
273
+ print("Current page loop:", current_loop_page, "is greater or equal to number of pages:", number_of_pages)
274
+ latest_file_completed += 1
275
+ current_loop_page = 999
276
+
277
+ if latest_file_completed != len(file_paths):
278
+ print("Completed file number:", str(latest_file_completed), "there are more files to do")
279
 
280
  # Save file
281
  if is_pdf(file_path) == False:
282
  out_image_file_path = output_folder + file_path_without_ext + "_redacted_as_img.pdf"
283
+ pymupdf_doc[0].save(out_image_file_path, "PDF" ,resolution=100.0, save_all=True, append_images=pymupdf_doc[1:])
284
 
285
  else:
286
  out_image_file_path = output_folder + file_path_without_ext + "_redacted.pdf"
287
+ pymupdf_doc.save(out_image_file_path)
288
 
289
  out_file_paths.append(out_image_file_path)
290
  if logging_file_paths:
291
  log_files_output_paths.extend(logging_file_paths)
292
 
293
+ #if isinstance(out_message, list):
294
+ # out_message.append("File '" + file_path_without_ext + "' successfully redacted")
295
 
296
  logs_output_file_name = out_image_file_path + "_decision_process_output.csv"
297
+ all_decision_process_table.to_csv(logs_output_file_name, index = None, encoding="utf-8")
298
  log_files_output_paths.append(logs_output_file_name)
299
 
300
+ all_text_output_file_name = out_image_file_path + "_ocr_output.csv"
301
+ all_line_level_ocr_results_df.to_csv(all_text_output_file_name, index = None, encoding="utf-8")
302
+ log_files_output_paths.append(all_text_output_file_name)
 
303
 
304
+ # Make a combined message for the file
305
+ if isinstance(out_message, list):
306
+ combined_out_message = '\n'.join(out_message) # Ensure out_message is a list of strings
307
+ else: combined_out_message = out_message
308
+
309
+ out_time_message = f" Redacted in {estimated_time_taken_state:0.1f} seconds."
310
+ combined_out_message = combined_out_message + " " + out_time_message # Ensure this is a single string
311
+
312
  # Increase latest file completed count unless we are at the last file
313
+ # if latest_file_completed != len(file_paths):
314
+ # print("Completed file number:", str(latest_file_completed), "more files to do")
 
315
 
316
+ # if current_loop_page >= number_of_pages:
317
 
318
+ # print("Current page loop", current_loop_page, "is greater than or equal to number of pages:", number_of_pages)
319
+ # latest_file_completed += 1
 
320
 
321
+ # # Set to 999 to be a big number not to interrupt processing of large files by user
322
+ # current_loop_page = 999
 
323
 
324
+ # out_text_file_path = output_folder + file_path_without_ext + "_text_redacted.pdf"
325
+ # pymupdf_doc.save(out_text_file_path)
326
+ # out_file_paths.append(out_text_file_path)
327
 
328
+ # # Write logs to file
329
+ # decision_logs_output_file_name = out_text_file_path + "_decision_process_output.csv"
330
+ # all_decision_process_table.to_csv(decision_logs_output_file_name)
331
+ # log_files_output_paths.append(decision_logs_output_file_name)
332
 
333
+ # all_text_output_file_name = out_text_file_path + "_all_text_output.csv"
334
+ # all_line_level_ocr_results_df.to_csv(all_text_output_file_name)
335
+ # log_files_output_paths.append(all_text_output_file_name)
336
+
337
+ # out_message_new = "File '" + file_path_without_ext + "' successfully redacted"
338
 
339
+ # if isinstance(out_message, list):
340
+ # out_message.append(out_message_new) # Ensure out_message is a list of strings
341
 
342
  if latest_file_completed != len(file_paths):
343
+ print("Completed file number:", str(latest_file_completed), " there are more files to do")
344
+
345
+
346
+ # Make a combined message for the file
347
+ if isinstance(out_message, list):
348
+ combined_out_message = '\n'.join(out_message) # Ensure out_message is a list of strings
349
+ else: combined_out_message = out_message
350
+
351
+ out_time_message = f" Redacted in {estimated_time_taken_state:0.1f} seconds."
352
+ combined_out_message = combined_out_message + " " + out_time_message # Ensure this is a single string
353
+
354
+ estimate_total_processing_time = sum_numbers_before_seconds(combined_out_message)
355
+ print("Estimated total processing time:", str(estimate_total_processing_time))
356
 
357
  toc = time.perf_counter()
358
+ time_taken = toc - tic
359
+ estimated_time_taken_state = estimated_time_taken_state + time_taken
360
 
361
  # If textract requests made, write to logging file
362
  if all_request_metadata:
 
371
  if all_request_metadata_file_path not in log_files_output_paths:
372
  log_files_output_paths.append(all_request_metadata_file_path)
373
 
374
+ if combined_out_message: out_message = combined_out_message
375
+
376
+ print("\nout_message at choose_and_run_redactor end is:", out_message)
377
+
378
+ # Ensure no duplicated output files
379
+ log_files_output_paths = list(set(log_files_output_paths))
380
+ out_file_paths = list(set(out_file_paths))
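One caveat: list(set(...)) removes duplicates but does not preserve order. If output ordering ever matters for the UI, an order-preserving alternative is dict.fromkeys:

log_files_output_paths = list(dict.fromkeys(log_files_output_paths))
out_file_paths = list(dict.fromkeys(out_file_paths))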
381
+
382
 
383
+ return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table
384
 
385
  def convert_pikepdf_coords_to_pymudf(pymupdf_page, annot):
386
  '''
 
576
 
577
  if image:
578
  image_x1, image_y1, image_x2, image_y2 = convert_pikepdf_to_image_coords(page, annot, image)
 
579
 
580
  img_annotation_box["xmin"] = image_x1
581
  img_annotation_box["ymin"] = image_y1
 
600
  rect_single_pixel_height = Rect(x1, middle_y - 2, x2, middle_y + 2) # Small height in middle of word to remove text
601
 
602
  # Add the annotation to the middle of the character line, so that it doesn't delete text from adjacent lines
603
+ #print("rect_single_pixel_height:", rect_single_pixel_height)
604
  page.add_redact_annot(rect_single_pixel_height)
605
 
606
  # Set up drawing a black box over the whole rect
 
614
  "boxes": all_image_annotation_boxes
615
  }
616
 
617
+ #print("out_annotation_boxes:", out_annotation_boxes)
618
+
619
  page.apply_redactions(images=0, graphics=0)
620
  page.clean_contents()
621
 
 
633
  merged_bboxes = []
634
  grouped_bboxes = defaultdict(list)
635
 
636
  # Reconstruct bounding boxes for substrings of interest
637
  reconstructed_bboxes = []
638
  for bbox in bboxes:
 
724
  merged_bboxes.append(merged_box)
725
  merged_box = next_box
726
 
727
+ merged_bboxes.append(merged_box)
728
+
729
+ # Process signature and handwriting results
730
+ if signature_recogniser_results or handwriting_recogniser_results:
731
+ if "Redact all identified handwriting" in handwrite_signature_checkbox:
732
+ #print("Handwriting boxes exist at merge:", handwriting_recogniser_results)
733
+ merged_bboxes.extend(handwriting_recogniser_results)
734
+
735
+ if "Redact all identified signatures" in handwrite_signature_checkbox:
736
+ #print("Signature boxes exist at merge:", signature_recogniser_results)
737
+ merged_bboxes.extend(signature_recogniser_results)
738
+
739
+ #print("bboxes:", bboxes)
740
 
741
  return merged_bboxes
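Note the behavioural change in this revision: handwriting and signature boxes are now appended after the text-box merge rather than extended into the input beforehand, so they survive as standalone redactions instead of being absorbed into neighbouring text boxes. Schematically, with merge_text_boxes as a hypothetical stand-in for the merging logic above:

def merge_with_recognisers(bboxes, handwriting_results, signature_results, options, merge_text_boxes):
    merged = merge_text_boxes(bboxes)          # text PII boxes only
    if "Redact all identified handwriting" in options:
        merged.extend(handwriting_results)     # kept whole, unmerged
    if "Redact all identified signatures" in options:
        merged.extend(signature_results)
    return merged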
742
 
743
+ def redact_image_pdf(file_path:str, prepared_pdf_file_paths:List[str], language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, is_a_pdf:bool=True, page_min:int=0, page_max:int=999, analysis_type:str="Quick image analysis - typed text", handwrite_signature_checkbox:List[str]=["Redact all identified handwriting", "Redact all identified signatures"], request_metadata:str="", current_loop_page:int=0, page_break_return:bool=False, images=[], annotations_all_pages:List=[], all_line_level_ocr_results_df = pd.DataFrame(), all_decision_process_table = pd.DataFrame(), pymupdf_doc = [], page_break_val:int=int(page_break_value), logging_file_paths:List=[], max_time:int=int(max_time_value), progress=Progress(track_tqdm=True)):
744
+
745
  '''
746
+ This function redacts sensitive information from a PDF document. It takes the following parameters:
747
+
748
+ - file_path (str): The path to the PDF file to be redacted.
749
+ - prepared_pdf_file_paths (List[str]): A list of paths to the PDF file pages converted to images.
750
+ - language (str): The language of the text in the PDF.
751
+ - chosen_redact_entities (List[str]): A list of entity types to redact from the PDF.
752
+ - allow_list (List[str], optional): A list of terms to exempt from redaction. Defaults to None.
753
+ - is_a_pdf (bool, optional): Indicates if the input file is a PDF. Defaults to True.
754
+ - page_min (int, optional): The minimum page number to start redaction from. Defaults to 0.
755
+ - page_max (int, optional): The maximum page number to end redaction at. Defaults to 999.
756
+ - analysis_type (str, optional): The type of analysis to perform on the PDF. Defaults to "Quick image analysis - typed text".
757
+ - handwrite_signature_checkbox (List[str], optional): A list of options for redacting handwriting and signatures. Defaults to ["Redact all identified handwriting", "Redact all identified signatures"].
758
+ - request_metadata (str, optional): Metadata related to the redaction request. Defaults to an empty string.
759
+ - current_loop_page (int, optional): The current page being processed in the loop. Defaults to 0.
760
+ - page_break_return (bool, optional): Indicates if the function should return after a page break. Defaults to False.
761
+ - page_break_val (int, optional): The number of processed pages after which to trigger a page break. Defaults to the page_break_value environment variable (500 unless overridden).
762
+ - max_time (int, optional): The maximum time in seconds that the function may run before it breaks, to avoid timeout errors with some APIs. Defaults to the max_time_value environment variable (105 unless overridden).
763
+ - progress (Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
764
+
765
+ The function returns a redacted PDF document.
766
  '''
 
 
767
  file_name = get_file_path_end(file_path)
768
  fill = (0, 0, 0) # Fill colour
 
769
  image_analyser = CustomImageAnalyzerEngine(nlp_analyser)
770
 
771
+ #print("pymupdf_doc at start of redact_image_pdf function:", pymupdf_doc)
772
+
773
+ tic = time.perf_counter()
774
 
775
  if not prepared_pdf_file_paths:
776
  out_message = "PDF does not exist as images. Converting pages to image"
 
778
 
779
  prepared_pdf_file_paths = process_file(file_path)
780
 
781
  number_of_pages = len(prepared_pdf_file_paths)
 
782
  print("Number of pages:", str(number_of_pages))
783
 
784
  # Check that page_min and page_max are within expected ranges
785
  if page_max > number_of_pages or page_max == 0:
786
  page_max = number_of_pages
787
 
788
+ if page_min <= 0: page_min = 0
789
+ else: page_min = page_min - 1
 
 
790
 
791
  print("Page range:", str(page_min + 1), "to", str(page_max))
792
+ print("Current_loop_page:", current_loop_page)
793
+
794
  if analysis_type == "Quick image analysis - typed text": ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + ".csv"
795
  elif analysis_type == "Complex image analysis - docs with handwriting/signatures (AWS Textract)": ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + "_textract.csv"
796
 
797
+ if current_loop_page == 0: page_loop_start = 0
798
+ else: page_loop_start = current_loop_page
799
+
800
+ #progress_bar = progress.tqdm(range(page_loop_start, number_of_pages), unit="pages", desc="Redacting pages")
801
+ progress_bar = tqdm(range(page_loop_start, number_of_pages), unit="pages remaining", desc="Redacting pages")
802
+
803
+ for page_no in progress_bar:
804
+
805
  handwriting_or_signature_boxes = []
806
  signature_recogniser_results = []
807
  handwriting_recogniser_results = []
808
+ page_break_return = False
 
809
 
810
+ reported_page_number = str(page_no + 1)
811
+ print("Redacting page:", reported_page_number)
812
+
813
  # Assuming prepared_pdf_file_paths[page_no] is a PIL image object
814
  try:
815
  image = prepared_pdf_file_paths[page_no]#.copy()
816
  #print("image:", image)
817
  except Exception as e:
818
  print("Could not redact page:", reported_page_number, "due to:")
819
+ print(e)
 
820
  continue
821
 
822
+ image_annotations = {"image": image, "boxes": []}
 
 
823
  pymupdf_page = pymupdf_doc.load_page(page_no)
824
 
825
+ if page_no >= page_min and page_no < page_max:
 
 
826
 
827
+ #print("Image is in range of pages to redact")
828
+ if isinstance(image, str):
829
+ #print("image is a file path")
830
+ image = Image.open(image)
831
 
832
  # Need image size to convert textract OCR outputs to the correct sizes
833
  page_width, page_height = image.size
 
845
  # Combine OCR results
846
  line_level_ocr_results, line_level_ocr_results_with_children = combine_ocr_results(word_level_ocr_results)
847
848
 
849
  # Import results from json and convert
850
  if analysis_type == "Complex image analysis - docs with handwriting/signatures (AWS Textract)":
 
854
  image.save(image_buffer, format='PNG') # Save as PNG, or adjust format if needed
855
  pdf_page_as_bytes = image_buffer.getvalue()
856
 
857
+ #json_file_path = output_folder + file_name + "_page_" + reported_page_number + "_textract.json"
858
+ json_file_path = output_folder + file_name + "_textract.json"
859
 
860
  if not os.path.exists(json_file_path):
861
+ text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number) # Analyse page with Textract
862
  logging_file_paths.append(json_file_path)
863
  request_metadata = request_metadata + "\n" + new_request_metadata
864
+
865
+ wrapped_text_blocks = {"pages":[text_blocks]}
866
+
867
+ # Write the updated existing_data back to the JSON file
868
+ with open(json_file_path, 'w') as json_file:
869
+ json.dump(wrapped_text_blocks, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
870
  else:
871
  # Open the file and load the JSON data
872
+ print("Found existing Textract json results file.")
873
  with open(json_file_path, 'r') as json_file:
874
+ existing_data = json.load(json_file)
875
+
876
+ # Check if the current reported_page_number exists in the loaded JSON
877
+ page_exists = any(page['page_no'] == reported_page_number for page in existing_data.get("pages", []))
878
+
879
+ if not page_exists: # If the page does not exist, analyze again
880
+ print(f"Page number {reported_page_number} not found in existing data. Analyzing again.")
881
+ text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number) # Analyse page with Textract
882
+
883
+ # Check if "pages" key exists, if not, initialize it as an empty list
884
+ if "pages" not in existing_data:
885
+ existing_data["pages"] = []
886
+
887
+ # Append the new page data
888
+ existing_data["pages"].append(text_blocks)
889
+
890
+ # Write the updated existing_data back to the JSON file
891
+ with open(json_file_path, 'w') as json_file:
892
+ json.dump(existing_data, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
893
+
894
+ logging_file_paths.append(json_file_path)
895
+ request_metadata = request_metadata + "\n" + new_request_metadata
896
+ else:
897
+ # If the page exists, retrieve the data
898
+ text_blocks = next(page['data'] for page in existing_data["pages"] if page['page_no'] == reported_page_number)
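A small helper capturing the cache lookup pattern above; the "pages", "page_no" and "data" keys are inferred from this code rather than from a documented schema:

def get_cached_textract_page(existing_data: dict, reported_page_number: str):
    # Return the cached Textract blocks for a page, or None if the page
    # still needs to be sent to Textract for analysis.
    for page in existing_data.get("pages", []):
        if page.get("page_no") == reported_page_number:
            return page.get("data")
    return None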
899
 
900
+ line_level_ocr_results, handwriting_or_signature_boxes, signature_recogniser_results, handwriting_recogniser_results, line_level_ocr_results_with_children = json_to_ocrresult(text_blocks, page_width, page_height, reported_page_number)
901
 
902
  # Step 2: Analyze text and identify PII
903
  if chosen_redact_entities:
 
904
  redaction_bboxes = image_analyser.analyze_text(
905
  line_level_ocr_results,
906
  line_level_ocr_results_with_children,
 
912
  else:
913
  redaction_bboxes = []
914
 
915
+ #print("\nsignature_recogniser_boxes:", signature_recogniser_results)
916
+ #print("\nhandwriting_recogniser_boxes:", handwriting_recogniser_results)
917
+ #print("\nredaction_bboxes:", redaction_bboxes)
918
+
919
  if analysis_type == "Quick image analysis - typed text": interim_results_file_path = output_folder + "interim_analyser_bboxes_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + ".txt"
920
  elif analysis_type == "Complex image analysis - docs with handwriting/signatures (AWS Textract)": interim_results_file_path = output_folder + "interim_analyser_bboxes_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + "_textract.txt"
921
 
 
926
 
927
  # Merge close bounding boxes
928
  merged_redaction_bboxes = merge_img_bboxes(redaction_bboxes, line_level_ocr_results_with_children, signature_recogniser_results, handwriting_recogniser_results, handwrite_signature_checkbox)
929
+
930
  # 3. Draw the merged boxes
 
931
  if is_pdf(file_path) == False:
932
  draw = ImageDraw.Draw(image)
933
 
 
962
 
963
  ## Apply annotations with pymupdf
964
  else:
965
+ pymupdf_page, image_annotations = redact_page_with_pymupdf(pymupdf_page, merged_redaction_bboxes, image)
966
 
967
  # Convert decision process to table
968
  decision_process_table = pd.DataFrame([{
 
992
 
993
  all_line_level_ocr_results_df = pd.concat([all_line_level_ocr_results_df, line_level_ocr_results_df])
994
 
995
+ toc = time.perf_counter()
996
+
997
+ time_taken = toc - tic
998
+
999
+ #print("toc - tic:", time_taken)
1000
+
1001
+ # Break if time taken is greater than max_time seconds
1002
+ if time_taken > max_time:
1003
+ print("Processing for", max_time, "seconds, breaking loop.")
1004
+ page_break_return = True
1005
+ progress.close(_tqdm=progress_bar)
1006
+ tqdm._instances.clear()
1007
+
1008
+ if is_pdf(file_path) == False:
1009
+ images.append(image)
1010
+ pymupdf_doc = images
1011
+
1012
+ annotations_all_pages.append(image_annotations)
1013
+
1014
+ current_loop_page += 1
1015
+
1016
+ return pymupdf_doc, all_decision_process_table, logging_file_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df
1017
+
1018
  if is_pdf(file_path) == False:
1019
  images.append(image)
1020
  pymupdf_doc = images
1021
 
1022
+ annotations_all_pages.append(image_annotations)
1023
 
1024
+ current_loop_page += 1
1025
 
1026
+ # Break if new page is a multiple of chosen page_break_val
1027
+ if current_loop_page % page_break_val == 0:
1028
+ page_break_return = True
1029
+ progress.close(_tqdm=progress_bar)
1030
+ tqdm._instances.clear()
1031
 
1032
+ return pymupdf_doc, all_decision_process_table, logging_file_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df
1033
+
1034
+ return pymupdf_doc, all_decision_process_table, logging_file_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df
1035
 
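Both redaction paths share the control flow above: a `time.perf_counter()` stopwatch started at the top of the function is checked after each page, and the function returns early with the updated current_loop_page and page_break_return=True so the caller can re-invoke it and resume where it left off. A stripped-down sketch of that loop, with illustrative names (work_fn stands in for the per-page redaction):

```python
import time

def process_pages(pages, work_fn, current_loop_page=0, max_time=180, page_break_val=50):
    """Process pages until finished, the time limit is hit, or a page-break multiple is reached."""
    tic = time.perf_counter()
    page_break_return = False
    results = []

    for page_no in range(current_loop_page, len(pages)):
        results.append(work_fn(pages[page_no]))  # stand-in for per-page redaction
        current_loop_page += 1

        # Return early if this call has been running for more than max_time seconds
        if time.perf_counter() - tic > max_time:
            page_break_return = True
            break

        # Also return at regular intervals so the caller/UI can refresh
        if current_loop_page % page_break_val == 0:
            page_break_return = True
            break

    return results, current_loop_page, page_break_return
```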
1036
 
1037
  ###
 
1048
 
1049
  return characters
1050
  return []
1051
+
1052
+
1053
+ def initial_clean(text):
1054
+ # Cleaning functions: strip HTML tags/entities, non-ASCII characters, and repeated whitespace
1055
+ html_pattern_regex = r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});|\xa0|&nbsp;'
1056
+ html_start_pattern_end_dots_regex = r'<(.*?)\.\.'
1057
+ non_ascii_pattern = r'[^\x00-\x7F]+'
1058
+ multiple_spaces_regex = r'\s{2,}'
1059
+
1060
+ # Define a list of patterns and their replacements
1061
+ patterns = [
1062
+ (html_pattern_regex, ' '),
1063
+ (html_start_pattern_end_dots_regex, ' '),
1064
+ (non_ascii_pattern, ' '),
1065
+ (multiple_spaces_regex, ' ')
1066
+ ]
1067
+
1068
+ # Apply each regex replacement
1069
+ for pattern, replacement in patterns:
1070
+ text = re.sub(pattern, replacement, text)
1071
 
1072
+ return text
1073
+
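initial_clean is a four-step regex pipeline: strip HTML tags and entities, strip stray tag-start-plus-dots artefacts, blank out non-ASCII characters, then collapse repeated whitespace. A quick check with an illustrative input (note the result is not stripped, since the .strip() variant in analyse_text_container below is commented out):

```python
import re

# Reproduction of the patterns used by initial_clean above
text = "Call&nbsp;<b>John\xa0Smith</b>  on  07700 900000…"
for pattern, replacement in [
    (r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});|\xa0|&nbsp;', ' '),
    (r'<(.*?)\.\.', ' '),
    (r'[^\x00-\x7F]+', ' '),
    (r'\s{2,}', ' '),
]:
    text = re.sub(pattern, replacement, text)

print(repr(text))  # 'Call John Smith on 07700 900000 '
```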
1074
+
1075
  def analyse_text_container(text_container:OCRResult, language:str, chosen_redact_entities:List[str], score_threshold:float, allow_list:List[str]):
1076
  '''
1077
  Take text and bounding boxes in OCRResult format and analyze it for PII using spacy and the Microsoft Presidio package.
 
1079
 
1080
  analyser_results = []
1081
 
1082
+ #text_to_analyse = initial_clean(text_container.text).strip()
1083
+
1084
+ text_to_analyse = initial_clean(text_container.text)
1085
 
1086
  if chosen_redact_entities:
1087
+ #print("Running Presidio analyze method. text_to_analyse:", text_to_analyse)
1088
+
1089
+ analyser_results = nlp_analyser.analyze(text=text_to_analyse,
1090
  language=language,
1091
  entities=chosen_redact_entities,
1092
  score_threshold=score_threshold,
1093
  return_decision_process=True,
1094
+ allow_list=allow_list)
1095
 
1096
  return analyser_results
1097
 
1098
+
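analyse_text_container delegates detection to Presidio's AnalyzerEngine.analyze (wrapped here as nlp_analyser, built around the app's custom spaCy engine). A minimal self-contained equivalent using the stock engine, with illustrative text:

```python
from presidio_analyzer import AnalyzerEngine

analyser = AnalyzerEngine()  # stock engine; the app substitutes a custom spaCy-backed one

results = analyser.analyze(
    text="My name is John Smith and my email is john@example.com",
    language="en",
    entities=["PERSON", "EMAIL_ADDRESS"],
    score_threshold=0.3,
    return_decision_process=True,  # retained for the decision-process log
    allow_list=["John Smith"],     # exact spans that should never be flagged
)
for result in results:
    print(result.entity_type, result.start, result.end, round(result.score, 2))
```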
1099
  def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tuple[List[OCRResult], List[LTChar]]:
1100
  '''
1101
  Create an OCRResult object based on a list of pdfminer LTChar objects.
 
1108
 
1109
  # Initialize variables
1110
  full_text = ""
1111
+ added_text = ""
1112
  overall_bbox = [float('inf'), float('inf'), float('-inf'), float('-inf')] # [x0, y0, x1, y1]
1113
  word_bboxes = []
1114
 
 
1122
  if isinstance(char, LTAnno):
1123
  # Handle space separately by finalizing the word
1124
  full_text += char.get_text() # Adds space or newline
1125
+
1126
  if current_word: # Only finalize if there is a current word
1127
  word_bboxes.append((current_word, current_word_bbox))
1128
  current_word = ""
 
1147
  continue
1148
 
1149
  # Concatenate text for LTChar
1150
+
1151
+
1152
+ #full_text += char.get_text()
1153
+ #added_text = re.sub(r'[^\x00-\x7F]+', ' ', char.get_text())
1154
+ added_text = char.get_text()
1155
+ if re.search(r'[^\x00-\x7F]', added_text): # Matches any non-ASCII character
1156
+ #added_text.encode('latin1', errors='replace').decode('utf-8')
1157
+ added_text = clean_unicode_text(added_text)
1158
+ full_text += added_text # Add the character text, cleaned of problem unicode where needed
1159
+
1160
+
1161
 
1162
  # Update overall bounding box
1163
  x0, y0, x1, y1 = char.bbox
 
1167
  overall_bbox[3] = max(overall_bbox[3], y1) # y1
1168
 
1169
  # Update current word
1170
+ #current_word += char.get_text()
1171
+ current_word += added_text
1172
 
1173
  # Update current word bounding box
1174
  current_word_bbox[0] = min(current_word_bbox[0], x0) # x0
 
1176
  current_word_bbox[2] = max(current_word_bbox[2], x1) # x1
1177
  current_word_bbox[3] = max(current_word_bbox[3], y1) # y1
1178
 
 
1179
  # Finalize the last word if any
1180
  if current_word:
1181
  word_bboxes.append((current_word, current_word_bbox))
1182
 
1183
  if full_text:
1184
+ #print("full_text before:", full_text)
1185
+ if re.search(r'[^\x00-\x7F]', full_text): # Matches any non-ASCII character
1186
+ # Convert special characters to a human-readable format
1187
+ #full_text = full_text.encode('latin1', errors='replace').decode('utf-8')
1188
+ full_text = clean_unicode_text(full_text)
1189
+ #print("full_text:", full_text)
1190
+
1191
  line_level_results_out.append(OCRResult(full_text, round(overall_bbox[0],2), round(overall_bbox[1], 2), round(overall_bbox[2]-overall_bbox[0],2), round(overall_bbox[3]-overall_bbox[1],2)))
1192
+
1193
+ #line_level_characters_out = character_objects_out
1194
 
1195
  return line_level_results_out, line_level_characters_out # Return both results and character objects
1196
 
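For reference, the LTChar/LTAnno objects consumed above come from pdfminer.six's layout tree, roughly as below ("example.pdf" is illustrative). LTChar carries a bbox in PDF points with a bottom-left origin, which is why the word boxes are built with min/max on both axes:

```python
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer, LTTextLine, LTChar, LTAnno

for page_layout in extract_pages("example.pdf", page_numbers=[0], maxpages=1):
    for container in page_layout:
        if isinstance(container, LTTextContainer):
            for line in container:
                if isinstance(line, LTTextLine):
                    for char in line:
                        if isinstance(char, LTChar):
                            # char.bbox is (x0, y0, x1, y1) in PDF points
                            print(repr(char.get_text()), char.bbox)
                        elif isinstance(char, LTAnno):
                            print("separator:", repr(char.get_text()))
```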
1197
+ def merge_text_bounding_boxes(analyser_results:CustomImageRecognizerResult, characters:List[LTChar], combine_pixel_dist:int=20, vertical_padding:int=0):
1198
  '''
1199
  Merge identified bounding boxes containing PII that are very close to one another
1200
  '''
 
1250
  current_box[3] = max(current_box[3], char_box[3]) # Ensure the top is the highest
1251
  current_result.end = max(current_result.end, result.end) # Extend the text range
1252
  try:
1253
+ current_result.entity_type = current_result.entity_type + " - " + result.entity_type
1254
+ except Exception as e:
1255
+ print("Unable to combine result entity types:")
1256
+ print(e)
1257
  # Add a space if current_text is not empty
1258
  if current_text:
1259
  current_text.append(" ") # Add space between texts
 
1330
  annotations_on_page.append(annotation)
1331
  return annotations_on_page
1332
 
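The merging performed by merge_text_bounding_boxes can be pictured on plain (x0, y0, x1, y1) tuples: boxes on the same line are combined when the horizontal gap between them is within combine_pixel_dist. A simplified sketch of the idea (the real function also tracks Presidio results, entity types, and character data):

```python
def merge_close_boxes(boxes, combine_pixel_dist=20):
    """Merge boxes whose horizontal gap is small, assuming same-line boxes."""
    merged = []
    for box in sorted(boxes, key=lambda b: b[0]):
        if merged and box[0] - merged[-1][2] <= combine_pixel_dist:
            last = merged[-1]
            # Grow the previous box to cover this one
            merged[-1] = (last[0], min(last[1], box[1]),
                          max(last[2], box[2]), max(last[3], box[3]))
        else:
            merged.append(box)
    return merged

print(merge_close_boxes([(10, 5, 50, 15), (55, 5, 90, 15), (200, 5, 240, 15)]))
# [(10, 5, 90, 15), (200, 5, 240, 15)]
```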
1333
+ def redact_text_pdf(
1334
+ filename: str, # Path to the PDF file to be redacted
1335
+ prepared_pdf_image_path: List, # List of prepared PDF page images used for the review annotations
1336
+ language: str, # Language of the PDF content
1337
+ chosen_redact_entities: List[str], # List of entities to be redacted
1338
+ allow_list: List[str] = None, # Optional list of allowed entities
1339
+ page_min: int = 0, # Minimum page number to start redaction
1340
+ page_max: int = 999, # Maximum page number to end redaction
1341
+ analysis_type: str = "Simple text analysis - PDFs with selectable text", # Type of analysis to perform
1342
+ current_loop_page: int = 0, # Current page being processed in the loop
1343
+ page_break_return: bool = False, # Flag to indicate if a page break should be returned
1344
+ annotations_all_pages: List = [], # List of annotations across all pages
1345
+ all_line_level_ocr_results_df: pd.DataFrame = pd.DataFrame(), # DataFrame for OCR results
1346
+ all_decision_process_table: pd.DataFrame = pd.DataFrame(), # DataFrame for decision process table
1347
+ pymupdf_doc: List = [], # List of PyMuPDF documents
1348
+ page_break_val: int = int(page_break_value), # Value for page break
1349
+ max_time: int = int(max_time_value),
1350
+ progress: Progress = Progress(track_tqdm=True) # Progress tracking object
1351
+ ):
1352
+
1353
  '''
1354
+ Redact chosen entities from a multi-page PDF that contains selectable text (i.e. is not image-based).
 
 
 
1355
 
1356
+ Input Variables:
1357
+ - filename: Path to the PDF file to be redacted
1358
+ - prepared_pdf_image_path: List of prepared PDF page images used for the review annotations
1359
+ - language: Language of the PDF content
1360
+ - chosen_redact_entities: List of entities to be redacted
1361
+ - allow_list: Optional list of allowed entities
1362
+ - page_min: Minimum page number to start redaction
1363
+ - page_max: Maximum page number to end redaction
1364
+ - analysis_type: Type of analysis to perform
1365
+ - current_loop_page: Current page being processed in the loop
1366
+ - page_break_return: Flag to indicate if a page break should be returned
1367
+ - images: List of images (not used in this function)
1368
+ - annotations_all_pages: List of annotations across all pages
1369
+ - all_line_level_ocr_results_df: DataFrame for OCR results
1370
+ - all_decision_process_table: DataFrame for decision process table
1371
+ - pymupdf_doc: List of PyMuPDF documents
1372
+ - page_break_val: Value for page break
1373
+ - max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
1374
+ - progress: Progress tracking object
1375
+ '''
1376
+
1377
+ tic = time.perf_counter()
1378
 
1379
  # Open with Pikepdf to get text lines
1380
  pikepdf_pdf = Pdf.open(filename)
1381
  number_of_pages = len(pikepdf_pdf.pages)
 
 
 
1382
 
 
 
1383
  # Check that page_min and page_max are within expected ranges
1384
  if page_max > number_of_pages or page_max == 0:
1385
  page_max = number_of_pages
 
 
1386
 
1387
  if page_min <= 0: page_min = 0
1388
  else: page_min = page_min - 1
1389
 
1390
  print("Page range is",str(page_min + 1), "to", str(page_max))
1391
+ print("Current_loop_page:", current_loop_page)
1392
+
1393
+ if current_loop_page == 0: page_loop_start = 0
1394
+ else: page_loop_start = current_loop_page
1395
+
1396
+ #progress_bar = progress.tqdm(range(current_loop_page, number_of_pages), unit="pages", desc="Redacting pages")
1397
+ progress_bar = tqdm(range(current_loop_page, number_of_pages), unit="pages remaining", desc="Redacting pages")
1398
 
1399
  #for page_no in range(0, number_of_pages):
1400
+ for page_no in progress_bar:
 
 
 
 
1401
 
1402
+ reported_page_number = str(page_no + 1)
1403
+ print("Redacting page:", reported_page_number)
1404
 
1405
+ # Assuming prepared_pdf_image_path[page_no] is a PIL image object
1406
+ try:
1407
+ image = prepared_pdf_image_path[page_no]#.copy()
1408
+ #print("image:", image)
1409
+ except Exception as e:
1410
+ print("Could not redact page:", reported_page_number, "due to:")
1411
+ print(e)
1412
+ continue
1413
+
1414
+ image_annotations = {"image": image, "boxes": []}
1415
  pymupdf_page = pymupdf_doc.load_page(page_no)
1416
 
1417
+ #print("pymupdf page loaded")
1418
+
1419
+ #print("Page number is:", str(page_no + 1))
1420
 
1421
  if page_min <= page_no < page_max:
1422
 
1423
+ #print("Page is in range of pages to redact")
1424
+
1425
  for page_layout in extract_pages(filename, page_numbers = [page_no], maxpages=1):
1426
 
1427
  page_analyser_results = []
 
1433
  page_text_outputs = pd.DataFrame()
1434
 
1435
  if analysis_type == "Simple text analysis - PDFs with selectable text":
1436
+ for n, text_container in enumerate(page_layout):
1437
 
1438
  text_container_analyser_results = []
1439
  text_container_analysed_bounding_boxes = []
1440
+ characters = []
1441
 
1442
+ if isinstance(text_container, LTTextContainer) or isinstance(text_container, LTAnno):
1443
+ characters = get_text_container_characters(text_container)
1444
 
1445
  # Create dataframe for all the text on the page
1446
  line_level_text_results_list, line_characters = create_text_bounding_boxes_from_characters(characters)
1447
 
 
 
1448
  # Create page_text_outputs (OCR format outputs)
1449
  if line_level_text_results_list:
1450
  # Convert to DataFrame and add to ongoing logging table
 
1461
 
1462
  # Analyse each line of text in turn for PII and add to list
1463
  for i, text_line in enumerate(line_level_text_results_list):
1464
+
1465
  text_line_analyser_result = []
1466
  text_line_bounding_boxes = []
1467
 
 
 
1468
  text_line_analyser_result = analyse_text_container(text_line, language, chosen_redact_entities, score_threshold, allow_list)
1469
 
1470
  # Merge bounding boxes for the line if multiple found close together
1471
  if text_line_analyser_result:
1472
+
1473
+ #print("Analysed text container, now merging bounding boxes")
1474
+
1475
  # Merge bounding boxes if very close together
1476
+ text_line_bounding_boxes = merge_text_bounding_boxes(text_line_analyser_result, line_characters[i])
1477
+
1478
+ #print("merged bounding boxes")
 
 
1479
 
1480
  text_container_analyser_results.extend(text_line_analyser_result)
1481
  text_container_analysed_bounding_boxes.extend(text_line_bounding_boxes)
1482
+
 
 
 
1483
  page_analyser_results.extend(text_container_analyser_results)
1484
  page_analysed_bounding_boxes.extend(text_container_analysed_bounding_boxes)
1485
 
1486
  # Annotate redactions on page
1487
  annotations_on_page = create_annotations_for_bounding_boxes(page_analysed_bounding_boxes)
 
 
 
 
 
1488
 
1489
+ # Make pymupdf page redactions
1490
  pymupdf_page, image_annotations = redact_page_with_pymupdf(pymupdf_page, annotations_on_page, image)
1491
 
1492
+ #print("Did redact_page_with_pymupdf function")
1493
 
1494
  print("For page number:", page_no, "there are", len(image_annotations["boxes"]), "annotations")
1495
 
1496
  # Write logs
1497
  # Create decision process table
1498
+ decision_process_table_on_page = create_text_redaction_process_results(page_analyser_results, page_analysed_bounding_boxes, current_loop_page)
1499
 
1500
  if not decision_process_table_on_page.empty:
1501
+ all_decision_process_table = pd.concat([all_decision_process_table, decision_process_table_on_page])
1502
 
1503
  if not page_text_outputs.empty:
1504
  page_text_outputs = page_text_outputs.sort_values(["top", "left"], ascending=[False, False]).reset_index(drop=True)
1505
+ all_line_level_ocr_results_df = pd.concat([all_line_level_ocr_results_df, page_text_outputs])
1506
+
1507
+ toc = time.perf_counter()
1508
+
1509
+ time_taken = toc - tic
1510
+
1511
+ #print("toc - tic:", time_taken)
1512
+
1513
+ # Break if time taken is greater than max_time seconds
1514
+ if time_taken > max_time:
1515
+ print("Processing for", max_time, "seconds, breaking.")
1516
+ page_break_return = True
1517
+ progress.close(_tqdm=progress_bar)
1518
+ tqdm._instances.clear()
1519
+
1520
+ annotations_all_pages.append(image_annotations)
1521
 
1522
+ current_loop_page += 1
1523
 
1524
+ return pymupdf_doc, all_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return
1525
+
1526
+
1527
+ annotations_all_pages.append(image_annotations)
1528
+
1529
+ current_loop_page += 1
1530
+
1531
+ # Break if new page is a multiple of chosen page_break_val
1532
+ if current_loop_page % page_break_val == 0:
1533
+ page_break_return = True
1534
+ progress.close(_tqdm=progress_bar)
1535
+
1536
+ return pymupdf_doc, all_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return
1537
+
1538
 
1539
+ return pymupdf_doc, all_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return
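Because redact_text_pdf can return early, whoever calls it has to feed current_loop_page (and the accumulated annotations) back in until page_break_return comes back False. A plain-Python driver sketch of that contract; the real app does the equivalent with chained Gradio events, and the argument plumbing here is simplified:

```python
def run_text_redaction_to_completion(filename, images, **kwargs):
    current_loop_page, page_break = 0, True
    annotations = []
    while page_break:
        (doc, decisions, ocr_df, annotations,
         current_loop_page, page_break) = redact_text_pdf(
            filename, images,
            current_loop_page=current_loop_page,
            annotations_all_pages=annotations,
            **kwargs)
    return doc, decisions, ocr_df, annotations
```

Note that tic is reset on every call, so max_time bounds each invocation rather than the total run.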
tools/helper_functions.py CHANGED
@@ -1,6 +1,8 @@
1
  import os
 
2
  import gradio as gr
3
  import pandas as pd
 
4
 
5
  def get_or_create_env_var(var_name, default_value):
6
  # Get the environment variable if it exists
@@ -166,7 +168,7 @@ def add_folder_to_path(folder_path: str):
166
 
167
  # Upon running a process, the feedback buttons are revealed
168
  def reveal_feedback_buttons():
169
- return gr.Radio(visible=True), gr.Textbox(visible=True), gr.Button(visible=True), gr.Markdown(visible=True)
170
 
171
  def wipe_logs(feedback_logs_loc, usage_logs_loc):
172
  try:
@@ -238,4 +240,26 @@ async def get_connection_params(request: gr.Request):
238
  return out_session_hash, output_folder, out_session_hash
239
  else:
240
  print("No session parameters found.")
241
- return "",""
1
  import os
2
+ import re
3
  import gradio as gr
4
  import pandas as pd
5
+ import unicodedata
6
 
7
  def get_or_create_env_var(var_name, default_value):
8
  # Get the environment variable if it exists
 
168
 
169
  # Upon running a process, the feedback buttons are revealed
170
  def reveal_feedback_buttons():
171
+ return gr.Radio(visible=True, label="Please give some feedback about the results of the redaction. A reminder that the app is only expected to identify about 60% of personally identifiable information in a given (typed) document."), gr.Textbox(visible=True), gr.Button(visible=True), gr.Markdown(visible=True)
172
 
173
  def wipe_logs(feedback_logs_loc, usage_logs_loc):
174
  try:
 
240
  return out_session_hash, output_folder, out_session_hash
241
  else:
242
  print("No session parameters found.")
243
+ return "",""
244
+
245
+
246
+ def clean_unicode_text(text):
247
+ # Step 1: Normalize unicode characters to fold compatibility forms (e.g. ligatures, fullwidth letters) into standard equivalents
248
+ normalized_text = unicodedata.normalize('NFKC', text)
249
+
250
+ # Step 2: Replace smart quotes and special punctuation with standard ASCII equivalents
251
+ replacements = {
252
+ '‘': "'", '’': "'", '“': '"', '”': '"',
253
+ '–': '-', '—': '-', '…': '...', '•': '*',
254
+ }
255
+
256
+ # Perform replacements
257
+ for old_char, new_char in replacements.items():
258
+ normalized_text = normalized_text.replace(old_char, new_char)
259
+
260
+ # Step 3: Optionally remove non-ASCII characters if needed
261
+ # This regex removes any remaining non-ASCII characters, if desired.
262
+ # Comment out the line below to keep all remaining Unicode characters.
263
+ cleaned_text = re.sub(r'[^\x00-\x7F]+', '', normalized_text)
264
+
265
+ return cleaned_text
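A quick look at what clean_unicode_text (defined above) does to typical 'smart' punctuation. Note the last step is lossy: characters with no listed replacement, such as accented letters, are dropped outright rather than transliterated:

```python
# NFKC first folds compatibility forms (e.g. the ligature 'ﬁ' becomes 'fi'),
# then the replacement table maps smart quotes/dashes/ellipses to ASCII,
# and the final regex deletes anything still outside the ASCII range.
print(clean_unicode_text("“Smart” quotes, an em—dash, café…"))
# "Smart" quotes, an em-dash, caf...
```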
tools/load_spacy_model_custom_recognisers.py CHANGED
@@ -141,7 +141,7 @@ class LoadedSpacyNlpEngine(SpacyNlpEngine):
141
  self.nlp = {"en": loaded_spacy_model}
142
 
143
  # %%
144
- # Load spacy model
145
  try:
146
  import en_core_web_lg
147
  nlp = en_core_web_lg.load()
@@ -151,6 +151,16 @@ except:
151
  download("en_core_web_lg")
152
  nlp = spacy.load("en_core_web_lg")
153
  print("Successfully downloaded and imported spaCy model")
154
 
155
  # Pass the loaded model to the new LoadedSpacyNlpEngine
156
  loaded_nlp_engine = LoadedSpacyNlpEngine(loaded_spacy_model = nlp)
 
141
  self.nlp = {"en": loaded_spacy_model}
142
 
143
  # %%
144
+ #Load spacy model
145
  try:
146
  import en_core_web_lg
147
  nlp = en_core_web_lg.load()
 
151
  download("en_core_web_lg")
152
  nlp = spacy.load("en_core_web_lg")
153
  print("Successfully downloaded and imported spaCy model")
154
+
155
+ # try:
156
+ # import en_core_web_sm
157
+ # nlp = en_core_web_sm.load()
158
+ # print("Successfully imported spaCy model")
159
+
160
+ # except:
161
+ # download("en_core_web_sm")
162
+ # nlp = spacy.load("en_core_web_sm")
163
+ # print("Successfully downloaded and imported spaCy model")
164
 
165
  # Pass the loaded model to the new LoadedSpacyNlpEngine
166
  loaded_nlp_engine = LoadedSpacyNlpEngine(loaded_spacy_model = nlp)
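The try/except above is the usual "import the packaged model, fall back to downloading it" pattern for spaCy. An equivalent standalone form; catching ImportError specifically (rather than a bare except) avoids masking unrelated failures inside the load:

```python
import spacy
from spacy.cli import download

try:
    import en_core_web_lg          # installed as a pip package
    nlp = en_core_web_lg.load()
except ImportError:
    download("en_core_web_lg")     # one-off download, then load by name
    nlp = spacy.load("en_core_web_lg")
```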
tools/redaction_review.py CHANGED
@@ -18,9 +18,9 @@ def decrease_page(number:int):
18
  '''
19
  #print("number:", str(number))
20
  if number > 1:
21
- return number - 1
22
  else:
23
- return 1
24
 
25
  def increase_page(number:int, image_annotator_object:AnnotatedImageData):
26
  '''
@@ -28,14 +28,14 @@ def increase_page(number:int, image_annotator_object:AnnotatedImageData):
28
  '''
29
 
30
  if not image_annotator_object:
31
- return 1
32
 
33
  max_pages = len(image_annotator_object)
34
 
35
  if number < max_pages:
36
- return number + 1
37
  else:
38
- return max_pages
39
 
40
  def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int):
41
  # print("\nImage annotator object:", image_annotator_object)
@@ -51,7 +51,7 @@ def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int):
51
  show_share_button=False,
52
  show_remove_button=False,
53
  interactive=False
54
- ), gr.Number(label = "Current page (select page number then press enter)", value=1, precision=0)
55
 
56
  if page_num is None:
57
  page_num = 0
@@ -89,9 +89,9 @@ def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int):
89
  interactive=True
90
  )
91
 
92
- number_reported = gr.Number(label = "Current page (select page number then press enter)", value=page_num_reported, precision=0)
93
 
94
- return out_image_annotator, number_reported
95
 
96
  def modify_existing_page_redactions(image_annotated:AnnotatedImageData, current_page:int, previous_page:int, all_image_annotations:List[AnnotatedImageData]):
97
  '''
@@ -99,7 +99,7 @@ def modify_existing_page_redactions(image_annotated:AnnotatedImageData, current_
99
  '''
100
  #If no previous page or is 0, i.e. first time run, then make no changes
101
  if not previous_page:
102
- return all_image_annotations, current_page
103
 
104
  if not current_page:
105
  current_page = 1
@@ -114,7 +114,7 @@ def modify_existing_page_redactions(image_annotated:AnnotatedImageData, current_
114
 
115
  #print("all_image_annotations after:",all_image_annotations)
116
 
117
- return all_image_annotations, current_page
118
 
119
  def apply_redactions(image_annotated:AnnotatedImageData, file_paths:str, doc:Document, all_image_annotations:List[AnnotatedImageData], current_page:int, progress=gr.Progress(track_tqdm=True)):
120
  '''
@@ -132,7 +132,11 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:str, doc:Doc
132
  print("No image annotations found")
133
  return doc, all_image_annotations
134
 
135
- file_path = file_paths[-1].name
136
  print("file_path:", file_path)
137
  file_base = get_file_path_end(file_path)
138
 
 
18
  '''
19
  #print("number:", str(number))
20
  if number > 1:
21
+ return number - 1, number - 1
22
  else:
23
+ return 1, 1
24
 
25
  def increase_page(number:int, image_annotator_object:AnnotatedImageData):
26
  '''
 
28
  '''
29
 
30
  if not image_annotator_object:
31
+ return 1, 1
32
 
33
  max_pages = len(image_annotator_object)
34
 
35
  if number < max_pages:
36
+ return number + 1, number + 1
37
  else:
38
+ return max_pages, max_pages
39
 
40
  def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int):
41
  # print("\nImage annotator object:", image_annotator_object)
 
51
  show_share_button=False,
52
  show_remove_button=False,
53
  interactive=False
54
+ ), gr.Number(label = "Page (press enter to change)", value=1, precision=0)
55
 
56
  if page_num is None:
57
  page_num = 0
 
89
  interactive=True
90
  )
91
 
92
+ number_reported = gr.Number(label = "Page (press enter to change)", value=page_num_reported, precision=0)
93
 
94
+ return out_image_annotator, number_reported, number_reported
95
 
96
  def modify_existing_page_redactions(image_annotated:AnnotatedImageData, current_page:int, previous_page:int, all_image_annotations:List[AnnotatedImageData]):
97
  '''
 
99
  '''
100
  #If no previous page or is 0, i.e. first time run, then make no changes
101
  if not previous_page:
102
+ return all_image_annotations, current_page, current_page
103
 
104
  if not current_page:
105
  current_page = 1
 
114
 
115
  #print("all_image_annotations after:",all_image_annotations)
116
 
117
+ return all_image_annotations, current_page, current_page
118
 
119
  def apply_redactions(image_annotated:AnnotatedImageData, file_paths:str, doc:Document, all_image_annotations:List[AnnotatedImageData], current_page:int, progress=gr.Progress(track_tqdm=True)):
120
  '''
 
132
  print("No image annotations found")
133
  return doc, all_image_annotations
134
 
135
+ if isinstance(file_paths, list):
136
+ file_path = file_paths[-1].name
137
+ else:
138
+ file_path = file_paths
139
+
140
  print("file_path:", file_path)
141
  file_base = get_file_path_end(file_path)
142
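The paginator functions above now return the page number twice because each event updates two components at once: the visible page gr.Number and a second copy used to track the previously viewed page for modify_existing_page_redactions. A hedged wiring sketch; the tracker component name here is an assumption, not necessarily what the app uses:

```python
import gradio as gr

with gr.Blocks() as demo:
    annotate_current_page = gr.Number(value=1, precision=0, label="Page (press enter to change)")
    annotate_previous_page = gr.Number(value=1, precision=0, visible=False)  # assumed tracker

    prev_btn = gr.Button("Previous page")

    def decrease_page(number):
        # Same value twice: once for the visible number, once for the hidden tracker
        new = number - 1 if number > 1 else 1
        return new, new

    prev_btn.click(decrease_page,
                   inputs=[annotate_current_page],
                   outputs=[annotate_current_page, annotate_previous_page])

# demo.launch()
```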