seanpedrickcase committed commit eea5c07 (parent: 21d060c)

Allowed for time limits on redaction to avoid timeouts. Improved the review interface. The app now accepts only one file at a time. Upgraded the Gradio version.

Files changed:
- app.py +77 -40
- requirements.txt +8 -3
- tools/aws_functions.py +21 -17
- tools/aws_textract.py +46 -34
- tools/custom_image_analyser_engine.py +3 -2
- tools/file_conversion.py +92 -29
- tools/file_redaction.py +538 -225
- tools/helper_functions.py +26 -2
- tools/load_spacy_model_custom_recognisers.py +11 -1
- tools/redaction_review.py +15 -11
app.py CHANGED
@@ -43,29 +43,38 @@ with app:
     ###
     # STATE VARIABLES
     ###
-    …
+
+    pdf_doc_state = gr.State([])
+    all_image_annotations_state = gr.State([])
+    all_line_level_ocr_results_df_state = gr.State(pd.DataFrame())
+    all_decision_process_table_state = gr.State(pd.DataFrame())
+
+    def reset_state_vars():
+        return [], [], pd.DataFrame(), pd.DataFrame()
+
 
     in_allow_list_state = gr.State(pd.DataFrame())
 
     session_hash_state = gr.State()
     s3_output_folder_state = gr.State()
 
-    …
+    first_loop_state = gr.State(True)
+    second_loop_state = gr.State(False)
+
+    prepared_pdf_state = gr.State([])
     images_pdf_state = gr.State([]) # List of pdf pages converted to PIL images
-    …
+
+    output_image_files_state = gr.State([])
+    output_file_list_state = gr.State([])
+    text_output_file_list_state = gr.State([])
+    log_files_output_list_state = gr.State([])
 
     # Logging state
-    feedback_logs_state = gr.State(feedback_logs_folder + 'log.csv')
+    feedback_logs_state = gr.State(feedback_logs_folder + 'dataset1.csv') #'log.csv')
     feedback_s3_logs_loc_state = gr.State(feedback_logs_folder)
-    access_logs_state = gr.State(access_logs_folder + 'log.csv')
+    access_logs_state = gr.State(access_logs_folder + 'dataset1.csv') #'log.csv')
     access_s3_logs_loc_state = gr.State(access_logs_folder)
-    usage_logs_state = gr.State(usage_logs_folder + 'log.csv')
+    usage_logs_state = gr.State(usage_logs_folder + 'dataset1.csv') #'log.csv')
     usage_s3_logs_loc_state = gr.State(usage_logs_folder)
 
     # Invisible elements effectively used as state variables
@@ -93,21 +102,23 @@
 
     NOTE: In testing the app seems to find about 60% of personal information on a given (typed) page of text. It is essential that all outputs are checked **by a human** to ensure that all personal information has been removed.
 
-    This app accepts a maximum file size of …
+    This app accepts a maximum file size of 100mb. Please consider giving feedback for the quality of the answers underneath the redact buttons when the option appears, this will help to improve the app.
     """)
 
     # PDF / IMAGES TAB
     with gr.Tab("PDFs/images"):
         with gr.Accordion("Redact document", open = True):
-            in_doc_files = gr.File(label="Choose document…
+            in_doc_files = gr.File(label="Choose a document or image file (PDF, JPG, PNG)", file_count= "single", file_types=['.pdf', '.jpg', '.png', '.json'])
             in_redaction_method = gr.Radio(label="Choose document redaction method. AWS Textract has a cost per page so please only use when needed.", value = "Simple text analysis - PDFs with selectable text", choices=["Simple text analysis - PDFs with selectable text", "Quick image analysis - typed text", "Complex image analysis - docs with handwriting/signatures (AWS Textract)"])
             gr.Markdown("""If you only want to redact certain pages, or certain entities (e.g. just email addresses), please go to the redaction settings tab.""")
             document_redact_btn = gr.Button("Redact document(s)", variant="primary")
+            current_loop_page_number = gr.Number(value=0,precision=0, interactive=False, label = "Last redacted page in document", visible=False)
+            page_break_return = gr.Checkbox(value = False, label="Page break reached", visible=False)
 
         with gr.Row():
             output_summary = gr.Textbox(label="Output summary")
             output_file = gr.File(label="Output files")
-
+            latest_file_completed_text = gr.Number(value=0, label="Number of documents redacted", interactive=False, visible=False)
 
         with gr.Row():
             convert_text_pdf_to_img_btn = gr.Button(value="Convert pdf to image-based pdf to apply redactions", variant="secondary", visible=False)
@@ -122,10 +133,10 @@
     with gr.Tab("Review redactions", id="tab_object_annotation"):
 
         with gr.Row():
-            annotation_last_page_button = gr.Button("Previous page")
-            annotate_current_page = gr.Number(value=1, label="…
-            …
-            annotation_next_page_button = gr.Button("Next page")
+            annotation_last_page_button = gr.Button("Previous page", scale = 3)
+            annotate_current_page = gr.Number(value=1, label="Page (press enter to change)", precision=0, scale = 2)
+            annotate_max_pages = gr.Number(value=1, label="Total pages", precision=0, interactive=False, scale = 1)
+            annotation_next_page_button = gr.Button("Next page", scale = 3)
 
         annotation_button_apply = gr.Button("Apply revised redactions", variant="primary")
 
@@ -141,6 +152,12 @@
             interactive=False
         )
 
+        with gr.Row():
+            annotation_last_page_button_bottom = gr.Button("Previous page", scale = 3)
+            annotate_current_page_bottom = gr.Number(value=1, label="Page (press enter to change)", precision=0, interactive=True, scale = 2)
+            annotate_max_pages_bottom = gr.Number(value=1, label="Total pages", precision=0, interactive=False, scale = 1)
+            annotation_next_page_button_bottom = gr.Button("Next page", scale = 3)
+
         output_review_files = gr.File(label="Review output files")
 
     # TEXT / TABULAR DATA TAB
@@ -169,7 +186,7 @@
         # Feedback elements are invisible until revealed by redaction action
         data_feedback_title = gr.Markdown(value="## Please give feedback", visible=False)
         data_feedback_radio = gr.Radio(label="Please give some feedback about the results of the redaction. A reminder that the app is only expected to identify about 60% of personally identifiable information in a given (typed) document.",
-            choices=["The results were good", "The results were not good"], visible=False)
+            choices=["The results were good", "The results were not good"], visible=False, show_label=True)
         data_further_details_text = gr.Textbox(label="Please give more detailed feedback about the results:", visible=False)
         data_submit_feedback_btn = gr.Button(value="Submit feedback", visible=False)
 
@@ -202,36 +219,56 @@
 
     # If a custom allow list is uploaded
     in_allow_list.upload(fn=custom_regex_load, inputs=[in_allow_list], outputs=[in_allow_list_text, in_allow_list_state])
 
     ###
     # PDF/IMAGE REDACTION
     ###
     in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_textbox, doc_file_name_with_extension_textbox])
 
-    document_redact_btn.click(fn = …
-        then(fn = …
-    …
 
-    # If the …
-    …
-        outputs=[output_summary, output_file, output_file_list_state, text_documents_done, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state]).\
-        then(fn=update_annotator, inputs=[all_image_annotations_state, page_min], outputs=[annotator, annotate_current_page]).\
-        then(fn = reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title])
 
-    …
 
-    …
-        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page], outputs = [annotator, annotate_current_page])
+    document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state]).\
+        then(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, first_loop_state, annotate_max_pages, current_loop_page_number], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state], api_name="prepare_doc").\
+        then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return],
+            outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state], api_name="redact_doc")#.\
+        #then(fn=update_annotator, inputs=[all_image_annotations_state, page_min], outputs=[annotator, annotate_current_page])
 
+    # If the app has completed a batch of pages, it will run this until the end of all pages in the document
+    current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return],
+        outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state])
 
+    # If a file has been completed, the function will continue onto the next document
+    latest_file_completed_text.change(fn=update_annotator, inputs=[all_image_annotations_state, page_min], outputs=[annotator, annotate_current_page, annotate_current_page_bottom]).\
+        then(fn=reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title])
+    # latest_file_completed_text.change(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, current_loop_page_number], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state]).\
+    #     then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return],
+    #         outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state]).\
+    #     then(fn=update_annotator, inputs=[all_image_annotations_state, page_min], outputs=[annotator, annotate_current_page]).\
+    #     then(fn=reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title])
+
+    ### REVIEW REDACTIONS
+
+    # Page controls at top
+    annotate_current_page.submit(
+        modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
+        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
+
+    annotation_last_page_button.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
+        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
+    annotation_next_page_button.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
+        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
 
     #annotation_button_get.click(get_boxes_json, annotator, json_boxes)
     annotation_button_apply.click(apply_redactions, inputs=[annotator, in_doc_files, pdf_doc_state, all_image_annotations_state, annotate_current_page], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files], scroll_to_output=True)
 
+    # Page controls at bottom
+    annotate_current_page_bottom.submit(
+        modify_existing_page_redactions, inputs = [annotator, annotate_current_page_bottom, annotate_previous_page, all_image_annotations_state], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page]).\
+        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
+
+    annotation_last_page_button_bottom.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
+        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
+    annotation_next_page_button_bottom.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
+        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
+
     ###
     # TABULAR DATA REDACTION
     ###
@@ -281,9 +318,9 @@ print(f'The value of COGNITO_AUTH is {COGNITO_AUTH}')
 
 if __name__ == "__main__":
     if os.environ['COGNITO_AUTH'] == "1":
-        app.queue().launch(show_error=True, auth=authenticate_user, max_file_size='…
+        app.queue().launch(show_error=True, auth=authenticate_user, max_file_size='100mb')
     else:
-        app.queue().launch(show_error=True, inbrowser=True, max_file_size='…
+        app.queue().launch(show_error=True, inbrowser=True, max_file_size='100mb')
 
 
 # AWS options - placeholder for possibility of storing data on s3 and retrieving it in app
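The hidden current_loop_page_number / page_break_return components wired up above are what implement the commit's time-limit behaviour: choose_and_run_redactor stops after a batch of pages, writes the last processed page into a hidden gr.Number, and that component's .change event re-invokes the same function until the document is done. Below is a minimal sketch of this pattern, not the app's actual code; the page total and batch size are made-up stand-ins:

import gradio as gr

PAGES_PER_BATCH = 5
TOTAL_PAGES = 23  # assumption: stand-in for the real document length

def redact_batch(last_page):
    # Pretend to redact the next batch of pages, then hand control back to Gradio
    if last_page >= TOTAL_PAGES:
        return "Redaction complete", last_page
    next_page = min(int(last_page) + PAGES_PER_BATCH, TOTAL_PAGES)
    return f"Redacted pages {int(last_page) + 1}-{next_page} of {TOTAL_PAGES}", next_page

with gr.Blocks() as demo:
    status = gr.Textbox(label="Status")
    last_page = gr.Number(value=0, precision=0, visible=False)
    gr.Button("Redact").click(redact_batch, inputs=[last_page], outputs=[status, last_page])
    # Each finished batch bumps last_page; .change re-fires only while the value
    # actually changes, so the chain stops once the final batch returns the same number.
    last_page.change(redact_batch, inputs=[last_page], outputs=[status, last_page])

demo.launch()

Because each batch is a separate event invocation, no single request runs long enough to hit a server or proxy timeout, which is the point of the change.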
requirements.txt CHANGED
@@ -7,11 +7,16 @@ presidio_anonymizer==2.2.355
 presidio-image-redactor==0.0.53
 pikepdf==8.15.1
 pandas==2.2.3
-spacy==3.…
+spacy==3.7.5
 en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
-…
-…
+#en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
+gradio==5.4.0
+boto3==1.35.54
 pyarrow==17.0.0
 openpyxl==3.1.2
 Faker==22.2.0
 gradio_image_annotation==0.2.3
+numpy==1.26.4
tools/aws_functions.py CHANGED
@@ -181,23 +181,27 @@ def upload_file_to_s3(local_file_paths:List[str], s3_key:str, s3_bucket:str=buck
         local_file_paths = [local_file_paths]
 
     for file in local_file_paths:
-        …
+        if s3_client:
+            #print(s3_client)
+            try:
+                # Get file name off file path
+                file_name = os.path.basename(file)
+
+                s3_key_full = s3_key + file_name
+                print("S3 key: ", s3_key_full)
+
+                s3_client.upload_file(file, s3_bucket, s3_key_full)
+                out_message = "File " + file_name + " uploaded successfully!"
+                print(out_message)
+
+            except Exception as e:
+                out_message = f"Error uploading file(s): {e}"
+                print(out_message)
+
+            final_out_message.append(out_message)
+            final_out_message_str = '\n'.join(final_out_message)
+
+        else: final_out_message_str = "Could not connect to AWS."
 
     return final_out_message_str
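The rewritten loop assumes an s3_client object that is falsy when AWS is unreachable, which is what the else branch above relies on. A plausible way to construct such a guarded client, sketched under that assumption (this code is not shown anywhere in the diff):

import boto3
from botocore.exceptions import ClientError, NoCredentialsError

try:
    s3_client = boto3.client("s3")
    s3_client.list_buckets()  # Cheap call to verify the credentials actually work
except (ClientError, NoCredentialsError) as e:
    print(f"Could not connect to AWS: {e}")
    s3_client = None  # upload_file_to_s3 then falls through to its else branch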
tools/aws_textract.py CHANGED
@@ -23,7 +23,7 @@ def extract_textract_metadata(response):
         #'NumberOfPages': number_of_pages
         })
 
-def analyse_page_with_textract(pdf_page_bytes, …
+def analyse_page_with_textract(pdf_page_bytes, page_no):
     '''
     Analyse page with AWS Textract
     '''
@@ -31,28 +31,22 @@ def analyse_page_with_textract(pdf_page_bytes, json_file_path):
         client = boto3.client('textract')
     except:
         print("Cannot connect to AWS Textract")
-        return
+        return [], "" # Return an empty list and an empty string
 
     print("Analysing page with AWS Textract")
 
-    # Convert the image to bytes using an in-memory buffer
-    #image_buffer = io.BytesIO()
-    #image.save(image_buffer, format='PNG') # Save as PNG, or adjust format if needed
-    #image_bytes = image_buffer.getvalue()
-
-    #response = client.detect_document_text(Document={'Bytes': image_bytes})
     response = client.analyze_document(Document={'Bytes': pdf_page_bytes}, FeatureTypes=["SIGNATURES"])
 
+    # Wrap the response with the page number in the desired format
+    wrapped_response = {
+        'page_no': page_no,
+        'data': response
+    }
 
-    # …
-    with open(json_file_path, 'w') as json_file:
-        json.dump(response, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
+    request_metadata = extract_textract_metadata(response) # Metadata comes out as a string
 
-    …
-    return text_blocks, request_metadata
+    # Return a list containing the wrapped response and the metadata
+    return wrapped_response, request_metadata # Return as a list to match the desired structure
 
 
 def convert_pike_pdf_page_to_bytes(pdf, page_num):
@@ -81,7 +75,7 @@ def convert_pike_pdf_page_to_bytes(pdf, page_num):
     return pdf_bytes
 
 
-def json_to_ocrresult(json_data, page_width, page_height):
+def json_to_ocrresult(json_data, page_width, page_height, page_no):
     '''
     Convert the json response from textract to the OCRResult format used elsewhere in the code. Looks for lines, words, and signatures. Handwriting and signatures are set aside especially for later in case the user wants to override the default behaviour and redact all handwriting/signatures.
     '''
@@ -92,16 +86,27 @@ def json_to_ocrresult(json_data, page_width, page_height):
     signatures = []
     handwriting = []
     ocr_results_with_children = {}
+    text_block={}
 
     i = 1
 
-    …
+    # Assuming json_data is structured as a dictionary with a "pages" key
+    #if "pages" in json_data:
+    # Find the specific page data
+    page_json_data = json_data #next((page for page in json_data["pages"] if page["page_no"] == page_no), None)
+
+    if "Blocks" in page_json_data:
+        # Access the data for the specific page
+        text_blocks = page_json_data["Blocks"] # Access the Blocks within the page data
+    # This is a new page
+    elif "page_no" in page_json_data:
+        text_blocks = page_json_data["data"]["Blocks"]
 
-    …
+    is_signature = False
+    is_handwriting = False
 
-    …
+    for text_block in text_blocks:
+
         if (text_block['BlockType'] == 'LINE') | (text_block['BlockType'] == 'SIGNATURE'): # (text_block['BlockType'] == 'WORD') |
 
             # Extract text and bounding box for the line
@@ -124,7 +129,7 @@ def json_to_ocrresult(json_data, page_width, page_height):
             for relationship in text_block['Relationships']:
                 if relationship['Type'] == 'CHILD':
                     for child_id in relationship['Ids']:
-                        child_block = next((block for block in …
+                        child_block = next((block for block in text_blocks if block['Id'] == child_id), None)
                         if child_block and child_block['BlockType'] == 'WORD':
                             word_text = child_block.get('Text', '')
                             word_bbox = child_block["Geometry"]["BoundingBox"]
@@ -156,9 +161,9 @@
 
             recogniser_result = CustomImageRecognizerResult(entity_type=entity_name, text= word_text, score= confidence, start=0, end=word_end, left=word_left, top=word_top, width=word_width_abs, height=word_height_abs)
 
-            handwriting…
-            …
-            …
+            if recogniser_result not in handwriting:
+                handwriting.append(recogniser_result)
+                print("Handwriting found:", handwriting[-1])
 
         # If handwriting or signature, add to bounding box
 
@@ -172,13 +177,14 @@
 
             recogniser_result = CustomImageRecognizerResult(entity_type=entity_name, text= line_text, score= confidence, start=0, end=word_end, left=line_left, top=line_top, width=width_abs, height=height_abs)
 
-            signatures…
-            …
+            if recogniser_result not in signatures:
+                signatures.append(recogniser_result)
+                #print("Signature found:", signatures[-1])
 
             words = []
             words.append({
-                …
-                …
+                'text': line_text,
+                'bounding_box': (line_left, line_top, line_right, line_bottom)
             })
 
         ocr_results_with_children["text_line_" + str(i)] = {
@@ -196,11 +202,17 @@
 
         # If it is signature or handwriting, will overwrite the default behaviour of the PII analyser
         if is_signature_or_handwriting:
-            signature_or_handwriting_recogniser_results…
+            if recogniser_result not in signature_or_handwriting_recogniser_results:
+                signature_or_handwriting_recogniser_results.append(recogniser_result)
 
-            if is_signature:
-                …
+            if is_signature:
+                if recogniser_result not in signature_recogniser_results:
+                    signature_recogniser_results.append(recogniser_result)
+
+            if is_handwriting:
+                if recogniser_result not in handwriting_recogniser_results:
+                    handwriting_recogniser_results.append(recogniser_result)
 
         i += 1
+
     return all_ocr_results, signature_or_handwriting_recogniser_results, signature_recogniser_results, handwriting_recogniser_results, ocr_results_with_children
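After this change, analyse_page_with_textract returns the raw Textract response wrapped under page_no/data, and json_to_ocrresult accepts either that wrapper or a bare response. A tiny illustration of the two accepted shapes and the branching above (block contents omitted for brevity):

# Illustrative only: the two input shapes json_to_ocrresult now handles.
raw_page = {"Blocks": []}                                  # bare Textract response
wrapped_page = {"page_no": 3, "data": {"Blocks": []}}      # analyse_page_with_textract output

for page in (raw_page, wrapped_page):
    if "Blocks" in page:
        text_blocks = page["Blocks"]
    elif "page_no" in page:
        text_blocks = page["data"]["Blocks"]
    print(len(text_blocks))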
tools/custom_image_analyser_engine.py CHANGED
@@ -9,7 +9,8 @@ import PIL
 from PIL import ImageDraw, ImageFont, Image
 from typing import Optional, Tuple, Union
 from copy import deepcopy
-…
+from tools.helper_functions import clean_unicode_text
+#import string # Import string to get a list of common punctuation characters
 
 @dataclass
 class OCRResult:
@@ -445,7 +446,7 @@ class CustomImageAnalyzerEngine:
 
         return [
             OCRResult(
-                text=ocr_result['text'][i],
+                text=clean_unicode_text(ocr_result['text'][i]),
                 left=ocr_result['left'][i],
                 top=ocr_result['top'][i],
                 width=ocr_result['width'][i],
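clean_unicode_text is imported from tools/helper_functions.py, which this commit also changes (+26 -2) but whose diff is not included here. Purely as a guess at the intent, normalising OCR output before entity matching, such a helper might look like the sketch below; this is an assumption, not the repository's actual implementation:

import unicodedata

def clean_unicode_text(text: str) -> str:
    # Normalise ligatures and other compatibility forms, then drop combining marks
    # that OCR engines sometimes emit as stray characters (assumed behaviour).
    normalised = unicodedata.normalize("NFKC", text)
    return "".join(ch for ch in normalised if not unicodedata.combining(ch))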
tools/file_conversion.py CHANGED
@@ -4,8 +4,10 @@ from PIL import Image, ImageFile
 ImageFile.LOAD_TRUNCATED_IMAGES = True
 
 import os
+import gradio as gr
 import time
 import json
+import pymupdf
 from gradio import Progress
 from typing import List, Optional
 
@@ -62,11 +64,20 @@ def convert_pdf_to_images(pdf_path:str, page_min:int = 0, progress=Progress(trac
 
         # Check if the image already exists
         if os.path.exists(out_path):
-            print(f"Loading existing image from {out_path}.")
-            image = …
+            #print(f"Loading existing image from {out_path}.")
+            image = Image.open(out_path) # Load the existing image
+
         else:
-            …
-            …
+            image_l = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1, dpi=300, use_cropbox=True, use_pdftocairo=False)
+
+            image = image_l[0]
+
+            # Convert to greyscale
+            image = image.convert("L")
+
+            image.save(out_path, format="PNG") # Save the new image
 
         # If no images are returned, break the loop
         if not image:
@@ -76,9 +87,7 @@
         # print("Conversion of page", str(page_num), "to file succeeded.")
         # print("image:", image)
 
-        …
-        …
-        images.extend(image)
+        images.append(out_path)
 
     print("PDF has been converted to images.")
     # print("Images:", images)
@@ -104,6 +113,8 @@ def process_file(file_path):
         # Run your function for processing PDF files here
         img_object = convert_pdf_to_images(file_path)
 
+        print("img_object has length", len(img_object), "and contains", img_object)
+
     else:
         print(f"{file_path} is not an image or PDF file.")
         img_object = ['']
@@ -119,9 +130,15 @@ def get_input_file_names(file_input):
 
     #print("file_input:", file_input)
 
-    …
-    …
+    if isinstance(file_input, str):
+        file_input_list = [file_input]
+
+    for file in file_input_list:
+        if isinstance(file, str):
+            file_path = file
+        else:
+            file_path = file.name
+
         file_path_without_ext = get_file_path_end(file_path)
 
         #print("file:", file_path)
@@ -147,6 +164,8 @@ def prepare_image_or_pdf(
     latest_file_completed: int = 0,
     out_message: List[str] = [],
     first_loop_state: bool = False,
+    number_of_pages:int = 1,
+    current_loop_page_number:int=0,
     progress: Progress = Progress(track_tqdm=True)
 ) -> tuple[List[str], List[str]]:
     """
@@ -162,6 +181,7 @@
     latest_file_completed (int): Index of the last completed file.
     out_message (List[str]): List to store output messages.
     first_loop_state (bool): Flag indicating if this is the first iteration.
+    number_of_pages (int): integer indicating the number of pages in the document
     progress (Progress): Progress tracker for the operation.
 
     Returns:
@@ -170,47 +190,73 @@
 
     tic = time.perf_counter()
 
-    # If out message or converted_file_paths are blank, change to a list so it can be appended to
-    if isinstance(out_message, str):
-        out_message = [out_message]
-
     # If this is the first time around, set variables to 0/blank
     if first_loop_state==True:
+        print("first_loop_state is True")
         latest_file_completed = 0
-        out_message = []
-        converted_file_paths = []
-        image_file_paths = []
+        out_message = []
     else:
         print("Now attempting file:", str(latest_file_completed))
-        …
+
+        # This is only run when a new page is loaded, so can reset page loop values. If end of last file (99), current loop number set to 999
+        # if latest_file_completed == 99:
+        #     current_loop_page_number = 999
+        #     page_break_return = False
+        # else:
+        #     current_loop_page_number = 0
+        #     page_break_return = False
+
+    # If out message or converted_file_paths are blank, change to a list so it can be appended to
+    if isinstance(out_message, str):
+        out_message = [out_message]
+
+    converted_file_paths = []
+    image_file_paths = []
+    pymupdf_doc = []
 
     if not file_paths:
         file_paths = []
 
-    …
+    if isinstance(file_paths, str):
+        file_path_number = 1
+    else:
+        file_path_number = len(file_paths)
+
+    print("Current_loop_page_number at start of prepare_image_or_pdf function is:", current_loop_page_number)
+    print("Number of file paths:", file_path_number)
+    print("Latest_file_completed:", latest_file_completed)
 
     latest_file_completed = int(latest_file_completed)
 
     # If we have already redacted the last file, return the input out_message and file list to the relevant components
-    if latest_file_completed >= …
+    if latest_file_completed >= file_path_number:
         print("Last file reached, returning files:", str(latest_file_completed))
         if isinstance(out_message, list):
             final_out_message = '\n'.join(out_message)
         else:
             final_out_message = out_message
-        return final_out_message, converted_file_paths, image_file_paths
+        return final_out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc
 
     #in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]
 
     progress(0.1, desc='Preparing file')
 
-    …
+    if isinstance(file_paths, str):
+        file_paths_list = [file_paths]
+        file_paths_loop = file_paths_list
+    else:
+        file_paths_list = file_paths
+        file_paths_loop = [file_paths_list[int(latest_file_completed)]]
+
     #print("file_paths_loop:", str(file_paths_loop))
 
     #for file in progress.tqdm(file_paths, desc="Preparing files"):
     for file in file_paths_loop:
-        …
+        if isinstance(file, str):
+            file_path = file
+        else:
+            file_path = file.name
         file_path_without_ext = get_file_path_end(file_path)
 
         #print("file:", file_path)
@@ -235,14 +281,14 @@
         if not file_path:
             out_message = "No file selected"
             print(out_message)
-            return out_message, converted_file_paths, image_file_paths
+            return out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc
 
         if in_redact_method == "Quick image analysis - typed text" or in_redact_method == "Complex image analysis - docs with handwriting/signatures (AWS Textract)":
             # Analyse and redact image-based pdf or image
             if is_pdf_or_image(file_path) == False:
                 out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
                 print(out_message)
-                return out_message, converted_file_paths, image_file_paths
+                return out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc
 
             converted_file_path = process_file(file_path)
             image_file_path = converted_file_path
@@ -252,7 +298,7 @@
             if is_pdf(file_path) == False:
                 out_message = "Please upload a PDF file for text analysis."
                 print(out_message)
-                return out_message, converted_file_paths, image_file_paths
+                return out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc
 
             converted_file_path = file_path # Pikepdf works with the basic unconverted pdf file
             image_file_path = process_file(file_path)
@@ -261,7 +307,20 @@
         converted_file_paths.append(converted_file_path)
         image_file_paths.extend(image_file_path)
 
-        # …
+        # If a pdf, load as a pymupdf document
+        if is_pdf(file_path):
+            pymupdf_doc = pymupdf.open(file_path)
+            #print("pymupdf_doc:", pymupdf_doc)
+        elif is_pdf_or_image(file_path): # Alternatively, if it's an image
+            # Convert image to a pymupdf document
+            pymupdf_doc = pymupdf.open() # Create a new empty document
+            img = Image.open(file_path) # Open the image file
+            rect = pymupdf.Rect(0, 0, img.width, img.height) # Create a rectangle for the image
+            page = pymupdf_doc.new_page(width=img.width, height=img.height) # Add a new page
+            page.insert_image(rect, filename=file_path) # Insert the image into the page
+            # Ensure to save the document after processing
+            #pymupdf_doc.save(output_path) # Uncomment and specify output_path if needed
+            #pymupdf_doc.close() # Close the PDF document
 
         toc = time.perf_counter()
         out_time = f"File '{file_path_without_ext}' prepared in {toc - tic:0.1f} seconds."
@@ -270,8 +329,12 @@
 
     out_message.append(out_time)
     out_message_out = '\n'.join(out_message)
+
+    number_of_pages = len(image_file_paths)
+
+    print("At end of prepare_image_or_pdf function - current_loop_page_number:", current_loop_page_number)
 
-    return out_message_out, converted_file_paths, image_file_paths
+    return out_message_out, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc
 
 def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str]):
     file_path_without_ext = get_file_path_end(in_file_path)
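The reworked convert_pdf_to_images now caches each rendered page as a greyscale PNG and returns file paths rather than PIL objects, so re-runs of a partially redacted document skip the expensive render. Condensed into a standalone helper for illustration (a sketch, not the app's function; it uses the same pdf2image settings as the diff, and convert_from_path requires poppler to be installed):

import os
from pdf2image import convert_from_path

def page_to_png(pdf_path: str, page_num: int, out_path: str) -> str:
    # Reuse the cached page image when present; otherwise render just this page.
    if not os.path.exists(out_path):
        image = convert_from_path(pdf_path, first_page=page_num + 1,
                                  last_page=page_num + 1, dpi=300,
                                  use_cropbox=True, use_pdftocairo=False)[0]
        image.convert("L").save(out_path, format="PNG")  # greyscale, as in the diff
    return out_path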
tools/file_redaction.py
CHANGED
@@ -4,6 +4,8 @@ import json
|
|
4 |
import io
|
5 |
import os
|
6 |
import boto3
|
|
|
|
|
7 |
from PIL import Image, ImageChops, ImageFile, ImageDraw
|
8 |
ImageFile.LOAD_TRUNCATED_IMAGES = True
|
9 |
|
@@ -25,11 +27,20 @@ from collections import defaultdict # For efficient grouping
|
|
25 |
from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult
|
26 |
from tools.file_conversion import process_file
|
27 |
from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold
|
28 |
-
from tools.helper_functions import get_file_path_end, output_folder
|
29 |
from tools.file_conversion import process_file, is_pdf, convert_text_pdf_to_img_pdf, is_pdf_or_image
|
30 |
from tools.data_anonymise import generate_decision_process_output
|
31 |
from tools.aws_textract import analyse_page_with_textract, convert_pike_pdf_page_to_bytes, json_to_ocrresult
|
32 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
33 |
def sum_numbers_before_seconds(string:str):
|
34 |
"""Extracts numbers that precede the word 'seconds' from a string and adds them up.
|
35 |
|
@@ -51,49 +62,130 @@ def sum_numbers_before_seconds(string:str):
|
|
51 |
|
52 |
return sum_of_numbers
|
53 |
|
54 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
55 |
'''
|
56 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
57 |
'''
|
58 |
-
|
59 |
tic = time.perf_counter()
|
60 |
all_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []
|
61 |
|
62 |
# If this is the first time around, set variables to 0/blank
|
63 |
if first_loop_state==True:
|
|
|
64 |
latest_file_completed = 0
|
|
|
65 |
#out_message = []
|
66 |
out_file_paths = []
|
67 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
68 |
|
69 |
# If out message is string or out_file_paths are blank, change to a list so it can be appended to
|
70 |
-
if isinstance(out_message, str):
|
71 |
-
|
72 |
|
73 |
if not out_file_paths:
|
74 |
out_file_paths = []
|
75 |
|
76 |
latest_file_completed = int(latest_file_completed)
|
77 |
|
78 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
79 |
|
80 |
# If we have already redacted the last file, return the input out_message and file list to the relevant components
|
81 |
-
if latest_file_completed >=
|
82 |
-
|
|
|
83 |
# Set to a very high number so as not to mix up with subsequent file processing by the user
|
84 |
latest_file_completed = 99
|
85 |
-
|
86 |
-
|
|
|
|
|
|
|
|
|
87 |
|
88 |
-
estimate_total_processing_time = sum_numbers_before_seconds(
|
89 |
print("Estimated total processing time:", str(estimate_total_processing_time))
|
90 |
|
91 |
-
|
92 |
-
|
93 |
-
return final_out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimate_total_processing_time, all_request_metadata_str, pdf_text, all_image_annotations
|
94 |
|
95 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
96 |
|
|
|
|
|
|
|
97 |
if not in_allow_list.empty:
|
98 |
in_allow_list_flat = in_allow_list[0].tolist()
|
99 |
print("In allow list:", in_allow_list_flat)
|
@@ -101,13 +193,26 @@ def choose_and_run_redactor(file_paths:List[str], prepared_pdf_file_paths:List[s
|
|
101 |
in_allow_list_flat = []
|
102 |
|
103 |
progress(0.5, desc="Redacting file")
|
|
|
104 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
105 |
for file in file_paths_loop:
|
106 |
-
|
107 |
-
|
|
|
|
|
108 |
|
109 |
if file_path:
|
110 |
file_path_without_ext = get_file_path_end(file_path)
|
|
|
|
|
111 |
is_a_pdf = is_pdf(file_path) == True
|
112 |
if is_a_pdf == False:
|
113 |
# If user has not submitted a pdf, assume it's an image
|
@@ -116,7 +221,8 @@ def choose_and_run_redactor(file_paths:List[str], prepared_pdf_file_paths:List[s
|
|
116 |
else:
|
117 |
out_message = "No file selected"
|
118 |
print(out_message)
|
119 |
-
|
|
|
120 |
|
121 |
if in_redact_method == "Quick image analysis - typed text" or in_redact_method == "Complex image analysis - docs with handwriting/signatures (AWS Textract)":
|
122 |
|
@@ -127,98 +233,130 @@ def choose_and_run_redactor(file_paths:List[str], prepared_pdf_file_paths:List[s
|
|
127 |
except:
|
128 |
out_message = "Cannot connect to AWS Textract. Please choose another redaction method."
|
129 |
print(out_message)
|
130 |
-
return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str,
|
131 |
|
132 |
#Analyse and redact image-based pdf or image
|
133 |
if is_pdf_or_image(file_path) == False:
|
134 |
out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
|
135 |
-
return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str,
|
136 |
|
137 |
print("Redacting file " + file_path_without_ext + " as an image-based file")
|
138 |
|
139 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
140 |
|
141 |
# Save file
|
142 |
if is_pdf(file_path) == False:
|
143 |
out_image_file_path = output_folder + file_path_without_ext + "_redacted_as_img.pdf"
|
144 |
-
|
145 |
|
146 |
else:
|
147 |
out_image_file_path = output_folder + file_path_without_ext + "_redacted.pdf"
|
148 |
-
|
149 |
|
150 |
out_file_paths.append(out_image_file_path)
|
151 |
if logging_file_paths:
|
152 |
log_files_output_paths.extend(logging_file_paths)
|
153 |
|
154 |
-
out_message
|
155 |
-
|
156 |
|
157 |
logs_output_file_name = out_image_file_path + "_decision_process_output.csv"
|
158 |
-
|
159 |
log_files_output_paths.append(logs_output_file_name)
|
160 |
|
161 |
-
|
162 |
-
|
163 |
-
|
164 |
-
all_request_metadata.append(new_request_metadata)
|
165 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
166 |
# Increase latest file completed count unless we are at the last file
|
167 |
-
if latest_file_completed != len(file_paths):
|
168 |
-
|
169 |
-
latest_file_completed += 1
|
170 |
|
171 |
-
|
172 |
|
173 |
-
print("
|
174 |
-
|
175 |
-
if is_pdf(file_path) == False:
|
176 |
-
out_message = "Please upload a PDF file for text analysis. If you have an image, select 'Image analysis'."
|
177 |
-
return out_message, None, None
|
178 |
-
|
179 |
-
# Analyse text-based pdf
|
180 |
-
print('Redacting file as text-based PDF')
|
181 |
-
pdf_text, decision_process_logs, page_text_outputs, all_image_annotations = redact_text_pdf(file_path, prepared_pdf_image_paths, language, chosen_redact_entities, in_allow_list_flat, page_min, page_max, "Simple text analysis - PDFs with selectable text")
|
182 |
-
|
183 |
-
out_text_file_path = output_folder + file_path_without_ext + "_text_redacted.pdf"
|
184 |
-
pdf_text.save(out_text_file_path)
|
185 |
-
out_file_paths.append(out_text_file_path)
|
186 |
|
187 |
-
#
|
188 |
-
#
|
189 |
-
#print(convert_message)
|
190 |
|
191 |
-
#
|
192 |
-
#
|
193 |
-
#out_file_paths.
|
194 |
|
195 |
-
# Write logs to file
|
196 |
-
decision_logs_output_file_name = out_text_file_path + "_decision_process_output.csv"
|
197 |
-
|
198 |
-
log_files_output_paths.append(decision_logs_output_file_name)
|
199 |
|
200 |
-
all_text_output_file_name = out_text_file_path + "_all_text_output.csv"
|
201 |
-
|
202 |
-
log_files_output_paths.append(all_text_output_file_name)
|
|
|
|
|
203 |
|
204 |
-
|
205 |
-
out_message.append(out_message_new)
|
206 |
|
207 |
if latest_file_completed != len(file_paths):
|
208 |
-
print("Completed file number:", str(latest_file_completed), "more files to do")
|
209 |
-
|
210 |
-
|
211 |
-
|
212 |
-
out_message
|
213 |
-
|
214 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
215 |
|
216 |
toc = time.perf_counter()
|
217 |
-
|
218 |
-
|
219 |
-
|
220 |
-
out_message_out = '\n'.join(out_message)
|
221 |
-
out_message_out = out_message_out + " " + out_time
|
222 |
|
223 |
# If textract requests made, write to logging file
|
224 |
if all_request_metadata:
|
@@ -233,8 +371,16 @@ def choose_and_run_redactor(file_paths:List[str], prepared_pdf_file_paths:List[s
|
|
233 |
if all_request_metadata_file_path not in log_files_output_paths:
|
234 |
log_files_output_paths.append(all_request_metadata_file_path)
|
235 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
236 |
|
237 |
-
return
|
238 |
|
239 |
def convert_pikepdf_coords_to_pymudf(pymupdf_page, annot):
|
240 |
'''
|
@@ -430,7 +576,6 @@ def redact_page_with_pymupdf(page:Page, annotations_on_page, image = None):#, sc
|
|
430 |
|
431 |
if image:
|
432 |
image_x1, image_y1, image_x2, image_y2 = convert_pikepdf_to_image_coords(page, annot, image)
|
433 |
-
|
434 |
|
435 |
img_annotation_box["xmin"] = image_x1
|
436 |
img_annotation_box["ymin"] = image_y1
|
@@ -455,6 +600,7 @@ def redact_page_with_pymupdf(page:Page, annotations_on_page, image = None):#, sc

        rect_single_pixel_height = Rect(x1, middle_y - 2, x2, middle_y + 2)  # Small height in middle of word to remove text

        # Add the annotation to the middle of the character line, so that it doesn't delete text from adjacent lines
        page.add_redact_annot(rect_single_pixel_height)

        # Set up drawing a black box over the whole rect
@@ -468,6 +614,8 @@ def redact_page_with_pymupdf(page:Page, annotations_on_page, image = None):#, sc

        "boxes": all_image_annotation_boxes
    }

    page.apply_redactions(images=0, graphics=0)
    page.clean_contents()
@@ -485,16 +633,6 @@ def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_result

    merged_bboxes = []
    grouped_bboxes = defaultdict(list)

-   # Process signature and handwriting results
-   if signature_recogniser_results or handwriting_recogniser_results:
-       if "Redact all identified handwriting" in handwrite_signature_checkbox:
-           #print("Handwriting boxes exist at merge:", handwriting_recogniser_results)
-           bboxes.extend(handwriting_recogniser_results)
-
-       if "Redact all identified signatures" in handwrite_signature_checkbox:
-           #print("Signature boxes exist at merge:", signature_recogniser_results)
-           bboxes.extend(signature_recogniser_results)
-
    # Reconstruct bounding boxes for substrings of interest
    reconstructed_bboxes = []
    for bbox in bboxes:
@@ -586,26 +724,53 @@ def redact_image_pdf(file_path:str, prepared_pdf_file_paths:List[str], language:

                merged_bboxes.append(merged_box)
                merged_box = next_box

-       merged_bboxes.append(merged_box)

    return merged_bboxes

-def redact_image_pdf(file_path:str, prepared_pdf_file_paths:List[str], language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, is_a_pdf:bool=True, page_min:int=0, page_max:int=999, analysis_type:str="Quick image analysis - typed text", handwrite_signature_checkbox:List[str]=["Redact all identified handwriting", "Redact all identified signatures"], request_metadata:str="", progress=Progress(track_tqdm=True)):
    '''
-
    '''
-   # json_file_path is for AWS Textract outputs
-   logging_file_paths = []
    file_name = get_file_path_end(file_path)
    fill = (0, 0, 0)  # Fill colour
-   decision_process_output_str = ""
-   images = []
-   all_image_annotations = []
-   #request_metadata = {}
    image_analyser = CustomImageAnalyzerEngine(nlp_analyser)

-   #
-
    if not prepared_pdf_file_paths:
        out_message = "PDF does not exist as images. Converting pages to image"
@@ -613,71 +778,56 @@ def redact_image_pdf(file_path:str, prepared_pdf_file_paths:List[str], language:

        prepared_pdf_file_paths = process_file(file_path)

-   if not isinstance(prepared_pdf_file_paths, list):
-       print("Converting prepared_pdf_file_paths to list")
-       prepared_pdf_file_paths = [prepared_pdf_file_paths]
-
-   #print("Image paths:", prepared_pdf_file_paths)
    number_of_pages = len(prepared_pdf_file_paths)
-
    print("Number of pages:", str(number_of_pages))

-   out_message = "Redacting pages"
-   print(out_message)
-   #progress(0.1, desc=out_message)
-
    # Check that page_min and page_max are within expected ranges
    if page_max > number_of_pages or page_max == 0:
        page_max = number_of_pages

-   if page_min <= 0:
-
-   else:
-       page_min = page_min - 1

    print("Page range:", str(page_min + 1), "to", str(page_max))
-
-   all_ocr_results = []
-   all_decision_process = []
-   all_line_level_ocr_results_df = pd.DataFrame()
-   all_decision_process_table = pd.DataFrame()
-
    if analysis_type == "Quick image analysis - typed text": ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + ".csv"
    elif analysis_type == "Complex image analysis - docs with handwriting/signatures (AWS Textract)": ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + "_textract.csv"

-
        handwriting_or_signature_boxes = []
        signature_recogniser_results = []
        handwriting_recogniser_results = []
-
        # Assuming prepared_pdf_file_paths[page_no] is a PIL image object
        try:
            image = prepared_pdf_file_paths[page_no]#.copy()
            #print("image:", image)
        except Exception as e:
            print("Could not redact page:", reported_page_number, "due to:")
-           print(e)
            continue

-       image_annotations = {"image": image, "boxes": []}
-
        pymupdf_page = pymupdf_doc.load_page(page_no)
-
-       #try:
-       #print("prepared_pdf_file_paths:", prepared_pdf_file_paths)

-       if page_no >= page_min and page_no < page_max:
-
-           reported_page_number = str(page_no + 1)

-           print("

            # Need image size to convert textract OCR outputs to the correct sizes
            page_width, page_height = image.size
@@ -695,13 +845,6 @@ def redact_image_pdf(file_path:str, prepared_pdf_file_paths:List[str], language:

            # Combine OCR results
            line_level_ocr_results, line_level_ocr_results_with_children = combine_ocr_results(word_level_ocr_results)

-           #print("ocr_results after:", ocr_results)
-
-           # Save ocr_with_children_outputs
-           ocr_results_with_children_str = str(line_level_ocr_results_with_children)
-           logs_output_file_name = output_folder + "ocr_with_children.txt"
-           with open(logs_output_file_name, "w") as f:
-               f.write(ocr_results_with_children_str)

            # Import results from json and convert
            if analysis_type == "Complex image analysis - docs with handwriting/signatures (AWS Textract)":
@@ -711,24 +854,53 @@ def redact_image_pdf(file_path:str, prepared_pdf_file_paths:List[str], language:

                image.save(image_buffer, format='PNG')  # Save as PNG, or adjust format if needed
                pdf_page_as_bytes = image_buffer.getvalue()

-               json_file_path = output_folder + file_name + "_page_" + reported_page_number + "_textract.json"

                if not os.path.exists(json_file_path):
-                   text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes,
                    logging_file_paths.append(json_file_path)
                    request_metadata = request_metadata + "\n" + new_request_metadata
                else:
                    # Open the file and load the JSON data
-                   print("Found existing Textract json results file
                    with open(json_file_path, 'r') as json_file:
-
-               line_level_ocr_results, handwriting_or_signature_boxes, signature_recogniser_results, handwriting_recogniser_results, line_level_ocr_results_with_children = json_to_ocrresult(text_blocks, page_width, page_height)

            # Step 2: Analyze text and identify PII
            if chosen_redact_entities:
-
                redaction_bboxes = image_analyser.analyze_text(
                    line_level_ocr_results,
                    line_level_ocr_results_with_children,
@@ -740,6 +912,10 @@ def redact_image_pdf(file_path:str, prepared_pdf_file_paths:List[str], language:

            else:
                redaction_bboxes = []

            if analysis_type == "Quick image analysis - typed text": interim_results_file_path = output_folder + "interim_analyser_bboxes_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + ".txt"
            elif analysis_type == "Complex image analysis - docs with handwriting/signatures (AWS Textract)": interim_results_file_path = output_folder + "interim_analyser_bboxes_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + "_textract.txt"
@@ -750,12 +926,8 @@ def redact_image_pdf(file_path:str, prepared_pdf_file_paths:List[str], language:

            # Merge close bounding boxes
            merged_redaction_bboxes = merge_img_bboxes(redaction_bboxes, line_level_ocr_results_with_children, signature_recogniser_results, handwriting_recogniser_results, handwrite_signature_checkbox)
-
-           # Save image first so that the redactions can be checked after
-           #image.save(output_folder + "page_as_img_" + file_name + "_pages_" + str(reported_page_number) + ".png")
-
            # 3. Draw the merged boxes
-           #if merged_redaction_bboxes:
            if is_pdf(file_path) == False:
                draw = ImageDraw.Draw(image)
@@ -790,7 +962,7 @@ def redact_image_pdf(file_path:str, prepared_pdf_file_paths:List[str], language:

            ## Apply annotations with pymupdf
            else:
-               pymupdf_page, image_annotations = redact_page_with_pymupdf(pymupdf_page, merged_redaction_bboxes, image)

            # Convert decision process to table
            decision_process_table = pd.DataFrame([{
@@ -820,18 +992,46 @@ def redact_image_pdf(file_path:str, prepared_pdf_file_paths:List[str], language:

        all_line_level_ocr_results_df = pd.concat([all_line_level_ocr_results_df, line_level_ocr_results_df])

        if is_pdf(file_path) == False:
            images.append(image)
            pymupdf_doc = images

-
-

###
@@ -848,7 +1048,30 @@ def get_text_container_characters(text_container:LTTextContainer):

        return characters
    return []

def analyse_text_container(text_container:OCRResult, language:str, chosen_redact_entities:List[str], score_threshold:float, allow_list:List[str]):
    '''
    Take text and bounding boxes in OCRResult format and analyze it for PII using spacy and the Microsoft Presidio package.
@@ -856,19 +1079,23 @@ def analyse_text_container(text_container:OCRResult, language:str, chosen_redact

    analyser_results = []
-
    if chosen_redact_entities:
-
                                                language=language,
                                                entities=chosen_redact_entities,
                                                score_threshold=score_threshold,
                                                return_decision_process=True,
-                                               allow_list=allow_list)

    return analyser_results

def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tuple[List[OCRResult], List[LTChar]]:
    '''
    Create an OCRResult object based on a list of pdfminer LTChar objects.
@@ -881,6 +1108,7 @@ def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tup

    # Initialize variables
    full_text = ""
    overall_bbox = [float('inf'), float('inf'), float('-inf'), float('-inf')]  # [x0, y0, x1, y1]
    word_bboxes = []
@@ -894,6 +1122,7 @@ def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tup

        if isinstance(char, LTAnno):
            # Handle space separately by finalizing the word
            full_text += char.get_text()  # Adds space or newline
            if current_word:  # Only finalize if there is a current word
                word_bboxes.append((current_word, current_word_bbox))
                current_word = ""
@@ -918,7 +1147,17 @@ def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tup

            continue

        # Concatenate text for LTChar
-

        # Update overall bounding box
        x0, y0, x1, y1 = char.bbox
@@ -928,7 +1167,8 @@ def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tup

        overall_bbox[3] = max(overall_bbox[3], y1)  # y1

        # Update current word
-       current_word += char.get_text()

        # Update current word bounding box
        current_word_bbox[0] = min(current_word_bbox[0], x0)  # x0
@@ -936,18 +1176,25 @@ def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tup

        current_word_bbox[2] = max(current_word_bbox[2], x1)  # x1
        current_word_bbox[3] = max(current_word_bbox[3], y1)  # y1
-
    # Finalize the last word if any
    if current_word:
        word_bboxes.append((current_word, current_word_bbox))

    if full_text:
        line_level_results_out.append(OCRResult(full_text, round(overall_bbox[0],2), round(overall_bbox[1], 2), round(overall_bbox[2]-overall_bbox[0],2), round(overall_bbox[3]-overall_bbox[1],2)))
-
    return line_level_results_out, line_level_characters_out  # Return both results and character objects

-def merge_text_bounding_boxes(analyser_results:CustomImageRecognizerResult, characters:List[LTChar], combine_pixel_dist:int, vertical_padding:int=0):
    '''
    Merge identified bounding boxes containing PII that are very close to one another
    '''
@@ -1003,9 +1250,10 @@ def merge_text_bounding_boxes(analyser_results:CustomImageRecognizerResult, char

            current_box[3] = max(current_box[3], char_box[3])  # Ensure the top is the highest
            current_result.end = max(current_result.end, result.end)  # Extend the text range
            try:
-               current_result.
-           except:
-               print("Unable to
            # Add a space if current_text is not empty
            if current_text:
                current_text.append(" ")  # Add space between texts
@@ -1082,52 +1330,98 @@ def create_annotations_for_bounding_boxes(analysed_bounding_boxes):

        annotations_on_page.append(annotation)
    return annotations_on_page

-def redact_text_pdf(
-
    '''
-
-   all_image_annotations = []
-   page_text_outputs_all_pages = pd.DataFrame()
-   decision_process_table_all_pages = pd.DataFrame()

    # Open with Pikepdf to get text lines
    pikepdf_pdf = Pdf.open(filename)
    number_of_pages = len(pikepdf_pdf.pages)
-
-   # Also open pdf with pymupdf to be able to annotate later while retaining text
-   pymupdf_doc = pymupdf.open(filename)

-   page_num = 0
-
    # Check that page_min and page_max are within expected ranges
    if page_max > number_of_pages or page_max == 0:
        page_max = number_of_pages
-   #else:
-   #    page_max = page_max - 1

    if page_min <= 0: page_min = 0
    else: page_min = page_min - 1

    print("Page range is",str(page_min + 1), "to", str(page_max))

    #for page_no in range(0, number_of_pages):
-   for page_no in
-
-       #print("prepared_pdf_image_path:", prepared_pdf_image_path)
-       #print("prepared_pdf_image_path[page_no]:", prepared_pdf_image_path[page_no])
-       image = prepared_pdf_image_path[page_no]

-
        pymupdf_page = pymupdf_doc.load_page(page_no)

-       print("

        if page_min <= page_no < page_max:

            for page_layout in extract_pages(filename, page_numbers = [page_no], maxpages=1):

                page_analyser_results = []
@@ -1139,18 +1433,18 @@ def redact_text_pdf(filename:str, prepared_pdf_image_path:str, language:str, cho

                page_text_outputs = pd.DataFrame()

                if analysis_type == "Simple text analysis - PDFs with selectable text":
-                   for text_container in page_layout:

                        text_container_analyser_results = []
                        text_container_analysed_bounding_boxes = []
-
                        # Create dataframe for all the text on the page
                        line_level_text_results_list, line_characters = create_text_bounding_boxes_from_characters(characters)

-                       #print("line_characters:", line_characters)
-
                        # Create page_text_outputs (OCR format outputs)
                        if line_level_text_results_list:
                            # Convert to DataFrame and add to ongoing logging table
@@ -1167,60 +1461,79 @@ def redact_text_pdf(filename:str, prepared_pdf_image_path:str, language:str, cho

                        # Analyse each line of text in turn for PII and add to list
                        for i, text_line in enumerate(line_level_text_results_list):
                            text_line_analyser_result = []
                            text_line_bounding_boxes = []

-                           #print("text_line:", text_line.text)
-
                            text_line_analyser_result = analyse_text_container(text_line, language, chosen_redact_entities, score_threshold, allow_list)

                            # Merge bounding boxes for the line if multiple found close together
                            if text_line_analyser_result:
                                # Merge bounding boxes if very close together
-
-                               #print(
-                               #print("".join(char._text for char in line_characters[i]))
                                text_line_bounding_boxes = merge_text_bounding_boxes(text_line_analyser_result, line_characters[i], combine_pixel_dist, vertical_padding = 0)

                        text_container_analyser_results.extend(text_line_analyser_result)
                        text_container_analysed_bounding_boxes.extend(text_line_bounding_boxes)
-
-                   #print("\n FINAL text_container_analyser_results:", text_container_analyser_results)
-
                page_analyser_results.extend(text_container_analyser_results)
                page_analysed_bounding_boxes.extend(text_container_analysed_bounding_boxes)

                # Annotate redactions on page
                annotations_on_page = create_annotations_for_bounding_boxes(page_analysed_bounding_boxes)
-
-               # Make page annotations
-               #page.Annots = pdf.make_indirect(annotations_on_page)
-               #if annotations_on_page:

-               # Make pymupdf redactions
                pymupdf_page, image_annotations = redact_page_with_pymupdf(pymupdf_page, annotations_on_page, image)

            print("For page number:", page_no, "there are", len(image_annotations["boxes"]), "annotations")

            # Write logs
            # Create decision process table
-           decision_process_table_on_page = create_text_redaction_process_results(page_analyser_results, page_analysed_bounding_boxes,

            if not decision_process_table_on_page.empty:
-
            if not page_text_outputs.empty:
                page_text_outputs = page_text_outputs.sort_values(["top", "left"], ascending=[False, False]).reset_index(drop=True)
-
-
-   return pymupdf_doc,
 import io
 import os
 import boto3
+
+from tqdm import tqdm
 from PIL import Image, ImageChops, ImageFile, ImageDraw
 ImageFile.LOAD_TRUNCATED_IMAGES = True

 from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult
 from tools.file_conversion import process_file
 from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold
+from tools.helper_functions import get_file_path_end, output_folder, clean_unicode_text, get_or_create_env_var
 from tools.file_conversion import process_file, is_pdf, convert_text_pdf_to_img_pdf, is_pdf_or_image
 from tools.data_anonymise import generate_decision_process_output
 from tools.aws_textract import analyse_page_with_textract, convert_pike_pdf_page_to_bytes, json_to_ocrresult

+# Number of pages to loop through before breaking. Currently set very high, as functions are breaking on time metrics (e.g. every 105 seconds), rather than on number of pages redacted.
+page_break_value = get_or_create_env_var('page_break_value', '500')
+print(f'The value of page_break_value is {page_break_value}')
+
+max_time_value = get_or_create_env_var('max_time_value', '105')
+print(f'The value of max_time_value is {max_time_value}')
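The two limits above are read through get_or_create_env_var, imported from tools.helper_functions. A minimal sketch of what that helper presumably does (the committed implementation may differ): read an environment variable and register a default when it is absent, so deployments can tune the limits without code changes.

    import os

    def get_or_create_env_var_sketch(var_name: str, default_value: str) -> str:
        # Return the existing value, or set and return the default
        value = os.environ.get(var_name)
        if value is None:
            os.environ[var_name] = default_value
            value = default_value
        return value

    # e.g. page_break_value = get_or_create_env_var_sketch('page_break_value', '500')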
def sum_numbers_before_seconds(string:str):
    """Extracts numbers that precede the word 'seconds' from a string and adds them up.
    ...
    """

    return sum_of_numbers
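The body of sum_numbers_before_seconds is elided in this view. A minimal sketch of the behaviour its docstring describes (the regex and names here are assumptions, not the committed code):

    import re

    def sum_numbers_before_seconds_sketch(string: str) -> float:
        # Find every number that directly precedes the word "seconds"
        matches = re.findall(r'(\d+\.?\d*)\s*seconds', string)
        # Add them up to estimate total processing time across messages
        return sum(float(m) for m in matches)

    # sum_numbers_before_seconds_sketch("Redacted in 12.5 seconds. Redacted in 3.0 seconds.") -> 15.5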
+def choose_and_run_redactor(file_paths:List[str],
+                            prepared_pdf_file_paths:List[str],
+                            prepared_pdf_image_paths:List[str],
+                            language:str,
+                            chosen_redact_entities:List[str],
+                            in_redact_method:str,
+                            in_allow_list:List[List[str]]=None,
+                            latest_file_completed:int=0,
+                            out_message:list=[],
+                            out_file_paths:list=[],
+                            log_files_output_paths:list=[],
+                            first_loop_state:bool=False,
+                            page_min:int=0,
+                            page_max:int=999,
+                            estimated_time_taken_state:float=0.0,
+                            handwrite_signature_checkbox:List[str]=["Redact all identified handwriting", "Redact all identified signatures"],
+                            all_request_metadata_str:str = "",
+                            annotations_all_pages:dict={},
+                            all_line_level_ocr_results_df=[],
+                            all_decision_process_table=[],
+                            pymupdf_doc=[],
+                            current_loop_page:int=0,
+                            page_break_return:bool=False,
+                            progress=gr.Progress(track_tqdm=True)):
     '''
+    This function orchestrates the redaction process based on the specified method and parameters. It takes the following inputs:
+
+    - file_paths (List[str]): A list of paths to the files to be redacted.
+    - prepared_pdf_file_paths (List[str]): A list of paths to the PDF files prepared for redaction.
+    - prepared_pdf_image_paths (List[str]): A list of paths to the PDF files converted to images for redaction.
+    - language (str): The language of the text in the files.
+    - chosen_redact_entities (List[str]): A list of entity types to redact from the files.
+    - in_redact_method (str): The method to use for redaction.
+    - in_allow_list (List[List[str]], optional): A list of allowed terms for redaction. Defaults to None.
+    - latest_file_completed (int, optional): The index of the last completed file. Defaults to 0.
+    - out_message (list, optional): A list to store output messages. Defaults to an empty list.
+    - out_file_paths (list, optional): A list to store paths to the output files. Defaults to an empty list.
+    - log_files_output_paths (list, optional): A list to store paths to the log files. Defaults to an empty list.
+    - first_loop_state (bool, optional): A flag indicating if this is the first iteration. Defaults to False.
+    - page_min (int, optional): The minimum page number to start redaction from. Defaults to 0.
+    - page_max (int, optional): The maximum page number to end redaction at. Defaults to 999.
+    - estimated_time_taken_state (float, optional): The estimated time taken for the redaction process. Defaults to 0.0.
+    - handwrite_signature_checkbox (List[str], optional): A list of options for redacting handwriting and signatures. Defaults to ["Redact all identified handwriting", "Redact all identified signatures"].
+    - all_request_metadata_str (str, optional): A string containing all request metadata. Defaults to an empty string.
+    - annotations_all_pages (dict, optional): A dictionary containing all image annotations. Defaults to an empty dictionary.
+    - all_line_level_ocr_results_df (optional): A DataFrame containing all line-level OCR results. Defaults to an empty DataFrame.
+    - all_decision_process_table (optional): A DataFrame containing all decision process tables. Defaults to an empty DataFrame.
+    - pymupdf_doc (optional): A list containing the PDF document object. Defaults to an empty list.
+    - current_loop_page (int, optional): The current page being processed in the loop. Defaults to 0.
+    - page_break_return (bool, optional): A flag indicating if the function should return after a page break. Defaults to False.
+    - progress (gr.Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
+
+    The function returns a redacted document along with processing logs.
     '''
+    combined_out_message = ""
     tic = time.perf_counter()
     all_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []

     # If this is the first time around, set variables to 0/blank
     if first_loop_state==True:
+        print("First_loop_state is True")
         latest_file_completed = 0
+        current_loop_page = 0
         #out_message = []
         out_file_paths = []
+        estimate_total_processing_time = 0
+        estimated_time_taken_state = 0
+
+    # If not the first time around, and the current page loop has been set to a huge number (been through all pages), reset current page to 0
+    elif (first_loop_state == False) & (current_loop_page == 999):
+        current_loop_page = 0

     # If out message is string or out_file_paths are blank, change to a list so it can be appended to
+    #if isinstance(out_message, str):
+    #    out_message = [out_message]

     if not out_file_paths:
         out_file_paths = []

     latest_file_completed = int(latest_file_completed)

+    number_of_pages = len(prepared_pdf_image_paths)
+
+    if isinstance(file_paths,str):
+        number_of_files = 1
+    else:
+        number_of_files = len(file_paths)
+
+    print("\nIn choose_and_run_redactor function, latest_file_completed is:", latest_file_completed)
+    print("current_loop_page is:", current_loop_page)

     # If we have already redacted the last file, return the input out_message and file list to the relevant components
+    if latest_file_completed >= number_of_files:
+        print("latest_file_completed is equal to or greater than the number of files")
         # Set to a very high number so as not to mix up with subsequent file processing by the user
         latest_file_completed = 99
+        current_loop_page = 0
+
+        if isinstance(out_message, list):
+            combined_out_message = '\n'.join(out_message)
+        else:
+            combined_out_message = out_message

+        estimate_total_processing_time = sum_numbers_before_seconds(combined_out_message)
         print("Estimated total processing time:", str(estimate_total_processing_time))

+        return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table

+    # If we have reached the last page, return message
+    if current_loop_page >= number_of_pages:
+        print("current_loop_page:", current_loop_page, "is equal to or greater than number of pages in document:", number_of_pages)
+
+        # Set to a very high number so as not to mix up with subsequent file processing by the user
+        current_loop_page = 999
+        combined_out_message = out_message

+        return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = False, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table

+    # Create allow list
     if not in_allow_list.empty:
         in_allow_list_flat = in_allow_list[0].tolist()
         print("In allow list:", in_allow_list_flat)
     else:
         in_allow_list_flat = []

     progress(0.5, desc="Redacting file")

+    if isinstance(file_paths, str):
+        file_paths_list = [file_paths]
+        file_paths_loop = file_paths_list
+    else:
+        file_paths_list = file_paths
+        file_paths_loop = [file_paths_list[int(latest_file_completed)]]

     for file in file_paths_loop:
+        if isinstance(file, str):
+            file_path = file
+        else:
+            file_path = file.name

         if file_path:
             file_path_without_ext = get_file_path_end(file_path)
+            print("Redacting file:", file_path_without_ext)

             is_a_pdf = is_pdf(file_path) == True
             if is_a_pdf == False:
                 # If user has not submitted a pdf, assume it's an image

         else:
             out_message = "No file selected"
             print(out_message)
+
+            return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table

         if in_redact_method == "Quick image analysis - typed text" or in_redact_method == "Complex image analysis - docs with handwriting/signatures (AWS Textract)":

             except:
                 out_message = "Cannot connect to AWS Textract. Please choose another redaction method."
                 print(out_message)
+                return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages

             #Analyse and redact image-based pdf or image
             if is_pdf_or_image(file_path) == False:
                 out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
+                return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages

             print("Redacting file " + file_path_without_ext + " as an image-based file")

+            pymupdf_doc, all_decision_process_table, logging_file_paths, new_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df = redact_image_pdf(file_path, prepared_pdf_image_paths, language, chosen_redact_entities, in_allow_list_flat, is_a_pdf, page_min, page_max, in_redact_method, handwrite_signature_checkbox, "", current_loop_page, page_break_return, prepared_pdf_image_paths, annotations_all_pages, all_line_level_ocr_results_df, all_decision_process_table, pymupdf_doc)
+
+            # Save Textract request metadata (if exists)
+            if new_request_metadata:
+                print("Request metadata:", new_request_metadata)
+                all_request_metadata.append(new_request_metadata)
+
+        elif in_redact_method == "Simple text analysis - PDFs with selectable text":
+
+            logging_file_paths = ""
+
+            if is_pdf(file_path) == False:
+                out_message = "Please upload a PDF file for text analysis. If you have an image, select 'Image analysis'."
+                return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table
+
+            # Analyse text-based pdf
+            print('Redacting file as text-based PDF')
+
+            pymupdf_doc, all_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return = redact_text_pdf(file_path, prepared_pdf_image_paths, language, chosen_redact_entities, in_allow_list_flat, page_min, page_max, "Simple text analysis - PDFs with selectable text", current_loop_page, page_break_return, annotations_all_pages, all_line_level_ocr_results_df, all_decision_process_table, pymupdf_doc)
+
+        else:
+            out_message = "No redaction method selected"
+            print(out_message)
+            return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table
+
+        # If at last page, save to file
+        if current_loop_page >= number_of_pages:
+
+            print("Current page loop:", current_loop_page, "is greater or equal to number of pages:", number_of_pages)
+            latest_file_completed += 1
+            current_loop_page = 999
+
+            if latest_file_completed != len(file_paths):
+                print("Completed file number:", str(latest_file_completed), "there are more files to do")

             # Save file
             if is_pdf(file_path) == False:
                 out_image_file_path = output_folder + file_path_without_ext + "_redacted_as_img.pdf"
+                pymupdf_doc[0].save(out_image_file_path, "PDF" ,resolution=100.0, save_all=True, append_images=pymupdf_doc[1:])

             else:
                 out_image_file_path = output_folder + file_path_without_ext + "_redacted.pdf"
+                pymupdf_doc.save(out_image_file_path)

             out_file_paths.append(out_image_file_path)
             if logging_file_paths:
                 log_files_output_paths.extend(logging_file_paths)

+            #if isinstance(out_message, list):
+            #    out_message.append("File '" + file_path_without_ext + "' successfully redacted")

             logs_output_file_name = out_image_file_path + "_decision_process_output.csv"
+            all_decision_process_table.to_csv(logs_output_file_name, index = None, encoding="utf-8")
             log_files_output_paths.append(logs_output_file_name)

+            all_text_output_file_name = out_image_file_path + "_ocr_output.csv"
+            all_line_level_ocr_results_df.to_csv(all_text_output_file_name, index = None, encoding="utf-8")
+            log_files_output_paths.append(all_text_output_file_name)

+            # Make a combined message for the file
+            if isinstance(out_message, list):
+                combined_out_message = '\n'.join(out_message)  # Ensure out_message is a list of strings
+            else: combined_out_message = out_message
+
+            out_time_message = f" Redacted in {estimated_time_taken_state:0.1f} seconds."
+            combined_out_message = combined_out_message + " " + out_time_message  # Ensure this is a single string
+
     # Increase latest file completed count unless we are at the last file
+    # if latest_file_completed != len(file_paths):
+    #     print("Completed file number:", str(latest_file_completed), "more files to do")

+    # if current_loop_page >= number_of_pages:
+    #     print("Current page loop", current_loop_page, "is greater than or equal to number of pages:", number_of_pages)
+    #     latest_file_completed += 1

+    #     # Set to 999 to be a big number not to interrupt processing of large files by user
+    #     current_loop_page = 999

+    #     out_text_file_path = output_folder + file_path_without_ext + "_text_redacted.pdf"
+    #     pymupdf_doc.save(out_text_file_path)
+    #     out_file_paths.append(out_text_file_path)

+    #     # Write logs to file
+    #     decision_logs_output_file_name = out_text_file_path + "_decision_process_output.csv"
+    #     all_decision_process_table.to_csv(decision_logs_output_file_name)
+    #     log_files_output_paths.append(decision_logs_output_file_name)

+    #     all_text_output_file_name = out_text_file_path + "_all_text_output.csv"
+    #     all_line_level_ocr_results_df.to_csv(all_text_output_file_name)
+    #     log_files_output_paths.append(all_text_output_file_name)
+
+    #     out_message_new = "File '" + file_path_without_ext + "' successfully redacted"

+    #     if isinstance(out_message, list):
+    #         out_message.append(out_message_new)  # Ensure out_message is a list of strings

     if latest_file_completed != len(file_paths):
+        print("Completed file number:", str(latest_file_completed), " there are more files to do")
+
+    # Make a combined message for the file
+    if isinstance(out_message, list):
+        combined_out_message = '\n'.join(out_message)  # Ensure out_message is a list of strings
+    else: combined_out_message = out_message
+
+    out_time_message = f" Redacted in {estimated_time_taken_state:0.1f} seconds."
+    combined_out_message = combined_out_message + " " + out_time_message  # Ensure this is a single string
+
+    estimate_total_processing_time = sum_numbers_before_seconds(combined_out_message)
+    print("Estimated total processing time:", str(estimate_total_processing_time))

     toc = time.perf_counter()
+    time_taken = toc - tic
+    estimated_time_taken_state = estimated_time_taken_state + time_taken

     # If textract requests made, write to logging file
     if all_request_metadata:

         if all_request_metadata_file_path not in log_files_output_paths:
             log_files_output_paths.append(all_request_metadata_file_path)

+    if combined_out_message: out_message = combined_out_message
+
+    print("\nout_message at choose_and_run_redactor end is:", out_message)
+
+    # Ensure no duplicated output files
+    log_files_output_paths = list(set(log_files_output_paths))
+    out_file_paths = list(set(out_file_paths))

+    return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table
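The new return values above (current_loop_page and the "Page break reached" checkbox) are what let the app avoid timeouts: processing stops after a time budget and the UI re-invokes the function to resume. A simplified sketch of the pattern (illustrative names, not the committed code):

    import time

    def process_pages_sketch(pages: list, current_loop_page: int = 0, max_time: float = 105.0):
        tic = time.perf_counter()
        page_break_return = False
        for page_no in range(current_loop_page, len(pages)):
            ...  # redact one page here
            current_loop_page = page_no + 1
            if time.perf_counter() - tic > max_time:
                page_break_return = True  # signal the caller to re-invoke and resume
                break
        return current_loop_page, page_break_return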
def convert_pikepdf_coords_to_pymudf(pymupdf_page, annot):
    '''

        if image:
            image_x1, image_y1, image_x2, image_y2 = convert_pikepdf_to_image_coords(page, annot, image)

            img_annotation_box["xmin"] = image_x1
            img_annotation_box["ymin"] = image_y1

        rect_single_pixel_height = Rect(x1, middle_y - 2, x2, middle_y + 2)  # Small height in middle of word to remove text

        # Add the annotation to the middle of the character line, so that it doesn't delete text from adjacent lines
+       #print("rect_single_pixel_height:", rect_single_pixel_height)
        page.add_redact_annot(rect_single_pixel_height)

        # Set up drawing a black box over the whole rect

        "boxes": all_image_annotation_boxes
    }

+   #print("out_annotation_boxes:", out_annotation_boxes)
+
    page.apply_redactions(images=0, graphics=0)
    page.clean_contents()
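A minimal, self-contained sketch of the PyMuPDF pattern used in redact_page_with_pymupdf above: mark a thin redaction rectangle through the middle of the text line (so adjacent lines survive), apply the redactions to strip the underlying text, then draw an opaque box over the full area. File names and coordinates here are illustrative only.

    import pymupdf  # PyMuPDF

    doc = pymupdf.open("example.pdf")            # assumed input path
    page = doc.load_page(0)
    x1, y1, x2, y2 = 100, 700, 250, 715          # area to redact, in PDF points
    middle_y = (y1 + y2) / 2

    # Thin rect removes the text without deleting neighbouring lines
    page.add_redact_annot(pymupdf.Rect(x1, middle_y - 2, x2, middle_y + 2))
    page.apply_redactions(images=0, graphics=0)  # keep images and drawings intact

    # Draw the visible black box over the whole word area
    page.draw_rect(pymupdf.Rect(x1, y1, x2, y2), color=(0, 0, 0), fill=(0, 0, 0))
    doc.save("example_redacted.pdf")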
    merged_bboxes = []
    grouped_bboxes = defaultdict(list)

    # Reconstruct bounding boxes for substrings of interest
    reconstructed_bboxes = []
    for bbox in bboxes:

                merged_bboxes.append(merged_box)
                merged_box = next_box

+       merged_bboxes.append(merged_box)
+
+   # Process signature and handwriting results
+   if signature_recogniser_results or handwriting_recogniser_results:
+       if "Redact all identified handwriting" in handwrite_signature_checkbox:
+           #print("Handwriting boxes exist at merge:", handwriting_recogniser_results)
+           merged_bboxes.extend(handwriting_recogniser_results)
+
+       if "Redact all identified signatures" in handwrite_signature_checkbox:
+           #print("Signature boxes exist at merge:", signature_recogniser_results)
+           merged_bboxes.extend(signature_recogniser_results)
+
+   #print("bboxes:", bboxes)

    return merged_bboxes
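Note the change above: handwriting and signature boxes are now appended to merged_bboxes after the merge step, so they are kept whole rather than being fused with PII boxes. A small sketch of the underlying merge idea (illustrative only, not the committed merge_img_bboxes logic): boxes on the same line that sit within a pixel threshold of each other are fused into one redaction box, so split OCR words are covered by a single rectangle. Boxes are (x0, y0, x1, y1) tuples here.

    def merge_close_boxes_sketch(boxes, horizontal_gap: float = 10.0):
        boxes = sorted(boxes, key=lambda b: (b[1], b[0]))  # sort by row, then x
        merged = []
        for box in boxes:
            if merged and box[0] - merged[-1][2] <= horizontal_gap and abs(box[1] - merged[-1][1]) < 1e-6:
                last = merged[-1]
                # Fuse with the previous box on the same line
                merged[-1] = (last[0], min(last[1], box[1]), max(last[2], box[2]), max(last[3], box[3]))
            else:
                merged.append(box)
        return merged

    # merge_close_boxes_sketch([(0, 0, 10, 5), (12, 0, 20, 5)]) -> [(0, 0, 20, 5)]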
+def redact_image_pdf(file_path:str, prepared_pdf_file_paths:List[str], language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, is_a_pdf:bool=True, page_min:int=0, page_max:int=999, analysis_type:str="Quick image analysis - typed text", handwrite_signature_checkbox:List[str]=["Redact all identified handwriting", "Redact all identified signatures"], request_metadata:str="", current_loop_page:int=0, page_break_return:bool=False, images=[], annotations_all_pages:List=[], all_line_level_ocr_results_df = pd.DataFrame(), all_decision_process_table = pd.DataFrame(), pymupdf_doc = [], page_break_val:int=int(page_break_value), logging_file_paths:List=[], max_time:int=int(max_time_value), progress=Progress(track_tqdm=True)):
+
    '''
+    This function redacts sensitive information from a PDF document. It takes the following parameters:
+
+    - file_path (str): The path to the PDF file to be redacted.
+    - prepared_pdf_file_paths (List[str]): A list of paths to the PDF file pages converted to images.
+    - language (str): The language of the text in the PDF.
+    - chosen_redact_entities (List[str]): A list of entity types to redact from the PDF.
+    - allow_list (List[str], optional): A list of entity types to allow in the PDF. Defaults to None.
+    - is_a_pdf (bool, optional): Indicates if the input file is a PDF. Defaults to True.
+    - page_min (int, optional): The minimum page number to start redaction from. Defaults to 0.
+    - page_max (int, optional): The maximum page number to end redaction at. Defaults to 999.
+    - analysis_type (str, optional): The type of analysis to perform on the PDF. Defaults to "Quick image analysis - typed text".
+    - handwrite_signature_checkbox (List[str], optional): A list of options for redacting handwriting and signatures. Defaults to ["Redact all identified handwriting", "Redact all identified signatures"].
+    - request_metadata (str, optional): Metadata related to the redaction request. Defaults to an empty string.
+    - current_loop_page (int, optional): The current page being processed in the loop. Defaults to 0.
+    - page_break_return (bool, optional): Indicates if the function should return after a page break. Defaults to False.
+    - page_break_val (int, optional): The number of pages after which to trigger a page break. Defaults to the 'page_break_value' environment variable (500 unless overridden).
+    - max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
+    - progress (Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
+
+    The function returns a redacted PDF document.
    '''
    file_name = get_file_path_end(file_path)
    fill = (0, 0, 0)  # Fill colour
    image_analyser = CustomImageAnalyzerEngine(nlp_analyser)

+   #print("pymupdf_doc at start of redact_image_pdf function:", pymupdf_doc)
+
+   tic = time.perf_counter()

    if not prepared_pdf_file_paths:
        out_message = "PDF does not exist as images. Converting pages to image"

        prepared_pdf_file_paths = process_file(file_path)

    number_of_pages = len(prepared_pdf_file_paths)
    print("Number of pages:", str(number_of_pages))

    # Check that page_min and page_max are within expected ranges
    if page_max > number_of_pages or page_max == 0:
        page_max = number_of_pages

+   if page_min <= 0: page_min = 0
+   else: page_min = page_min - 1

    print("Page range:", str(page_min + 1), "to", str(page_max))
+   print("Current_loop_page:", current_loop_page)

    if analysis_type == "Quick image analysis - typed text": ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + ".csv"
    elif analysis_type == "Complex image analysis - docs with handwriting/signatures (AWS Textract)": ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + "_textract.csv"

+   if current_loop_page == 0: page_loop_start = 0
+   else: page_loop_start = current_loop_page
+
+   #progress_bar = progress.tqdm(range(page_loop_start, number_of_pages), unit="pages", desc="Redacting pages")
+   progress_bar = tqdm(range(page_loop_start, number_of_pages), unit="pages remaining", desc="Redacting pages")
+
+   for page_no in progress_bar:

        handwriting_or_signature_boxes = []
        signature_recogniser_results = []
        handwriting_recogniser_results = []
+       page_break_return = False

+       reported_page_number = str(page_no + 1)
+       print("Redacting page:", reported_page_number)

        # Assuming prepared_pdf_file_paths[page_no] is a PIL image object
        try:
            image = prepared_pdf_file_paths[page_no]#.copy()
            #print("image:", image)
        except Exception as e:
            print("Could not redact page:", reported_page_number, "due to:")
+           print(e)
            continue

+       image_annotations = {"image": image, "boxes": []}
        pymupdf_page = pymupdf_doc.load_page(page_no)

+       if page_no >= page_min and page_no < page_max:

+           #print("Image is in range of pages to redact")
+           if isinstance(image, str):
+               #print("image is a file path")
+               image = Image.open(image)

            # Need image size to convert textract OCR outputs to the correct sizes
            page_width, page_height = image.size

            # Combine OCR results
            line_level_ocr_results, line_level_ocr_results_with_children = combine_ocr_results(word_level_ocr_results)

            # Import results from json and convert
            if analysis_type == "Complex image analysis - docs with handwriting/signatures (AWS Textract)":

                image.save(image_buffer, format='PNG')  # Save as PNG, or adjust format if needed
                pdf_page_as_bytes = image_buffer.getvalue()

+               #json_file_path = output_folder + file_name + "_page_" + reported_page_number + "_textract.json"
+               json_file_path = output_folder + file_name + "_textract.json"

                if not os.path.exists(json_file_path):
+                   text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number)  # Analyse page with Textract
                    logging_file_paths.append(json_file_path)
                    request_metadata = request_metadata + "\n" + new_request_metadata
+
+                   wrapped_text_blocks = {"pages":[text_blocks]}
+
+                   # Write the updated existing_data back to the JSON file
+                   with open(json_file_path, 'w') as json_file:
+                       json.dump(wrapped_text_blocks, json_file, indent=4)  # indent=4 makes the JSON file pretty-printed
                else:
                    # Open the file and load the JSON data
+                   print("Found existing Textract json results file.")
                    with open(json_file_path, 'r') as json_file:
+                       existing_data = json.load(json_file)
+
+                   # Check if the current reported_page_number exists in the loaded JSON
+                   page_exists = any(page['page_no'] == reported_page_number for page in existing_data.get("pages", []))
+
+                   if not page_exists:  # If the page does not exist, analyze again
+                       print(f"Page number {reported_page_number} not found in existing data. Analyzing again.")
+                       text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number)  # Analyse page with Textract
+
+                       # Check if "pages" key exists, if not, initialize it as an empty list
+                       if "pages" not in existing_data:
+                           existing_data["pages"] = []
+
+                       # Append the new page data
+                       existing_data["pages"].append(text_blocks)
+
+                       # Write the updated existing_data back to the JSON file
+                       with open(json_file_path, 'w') as json_file:
+                           json.dump(existing_data, json_file, indent=4)  # indent=4 makes the JSON file pretty-printed
+
+                       logging_file_paths.append(json_file_path)
+                       request_metadata = request_metadata + "\n" + new_request_metadata
+                   else:
+                       # If the page exists, retrieve the data
+                       text_blocks = next(page['data'] for page in existing_data["pages"] if page['page_no'] == reported_page_number)

+               line_level_ocr_results, handwriting_or_signature_boxes, signature_recogniser_results, handwriting_recogniser_results, line_level_ocr_results_with_children = json_to_ocrresult(text_blocks, page_width, page_height, reported_page_number)

            # Step 2: Analyze text and identify PII
            if chosen_redact_entities:
                redaction_bboxes = image_analyser.analyze_text(
                    line_level_ocr_results,
                    line_level_ocr_results_with_children,

            else:
                redaction_bboxes = []

+           #print("\nsignature_recogniser_boxes:", signature_recogniser_results)
+           #print("\nhandwriting_recogniser_boxes:", handwriting_recogniser_results)
+           #print("\nredaction_bboxes:", redaction_bboxes)

            if analysis_type == "Quick image analysis - typed text": interim_results_file_path = output_folder + "interim_analyser_bboxes_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + ".txt"
            elif analysis_type == "Complex image analysis - docs with handwriting/signatures (AWS Textract)": interim_results_file_path = output_folder + "interim_analyser_bboxes_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + "_textract.txt"

            # Merge close bounding boxes
            merged_redaction_bboxes = merge_img_bboxes(redaction_bboxes, line_level_ocr_results_with_children, signature_recogniser_results, handwriting_recogniser_results, handwrite_signature_checkbox)

            # 3. Draw the merged boxes
            if is_pdf(file_path) == False:
                draw = ImageDraw.Draw(image)

            ## Apply annotations with pymupdf
            else:
+               pymupdf_page, image_annotations = redact_page_with_pymupdf(pymupdf_page, merged_redaction_bboxes, image)

            # Convert decision process to table
            decision_process_table = pd.DataFrame([{

        all_line_level_ocr_results_df = pd.concat([all_line_level_ocr_results_df, line_level_ocr_results_df])

+       toc = time.perf_counter()
+       time_taken = toc - tic
+
+       #print("toc - tic:", time_taken)
+
+       # Break if time taken is greater than max_time seconds
+       if time_taken > max_time:
+           print("Processing for", max_time, "seconds, breaking loop.")
+           page_break_return = True
+           progress.close(_tqdm=progress_bar)
+           tqdm._instances.clear()
+
+           if is_pdf(file_path) == False:
+               images.append(image)
+               pymupdf_doc = images
+
+           annotations_all_pages.append(image_annotations)
+
+           current_loop_page += 1
+
+           return pymupdf_doc, all_decision_process_table, logging_file_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df

        if is_pdf(file_path) == False:
            images.append(image)
            pymupdf_doc = images

+       annotations_all_pages.append(image_annotations)

+       current_loop_page += 1

+       # Break if new page is a multiple of chosen page_break_val
+       if current_loop_page % page_break_val == 0:
+           page_break_return = True
+           progress.close(_tqdm=progress_bar)
+           tqdm._instances.clear()

+           return pymupdf_doc, all_decision_process_table, logging_file_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df
+
+   return pymupdf_doc, all_decision_process_table, logging_file_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df
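The Textract change above replaces one JSON file per page with a single per-document cache: a "pages" list inside one <file>_textract.json, where a page is only sent to Textract when it is not already cached. An illustrative sketch of that scheme (the exact shape of the stored page entries is an assumption inferred from the lookups above; fetch_page is a stand-in for analyse_page_with_textract):

    import json, os

    def get_textract_blocks_sketch(json_file_path: str, page_no: str, fetch_page):
        if not os.path.exists(json_file_path):
            existing_data = {"pages": []}
        else:
            with open(json_file_path, 'r') as f:
                existing_data = json.load(f)

        for page in existing_data.get("pages", []):
            if page.get('page_no') == page_no:
                return page['data']  # cache hit: no new Textract request

        text_blocks = fetch_page(page_no)  # cache miss: analyse and store
        existing_data.setdefault("pages", []).append({'page_no': page_no, 'data': text_blocks})
        with open(json_file_path, 'w') as f:
            json.dump(existing_data, f, indent=4)
        return text_blocks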
###

        return characters
    return []
+def initial_clean(text):
+    #### Some of my cleaning functions
+    html_pattern_regex = r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});|\xa0| '
+    html_start_pattern_end_dots_regex = r'<(.*?)\.\.'
+    non_ascii_pattern = r'[^\x00-\x7F]+'
+    multiple_spaces_regex = r'\s{2,}'
+
+    # Define a list of patterns and their replacements
+    patterns = [
+        (html_pattern_regex, ' '),
+        (html_start_pattern_end_dots_regex, ' '),
+        (non_ascii_pattern, ' '),
+        (multiple_spaces_regex, ' ')
+    ]
+
+    # Apply each regex replacement
+    for pattern, replacement in patterns:
+        text = re.sub(pattern, replacement, text)
+
+    return text
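A quick usage sketch of initial_clean as defined above: HTML tags, HTML entities and non-ASCII characters are replaced by spaces, and runs of whitespace are collapsed, so Presidio analyses plain ASCII text. The exact spacing of the output depends on the patterns above.

    cleaned = initial_clean("<b>Contact:&nbsp;Jos\u00e9 Smith</b>")
    print(cleaned)  # roughly: " Contact: Jos Smith "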
def analyse_text_container(text_container:OCRResult, language:str, chosen_redact_entities:List[str], score_threshold:float, allow_list:List[str]):
|
1076 |
'''
|
1077 |
Take text and bounding boxes in OCRResult format and analyze it for PII using spacy and the Microsoft Presidio package.
|
|
|
1079 |
|
1080 |
analyser_results = []
|
1081 |
|
1082 |
+
#text_to_analyse = initial_clean(text_container.text).strip()
|
1083 |
+
|
1084 |
+
text_to_analyse = initial_clean(text_container.text)
|
1085 |
|
1086 |
if chosen_redact_entities:
|
1087 |
+
#print("Running Presidio analyze method. text_to_analyse:", text_to_analyse)
|
1088 |
+
|
1089 |
+
analyser_results = nlp_analyser.analyze(text=text_to_analyse,
|
1090 |
language=language,
|
1091 |
entities=chosen_redact_entities,
|
1092 |
score_threshold=score_threshold,
|
1093 |
return_decision_process=True,
|
1094 |
+
allow_list=allow_list)
|
1095 |
|
1096 |
return analyser_results
|
1097 |
|
1098 |
+
|
1099 |
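Behind analyse_text_container, nlp_analyser is a Presidio AnalyzerEngine (built in tools/load_spacy_model_custom_recognisers.py). A minimal standalone sketch of the equivalent call, requiring only presidio-analyzer and a spaCy model; note that the allow_list argument used above is only available in recent Presidio releases:

from presidio_analyzer import AnalyzerEngine

analyzer = AnalyzerEngine()  # default spaCy-backed NLP engine
results = analyzer.analyze(
    text="My name is Jane Doe, email jane.doe@example.com",
    language="en",
    entities=["PERSON", "EMAIL_ADDRESS"],
    score_threshold=0.3,
    return_decision_process=True,
)
for result in results:
    print(result.entity_type, result.start, result.end, round(result.score, 2))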
 def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tuple[List[OCRResult], List[LTChar]]:
     '''
     Create an OCRResult object based on a list of pdfminer LTChar objects.
...
 
     # Initialize variables
     full_text = ""
+    added_text = ""
     overall_bbox = [float('inf'), float('inf'), float('-inf'), float('-inf')]  # [x0, y0, x1, y1]
     word_bboxes = []
...
 
         if isinstance(char, LTAnno):
             # Handle space separately by finalizing the word
             full_text += char.get_text()  # Adds space or newline
+
             if current_word:  # Only finalize if there is a current word
                 word_bboxes.append((current_word, current_word_bbox))
                 current_word = ""
...
             continue
 
         # Concatenate text for LTChar
+
+        #full_text += char.get_text()
+        #added_text = re.sub(r'[^\x00-\x7F]+', ' ', char.get_text())
+        added_text = char.get_text()
+        if re.search(r'[^\x00-\x7F]', added_text):  # Matches any non-ASCII character
+            #added_text.encode('latin1', errors='replace').decode('utf-8')
+            added_text = clean_unicode_text(added_text)
+        full_text += added_text  # Add the character's text (cleaned above if it contained non-ASCII characters)
+
 
         # Update overall bounding box
         x0, y0, x1, y1 = char.bbox
...
         overall_bbox[3] = max(overall_bbox[3], y1)  # y1
 
         # Update current word
+        #current_word += char.get_text()
+        current_word += added_text
 
         # Update current word bounding box
         current_word_bbox[0] = min(current_word_bbox[0], x0)  # x0
...
         current_word_bbox[2] = max(current_word_bbox[2], x1)  # x1
         current_word_bbox[3] = max(current_word_bbox[3], y1)  # y1
 
     # Finalize the last word if any
     if current_word:
         word_bboxes.append((current_word, current_word_bbox))
 
     if full_text:
+        #print("full_text before:", full_text)
+        if re.search(r'[^\x00-\x7F]', full_text):  # Matches any non-ASCII character
+            # Convert special characters to a human-readable format
+            #full_text = full_text.encode('latin1', errors='replace').decode('utf-8')
+            full_text = clean_unicode_text(full_text)
+            #print("full_text:", full_text)
+
         line_level_results_out.append(OCRResult(full_text, round(overall_bbox[0],2), round(overall_bbox[1], 2), round(overall_bbox[2]-overall_bbox[0],2), round(overall_bbox[3]-overall_bbox[1],2)))
+
+    #line_level_characters_out = character_objects_out
 
     return line_level_results_out, line_level_characters_out  # Return both results and character objects
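create_text_bounding_boxes_from_characters rebuilds line-level text and word boxes from pdfminer's character stream, where LTChar is a drawn glyph with a bounding box and LTAnno is a virtual space or newline with no geometry. A minimal sketch of where those objects come from; the file name is illustrative:

from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer, LTChar, LTAnno

for page_layout in extract_pages("example.pdf", page_numbers=[0], maxpages=1):
    for element in page_layout:
        if isinstance(element, LTTextContainer):
            for text_line in element:
                for char in text_line:
                    if isinstance(char, LTChar):
                        print(repr(char.get_text()), char.bbox)    # glyph with (x0, y0, x1, y1)
                    elif isinstance(char, LTAnno):
                        print(repr(char.get_text()), "(no bbox)")  # virtual space/newline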
 
+def merge_text_bounding_boxes(analyser_results:CustomImageRecognizerResult, characters:List[LTChar], combine_pixel_dist:int=20, vertical_padding:int=0):
     '''
     Merge identified bounding boxes containing PII that are very close to one another
     '''
...
             current_box[3] = max(current_box[3], char_box[3])  # Ensure the top is the highest
             current_result.end = max(current_result.end, result.end)  # Extend the text range
             try:
+                current_result.entity_type = current_result.entity_type + " - " + result.entity_type
+            except Exception as e:
+                print("Unable to combine result entity types:")
+                print(e)
             # Add a space if current_text is not empty
             if current_text:
                 current_text.append(" ")  # Add space between texts
...
         annotations_on_page.append(annotation)
     return annotations_on_page
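merge_text_bounding_boxes (like its image counterpart merge_img_bboxes) joins PII hits whose boxes sit within combine_pixel_dist pixels of each other on a line, so a single redaction rectangle can cover a multi-word match. A simplified sketch of the distance test on bare coordinate tuples, not the function's actual internals:

def merge_close_boxes(boxes, combine_pixel_dist=20):
    # boxes: (x0, y0, x1, y1) tuples, sorted left-to-right along one line
    merged = []
    for box in boxes:
        if merged and box[0] - merged[-1][2] <= combine_pixel_dist:
            last = merged[-1]  # close enough: extend the previous box
            merged[-1] = (last[0], min(last[1], box[1]), box[2], max(last[3], box[3]))
        else:
            merged.append(box)
    return merged

print(merge_close_boxes([(10, 0, 50, 12), (55, 0, 90, 12), (200, 0, 240, 12)]))
# -> [(10, 0, 90, 12), (200, 0, 240, 12)]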
 
+def redact_text_pdf(
+    filename: str,  # Path to the PDF file to be redacted
+    prepared_pdf_image_path: str,  # Path to the prepared PDF image for redaction
+    language: str,  # Language of the PDF content
+    chosen_redact_entities: List[str],  # List of entities to be redacted
+    allow_list: List[str] = None,  # Optional list of allowed entities
+    page_min: int = 0,  # Minimum page number to start redaction
+    page_max: int = 999,  # Maximum page number to end redaction
+    analysis_type: str = "Simple text analysis - PDFs with selectable text",  # Type of analysis to perform
+    current_loop_page: int = 0,  # Current page being processed in the loop
+    page_break_return: bool = False,  # Flag to indicate if a page break should be returned
+    annotations_all_pages: List = [],  # List of annotations across all pages
+    all_line_level_ocr_results_df: pd.DataFrame = pd.DataFrame(),  # DataFrame for OCR results
+    all_decision_process_table: pd.DataFrame = pd.DataFrame(),  # DataFrame for decision process table
+    pymupdf_doc: List = [],  # List of PyMuPDF documents
+    page_break_val: int = int(page_break_value),  # Value for page break
+    max_time: int = int(max_time_value),  # Maximum processing time (s) before breaking out of the loop
+    progress: Progress = Progress(track_tqdm=True)  # Progress tracking object
+):
+
     '''
+    Redact chosen entities from a multi-page PDF with selectable text (i.e. pages that are not images).
 
+    Input Variables:
+    - filename: Path to the PDF file to be redacted
+    - prepared_pdf_image_path: Path to the prepared PDF image for redaction
+    - language: Language of the PDF content
+    - chosen_redact_entities: List of entities to be redacted
+    - allow_list: Optional list of allowed entities
+    - page_min: Minimum page number to start redaction
+    - page_max: Maximum page number to end redaction
+    - analysis_type: Type of analysis to perform
+    - current_loop_page: Current page being processed in the loop
+    - page_break_return: Flag to indicate if a page break should be returned
+    - annotations_all_pages: List of annotations across all pages
+    - all_line_level_ocr_results_df: DataFrame for OCR results
+    - all_decision_process_table: DataFrame for decision process table
+    - pymupdf_doc: List of PyMuPDF documents
+    - page_break_val: Value for page break
+    - max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks, to avoid timeout errors with some APIs.
+    - progress: Progress tracking object
+    '''
+
+    tic = time.perf_counter()
 
     # Open with Pikepdf to get text lines
     pikepdf_pdf = Pdf.open(filename)
     number_of_pages = len(pikepdf_pdf.pages)
 
     # Check that page_min and page_max are within expected ranges
     if page_max > number_of_pages or page_max == 0:
         page_max = number_of_pages
 
     if page_min <= 0: page_min = 0
     else: page_min = page_min - 1
 
     print("Page range is",str(page_min + 1), "to", str(page_max))
+    print("Current_loop_page:", current_loop_page)
+
+    if current_loop_page == 0: page_loop_start = 0
+    else: page_loop_start = current_loop_page
+
+    #progress_bar = progress.tqdm(range(current_loop_page, number_of_pages), unit="pages", desc="Redacting pages")
+    progress_bar = tqdm(range(current_loop_page, number_of_pages), unit="pages remaining", desc="Redacting pages")
 
     #for page_no in range(0, number_of_pages):
+    for page_no in progress_bar:
 
+        reported_page_number = str(page_no + 1)
+        print("Redacting page:", reported_page_number)
 
+        # Assuming prepared_pdf_file_paths[page_no] is a PIL image object
+        try:
+            image = prepared_pdf_image_path[page_no]#.copy()
+            #print("image:", image)
+        except Exception as e:
+            print("Could not redact page:", reported_page_number, "due to:")
+            print(e)
+            continue
+
+        image_annotations = {"image": image, "boxes": []}
         pymupdf_page = pymupdf_doc.load_page(page_no)
 
+        #print("pymupdf page loaded")
+
+        #print("Page number is:", str(page_no + 1))
 
         if page_min <= page_no < page_max:
 
+            #print("Page is in range of pages to redact")
+
             for page_layout in extract_pages(filename, page_numbers = [page_no], maxpages=1):
 
                 page_analyser_results = []
...
                 page_text_outputs = pd.DataFrame()
 
                 if analysis_type == "Simple text analysis - PDFs with selectable text":
+                    for n, text_container in enumerate(page_layout):
 
                         text_container_analyser_results = []
                         text_container_analysed_bounding_boxes = []
+                        characters = []
 
+                        if isinstance(text_container, LTTextContainer) or isinstance(text_container, LTAnno):
+                            characters = get_text_container_characters(text_container)
 
                         # Create dataframe for all the text on the page
                         line_level_text_results_list, line_characters = create_text_bounding_boxes_from_characters(characters)
 
                         # Create page_text_outputs (OCR format outputs)
                         if line_level_text_results_list:
                             # Convert to DataFrame and add to ongoing logging table
...
 
                         # Analyse each line of text in turn for PII and add to list
                         for i, text_line in enumerate(line_level_text_results_list):
 
                             text_line_analyser_result = []
                             text_line_bounding_boxes = []
 
                             text_line_analyser_result = analyse_text_container(text_line, language, chosen_redact_entities, score_threshold, allow_list)
 
                             # Merge bounding boxes for the line if multiple found close together
                             if text_line_analyser_result:
+
+                                #print("Analysed text container, now merging bounding boxes")
+
                                 # Merge bounding boxes if very close together
+                                text_line_bounding_boxes = merge_text_bounding_boxes(text_line_analyser_result, line_characters[i])
+
+                                #print("merged bounding boxes")
 
                             text_container_analyser_results.extend(text_line_analyser_result)
                             text_container_analysed_bounding_boxes.extend(text_line_bounding_boxes)
+
                 page_analyser_results.extend(text_container_analyser_results)
                 page_analysed_bounding_boxes.extend(text_container_analysed_bounding_boxes)
 
                 # Annotate redactions on page
                 annotations_on_page = create_annotations_for_bounding_boxes(page_analysed_bounding_boxes)
 
+                # Make pymupdf page redactions
                 pymupdf_page, image_annotations = redact_page_with_pymupdf(pymupdf_page, annotations_on_page, image)
 
+                #print("Did redact_page_with_pymupdf function")
 
                 print("For page number:", page_no, "there are", len(image_annotations["boxes"]), "annotations")
 
                 # Write logs
                 # Create decision process table
+                decision_process_table_on_page = create_text_redaction_process_results(page_analyser_results, page_analysed_bounding_boxes, current_loop_page)
 
                 if not decision_process_table_on_page.empty:
+                    all_decision_process_table = pd.concat([all_decision_process_table, decision_process_table_on_page])
 
                 if not page_text_outputs.empty:
                     page_text_outputs = page_text_outputs.sort_values(["top", "left"], ascending=[False, False]).reset_index(drop=True)
+                    all_line_level_ocr_results_df = pd.concat([all_line_level_ocr_results_df, page_text_outputs])
+
+        toc = time.perf_counter()
+
+        time_taken = toc - tic
+
+        #print("toc - tic:", time_taken)
+
+        # Break if time taken is greater than max_time seconds
+        if time_taken > max_time:
+            print("Processing for", max_time, "seconds, breaking.")
+            page_break_return = True
+            progress.close(_tqdm=progress_bar)
+            tqdm._instances.clear()
+
+            annotations_all_pages.append(image_annotations)
+
+            current_loop_page += 1
+
+            return pymupdf_doc, all_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return
+
+
+        annotations_all_pages.append(image_annotations)
+
+        current_loop_page += 1
+
+        # Break if new page is a multiple of the chosen page_break_val
+        if current_loop_page % page_break_val == 0:
+            page_break_return = True
+            progress.close(_tqdm=progress_bar)
+
+            return pymupdf_doc, all_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return
+
 
+    return pymupdf_doc, all_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return
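Since redact_text_pdf (like redact_image_pdf above) now returns current_loop_page and page_break_return instead of always running to completion, the caller is expected to keep re-invoking it until every page has been processed. A hedged sketch of that driver pattern; the function and argument names below are illustrative, not the app's actual wiring:

def run_redaction_to_completion(redact_fn, number_of_pages, **kwargs):
    # Re-invoke a resumable redaction function until all pages are processed,
    # so each individual call stays under its max_time budget.
    current_loop_page = 0
    results = None
    while current_loop_page < number_of_pages:
        results = redact_fn(current_loop_page=current_loop_page, **kwargs)
        current_loop_page = results[-2]  # position of current_loop_page in the return tuple
        page_break_return = results[-1]
        if not page_break_return:
            break  # finished without hitting the time or page-break limit
    return results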
tools/helper_functions.py
CHANGED
@@ -1,6 +1,8 @@
 import os
+import re
 import gradio as gr
 import pandas as pd
+import unicodedata
 
 def get_or_create_env_var(var_name, default_value):
     # Get the environment variable if it exists
@@ -166,7 +168,7 @@ def add_folder_to_path(folder_path: str):
 
 # Upon running a process, the feedback buttons are revealed
 def reveal_feedback_buttons():
-    return gr.Radio(visible=True), gr.Textbox(visible=True), gr.Button(visible=True), gr.Markdown(visible=True)
+    return gr.Radio(visible=True, label="Please give some feedback about the results of the redaction. A reminder that the app is only expected to identify about 60% of personally identifiable information in a given (typed) document."), gr.Textbox(visible=True), gr.Button(visible=True), gr.Markdown(visible=True)
 
 def wipe_logs(feedback_logs_loc, usage_logs_loc):
     try:
@@ -238,4 +240,26 @@ async def get_connection_params(request: gr.Request):
         return out_session_hash, output_folder, out_session_hash
     else:
         print("No session parameters found.")
-        return "",""
+        return "",""
+
+
+def clean_unicode_text(text):
+    # Step 1: Normalize unicode characters to decompose any special forms
+    normalized_text = unicodedata.normalize('NFKC', text)
+
+    # Step 2: Replace smart quotes and special punctuation with standard ASCII equivalents
+    replacements = {
+        '‘': "'", '’': "'", '“': '"', '”': '"',
+        '–': '-', '—': '-', '…': '...', '•': '*',
+    }
+
+    # Perform replacements
+    for old_char, new_char in replacements.items():
+        normalized_text = normalized_text.replace(old_char, new_char)
+
+    # Step 3: Optionally remove non-ASCII characters if needed
+    # This regex removes any remaining non-ASCII characters, if desired.
+    # Comment this line if you want to keep all Unicode characters.
+    cleaned_text = re.sub(r'[^\x00-\x7F]+', '', normalized_text)
+
+    return cleaned_text
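A quick illustration of what clean_unicode_text does to typical "smart" punctuation; this assumes the module above is importable as tools.helper_functions:

from tools.helper_functions import clean_unicode_text

print(clean_unicode_text("“Smart” quotes – an ellipsis… and café"))
# -> '"Smart" quotes - an ellipsis... and caf'
# (the final regex strips any remaining non-ASCII characters, so the 'é' is dropped)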
tools/load_spacy_model_custom_recognisers.py
CHANGED
@@ -141,7 +141,7 @@ class LoadedSpacyNlpEngine(SpacyNlpEngine):
         self.nlp = {"en": loaded_spacy_model}
 
 # %%
-#
+#Load spacy model
 try:
     import en_core_web_lg
     nlp = en_core_web_lg.load()
@@ -151,6 +151,16 @@ except:
     download("en_core_web_lg")
     nlp = spacy.load("en_core_web_lg")
     print("Successfully downloaded and imported spaCy model")
+
+# try:
+#     import en_core_web_sm
+#     nlp = en_core_web_sm.load()
+#     print("Successfully imported spaCy model")
+
+# except:
+#     download("en_core_web_sm")
+#     nlp = spacy.load("en_core_web_sm")
+#     print("Successfully downloaded and imported spaCy model")
 
 # Pass the loaded model to the new LoadedSpacyNlpEngine
 loaded_nlp_engine = LoadedSpacyNlpEngine(loaded_spacy_model = nlp)
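The try/except import pattern above makes the model download a one-time cost. The same idea as a compact helper; en_core_web_lg matches the model used here, and spacy.load raises OSError when a model package is missing:

import spacy
from spacy.cli import download

def load_model(model_name="en_core_web_lg"):
    # Load a spaCy pipeline, downloading the model package on first use
    try:
        return spacy.load(model_name)
    except OSError:
        download(model_name)
        return spacy.load(model_name)

nlp = load_model()
print([(ent.text, ent.label_) for ent in nlp("Jane Doe lives in London.").ents])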
tools/redaction_review.py
CHANGED
@@ -18,9 +18,9 @@ def decrease_page(number:int):
     '''
     #print("number:", str(number))
     if number > 1:
-        return number - 1
+        return number - 1, number - 1
     else:
-        return 1
+        return 1, 1
 
 def increase_page(number:int, image_annotator_object:AnnotatedImageData):
     '''
@@ -28,14 +28,14 @@ def increase_page(number:int, image_annotator_object:AnnotatedImageData):
     '''
 
     if not image_annotator_object:
-        return 1
+        return 1, 1
 
     max_pages = len(image_annotator_object)
 
     if number < max_pages:
-        return number + 1
+        return number + 1, number + 1
     else:
-        return max_pages
+        return max_pages, max_pages
 
 def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int):
     # print("\nImage annotator object:", image_annotator_object)
@@ -51,7 +51,7 @@ def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int):
         show_share_button=False,
         show_remove_button=False,
         interactive=False
-        ), gr.Number(label = "
+        ), gr.Number(label = "Page (press enter to change)", value=1, precision=0)
 
     if page_num is None:
         page_num = 0
@@ -89,9 +89,9 @@ def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int):
         interactive=True
     )
 
-    number_reported = gr.Number(label = "
+    number_reported = gr.Number(label = "Page (press enter to change)", value=page_num_reported, precision=0)
 
-    return out_image_annotator, number_reported
+    return out_image_annotator, number_reported, number_reported
 
 def modify_existing_page_redactions(image_annotated:AnnotatedImageData, current_page:int, previous_page:int, all_image_annotations:List[AnnotatedImageData]):
     '''
@@ -99,7 +99,7 @@ def modify_existing_page_redactions(image_annotated:AnnotatedImageData, current_
     '''
     #If no previous page or is 0, i.e. first time run, then make no changes
     if not previous_page:
-        return all_image_annotations, current_page
+        return all_image_annotations, current_page, current_page
 
     if not current_page:
         current_page = 1
@@ -114,7 +114,7 @@ def modify_existing_page_redactions(image_annotated:AnnotatedImageData, current_
 
     #print("all_image_annotations after:",all_image_annotations)
 
-    return all_image_annotations, current_page
+    return all_image_annotations, current_page, current_page
 
 def apply_redactions(image_annotated:AnnotatedImageData, file_paths:str, doc:Document, all_image_annotations:List[AnnotatedImageData], current_page:int, progress=gr.Progress(track_tqdm=True)):
     '''
@@ -132,7 +132,11 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:str, doc:Doc
         print("No image annotations found")
         return doc, all_image_annotations
 
-
+    if isinstance(file_paths, list):
+        file_path = file_paths[-1].name
+    else:
+        file_path = file_paths
+
     print("file_path:", file_path)
     file_base = get_file_path_end(file_path)
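The doubled return values in decrease_page and increase_page (for example number - 1, number - 1) let one callback update both the visible page number box and a second component that tracks the previous page, which modify_existing_page_redactions uses to detect page changes. A minimal sketch of that wiring; the component and function names here are illustrative, not the app's exact layout:

import gradio as gr

def decrease_page(number):
    new_page = max(1, number - 1)
    return new_page, new_page  # update the visible box and the tracked copy together

with gr.Blocks() as demo:
    page_box = gr.Number(value=1, precision=0, label="Page (press enter to change)")
    page_state = gr.Number(value=1, precision=0, visible=False)
    prev_btn = gr.Button("Previous page")
    prev_btn.click(decrease_page, inputs=page_box, outputs=[page_box, page_state])

demo.launch()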