Commit 542c252
Parent: 390bef2

Consolidated AWS Comprehend redaction calls to reduce total number

Files changed:
- app.py (+1, -1)
- tools/custom_image_analyser_engine.py (+133, -103)
- tools/file_redaction.py (+95, -82)
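
Both tools/ files adopt the same batch-and-remap pattern: short OCR/text lines are concatenated (separated by " | ") into one string, a single detect_pii_entities call is made per roughly 200-character batch, and each returned entity's offsets are shifted back into the coordinates of the line it came from. A minimal, self-contained sketch of that pattern (the `detect` callable stands in for `comprehend_client.detect_pii_entities` and is an assumption for illustration, as are all names here):

# Minimal sketch of the batch-and-remap pattern used in this commit.
# `detect` stands in for comprehend_client.detect_pii_entities and is
# assumed to return a list of dicts with BeginOffset/EndOffset keys.
from typing import Callable, Dict, List, Tuple

SEPARATOR = " | "   # separator chosen as unlikely to appear in OCR text
FLUSH_AT = 200      # flush the batch once it reaches ~200 characters

def batch_detect(lines: List[str], detect: Callable[[str], List[Dict]]) -> List[List[Dict]]:
    results: List[List[Dict]] = [[] for _ in lines]
    batch = ""
    mapping: List[Tuple[int, int]] = []  # (start position in batch, line index)

    for i, line in enumerate(lines):
        if batch:
            batch += SEPARATOR
        mapping.append((len(batch), i))
        batch += line

        if len(batch) >= FLUSH_AT or i == len(lines) - 1:
            for entity in detect(batch):  # one API call covers many lines
                for start, idx in mapping:
                    end = start + len(lines[idx])
                    if start <= entity["BeginOffset"] < end:
                        adjusted = dict(entity)
                        # shift offsets back into the original line's coordinates
                        adjusted["BeginOffset"] = entity["BeginOffset"] - start
                        adjusted["EndOffset"] = min(entity["EndOffset"] - start, len(lines[idx]))
                        results[idx].append(adjusted)
                        break  # line ranges are disjoint, so at most one match
            batch, mapping = "", []

    return results

Note the min(...) clamp: an entity that spans a separator or crosses into the next line is truncated at the end of the line where it starts, which mirrors the behaviour of the committed code below.
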
app.py CHANGED

@@ -318,7 +318,7 @@ with app:
 
     # If the output file count text box changes, keep going with redacting each data file until done
     text_tabular_files_done.change(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, second_loop_state], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state]).\
-    then(fn = reveal_feedback_buttons, outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title])
+    then(fn = reveal_feedback_buttons, outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title])
 
     ###
     # APP LOAD AND LOGGING
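
The app.py hunk only touches the continuation line of a Gradio event chain. For readers unfamiliar with the pattern, `change(...).then(...)` registers a follow-up callback that runs after the first one completes; a stripped-down illustration (component names here are hypothetical, not the app's):

import gradio as gr

def process(text):
    return f"processed: {text}"

def reveal_feedback():
    # Make the hidden feedback button visible once processing finishes
    return gr.update(visible=True)

with gr.Blocks() as demo:
    box = gr.Textbox(label="Input")
    out = gr.Textbox(label="Output")
    feedback_btn = gr.Button("Send feedback", visible=False)

    # Run process on every change, then reveal the feedback control
    box.change(fn=process, inputs=box, outputs=out).\
        then(fn=reveal_feedback, outputs=feedback_btn)

demo.launch()
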
tools/custom_image_analyser_engine.py CHANGED

@@ -477,122 +477,152 @@ class CustomImageAnalyzerEngine:
         allow_list = text_analyzer_kwargs.get('allow_list', [])
 
         combined_results = []
-
-        # Analyze each OCR result (line) individually
+        # Initialize variables for batching
+        current_batch = ""
+        current_batch_mapping = [] # List of (start_pos, line_index, original_text) tuples
+        analyzer_results_by_line = [[] for _ in line_level_ocr_results] # Store results for each line
 
+        # Process OCR results in batches
+        for i, line_level_ocr_result in enumerate(line_level_ocr_results):
            if pii_identification_method == "Local":
                analyzer_result = self.analyzer_engine.analyze(
                    text=line_level_ocr_result.text, **text_analyzer_kwargs
                )
+               analyzer_results_by_line[i] = analyzer_result
 
            elif pii_identification_method == "AWS Comprehend":
                if len(line_level_ocr_result.text) >= 3:
-                   try:
-                       # Call the detect_pii_entities method
-                       response = comprehend_client.detect_pii_entities(
-                           Text=line_level_ocr_result.text,
-                           LanguageCode=text_analyzer_kwargs["language"] # Specify the language of the text
-                       )
-                   except Exception as e:
-                       print(e)
-                       time.sleep(3)
-
-                       response = comprehend_client.detect_pii_entities(
-                           Text=line_level_ocr_result.text,
-                           LanguageCode=text_analyzer_kwargs["language"] # Specify the language of the text
-                       )
-
-                   comprehend_query_number += 1
-
-                   if response:
-                       for result in response["Entities"]:
-                           result_text = line_level_ocr_result.text[result["BeginOffset"]:result["EndOffset"]+1]
-
-                           if result_text not in allow_list:
-                               if result.get("Type") in chosen_redact_comprehend_entities:
-                                   recogniser_entity = recognizer_result_from_dict(result)
-                                   analyzer_result.append(recogniser_entity)
-
-           if i < len(ocr_results_with_children): # Check if i is a valid index
+                   # Add line to current batch with a separator
+                   if current_batch:
+                       current_batch += " | " # Use a separator that's unlikely to appear in the text
+
+                   start_pos = len(current_batch)
+                   current_batch += line_level_ocr_result.text
+                   current_batch_mapping.append((start_pos, i, line_level_ocr_result.text))
+
+                   # Process batch if it's approaching 300 characters or this is the last line
+                   if len(current_batch) >= 200 or i == len(line_level_ocr_results) - 1:
+                       print("length of text for Comprehend:", len(current_batch))
+
+                       try:
+                           response = comprehend_client.detect_pii_entities(
+                               Text=current_batch,
+                               LanguageCode=text_analyzer_kwargs["language"]
+                           )
+                       except Exception as e:
+                           print(e)
+                           time.sleep(3)
+                           response = comprehend_client.detect_pii_entities(
+                               Text=current_batch,
+                               LanguageCode=text_analyzer_kwargs["language"]
+                           )
+
+                       comprehend_query_number += 1
+
+                       # Map results back to original lines
+                       if response and "Entities" in response:
+                           for entity in response["Entities"]:
+                               entity_start = entity["BeginOffset"]
+                               entity_end = entity["EndOffset"]
+
+                               # Find which line this entity belongs to
+                               for batch_start, line_idx, original_text in current_batch_mapping:
+                                   batch_end = batch_start + len(original_text)
+
+                                   # Check if entity belongs to this line
+                                   if batch_start <= entity_start < batch_end:
+                                       # Adjust offsets relative to the original line
+                                       relative_start = entity_start - batch_start
+                                       relative_end = min(entity_end - batch_start, len(original_text))
+
+                                       result_text = original_text[relative_start:relative_end]
+
+                                       if result_text not in allow_list:
+                                           if entity.get("Type") in chosen_redact_comprehend_entities:
+                                               # Create a new entity with adjusted positions
+                                               adjusted_entity = entity.copy()
+                                               adjusted_entity["BeginOffset"] = relative_start
+                                               adjusted_entity["EndOffset"] = relative_end
+
+                                               recogniser_entity = recognizer_result_from_dict(adjusted_entity)
+                                               analyzer_results_by_line[line_idx].append(recogniser_entity)
+
+                       # Reset batch
+                       current_batch = ""
+                       current_batch_mapping = []
+
+       # Process results for each line
+       for i, analyzer_result in enumerate(analyzer_results_by_line):
+           if i >= len(ocr_results_with_children):
+               continue
 
            child_level_key = list(ocr_results_with_children.keys())[i]
-           ...
-               width=width,
-               height=height + height_buffer
-           )
-
-           if not ocr_results_with_children_line_level:
-               # Fallback to previous method if not found in ocr_results_with_children
-               print("No child info found")
-               continue
-
-           #print("result_reset_pos:", result_reset_pos)
-           #print("relevant_line_ocr_result:", relevant_line_ocr_result)
-           #print("ocr_results_with_children_line_level:", ocr_results_with_children_line_level)
+           ocr_results_with_children_line_level = ocr_results_with_children[child_level_key]
+
+           # Go through results to add bounding boxes
+           for result in analyzer_result:
+               # Extract the relevant portion of text based on start and end
+               relevant_text = line_level_ocr_results[i].text[result.start:result.end]
+
+               # Find the corresponding entry in ocr_results_with_children
+               child_words = ocr_results_with_children_line_level['words']
+
+               # Initialize bounding box values
+               left, top, bottom = float('inf'), float('inf'), float('-inf')
+               all_words = ""
+               word_num = 0 # Initialize word count
+               total_width = 0 # Initialize total width
+
+               for word_text in relevant_text.split(): # Iterate through each word in relevant_text
+                   #print("Looking for word_text:", word_text)
+                   for word in child_words:
+                       #if word['text'].strip(string.punctuation).strip() == word_text.strip(string.punctuation).strip(): # Check for exact match
+                       if word_text in word['text']:
+                           found_word = word
+                           #print("found_word:", found_word)
+
+                           if word_num == 0: # First word
+                               left = found_word['bounding_box'][0]
+                               top = found_word['bounding_box'][1]
+                           bottom = max(bottom, found_word['bounding_box'][3]) # Update bottom for all words
+                           all_words += found_word['text'] + " " # Concatenate words
+                           total_width = found_word['bounding_box'][2] - left # Add each word's width
+                           word_num += 1
+                           break # Move to the next word in relevant_text
+
+               width = total_width + horizontal_buffer # Set width to total width of all matched words
+               height = bottom - top if word_num > 0 else 0 # Calculate height
+
+               relevant_line_ocr_result = OCRResult(
+                   text=relevant_text,
+                   left=left,
+                   top=top - height_buffer,
+                   width=width,
+                   height=height + height_buffer
+               )
+
+               if not ocr_results_with_children_line_level:
+                   # Fallback to previous method if not found in ocr_results_with_children
+                   print("No child info found")
+                   continue
+
+               # Reset the word positions indicated in the relevant ocr_result - i.e. it starts from 0 and ends at word length
+               result_reset_pos = result
+               result_reset_pos.start = 0
+               result_reset_pos.end = len(relevant_text)
+
+               #print("result_reset_pos:", result_reset_pos)
+               #print("relevant_line_ocr_result:", relevant_line_ocr_result)
+               #print("ocr_results_with_children_line_level:", ocr_results_with_children_line_level)
+
+               # Map the analyzer results to bounding boxes for this line
+               line_results = self.map_analyzer_results_to_bounding_boxes(
+                   [result_reset_pos], [relevant_line_ocr_result], relevant_line_ocr_result.text, allow_list, ocr_results_with_children_line_level
+               )
+
+               #print("line_results:", line_results)
+
+               combined_results.extend(line_results)
 
        return combined_results, comprehend_query_number
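
Beyond the batching, the rewritten loop above derives one bounding box per detected entity by scanning the line's word-level boxes. A condensed sketch of that aggregation, assuming each word dict carries a `bounding_box` of `(x0, y0, x1, y1)` as in the diff (this version takes the min/max over all matched words, whereas the committed code anchors left/top to the first match):

from typing import Dict, List, Optional, Tuple

Box = Tuple[float, float, float, float]  # (x0, y0, x1, y1)

def cover_entity_words(entity_text: str, words: List[Dict]) -> Optional[Box]:
    """Return a single box covering every word of entity_text matched in words."""
    left = top = float("inf")
    right = bottom = float("-inf")
    matched = 0

    for token in entity_text.split():
        for word in words:
            if token in word["text"]:  # same loose containment test as the diff
                x0, y0, x1, y1 = word["bounding_box"]
                left, top = min(left, x0), min(top, y0)
                right, bottom = max(right, x1), max(bottom, y1)
                matched += 1
                break  # move on to the next token

    return (left, top, right, bottom) if matched else None
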
tools/file_redaction.py CHANGED

@@ -133,7 +133,7 @@ def choose_and_run_redactor(file_paths:List[str],
 
    # If this is the first time around, set variables to 0/blank
    if first_loop_state==True:
-       print("First_loop_state is True")
+       #print("First_loop_state is True")
        latest_file_completed = 0
        current_loop_page = 0
        out_file_paths = []

@@ -835,7 +835,7 @@ def redact_image_pdf(file_path:str,
    else: page_min = page_min - 1
 
    print("Page range:", str(page_min + 1), "to", str(page_max))
-   print("Current_loop_page:", current_loop_page)
+   #print("Current_loop_page:", current_loop_page)
 
    if analysis_type == "Quick image analysis - typed text": ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + ".csv"
    elif analysis_type == "Complex image analysis - docs with handwriting/signatures (AWS Textract)": ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + "_textract.csv"

@@ -1300,70 +1300,7 @@ def merge_text_bounding_boxes(analyser_results:CustomImageRecognizerResult, char
 
    return analysed_bounding_boxes
 
-def identify_pii_in_text_container(text_container:OCRResult, language:str, chosen_redact_entities:List[str], chosen_redact_comprehend_entities:List[str], score_threshold:float, allow_list:List[str], pii_identification_method:str="Local") -> List[RecognizerResult]:
-   '''
-   Take text and bounding boxes in OCRResult format and analyze it for PII using spacy and the Microsoft Presidio package, or the AWS Comprehend service.
-   '''
-   comprehend_query_number = 0
-   analyser_results = []
-   response = []
-
-   #text_to_analyse = initial_clean(text_container.text).strip()
-
-   text_to_analyse = text_container.text
-
-   if chosen_redact_entities:
-       if pii_identification_method == "Local":
-           analyser_results = nlp_analyser.analyze(text=text_to_analyse,
-               language=language,
-               entities=chosen_redact_entities,
-               score_threshold=score_threshold,
-               return_decision_process=True,
-               allow_list=allow_list)
-
-       elif pii_identification_method == "AWS Comprehend":
-
-           if len(text_to_analyse) >= 3:
-
-               try:
-                   # Call the detect_pii_entities method
-                   response = comprehend_client.detect_pii_entities(
-                       Text=text_to_analyse,
-                       LanguageCode=language # Specify the language of the text
-                   )
-               except Exception as e:
-                   print(e)
-                   time.sleep(3)
-
-                   response = comprehend_client.detect_pii_entities(
-                       Text=text_to_analyse,
-                       LanguageCode=language # Specify the language of the text
-                   )
-
-               comprehend_query_number += 1
-
-               if response:
-                   for result in response["Entities"]:
-
-                       result_text = text_to_analyse[result["BeginOffset"]:result["EndOffset"]+1]
-
-                       if result_text not in allow_list:
-                           if result.get("Type") in chosen_redact_comprehend_entities:
-
-                               recogniser_entity = recognizer_result_from_dict(result)
-
-                               analyser_results.append(recogniser_entity)
-           else:
-               analyser_results = []
-
-       else:
-           analyser_results = []
-   else:
-       analyser_results = []
-
-   return analyser_results, comprehend_query_number
 
 def create_text_redaction_process_results(analyser_results, analysed_bounding_boxes, page_num):
    decision_process_table = pd.DataFrame()

@@ -1531,27 +1468,103 @@ def redact_text_pdf(
 
        page_text_outputs = pd.concat([page_text_outputs, line_level_text_results_df])
 
-       #
-       text_line_bounding_boxes = []
-
-       # text_line_analyser_result = identify_pii_in_text_container(text_line, language, chosen_redact_entities, score_threshold, allow_list)
-
-       #pii_identification_method="AWS Comprehend"#"Local"
+       # Initialize batching variables
+       current_batch = ""
+       current_batch_mapping = [] # List of (start_pos, line_index, OCRResult) tuples
+       all_text_line_results = [] # Store results for all lines
 
+       # First pass: collect all lines into batches
+       for i, text_line in enumerate(line_level_text_results_list):
            if chosen_redact_entities:
-               ...
+               if pii_identification_method == "Local":
+                   # Process immediately for local analysis
+                   text_line_analyser_result = nlp_analyser.analyze(
+                       text=text_line.text,
+                       language=language,
+                       entities=chosen_redact_entities,
+                       score_threshold=score_threshold,
+                       return_decision_process=True,
+                       allow_list=allow_list
+                   )
+                   all_text_line_results.append((i, text_line_analyser_result))
 
+               elif pii_identification_method == "AWS Comprehend":
+                   if len(text_line.text) >= 3:
+                       # Add separator between lines
+                       if current_batch:
+                           current_batch += " | "
+
+                       start_pos = len(current_batch)
+                       current_batch += text_line.text
+                       current_batch_mapping.append((start_pos, i, text_line))
+
+                       # Process batch if approaching 300 characters or last line
+                       if len(current_batch) >= 200 or i == len(line_level_text_results_list) - 1:
+                           print("length of text for Comprehend:", len(current_batch))
+
+                           try:
+                               response = comprehend_client.detect_pii_entities(
+                                   Text=current_batch,
+                                   LanguageCode=language
+                               )
+                           except Exception as e:
+                               print(e)
+                               time.sleep(3)
+                               response = comprehend_client.detect_pii_entities(
+                                   Text=current_batch,
+                                   LanguageCode=language
+                               )
+
+                           comprehend_query_number += 1
+
+                           # Process response and map back to original lines
+                           if response and "Entities" in response:
+                               for entity in response["Entities"]:
+                                   entity_start = entity["BeginOffset"]
+                                   entity_end = entity["EndOffset"]
+
+                                   # Find which line this entity belongs to
+                                   for batch_start, line_idx, original_line in current_batch_mapping:
+                                       batch_end = batch_start + len(original_line.text)
+
+                                       # Check if entity belongs to this line
+                                       if batch_start <= entity_start < batch_end:
+                                           # Adjust offsets relative to original line
+                                           relative_start = entity_start - batch_start
+                                           relative_end = min(entity_end - batch_start, len(original_line.text))
+
+                                           result_text = original_line.text[relative_start:relative_end]
+
+                                           if result_text not in allow_list:
+                                               if entity.get("Type") in chosen_redact_comprehend_entities:
+                                                   # Create adjusted entity
+                                                   adjusted_entity = entity.copy()
+                                                   adjusted_entity["BeginOffset"] = relative_start
+                                                   adjusted_entity["EndOffset"] = relative_end
+
+                                                   recogniser_entity = recognizer_result_from_dict(adjusted_entity)
+
+                                                   # Add to results for this line
+                                                   existing_results = next((results for idx, results in all_text_line_results if idx == line_idx), [])
+                                                   if not existing_results:
+                                                       all_text_line_results.append((line_idx, [recogniser_entity]))
+                                                   else:
+                                                       existing_results.append(recogniser_entity)
+
+                           # Reset batch
+                           current_batch = ""
+                           current_batch_mapping = []
+
+       # Second pass: process results for each line
+       for i, text_line in enumerate(line_level_text_results_list):
+           text_line_analyser_result = []
+           text_line_bounding_boxes = []
 
+           # Get results for this line
+           line_results = next((results for idx, results in all_text_line_results if idx == i), [])
+
+           if line_results:
+               text_line_analyser_result = line_results
 
        #print("Analysed text container, now merging bounding boxes")
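
Both files keep the earlier call-then-retry-once pattern around `detect_pii_entities` (catch the exception, `time.sleep(3)`, call again). If that pattern spreads further, a small helper would remove the duplication; a sketch with growing delays (the helper and its defaults are illustrative, not part of the commit):

import time
from typing import Any, Callable

def call_with_retries(fn: Callable[..., Any], *args, attempts: int = 3, base_delay: float = 3.0, **kwargs) -> Any:
    """Call fn, retrying on any exception with linearly growing delays."""
    for attempt in range(attempts):
        try:
            return fn(*args, **kwargs)
        except Exception as e:
            if attempt == attempts - 1:
                raise  # out of retries; surface the last error
            print(e)
            time.sleep(base_delay * (attempt + 1))

# Example, replacing the inline try/except in the diff:
# response = call_with_retries(
#     comprehend_client.detect_pii_entities,
#     Text=current_batch,
#     LanguageCode=language,
# )
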