seanpedrickcase committed on
Commit
542c252
·
1 Parent(s): 390bef2

Consolidated AWS Comprehend redaction calls into batches to reduce the total number of API requests

Browse files
app.py CHANGED
@@ -318,7 +318,7 @@ with app:
318
 
319
  # If the output file count text box changes, keep going with redacting each data file until done
320
  text_tabular_files_done.change(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, second_loop_state], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state]).\
321
- then(fn = reveal_feedback_buttons, outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title])
322
 
323
  ###
324
  # APP LOAD AND LOGGING
 
318
 
319
  # If the output file count text box changes, keep going with redacting each data file until done
320
  text_tabular_files_done.change(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, second_loop_state], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state]).\
321
+ then(fn = reveal_feedback_buttons, outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title])
322
 
323
  ###
324
  # APP LOAD AND LOGGING
tools/custom_image_analyser_engine.py CHANGED
@@ -477,122 +477,152 @@ class CustomImageAnalyzerEngine:
477
  allow_list = text_analyzer_kwargs.get('allow_list', [])
478
 
479
  combined_results = []
480
- for i, line_level_ocr_result in enumerate(line_level_ocr_results):
481
-
482
- analyzer_result = []
483
- response = []
484
-
485
- # Analyze each OCR result (line) individually
486
 
 
 
487
  if pii_identification_method == "Local":
488
  analyzer_result = self.analyzer_engine.analyze(
489
  text=line_level_ocr_result.text, **text_analyzer_kwargs
490
  )
 
491
 
492
  elif pii_identification_method == "AWS Comprehend":
493
-
494
  if len(line_level_ocr_result.text) >= 3:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
495
 
496
- try:
497
- # Call the detect_pii_entities method
498
- response = comprehend_client.detect_pii_entities(
499
- Text=line_level_ocr_result.text,
500
- LanguageCode=text_analyzer_kwargs["language"] # Specify the language of the text
501
- )
502
- except Exception as e:
503
- print(e)
504
- time.sleep(3)
505
-
506
- response = comprehend_client.detect_pii_entities(
507
- Text=line_level_ocr_result.text,
508
- LanguageCode=text_analyzer_kwargs["language"] # Specify the language of the text
509
- )
510
-
511
- comprehend_query_number += 1
512
-
513
- if response:
514
- for result in response["Entities"]:
515
- result_text = line_level_ocr_result.text[result["BeginOffset"]:result["EndOffset"]+1]
516
-
517
- if result_text not in allow_list:
518
-
519
- if result.get("Type") in chosen_redact_comprehend_entities:
520
-
521
- recogniser_entity = recognizer_result_from_dict(result)
522
- analyzer_result.append(recogniser_entity)
523
-
524
-
525
- if i < len(ocr_results_with_children): # Check if i is a valid index
526
  child_level_key = list(ocr_results_with_children.keys())[i]
527
- else:
528
- continue
529
-
530
- ocr_results_with_children_line_level = ocr_results_with_children[child_level_key]
531
-
532
- # Go through results to add bounding boxes
533
- for result in analyzer_result:
534
- # Extract the relevant portion of text based on start and end
535
- relevant_text = line_level_ocr_result.text[result.start:result.end]
536
-
537
- # Find the corresponding entry in ocr_results_with_children
538
- child_words = ocr_results_with_children_line_level['words']
539
-
540
- # Initialize bounding box values
541
- left, top, bottom = float('inf'), float('inf'), float('-inf')
542
- all_words = ""
543
- word_num = 0 # Initialize word count
544
- total_width = 0 # Initialize total width
545
-
546
- for word_text in relevant_text.split(): # Iterate through each word in relevant_text
547
- #print("Looking for word_text:", word_text)
548
- for word in child_words:
549
- #if word['text'].strip(string.punctuation).strip() == word_text.strip(string.punctuation).strip(): # Check for exact match
550
- if word_text in word['text']:
551
- found_word = word
552
- #print("found_word:", found_word)
553
-
554
- if word_num == 0: # First word
555
- left = found_word['bounding_box'][0]
556
- top = found_word['bounding_box'][1]
557
- bottom = max(bottom, found_word['bounding_box'][3]) # Update bottom for all words
558
- all_words += found_word['text'] + " " # Concatenate words
559
- total_width = found_word['bounding_box'][2] - left # Add each word's width
560
- word_num += 1
561
- break # Move to the next word in relevant_text
562
-
563
- width = total_width + horizontal_buffer # Set width to total width of all matched words
564
- height = bottom - top if word_num > 0 else 0 # Calculate height
565
-
566
- relevant_line_ocr_result = OCRResult(
567
- text=relevant_text,
568
- left=left,
569
- top=top - height_buffer,
570
- width=width,
571
- height=height + height_buffer
572
- )
573
-
574
- if not ocr_results_with_children_line_level:
575
- # Fallback to previous method if not found in ocr_results_with_children
576
- print("No child info found")
577
- continue
578
 
579
- # Reset the word positions indicated in the relevant ocr_result - i.e. it starts from 0 and ends at word length
580
- result_reset_pos = result
581
- result_reset_pos.start = 0
582
- result_reset_pos.end = len(relevant_text)
583
-
584
- #print("result_reset_pos:", result_reset_pos)
585
- #print("relevant_line_ocr_result:", relevant_line_ocr_result)
586
- #print("ocr_results_with_children_line_level:", ocr_results_with_children_line_level)
587
 
588
- # Map the analyzer results to bounding boxes for this line
589
- line_results = self.map_analyzer_results_to_bounding_boxes(
590
- [result_reset_pos], [relevant_line_ocr_result], relevant_line_ocr_result.text, allow_list, ocr_results_with_children_line_level
591
- )
 
 
 
 
 
 
 
 
 
592
 
593
- #print("line_results:", line_results)
594
-
595
- combined_results.extend(line_results)
596
 
597
  return combined_results, comprehend_query_number
598
 
 
477
  allow_list = text_analyzer_kwargs.get('allow_list', [])
478
 
479
  combined_results = []
480
+ # Initialize variables for batching
481
+ current_batch = ""
482
+ current_batch_mapping = [] # List of (start_pos, line_index, original_text) tuples
483
+ analyzer_results_by_line = [[] for _ in line_level_ocr_results] # Store results for each line
 
 
484
 
485
+ # Process OCR results in batches
486
+ for i, line_level_ocr_result in enumerate(line_level_ocr_results):
487
  if pii_identification_method == "Local":
488
  analyzer_result = self.analyzer_engine.analyze(
489
  text=line_level_ocr_result.text, **text_analyzer_kwargs
490
  )
491
+ analyzer_results_by_line[i] = analyzer_result
492
 
493
  elif pii_identification_method == "AWS Comprehend":
 
494
  if len(line_level_ocr_result.text) >= 3:
495
+ # Add line to current batch with a separator
496
+ if current_batch:
497
+ current_batch += " | " # Use a separator that's unlikely to appear in the text
498
+
499
+ start_pos = len(current_batch)
500
+ current_batch += line_level_ocr_result.text
501
+ current_batch_mapping.append((start_pos, i, line_level_ocr_result.text))
502
+
503
+ # Process batch if it's approaching 300 characters or this is the last line
504
+ if len(current_batch) >= 200 or i == len(line_level_ocr_results) - 1:
505
+ print("length of text for Comprehend:", len(current_batch))
506
+
507
+ try:
508
+ response = comprehend_client.detect_pii_entities(
509
+ Text=current_batch,
510
+ LanguageCode=text_analyzer_kwargs["language"]
511
+ )
512
+ except Exception as e:
513
+ print(e)
514
+ time.sleep(3)
515
+ response = comprehend_client.detect_pii_entities(
516
+ Text=current_batch,
517
+ LanguageCode=text_analyzer_kwargs["language"]
518
+ )
519
+
520
+ comprehend_query_number += 1
521
+
522
+ # Map results back to original lines
523
+ if response and "Entities" in response:
524
+ for entity in response["Entities"]:
525
+ entity_start = entity["BeginOffset"]
526
+ entity_end = entity["EndOffset"]
527
+
528
+ # Find which line this entity belongs to
529
+ for batch_start, line_idx, original_text in current_batch_mapping:
530
+ batch_end = batch_start + len(original_text)
531
+
532
+ # Check if entity belongs to this line
533
+ if batch_start <= entity_start < batch_end:
534
+ # Adjust offsets relative to the original line
535
+ relative_start = entity_start - batch_start
536
+ relative_end = min(entity_end - batch_start, len(original_text))
537
+
538
+ result_text = original_text[relative_start:relative_end]
539
+
540
+ if result_text not in allow_list:
541
+ if entity.get("Type") in chosen_redact_comprehend_entities:
542
+ # Create a new entity with adjusted positions
543
+ adjusted_entity = entity.copy()
544
+ adjusted_entity["BeginOffset"] = relative_start
545
+ adjusted_entity["EndOffset"] = relative_end
546
+
547
+ recogniser_entity = recognizer_result_from_dict(adjusted_entity)
548
+ analyzer_results_by_line[line_idx].append(recogniser_entity)
549
+
550
+ # Reset batch
551
+ current_batch = ""
552
+ current_batch_mapping = []
553
+
554
+ # Process results for each line
555
+ for i, analyzer_result in enumerate(analyzer_results_by_line):
556
+ if i >= len(ocr_results_with_children):
557
+ continue
558
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
559
  child_level_key = list(ocr_results_with_children.keys())[i]
560
+ ocr_results_with_children_line_level = ocr_results_with_children[child_level_key]
561
+
562
+ # Go through results to add bounding boxes
563
+ for result in analyzer_result:
564
+ # Extract the relevant portion of text based on start and end
565
+ relevant_text = line_level_ocr_results[i].text[result.start:result.end]
566
+
567
+ # Find the corresponding entry in ocr_results_with_children
568
+ child_words = ocr_results_with_children_line_level['words']
569
+
570
+ # Initialize bounding box values
571
+ left, top, bottom = float('inf'), float('inf'), float('-inf')
572
+ all_words = ""
573
+ word_num = 0 # Initialize word count
574
+ total_width = 0 # Initialize total width
575
+
576
+ for word_text in relevant_text.split(): # Iterate through each word in relevant_text
577
+ #print("Looking for word_text:", word_text)
578
+ for word in child_words:
579
+ #if word['text'].strip(string.punctuation).strip() == word_text.strip(string.punctuation).strip(): # Check for exact match
580
+ if word_text in word['text']:
581
+ found_word = word
582
+ #print("found_word:", found_word)
583
+
584
+ if word_num == 0: # First word
585
+ left = found_word['bounding_box'][0]
586
+ top = found_word['bounding_box'][1]
587
+ bottom = max(bottom, found_word['bounding_box'][3]) # Update bottom for all words
588
+ all_words += found_word['text'] + " " # Concatenate words
589
+ total_width = found_word['bounding_box'][2] - left # Add each word's width
590
+ word_num += 1
591
+ break # Move to the next word in relevant_text
592
+
593
+ width = total_width + horizontal_buffer # Set width to total width of all matched words
594
+ height = bottom - top if word_num > 0 else 0 # Calculate height
595
+
596
+ relevant_line_ocr_result = OCRResult(
597
+ text=relevant_text,
598
+ left=left,
599
+ top=top - height_buffer,
600
+ width=width,
601
+ height=height + height_buffer
602
+ )
 
 
 
 
 
 
 
 
603
 
604
+ if not ocr_results_with_children_line_level:
605
+ # Fallback to previous method if not found in ocr_results_with_children
606
+ print("No child info found")
607
+ continue
 
 
 
 
608
 
609
+ # Reset the word positions indicated in the relevant ocr_result - i.e. it starts from 0 and ends at word length
610
+ result_reset_pos = result
611
+ result_reset_pos.start = 0
612
+ result_reset_pos.end = len(relevant_text)
613
+
614
+ #print("result_reset_pos:", result_reset_pos)
615
+ #print("relevant_line_ocr_result:", relevant_line_ocr_result)
616
+ #print("ocr_results_with_children_line_level:", ocr_results_with_children_line_level)
617
+
618
+ # Map the analyzer results to bounding boxes for this line
619
+ line_results = self.map_analyzer_results_to_bounding_boxes(
620
+ [result_reset_pos], [relevant_line_ocr_result], relevant_line_ocr_result.text, allow_list, ocr_results_with_children_line_level
621
+ )
622
 
623
+ #print("line_results:", line_results)
624
+
625
+ combined_results.extend(line_results)
626
 
627
  return combined_results, comprehend_query_number
628
 
tools/file_redaction.py CHANGED
@@ -133,7 +133,7 @@ def choose_and_run_redactor(file_paths:List[str],
133
 
134
  # If this is the first time around, set variables to 0/blank
135
  if first_loop_state==True:
136
- print("First_loop_state is True")
137
  latest_file_completed = 0
138
  current_loop_page = 0
139
  out_file_paths = []
@@ -835,7 +835,7 @@ def redact_image_pdf(file_path:str,
835
  else: page_min = page_min - 1
836
 
837
  print("Page range:", str(page_min + 1), "to", str(page_max))
838
- print("Current_loop_page:", current_loop_page)
839
 
840
  if analysis_type == "Quick image analysis - typed text": ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + ".csv"
841
  elif analysis_type == "Complex image analysis - docs with handwriting/signatures (AWS Textract)": ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + "_textract.csv"
@@ -1300,70 +1300,7 @@ def merge_text_bounding_boxes(analyser_results:CustomImageRecognizerResult, char
1300
 
1301
  return analysed_bounding_boxes
1302
 
1303
- def identify_pii_in_text_container(text_container:OCRResult, language:str, chosen_redact_entities:List[str], chosen_redact_comprehend_entities:List[str], score_threshold:float, allow_list:List[str], pii_identification_method:str="Local") -> List[RecognizerResult]:
1304
- '''
1305
- Take text and bounding boxes in OCRResult format and analyze it for PII using spacy and the Microsoft Presidio package, or the AWS Comprehend service.
1306
- '''
1307
- comprehend_query_number = 0
1308
- analyser_results = []
1309
- response = []
1310
-
1311
- #text_to_analyse = initial_clean(text_container.text).strip()
1312
-
1313
- text_to_analyse = text_container.text
1314
-
1315
- if chosen_redact_entities:
1316
- if pii_identification_method == "Local":
1317
- analyser_results = nlp_analyser.analyze(text=text_to_analyse,
1318
- language=language,
1319
- entities=chosen_redact_entities,
1320
- score_threshold=score_threshold,
1321
- return_decision_process=True,
1322
- allow_list=allow_list)
1323
-
1324
- elif pii_identification_method == "AWS Comprehend":
1325
-
1326
-
1327
- if len(text_to_analyse) >= 3:
1328
-
1329
- try:
1330
- # Call the detect_pii_entities method
1331
- response = comprehend_client.detect_pii_entities(
1332
- Text=text_to_analyse,
1333
- LanguageCode=language # Specify the language of the text
1334
- )
1335
- except Exception as e:
1336
- print(e)
1337
- time.sleep(3)
1338
-
1339
- response = comprehend_client.detect_pii_entities(
1340
- Text=text_to_analyse,
1341
- LanguageCode=language # Specify the language of the text
1342
- )
1343
-
1344
- comprehend_query_number += 1
1345
 
1346
- if response:
1347
- for result in response["Entities"]:
1348
-
1349
- result_text = text_to_analyse[result["BeginOffset"]:result["EndOffset"]+1]
1350
-
1351
- if result_text not in allow_list:
1352
- if result.get("Type") in chosen_redact_comprehend_entities:
1353
-
1354
- recogniser_entity = recognizer_result_from_dict(result)
1355
-
1356
- analyser_results.append(recogniser_entity)
1357
- else:
1358
- analyser_results = []
1359
-
1360
- else:
1361
- analyser_results = []
1362
- else:
1363
- analyser_results = []
1364
-
1365
-
1366
- return analyser_results, comprehend_query_number
1367
 
1368
  def create_text_redaction_process_results(analyser_results, analysed_bounding_boxes, page_num):
1369
  decision_process_table = pd.DataFrame()
@@ -1531,27 +1468,103 @@ def redact_text_pdf(
1531
 
1532
  page_text_outputs = pd.concat([page_text_outputs, line_level_text_results_df])
1533
 
1534
- # Analyse each line of text in turn for PII and add to list
1535
- for i, text_line in enumerate(line_level_text_results_list):
1536
-
1537
- text_line_analyser_result = []
1538
- text_line_bounding_boxes = []
1539
-
1540
- # text_line_analyser_result = identify_pii_in_text_container(text_line, language, chosen_redact_entities, score_threshold, allow_list)
1541
-
1542
- #pii_identification_method="AWS Comprehend"#"Local"
1543
 
 
 
1544
  if chosen_redact_entities:
1545
-
1546
- text_line_analyser_result, comprehend_query_number_new = identify_pii_in_text_container(text_line, language, chosen_redact_entities, chosen_redact_comprehend_entities, score_threshold, allow_list, pii_identification_method)
1547
-
1548
- comprehend_query_number = comprehend_query_number + comprehend_query_number_new
 
 
 
 
 
 
 
1549
 
1550
- else:
1551
- text_line_analyser_result = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1552
 
1553
- # Merge bounding boxes for the line if multiple found close together
1554
- if text_line_analyser_result:
 
 
 
1555
 
1556
  #print("Analysed text container, now merging bounding boxes")
1557
 
 
133
 
134
  # If this is the first time around, set variables to 0/blank
135
  if first_loop_state==True:
136
+ #print("First_loop_state is True")
137
  latest_file_completed = 0
138
  current_loop_page = 0
139
  out_file_paths = []
 
835
  else: page_min = page_min - 1
836
 
837
  print("Page range:", str(page_min + 1), "to", str(page_max))
838
+ #print("Current_loop_page:", current_loop_page)
839
 
840
  if analysis_type == "Quick image analysis - typed text": ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + ".csv"
841
  elif analysis_type == "Complex image analysis - docs with handwriting/signatures (AWS Textract)": ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + "_textract.csv"
 
1300
 
1301
  return analysed_bounding_boxes
1302
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1303
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1304
 
1305
  def create_text_redaction_process_results(analyser_results, analysed_bounding_boxes, page_num):
1306
  decision_process_table = pd.DataFrame()
 
1468
 
1469
  page_text_outputs = pd.concat([page_text_outputs, line_level_text_results_df])
1470
 
1471
+ # Initialize batching variables
1472
+ current_batch = ""
1473
+ current_batch_mapping = [] # List of (start_pos, line_index, OCRResult) tuples
1474
+ all_text_line_results = [] # Store results for all lines
 
 
 
 
 
1475
 
1476
+ # First pass: collect all lines into batches
1477
+ for i, text_line in enumerate(line_level_text_results_list):
1478
  if chosen_redact_entities:
1479
+ if pii_identification_method == "Local":
1480
+ # Process immediately for local analysis
1481
+ text_line_analyser_result = nlp_analyser.analyze(
1482
+ text=text_line.text,
1483
+ language=language,
1484
+ entities=chosen_redact_entities,
1485
+ score_threshold=score_threshold,
1486
+ return_decision_process=True,
1487
+ allow_list=allow_list
1488
+ )
1489
+ all_text_line_results.append((i, text_line_analyser_result))
1490
 
1491
+ elif pii_identification_method == "AWS Comprehend":
1492
+ if len(text_line.text) >= 3:
1493
+ # Add separator between lines
1494
+ if current_batch:
1495
+ current_batch += " | "
1496
+
1497
+ start_pos = len(current_batch)
1498
+ current_batch += text_line.text
1499
+ current_batch_mapping.append((start_pos, i, text_line))
1500
+
1501
+ # Process batch if approaching 300 characters or last line
1502
+ if len(current_batch) >= 200 or i == len(line_level_text_results_list) - 1:
1503
+ print("length of text for Comprehend:", len(current_batch))
1504
+
1505
+ try:
1506
+ response = comprehend_client.detect_pii_entities(
1507
+ Text=current_batch,
1508
+ LanguageCode=language
1509
+ )
1510
+ except Exception as e:
1511
+ print(e)
1512
+ time.sleep(3)
1513
+ response = comprehend_client.detect_pii_entities(
1514
+ Text=current_batch,
1515
+ LanguageCode=language
1516
+ )
1517
+
1518
+ comprehend_query_number += 1
1519
+
1520
+ # Process response and map back to original lines
1521
+ if response and "Entities" in response:
1522
+ for entity in response["Entities"]:
1523
+ entity_start = entity["BeginOffset"]
1524
+ entity_end = entity["EndOffset"]
1525
+
1526
+ # Find which line this entity belongs to
1527
+ for batch_start, line_idx, original_line in current_batch_mapping:
1528
+ batch_end = batch_start + len(original_line.text)
1529
+
1530
+ # Check if entity belongs to this line
1531
+ if batch_start <= entity_start < batch_end:
1532
+ # Adjust offsets relative to original line
1533
+ relative_start = entity_start - batch_start
1534
+ relative_end = min(entity_end - batch_start, len(original_line.text))
1535
+
1536
+ result_text = original_line.text[relative_start:relative_end]
1537
+
1538
+ if result_text not in allow_list:
1539
+ if entity.get("Type") in chosen_redact_comprehend_entities:
1540
+ # Create adjusted entity
1541
+ adjusted_entity = entity.copy()
1542
+ adjusted_entity["BeginOffset"] = relative_start
1543
+ adjusted_entity["EndOffset"] = relative_end
1544
+
1545
+ recogniser_entity = recognizer_result_from_dict(adjusted_entity)
1546
+
1547
+ # Add to results for this line
1548
+ existing_results = next((results for idx, results in all_text_line_results if idx == line_idx), [])
1549
+ if not existing_results:
1550
+ all_text_line_results.append((line_idx, [recogniser_entity]))
1551
+ else:
1552
+ existing_results.append(recogniser_entity)
1553
+
1554
+ # Reset batch
1555
+ current_batch = ""
1556
+ current_batch_mapping = []
1557
+
1558
+ # Second pass: process results for each line
1559
+ for i, text_line in enumerate(line_level_text_results_list):
1560
+ text_line_analyser_result = []
1561
+ text_line_bounding_boxes = []
1562
 
1563
+ # Get results for this line
1564
+ line_results = next((results for idx, results in all_text_line_results if idx == i), [])
1565
+
1566
+ if line_results:
1567
+ text_line_analyser_result = line_results
1568
 
1569
  #print("Analysed text container, now merging bounding boxes")
1570