seanpedrickcase committed on
Commit
542c252
·
1 Parent(s): 390bef2

Consolidated AWS Comprehend redaction calls into batches to reduce the total number of API requests

Browse files
app.py CHANGED
@@ -318,7 +318,7 @@ with app:
318
 
319
  # If the output file count text box changes, keep going with redacting each data file until done
320
  text_tabular_files_done.change(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, second_loop_state], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state]).\
321
- then(fn = reveal_feedback_buttons, outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title])
322
 
323
  ###
324
  # APP LOAD AND LOGGING
 
318
 
319
  # If the output file count text box changes, keep going with redacting each data file until done
320
  text_tabular_files_done.change(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, second_loop_state], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state]).\
321
+ then(fn = reveal_feedback_buttons, outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title])
322
 
323
  ###
324
  # APP LOAD AND LOGGING
tools/custom_image_analyser_engine.py CHANGED
@@ -477,122 +477,152 @@ class CustomImageAnalyzerEngine:
477
  allow_list = text_analyzer_kwargs.get('allow_list', [])
478
 
479
  combined_results = []
480
- for i, line_level_ocr_result in enumerate(line_level_ocr_results):
481
-
482
- analyzer_result = []
483
- response = []
484
-
485
- # Analyze each OCR result (line) individually
486
 
 
 
487
  if pii_identification_method == "Local":
488
  analyzer_result = self.analyzer_engine.analyze(
489
  text=line_level_ocr_result.text, **text_analyzer_kwargs
490
  )
 
491
 
492
  elif pii_identification_method == "AWS Comprehend":
493
-
494
  if len(line_level_ocr_result.text) >= 3:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
495
 
496
- try:
497
- # Call the detect_pii_entities method
498
- response = comprehend_client.detect_pii_entities(
499
- Text=line_level_ocr_result.text,
500
- LanguageCode=text_analyzer_kwargs["language"] # Specify the language of the text
501
- )
502
- except Exception as e:
503
- print(e)
504
- time.sleep(3)
505
-
506
- response = comprehend_client.detect_pii_entities(
507
- Text=line_level_ocr_result.text,
508
- LanguageCode=text_analyzer_kwargs["language"] # Specify the language of the text
509
- )
510
-
511
- comprehend_query_number += 1
512
-
513
- if response:
514
- for result in response["Entities"]:
515
- result_text = line_level_ocr_result.text[result["BeginOffset"]:result["EndOffset"]+1]
516
-
517
- if result_text not in allow_list:
518
-
519
- if result.get("Type") in chosen_redact_comprehend_entities:
520
-
521
- recogniser_entity = recognizer_result_from_dict(result)
522
- analyzer_result.append(recogniser_entity)
523
-
524
-
525
- if i < len(ocr_results_with_children): # Check if i is a valid index
526
  child_level_key = list(ocr_results_with_children.keys())[i]
527
- else:
528
- continue
529
-
530
- ocr_results_with_children_line_level = ocr_results_with_children[child_level_key]
531
-
532
- # Go through results to add bounding boxes
533
- for result in analyzer_result:
534
- # Extract the relevant portion of text based on start and end
535
- relevant_text = line_level_ocr_result.text[result.start:result.end]
536
-
537
- # Find the corresponding entry in ocr_results_with_children
538
- child_words = ocr_results_with_children_line_level['words']
539
-
540
- # Initialize bounding box values
541
- left, top, bottom = float('inf'), float('inf'), float('-inf')
542
- all_words = ""
543
- word_num = 0 # Initialize word count
544
- total_width = 0 # Initialize total width
545
-
546
- for word_text in relevant_text.split(): # Iterate through each word in relevant_text
547
- #print("Looking for word_text:", word_text)
548
- for word in child_words:
549
- #if word['text'].strip(string.punctuation).strip() == word_text.strip(string.punctuation).strip(): # Check for exact match
550
- if word_text in word['text']:
551
- found_word = word
552
- #print("found_word:", found_word)
553
-
554
- if word_num == 0: # First word
555
- left = found_word['bounding_box'][0]
556
- top = found_word['bounding_box'][1]
557
- bottom = max(bottom, found_word['bounding_box'][3]) # Update bottom for all words
558
- all_words += found_word['text'] + " " # Concatenate words
559
- total_width = found_word['bounding_box'][2] - left # Add each word's width
560
- word_num += 1
561
- break # Move to the next word in relevant_text
562
-
563
- width = total_width + horizontal_buffer # Set width to total width of all matched words
564
- height = bottom - top if word_num > 0 else 0 # Calculate height
565
-
566
- relevant_line_ocr_result = OCRResult(
567
- text=relevant_text,
568
- left=left,
569
- top=top - height_buffer,
570
- width=width,
571
- height=height + height_buffer
572
- )
573
-
574
- if not ocr_results_with_children_line_level:
575
- # Fallback to previous method if not found in ocr_results_with_children
576
- print("No child info found")
577
- continue
578
 
579
- # Reset the word positions indicated in the relevant ocr_result - i.e. it starts from 0 and ends at word length
580
- result_reset_pos = result
581
- result_reset_pos.start = 0
582
- result_reset_pos.end = len(relevant_text)
583
-
584
- #print("result_reset_pos:", result_reset_pos)
585
- #print("relevant_line_ocr_result:", relevant_line_ocr_result)
586
- #print("ocr_results_with_children_line_level:", ocr_results_with_children_line_level)
587
 
588
- # Map the analyzer results to bounding boxes for this line
589
- line_results = self.map_analyzer_results_to_bounding_boxes(
590
- [result_reset_pos], [relevant_line_ocr_result], relevant_line_ocr_result.text, allow_list, ocr_results_with_children_line_level
591
- )
 
 
 
 
 
 
 
 
 
592
 
593
- #print("line_results:", line_results)
594
-
595
- combined_results.extend(line_results)
596
 
597
  return combined_results, comprehend_query_number
598
 
 
477
  allow_list = text_analyzer_kwargs.get('allow_list', [])
478
 
479
  combined_results = []
480
+ # Initialize variables for batching
481
+ current_batch = ""
482
+ current_batch_mapping = [] # List of (start_pos, line_index, original_text) tuples
483
+ analyzer_results_by_line = [[] for _ in line_level_ocr_results] # Store results for each line
 
 
484
 
485
+ # Process OCR results in batches
486
+ for i, line_level_ocr_result in enumerate(line_level_ocr_results):
487
  if pii_identification_method == "Local":
488
  analyzer_result = self.analyzer_engine.analyze(
489
  text=line_level_ocr_result.text, **text_analyzer_kwargs
490
  )
491
+ analyzer_results_by_line[i] = analyzer_result
492
 
493
  elif pii_identification_method == "AWS Comprehend":
 
494
  if len(line_level_ocr_result.text) >= 3:
495
+ # Add line to current batch with a separator
496
+ if current_batch:
497
+ current_batch += " | " # Use a separator that's unlikely to appear in the text
498
+
499
+ start_pos = len(current_batch)
500
+ current_batch += line_level_ocr_result.text
501
+ current_batch_mapping.append((start_pos, i, line_level_ocr_result.text))
502
+
503
+ # Process batch if it's approaching 300 characters or this is the last line
504
+ if len(current_batch) >= 200 or i == len(line_level_ocr_results) - 1:
505
+ print("length of text for Comprehend:", len(current_batch))
506
+
507
+ try:
508
+ response = comprehend_client.detect_pii_entities(
509
+ Text=current_batch,
510
+ LanguageCode=text_analyzer_kwargs["language"]
511
+ )
512
+ except Exception as e:
513
+ print(e)
514
+ time.sleep(3)
515
+ response = comprehend_client.detect_pii_entities(
516
+ Text=current_batch,
517
+ LanguageCode=text_analyzer_kwargs["language"]
518
+ )
519
+
520
+ comprehend_query_number += 1
521
+
522
+ # Map results back to original lines
523
+ if response and "Entities" in response:
524
+ for entity in response["Entities"]:
525
+ entity_start = entity["BeginOffset"]
526
+ entity_end = entity["EndOffset"]
527
+
528
+ # Find which line this entity belongs to
529
+ for batch_start, line_idx, original_text in current_batch_mapping:
530
+ batch_end = batch_start + len(original_text)
531
+
532
+ # Check if entity belongs to this line
533
+ if batch_start <= entity_start < batch_end:
534
+ # Adjust offsets relative to the original line
535
+ relative_start = entity_start - batch_start
536
+ relative_end = min(entity_end - batch_start, len(original_text))
537
+
538
+ result_text = original_text[relative_start:relative_end]
539
+
540
+ if result_text not in allow_list:
541
+ if entity.get("Type") in chosen_redact_comprehend_entities:
542
+ # Create a new entity with adjusted positions
543
+ adjusted_entity = entity.copy()
544
+ adjusted_entity["BeginOffset"] = relative_start
545
+ adjusted_entity["EndOffset"] = relative_end
546
+
547
+ recogniser_entity = recognizer_result_from_dict(adjusted_entity)
548
+ analyzer_results_by_line[line_idx].append(recogniser_entity)
549
+
550
+ # Reset batch
551
+ current_batch = ""
552
+ current_batch_mapping = []
553
+
554
+ # Process results for each line
555
+ for i, analyzer_result in enumerate(analyzer_results_by_line):
556
+ if i >= len(ocr_results_with_children):
557
+ continue
558
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
559
  child_level_key = list(ocr_results_with_children.keys())[i]
560
+ ocr_results_with_children_line_level = ocr_results_with_children[child_level_key]
561
+
562
+ # Go through results to add bounding boxes
563
+ for result in analyzer_result:
564
+ # Extract the relevant portion of text based on start and end
565
+ relevant_text = line_level_ocr_results[i].text[result.start:result.end]
566
+
567
+ # Find the corresponding entry in ocr_results_with_children
568
+ child_words = ocr_results_with_children_line_level['words']
569
+
570
+ # Initialize bounding box values
571
+ left, top, bottom = float('inf'), float('inf'), float('-inf')
572
+ all_words = ""
573
+ word_num = 0 # Initialize word count
574
+ total_width = 0 # Initialize total width
575
+
576
+ for word_text in relevant_text.split(): # Iterate through each word in relevant_text
577
+ #print("Looking for word_text:", word_text)
578
+ for word in child_words:
579
+ #if word['text'].strip(string.punctuation).strip() == word_text.strip(string.punctuation).strip(): # Check for exact match
580
+ if word_text in word['text']:
581
+ found_word = word
582
+ #print("found_word:", found_word)
583
+
584
+ if word_num == 0: # First word
585
+ left = found_word['bounding_box'][0]
586
+ top = found_word['bounding_box'][1]
587
+ bottom = max(bottom, found_word['bounding_box'][3]) # Update bottom for all words
588
+ all_words += found_word['text'] + " " # Concatenate words
589
+ total_width = found_word['bounding_box'][2] - left # Add each word's width
590
+ word_num += 1
591
+ break # Move to the next word in relevant_text
592
+
593
+ width = total_width + horizontal_buffer # Set width to total width of all matched words
594
+ height = bottom - top if word_num > 0 else 0 # Calculate height
595
+
596
+ relevant_line_ocr_result = OCRResult(
597
+ text=relevant_text,
598
+ left=left,
599
+ top=top - height_buffer,
600
+ width=width,
601
+ height=height + height_buffer
602
+ )
 
 
 
 
 
 
 
 
603
 
604
+ if not ocr_results_with_children_line_level:
605
+ # Fallback to previous method if not found in ocr_results_with_children
606
+ print("No child info found")
607
+ continue
 
 
 
 
608
 
609
+ # Reset the word positions indicated in the relevant ocr_result - i.e. it starts from 0 and ends at word length
610
+ result_reset_pos = result
611
+ result_reset_pos.start = 0
612
+ result_reset_pos.end = len(relevant_text)
613
+
614
+ #print("result_reset_pos:", result_reset_pos)
615
+ #print("relevant_line_ocr_result:", relevant_line_ocr_result)
616
+ #print("ocr_results_with_children_line_level:", ocr_results_with_children_line_level)
617
+
618
+ # Map the analyzer results to bounding boxes for this line
619
+ line_results = self.map_analyzer_results_to_bounding_boxes(
620
+ [result_reset_pos], [relevant_line_ocr_result], relevant_line_ocr_result.text, allow_list, ocr_results_with_children_line_level
621
+ )
622
 
623
+ #print("line_results:", line_results)
624
+
625
+ combined_results.extend(line_results)
626
 
627
  return combined_results, comprehend_query_number
628
 
tools/file_redaction.py CHANGED
@@ -133,7 +133,7 @@ def choose_and_run_redactor(file_paths:List[str],
133
 
134
  # If this is the first time around, set variables to 0/blank
135
  if first_loop_state==True:
136
- print("First_loop_state is True")
137
  latest_file_completed = 0
138
  current_loop_page = 0
139
  out_file_paths = []
@@ -835,7 +835,7 @@ def redact_image_pdf(file_path:str,
835
  else: page_min = page_min - 1
836
 
837
  print("Page range:", str(page_min + 1), "to", str(page_max))
838
- print("Current_loop_page:", current_loop_page)
839
 
840
  if analysis_type == "Quick image analysis - typed text": ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + ".csv"
841
  elif analysis_type == "Complex image analysis - docs with handwriting/signatures (AWS Textract)": ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + "_textract.csv"
@@ -1300,70 +1300,7 @@ def merge_text_bounding_boxes(analyser_results:CustomImageRecognizerResult, char
1300
 
1301
  return analysed_bounding_boxes
1302
 
1303
- def identify_pii_in_text_container(text_container:OCRResult, language:str, chosen_redact_entities:List[str], chosen_redact_comprehend_entities:List[str], score_threshold:float, allow_list:List[str], pii_identification_method:str="Local") -> List[RecognizerResult]:
1304
- '''
1305
- Take text and bounding boxes in OCRResult format and analyze it for PII using spacy and the Microsoft Presidio package, or the AWS Comprehend service.
1306
- '''
1307
- comprehend_query_number = 0
1308
- analyser_results = []
1309
- response = []
1310
-
1311
- #text_to_analyse = initial_clean(text_container.text).strip()
1312
-
1313
- text_to_analyse = text_container.text
1314
-
1315
- if chosen_redact_entities:
1316
- if pii_identification_method == "Local":
1317
- analyser_results = nlp_analyser.analyze(text=text_to_analyse,
1318
- language=language,
1319
- entities=chosen_redact_entities,
1320
- score_threshold=score_threshold,
1321
- return_decision_process=True,
1322
- allow_list=allow_list)
1323
-
1324
- elif pii_identification_method == "AWS Comprehend":
1325
-
1326
-
1327
- if len(text_to_analyse) >= 3:
1328
-
1329
- try:
1330
- # Call the detect_pii_entities method
1331
- response = comprehend_client.detect_pii_entities(
1332
- Text=text_to_analyse,
1333
- LanguageCode=language # Specify the language of the text
1334
- )
1335
- except Exception as e:
1336
- print(e)
1337
- time.sleep(3)
1338
-
1339
- response = comprehend_client.detect_pii_entities(
1340
- Text=text_to_analyse,
1341
- LanguageCode=language # Specify the language of the text
1342
- )
1343
-
1344
- comprehend_query_number += 1
1345
 
1346
- if response:
1347
- for result in response["Entities"]:
1348
-
1349
- result_text = text_to_analyse[result["BeginOffset"]:result["EndOffset"]+1]
1350
-
1351
- if result_text not in allow_list:
1352
- if result.get("Type") in chosen_redact_comprehend_entities:
1353
-
1354
- recogniser_entity = recognizer_result_from_dict(result)
1355
-
1356
- analyser_results.append(recogniser_entity)
1357
- else:
1358
- analyser_results = []
1359
-
1360
- else:
1361
- analyser_results = []
1362
- else:
1363
- analyser_results = []
1364
-
1365
-
1366
- return analyser_results, comprehend_query_number
1367
 
1368
  def create_text_redaction_process_results(analyser_results, analysed_bounding_boxes, page_num):
1369
  decision_process_table = pd.DataFrame()
@@ -1531,27 +1468,103 @@ def redact_text_pdf(
1531
 
1532
  page_text_outputs = pd.concat([page_text_outputs, line_level_text_results_df])
1533
 
1534
- # Analyse each line of text in turn for PII and add to list
1535
- for i, text_line in enumerate(line_level_text_results_list):
1536
-
1537
- text_line_analyser_result = []
1538
- text_line_bounding_boxes = []
1539
-
1540
- # text_line_analyser_result = identify_pii_in_text_container(text_line, language, chosen_redact_entities, score_threshold, allow_list)
1541
-
1542
- #pii_identification_method="AWS Comprehend"#"Local"
1543
 
 
 
1544
  if chosen_redact_entities:
1545
-
1546
- text_line_analyser_result, comprehend_query_number_new = identify_pii_in_text_container(text_line, language, chosen_redact_entities, chosen_redact_comprehend_entities, score_threshold, allow_list, pii_identification_method)
1547
-
1548
- comprehend_query_number = comprehend_query_number + comprehend_query_number_new
 
 
 
 
 
 
 
1549
 
1550
- else:
1551
- text_line_analyser_result = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1552
 
1553
- # Merge bounding boxes for the line if multiple found close together
1554
- if text_line_analyser_result:
 
 
 
1555
 
1556
  #print("Analysed text container, now merging bounding boxes")
1557
 
 
133
 
134
  # If this is the first time around, set variables to 0/blank
135
  if first_loop_state==True:
136
+ #print("First_loop_state is True")
137
  latest_file_completed = 0
138
  current_loop_page = 0
139
  out_file_paths = []
 
835
  else: page_min = page_min - 1
836
 
837
  print("Page range:", str(page_min + 1), "to", str(page_max))
838
+ #print("Current_loop_page:", current_loop_page)
839
 
840
  if analysis_type == "Quick image analysis - typed text": ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + ".csv"
841
  elif analysis_type == "Complex image analysis - docs with handwriting/signatures (AWS Textract)": ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + "_textract.csv"
 
1300
 
1301
  return analysed_bounding_boxes
1302
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1303
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1304
 
1305
  def create_text_redaction_process_results(analyser_results, analysed_bounding_boxes, page_num):
1306
  decision_process_table = pd.DataFrame()
 
1468
 
1469
  page_text_outputs = pd.concat([page_text_outputs, line_level_text_results_df])
1470
 
1471
+ # Initialize batching variables
1472
+ current_batch = ""
1473
+ current_batch_mapping = [] # List of (start_pos, line_index, OCRResult) tuples
1474
+ all_text_line_results = [] # Store results for all lines
 
 
 
 
 
1475
 
1476
+ # First pass: collect all lines into batches
1477
+ for i, text_line in enumerate(line_level_text_results_list):
1478
  if chosen_redact_entities:
1479
+ if pii_identification_method == "Local":
1480
+ # Process immediately for local analysis
1481
+ text_line_analyser_result = nlp_analyser.analyze(
1482
+ text=text_line.text,
1483
+ language=language,
1484
+ entities=chosen_redact_entities,
1485
+ score_threshold=score_threshold,
1486
+ return_decision_process=True,
1487
+ allow_list=allow_list
1488
+ )
1489
+ all_text_line_results.append((i, text_line_analyser_result))
1490
 
1491
+ elif pii_identification_method == "AWS Comprehend":
1492
+ if len(text_line.text) >= 3:
1493
+ # Add separator between lines
1494
+ if current_batch:
1495
+ current_batch += " | "
1496
+
1497
+ start_pos = len(current_batch)
1498
+ current_batch += text_line.text
1499
+ current_batch_mapping.append((start_pos, i, text_line))
1500
+
1501
+ # Process batch if approaching 300 characters or last line
1502
+ if len(current_batch) >= 200 or i == len(line_level_text_results_list) - 1:
1503
+ print("length of text for Comprehend:", len(current_batch))
1504
+
1505
+ try:
1506
+ response = comprehend_client.detect_pii_entities(
1507
+ Text=current_batch,
1508
+ LanguageCode=language
1509
+ )
1510
+ except Exception as e:
1511
+ print(e)
1512
+ time.sleep(3)
1513
+ response = comprehend_client.detect_pii_entities(
1514
+ Text=current_batch,
1515
+ LanguageCode=language
1516
+ )
1517
+
1518
+ comprehend_query_number += 1
1519
+
1520
+ # Process response and map back to original lines
1521
+ if response and "Entities" in response:
1522
+ for entity in response["Entities"]:
1523
+ entity_start = entity["BeginOffset"]
1524
+ entity_end = entity["EndOffset"]
1525
+
1526
+ # Find which line this entity belongs to
1527
+ for batch_start, line_idx, original_line in current_batch_mapping:
1528
+ batch_end = batch_start + len(original_line.text)
1529
+
1530
+ # Check if entity belongs to this line
1531
+ if batch_start <= entity_start < batch_end:
1532
+ # Adjust offsets relative to original line
1533
+ relative_start = entity_start - batch_start
1534
+ relative_end = min(entity_end - batch_start, len(original_line.text))
1535
+
1536
+ result_text = original_line.text[relative_start:relative_end]
1537
+
1538
+ if result_text not in allow_list:
1539
+ if entity.get("Type") in chosen_redact_comprehend_entities:
1540
+ # Create adjusted entity
1541
+ adjusted_entity = entity.copy()
1542
+ adjusted_entity["BeginOffset"] = relative_start
1543
+ adjusted_entity["EndOffset"] = relative_end
1544
+
1545
+ recogniser_entity = recognizer_result_from_dict(adjusted_entity)
1546
+
1547
+ # Add to results for this line
1548
+ existing_results = next((results for idx, results in all_text_line_results if idx == line_idx), [])
1549
+ if not existing_results:
1550
+ all_text_line_results.append((line_idx, [recogniser_entity]))
1551
+ else:
1552
+ existing_results.append(recogniser_entity)
1553
+
1554
+ # Reset batch
1555
+ current_batch = ""
1556
+ current_batch_mapping = []
1557
+
1558
+ # Second pass: process results for each line
1559
+ for i, text_line in enumerate(line_level_text_results_list):
1560
+ text_line_analyser_result = []
1561
+ text_line_bounding_boxes = []
1562
 
1563
+ # Get results for this line
1564
+ line_results = next((results for idx, results in all_text_line_results if idx == i), [])
1565
+
1566
+ if line_results:
1567
+ text_line_analyser_result = line_results
1568
 
1569
  #print("Analysed text container, now merging bounding boxes")
1570