Commit 21318d3
Parent(s): d5b5291

Added regex search feature for multi-word text search

Files changed:
- README.md +4 -2
- app.py +37 -9
- src/user_guide.qmd +2 -2
- tools/custom_csvlogger.py +5 -5
- tools/file_redaction.py +147 -67
- tools/find_duplicate_pages.py +157 -46
- tools/redaction_review.py +30 -2
README.md
CHANGED

@@ -589,9 +589,11 @@ The workflow is designed to be simple: **Search → Select → Redact**.
 
 #### **Step 1: Search for Text**
 
+#### **Step 1: Search for Text**
+
 1. Navigate to the **"Search text to make new redactions"** tab.
-2. The main table will initially be populated with all the text extracted from the document, broken down by word.
-3. To narrow this down, use the **"Multi-word text search"** box to type the word or phrase you want to find.
+2. The main table will initially be populated with all the text extracted from the document for a page, broken down by word.
+3. To narrow this down, use the **"Multi-word text search"** box to type the word or phrase you want to find (this will search the whole document). If you want to do a regex-based search, tick the 'Enable regex pattern matching' box under 'Search options' below (Note this will only be able to search for patterns in text within each cell).
 4. Click the **"Search"** button or press Enter.
 5. The table below will update to show only the rows containing text that matches your search query.
 
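A note on the per-cell caveat in the new step 3: because the search table stores one word (cell) per row, a regex pattern is tested against each cell's text on its own rather than against whole sentences. The snippet below is only an illustration of that behaviour (the cell values and patterns are invented, not taken from the app):

import re

# Hypothetical cell values as they might appear in the word-level search table
cells = ["Invoice", "INV-2024-0042", "issued", "12/03/2024"]

# Example patterns a user might type with 'Enable regex pattern matching' ticked
patterns = [r"INV-\d{4}-\d{4}", r"\d{2}/\d{2}/\d{4}"]

for pattern in patterns:
    # Each cell is tested independently, mirroring the "within each cell" note in the guide
    hits = [cell for cell in cells if re.search(pattern, cell, re.IGNORECASE)]
    print(pattern, "->", hits)
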
app.py
CHANGED

@@ -1701,13 +1701,23 @@ with blocks:
                     label="Minimum similarity score for match (max=1)",
                     visible=False,
                 ) # Not used anymore for this exact search
-
-
-
-
-
-
-
+
+                with gr.Row():
+                    with gr.Column():
+                        new_redaction_text_label = gr.Textbox(
+                            label="Label for new redactions",
+                            value="Redaction",
+                        )
+                        colour_label = gr.Textbox(
+                            label="Colour for labels (three number RGB format, max 255 with brackets)",
+                            value=CUSTOM_BOX_COLOUR,
+                        )
+                    with gr.Column():
+                        use_regex_search = gr.Checkbox(
+                            label="Enable regex pattern matching",
+                            value=False,
+                            info="When enabled, the search text will be treated as a regular expression pattern instead of literal text",
+                        )
 
                 all_page_line_level_ocr_results_with_words_df = gr.Dataframe(
                     pd.DataFrame(

@@ -4701,12 +4711,29 @@
             outputs=[all_page_line_level_ocr_results_with_words_df],
         )
 
+        def run_search_with_regex_option(
+            search_text, word_df, similarity_threshold, use_regex_flag
+        ):
+            """Wrapper function to call run_full_search_and_analysis with regex option"""
+            return run_full_search_and_analysis(
+                search_query_text=search_text,
+                word_level_df_orig=word_df,
+                similarity_threshold=similarity_threshold,
+                combine_pages=False,
+                min_word_count=1,
+                min_consecutive_pages=1,
+                greedy_match=True,
+                remake_index=False,
+                use_regex=use_regex_flag,
+            )
+
         multi_word_search_text.submit(
-            fn=
+            fn=run_search_with_regex_option,
             inputs=[
                 multi_word_search_text,
                 all_page_line_level_ocr_results_with_words_df_base,
                 similarity_search_score_minimum,
+                use_regex_search,
             ],
             outputs=[
                 all_page_line_level_ocr_results_with_words_df,

@@ -4716,11 +4743,12 @@
         )
 
         multi_word_search_text_btn.click(
-            fn=
+            fn=run_search_with_regex_option,
             inputs=[
                 multi_word_search_text,
                 all_page_line_level_ocr_results_with_words_df_base,
                 similarity_search_score_minimum,
+                use_regex_search,
             ],
             outputs=[
                 all_page_line_level_ocr_results_with_words_df,
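The two handlers above show the wiring pattern for the new option: the same wrapper is registered for both the textbox submit event and the button click, and the regex checkbox is simply appended to the inputs list so its value arrives as the extra argument. Below is a minimal, self-contained sketch of that pattern (the component names and handler are invented for illustration, not the app's actual layout):

import gradio as gr

def search(query, use_regex):
    # Stand-in for run_search_with_regex_option: just report which mode was requested
    mode = "regex" if use_regex else "literal"
    return f"Searching for {query!r} in {mode} mode"

with gr.Blocks() as demo:
    query_box = gr.Textbox(label="Multi-word text search")
    regex_checkbox = gr.Checkbox(label="Enable regex pattern matching", value=False)
    search_btn = gr.Button("Search")
    result = gr.Textbox(label="Result")

    # Both the Enter key (submit) and the button click route through the same handler,
    # with the checkbox appended to the inputs list, as in the diff above
    query_box.submit(fn=search, inputs=[query_box, regex_checkbox], outputs=[result])
    search_btn.click(fn=search, inputs=[query_box, regex_checkbox], outputs=[result])

if __name__ == "__main__":
    demo.launch()
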
src/user_guide.qmd
CHANGED

@@ -366,8 +366,8 @@ The workflow is designed to be simple: **Search → Select → Redact**.
 #### **Step 1: Search for Text**
 
 1. Navigate to the **"Search text to make new redactions"** tab.
-2. The main table will initially be populated with all the text extracted from the document, broken down by word.
-3. To narrow this down, use the **"Multi-word text search"** box to type the word or phrase you want to find.
+2. The main table will initially be populated with all the text extracted from the document for a page, broken down by word.
+3. To narrow this down, use the **"Multi-word text search"** box to type the word or phrase you want to find (this will search the whole document). If you want to do a regex-based search, tick the 'Enable regex pattern matching' box under 'Search options' below (Note this will only be able to search for patterns in text within each cell).
 4. Click the **"Search"** button or press Enter.
 5. The table below will update to show only the rows containing text that matches your search query.
 
tools/custom_csvlogger.py
CHANGED

@@ -228,7 +228,7 @@ class CSVLogger_custom(FlaggingCallback):
 
         if RUN_AWS_FUNCTIONS:
             try:
-                print("Connecting to DynamoDB via existing SSO connection")
+                # print("Connecting to DynamoDB via existing SSO connection")
                 dynamodb = boto3.resource("dynamodb", region_name=AWS_REGION)
 
                 dynamodb.meta.client.list_tables()

@@ -236,9 +236,9 @@ class CSVLogger_custom(FlaggingCallback):
             except Exception as e:
                 print("No SSO credentials found:", e)
                 if AWS_ACCESS_KEY and AWS_SECRET_KEY:
-                    print(
-
-                    )
+                    # print(
+                    #     "Trying to get DynamoDB credentials from environment variables"
+                    # )
                     dynamodb = boto3.resource(
                         "dynamodb",
                         aws_access_key_id=AWS_ACCESS_KEY,

@@ -328,7 +328,7 @@ class CSVLogger_custom(FlaggingCallback):
 
             table.put_item(Item=item)
 
-            print("Successfully uploaded log to DynamoDB")
+            # print("Successfully uploaded log to DynamoDB")
         except Exception as e:
             print("Could not upload log to DynamobDB due to", e)
 
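For context, the code being touched here follows a try-ambient-credentials-first pattern: it builds the DynamoDB resource from whatever session is already available (for example an SSO profile), verifies it with a cheap list_tables call, and only falls back to explicit keys if that fails. A rough standalone sketch of that pattern, with the function and argument names assumed rather than copied from the class:

import boto3

def connect_dynamodb(region, access_key=None, secret_key=None):
    """Try ambient credentials (e.g. an SSO session) first, then fall back to explicit keys."""
    try:
        dynamodb = boto3.resource("dynamodb", region_name=region)
        dynamodb.meta.client.list_tables()  # cheap call that fails fast if the credentials are unusable
        return dynamodb
    except Exception as e:
        print("No SSO credentials found:", e)
        if access_key and secret_key:
            return boto3.resource(
                "dynamodb",
                region_name=region,
                aws_access_key_id=access_key,
                aws_secret_access_key=secret_key,
            )
        raise
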
tools/file_redaction.py
CHANGED

@@ -441,7 +441,6 @@ def choose_and_run_redactor(
     current_loop_page = 0
     out_file_paths = list()
     log_files_output_paths = list()
-    estimate_total_processing_time = 0
     estimated_time_taken_state = 0
     comprehend_query_number = 0
     total_textract_query_number = 0

@@ -543,9 +542,7 @@
     if total_textract_query_number > number_of_pages:
         total_textract_query_number = number_of_pages
 
-
-        combined_out_message
-    )
+    sum_numbers_before_seconds(combined_out_message)
     # print(
     #     "Estimated total processing time:",
     #     str(estimate_total_processing_time),

@@ -1317,7 +1314,7 @@
         number_of_pages,
         page_max,
     )
-    #print("Saving redacted PDF file:", out_redacted_pdf_file_path)
+    # print("Saving redacted PDF file:", out_redacted_pdf_file_path)
 
     # Use final document if available, otherwise use main document
     doc_to_save = (

@@ -1352,7 +1349,7 @@
         number_of_pages,
         page_max,
     )
-    #print("Saving PDF file for review:", out_review_pdf_file_path)
+    # print("Saving PDF file for review:", out_review_pdf_file_path)
 
     if out_review_pdf_file_path:
         save_pdf_with_or_without_compression(

@@ -1692,9 +1689,7 @@
         combined_out_message + " " + out_time_message
     ) # Ensure this is a single string
 
-
-        combined_out_message
-    )
+    sum_numbers_before_seconds(combined_out_message)
 
     # else:
     #     toc = time.perf_counter()

@@ -3299,7 +3294,7 @@ def redact_image_pdf(
 
     # Go through each page
     for page_no in progress_bar:
-
+
         reported_page_number = str(page_no + 1)
         print(f"Current page: {reported_page_number}")
 

@@ -3308,7 +3303,6 @@
         page_handwriting_recogniser_results = list()
         page_line_level_ocr_results_with_words = list()
         page_break_return = False
-
 
         # Try to find image location
         try:

@@ -3419,7 +3413,7 @@
         if image is None:
             # Check if image_path is a placeholder and create the actual image
             if isinstance(image_path, str) and "placeholder_image" in image_path:
-                #print(f"Detected placeholder image path: {image_path}")
+                # print(f"Detected placeholder image path: {image_path}")
                 try:
                     # Extract page number from placeholder path
                     page_num_from_placeholder = int(

@@ -3628,26 +3622,25 @@
                     page["data"]
                     for page in textract_data["pages"]
                     if page["page_no"] == reported_page_number
-                )
+                )
 
                 # Check if this is whole-document Textract output (already converted to mediabox space)
                 # by checking if the JSON structure indicates it came from restructure_textract_output
                 # or if textract_output_found is True (indicating pre-existing whole-document output)
-                use_mediabox_for_textract = (
-
-                    ("pages" in textract_data and len(textract_data.get("pages", [])) > 0)
+                use_mediabox_for_textract = textract_output_found or (
+                    "pages" in textract_data and len(textract_data.get("pages", [])) > 0
                 )
-
+
                 if use_mediabox_for_textract:
                     # Whole-document Textract: use mediabox dimensions
                     textract_page_width = pymupdf_page.mediabox.width
                     textract_page_height = pymupdf_page.mediabox.height
-                    #print(f"Using mediabox dimensions for whole-document Textract: {textract_page_width}x{textract_page_height}")
+                    # print(f"Using mediabox dimensions for whole-document Textract: {textract_page_width}x{textract_page_height}")
                 else:
                     # Individual image Textract: use image dimensions (current behavior)
                     textract_page_width = page_width
                     textract_page_height = page_height
-                    #print(f"Using image dimensions for individual image Textract: {textract_page_width}x{textract_page_height}")
+                    # print(f"Using image dimensions for individual image Textract: {textract_page_width}x{textract_page_height}")
 
                 (
                     page_line_level_ocr_results,

@@ -3658,7 +3651,10 @@
                     selection_element_results,
                     form_key_value_results,
                 ) = json_to_ocrresult(
-                    text_blocks,
+                    text_blocks,
+                    textract_page_width,
+                    textract_page_height,
+                    reported_page_number,
                 )
 
                 if all_page_line_level_ocr_results_with_words is None:

@@ -4812,9 +4808,13 @@ def redact_text_pdf(
 
         if page_text_ocr_outputs_list:
             # Filter out empty DataFrames before concatenation to avoid FutureWarning
-            non_empty_ocr_outputs = [
+            non_empty_ocr_outputs = [
+                df for df in page_text_ocr_outputs_list if not df.empty
+            ]
             if non_empty_ocr_outputs:
-                page_text_ocr_outputs = pd.concat(
+                page_text_ocr_outputs = pd.concat(
+                    non_empty_ocr_outputs, ignore_index=True
+                )
             else:
                 page_text_ocr_outputs = pd.DataFrame(
                     columns=[

@@ -4960,17 +4960,50 @@
 
         # Write logs
        # Filter out empty DataFrames before concatenation to avoid FutureWarning
-        non_empty_decision_process = [
+        non_empty_decision_process = [
+            df for df in all_pages_decision_process_list if not df.empty
+        ]
         if non_empty_decision_process:
-            all_pages_decision_process_table = pd.concat(
+            all_pages_decision_process_table = pd.concat(
+                non_empty_decision_process, ignore_index=True
+            )
         else:
-            all_pages_decision_process_table = pd.DataFrame(
-
-
+            all_pages_decision_process_table = pd.DataFrame(
+                columns=[
+                    "text",
+                    "xmin",
+                    "ymin",
+                    "xmax",
+                    "ymax",
+                    "label",
+                    "start",
+                    "end",
+                    "score",
+                    "page",
+                    "id",
+                ]
+            )
+
+        non_empty_ocr_results = [
+            df for df in all_line_level_ocr_results_list if not df.empty
+        ]
         if non_empty_ocr_results:
-            all_line_level_ocr_results_df = pd.concat(
+            all_line_level_ocr_results_df = pd.concat(
+                non_empty_ocr_results, ignore_index=True
+            )
         else:
-            all_line_level_ocr_results_df = pd.DataFrame(
+            all_line_level_ocr_results_df = pd.DataFrame(
+                columns=[
+                    "page",
+                    "text",
+                    "left",
+                    "top",
+                    "width",
+                    "height",
+                    "line",
+                    "conf",
+                ]
+            )
 
         current_loop_page += 1
 

@@ -5010,11 +5043,29 @@
 
             # Write logs
             # Filter out empty DataFrames before concatenation to avoid FutureWarning
-            non_empty_decision_process = [
+            non_empty_decision_process = [
+                df for df in all_pages_decision_process_list if not df.empty
+            ]
             if non_empty_decision_process:
-                all_pages_decision_process_table = pd.concat(
+                all_pages_decision_process_table = pd.concat(
+                    non_empty_decision_process, ignore_index=True
+                )
             else:
-                all_pages_decision_process_table = pd.DataFrame(
+                all_pages_decision_process_table = pd.DataFrame(
+                    columns=[
+                        "text",
+                        "xmin",
+                        "ymin",
+                        "xmax",
+                        "ymax",
+                        "label",
+                        "start",
+                        "end",
+                        "score",
+                        "page",
+                        "id",
+                    ]
+                )
 
             return (
                 pymupdf_doc,

@@ -5029,52 +5080,81 @@
 
     # Write all page outputs
     # Filter out empty DataFrames before concatenation to avoid FutureWarning
-    non_empty_decision_process = [
+    non_empty_decision_process = [
+        df for df in all_pages_decision_process_list if not df.empty
+    ]
     if non_empty_decision_process:
-        all_pages_decision_process_table = pd.concat(
+        all_pages_decision_process_table = pd.concat(
+            non_empty_decision_process, ignore_index=True
+        )
     else:
-        all_pages_decision_process_table = pd.DataFrame(
-
-
+        all_pages_decision_process_table = pd.DataFrame(
+            columns=[
+                "text",
+                "xmin",
+                "ymin",
+                "xmax",
+                "ymax",
+                "label",
+                "start",
+                "end",
+                "score",
+                "page",
+                "id",
+            ]
+        )
+
+    non_empty_ocr_results = [
+        df for df in all_line_level_ocr_results_list if not df.empty
+    ]
     if non_empty_ocr_results:
-        all_line_level_ocr_results_df = pd.concat(
+        all_line_level_ocr_results_df = pd.concat(
+            non_empty_ocr_results, ignore_index=True
+        )
     else:
-        all_line_level_ocr_results_df = pd.DataFrame(
+        all_line_level_ocr_results_df = pd.DataFrame(
+            columns=["page", "text", "left", "top", "width", "height", "line", "conf"]
+        )
 
-
-    all_pages_decision_process_table = divide_coordinates_by_page_sizes(
-        all_pages_decision_process_table,
-        page_sizes_df,
-        xmin="xmin",
-        xmax="xmax",
-        ymin="ymin",
-        ymax="ymax",
-    )
+    if not all_pages_decision_process_table.empty:
 
-
-
-
-
-
-
-
+        # Convert decision table to relative coordinates
+        all_pages_decision_process_table = divide_coordinates_by_page_sizes(
+            all_pages_decision_process_table,
+            page_sizes_df,
+            xmin="xmin",
+            xmax="xmax",
+            ymin="ymin",
+            ymax="ymax",
+        )
 
-
-
-
-
-
-
-
-
-    )
+        # Coordinates need to be reversed for ymin and ymax to match with image annotator objects downstream
+
+        all_pages_decision_process_table["ymin"] = reverse_y_coords(
+            all_pages_decision_process_table, "ymin"
+        )
+        all_pages_decision_process_table["ymax"] = reverse_y_coords(
+            all_pages_decision_process_table, "ymax"
+        )
 
-    #
+    # Convert decision table to relative coordinates
     if not all_line_level_ocr_results_df.empty:
-
-
+
+        all_line_level_ocr_results_df = divide_coordinates_by_page_sizes(
+            all_line_level_ocr_results_df,
+            page_sizes_df,
+            xmin="left",
+            xmax="width",
+            ymin="top",
+            ymax="height",
         )
 
+        # Coordinates need to be reversed for ymin and ymax to match with image annotator objects downstream
+        if not all_line_level_ocr_results_df.empty:
+            all_line_level_ocr_results_df["top"] = reverse_y_coords(
+                all_line_level_ocr_results_df, "top"
+            )
+
     # Remove empty dictionary items from ocr results with words
     all_page_line_level_ocr_results_with_words = [
         d for d in all_page_line_level_ocr_results_with_words if d
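Most of the churn in this file is one repeated pattern: drop empty DataFrames before pd.concat (which silences the pandas FutureWarning about concatenating empty or all-NA frames), and otherwise fall back to an empty DataFrame that still declares its columns. A small self-contained sketch of that pattern, using the OCR column names from the diff with made-up data:

import pandas as pd

def concat_non_empty(frames, columns):
    """Concatenate only the non-empty frames; otherwise return an empty frame with a known schema."""
    non_empty = [df for df in frames if not df.empty]
    if non_empty:
        # Filtering first avoids the FutureWarning about concatenating empty / all-NA frames
        return pd.concat(non_empty, ignore_index=True)
    # An explicitly-typed empty result keeps downstream column access from raising KeyError
    return pd.DataFrame(columns=columns)

ocr_columns = ["page", "text", "left", "top", "width", "height", "line", "conf"]
frames = [
    pd.DataFrame(columns=ocr_columns),  # a page that produced no OCR lines
    pd.DataFrame([{"page": 2, "text": "hello", "left": 0.1, "top": 0.2,
                   "width": 0.05, "height": 0.02, "line": 1, "conf": 99}]),
]
print(concat_non_empty(frames, ocr_columns))
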
tools/find_duplicate_pages.py
CHANGED

@@ -122,6 +122,7 @@ def run_full_search_and_analysis(
     min_consecutive_pages: int = 1,
     greedy_match: bool = True,
     remake_index: bool = False,
+    use_regex: bool = False,
     progress=gr.Progress(track_tqdm=True),
 ):
     """

@@ -133,7 +134,7 @@ def run_full_search_and_analysis(
     4. Executes the similarity analysis on the combined data using the specified parameters such as similarity threshold, minimum word count, minimum consecutive pages, and greedy match strategy.
 
     Parameters:
-    - search_query_text (str): The text entered by the user to search for in the OCR data.
+    - search_query_text (str): The text entered by the user to search for in the OCR data. If use_regex=True, this is treated as a regex pattern.
     - word_level_df_orig (pd.DataFrame): The original DataFrame containing word-level OCR data.
     - similarity_threshold (float, optional): The minimum similarity score required for two pages to be considered duplicates. Defaults to 1.
     - combine_pages (bool, optional): A flag indicating whether to combine text from the same page number within a file. Defaults to False.

@@ -141,6 +142,7 @@ def run_full_search_and_analysis(
     - min_consecutive_pages (int, optional): The minimum number of consecutive pages required to be considered a match. Defaults to 1.
     - greedy_match (bool, optional): A flag indicating whether to use a greedy strategy for matching consecutive pages. Defaults to True.
     - remake_index (bool, optional): A flag indicating whether to remake the index of the DataFrame during processing. Defaults to False.
+    - use_regex (bool, optional): If True, treats search_query_text as a regex pattern instead of literal text. Defaults to False.
     - progress (gr.Progress, optional): A Progress object to track the progress of the operation. Defaults to a Progress object with track_tqdm set to True.
     """
 

@@ -149,30 +151,56 @@ def run_full_search_and_analysis(
     if len(search_query_text) > 100:
         raise Warning("Please use a search query with at less than 100 characters.")
 
-
-
+    # For regex mode, we handle the query differently
+    if use_regex:
+        # Validate regex pattern
+        try:
+            re.compile(search_query_text)
+        except re.error as e:
+            raise Warning(f"Invalid regex pattern: {e}")
+
+        # For regex, we don't split into words - treat as single pattern
+        # Create a minimal DataFrame structure for the regex pattern
+        search_query_data = [
+            (
+                "user_search_query",
+                pd.DataFrame({"page": [1], "text": [search_query_text], "line": [1]}),
+            )
+        ]
+        query_word_length = 1  # For regex, we'll handle matching differently
+        min_consecutive_pages = 1  # Regex matches can be variable length
     else:
-
-
-
-
-
-
-
-
-
-
-
-
+        # Original literal text matching logic
+        if punctuation_at_word_text_end(word_level_df_orig) is True:
+            do_punctuation_split = False
+        else:
+            do_punctuation_split = True
+
+        # Step 1: Process the user's search query string
+        search_query_data, query_word_length = create_dataframe_from_string(
+            search_query_text,
+            file_name="user_search_query",
+            split_words=True,
+            split_punctuation=do_punctuation_split,
+        )
+        if not search_query_data:
+            # Handle case where user submits an empty search string
+            raise Warning("Could not convert search string to required format")
 
-
-
-
+        if query_word_length > 25:
+            # Handle case where user submits an empty search string
+            raise Warning("Please use a query with less than 25 words")
 
-
-
+        # Overwrite min_consecutive_pages with the search string length
+        min_consecutive_pages = query_word_length
 
     # Create word index from reference table
+
+    if word_level_df_orig.empty:
+        raise gr.Error(
+            "No word-level data to process. Please check that you have loaded in OCR data."
+        )
+
     word_level_df_orig["index"] = word_level_df_orig.index
     word_level_df = word_level_df_orig.copy()
 

@@ -204,6 +232,7 @@ def run_full_search_and_analysis(
         do_text_clean=False,
         file1_name="user_search_query",
         file2_name="source_document",
+        use_regex=use_regex,
         progress=progress,
     )
 

@@ -777,7 +806,10 @@ def _sequences_match(query_seq: List[str], ref_seq: List[str]) -> bool:
 
 
 def find_consecutive_sequence_matches(
-    df_filtered: pd.DataFrame,
+    df_filtered: pd.DataFrame,
+    search_file_name: str,
+    reference_file_name: str,
+    use_regex: bool = False,
 ) -> pd.DataFrame:
     """
     Finds all occurrences of a consecutive sequence of tokens from a search file

@@ -789,6 +821,7 @@ def find_consecutive_sequence_matches(
         df_filtered: The DataFrame containing all tokens, with 'file' and 'text_clean' columns.
         search_file_name: The name of the file containing the search query sequence.
         reference_file_name: The name of the file to search within.
+        use_regex: If True, treats the search query as a regex pattern instead of literal tokens.
 
     Returns:
         A DataFrame with two columns ('Page1_Index', 'Page2_Index') mapping the

@@ -803,38 +836,115 @@ def find_consecutive_sequence_matches(
         print("Error: One or both files not found or are empty.")
         return pd.DataFrame(columns=["Page1_Index", "Page2_Index"])
 
-
-
-
-
+    if use_regex:
+        # Regex mode: Extract pattern and search in combined text
+        # Get the regex pattern from the search query (should be in 'text' column, not 'text_clean')
+        # We need to get it from the original 'text' column if available, otherwise use 'text_clean'
+        if "text" in search_df.columns:
+            regex_pattern = search_df["text"].iloc[0]
+        else:
+            regex_pattern = search_df["text_clean"].iloc[0]
+
+        # Join reference tokens back into text for regex searching
+        # Use original 'text' column if available to preserve original formatting (important for emails, etc.)
+        # Otherwise fall back to 'text_clean'
+        if "text" in reference_df.columns:
+            reference_tokens = reference_df["text"].tolist()
+        else:
+            reference_tokens = reference_df["text_clean"].tolist()
+        reference_indices = reference_df.index.tolist()
+
+        # Join tokens with spaces to reconstruct the text
+        # Note: If tokens were split at special characters like @, this may not perfectly reconstruct
+        # the original text, but it's the best we can do with tokenized data
+        reference_text = " ".join(reference_tokens)
+
+        # Build a mapping from character positions to token indices
+        # This helps us map regex match positions back to token indices
+        char_to_token_map = []
+        current_pos = 0
+        for idx, token in enumerate(reference_tokens):
+            token_start = current_pos
+            token_end = current_pos + len(token)
+            char_to_token_map.append((token_start, token_end, reference_indices[idx]))
+            # Add 1 for the space separator (except after last token)
+            current_pos = token_end + (1 if idx < len(reference_tokens) - 1 else 0)
+
+        # Find all regex matches
+        try:
+            pattern = re.compile(regex_pattern, re.IGNORECASE)
+            matches = list(pattern.finditer(reference_text))
+        except re.error as e:
+            print(f"Error compiling regex pattern: {e}")
+            gr.Warning(f"Invalid regex pattern: {e}")
+            return pd.DataFrame(
+                columns=["Page1_Index", "Page2_Index", "Similarity_Score"]
+            )
+
+        if not matches:
+            print("No regex matches found")
+            gr.Info("No regex matches found")
+            return pd.DataFrame(
+                columns=["Page1_Index", "Page2_Index", "Similarity_Score"]
+            )
 
-
-
+        all_found_matches = []
+        query_index = search_df.index[0]  # Use the first (and only) query index
 
-
-
+        # For each regex match, find which tokens it spans
+        for match in matches:
+            match_start = match.start()
+            match_end = match.end()
 
-
+            # Find all tokens that overlap with this match
+            matching_token_indices = []
+            for token_start, token_end, token_idx in char_to_token_map:
+                # Check if token overlaps with match
+                if not (token_end < match_start or token_start > match_end):
+                    matching_token_indices.append(token_idx)
 
-
-
-
-        window = reference_tokens[i : i + query_len]
+            # Create matches for all tokens in the span
+            for token_idx in matching_token_indices:
+                all_found_matches.append((query_index, token_idx, 1))
 
-
-
+        print(
+            f"Found {len(matches)} regex match(es) spanning {len(set(idx for _, idx, _ in all_found_matches))} token(s)"
+        )
 
-
-
+    else:
+        # Original literal token matching logic
+        # Step 2: Convert the token data into lists for easy comparison.
+        # We need both the text tokens and their original global indices.
+        query_tokens = search_df["text_clean"].tolist()
+        query_indices = search_df.index.tolist()
 
-
-
-
-
-
+        reference_tokens = reference_df["text_clean"].tolist()
+        reference_indices = reference_df.index.tolist()
+
+        query_len = len(query_tokens)
+        all_found_matches = list()
+
+        print(f"Searching for a sequence of {query_len} tokens...")
+
+        # Step 3: Use a "sliding window" to search for the query sequence in the reference list.
+        for i in range(len(reference_tokens) - query_len + 1):
+            # The "window" is a slice of the reference list that is the same size as the query
+            window = reference_tokens[i : i + query_len]
+
+            # Step 4: If the window matches the query with or without punctuation on end
+            if _sequences_match(query_tokens, window):
+
+                # Get the global indices for this entire matching block
+                matching_reference_indices = reference_indices[i : i + query_len]
+
+                # Create the mapping between query indices and the found reference indices
+                for j in range(query_len):
+                    all_found_matches.append(
+                        (query_indices[j], matching_reference_indices[j], 1)
+                    )
 
-
-
+            # If you only want the *first* match, you can uncomment the next line:
+            # break
 
     if not all_found_matches:
         print("No matches found")

@@ -860,6 +970,7 @@ def identify_similar_text_sequences(
     file1_name: str = "",
     file2_name: str = "",
     output_folder: str = OUTPUT_FOLDER,
+    use_regex: bool = False,
     progress=Progress(track_tqdm=True),
 ) -> Tuple[pd.DataFrame, List[str], pd.DataFrame]:
     """

@@ -903,7 +1014,7 @@ def identify_similar_text_sequences(
 
     # base_similarity_df = _debug_similarity_between_two_files(df_filtered, vectorizer, similarity_threshold, file1_name, file2_name)
     base_similarity_df = find_consecutive_sequence_matches(
-        df_filtered, file1_name, file2_name
+        df_filtered, file1_name, file2_name, use_regex=use_regex
     )
     if base_similarity_df.empty:
         return pd.DataFrame(), [], df_combined
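The core of the new regex branch above is the bookkeeping that maps character offsets in the re-joined text back to word-level row indices, so a match can be attributed to every token it spans. Here is a stripped-down, standalone illustration of that idea (the tokens and indices are invented; the real function works on the DataFrame columns shown in the diff):

import re

# Invented word-level tokens, standing in for the reference DataFrame's 'text' column
tokens = ["Contact", "me", "at", "jane.doe@example.com", "by", "Friday"]
indices = list(range(100, 106))  # pretend global word indices

# Rebuild a single string and remember each token's character span within it
reference_text = " ".join(tokens)
char_to_token = []
pos = 0
for idx, token in zip(indices, tokens):
    char_to_token.append((pos, pos + len(token), idx))
    pos += len(token) + 1  # +1 for the joining space

# A regex match is mapped back to every token whose span overlaps the match span
pattern = re.compile(r"[\w.]+@[\w.]+", re.IGNORECASE)
for match in pattern.finditer(reference_text):
    hit_tokens = [idx for start, end, idx in char_to_token
                  if not (end < match.start() or start > match.end())]
    print(match.group(), "->", hit_tokens)  # -> [103], the email token
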
tools/redaction_review.py
CHANGED

@@ -767,7 +767,21 @@ def get_and_merge_current_page_annotations(
             .drop_duplicates(subset=["id"], keep="first")
         )
     else:
-
+        # Return empty DataFrame with expected columns from convert_annotation_data_to_dataframe
+        updated_df = pd.DataFrame(
+            columns=[
+                "image",
+                "page",
+                "label",
+                "color",
+                "xmin",
+                "xmax",
+                "ymin",
+                "ymax",
+                "text",
+                "id",
+            ]
+        )
 
     return updated_df
 

@@ -932,7 +946,21 @@ def create_annotation_objects_from_filtered_ocr_results_with_words(
     if dfs_to_concat:
         updated_annotations_df = pd.concat(dfs_to_concat, ignore_index=True)
     else:
-
+        # Return empty DataFrame with expected columns matching existing_annotations_df structure
+        updated_annotations_df = pd.DataFrame(
+            columns=[
+                "image",
+                "page",
+                "label",
+                "color",
+                "xmin",
+                "xmax",
+                "ymin",
+                "ymax",
+                "text",
+                "id",
+            ]
+        )
 
     # --- Part 4: Convert final DataFrame to list-of-dicts ---
     updated_recogniser_entity_df = pd.DataFrame()