seanpedrickcase committed on
Commit 21318d3 · 1 Parent(s): d5b5291

Added regex search feature for multi-word text search

README.md CHANGED
@@ -589,9 +589,11 @@ The workflow is designed to be simple: **Search → Select → Redact**.
589
 
590
  #### **Step 1: Search for Text**
591
 
 
 
592
  1. Navigate to the **"Search text to make new redactions"** tab.
593
- 2. The main table will initially be populated with all the text extracted from the document, broken down by word.
594
- 3. To narrow this down, use the **"Multi-word text search"** box to type the word or phrase you want to find.
595
  4. Click the **"Search"** button or press Enter.
596
  5. The table below will update to show only the rows containing text that matches your search query.
597
 
 
589
 
590
  #### **Step 1: Search for Text**
591
 
592
+ #### **Step 1: Search for Text**
593
+
594
  1. Navigate to the **"Search text to make new redactions"** tab.
595
+ 2. The main table will initially be populated with all the text extracted from the document for a single page at a time, broken down by word.
596
+ 3. To narrow this down, use the **"Multi-word text search"** box to type the word or phrase you want to find (this searches the whole document). To run a regex-based search instead, tick the 'Enable regex pattern matching' box under 'Search options' below (note that regex patterns can only match text within each cell).
597
  4. Click the **"Search"** button or press Enter.
598
  5. The table below will update to show only the rows containing text that matches your search query.
599
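As a quick illustration of the difference between the two search modes described above (the query strings here are hypothetical examples, not values taken from the app):

```python
import re

# Literal multi-word search: the phrase is matched word-for-word against the
# extracted document text.
literal_query = "John Smith"

# Regex search (with 'Enable regex pattern matching' ticked): the query is
# treated as a regular expression and can only match text within each cell.
regex_query = r"\b\d{2}/\d{2}/\d{4}\b"  # hypothetical date-like pattern

sample_cell = "Date of birth: 01/02/1990"
print(bool(re.search(regex_query, sample_cell)))  # True
```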
 
app.py CHANGED
@@ -1701,13 +1701,23 @@ with blocks:
1701
  label="Minimum similarity score for match (max=1)",
1702
  visible=False,
1703
  ) # Not used anymore for this exact search
1704
- new_redaction_text_label = gr.Textbox(
1705
- label="Label for new redactions", value="Redaction"
1706
- )
1707
- colour_label = gr.Textbox(
1708
- label="Colour for labels (three number RGB format, max 255 with brackets)",
1709
- value=CUSTOM_BOX_COLOUR,
1710
- )
 
 
 
 
 
 
 
 
 
 
1711
 
1712
  all_page_line_level_ocr_results_with_words_df = gr.Dataframe(
1713
  pd.DataFrame(
@@ -4701,12 +4711,29 @@ with blocks:
4701
  outputs=[all_page_line_level_ocr_results_with_words_df],
4702
  )
4703
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4704
  multi_word_search_text.submit(
4705
- fn=run_full_search_and_analysis,
4706
  inputs=[
4707
  multi_word_search_text,
4708
  all_page_line_level_ocr_results_with_words_df_base,
4709
  similarity_search_score_minimum,
 
4710
  ],
4711
  outputs=[
4712
  all_page_line_level_ocr_results_with_words_df,
@@ -4716,11 +4743,12 @@ with blocks:
4716
  )
4717
 
4718
  multi_word_search_text_btn.click(
4719
- fn=run_full_search_and_analysis,
4720
  inputs=[
4721
  multi_word_search_text,
4722
  all_page_line_level_ocr_results_with_words_df_base,
4723
  similarity_search_score_minimum,
 
4724
  ],
4725
  outputs=[
4726
  all_page_line_level_ocr_results_with_words_df,
 
1701
  label="Minimum similarity score for match (max=1)",
1702
  visible=False,
1703
  ) # Not used anymore for this exact search
1704
+
1705
+ with gr.Row():
1706
+ with gr.Column():
1707
+ new_redaction_text_label = gr.Textbox(
1708
+ label="Label for new redactions",
1709
+ value="Redaction",
1710
+ )
1711
+ colour_label = gr.Textbox(
1712
+ label="Colour for labels (three number RGB format, max 255 with brackets)",
1713
+ value=CUSTOM_BOX_COLOUR,
1714
+ )
1715
+ with gr.Column():
1716
+ use_regex_search = gr.Checkbox(
1717
+ label="Enable regex pattern matching",
1718
+ value=False,
1719
+ info="When enabled, the search text will be treated as a regular expression pattern instead of literal text",
1720
+ )
1721
 
1722
  all_page_line_level_ocr_results_with_words_df = gr.Dataframe(
1723
  pd.DataFrame(
 
4711
  outputs=[all_page_line_level_ocr_results_with_words_df],
4712
  )
4713
 
4714
+ def run_search_with_regex_option(
4715
+ search_text, word_df, similarity_threshold, use_regex_flag
4716
+ ):
4717
+ """Wrapper function to call run_full_search_and_analysis with regex option"""
4718
+ return run_full_search_and_analysis(
4719
+ search_query_text=search_text,
4720
+ word_level_df_orig=word_df,
4721
+ similarity_threshold=similarity_threshold,
4722
+ combine_pages=False,
4723
+ min_word_count=1,
4724
+ min_consecutive_pages=1,
4725
+ greedy_match=True,
4726
+ remake_index=False,
4727
+ use_regex=use_regex_flag,
4728
+ )
4729
+
4730
  multi_word_search_text.submit(
4731
+ fn=run_search_with_regex_option,
4732
  inputs=[
4733
  multi_word_search_text,
4734
  all_page_line_level_ocr_results_with_words_df_base,
4735
  similarity_search_score_minimum,
4736
+ use_regex_search,
4737
  ],
4738
  outputs=[
4739
  all_page_line_level_ocr_results_with_words_df,
 
4743
  )
4744
 
4745
  multi_word_search_text_btn.click(
4746
+ fn=run_search_with_regex_option,
4747
  inputs=[
4748
  multi_word_search_text,
4749
  all_page_line_level_ocr_results_with_words_df_base,
4750
  similarity_search_score_minimum,
4751
+ use_regex_search,
4752
  ],
4753
  outputs=[
4754
  all_page_line_level_ocr_results_with_words_df,
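The wiring pattern used above can be shown with a minimal, self-contained Gradio sketch (these components and the search function are stand-ins, not the app's actual ones); the checkbox is simply appended to each event's inputs list, exactly as use_regex_search is above:

```python
import gradio as gr


def search(query: str, use_regex: bool) -> str:
    # Stand-in for run_search_with_regex_option: just report what would run.
    mode = "regex" if use_regex else "literal"
    return f"Would run a {mode} search for: {query!r}"


with gr.Blocks() as demo:
    query_box = gr.Textbox(label="Multi-word text search")
    regex_box = gr.Checkbox(label="Enable regex pattern matching", value=False)
    result_box = gr.Textbox(label="Result")
    search_btn = gr.Button("Search")

    # Both the textbox submit and the button click receive the checkbox state.
    query_box.submit(fn=search, inputs=[query_box, regex_box], outputs=result_box)
    search_btn.click(fn=search, inputs=[query_box, regex_box], outputs=result_box)

demo.launch()
```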
src/user_guide.qmd CHANGED
@@ -366,8 +366,8 @@ The workflow is designed to be simple: **Search → Select → Redact**.
366
  #### **Step 1: Search for Text**
367
 
368
  1. Navigate to the **"Search text to make new redactions"** tab.
369
- 2. The main table will initially be populated with all the text extracted from the document, broken down by word.
370
- 3. To narrow this down, use the **"Multi-word text search"** box to type the word or phrase you want to find.
371
  4. Click the **"Search"** button or press Enter.
372
  5. The table below will update to show only the rows containing text that matches your search query.
373
 
 
366
  #### **Step 1: Search for Text**
367
 
368
  1. Navigate to the **"Search text to make new redactions"** tab.
369
+ 2. The main table will initially be populated with all the text extracted from the document for a single page at a time, broken down by word.
370
+ 3. To narrow this down, use the **"Multi-word text search"** box to type the word or phrase you want to find (this searches the whole document). To run a regex-based search instead, tick the 'Enable regex pattern matching' box under 'Search options' below (note that regex patterns can only match text within each cell).
371
  4. Click the **"Search"** button or press Enter.
372
  5. The table below will update to show only the rows containing text that matches your search query.
373
 
tools/custom_csvlogger.py CHANGED
@@ -228,7 +228,7 @@ class CSVLogger_custom(FlaggingCallback):
228
 
229
  if RUN_AWS_FUNCTIONS:
230
  try:
231
- print("Connecting to DynamoDB via existing SSO connection")
232
  dynamodb = boto3.resource("dynamodb", region_name=AWS_REGION)
233
 
234
  dynamodb.meta.client.list_tables()
@@ -236,9 +236,9 @@ class CSVLogger_custom(FlaggingCallback):
236
  except Exception as e:
237
  print("No SSO credentials found:", e)
238
  if AWS_ACCESS_KEY and AWS_SECRET_KEY:
239
- print(
240
- "Trying to get DynamoDB credentials from environment variables"
241
- )
242
  dynamodb = boto3.resource(
243
  "dynamodb",
244
  aws_access_key_id=AWS_ACCESS_KEY,
@@ -328,7 +328,7 @@ class CSVLogger_custom(FlaggingCallback):
328
 
329
  table.put_item(Item=item)
330
 
331
- print("Successfully uploaded log to DynamoDB")
332
  except Exception as e:
333
  print("Could not upload log to DynamobDB due to", e)
334
 
 
228
 
229
  if RUN_AWS_FUNCTIONS:
230
  try:
231
+ # print("Connecting to DynamoDB via existing SSO connection")
232
  dynamodb = boto3.resource("dynamodb", region_name=AWS_REGION)
233
 
234
  dynamodb.meta.client.list_tables()
 
236
  except Exception as e:
237
  print("No SSO credentials found:", e)
238
  if AWS_ACCESS_KEY and AWS_SECRET_KEY:
239
+ # print(
240
+ # "Trying to get DynamoDB credentials from environment variables"
241
+ # )
242
  dynamodb = boto3.resource(
243
  "dynamodb",
244
  aws_access_key_id=AWS_ACCESS_KEY,
 
328
 
329
  table.put_item(Item=item)
330
 
331
+ # print("Successfully uploaded log to DynamoDB")
332
  except Exception as e:
333
  print("Could not upload log to DynamobDB due to", e)
334
 
tools/file_redaction.py CHANGED
@@ -441,7 +441,6 @@ def choose_and_run_redactor(
441
  current_loop_page = 0
442
  out_file_paths = list()
443
  log_files_output_paths = list()
444
- estimate_total_processing_time = 0
445
  estimated_time_taken_state = 0
446
  comprehend_query_number = 0
447
  total_textract_query_number = 0
@@ -543,9 +542,7 @@ def choose_and_run_redactor(
543
  if total_textract_query_number > number_of_pages:
544
  total_textract_query_number = number_of_pages
545
 
546
- estimate_total_processing_time = sum_numbers_before_seconds(
547
- combined_out_message
548
- )
549
  # print(
550
  # "Estimated total processing time:",
551
  # str(estimate_total_processing_time),
@@ -1317,7 +1314,7 @@ def choose_and_run_redactor(
1317
  number_of_pages,
1318
  page_max,
1319
  )
1320
- #print("Saving redacted PDF file:", out_redacted_pdf_file_path)
1321
 
1322
  # Use final document if available, otherwise use main document
1323
  doc_to_save = (
@@ -1352,7 +1349,7 @@ def choose_and_run_redactor(
1352
  number_of_pages,
1353
  page_max,
1354
  )
1355
- #print("Saving PDF file for review:", out_review_pdf_file_path)
1356
 
1357
  if out_review_pdf_file_path:
1358
  save_pdf_with_or_without_compression(
@@ -1692,9 +1689,7 @@ def choose_and_run_redactor(
1692
  combined_out_message + " " + out_time_message
1693
  ) # Ensure this is a single string
1694
 
1695
- estimate_total_processing_time = sum_numbers_before_seconds(
1696
- combined_out_message
1697
- )
1698
 
1699
  # else:
1700
  # toc = time.perf_counter()
@@ -3299,7 +3294,7 @@ def redact_image_pdf(
3299
 
3300
  # Go through each page
3301
  for page_no in progress_bar:
3302
-
3303
  reported_page_number = str(page_no + 1)
3304
  print(f"Current page: {reported_page_number}")
3305
 
@@ -3308,7 +3303,6 @@ def redact_image_pdf(
3308
  page_handwriting_recogniser_results = list()
3309
  page_line_level_ocr_results_with_words = list()
3310
  page_break_return = False
3311
-
3312
 
3313
  # Try to find image location
3314
  try:
@@ -3419,7 +3413,7 @@ def redact_image_pdf(
3419
  if image is None:
3420
  # Check if image_path is a placeholder and create the actual image
3421
  if isinstance(image_path, str) and "placeholder_image" in image_path:
3422
- #print(f"Detected placeholder image path: {image_path}")
3423
  try:
3424
  # Extract page number from placeholder path
3425
  page_num_from_placeholder = int(
@@ -3628,26 +3622,25 @@ def redact_image_pdf(
3628
  page["data"]
3629
  for page in textract_data["pages"]
3630
  if page["page_no"] == reported_page_number
3631
- )
3632
 
3633
  # Check if this is whole-document Textract output (already converted to mediabox space)
3634
  # by checking if the JSON structure indicates it came from restructure_textract_output
3635
  # or if textract_output_found is True (indicating pre-existing whole-document output)
3636
- use_mediabox_for_textract = (
3637
- textract_output_found or
3638
- ("pages" in textract_data and len(textract_data.get("pages", [])) > 0)
3639
  )
3640
-
3641
  if use_mediabox_for_textract:
3642
  # Whole-document Textract: use mediabox dimensions
3643
  textract_page_width = pymupdf_page.mediabox.width
3644
  textract_page_height = pymupdf_page.mediabox.height
3645
- #print(f"Using mediabox dimensions for whole-document Textract: {textract_page_width}x{textract_page_height}")
3646
  else:
3647
  # Individual image Textract: use image dimensions (current behavior)
3648
  textract_page_width = page_width
3649
  textract_page_height = page_height
3650
- #print(f"Using image dimensions for individual image Textract: {textract_page_width}x{textract_page_height}")
3651
 
3652
  (
3653
  page_line_level_ocr_results,
@@ -3658,7 +3651,10 @@ def redact_image_pdf(
3658
  selection_element_results,
3659
  form_key_value_results,
3660
  ) = json_to_ocrresult(
3661
- text_blocks, textract_page_width, textract_page_height, reported_page_number
 
 
 
3662
  )
3663
 
3664
  if all_page_line_level_ocr_results_with_words is None:
@@ -4812,9 +4808,13 @@ def redact_text_pdf(
4812
 
4813
  if page_text_ocr_outputs_list:
4814
  # Filter out empty DataFrames before concatenation to avoid FutureWarning
4815
- non_empty_ocr_outputs = [df for df in page_text_ocr_outputs_list if not df.empty]
 
 
4816
  if non_empty_ocr_outputs:
4817
- page_text_ocr_outputs = pd.concat(non_empty_ocr_outputs, ignore_index=True)
 
 
4818
  else:
4819
  page_text_ocr_outputs = pd.DataFrame(
4820
  columns=[
@@ -4960,17 +4960,50 @@ def redact_text_pdf(
4960
 
4961
  # Write logs
4962
  # Filter out empty DataFrames before concatenation to avoid FutureWarning
4963
- non_empty_decision_process = [df for df in all_pages_decision_process_list if not df.empty]
 
 
4964
  if non_empty_decision_process:
4965
- all_pages_decision_process_table = pd.concat(non_empty_decision_process, ignore_index=True)
 
 
4966
  else:
4967
- all_pages_decision_process_table = pd.DataFrame()
4968
-
4969
- non_empty_ocr_results = [df for df in all_line_level_ocr_results_list if not df.empty]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4970
  if non_empty_ocr_results:
4971
- all_line_level_ocr_results_df = pd.concat(non_empty_ocr_results, ignore_index=True)
 
 
4972
  else:
4973
- all_line_level_ocr_results_df = pd.DataFrame()
 
 
 
 
 
 
 
 
 
 
 
4974
 
4975
  current_loop_page += 1
4976
 
@@ -5010,11 +5043,29 @@ def redact_text_pdf(
5010
 
5011
  # Write logs
5012
  # Filter out empty DataFrames before concatenation to avoid FutureWarning
5013
- non_empty_decision_process = [df for df in all_pages_decision_process_list if not df.empty]
 
 
5014
  if non_empty_decision_process:
5015
- all_pages_decision_process_table = pd.concat(non_empty_decision_process, ignore_index=True)
 
 
5016
  else:
5017
- all_pages_decision_process_table = pd.DataFrame()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5018
 
5019
  return (
5020
  pymupdf_doc,
@@ -5029,52 +5080,81 @@ def redact_text_pdf(
5029
 
5030
  # Write all page outputs
5031
  # Filter out empty DataFrames before concatenation to avoid FutureWarning
5032
- non_empty_decision_process = [df for df in all_pages_decision_process_list if not df.empty]
 
 
5033
  if non_empty_decision_process:
5034
- all_pages_decision_process_table = pd.concat(non_empty_decision_process, ignore_index=True)
 
 
5035
  else:
5036
- all_pages_decision_process_table = pd.DataFrame()
5037
-
5038
- non_empty_ocr_results = [df for df in all_line_level_ocr_results_list if not df.empty]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5039
  if non_empty_ocr_results:
5040
- all_line_level_ocr_results_df = pd.concat(non_empty_ocr_results, ignore_index=True)
 
 
5041
  else:
5042
- all_line_level_ocr_results_df = pd.DataFrame()
 
 
5043
 
5044
- # Convert decision table to relative coordinates
5045
- all_pages_decision_process_table = divide_coordinates_by_page_sizes(
5046
- all_pages_decision_process_table,
5047
- page_sizes_df,
5048
- xmin="xmin",
5049
- xmax="xmax",
5050
- ymin="ymin",
5051
- ymax="ymax",
5052
- )
5053
 
5054
- # Coordinates need to be reversed for ymin and ymax to match with image annotator objects downstream
5055
- all_pages_decision_process_table["ymin"] = reverse_y_coords(
5056
- all_pages_decision_process_table, "ymin"
5057
- )
5058
- all_pages_decision_process_table["ymax"] = reverse_y_coords(
5059
- all_pages_decision_process_table, "ymax"
5060
- )
 
 
5061
 
5062
- # Convert decision table to relative coordinates
5063
- all_line_level_ocr_results_df = divide_coordinates_by_page_sizes(
5064
- all_line_level_ocr_results_df,
5065
- page_sizes_df,
5066
- xmin="left",
5067
- xmax="width",
5068
- ymin="top",
5069
- ymax="height",
5070
- )
5071
 
5072
- # Coordinates need to be reversed for ymin and ymax to match with image annotator objects downstream
5073
  if not all_line_level_ocr_results_df.empty:
5074
- all_line_level_ocr_results_df["top"] = reverse_y_coords(
5075
- all_line_level_ocr_results_df, "top"
 
 
 
 
 
 
5076
  )
5077
 
 
 
 
 
 
 
5078
  # Remove empty dictionary items from ocr results with words
5079
  all_page_line_level_ocr_results_with_words = [
5080
  d for d in all_page_line_level_ocr_results_with_words if d
 
441
  current_loop_page = 0
442
  out_file_paths = list()
443
  log_files_output_paths = list()
 
444
  estimated_time_taken_state = 0
445
  comprehend_query_number = 0
446
  total_textract_query_number = 0
 
542
  if total_textract_query_number > number_of_pages:
543
  total_textract_query_number = number_of_pages
544
 
545
+ sum_numbers_before_seconds(combined_out_message)
 
 
546
  # print(
547
  # "Estimated total processing time:",
548
  # str(estimate_total_processing_time),
 
1314
  number_of_pages,
1315
  page_max,
1316
  )
1317
+ # print("Saving redacted PDF file:", out_redacted_pdf_file_path)
1318
 
1319
  # Use final document if available, otherwise use main document
1320
  doc_to_save = (
 
1349
  number_of_pages,
1350
  page_max,
1351
  )
1352
+ # print("Saving PDF file for review:", out_review_pdf_file_path)
1353
 
1354
  if out_review_pdf_file_path:
1355
  save_pdf_with_or_without_compression(
 
1689
  combined_out_message + " " + out_time_message
1690
  ) # Ensure this is a single string
1691
 
1692
+ sum_numbers_before_seconds(combined_out_message)
 
 
1693
 
1694
  # else:
1695
  # toc = time.perf_counter()
 
3294
 
3295
  # Go through each page
3296
  for page_no in progress_bar:
3297
+
3298
  reported_page_number = str(page_no + 1)
3299
  print(f"Current page: {reported_page_number}")
3300
 
 
3303
  page_handwriting_recogniser_results = list()
3304
  page_line_level_ocr_results_with_words = list()
3305
  page_break_return = False
 
3306
 
3307
  # Try to find image location
3308
  try:
 
3413
  if image is None:
3414
  # Check if image_path is a placeholder and create the actual image
3415
  if isinstance(image_path, str) and "placeholder_image" in image_path:
3416
+ # print(f"Detected placeholder image path: {image_path}")
3417
  try:
3418
  # Extract page number from placeholder path
3419
  page_num_from_placeholder = int(
 
3622
  page["data"]
3623
  for page in textract_data["pages"]
3624
  if page["page_no"] == reported_page_number
3625
+ )
3626
 
3627
  # Check if this is whole-document Textract output (already converted to mediabox space)
3628
  # by checking if the JSON structure indicates it came from restructure_textract_output
3629
  # or if textract_output_found is True (indicating pre-existing whole-document output)
3630
+ use_mediabox_for_textract = textract_output_found or (
3631
+ "pages" in textract_data and len(textract_data.get("pages", [])) > 0
 
3632
  )
3633
+
3634
  if use_mediabox_for_textract:
3635
  # Whole-document Textract: use mediabox dimensions
3636
  textract_page_width = pymupdf_page.mediabox.width
3637
  textract_page_height = pymupdf_page.mediabox.height
3638
+ # print(f"Using mediabox dimensions for whole-document Textract: {textract_page_width}x{textract_page_height}")
3639
  else:
3640
  # Individual image Textract: use image dimensions (current behavior)
3641
  textract_page_width = page_width
3642
  textract_page_height = page_height
3643
+ # print(f"Using image dimensions for individual image Textract: {textract_page_width}x{textract_page_height}")
3644
 
3645
  (
3646
  page_line_level_ocr_results,
 
3651
  selection_element_results,
3652
  form_key_value_results,
3653
  ) = json_to_ocrresult(
3654
+ text_blocks,
3655
+ textract_page_width,
3656
+ textract_page_height,
3657
+ reported_page_number,
3658
  )
3659
 
3660
  if all_page_line_level_ocr_results_with_words is None:
 
4808
 
4809
  if page_text_ocr_outputs_list:
4810
  # Filter out empty DataFrames before concatenation to avoid FutureWarning
4811
+ non_empty_ocr_outputs = [
4812
+ df for df in page_text_ocr_outputs_list if not df.empty
4813
+ ]
4814
  if non_empty_ocr_outputs:
4815
+ page_text_ocr_outputs = pd.concat(
4816
+ non_empty_ocr_outputs, ignore_index=True
4817
+ )
4818
  else:
4819
  page_text_ocr_outputs = pd.DataFrame(
4820
  columns=[
 
4960
 
4961
  # Write logs
4962
  # Filter out empty DataFrames before concatenation to avoid FutureWarning
4963
+ non_empty_decision_process = [
4964
+ df for df in all_pages_decision_process_list if not df.empty
4965
+ ]
4966
  if non_empty_decision_process:
4967
+ all_pages_decision_process_table = pd.concat(
4968
+ non_empty_decision_process, ignore_index=True
4969
+ )
4970
  else:
4971
+ all_pages_decision_process_table = pd.DataFrame(
4972
+ columns=[
4973
+ "text",
4974
+ "xmin",
4975
+ "ymin",
4976
+ "xmax",
4977
+ "ymax",
4978
+ "label",
4979
+ "start",
4980
+ "end",
4981
+ "score",
4982
+ "page",
4983
+ "id",
4984
+ ]
4985
+ )
4986
+
4987
+ non_empty_ocr_results = [
4988
+ df for df in all_line_level_ocr_results_list if not df.empty
4989
+ ]
4990
  if non_empty_ocr_results:
4991
+ all_line_level_ocr_results_df = pd.concat(
4992
+ non_empty_ocr_results, ignore_index=True
4993
+ )
4994
  else:
4995
+ all_line_level_ocr_results_df = pd.DataFrame(
4996
+ columns=[
4997
+ "page",
4998
+ "text",
4999
+ "left",
5000
+ "top",
5001
+ "width",
5002
+ "height",
5003
+ "line",
5004
+ "conf",
5005
+ ]
5006
+ )
5007
 
5008
  current_loop_page += 1
5009
 
 
5043
 
5044
  # Write logs
5045
  # Filter out empty DataFrames before concatenation to avoid FutureWarning
5046
+ non_empty_decision_process = [
5047
+ df for df in all_pages_decision_process_list if not df.empty
5048
+ ]
5049
  if non_empty_decision_process:
5050
+ all_pages_decision_process_table = pd.concat(
5051
+ non_empty_decision_process, ignore_index=True
5052
+ )
5053
  else:
5054
+ all_pages_decision_process_table = pd.DataFrame(
5055
+ columns=[
5056
+ "text",
5057
+ "xmin",
5058
+ "ymin",
5059
+ "xmax",
5060
+ "ymax",
5061
+ "label",
5062
+ "start",
5063
+ "end",
5064
+ "score",
5065
+ "page",
5066
+ "id",
5067
+ ]
5068
+ )
5069
 
5070
  return (
5071
  pymupdf_doc,
 
5080
 
5081
  # Write all page outputs
5082
  # Filter out empty DataFrames before concatenation to avoid FutureWarning
5083
+ non_empty_decision_process = [
5084
+ df for df in all_pages_decision_process_list if not df.empty
5085
+ ]
5086
  if non_empty_decision_process:
5087
+ all_pages_decision_process_table = pd.concat(
5088
+ non_empty_decision_process, ignore_index=True
5089
+ )
5090
  else:
5091
+ all_pages_decision_process_table = pd.DataFrame(
5092
+ columns=[
5093
+ "text",
5094
+ "xmin",
5095
+ "ymin",
5096
+ "xmax",
5097
+ "ymax",
5098
+ "label",
5099
+ "start",
5100
+ "end",
5101
+ "score",
5102
+ "page",
5103
+ "id",
5104
+ ]
5105
+ )
5106
+
5107
+ non_empty_ocr_results = [
5108
+ df for df in all_line_level_ocr_results_list if not df.empty
5109
+ ]
5110
  if non_empty_ocr_results:
5111
+ all_line_level_ocr_results_df = pd.concat(
5112
+ non_empty_ocr_results, ignore_index=True
5113
+ )
5114
  else:
5115
+ all_line_level_ocr_results_df = pd.DataFrame(
5116
+ columns=["page", "text", "left", "top", "width", "height", "line", "conf"]
5117
+ )
5118
 
5119
+ if not all_pages_decision_process_table.empty:
 
 
 
 
 
 
 
 
5120
 
5121
+ # Convert decision table to relative coordinates
5122
+ all_pages_decision_process_table = divide_coordinates_by_page_sizes(
5123
+ all_pages_decision_process_table,
5124
+ page_sizes_df,
5125
+ xmin="xmin",
5126
+ xmax="xmax",
5127
+ ymin="ymin",
5128
+ ymax="ymax",
5129
+ )
5130
 
5131
+ # Coordinates need to be reversed for ymin and ymax to match with image annotator objects downstream
5132
+
5133
+ all_pages_decision_process_table["ymin"] = reverse_y_coords(
5134
+ all_pages_decision_process_table, "ymin"
5135
+ )
5136
+ all_pages_decision_process_table["ymax"] = reverse_y_coords(
5137
+ all_pages_decision_process_table, "ymax"
5138
+ )
 
5139
 
5140
+ # Convert decision table to relative coordinates
5141
  if not all_line_level_ocr_results_df.empty:
5142
+
5143
+ all_line_level_ocr_results_df = divide_coordinates_by_page_sizes(
5144
+ all_line_level_ocr_results_df,
5145
+ page_sizes_df,
5146
+ xmin="left",
5147
+ xmax="width",
5148
+ ymin="top",
5149
+ ymax="height",
5150
  )
5151
 
5152
+ # Coordinates need to be reversed for ymin and ymax to match with image annotator objects downstream
5153
+ if not all_line_level_ocr_results_df.empty:
5154
+ all_line_level_ocr_results_df["top"] = reverse_y_coords(
5155
+ all_line_level_ocr_results_df, "top"
5156
+ )
5157
+
5158
  # Remove empty dictionary items from ocr results with words
5159
  all_page_line_level_ocr_results_with_words = [
5160
  d for d in all_page_line_level_ocr_results_with_words if d
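The repeated blocks above all follow one pattern: drop empty DataFrames before pd.concat (which otherwise raises a FutureWarning in recent pandas), and fall back to an empty frame with an explicit schema so downstream coordinate conversion still finds its columns. A minimal standalone sketch of that pattern (the two-column schema is illustrative, not the file's full decision-table schema):

```python
import pandas as pd

frames = [
    pd.DataFrame({"page": [1], "text": ["example"]}),
    pd.DataFrame(),  # an empty frame that would trigger the concat FutureWarning
]

# Filter out empty DataFrames before concatenation.
non_empty = [df for df in frames if not df.empty]
if non_empty:
    combined = pd.concat(non_empty, ignore_index=True)
else:
    # Keep the expected columns so later steps that index into them do not fail.
    combined = pd.DataFrame(columns=["page", "text"])

print(combined)
```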
tools/find_duplicate_pages.py CHANGED
@@ -122,6 +122,7 @@ def run_full_search_and_analysis(
122
  min_consecutive_pages: int = 1,
123
  greedy_match: bool = True,
124
  remake_index: bool = False,
 
125
  progress=gr.Progress(track_tqdm=True),
126
  ):
127
  """
@@ -133,7 +134,7 @@ def run_full_search_and_analysis(
133
  4. Executes the similarity analysis on the combined data using the specified parameters such as similarity threshold, minimum word count, minimum consecutive pages, and greedy match strategy.
134
 
135
  Parameters:
136
- - search_query_text (str): The text entered by the user to search for in the OCR data.
137
  - word_level_df_orig (pd.DataFrame): The original DataFrame containing word-level OCR data.
138
  - similarity_threshold (float, optional): The minimum similarity score required for two pages to be considered duplicates. Defaults to 1.
139
  - combine_pages (bool, optional): A flag indicating whether to combine text from the same page number within a file. Defaults to False.
@@ -141,6 +142,7 @@ def run_full_search_and_analysis(
141
  - min_consecutive_pages (int, optional): The minimum number of consecutive pages required to be considered a match. Defaults to 1.
142
  - greedy_match (bool, optional): A flag indicating whether to use a greedy strategy for matching consecutive pages. Defaults to True.
143
  - remake_index (bool, optional): A flag indicating whether to remake the index of the DataFrame during processing. Defaults to False.
 
144
  - progress (gr.Progress, optional): A Progress object to track the progress of the operation. Defaults to a Progress object with track_tqdm set to True.
145
  """
146
 
@@ -149,30 +151,56 @@ def run_full_search_and_analysis(
149
  if len(search_query_text) > 100:
150
  raise Warning("Please use a search query with at less than 100 characters.")
151
 
152
- if punctuation_at_word_text_end(word_level_df_orig) is True:
153
- do_punctuation_split = False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
154
  else:
155
- do_punctuation_split = True
156
-
157
- # Step 1: Process the user's search query string
158
- search_query_data, query_word_length = create_dataframe_from_string(
159
- search_query_text,
160
- file_name="user_search_query",
161
- split_words=True,
162
- split_punctuation=do_punctuation_split,
163
- )
164
- if not search_query_data:
165
- # Handle case where user submits an empty search string
166
- raise Warning("Could not convert search string to required format")
 
 
 
 
167
 
168
- if query_word_length > 25:
169
- # Handle case where user submits an empty search string
170
- raise Warning("Please use a query with less than 25 words")
171
 
172
- # Overwrite min_consecutive_pages with the search string length
173
- min_consecutive_pages = query_word_length
174
 
175
  # Create word index from reference table
 
 
 
 
 
 
176
  word_level_df_orig["index"] = word_level_df_orig.index
177
  word_level_df = word_level_df_orig.copy()
178
 
@@ -204,6 +232,7 @@ def run_full_search_and_analysis(
204
  do_text_clean=False,
205
  file1_name="user_search_query",
206
  file2_name="source_document",
 
207
  progress=progress,
208
  )
209
 
@@ -777,7 +806,10 @@ def _sequences_match(query_seq: List[str], ref_seq: List[str]) -> bool:
777
 
778
 
779
  def find_consecutive_sequence_matches(
780
- df_filtered: pd.DataFrame, search_file_name: str, reference_file_name: str
 
 
 
781
  ) -> pd.DataFrame:
782
  """
783
  Finds all occurrences of a consecutive sequence of tokens from a search file
@@ -789,6 +821,7 @@ def find_consecutive_sequence_matches(
789
  df_filtered: The DataFrame containing all tokens, with 'file' and 'text_clean' columns.
790
  search_file_name: The name of the file containing the search query sequence.
791
  reference_file_name: The name of the file to search within.
 
792
 
793
  Returns:
794
  A DataFrame with two columns ('Page1_Index', 'Page2_Index') mapping the
@@ -803,38 +836,115 @@ def find_consecutive_sequence_matches(
803
  print("Error: One or both files not found or are empty.")
804
  return pd.DataFrame(columns=["Page1_Index", "Page2_Index"])
805
 
806
- # Step 2: Convert the token data into lists for easy comparison.
807
- # We need both the text tokens and their original global indices.
808
- query_tokens = search_df["text_clean"].tolist()
809
- query_indices = search_df.index.tolist()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
810
 
811
- reference_tokens = reference_df["text_clean"].tolist()
812
- reference_indices = reference_df.index.tolist()
813
 
814
- query_len = len(query_tokens)
815
- all_found_matches = list()
 
 
816
 
817
- print(f"Searching for a sequence of {query_len} tokens...")
 
 
 
 
 
818
 
819
- # Step 3: Use a "sliding window" to search for the query sequence in the reference list.
820
- for i in range(len(reference_tokens) - query_len + 1):
821
- # The "window" is a slice of the reference list that is the same size as the query
822
- window = reference_tokens[i : i + query_len]
823
 
824
- # Step 4: If the window matches the query with or without punctuation on end
825
- if _sequences_match(query_tokens, window):
 
826
 
827
- # Get the global indices for this entire matching block
828
- matching_reference_indices = reference_indices[i : i + query_len]
 
 
 
 
829
 
830
- # Create the mapping between query indices and the found reference indices
831
- for j in range(query_len):
832
- all_found_matches.append(
833
- (query_indices[j], matching_reference_indices[j], 1)
834
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
835
 
836
- # If you only want the *first* match, you can uncomment the next line:
837
- # break
838
 
839
  if not all_found_matches:
840
  print("No matches found")
@@ -860,6 +970,7 @@ def identify_similar_text_sequences(
860
  file1_name: str = "",
861
  file2_name: str = "",
862
  output_folder: str = OUTPUT_FOLDER,
 
863
  progress=Progress(track_tqdm=True),
864
  ) -> Tuple[pd.DataFrame, List[str], pd.DataFrame]:
865
  """
@@ -903,7 +1014,7 @@ def identify_similar_text_sequences(
903
 
904
  # base_similarity_df = _debug_similarity_between_two_files(df_filtered, vectorizer, similarity_threshold, file1_name, file2_name)
905
  base_similarity_df = find_consecutive_sequence_matches(
906
- df_filtered, file1_name, file2_name
907
  )
908
  if base_similarity_df.empty:
909
  return pd.DataFrame(), [], df_combined
 
122
  min_consecutive_pages: int = 1,
123
  greedy_match: bool = True,
124
  remake_index: bool = False,
125
+ use_regex: bool = False,
126
  progress=gr.Progress(track_tqdm=True),
127
  ):
128
  """
 
134
  4. Executes the similarity analysis on the combined data using the specified parameters such as similarity threshold, minimum word count, minimum consecutive pages, and greedy match strategy.
135
 
136
  Parameters:
137
+ - search_query_text (str): The text entered by the user to search for in the OCR data. If use_regex=True, this is treated as a regex pattern.
138
  - word_level_df_orig (pd.DataFrame): The original DataFrame containing word-level OCR data.
139
  - similarity_threshold (float, optional): The minimum similarity score required for two pages to be considered duplicates. Defaults to 1.
140
  - combine_pages (bool, optional): A flag indicating whether to combine text from the same page number within a file. Defaults to False.
 
142
  - min_consecutive_pages (int, optional): The minimum number of consecutive pages required to be considered a match. Defaults to 1.
143
  - greedy_match (bool, optional): A flag indicating whether to use a greedy strategy for matching consecutive pages. Defaults to True.
144
  - remake_index (bool, optional): A flag indicating whether to remake the index of the DataFrame during processing. Defaults to False.
145
+ - use_regex (bool, optional): If True, treats search_query_text as a regex pattern instead of literal text. Defaults to False.
146
  - progress (gr.Progress, optional): A Progress object to track the progress of the operation. Defaults to a Progress object with track_tqdm set to True.
147
  """
148
 
 
151
  if len(search_query_text) > 100:
152
  raise Warning("Please use a search query with at less than 100 characters.")
153
 
154
+ # For regex mode, we handle the query differently
155
+ if use_regex:
156
+ # Validate regex pattern
157
+ try:
158
+ re.compile(search_query_text)
159
+ except re.error as e:
160
+ raise Warning(f"Invalid regex pattern: {e}")
161
+
162
+ # For regex, we don't split into words - treat as single pattern
163
+ # Create a minimal DataFrame structure for the regex pattern
164
+ search_query_data = [
165
+ (
166
+ "user_search_query",
167
+ pd.DataFrame({"page": [1], "text": [search_query_text], "line": [1]}),
168
+ )
169
+ ]
170
+ query_word_length = 1 # For regex, we'll handle matching differently
171
+ min_consecutive_pages = 1 # Regex matches can be variable length
172
  else:
173
+ # Original literal text matching logic
174
+ if punctuation_at_word_text_end(word_level_df_orig) is True:
175
+ do_punctuation_split = False
176
+ else:
177
+ do_punctuation_split = True
178
+
179
+ # Step 1: Process the user's search query string
180
+ search_query_data, query_word_length = create_dataframe_from_string(
181
+ search_query_text,
182
+ file_name="user_search_query",
183
+ split_words=True,
184
+ split_punctuation=do_punctuation_split,
185
+ )
186
+ if not search_query_data:
187
+ # Handle case where user submits an empty search string
188
+ raise Warning("Could not convert search string to required format")
189
 
190
+ if query_word_length > 25:
191
+ # Handle case where the search query contains too many words
192
+ raise Warning("Please use a query with less than 25 words")
193
 
194
+ # Overwrite min_consecutive_pages with the search string length
195
+ min_consecutive_pages = query_word_length
196
 
197
  # Create word index from reference table
198
+
199
+ if word_level_df_orig.empty:
200
+ raise gr.Error(
201
+ "No word-level data to process. Please check that you have loaded in OCR data."
202
+ )
203
+
204
  word_level_df_orig["index"] = word_level_df_orig.index
205
  word_level_df = word_level_df_orig.copy()
206
 
 
232
  do_text_clean=False,
233
  file1_name="user_search_query",
234
  file2_name="source_document",
235
+ use_regex=use_regex,
236
  progress=progress,
237
  )
238
 
 
806
 
807
 
808
  def find_consecutive_sequence_matches(
809
+ df_filtered: pd.DataFrame,
810
+ search_file_name: str,
811
+ reference_file_name: str,
812
+ use_regex: bool = False,
813
  ) -> pd.DataFrame:
814
  """
815
  Finds all occurrences of a consecutive sequence of tokens from a search file
 
821
  df_filtered: The DataFrame containing all tokens, with 'file' and 'text_clean' columns.
822
  search_file_name: The name of the file containing the search query sequence.
823
  reference_file_name: The name of the file to search within.
824
+ use_regex: If True, treats the search query as a regex pattern instead of literal tokens.
825
 
826
  Returns:
827
  A DataFrame with two columns ('Page1_Index', 'Page2_Index') mapping the
 
836
  print("Error: One or both files not found or are empty.")
837
  return pd.DataFrame(columns=["Page1_Index", "Page2_Index"])
838
 
839
+ if use_regex:
840
+ # Regex mode: Extract pattern and search in combined text
841
+ # Get the regex pattern from the search query (should be in 'text' column, not 'text_clean')
842
+ # We need to get it from the original 'text' column if available, otherwise use 'text_clean'
843
+ if "text" in search_df.columns:
844
+ regex_pattern = search_df["text"].iloc[0]
845
+ else:
846
+ regex_pattern = search_df["text_clean"].iloc[0]
847
+
848
+ # Join reference tokens back into text for regex searching
849
+ # Use original 'text' column if available to preserve original formatting (important for emails, etc.)
850
+ # Otherwise fall back to 'text_clean'
851
+ if "text" in reference_df.columns:
852
+ reference_tokens = reference_df["text"].tolist()
853
+ else:
854
+ reference_tokens = reference_df["text_clean"].tolist()
855
+ reference_indices = reference_df.index.tolist()
856
+
857
+ # Join tokens with spaces to reconstruct the text
858
+ # Note: If tokens were split at special characters like @, this may not perfectly reconstruct
859
+ # the original text, but it's the best we can do with tokenized data
860
+ reference_text = " ".join(reference_tokens)
861
+
862
+ # Build a mapping from character positions to token indices
863
+ # This helps us map regex match positions back to token indices
864
+ char_to_token_map = []
865
+ current_pos = 0
866
+ for idx, token in enumerate(reference_tokens):
867
+ token_start = current_pos
868
+ token_end = current_pos + len(token)
869
+ char_to_token_map.append((token_start, token_end, reference_indices[idx]))
870
+ # Add 1 for the space separator (except after last token)
871
+ current_pos = token_end + (1 if idx < len(reference_tokens) - 1 else 0)
872
+
873
+ # Find all regex matches
874
+ try:
875
+ pattern = re.compile(regex_pattern, re.IGNORECASE)
876
+ matches = list(pattern.finditer(reference_text))
877
+ except re.error as e:
878
+ print(f"Error compiling regex pattern: {e}")
879
+ gr.Warning(f"Invalid regex pattern: {e}")
880
+ return pd.DataFrame(
881
+ columns=["Page1_Index", "Page2_Index", "Similarity_Score"]
882
+ )
883
+
884
+ if not matches:
885
+ print("No regex matches found")
886
+ gr.Info("No regex matches found")
887
+ return pd.DataFrame(
888
+ columns=["Page1_Index", "Page2_Index", "Similarity_Score"]
889
+ )
890
 
891
+ all_found_matches = []
892
+ query_index = search_df.index[0] # Use the first (and only) query index
893
 
894
+ # For each regex match, find which tokens it spans
895
+ for match in matches:
896
+ match_start = match.start()
897
+ match_end = match.end()
898
 
899
+ # Find all tokens that overlap with this match
900
+ matching_token_indices = []
901
+ for token_start, token_end, token_idx in char_to_token_map:
902
+ # Check if token overlaps with match
903
+ if not (token_end < match_start or token_start > match_end):
904
+ matching_token_indices.append(token_idx)
905
 
906
+ # Create matches for all tokens in the span
907
+ for token_idx in matching_token_indices:
908
+ all_found_matches.append((query_index, token_idx, 1))
 
909
 
910
+ print(
911
+ f"Found {len(matches)} regex match(es) spanning {len(set(idx for _, idx, _ in all_found_matches))} token(s)"
912
+ )
913
 
914
+ else:
915
+ # Original literal token matching logic
916
+ # Step 2: Convert the token data into lists for easy comparison.
917
+ # We need both the text tokens and their original global indices.
918
+ query_tokens = search_df["text_clean"].tolist()
919
+ query_indices = search_df.index.tolist()
920
 
921
+ reference_tokens = reference_df["text_clean"].tolist()
922
+ reference_indices = reference_df.index.tolist()
923
+
924
+ query_len = len(query_tokens)
925
+ all_found_matches = list()
926
+
927
+ print(f"Searching for a sequence of {query_len} tokens...")
928
+
929
+ # Step 3: Use a "sliding window" to search for the query sequence in the reference list.
930
+ for i in range(len(reference_tokens) - query_len + 1):
931
+ # The "window" is a slice of the reference list that is the same size as the query
932
+ window = reference_tokens[i : i + query_len]
933
+
934
+ # Step 4: If the window matches the query with or without punctuation on end
935
+ if _sequences_match(query_tokens, window):
936
+
937
+ # Get the global indices for this entire matching block
938
+ matching_reference_indices = reference_indices[i : i + query_len]
939
+
940
+ # Create the mapping between query indices and the found reference indices
941
+ for j in range(query_len):
942
+ all_found_matches.append(
943
+ (query_indices[j], matching_reference_indices[j], 1)
944
+ )
945
 
946
+ # If you only want the *first* match, you can uncomment the next line:
947
+ # break
948
 
949
  if not all_found_matches:
950
  print("No matches found")
 
970
  file1_name: str = "",
971
  file2_name: str = "",
972
  output_folder: str = OUTPUT_FOLDER,
973
+ use_regex: bool = False,
974
  progress=Progress(track_tqdm=True),
975
  ) -> Tuple[pd.DataFrame, List[str], pd.DataFrame]:
976
  """
 
1014
 
1015
  # base_similarity_df = _debug_similarity_between_two_files(df_filtered, vectorizer, similarity_threshold, file1_name, file2_name)
1016
  base_similarity_df = find_consecutive_sequence_matches(
1017
+ df_filtered, file1_name, file2_name, use_regex=use_regex
1018
  )
1019
  if base_similarity_df.empty:
1020
  return pd.DataFrame(), [], df_combined
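To make the regex branch of find_consecutive_sequence_matches more concrete, here is a small self-contained sketch of the character-span-to-token-index mapping it relies on (the tokens and indices below are toy data, not real OCR output):

```python
import re

# Toy word-level tokens as they might appear in reference_df["text"]
reference_tokens = ["Contact", "john.smith@example.com", "for", "details"]
reference_indices = [10, 11, 12, 13]  # hypothetical global DataFrame indices

# Reconstruct the page text and record each token's character span,
# mirroring how char_to_token_map is built above.
reference_text = " ".join(reference_tokens)
char_to_token_map = []
pos = 0
for idx, token in enumerate(reference_tokens):
    start, end = pos, pos + len(token)
    char_to_token_map.append((start, end, reference_indices[idx]))
    pos = end + (1 if idx < len(reference_tokens) - 1 else 0)

# Each regex match is translated back to the tokens whose spans it overlaps.
pattern = re.compile(r"[\w.]+@[\w.]+", re.IGNORECASE)
for match in pattern.finditer(reference_text):
    hits = [
        token_idx
        for token_start, token_end, token_idx in char_to_token_map
        if not (token_end < match.start() or token_start > match.end())
    ]
    print(match.group(), "->", hits)  # john.smith@example.com -> [11]
```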
tools/redaction_review.py CHANGED
@@ -767,7 +767,21 @@ def get_and_merge_current_page_annotations(
767
  .drop_duplicates(subset=["id"], keep="first")
768
  )
769
  else:
770
- updated_df = pd.DataFrame()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
771
 
772
  return updated_df
773
 
@@ -932,7 +946,21 @@ def create_annotation_objects_from_filtered_ocr_results_with_words(
932
  if dfs_to_concat:
933
  updated_annotations_df = pd.concat(dfs_to_concat, ignore_index=True)
934
  else:
935
- updated_annotations_df = pd.DataFrame()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
936
 
937
  # --- Part 4: Convert final DataFrame to list-of-dicts ---
938
  updated_recogniser_entity_df = pd.DataFrame()
 
767
  .drop_duplicates(subset=["id"], keep="first")
768
  )
769
  else:
770
+ # Return empty DataFrame with expected columns from convert_annotation_data_to_dataframe
771
+ updated_df = pd.DataFrame(
772
+ columns=[
773
+ "image",
774
+ "page",
775
+ "label",
776
+ "color",
777
+ "xmin",
778
+ "xmax",
779
+ "ymin",
780
+ "ymax",
781
+ "text",
782
+ "id",
783
+ ]
784
+ )
785
 
786
  return updated_df
787
 
 
946
  if dfs_to_concat:
947
  updated_annotations_df = pd.concat(dfs_to_concat, ignore_index=True)
948
  else:
949
+ # Return empty DataFrame with expected columns matching existing_annotations_df structure
950
+ updated_annotations_df = pd.DataFrame(
951
+ columns=[
952
+ "image",
953
+ "page",
954
+ "label",
955
+ "color",
956
+ "xmin",
957
+ "xmax",
958
+ "ymin",
959
+ "ymax",
960
+ "text",
961
+ "id",
962
+ ]
963
+ )
964
 
965
  # --- Part 4: Convert final DataFrame to list-of-dicts ---
966
  updated_recogniser_entity_df = pd.DataFrame()
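The same schema-preserving idea applies here: a bare pd.DataFrame() would break downstream steps that index into annotation columns, while an empty frame with the columns declared keeps calls such as drop_duplicates(subset=["id"]) working. A brief sketch (using only a subset of the columns listed above):

```python
import pandas as pd

empty_no_schema = pd.DataFrame()
# empty_no_schema["id"]  # would raise KeyError in downstream column access

empty_with_schema = pd.DataFrame(columns=["image", "page", "label", "id"])
# Operations that reference specific columns still work on the empty frame.
print(empty_with_schema.drop_duplicates(subset=["id"], keep="first"))
```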