Commit 21318d3
Parent(s): d5b5291

Added regex search feature for multi-word text search

Files changed:
- README.md +4 -2
- app.py +37 -9
- src/user_guide.qmd +2 -2
- tools/custom_csvlogger.py +5 -5
- tools/file_redaction.py +147 -67
- tools/find_duplicate_pages.py +157 -46
- tools/redaction_review.py +30 -2
README.md
CHANGED

@@ -589,9 +589,11 @@ The workflow is designed to be simple: **Search → Select → Redact**.
 
 #### **Step 1: Search for Text**
 
+#### **Step 1: Search for Text**
+
 1. Navigate to the **"Search text to make new redactions"** tab.
-2. The main table will initially be populated with all the text extracted from the document, broken down by word.
-3. To narrow this down, use the **"Multi-word text search"** box to type the word or phrase you want to find.
+2. The main table will initially be populated with all the text extracted from the document for a page, broken down by word.
+3. To narrow this down, use the **"Multi-word text search"** box to type the word or phrase you want to find (this will search the whole document). If you want to do a regex-based search, tick the 'Enable regex pattern matching' box under 'Search options' below (Note this will only be able to search for patterns in text within each cell).
 4. Click the **"Search"** button or press Enter.
 5. The table below will update to show only the rows containing text that matches your search query.
 
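A note on the per-cell caveat in the new step 3: because the search table stores one word (cell) per row, a regex pattern is tested against each cell's text on its own rather than against whole sentences. The snippet below is only an illustration of that behaviour (the cell values and patterns are invented, not taken from the app):

import re

# Hypothetical cell values as they might appear in the word-level search table
cells = ["Invoice", "INV-2024-0042", "issued", "12/03/2024"]

# Example patterns a user might type with 'Enable regex pattern matching' ticked
patterns = [r"INV-\d{4}-\d{4}", r"\d{2}/\d{2}/\d{4}"]

for pattern in patterns:
    # Each cell is tested independently, mirroring the "within each cell" note in the guide
    hits = [cell for cell in cells if re.search(pattern, cell, re.IGNORECASE)]
    print(pattern, "->", hits)
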
app.py
CHANGED

@@ -1701,13 +1701,23 @@ with blocks:
                     label="Minimum similarity score for match (max=1)",
                     visible=False,
                 ) # Not used anymore for this exact search
-
-
-
-
-
-
-
+
+                with gr.Row():
+                    with gr.Column():
+                        new_redaction_text_label = gr.Textbox(
+                            label="Label for new redactions",
+                            value="Redaction",
+                        )
+                        colour_label = gr.Textbox(
+                            label="Colour for labels (three number RGB format, max 255 with brackets)",
+                            value=CUSTOM_BOX_COLOUR,
+                        )
+                    with gr.Column():
+                        use_regex_search = gr.Checkbox(
+                            label="Enable regex pattern matching",
+                            value=False,
+                            info="When enabled, the search text will be treated as a regular expression pattern instead of literal text",
+                        )
 
                 all_page_line_level_ocr_results_with_words_df = gr.Dataframe(
                     pd.DataFrame(

@@ -4701,12 +4711,29 @@
             outputs=[all_page_line_level_ocr_results_with_words_df],
         )
 
+        def run_search_with_regex_option(
+            search_text, word_df, similarity_threshold, use_regex_flag
+        ):
+            """Wrapper function to call run_full_search_and_analysis with regex option"""
+            return run_full_search_and_analysis(
+                search_query_text=search_text,
+                word_level_df_orig=word_df,
+                similarity_threshold=similarity_threshold,
+                combine_pages=False,
+                min_word_count=1,
+                min_consecutive_pages=1,
+                greedy_match=True,
+                remake_index=False,
+                use_regex=use_regex_flag,
+            )
+
         multi_word_search_text.submit(
-            fn=
+            fn=run_search_with_regex_option,
             inputs=[
                 multi_word_search_text,
                 all_page_line_level_ocr_results_with_words_df_base,
                 similarity_search_score_minimum,
+                use_regex_search,
             ],
             outputs=[
                 all_page_line_level_ocr_results_with_words_df,

@@ -4716,11 +4743,12 @@
         )
 
         multi_word_search_text_btn.click(
-            fn=
+            fn=run_search_with_regex_option,
             inputs=[
                 multi_word_search_text,
                 all_page_line_level_ocr_results_with_words_df_base,
                 similarity_search_score_minimum,
+                use_regex_search,
             ],
             outputs=[
                 all_page_line_level_ocr_results_with_words_df,
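The two handlers above show the wiring pattern for the new option: the same wrapper is registered for both the textbox submit event and the button click, and the regex checkbox is simply appended to the inputs list so its value arrives as the extra argument. Below is a minimal, self-contained sketch of that pattern (the component names and handler are invented for illustration, not the app's actual layout):

import gradio as gr

def search(query, use_regex):
    # Stand-in for run_search_with_regex_option: just report which mode was requested
    mode = "regex" if use_regex else "literal"
    return f"Searching for {query!r} in {mode} mode"

with gr.Blocks() as demo:
    query_box = gr.Textbox(label="Multi-word text search")
    regex_checkbox = gr.Checkbox(label="Enable regex pattern matching", value=False)
    search_btn = gr.Button("Search")
    result = gr.Textbox(label="Result")

    # Both the Enter key (submit) and the button click route through the same handler,
    # with the checkbox appended to the inputs list, as in the diff above
    query_box.submit(fn=search, inputs=[query_box, regex_checkbox], outputs=[result])
    search_btn.click(fn=search, inputs=[query_box, regex_checkbox], outputs=[result])

if __name__ == "__main__":
    demo.launch()
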
src/user_guide.qmd
CHANGED

@@ -366,8 +366,8 @@ The workflow is designed to be simple: **Search → Select → Redact**.
 #### **Step 1: Search for Text**
 
 1. Navigate to the **"Search text to make new redactions"** tab.
-2. The main table will initially be populated with all the text extracted from the document, broken down by word.
-3. To narrow this down, use the **"Multi-word text search"** box to type the word or phrase you want to find.
+2. The main table will initially be populated with all the text extracted from the document for a page, broken down by word.
+3. To narrow this down, use the **"Multi-word text search"** box to type the word or phrase you want to find (this will search the whole document). If you want to do a regex-based search, tick the 'Enable regex pattern matching' box under 'Search options' below (Note this will only be able to search for patterns in text within each cell).
 4. Click the **"Search"** button or press Enter.
 5. The table below will update to show only the rows containing text that matches your search query.
 
tools/custom_csvlogger.py
CHANGED

@@ -228,7 +228,7 @@ class CSVLogger_custom(FlaggingCallback):
 
         if RUN_AWS_FUNCTIONS:
             try:
-                print("Connecting to DynamoDB via existing SSO connection")
+                # print("Connecting to DynamoDB via existing SSO connection")
                 dynamodb = boto3.resource("dynamodb", region_name=AWS_REGION)
 
                 dynamodb.meta.client.list_tables()

@@ -236,9 +236,9 @@ class CSVLogger_custom(FlaggingCallback):
             except Exception as e:
                 print("No SSO credentials found:", e)
                 if AWS_ACCESS_KEY and AWS_SECRET_KEY:
-                    print(
-
-                    )
+                    # print(
+                    #     "Trying to get DynamoDB credentials from environment variables"
+                    # )
                     dynamodb = boto3.resource(
                         "dynamodb",
                         aws_access_key_id=AWS_ACCESS_KEY,

@@ -328,7 +328,7 @@ class CSVLogger_custom(FlaggingCallback):
 
             table.put_item(Item=item)
 
-            print("Successfully uploaded log to DynamoDB")
+            # print("Successfully uploaded log to DynamoDB")
         except Exception as e:
             print("Could not upload log to DynamobDB due to", e)
 
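For context, the code being touched here follows a try-ambient-credentials-first pattern: it builds the DynamoDB resource from whatever session is already available (for example an SSO profile), verifies it with a cheap list_tables call, and only falls back to explicit keys if that fails. A rough standalone sketch of that pattern, with the function and argument names assumed rather than copied from the class:

import boto3

def connect_dynamodb(region, access_key=None, secret_key=None):
    """Try ambient credentials (e.g. an SSO session) first, then fall back to explicit keys."""
    try:
        dynamodb = boto3.resource("dynamodb", region_name=region)
        dynamodb.meta.client.list_tables()  # cheap call that fails fast if the credentials are unusable
        return dynamodb
    except Exception as e:
        print("No SSO credentials found:", e)
        if access_key and secret_key:
            return boto3.resource(
                "dynamodb",
                region_name=region,
                aws_access_key_id=access_key,
                aws_secret_access_key=secret_key,
            )
        raise
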
tools/file_redaction.py
CHANGED

@@ -441,7 +441,6 @@ def choose_and_run_redactor(
     current_loop_page = 0
     out_file_paths = list()
     log_files_output_paths = list()
-    estimate_total_processing_time = 0
     estimated_time_taken_state = 0
     comprehend_query_number = 0
     total_textract_query_number = 0

@@ -543,9 +542,7 @@
     if total_textract_query_number > number_of_pages:
         total_textract_query_number = number_of_pages
 
-
-        combined_out_message
-    )
+    sum_numbers_before_seconds(combined_out_message)
     # print(
     #     "Estimated total processing time:",
     #     str(estimate_total_processing_time),

@@ -1317,7 +1314,7 @@
         number_of_pages,
         page_max,
     )
-    #print("Saving redacted PDF file:", out_redacted_pdf_file_path)
+    # print("Saving redacted PDF file:", out_redacted_pdf_file_path)
 
     # Use final document if available, otherwise use main document
     doc_to_save = (

@@ -1352,7 +1349,7 @@
         number_of_pages,
         page_max,
     )
-    #print("Saving PDF file for review:", out_review_pdf_file_path)
+    # print("Saving PDF file for review:", out_review_pdf_file_path)
 
     if out_review_pdf_file_path:
         save_pdf_with_or_without_compression(

@@ -1692,9 +1689,7 @@
         combined_out_message + " " + out_time_message
     ) # Ensure this is a single string
 
-
-        combined_out_message
-    )
+    sum_numbers_before_seconds(combined_out_message)
 
     # else:
     #     toc = time.perf_counter()

@@ -3299,7 +3294,7 @@ def redact_image_pdf(
 
     # Go through each page
     for page_no in progress_bar:
-
+
         reported_page_number = str(page_no + 1)
         print(f"Current page: {reported_page_number}")
 

@@ -3308,7 +3303,6 @@
         page_handwriting_recogniser_results = list()
         page_line_level_ocr_results_with_words = list()
         page_break_return = False
-
 
         # Try to find image location
         try:

@@ -3419,7 +3413,7 @@
         if image is None:
             # Check if image_path is a placeholder and create the actual image
             if isinstance(image_path, str) and "placeholder_image" in image_path:
-                #print(f"Detected placeholder image path: {image_path}")
+                # print(f"Detected placeholder image path: {image_path}")
                 try:
                     # Extract page number from placeholder path
                     page_num_from_placeholder = int(

@@ -3628,26 +3622,25 @@
                     page["data"]
                     for page in textract_data["pages"]
                     if page["page_no"] == reported_page_number
-                )
+                )
 
                 # Check if this is whole-document Textract output (already converted to mediabox space)
                 # by checking if the JSON structure indicates it came from restructure_textract_output
                 # or if textract_output_found is True (indicating pre-existing whole-document output)
-                use_mediabox_for_textract = (
-
-                    ("pages" in textract_data and len(textract_data.get("pages", [])) > 0)
+                use_mediabox_for_textract = textract_output_found or (
+                    "pages" in textract_data and len(textract_data.get("pages", [])) > 0
                 )
-
+
                 if use_mediabox_for_textract:
                     # Whole-document Textract: use mediabox dimensions
                     textract_page_width = pymupdf_page.mediabox.width
                     textract_page_height = pymupdf_page.mediabox.height
-                    #print(f"Using mediabox dimensions for whole-document Textract: {textract_page_width}x{textract_page_height}")
+                    # print(f"Using mediabox dimensions for whole-document Textract: {textract_page_width}x{textract_page_height}")
                 else:
                     # Individual image Textract: use image dimensions (current behavior)
                     textract_page_width = page_width
                     textract_page_height = page_height
-                    #print(f"Using image dimensions for individual image Textract: {textract_page_width}x{textract_page_height}")
+                    # print(f"Using image dimensions for individual image Textract: {textract_page_width}x{textract_page_height}")
 
                 (
                     page_line_level_ocr_results,

@@ -3658,7 +3651,10 @@
                     selection_element_results,
                     form_key_value_results,
                 ) = json_to_ocrresult(
-                    text_blocks,
+                    text_blocks,
+                    textract_page_width,
+                    textract_page_height,
+                    reported_page_number,
                 )
 
                 if all_page_line_level_ocr_results_with_words is None:

@@ -4812,9 +4808,13 @@ def redact_text_pdf(
 
         if page_text_ocr_outputs_list:
             # Filter out empty DataFrames before concatenation to avoid FutureWarning
-            non_empty_ocr_outputs = [
+            non_empty_ocr_outputs = [
+                df for df in page_text_ocr_outputs_list if not df.empty
+            ]
             if non_empty_ocr_outputs:
-                page_text_ocr_outputs = pd.concat(
+                page_text_ocr_outputs = pd.concat(
+                    non_empty_ocr_outputs, ignore_index=True
+                )
             else:
                 page_text_ocr_outputs = pd.DataFrame(
                     columns=[

@@ -4960,17 +4960,50 @@
 
         # Write logs
        # Filter out empty DataFrames before concatenation to avoid FutureWarning
-        non_empty_decision_process = [
+        non_empty_decision_process = [
+            df for df in all_pages_decision_process_list if not df.empty
+        ]
         if non_empty_decision_process:
-            all_pages_decision_process_table = pd.concat(
+            all_pages_decision_process_table = pd.concat(
+                non_empty_decision_process, ignore_index=True
+            )
         else:
-            all_pages_decision_process_table = pd.DataFrame(
-
-
+            all_pages_decision_process_table = pd.DataFrame(
+                columns=[
+                    "text",
+                    "xmin",
+                    "ymin",
+                    "xmax",
+                    "ymax",
+                    "label",
+                    "start",
+                    "end",
+                    "score",
+                    "page",
+                    "id",
+                ]
+            )
+
+        non_empty_ocr_results = [
+            df for df in all_line_level_ocr_results_list if not df.empty
+        ]
         if non_empty_ocr_results:
-            all_line_level_ocr_results_df = pd.concat(
+            all_line_level_ocr_results_df = pd.concat(
+                non_empty_ocr_results, ignore_index=True
+            )
         else:
-            all_line_level_ocr_results_df = pd.DataFrame(
+            all_line_level_ocr_results_df = pd.DataFrame(
+                columns=[
+                    "page",
+                    "text",
+                    "left",
+                    "top",
+                    "width",
+                    "height",
+                    "line",
+                    "conf",
+                ]
+            )
 
         current_loop_page += 1
 

@@ -5010,11 +5043,29 @@
 
             # Write logs
             # Filter out empty DataFrames before concatenation to avoid FutureWarning
-            non_empty_decision_process = [
+            non_empty_decision_process = [
+                df for df in all_pages_decision_process_list if not df.empty
+            ]
             if non_empty_decision_process:
-                all_pages_decision_process_table = pd.concat(
+                all_pages_decision_process_table = pd.concat(
+                    non_empty_decision_process, ignore_index=True
+                )
             else:
-                all_pages_decision_process_table = pd.DataFrame(
+                all_pages_decision_process_table = pd.DataFrame(
+                    columns=[
+                        "text",
+                        "xmin",
+                        "ymin",
+                        "xmax",
+                        "ymax",
+                        "label",
+                        "start",
+                        "end",
+                        "score",
+                        "page",
+                        "id",
+                    ]
+                )
 
             return (
                 pymupdf_doc,

@@ -5029,52 +5080,81 @@
 
     # Write all page outputs
     # Filter out empty DataFrames before concatenation to avoid FutureWarning
-    non_empty_decision_process = [
+    non_empty_decision_process = [
+        df for df in all_pages_decision_process_list if not df.empty
+    ]
     if non_empty_decision_process:
-        all_pages_decision_process_table = pd.concat(
+        all_pages_decision_process_table = pd.concat(
+            non_empty_decision_process, ignore_index=True
+        )
     else:
-        all_pages_decision_process_table = pd.DataFrame(
-
-
+        all_pages_decision_process_table = pd.DataFrame(
+            columns=[
+                "text",
+                "xmin",
+                "ymin",
+                "xmax",
+                "ymax",
+                "label",
+                "start",
+                "end",
+                "score",
+                "page",
+                "id",
+            ]
+        )
+
+    non_empty_ocr_results = [
+        df for df in all_line_level_ocr_results_list if not df.empty
+    ]
     if non_empty_ocr_results:
-        all_line_level_ocr_results_df = pd.concat(
+        all_line_level_ocr_results_df = pd.concat(
+            non_empty_ocr_results, ignore_index=True
+        )
     else:
-        all_line_level_ocr_results_df = pd.DataFrame(
+        all_line_level_ocr_results_df = pd.DataFrame(
+            columns=["page", "text", "left", "top", "width", "height", "line", "conf"]
+        )
 
-
-    all_pages_decision_process_table = divide_coordinates_by_page_sizes(
-        all_pages_decision_process_table,
-        page_sizes_df,
-        xmin="xmin",
-        xmax="xmax",
-        ymin="ymin",
-        ymax="ymax",
-    )
+    if not all_pages_decision_process_table.empty:
 
-
-
-
-
-
-
-
+        # Convert decision table to relative coordinates
+        all_pages_decision_process_table = divide_coordinates_by_page_sizes(
+            all_pages_decision_process_table,
+            page_sizes_df,
+            xmin="xmin",
+            xmax="xmax",
+            ymin="ymin",
+            ymax="ymax",
+        )
 
-
-
-
-
-
-
-
-
-    )
+        # Coordinates need to be reversed for ymin and ymax to match with image annotator objects downstream
+
+        all_pages_decision_process_table["ymin"] = reverse_y_coords(
+            all_pages_decision_process_table, "ymin"
+        )
+        all_pages_decision_process_table["ymax"] = reverse_y_coords(
+            all_pages_decision_process_table, "ymax"
+        )
 
-    #
+    # Convert decision table to relative coordinates
     if not all_line_level_ocr_results_df.empty:
-
-
+
+        all_line_level_ocr_results_df = divide_coordinates_by_page_sizes(
+            all_line_level_ocr_results_df,
+            page_sizes_df,
+            xmin="left",
+            xmax="width",
+            ymin="top",
+            ymax="height",
         )
 
+        # Coordinates need to be reversed for ymin and ymax to match with image annotator objects downstream
+        if not all_line_level_ocr_results_df.empty:
+            all_line_level_ocr_results_df["top"] = reverse_y_coords(
+                all_line_level_ocr_results_df, "top"
+            )
+
     # Remove empty dictionary items from ocr results with words
     all_page_line_level_ocr_results_with_words = [
         d for d in all_page_line_level_ocr_results_with_words if d
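Most of the churn in this file is one repeated pattern: drop empty DataFrames before pd.concat (which silences the pandas FutureWarning about concatenating empty or all-NA frames), and otherwise fall back to an empty DataFrame that still declares its columns. A small self-contained sketch of that pattern, using the OCR column names from the diff with made-up data:

import pandas as pd

def concat_non_empty(frames, columns):
    """Concatenate only the non-empty frames; otherwise return an empty frame with a known schema."""
    non_empty = [df for df in frames if not df.empty]
    if non_empty:
        # Filtering first avoids the FutureWarning about concatenating empty / all-NA frames
        return pd.concat(non_empty, ignore_index=True)
    # An explicitly-typed empty result keeps downstream column access from raising KeyError
    return pd.DataFrame(columns=columns)

ocr_columns = ["page", "text", "left", "top", "width", "height", "line", "conf"]
frames = [
    pd.DataFrame(columns=ocr_columns),  # a page that produced no OCR lines
    pd.DataFrame([{"page": 2, "text": "hello", "left": 0.1, "top": 0.2,
                   "width": 0.05, "height": 0.02, "line": 1, "conf": 99}]),
]
print(concat_non_empty(frames, ocr_columns))
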
tools/find_duplicate_pages.py
CHANGED

@@ -122,6 +122,7 @@ def run_full_search_and_analysis(
     min_consecutive_pages: int = 1,
     greedy_match: bool = True,
     remake_index: bool = False,
+    use_regex: bool = False,
     progress=gr.Progress(track_tqdm=True),
 ):
     """

@@ -133,7 +134,7 @@ def run_full_search_and_analysis(
     4. Executes the similarity analysis on the combined data using the specified parameters such as similarity threshold, minimum word count, minimum consecutive pages, and greedy match strategy.
 
     Parameters:
-    - search_query_text (str): The text entered by the user to search for in the OCR data.
+    - search_query_text (str): The text entered by the user to search for in the OCR data. If use_regex=True, this is treated as a regex pattern.
     - word_level_df_orig (pd.DataFrame): The original DataFrame containing word-level OCR data.
     - similarity_threshold (float, optional): The minimum similarity score required for two pages to be considered duplicates. Defaults to 1.
     - combine_pages (bool, optional): A flag indicating whether to combine text from the same page number within a file. Defaults to False.

@@ -141,6 +142,7 @@ def run_full_search_and_analysis(
     - min_consecutive_pages (int, optional): The minimum number of consecutive pages required to be considered a match. Defaults to 1.
     - greedy_match (bool, optional): A flag indicating whether to use a greedy strategy for matching consecutive pages. Defaults to True.
     - remake_index (bool, optional): A flag indicating whether to remake the index of the DataFrame during processing. Defaults to False.
+    - use_regex (bool, optional): If True, treats search_query_text as a regex pattern instead of literal text. Defaults to False.
     - progress (gr.Progress, optional): A Progress object to track the progress of the operation. Defaults to a Progress object with track_tqdm set to True.
     """
 

@@ -149,30 +151,56 @@ def run_full_search_and_analysis(
     if len(search_query_text) > 100:
         raise Warning("Please use a search query with at less than 100 characters.")
 
-
-
+    # For regex mode, we handle the query differently
+    if use_regex:
+        # Validate regex pattern
+        try:
+            re.compile(search_query_text)
+        except re.error as e:
+            raise Warning(f"Invalid regex pattern: {e}")
+
+        # For regex, we don't split into words - treat as single pattern
+        # Create a minimal DataFrame structure for the regex pattern
+        search_query_data = [
+            (
+                "user_search_query",
+                pd.DataFrame({"page": [1], "text": [search_query_text], "line": [1]}),
+            )
+        ]
+        query_word_length = 1  # For regex, we'll handle matching differently
+        min_consecutive_pages = 1  # Regex matches can be variable length
     else:
-
-
-
-
-
-
-
-
-
-
-
-
+        # Original literal text matching logic
+        if punctuation_at_word_text_end(word_level_df_orig) is True:
+            do_punctuation_split = False
+        else:
+            do_punctuation_split = True
+
+        # Step 1: Process the user's search query string
+        search_query_data, query_word_length = create_dataframe_from_string(
+            search_query_text,
+            file_name="user_search_query",
+            split_words=True,
+            split_punctuation=do_punctuation_split,
+        )
+        if not search_query_data:
+            # Handle case where user submits an empty search string
+            raise Warning("Could not convert search string to required format")
 
-
-
-
+        if query_word_length > 25:
+            # Handle case where user submits an empty search string
+            raise Warning("Please use a query with less than 25 words")
 
-
-
+        # Overwrite min_consecutive_pages with the search string length
+        min_consecutive_pages = query_word_length
 
     # Create word index from reference table
+
+    if word_level_df_orig.empty:
+        raise gr.Error(
+            "No word-level data to process. Please check that you have loaded in OCR data."
+        )
+
     word_level_df_orig["index"] = word_level_df_orig.index
     word_level_df = word_level_df_orig.copy()
 

@@ -204,6 +232,7 @@ def run_full_search_and_analysis(
         do_text_clean=False,
         file1_name="user_search_query",
         file2_name="source_document",
+        use_regex=use_regex,
         progress=progress,
     )
 

@@ -777,7 +806,10 @@ def _sequences_match(query_seq: List[str], ref_seq: List[str]) -> bool:
 
 
 def find_consecutive_sequence_matches(
-    df_filtered: pd.DataFrame,
+    df_filtered: pd.DataFrame,
+    search_file_name: str,
+    reference_file_name: str,
+    use_regex: bool = False,
 ) -> pd.DataFrame:
     """
     Finds all occurrences of a consecutive sequence of tokens from a search file

@@ -789,6 +821,7 @@ def find_consecutive_sequence_matches(
         df_filtered: The DataFrame containing all tokens, with 'file' and 'text_clean' columns.
         search_file_name: The name of the file containing the search query sequence.
         reference_file_name: The name of the file to search within.
+        use_regex: If True, treats the search query as a regex pattern instead of literal tokens.
 
     Returns:
         A DataFrame with two columns ('Page1_Index', 'Page2_Index') mapping the

@@ -803,38 +836,115 @@ def find_consecutive_sequence_matches(
         print("Error: One or both files not found or are empty.")
         return pd.DataFrame(columns=["Page1_Index", "Page2_Index"])
 
-
-
-
-
+    if use_regex:
+        # Regex mode: Extract pattern and search in combined text
+        # Get the regex pattern from the search query (should be in 'text' column, not 'text_clean')
+        # We need to get it from the original 'text' column if available, otherwise use 'text_clean'
+        if "text" in search_df.columns:
+            regex_pattern = search_df["text"].iloc[0]
+        else:
+            regex_pattern = search_df["text_clean"].iloc[0]
+
+        # Join reference tokens back into text for regex searching
+        # Use original 'text' column if available to preserve original formatting (important for emails, etc.)
+        # Otherwise fall back to 'text_clean'
+        if "text" in reference_df.columns:
+            reference_tokens = reference_df["text"].tolist()
+        else:
+            reference_tokens = reference_df["text_clean"].tolist()
+        reference_indices = reference_df.index.tolist()
+
+        # Join tokens with spaces to reconstruct the text
+        # Note: If tokens were split at special characters like @, this may not perfectly reconstruct
+        # the original text, but it's the best we can do with tokenized data
+        reference_text = " ".join(reference_tokens)
+
+        # Build a mapping from character positions to token indices
+        # This helps us map regex match positions back to token indices
+        char_to_token_map = []
+        current_pos = 0
+        for idx, token in enumerate(reference_tokens):
+            token_start = current_pos
+            token_end = current_pos + len(token)
+            char_to_token_map.append((token_start, token_end, reference_indices[idx]))
+            # Add 1 for the space separator (except after last token)
+            current_pos = token_end + (1 if idx < len(reference_tokens) - 1 else 0)
+
+        # Find all regex matches
+        try:
+            pattern = re.compile(regex_pattern, re.IGNORECASE)
+            matches = list(pattern.finditer(reference_text))
+        except re.error as e:
+            print(f"Error compiling regex pattern: {e}")
+            gr.Warning(f"Invalid regex pattern: {e}")
+            return pd.DataFrame(
+                columns=["Page1_Index", "Page2_Index", "Similarity_Score"]
+            )
+
+        if not matches:
+            print("No regex matches found")
+            gr.Info("No regex matches found")
+            return pd.DataFrame(
+                columns=["Page1_Index", "Page2_Index", "Similarity_Score"]
+            )
 
-
-
+        all_found_matches = []
+        query_index = search_df.index[0]  # Use the first (and only) query index
 
-
-
+        # For each regex match, find which tokens it spans
+        for match in matches:
+            match_start = match.start()
+            match_end = match.end()
 
-
+            # Find all tokens that overlap with this match
+            matching_token_indices = []
+            for token_start, token_end, token_idx in char_to_token_map:
+                # Check if token overlaps with match
+                if not (token_end < match_start or token_start > match_end):
+                    matching_token_indices.append(token_idx)
 
-
-
-
-        window = reference_tokens[i : i + query_len]
+            # Create matches for all tokens in the span
+            for token_idx in matching_token_indices:
+                all_found_matches.append((query_index, token_idx, 1))
 
-
-
+        print(
+            f"Found {len(matches)} regex match(es) spanning {len(set(idx for _, idx, _ in all_found_matches))} token(s)"
+        )
 
-
-
+    else:
+        # Original literal token matching logic
+        # Step 2: Convert the token data into lists for easy comparison.
+        # We need both the text tokens and their original global indices.
+        query_tokens = search_df["text_clean"].tolist()
+        query_indices = search_df.index.tolist()
 
-
-
-
-
-
+        reference_tokens = reference_df["text_clean"].tolist()
+        reference_indices = reference_df.index.tolist()
+
+        query_len = len(query_tokens)
+        all_found_matches = list()
+
+        print(f"Searching for a sequence of {query_len} tokens...")
+
+        # Step 3: Use a "sliding window" to search for the query sequence in the reference list.
+        for i in range(len(reference_tokens) - query_len + 1):
+            # The "window" is a slice of the reference list that is the same size as the query
+            window = reference_tokens[i : i + query_len]
+
+            # Step 4: If the window matches the query with or without punctuation on end
+            if _sequences_match(query_tokens, window):
+
+                # Get the global indices for this entire matching block
+                matching_reference_indices = reference_indices[i : i + query_len]
+
+                # Create the mapping between query indices and the found reference indices
+                for j in range(query_len):
+                    all_found_matches.append(
+                        (query_indices[j], matching_reference_indices[j], 1)
+                    )
 
-
-
+            # If you only want the *first* match, you can uncomment the next line:
+            # break
 
     if not all_found_matches:
         print("No matches found")

@@ -860,6 +970,7 @@ def identify_similar_text_sequences(
     file1_name: str = "",
     file2_name: str = "",
     output_folder: str = OUTPUT_FOLDER,
+    use_regex: bool = False,
     progress=Progress(track_tqdm=True),
 ) -> Tuple[pd.DataFrame, List[str], pd.DataFrame]:
     """

@@ -903,7 +1014,7 @@ def identify_similar_text_sequences(
 
     # base_similarity_df = _debug_similarity_between_two_files(df_filtered, vectorizer, similarity_threshold, file1_name, file2_name)
     base_similarity_df = find_consecutive_sequence_matches(
-        df_filtered, file1_name, file2_name
+        df_filtered, file1_name, file2_name, use_regex=use_regex
     )
     if base_similarity_df.empty:
         return pd.DataFrame(), [], df_combined
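The core of the new regex branch above is the bookkeeping that maps character offsets in the re-joined text back to word-level row indices, so a match can be attributed to every token it spans. Here is a stripped-down, standalone illustration of that idea (the tokens and indices are invented; the real function works on the DataFrame columns shown in the diff):

import re

# Invented word-level tokens, standing in for the reference DataFrame's 'text' column
tokens = ["Contact", "me", "at", "jane.doe@example.com", "by", "Friday"]
indices = list(range(100, 106))  # pretend global word indices

# Rebuild a single string and remember each token's character span within it
reference_text = " ".join(tokens)
char_to_token = []
pos = 0
for idx, token in zip(indices, tokens):
    char_to_token.append((pos, pos + len(token), idx))
    pos += len(token) + 1  # +1 for the joining space

# A regex match is mapped back to every token whose span overlaps the match span
pattern = re.compile(r"[\w.]+@[\w.]+", re.IGNORECASE)
for match in pattern.finditer(reference_text):
    hit_tokens = [idx for start, end, idx in char_to_token
                  if not (end < match.start() or start > match.end())]
    print(match.group(), "->", hit_tokens)  # -> [103], the email token
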
tools/redaction_review.py
CHANGED

@@ -767,7 +767,21 @@ def get_and_merge_current_page_annotations(
             .drop_duplicates(subset=["id"], keep="first")
         )
     else:
-
+        # Return empty DataFrame with expected columns from convert_annotation_data_to_dataframe
+        updated_df = pd.DataFrame(
+            columns=[
+                "image",
+                "page",
+                "label",
+                "color",
+                "xmin",
+                "xmax",
+                "ymin",
+                "ymax",
+                "text",
+                "id",
+            ]
+        )
 
     return updated_df
 

@@ -932,7 +946,21 @@ def create_annotation_objects_from_filtered_ocr_results_with_words(
     if dfs_to_concat:
         updated_annotations_df = pd.concat(dfs_to_concat, ignore_index=True)
     else:
-
+        # Return empty DataFrame with expected columns matching existing_annotations_df structure
+        updated_annotations_df = pd.DataFrame(
+            columns=[
+                "image",
+                "page",
+                "label",
+                "color",
+                "xmin",
+                "xmax",
+                "ymin",
+                "ymax",
+                "text",
+                "id",
+            ]
+        )
 
     # --- Part 4: Convert final DataFrame to list-of-dicts ---
     updated_recogniser_entity_df = pd.DataFrame()