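"""Command-line interface for the document redaction tools.

Provides a single entry point to redact PII from PDF/image files, anonymise
Word/tabular data, find duplicate pages or rows, and run AWS Textract batch
jobs. Typical usage:

    python cli_redact.py --input_file example_data/example_of_emails_sent_to_a_professor_before_applying.pdf
"""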
import argparse
import os
import time
import uuid

import pandas as pd

from tools.config import (
    ALLOW_LIST_PATH,
    AWS_ACCESS_KEY,
    AWS_PII_OPTION,
    AWS_REGION,
    AWS_SECRET_KEY,
    CHOSEN_COMPREHEND_ENTITIES,
    CHOSEN_LOCAL_OCR_MODEL,
    CHOSEN_REDACT_ENTITIES,
    COMPRESS_REDACTED_PDF,
    CUSTOM_ENTITIES,
    DEFAULT_COMBINE_PAGES,
    DEFAULT_COST_CODE,
    DEFAULT_DUPLICATE_DETECTION_THRESHOLD,
    DEFAULT_FUZZY_SPELLING_MISTAKES_NUM,
    DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX,
    DEFAULT_LANGUAGE,
    DEFAULT_MIN_CONSECUTIVE_PAGES,
    DEFAULT_MIN_WORD_COUNT,
    DEFAULT_TABULAR_ANONYMISATION_STRATEGY,
    DENY_LIST_PATH,
    DIRECT_MODE_DEFAULT_USER,
    DISPLAY_FILE_NAMES_IN_LOGS,
    DO_INITIAL_TABULAR_DATA_CLEAN,
    DOCUMENT_REDACTION_BUCKET,
    FULL_COMPREHEND_ENTITY_LIST,
    FULL_ENTITY_LIST,
    IMAGES_DPI,
    INPUT_FOLDER,
    LOCAL_PII_OPTION,
    OUTPUT_FOLDER,
    PREPROCESS_LOCAL_OCR_IMAGES,
    REMOVE_DUPLICATE_ROWS,
    RETURN_REDACTED_PDF,
    RUN_AWS_FUNCTIONS,
    S3_USAGE_LOGS_FOLDER,
    SAVE_LOGS_TO_CSV,
    SAVE_LOGS_TO_DYNAMODB,
    SESSION_OUTPUT_FOLDER,
    TEXTRACT_JOBS_LOCAL_LOC,
    TEXTRACT_JOBS_S3_LOC,
    TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET,
    TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER,
    TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER,
    USE_GREEDY_DUPLICATE_DETECTION,
    WHOLE_PAGE_REDACTION_LIST_PATH,
)

def _generate_session_hash() -> str:
    """Generate a unique session hash for logging purposes."""
    return str(uuid.uuid4())[:8]


def get_username_and_folders(
    username: str = "",
    output_folder_textbox: str = OUTPUT_FOLDER,
    input_folder_textbox: str = INPUT_FOLDER,
    session_output_folder: str = SESSION_OUTPUT_FOLDER,
    textract_document_upload_input_folder: str = TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER,
    textract_document_upload_output_folder: str = TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER,
    s3_textract_document_logs_subfolder: str = TEXTRACT_JOBS_S3_LOC,
    local_textract_document_logs_subfolder: str = TEXTRACT_JOBS_LOCAL_LOC,
):
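    """Resolve the session identifier and per-session folder paths.

    Uses the supplied username as the session identifier when given, otherwise
    generates a short random hash. When session_output_folder is enabled,
    session-specific subfolders are appended to the output, input and Textract
    log locations. Local output and input folders are created if missing.
    """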
    if username:
        out_session_hash = username
    else:
        out_session_hash = _generate_session_hash()

    if session_output_folder == "True" or session_output_folder is True:
        output_folder = output_folder_textbox + out_session_hash + "/"
        input_folder = input_folder_textbox + out_session_hash + "/"

        textract_document_upload_input_folder = (
            textract_document_upload_input_folder + "/" + out_session_hash
        )
        textract_document_upload_output_folder = (
            textract_document_upload_output_folder + "/" + out_session_hash
        )

        s3_textract_document_logs_subfolder = (
            s3_textract_document_logs_subfolder + "/" + out_session_hash
        )
        local_textract_document_logs_subfolder = (
            local_textract_document_logs_subfolder + "/" + out_session_hash + "/"
        )

    else:
        output_folder = output_folder_textbox
        input_folder = input_folder_textbox

    if not os.path.exists(output_folder):
        os.mkdir(output_folder)
    if not os.path.exists(input_folder):
        os.mkdir(input_folder)

    return (
        out_session_hash,
        output_folder,
        out_session_hash,
        input_folder,
        textract_document_upload_input_folder,
        textract_document_upload_output_folder,
        s3_textract_document_logs_subfolder,
        local_textract_document_logs_subfolder,
    )

def _get_env_list(env_var_name: str) -> list[str]:
    """Parse a comma-separated, list-style string into a list of strings.

    The argument is the raw value (e.g. "['PERSON', 'EMAIL_ADDRESS']"), not the
    name of an environment variable: the surrounding brackets and any quotes
    are stripped before splitting on commas.
    """
    value = env_var_name[1:-1].strip().replace('"', "").replace("'", "")
    if not value:
        return []
    # Split on commas and drop any empty entries left by stray separators
    return [s.strip() for s in value.split(",") if s.strip()]
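# Example: _get_env_list("['PERSON', 'EMAIL_ADDRESS']") returns ["PERSON", "EMAIL_ADDRESS"]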
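
# Make the custom entity names available alongside the built-in AWS Comprehend
# entity choices used for the CLI argument defaults below.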
CHOSEN_COMPREHEND_ENTITIES.extend(CUSTOM_ENTITIES)
FULL_COMPREHEND_ENTITY_LIST.extend(CUSTOM_ENTITIES)

chosen_redact_entities = CHOSEN_REDACT_ENTITIES
full_entity_list = FULL_ENTITY_LIST
chosen_comprehend_entities = CHOSEN_COMPREHEND_ENTITIES
full_comprehend_entity_list = FULL_COMPREHEND_ENTITY_LIST
default_handwrite_signature_checkbox = DEFAULT_HANDWRITE_SIGNATURE_CHECKBOX

def main(direct_mode_args=None):
    """
    A unified command-line interface to prepare, redact, and anonymise various document types.

    Args:
        direct_mode_args (dict, optional): Dictionary of arguments for direct mode execution.
            If provided, these are used instead of parsing command-line arguments; the
            dictionary must supply a value for every argument the parser would otherwise set.
    """
    parser = argparse.ArgumentParser(
        description="A versatile CLI for redacting PII from PDF/image files and anonymising Word/tabular data.",
        formatter_class=argparse.RawTextHelpFormatter,
        epilog="""
Examples:

To run these, you need to do the following:

- Open a terminal window

- CD to the app folder that contains this file (cli_redact.py)

- Load the virtual environment using either conda or venv depending on your setup

- Run one of the example commands below

- Look in the output/ folder to see output files:

# Redaction

## Redact a PDF with default settings (local OCR):
python cli_redact.py --input_file example_data/example_of_emails_sent_to_a_professor_before_applying.pdf

## Extract text from a PDF only (i.e. no redaction), using local OCR:
python cli_redact.py --input_file example_data/Partnership-Agreement-Toolkit_0_0.pdf --redact_whole_page_file example_data/partnership_toolkit_redact_some_pages.csv --pii_detector None

## Apply a whole page redaction list only (no other PII redaction), using local OCR:
python cli_redact.py --input_file example_data/Partnership-Agreement-Toolkit_0_0.pdf --redact_whole_page_file example_data/partnership_toolkit_redact_some_pages.csv --pii_detector Local --local_redact_entities CUSTOM

## Redact a PDF with allow list (local OCR) and custom list of redaction entities:
python cli_redact.py --input_file example_data/graduate-job-example-cover-letter.pdf --allow_list_file example_data/test_allow_list_graduate.csv --local_redact_entities TITLES PERSON DATE_TIME

## Redact a PDF with limited pages and text extraction method (local text) with custom fuzzy matching:
python cli_redact.py --input_file example_data/Partnership-Agreement-Toolkit_0_0.pdf --deny_list_file example_data/Partnership-Agreement-Toolkit_test_deny_list_para_single_spell.csv --local_redact_entities CUSTOM_FUZZY --page_min 1 --page_max 3 --ocr_method "Local text" --fuzzy_mistakes 3

## Redaction with custom deny list, allow list, and whole page redaction list:
python cli_redact.py --input_file example_data/Partnership-Agreement-Toolkit_0_0.pdf --deny_list_file example_data/partnership_toolkit_redact_custom_deny_list.csv --redact_whole_page_file example_data/partnership_toolkit_redact_some_pages.csv --allow_list_file example_data/test_allow_list_partnership.csv

## Redact an image:
python cli_redact.py --input_file example_data/example_complaint_letter.jpg

## Anonymise csv file with specific columns:
python cli_redact.py --input_file example_data/combined_case_notes.csv --text_columns "Case Note" "Client" --anon_strategy replace_redacted

## Anonymise csv file with a different strategy (remove text completely):
python cli_redact.py --input_file example_data/combined_case_notes.csv --text_columns "Case Note" "Client" --anon_strategy redact

## Anonymise Excel file, remove text completely:
python cli_redact.py --input_file example_data/combined_case_notes.xlsx --text_columns "Case Note" "Client" --excel_sheets combined_case_notes --anon_strategy redact

## Anonymise a Word document:
python cli_redact.py --input_file "example_data/Bold minimalist professional cover letter.docx" --anon_strategy replace_redacted

# Redaction with AWS services:

## Use Textract and Comprehend:
python cli_redact.py --input_file example_data/example_of_emails_sent_to_a_professor_before_applying.pdf --ocr_method "AWS Textract" --pii_detector "AWS Comprehend"

## Redact specific pages with AWS OCR and signature extraction:
python cli_redact.py --input_file example_data/Partnership-Agreement-Toolkit_0_0.pdf --page_min 6 --page_max 7 --ocr_method "AWS Textract" --handwrite_signature_extraction "Extract handwriting" "Extract signatures"

## Redact with AWS OCR and additional layout extraction options:
python cli_redact.py --input_file example_data/Partnership-Agreement-Toolkit_0_0.pdf --ocr_method "AWS Textract" --extract_layout

# Duplicate page detection

## Find duplicate pages in OCR files:
python cli_redact.py --task deduplicate --input_file example_data/example_outputs/doubled_output_joined.pdf_ocr_output.csv --duplicate_type pages --similarity_threshold 0.95

## Find duplicates in OCR files at the line level:
python cli_redact.py --task deduplicate --input_file example_data/example_outputs/doubled_output_joined.pdf_ocr_output.csv --duplicate_type pages --similarity_threshold 0.95 --combine_pages False --min_word_count 3

## Find duplicate rows in tabular data:
python cli_redact.py --task deduplicate --input_file example_data/Lambeth_2030-Our_Future_Our_Lambeth.pdf.csv --duplicate_type tabular --text_columns "text" --similarity_threshold 0.95

# AWS Textract whole document analysis

## Submit document to Textract for basic text analysis:
python cli_redact.py --task textract --textract_action submit --input_file example_data/example_of_emails_sent_to_a_professor_before_applying.pdf

## Submit document to Textract for analysis with signature extraction (the Job ID will be printed to the console; you need this to retrieve the results):
python cli_redact.py --task textract --textract_action submit --input_file example_data/Partnership-Agreement-Toolkit_0_0.pdf --extract_signatures

## Retrieve Textract results by job ID (returns a .json file output):
python cli_redact.py --task textract --textract_action retrieve --job_id 12345678-1234-1234-1234-123456789012

## List recent Textract jobs:
python cli_redact.py --task textract --textract_action list

""",
    )

    task_group = parser.add_argument_group("Task Selection")
    task_group.add_argument(
        "--task",
        choices=["redact", "deduplicate", "textract"],
        default="redact",
        help="Task to perform: redact (PII redaction/anonymisation), deduplicate (find duplicate content), or textract (AWS Textract batch operations).",
    )

    general_group = parser.add_argument_group("General Options")
    general_group.add_argument(
        "--input_file",
        nargs="+",
        help="Path to the input file(s) to process. Separate multiple files with a space, and use quotes if there are spaces in the file name.",
    )
    general_group.add_argument(
        "--output_dir", default=OUTPUT_FOLDER, help="Directory for all output files."
    )
    general_group.add_argument(
        "--input_dir", default=INPUT_FOLDER, help="Directory for all input files."
    )
    general_group.add_argument(
        "--language", default=DEFAULT_LANGUAGE, help="Language of the document content."
    )
    general_group.add_argument(
        "--allow_list",
        default=ALLOW_LIST_PATH,
        help="Path to a CSV file with words to exclude from redaction.",
    )
    general_group.add_argument(
        "--pii_detector",
        choices=[LOCAL_PII_OPTION, AWS_PII_OPTION, "None"],
        default=LOCAL_PII_OPTION,
        help="Core PII detection method (Local, AWS Comprehend, or None).",
    )
    general_group.add_argument(
        "--username", default=DIRECT_MODE_DEFAULT_USER, help="Username for the session."
    )
    general_group.add_argument(
        "--save_to_user_folders",
        default=SESSION_OUTPUT_FOLDER,
        help="Whether to save to user folders or not.",
    )

    general_group.add_argument(
        "--local_redact_entities",
        nargs="+",
        choices=full_entity_list,
        default=chosen_redact_entities,
        help=f"Local redaction entities to use. Default: {chosen_redact_entities}. Full list: {full_entity_list}.",
    )

    general_group.add_argument(
        "--aws_redact_entities",
        nargs="+",
        choices=full_comprehend_entity_list,
        default=chosen_comprehend_entities,
        help=f"AWS redaction entities to use. Default: {chosen_comprehend_entities}. Full list: {full_comprehend_entity_list}.",
    )

    general_group.add_argument(
        "--aws_access_key", default=AWS_ACCESS_KEY, help="Your AWS Access Key ID."
    )
    general_group.add_argument(
        "--aws_secret_key", default=AWS_SECRET_KEY, help="Your AWS Secret Access Key."
    )
    general_group.add_argument(
        "--cost_code", default=DEFAULT_COST_CODE, help="Cost code for tracking usage."
    )
    general_group.add_argument(
        "--aws_region", default=AWS_REGION, help="AWS region for cloud services."
    )
    general_group.add_argument(
        "--s3_bucket",
        default=DOCUMENT_REDACTION_BUCKET,
        help="S3 bucket name for cloud operations.",
    )
    general_group.add_argument(
        "--do_initial_clean",
        default=DO_INITIAL_TABULAR_DATA_CLEAN,
        help="Perform initial text cleaning for tabular data.",
    )
    general_group.add_argument(
        "--save_logs_to_csv",
        default=SAVE_LOGS_TO_CSV,
        help="Save processing logs to CSV files.",
    )
    general_group.add_argument(
        "--save_logs_to_dynamodb",
        default=SAVE_LOGS_TO_DYNAMODB,
        help="Save processing logs to DynamoDB.",
    )
    general_group.add_argument(
        "--display_file_names_in_logs",
        default=DISPLAY_FILE_NAMES_IN_LOGS,
        help="Include file names in log outputs.",
    )
    general_group.add_argument(
        "--upload_logs_to_s3",
        default=RUN_AWS_FUNCTIONS == "1",
        help="Upload log files to S3 after processing.",
    )
    general_group.add_argument(
        "--s3_logs_prefix",
        default=S3_USAGE_LOGS_FOLDER,
        help="S3 prefix for usage log files.",
    )

    pdf_group = parser.add_argument_group(
        "PDF/Image Redaction Options (.pdf, .png, .jpg)"
    )
    pdf_group.add_argument(
        "--ocr_method",
        choices=["AWS Textract", "Local OCR", "Local text"],
        default="Local OCR",
        help="OCR method for text extraction from images.",
    )
    pdf_group.add_argument(
        "--page_min", type=int, default=0, help="First page to redact."
    )
    pdf_group.add_argument(
        "--page_max", type=int, default=0, help="Last page to redact."
    )
    pdf_group.add_argument(
        "--images_dpi",
        type=float,
        default=float(IMAGES_DPI),
        help="DPI for image processing.",
    )
    pdf_group.add_argument(
        "--chosen_local_ocr_model",
        choices=["tesseract", "hybrid", "paddle"],
        default=CHOSEN_LOCAL_OCR_MODEL,
        help="Local OCR model to use.",
    )
    pdf_group.add_argument(
        "--preprocess_local_ocr_images",
        default=PREPROCESS_LOCAL_OCR_IMAGES,
        help="Preprocess images before OCR.",
    )
    pdf_group.add_argument(
        "--compress_redacted_pdf",
        default=COMPRESS_REDACTED_PDF,
        help="Compress the final redacted PDF.",
    )
    pdf_group.add_argument(
        "--return_pdf_end_of_redaction",
        default=RETURN_REDACTED_PDF,
        help="Return PDF at end of redaction process.",
    )
    pdf_group.add_argument(
        "--deny_list_file",
        default=DENY_LIST_PATH,
        help="Custom words file with terms to redact (deny list).",
    )
    pdf_group.add_argument(
        "--allow_list_file",
        default=ALLOW_LIST_PATH,
        help="Custom words file with terms to exclude from redaction (allow list).",
    )
    pdf_group.add_argument(
        "--redact_whole_page_file",
        default=WHOLE_PAGE_REDACTION_LIST_PATH,
        help="File for pages to redact completely.",
    )
    pdf_group.add_argument(
        "--handwrite_signature_extraction",
        nargs="+",
        default=default_handwrite_signature_checkbox,
        help='Handwriting and signature extraction options. Choose from "Extract handwriting", "Extract signatures".',
    )
    pdf_group.add_argument(
        "--extract_forms",
        action="store_true",
        help="Extract forms during Textract analysis.",
    )
    pdf_group.add_argument(
        "--extract_tables",
        action="store_true",
        help="Extract tables during Textract analysis.",
    )
    pdf_group.add_argument(
        "--extract_layout",
        action="store_true",
        help="Extract layout during Textract analysis.",
    )

    tabular_group = parser.add_argument_group(
        "Word/Tabular Anonymisation Options (.docx, .csv, .xlsx)"
    )
    tabular_group.add_argument(
        "--anon_strategy",
        choices=[
            "redact",
            "redact completely",
            "replace_redacted",
            "entity_type",
            "encrypt",
            "hash",
            "replace with 'REDACTED'",
            "replace with <ENTITY_NAME>",
            "mask",
            "fake_first_name",
        ],
        default=DEFAULT_TABULAR_ANONYMISATION_STRATEGY,
        help="The anonymisation strategy to apply.",
    )
    tabular_group.add_argument(
        "--text_columns",
        nargs="+",
        default=list(),
        help="A list of column names to anonymise or deduplicate in tabular data.",
    )
    tabular_group.add_argument(
        "--excel_sheets",
        nargs="+",
        default=list(),
        help="Specific Excel sheet names to process.",
    )
    tabular_group.add_argument(
        "--fuzzy_mistakes",
        type=int,
        default=DEFAULT_FUZZY_SPELLING_MISTAKES_NUM,
        help="Number of allowed spelling mistakes for fuzzy matching.",
    )
    tabular_group.add_argument(
        "--match_fuzzy_whole_phrase_bool",
        default=True,
        help="Whether fuzzy matching should match the whole phrase rather than individual words.",
    )
    duplicate_group = parser.add_argument_group("Duplicate Detection Options")
    duplicate_group.add_argument(
        "--duplicate_type",
        choices=["pages", "tabular"],
        default="pages",
        help="Type of duplicate detection: pages (for OCR files) or tabular (for CSV/Excel files).",
    )
    duplicate_group.add_argument(
        "--similarity_threshold",
        type=float,
        default=DEFAULT_DUPLICATE_DETECTION_THRESHOLD,
        help="Similarity threshold (0-1) to consider content as duplicates.",
    )
    duplicate_group.add_argument(
        "--min_word_count",
        type=int,
        default=DEFAULT_MIN_WORD_COUNT,
        help="Minimum word count for text to be considered in duplicate analysis.",
    )
    duplicate_group.add_argument(
        "--min_consecutive_pages",
        type=int,
        default=DEFAULT_MIN_CONSECUTIVE_PAGES,
        help="Minimum number of consecutive pages to consider as a match.",
    )
    duplicate_group.add_argument(
        "--greedy_match",
        default=USE_GREEDY_DUPLICATE_DETECTION,
        help="Use greedy matching strategy for consecutive pages.",
    )
    duplicate_group.add_argument(
        "--combine_pages",
        default=DEFAULT_COMBINE_PAGES,
        help="Combine text from the same page number within a file. Set to False to enable line-level duplicate detection.",
    )
    duplicate_group.add_argument(
        "--remove_duplicate_rows",
        default=REMOVE_DUPLICATE_ROWS,
        help="Remove duplicate rows from the output.",
    )

    textract_group = parser.add_argument_group("Textract Batch Operations Options")
    textract_group.add_argument(
        "--textract_action",
        choices=["submit", "retrieve", "list"],
        help="Textract action to perform: submit (submit document for analysis), retrieve (get results by job ID), or list (show recent jobs).",
    )
    textract_group.add_argument("--job_id", help="Textract job ID for retrieve action.")
    textract_group.add_argument(
        "--extract_signatures",
        action="store_true",
        help="Extract signatures during Textract analysis (for submit action).",
    )
    textract_group.add_argument(
        "--textract_bucket",
        default=TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET,
        help="S3 bucket name for Textract operations (overrides default).",
    )
    textract_group.add_argument(
        "--textract_input_prefix",
        default=TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER,
        help="S3 prefix for input files in Textract operations.",
    )
    textract_group.add_argument(
        "--textract_output_prefix",
        default=TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER,
        help="S3 prefix for output files in Textract operations.",
    )
    textract_group.add_argument(
        "--s3_textract_document_logs_subfolder",
        default=TEXTRACT_JOBS_S3_LOC,
        help="S3 prefix for logs in Textract operations.",
    )
    textract_group.add_argument(
        "--local_textract_document_logs_subfolder",
        default=TEXTRACT_JOBS_LOCAL_LOC,
        help="Local prefix for logs in Textract operations.",
    )
    textract_group.add_argument(
        "--poll_interval",
        type=int,
        default=30,
        help="Polling interval in seconds for Textract job status.",
    )
    textract_group.add_argument(
        "--max_poll_attempts",
        type=int,
        default=120,
        help="Maximum number of polling attempts for Textract job completion.",
    )
    if direct_mode_args:
        args = argparse.Namespace(**direct_mode_args)
    else:
        args = parser.parse_args()
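
    # The config defaults and CLI inputs arrive as "True"/"False" strings; normalise
    # them (and any booleans passed via direct_mode_args) to real booleans.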
    for flag_name in (
        "preprocess_local_ocr_images",
        "greedy_match",
        "combine_pages",
        "remove_duplicate_rows",
        "return_pdf_end_of_redaction",
        "compress_redacted_pdf",
        "do_initial_clean",
        "save_logs_to_csv",
        "save_logs_to_dynamodb",
        "display_file_names_in_logs",
        "match_fuzzy_whole_phrase_bool",
        "save_to_user_folders",
    ):
        setattr(args, flag_name, str(getattr(args, flag_name)) == "True")
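
    # Fold the standalone --extract_forms/--extract_tables/--extract_layout flags
    # into the handwriting/signature options list that the Textract calls expect.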
    extraction_options = (
        list(args.handwrite_signature_extraction)
        if args.handwrite_signature_extraction
        else []
    )
    if args.extract_forms:
        extraction_options.append("Extract forms")
    if args.extract_tables:
        extraction_options.append("Extract tables")
    if args.extract_layout:
        extraction_options.append("Extract layout")
    args.handwrite_signature_extraction = extraction_options

    if args.task in ["redact", "deduplicate"]:
        if args.input_file:
            if isinstance(args.input_file, str):
                args.input_file = [args.input_file]

            _, file_extension = os.path.splitext(args.input_file[0])
            file_extension = file_extension.lower()
        else:
            raise ValueError(f"Error: --input_file is required for the '{args.task}' task.")

    usage_logger = None
    if args.save_logs_to_csv or args.save_logs_to_dynamodb:
        from tools.cli_usage_logger import create_cli_usage_logger

        try:
            usage_logger = create_cli_usage_logger()
        except Exception as e:
            print(f"Warning: Could not initialise usage logger: {e}")
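
    # Resolve the session identifier and (optionally) per-user output/input and
    # Textract log folders based on --username and --save_to_user_folders.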
    (
        session_hash,
        args.output_dir,
        _,
        args.input_dir,
        args.textract_input_prefix,
        args.textract_output_prefix,
        args.s3_textract_document_logs_subfolder,
        args.local_textract_document_logs_subfolder,
    ) = get_username_and_folders(
        username=args.username,
        output_folder_textbox=args.output_dir,
        input_folder_textbox=args.input_dir,
        session_output_folder=args.save_to_user_folders,
        textract_document_upload_input_folder=args.textract_input_prefix,
        textract_document_upload_output_folder=args.textract_output_prefix,
        s3_textract_document_logs_subfolder=args.s3_textract_document_logs_subfolder,
        local_textract_document_logs_subfolder=args.local_textract_document_logs_subfolder,
    )

    print(
        f"Conducting analyses with user {args.username}. Outputs will be saved to {args.output_dir}."
    )

    if args.task in ["redact", "deduplicate"] and not args.input_file:
        print(f"Error: --input_file is required for '{args.task}' task.")
        return

    if args.ocr_method in ["Local OCR", "AWS Textract"]:
        args.prepare_images = True
    else:
        args.prepare_images = False

    from tools.cli_usage_logger import create_cli_usage_logger, log_redaction_usage
    if args.task == "redact":

        if file_extension in [".pdf", ".png", ".jpg", ".jpeg"]:
            print("--- Detected PDF/Image file. Starting Redaction Workflow... ---")
            start_time = time.time()
            try:
                from tools.file_conversion import prepare_image_or_pdf
                from tools.file_redaction import choose_and_run_redactor

                print("\nStep 1: Preparing document...")
                (
                    prep_summary,
                    prepared_pdf_paths,
                    image_file_paths,
                    _,
                    _,
                    pdf_doc,
                    image_annotations,
                    _,
                    original_cropboxes,
                    page_sizes,
                    _,
                    _,
                    _,
                    _,
                    _,
                ) = prepare_image_or_pdf(
                    file_paths=args.input_file,
                    text_extract_method=args.ocr_method,
                    all_line_level_ocr_results_df=pd.DataFrame(),
                    all_page_line_level_ocr_results_with_words_df=pd.DataFrame(),
                    first_loop_state=True,
                    prepare_for_review=False,
                    output_folder=args.output_dir,
                    input_folder=args.input_dir,
                    prepare_images=args.prepare_images,
                )
                print(f"Preparation complete. {prep_summary}")
                print("\nStep 2: Running redaction...")
                (
                    output_summary,
                    output_files,
                    _,
                    _,
                    log_files,
                    _,
                    _,
                    _,
                    _,
                    _,
                    _,
                    _,
                    _,
                    _,
                    comprehend_query_number,
                    _,
                    _,
                    _,
                    _,
                    _,
                    _,
                    page_sizes,
                    _,
                    _,
                    _,
                    total_textract_query_number,
                    _,
                    _,
                    _,
                    _,
                    _,
                    _,
                    _,
                ) = choose_and_run_redactor(
                    file_paths=args.input_file,
                    prepared_pdf_file_paths=prepared_pdf_paths,
                    pdf_image_file_paths=image_file_paths,
                    chosen_redact_entities=args.local_redact_entities,
                    chosen_redact_comprehend_entities=args.aws_redact_entities,
                    text_extraction_method=args.ocr_method,
                    in_allow_list=args.allow_list_file,
                    in_deny_list=args.deny_list_file,
                    redact_whole_page_list=args.redact_whole_page_file,
                    first_loop_state=True,
                    page_min=args.page_min,
                    page_max=args.page_max,
                    handwrite_signature_checkbox=args.handwrite_signature_extraction,
                    max_fuzzy_spelling_mistakes_num=args.fuzzy_mistakes,
                    match_fuzzy_whole_phrase_bool=args.match_fuzzy_whole_phrase_bool,
                    pymupdf_doc=pdf_doc,
                    annotations_all_pages=image_annotations,
                    page_sizes=page_sizes,
                    document_cropboxes=original_cropboxes,
                    pii_identification_method=args.pii_detector,
                    aws_access_key_textbox=args.aws_access_key,
                    aws_secret_key_textbox=args.aws_secret_key,
                    language=args.language,
                    output_folder=args.output_dir,
                    input_folder=args.input_dir,
                )

                end_time = time.time()
                processing_time = end_time - start_time

                if usage_logger:
                    try:
                        print("Saving logs to CSV")
                        doc_file_name = (
                            os.path.basename(args.input_file[0])
                            if args.display_file_names_in_logs
                            else "document"
                        )
                        data_file_name = ""

                        is_textract_call = args.ocr_method == "AWS Textract"

                        total_pages = len(page_sizes) if page_sizes else 1

                        textract_queries = (
                            int(total_textract_query_number) if is_textract_call else 0
                        )
                        comprehend_queries = (
                            int(comprehend_query_number)
                            if args.pii_detector == "AWS Comprehend"
                            else 0
                        )

                        handwriting_signature = (
                            ", ".join(args.handwrite_signature_extraction)
                            if args.handwrite_signature_extraction
                            else ""
                        )

                        log_redaction_usage(
                            logger=usage_logger,
                            session_hash=session_hash,
                            doc_file_name=doc_file_name,
                            data_file_name=data_file_name,
                            time_taken=processing_time,
                            total_pages=total_pages,
                            textract_queries=textract_queries,
                            pii_method=args.pii_detector,
                            comprehend_queries=comprehend_queries,
                            cost_code=args.cost_code,
                            handwriting_signature=handwriting_signature,
                            text_extraction_method=args.ocr_method,
                            is_textract_call=is_textract_call,
                            task=args.task,
                            save_to_dynamodb=args.save_logs_to_dynamodb,
                            save_to_s3=args.upload_logs_to_s3,
                            s3_bucket=args.s3_bucket,
                            s3_key_prefix=args.s3_logs_prefix,
                        )
                    except Exception as e:
                        print(f"Warning: Could not log usage data: {e}")

                print("\n--- Redaction Process Complete ---")
                print(f"Summary: {output_summary}")
                print(f"Processing time: {processing_time:.2f} seconds")
                print(f"\nOutput files saved to: {args.output_dir}")
                print("Generated Files:", sorted(output_files))
                if log_files:
                    print("Log Files:", sorted(log_files))

            except Exception as e:
                print(
                    f"\nAn error occurred during the PDF/Image redaction workflow: {e}"
                )
        elif file_extension in [".docx", ".xlsx", ".xls", ".csv", ".parquet"]:
            print(
                "--- Detected Word/Tabular file. Starting Anonymisation Workflow... ---"
            )
            start_time = time.time()
            try:
                from tools.data_anonymise import anonymise_files_with_open_text

                (
                    output_summary,
                    output_files,
                    _,
                    _,
                    log_files,
                    _,
                    processing_time,
                    comprehend_query_number,
                ) = anonymise_files_with_open_text(
                    file_paths=args.input_file,
                    in_text="",
                    anon_strategy=args.anon_strategy,
                    chosen_cols=args.text_columns,
                    chosen_redact_entities=args.local_redact_entities,
                    in_allow_list=args.allow_list_file,
                    in_excel_sheets=args.excel_sheets,
                    first_loop_state=True,
                    output_folder=args.output_dir,
                    in_deny_list=args.deny_list_file,
                    max_fuzzy_spelling_mistakes_num=args.fuzzy_mistakes,
                    pii_identification_method=args.pii_detector,
                    chosen_redact_comprehend_entities=args.aws_redact_entities,
                    aws_access_key_textbox=args.aws_access_key,
                    aws_secret_key_textbox=args.aws_secret_key,
                    language=args.language,
                    do_initial_clean=args.do_initial_clean,
                )
                end_time = time.time()
                processing_time = end_time - start_time

                if usage_logger:
                    try:
                        print("Saving logs to CSV")

                        doc_file_name = ""
                        data_file_name = (
                            os.path.basename(args.input_file[0])
                            if args.display_file_names_in_logs
                            else "data_file"
                        )

                        is_textract_call = False

                        total_pages = 0

                        textract_queries = 0
                        comprehend_queries = (
                            comprehend_query_number
                            if args.pii_detector == "AWS Comprehend"
                            else 0
                        )

                        handwriting_signature = ""

                        log_redaction_usage(
                            logger=usage_logger,
                            session_hash=session_hash,
                            doc_file_name=doc_file_name,
                            data_file_name=data_file_name,
                            time_taken=processing_time,
                            total_pages=total_pages,
                            textract_queries=textract_queries,
                            pii_method=args.pii_detector,
                            comprehend_queries=comprehend_queries,
                            cost_code=args.cost_code,
                            handwriting_signature=handwriting_signature,
                            text_extraction_method="tabular",
                            is_textract_call=is_textract_call,
                            task=args.task,
                            save_to_dynamodb=args.save_logs_to_dynamodb,
                            save_to_s3=args.upload_logs_to_s3,
                            s3_bucket=args.s3_bucket,
                            s3_key_prefix=args.s3_logs_prefix,
                        )
                    except Exception as e:
                        print(f"Warning: Could not log usage data: {e}")

                print("\n--- Anonymisation Process Complete ---")
                print(f"Summary: {output_summary}")
                print(f"Processing time: {processing_time:.2f} seconds")
                print(f"\nOutput files saved to: {args.output_dir}")
                print("Generated Files:", sorted(output_files))
                if log_files:
                    print("Log Files:", sorted(log_files))

            except Exception as e:
                print(
                    f"\nAn error occurred during the Word/Tabular anonymisation workflow: {e}"
                )

        else:
            print(f"Error: Unsupported file type '{file_extension}' for redaction.")
            print("Supported types for redaction: .pdf, .png, .jpg, .jpeg")
            print(
                "Supported types for anonymisation: .docx, .xlsx, .xls, .csv, .parquet"
            )
    elif args.task == "deduplicate":
        print("--- Starting Duplicate Detection Workflow... ---")
        try:
            from tools.find_duplicate_pages import run_duplicate_analysis

            if args.duplicate_type == "pages":

                if file_extension == ".csv":
                    print(
                        "--- Detected OCR CSV file. Starting Page Duplicate Detection... ---"
                    )

                    start_time = time.time()

                    if args.combine_pages is True:
                        print("Combining pages...")
                    else:
                        print("Using line-level duplicate detection...")
                    (
                        results_df,
                        output_paths,
                        full_data_by_file,
                        processing_time,
                        task_textbox,
                    ) = run_duplicate_analysis(
                        files=args.input_file,
                        threshold=args.similarity_threshold,
                        min_words=args.min_word_count,
                        min_consecutive=args.min_consecutive_pages,
                        greedy_match=args.greedy_match,
                        combine_pages=args.combine_pages,
                        output_folder=args.output_dir,
                    )

                    end_time = time.time()
                    processing_time = end_time - start_time

                    print("\n--- Page Duplicate Detection Complete ---")
                    print(f"Found {len(results_df)} duplicate matches")
                    print(f"\nOutput files saved to: {args.output_dir}")
                    if output_paths:
                        print("Generated Files:", sorted(output_paths))

                else:
                    print(
                        "Error: Page duplicate detection requires CSV files with OCR data."
                    )
                    print("Please provide a CSV file containing OCR output data.")

                if usage_logger:
                    try:
                        print("Saving logs to CSV")
                        doc_file_name = (
                            os.path.basename(args.input_file[0])
                            if args.display_file_names_in_logs
                            else "document"
                        )
                        data_file_name = ""

                        is_textract_call = False

                        # Page counts are not tracked for deduplication runs
                        total_pages = 0

                        textract_queries = 0
                        comprehend_queries = 0

                        handwriting_signature = ""

                        log_redaction_usage(
                            logger=usage_logger,
                            session_hash=session_hash,
                            doc_file_name=doc_file_name,
                            data_file_name=data_file_name,
                            time_taken=processing_time,
                            total_pages=total_pages,
                            textract_queries=textract_queries,
                            pii_method=args.pii_detector,
                            comprehend_queries=comprehend_queries,
                            cost_code=args.cost_code,
                            handwriting_signature=handwriting_signature,
                            text_extraction_method=args.ocr_method,
                            is_textract_call=is_textract_call,
                            task=args.task,
                            save_to_dynamodb=args.save_logs_to_dynamodb,
                            save_to_s3=args.upload_logs_to_s3,
                            s3_bucket=args.s3_bucket,
                            s3_key_prefix=args.s3_logs_prefix,
                        )
                    except Exception as e:
                        print(f"Warning: Could not log usage data: {e}")
            elif args.duplicate_type == "tabular":

                from tools.find_duplicate_tabular import run_tabular_duplicate_detection

                if file_extension in [".csv", ".xlsx", ".xls", ".parquet"]:
                    print(
                        "--- Detected tabular file. Starting Tabular Duplicate Detection... ---"
                    )

                    start_time = time.time()

                    (
                        results_df,
                        output_paths,
                        full_data_by_file,
                        processing_time,
                        task_textbox,
                    ) = run_tabular_duplicate_detection(
                        files=args.input_file,
                        threshold=args.similarity_threshold,
                        min_words=args.min_word_count,
                        text_columns=args.text_columns,
                        output_folder=args.output_dir,
                        do_initial_clean_dup=args.do_initial_clean,
                        in_excel_tabular_sheets=args.excel_sheets,
                        remove_duplicate_rows=args.remove_duplicate_rows,
                    )

                    end_time = time.time()
                    processing_time = end_time - start_time
                    if usage_logger:
                        try:
                            print("Saving logs to CSV")
                            doc_file_name = ""
                            data_file_name = (
                                os.path.basename(args.input_file[0])
                                if args.display_file_names_in_logs
                                else "data_file"
                            )

                            is_textract_call = False

                            # Page counts are not tracked for tabular deduplication
                            total_pages = 0

                            textract_queries = 0
                            comprehend_queries = 0

                            handwriting_signature = ""

                            log_redaction_usage(
                                logger=usage_logger,
                                session_hash=session_hash,
                                doc_file_name=doc_file_name,
                                data_file_name=data_file_name,
                                time_taken=processing_time,
                                total_pages=total_pages,
                                textract_queries=textract_queries,
                                pii_method=args.pii_detector,
                                comprehend_queries=comprehend_queries,
                                cost_code=args.cost_code,
                                handwriting_signature=handwriting_signature,
                                text_extraction_method=args.ocr_method,
                                is_textract_call=is_textract_call,
                                task=args.task,
                                save_to_dynamodb=args.save_logs_to_dynamodb,
                                save_to_s3=args.upload_logs_to_s3,
                                s3_bucket=args.s3_bucket,
                                s3_key_prefix=args.s3_logs_prefix,
                            )
                        except Exception as e:
                            print(f"Warning: Could not log usage data: {e}")

                    print("\n--- Tabular Duplicate Detection Complete ---")
                    print(f"Found {len(results_df)} duplicate matches")
                    print(f"\nOutput files saved to: {args.output_dir}")
                    if output_paths:
                        print("Generated Files:", sorted(output_paths))

                else:
                    print(
                        "Error: Tabular duplicate detection requires CSV, Excel, or Parquet files."
                    )
                    print("Supported types: .csv, .xlsx, .xls, .parquet")
            else:
                print(f"Error: Invalid duplicate type '{args.duplicate_type}'.")
                print("Valid options: 'pages' or 'tabular'")

        except Exception as e:
            print(f"\nAn error occurred during the duplicate detection workflow: {e}")
    elif args.task == "textract":
        print("--- Starting Textract Batch Operations Workflow... ---")

        if not args.textract_action:
            print("Error: --textract_action is required for textract task.")
            print("Valid options: 'submit', 'retrieve', or 'list'")
            return

        try:
            if args.textract_action == "submit":
                from tools.textract_batch_call import (
                    analyse_document_with_textract_api,
                    load_in_textract_job_details,
                )

                if not args.input_file:
                    print("Error: --input_file is required for submit action.")
                    return

                print(f"--- Submitting document to Textract: {args.input_file} ---")

                start_time = time.time()

                job_df = load_in_textract_job_details(
                    load_s3_jobs_loc=args.s3_textract_document_logs_subfolder,
                    load_local_jobs_loc=args.local_textract_document_logs_subfolder,
                )
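
                # Handwriting extraction is always requested; signature extraction is
                # added only when --extract_signatures is passed.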
                signature_options = (
                    ["Extract handwriting", "Extract signatures"]
                    if args.extract_signatures
                    else ["Extract handwriting"]
                )

                textract_bucket = args.textract_bucket if args.textract_bucket else ""

                (
                    result_message,
                    job_id,
                    job_type,
                    successful_job_number,
                    is_textract_call,
                    total_pages,
                    task_textbox,
                ) = analyse_document_with_textract_api(
                    local_pdf_path=args.input_file,
                    s3_input_prefix=args.textract_input_prefix,
                    s3_output_prefix=args.textract_output_prefix,
                    job_df=job_df,
                    s3_bucket_name=textract_bucket,
                    general_s3_bucket_name=args.s3_bucket,
                    local_output_dir=args.output_dir,
                    handwrite_signature_checkbox=signature_options,
                    aws_region=args.aws_region,
                )

                end_time = time.time()
                processing_time = end_time - start_time

                print("\n--- Textract Job Submitted Successfully ---")
                print(f"Job ID: {job_id}")
                print(f"Job Type: {job_type}")
                print(f"Message: {result_message}")
                print(f"Results will be available in: {args.output_dir}")

                if usage_logger:
                    try:
                        print("Saving logs to CSV")
                        doc_file_name = (
                            os.path.basename(args.input_file[0])
                            if args.display_file_names_in_logs
                            else "document"
                        )
                        data_file_name = ""

                        is_textract_call = True
                        # Record Textract as the extraction method for this job's usage log
                        args.ocr_method = "AWS Textract"

                        textract_queries = total_pages
                        comprehend_queries = 0

                        handwriting_signature = ""

                        log_redaction_usage(
                            logger=usage_logger,
                            session_hash=session_hash,
                            doc_file_name=doc_file_name,
                            data_file_name=data_file_name,
                            time_taken=processing_time,
                            total_pages=total_pages,
                            textract_queries=textract_queries,
                            pii_method=args.pii_detector,
                            comprehend_queries=comprehend_queries,
                            cost_code=args.cost_code,
                            handwriting_signature=handwriting_signature,
                            text_extraction_method=args.ocr_method,
                            is_textract_call=is_textract_call,
                            task=args.task,
                            save_to_dynamodb=args.save_logs_to_dynamodb,
                            save_to_s3=args.upload_logs_to_s3,
                            s3_bucket=args.s3_bucket,
                            s3_key_prefix=args.s3_logs_prefix,
                        )
                    except Exception as e:
                        print(f"Warning: Could not log usage data: {e}")
            elif args.textract_action == "retrieve":
                print(f"--- Retrieving Textract results for Job ID: {args.job_id} ---")

                from tools.textract_batch_call import (
                    load_in_textract_job_details,
                    poll_whole_document_textract_analysis_progress_and_download,
                )

                if not args.job_id:
                    print("Error: --job_id is required for retrieve action.")
                    return

                print("Loading existing job details...")
                job_df = load_in_textract_job_details(
                    load_s3_jobs_loc=args.s3_textract_document_logs_subfolder,
                    load_local_jobs_loc=args.local_textract_document_logs_subfolder,
                )
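
                # Look up the job type recorded when this job was submitted;
                # fall back to plain text detection if it is not in the log.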
                job_type = "document_text_detection"
                if not job_df.empty and "job_id" in job_df.columns:
                    matching_jobs = job_df.loc[job_df["job_id"] == args.job_id]
                    if not matching_jobs.empty and "job_type" in matching_jobs.columns:
                        job_type = matching_jobs.iloc[0]["job_type"]

                textract_bucket = args.textract_bucket if args.textract_bucket else ""

                print("Polling for completion and downloading results...")
                downloaded_file_path, job_status, updated_job_df, output_filename = (
                    poll_whole_document_textract_analysis_progress_and_download(
                        job_id=args.job_id,
                        job_type_dropdown=job_type,
                        s3_output_prefix=args.textract_output_prefix,
                        pdf_filename="",
                        job_df=job_df,
                        s3_bucket_name=textract_bucket,
                        load_s3_jobs_loc=args.s3_textract_document_logs_subfolder,
                        load_local_jobs_loc=args.local_textract_document_logs_subfolder,
                        local_output_dir=args.output_dir,
                        poll_interval_seconds=args.poll_interval,
                        max_polling_attempts=args.max_poll_attempts,
                    )
                )

                print("\n--- Textract Results Retrieved Successfully ---")
                print(f"Job Status: {job_status}")
                print(f"Downloaded File: {downloaded_file_path}")

            elif args.textract_action == "list":
                from tools.textract_batch_call import load_in_textract_job_details

                print("--- Listing Recent Textract Jobs ---")

                job_df = load_in_textract_job_details(
                    load_s3_jobs_loc=args.s3_textract_document_logs_subfolder,
                    load_local_jobs_loc=args.local_textract_document_logs_subfolder,
                )

                if job_df.empty:
                    print("No recent Textract jobs found.")
                else:
                    print(f"\nFound {len(job_df)} recent Textract jobs:")
                    print("-" * 80)
                    for _, job in job_df.iterrows():
                        print(f"Job ID: {job.get('job_id', 'N/A')}")
                        print(f"File: {job.get('file_name', 'N/A')}")
                        print(f"Type: {job.get('job_type', 'N/A')}")
                        print(f"Signatures: {job.get('signature_extraction', 'N/A')}")
                        print(f"Date: {job.get('job_date_time', 'N/A')}")
                        print("-" * 80)

            else:
                print(f"Error: Invalid textract_action '{args.textract_action}'.")
                print("Valid options: 'submit', 'retrieve', or 'list'")

        except Exception as e:
            print(f"\nAn error occurred during the Textract workflow: {e}")

    else:
        print(f"Error: Invalid task '{args.task}'.")
        print("Valid options: 'redact', 'deduplicate', or 'textract'")


if __name__ == "__main__":
    main()