Added option for running redact function through CLI (i.e. not going through Gradio UI or API). Test functions for running this through AWS Lambda.
e5dfae7
import boto3 | |
import os | |
import subprocess | |
from urllib.parse import unquote_plus | |
s3_client = boto3.client("s3") | |
def download_file_from_s3(bucket_name, key, download_path): | |
"""Download a file from S3 to the local filesystem.""" | |
s3_client.download_file(bucket_name, key, download_path) | |
print(f"Downloaded {key} to {download_path}") | |
def upload_file_to_s3(file_path, bucket_name, key): | |
"""Upload a file to S3.""" | |
s3_client.upload_file(file_path, bucket_name, key) | |
print(f"Uploaded {file_path} to {key}") | |
def lambda_handler(event, context): | |
"""Main Lambda function handler.""" | |
# Parse the S3 event | |
for record in event["Records"]: | |
bucket_name = record["s3"]["bucket"]["name"] | |
input_key = unquote_plus(record["s3"]["object"]["key"]) | |
print(f"Processing file {input_key} from bucket {bucket_name}") | |
# Prepare paths | |
input_file_path = f"/tmp/{os.path.basename(input_key)}" | |
allow_list_path = f"/tmp/allow_list.csv" # Adjust this as needed | |
output_dir = "/tmp/output" | |
os.makedirs(output_dir, exist_ok=True) | |
# Download input file | |
download_file_from_s3(bucket_name, input_key, input_file_path) | |
# (Optional) Download allow_list if needed | |
allow_list_key = "path/to/allow_list.csv" # Adjust path as required | |
download_file_from_s3(bucket_name, allow_list_key, allow_list_path) | |
# Construct and run the command | |
command = [ | |
"python", | |
"app.py", | |
"--input_file", input_file_path, | |
"--ocr_method", "Complex image analysis - docs with handwriting/signatures (AWS Textract)", | |
"--pii_detector", "AWS Comprehend", | |
"--page_min", "0", | |
"--page_max", "0", | |
"--allow_list", allow_list_path, | |
"--output_dir", output_dir, | |
] | |
try: | |
result = subprocess.run(command, capture_output=True, text=True, check=True) | |
print("Processing succeeded:", result.stdout) | |
except subprocess.CalledProcessError as e: | |
print("Error during processing:", e.stderr) | |
raise e | |
# Upload output files back to S3 | |
for root, _, files in os.walk(output_dir): | |
for file_name in files: | |
local_file_path = os.path.join(root, file_name) | |
output_key = f"{os.path.dirname(input_key)}/output/{file_name}" | |
upload_file_to_s3(local_file_path, bucket_name, output_key) | |
return {"statusCode": 200, "body": "Processing complete."} |