"""AWS Lambda entry point: fetch a file from S3, run ``app.py`` on it, upload results.

For each record in the incoming event the handler downloads the referenced
object into ``TMP_DIR``, runs ``python app.py`` as a subprocess with the
options found under the event's ``"arguments"`` key, and uploads every file
found in the output directory back to the same bucket.
"""
import os
import posixpath
import shutil
import subprocess
from urllib.parse import unquote_plus

import boto3

print("In lambda_entrypoint function")

# The client is created once, at import time, and shared by the helpers below.
try:
    s3_client = boto3.client("s3", region_name="eu-west-2")
    print("s3_client is initialized:", s3_client)
except Exception as e:
    print(f"Error initializing s3_client: {e}")
    raise

TMP_DIR = "/tmp/"


def download_file_from_s3(bucket_name, key, download_path):
    """Download a file from S3 to the local filesystem.

    Args:
        bucket_name: Name of the source bucket.
        key: Object key to download.
        download_path: Local path the object is written to.
    """
    s3_client.download_file(bucket_name, key, download_path)
    print(f"Downloaded {key} to {download_path}")


def upload_file_to_s3(file_path, bucket_name, key):
    """Upload a file to S3.

    Args:
        file_path: Local path of the file to upload.
        bucket_name: Name of the destination bucket.
        key: Object key the file is stored under.
    """
    s3_client.upload_file(file_path, bucket_name, key)
    print(f"Uploaded {file_path} to {key}")


def _reset_directory(path):
    """Delete *path* and everything in it, then recreate it empty."""
    shutil.rmtree(path, ignore_errors=True)
    os.makedirs(path, exist_ok=True)


def _build_command(input_file_path, ocr_method, pii_detector, page_min, page_max, output_dir):
    """Return the argv list used to launch ``app.py`` for one input file."""
    return [
        "python",
        "app.py",
        "--input_file",
        input_file_path,
        "--ocr_method",
        ocr_method,
        "--pii_detector",
        pii_detector,
        "--page_min",
        page_min,
        "--page_max",
        page_max,
        "--output_dir",
        output_dir,
    ]


def _upload_outputs(output_dir, bucket_name, input_key):
    """Upload every file under *output_dir* to ``<dirname(input_key)>/output/``."""
    key_prefix = posixpath.dirname(input_key)
    for root, _, files in os.walk(output_dir):
        for file_name in files:
            local_file_path = os.path.join(root, file_name)
            # posixpath.join skips an empty prefix, so an input object at the
            # bucket root produces "output/<name>" rather than "/output/<name>".
            output_key = posixpath.join(key_prefix, "output", file_name)
            upload_file_to_s3(local_file_path, bucket_name, output_key)


def lambda_handler(event, context):
    """Process every S3 record in *event* and upload the generated outputs.

    Args:
        event: Lambda event. ``event["Records"]`` (optional) holds S3
            notification records; ``event["arguments"]`` (optional) may hold
            ``input_file``, ``ocr_method``, ``pii_detector``, ``page_min``,
            ``page_max``, ``allow_list`` and ``output_dir``.
        context: Lambda context object (unused).

    Returns:
        ``{"statusCode": 200, "body": "Processing complete."}`` once all
        records have been handled.

    Raises:
        ValueError: If a record yields no bucket name or no input key.
        subprocess.CalledProcessError: If ``app.py`` exits with a non-zero
            status.
    """
    print("In lambda_handler function")

    # Create necessary directories
    default_output_dir = os.path.join(TMP_DIR, "output")
    os.makedirs(os.path.join(TMP_DIR, "input"), exist_ok=True)
    os.makedirs(default_output_dir, exist_ok=True)

    # Read "Records" once with .get() so that an event without the key does
    # not raise KeyError before the loop's own default can apply.
    records = event.get("Records", [{}])
    # The arguments belong to the event, not to a record, so read them once.
    arguments = event.get("arguments", {})

    print("Got to record loop")
    print("Event records is:", records)

    # Extract S3 bucket and object key from the Records
    for record in records:
        s3_info = record.get("s3", {})
        bucket_name = s3_info.get("bucket", {}).get("name")
        input_key = s3_info.get("object", {}).get("key")
        if input_key:
            # Keys in S3 event notifications are URL-encoded (e.g. spaces
            # arrive as "+"); decode before using the key with the S3 API.
            input_key = unquote_plus(input_key)
        print(f"Processing file {input_key} from bucket {bucket_name}")

        # Fall back to an explicitly supplied input file.
        if not input_key:
            input_key = arguments.get("input_file", "")

        if not bucket_name or not input_key:
            raise ValueError(
                "Event must supply an S3 bucket name and an input object key "
                f"(bucket={bucket_name!r}, key={input_key!r})"
            )

        # Extract additional arguments
        ocr_method = arguments.get(
            "ocr_method",
            "Complex image analysis - docs with handwriting/signatures (AWS Textract)",
        )
        pii_detector = arguments.get("pii_detector", "AWS Comprehend")
        page_min = str(arguments.get("page_min", 0))
        page_max = str(arguments.get("page_max", 0))
        allow_list = arguments.get("allow_list", None)
        output_dir = arguments.get("output_dir", default_output_dir)

        print(f"OCR Method: {ocr_method}")
        print(f"PII Detector: {pii_detector}")
        print(f"Page Range: {page_min} - {page_max}")
        print(f"Allow List: {allow_list}")
        print(f"Output Directory: {output_dir}")

        if "output_dir" not in arguments:
            # Files left in the default output directory by an earlier record
            # (or an earlier invocation, if the environment's /tmp persists)
            # would otherwise be uploaded again under this input's prefix.
            # A caller-supplied directory is left untouched.
            _reset_directory(output_dir)

        # Download input file
        input_file_path = os.path.join(TMP_DIR, "input", os.path.basename(input_key))
        download_file_from_s3(bucket_name, input_key, input_file_path)

        # Construct command
        command = _build_command(
            input_file_path, ocr_method, pii_detector, page_min, page_max, output_dir
        )

        # Add allow_list only if provided
        if allow_list:
            allow_list_path = os.path.join(TMP_DIR, "allow_list.csv")
            download_file_from_s3(bucket_name, allow_list, allow_list_path)
            command.extend(["--allow_list", allow_list_path])

        try:
            result = subprocess.run(command, capture_output=True, text=True, check=True)
        except subprocess.CalledProcessError as e:
            print("Error during processing:", e.stderr)
            raise
        else:
            print("Processing succeeded:", result.stdout)

        # Upload output files back to S3
        # NOTE(review): outputs go to the same bucket as the input; if the
        # trigger is not restricted by prefix this may re-invoke the function
        # on its own outputs — confirm the trigger configuration.
        _upload_outputs(output_dir, bucket_name, input_key)

    return {"statusCode": 200, "body": "Processing complete."}