Spaces:

danial0203
/

PDFtoCSVBetterVersion

Runtime error

App Files Files Community

PDFtoCSVBetterVersion / app.py

danial0203

Update app.py

8f49668 verified 10 months ago

raw

history blame

5.21 kB

	import boto3
	import csv
	import os
	from botocore.exceptions import NoCredentialsError
	from pdf2image import convert_from_path
	from PIL import Image
	import gradio as gr
	from io import BytesIO
	from datasets.filesystems import S3FileSystem
	import s3fs


	# AWS configuration, read from environment variables
	aws_access_key_id = os.environ.get('AWS_ACCESS_KEY')
	aws_secret_access_key = os.environ.get('AWS_SECRET_KEY')
	region_name = os.environ.get('AWS_REGION')
	s3_bucket = os.environ.get('AWS_BUCKET')

	# s3fs filesystem handle built from the credentials above
	s3_fs = s3fs.S3FileSystem(
	    key=aws_access_key_id,
	    secret=aws_secret_access_key,
	    client_kwargs={'region_name': region_name},
	)

	# Textract client created with the same credentials and region
	textract_client = boto3.client(
	    'textract',
	    aws_access_key_id=aws_access_key_id,
	    aws_secret_access_key=aws_secret_access_key,
	    region_name=region_name,
	)

	def upload_file_to_s3(file_content, bucket, object_name=None):
	    """Write the contents of a file-like object to an S3 object via s3fs.

	    Args:
	        file_content: Object with a ``read()`` method returning the bytes to store.
	        bucket: Name of the destination bucket.
	        object_name: Key of the destination object; required despite the default.

	    Returns:
	        ``object_name`` when the write succeeded, ``None`` when any error occurred
	        (the error is printed, not raised).

	    Raises:
	        ValueError: If ``object_name`` is ``None``.
	    """
	    if object_name is None:
	        raise ValueError("object_name cannot be None")

	    uploaded_key = None
	    try:
	        with s3_fs.open(f's3://{bucket}/{object_name}', 'wb') as remote_file:
	            payload = file_content.read()
	            remote_file.write(payload)
	        uploaded_key = object_name
	    except FileNotFoundError:
	        print("The file was not found")
	    except Exception as error:  # permissions, AWS problems, anything else
	        print(f"An error occurred: {error}")
	    return uploaded_key


	def process_image(file_content, s3_bucket, textract_client, object_name):
	    """Upload one image to S3 and run Textract table analysis on it.

	    Args:
	        file_content: File-like object holding the image bytes.
	        s3_bucket: Bucket the image is uploaded to and read back from.
	        textract_client: Client object providing ``analyze_document``.
	        object_name: S3 key to store the image under.

	    Returns:
	        The ``analyze_document`` response, or ``None`` when the upload
	        did not yield an object key.
	    """
	    uploaded_key = upload_file_to_s3(file_content, s3_bucket, object_name)
	    if uploaded_key:
	        # Point Textract at the object that was just uploaded
	        s3_location = {'Bucket': s3_bucket, 'Name': uploaded_key}
	        return textract_client.analyze_document(
	            Document={'S3Object': s3_location},
	            FeatureTypes=["TABLES"],
	        )
	    return None

	def generate_table_csv(tables, blocks_map, writer):
	    """Write every row of every table to ``writer``.

	    Args:
	        tables: Iterable of table blocks.
	        blocks_map: Mapping of block Id to block, used to resolve cells.
	        writer: Object with a ``writerow`` method (e.g. a ``csv.writer``).

	    Each row is emitted with columns 1..max column index present in that
	    row; missing columns are filled with empty strings.
	    """
	    for table_block in tables:
	        row_map = get_rows_columns_map(table_block, blocks_map)
	        for cells_by_column in row_map.values():
	            last_column = max(cells_by_column)
	            writer.writerow(
	                [cells_by_column.get(column, "") for column in range(1, last_column + 1)]
	            )

	def get_rows_columns_map(table_result, blocks_map):
	    """Group the cells of a table block by row index and column index.

	    Args:
	        table_result: Table block dict; its ``CHILD`` relationships list the
	            Ids of the child blocks.
	        blocks_map: Mapping of block Id to block dict.

	    Returns:
	        dict mapping ``RowIndex`` to ``{ColumnIndex: cell text}``. Children
	        that lack either index are skipped. A table without a
	        ``Relationships`` key yields an empty dict.
	    """
	    rows = {}
	    # A block may lack the 'Relationships' key (get_text guards against the
	    # same case); treat that as a table with no cells instead of raising.
	    for relationship in table_result.get('Relationships', []):
	        if relationship['Type'] != 'CHILD':
	            continue
	        for child_id in relationship['Ids']:
	            cell = blocks_map[child_id]
	            if 'RowIndex' in cell and 'ColumnIndex' in cell:
	                row_cells = rows.setdefault(cell['RowIndex'], {})
	                row_cells[cell['ColumnIndex']] = get_text(cell, blocks_map)
	    return rows

	def get_text(result, blocks_map):
	    """Collect the text of a block's children into one space-separated string.

	    Args:
	        result: Block dict whose ``CHILD`` relationships are followed.
	        blocks_map: Mapping of block Id to block dict.

	    Returns:
	        The ``Text`` of each ``WORD`` child, plus ``X`` for each
	        ``SELECTION_ELEMENT`` child whose status is ``SELECTED``, joined by
	        single spaces and stripped. Empty string when there are no
	        relationships.
	    """
	    pieces = []
	    for relation in result.get('Relationships', []):
	        if relation['Type'] != 'CHILD':
	            continue
	        for block_id in relation['Ids']:
	            child = blocks_map[block_id]
	            kind = child['BlockType']
	            if kind == 'WORD':
	                pieces.append(child['Text'])
	            elif kind == 'SELECTION_ELEMENT' and child['SelectionStatus'] == 'SELECTED':
	                pieces.append('X')
	    return ' '.join(pieces).strip()

	def is_image_file(filename):
	    """Return True when ``filename`` has a PNG or JPEG file extension.

	    The check is case-insensitive and anchored on the dot, so a name that
	    merely ends in the letters "png" or "jpg" (e.g. ``notes_png``) is not
	    mistaken for an image.
	    """
	    image_file_extensions = ('.png', '.jpg', '.jpeg')
	    # str.endswith accepts a tuple and tests each suffix in one call
	    return filename.lower().endswith(image_file_extensions)

	def process_file_and_generate_csv(input_file):
	    """Extract all tables from an uploaded document into ``output.csv``.

	    Uploads recognised by ``is_image_file`` are opened directly; anything
	    else is converted to page images with ``convert_from_path``. Each page is
	    re-encoded as JPEG and passed to ``process_image``; the tables in each
	    response are appended to the CSV. Pages for which ``process_image``
	    returns nothing are skipped.

	    Args:
	        input_file: Uploaded file object exposing ``read()`` and ``name``.
	            NOTE(review): ``name`` is assumed to be the on-disk path of the
	            upload (as with a temp-file wrapper) - confirm for the Gradio
	            version in use.

	    Returns:
	        tuple: ``(csv_path, csv_path)`` - the path of the written CSV for
	        the file output, and the same path for the text output.
	    """
	    output_csv_path = "output.csv"  # Output CSV file name
	    object_name = os.path.basename(input_file.name)

	    if is_image_file(object_name):
	        # Keep the bytes in memory so the image can be decoded lazily
	        file_content = BytesIO(input_file.read())
	        images = [Image.open(file_content)]
	    else:
	        # convert_from_path needs a filesystem path, not an in-memory buffer
	        images = convert_from_path(input_file.name)

	    # csv.writer emits str, so the target must be a text-mode file;
	    # newline="" lets the csv module control line endings itself.
	    with open(output_csv_path, "w", newline="", encoding="utf-8") as csv_file:
	        writer = csv.writer(csv_file)
	        for i, image in enumerate(images):
	            # JPEG cannot store alpha or palette modes (e.g. RGBA/P from PNGs)
	            if image.mode not in ("L", "RGB", "CMYK"):
	                image = image.convert("RGB")
	            image_byte_array = BytesIO()
	            image.save(image_byte_array, format='JPEG')
	            image_byte_array.seek(0)

	            response = process_image(image_byte_array, s3_bucket, textract_client, f"{object_name}_{i}.jpg")
	            if response:
	                blocks = response['Blocks']
	                blocks_map = {block['Id']: block for block in blocks}
	                tables = [block for block in blocks if block['BlockType'] == "TABLE"]
	                generate_table_csv(tables, blocks_map, writer)

	    # Return the file path (not an in-memory buffer) for the file output
	    return output_csv_path, output_csv_path

	# Gradio interface: one file upload in, a downloadable CSV plus a text field out
	_upload_input = gr.File(label="Upload your file (PDF, PNG, JPG, TIFF)")
	_result_outputs = [gr.File(label="Download Generated CSV"), "text"]
	iface = gr.Interface(
	    fn=process_file_and_generate_csv,
	    inputs=_upload_input,
	    outputs=_result_outputs,
	    description="Upload a document to extract tables into a CSV file.",
	)

	# Start serving the interface
	iface.launch()