Spaces:

wyndee
/

pdf-table-extractor

Running

App Files Files Community

pdf-table-extractor / table_extraction.py

wyndee

Update table_extraction.py

da4f677 verified 5 months ago

raw

history blame contribute delete

8.82 kB

	import csv
	import logging
	import math

	import cv2
	import numpy as np
	import pytesseract
	from pdf2image import convert_from_path

	_LOGGER = logging.getLogger(__name__)

	# Tuning constants. Values are the ones the original code defined inline.
	_BLUR_KERNEL_SIZE = (17, 17)
	_MAX_COLOR_VAL = 255
	_BLOCK_SIZE = 15
	_SUBTRACT_FROM_MEAN = -2
	_SCALE = 5
	_MIN_TABLE_AREA = 1e5
	_MIN_RECT_WIDTH = 40
	_MIN_RECT_HEIGHT = 10
	_NUM_PX_COMMA = 6
	_MIN_CHAR_AREA = 5 * 9
	_TESSERACT_CONFIG = r'--oem 3 --psm 6'
	_CSV_FILENAME = "table_output.csv"


	def _binarize(gray):
	    """Adaptive-threshold the bitwise-inverted image into a 0/255 binary image."""
	    return cv2.adaptiveThreshold(
	        ~gray,
	        _MAX_COLOR_VAL,
	        cv2.ADAPTIVE_THRESH_MEAN_C,
	        cv2.THRESH_BINARY,
	        _BLOCK_SIZE,
	        _SUBTRACT_FROM_MEAN,
	    )


	def _grid_line_mask(gray):
	    """Return a binary mask of the long horizontal and vertical lines in *gray*.

	    The image is blurred, binarized, morphologically opened with one long
	    horizontal and one long vertical kernel, and the surviving lines are
	    dilated so that neighbouring segments join up.
	    """
	    # sigmaX/sigmaY are passed by keyword: the 4th positional parameter of
	    # cv2.GaussianBlur is ``dst``, not ``sigmaY``.
	    blurred = cv2.GaussianBlur(gray, _BLUR_KERNEL_SIZE, sigmaX=0, sigmaY=0)
	    img_bin = _binarize(blurred)

	    # NOTE(review): the original unpacked ``shape`` as (width, height), but
	    # NumPy order is (rows, cols). The horizontal kernel is therefore sized
	    # from the image height and the vertical one from the width. Kept as-is
	    # to preserve the tuned detection behaviour -- confirm before swapping.
	    n_rows, n_cols = img_bin.shape
	    horizontal_kernel = cv2.getStructuringElement(
	        cv2.MORPH_RECT, (max(1, int(n_rows / _SCALE)), 1)
	    )
	    vertical_kernel = cv2.getStructuringElement(
	        cv2.MORPH_RECT, (1, max(1, int(n_cols / _SCALE)))
	    )
	    horizontal_lines = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, horizontal_kernel)
	    vertical_lines = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, vertical_kernel)

	    horizontal_lines = cv2.dilate(
	        horizontal_lines, cv2.getStructuringElement(cv2.MORPH_RECT, (40, 1))
	    )
	    vertical_lines = cv2.dilate(
	        vertical_lines, cv2.getStructuringElement(cv2.MORPH_RECT, (1, 60))
	    )
	    # bitwise_or instead of ``+``: uint8 addition wraps 255 + 255 to 254.
	    return cv2.bitwise_or(horizontal_lines, vertical_lines)


	def _find_table_images(gray_page):
	    """Return crops of *gray_page* for every table-sized grid, top to bottom."""
	    contours, _hierarchy = cv2.findContours(
	        _grid_line_mask(gray_page), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
	    )
	    table_rects = []
	    for contour in contours:
	        if cv2.contourArea(contour) <= _MIN_TABLE_AREA:
	            continue
	        epsilon = 0.1 * cv2.arcLength(contour, True)
	        table_rects.append(cv2.boundingRect(cv2.approxPolyDP(contour, epsilon, True)))
	    # Emit tables in reading order so CSV rows follow the page layout.
	    table_rects.sort(key=lambda r: (r[1], r[0]))
	    return [gray_page[y:y + h, x:x + w] for x, y, w, h in table_rects]


	def _group_rects_into_rows(rects):
	    """Group ``(x, y, w, h)`` rectangles into rows ordered top to bottom.

	    A rectangle joins the row seeded by the first remaining rectangle when
	    its vertical centre lies strictly between the seed's top and bottom
	    edges. Each row is sorted left to right by ``x``; rows are sorted by the
	    mean vertical centre of their rectangles.
	    """
	    def center_y(rect):
	        return rect[1] + rect[3] / 2

	    def in_row_of(rect, seed):
	        return seed[1] < center_y(rect) < seed[1] + seed[3]

	    rows = []
	    remaining = list(rects)
	    while remaining:
	        seed, rest = remaining[0], remaining[1:]
	        row = [seed] + [r for r in rest if in_row_of(r, seed)]
	        rows.append(sorted(row, key=lambda r: r[0]))
	        remaining = [r for r in rest if not in_row_of(r, seed)]
	    rows.sort(key=lambda row: sum(center_y(r) for r in row) / len(row))
	    return rows


	def _extract_cell_image_rows(table):
	    """Split a cropped table image into rows of cell images.

	    Returns a list of rows (top to bottom), each a list of cell crops (left
	    to right). Returns an empty list when no cell-sized rectangle is found.
	    """
	    contours, _hierarchy = cv2.findContours(
	        _grid_line_mask(table), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE
	    )
	    # NOTE(review): the original built a list of 4-vertex polygons
	    # ("approx_rects") but never used it, so non-rectangular contours were
	    # not actually filtered out. That behaviour is kept here.
	    rects = []
	    for contour in contours:
	        epsilon = 0.05 * cv2.arcLength(contour, True)
	        rects.append(cv2.boundingRect(cv2.approxPolyDP(contour, epsilon, True)))

	    # Drop rectangles that are too narrow or too short to be a cell.
	    rects = [r for r in rects if _MIN_RECT_WIDTH < r[2] and _MIN_RECT_HEIGHT < r[3]]
	    if not rects:
	        return []

	    # The largest rectangle is assumed to be the table outline itself; drop
	    # it so the whole table is not OCR'd as one cell.
	    outline = max(rects, key=lambda r: r[2] * r[3])
	    cell_rects = [r for r in rects if r is not outline]

	    return [
	        [table[y:y + h, x:x + w] for x, y, w, h in row]
	        for row in _group_rects_into_rows(cell_rects)
	    ]


	def _crop_to_text(image):
	    """Crop a cell image to its text and add a 5 px white border.

	    Long horizontal/vertical lines (cell borders) are removed from the
	    binarized cell, small specks are opened away, and the union of the
	    remaining character-sized bounding boxes defines the crop. When nothing
	    character-sized remains, a blank white 20x100 image is used instead.
	    """
	    img_bin = _binarize(image)
	    img_h, img_w = image.shape
	    horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (int(img_w * 0.5), 1))
	    vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, int(img_h * 0.7)))
	    horizontal_lines = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, horizontal_kernel)
	    vertical_lines = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, vertical_kernel)
	    # Saturating OpenCV ops instead of ``+``/``-``: with uint8 NumPy
	    # arithmetic, line intersections wrapped to 254 and then left a residue
	    # of 1 after subtraction, which findContours treats as foreground.
	    lines = cv2.bitwise_or(horizontal_lines, vertical_lines)
	    cleaned = cv2.subtract(img_bin, lines)

	    # Get rid of little noise.
	    kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (3, 3))
	    opened = cv2.morphologyEx(cleaned, cv2.MORPH_OPEN, kernel)
	    opened = cv2.dilate(opened, kernel)

	    contours, _hierarchy = cv2.findContours(opened, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
	    char_rects = [
	        (x, y, w, h)
	        for x, y, w, h in (cv2.boundingRect(c) for c in contours)
	        if w * h > _MIN_CHAR_AREA
	    ]
	    if char_rects:
	        left = min(x for x, _, _, _ in char_rects)
	        top = min(y for _, y, _, _ in char_rects)
	        right = max(x + w for x, _, w, _ in char_rects)
	        bottom = max(y + h for _, y, _, h in char_rects)
	        # Extra rows below the text keep descenders such as commas.
	        cropped = image[top:min(img_h, bottom + _NUM_PX_COMMA), left:min(img_w, right)]
	    else:
	        # If we morphed out all of the text, assume an empty image.
	        cropped = np.full((20, 100), _MAX_COLOR_VAL, dtype=np.uint8)
	    return cv2.copyMakeBorder(cropped, 5, 5, 5, 5, cv2.BORDER_CONSTANT, None, 255)


	def _ocr_cell(cell_image):
	    """Return the OCR text of one cell with newlines removed, or "" on failure."""
	    try:
	        text = pytesseract.image_to_string(
	            _crop_to_text(cell_image), config=_TESSERACT_CONFIG
	        )
	    except Exception:
	        # Best effort, as in the original, but the failure is logged and the
	        # cell is kept as an empty string so later columns do not shift left.
	        _LOGGER.warning("OCR failed for a table cell; leaving it empty", exc_info=True)
	        return ""
	    return text.replace("\n", "")


	def extract_table_from_pdf(pdf_path):
	    """Extract ruled tables from a PDF with OCR and write them to a CSV file.

	    Each page is rasterized with ``convert_from_path`` and converted to
	    grayscale in memory. Every table-sized grid of lines on the page is
	    cropped, split into cells, and each cell is read with Tesseract. The
	    rows of all tables on all pages are written, in page order, to
	    ``table_output.csv`` in the current working directory; the file is
	    overwritten on every call.

	    Args:
	        pdf_path: Path of the PDF, passed straight to ``convert_from_path``.

	    Returns:
	        The name of the CSV file that was written (``"table_output.csv"``).
	        The file is written even when no table is detected; it is then empty.
	    """
	    table_rows = []
	    # Process every page. The original saved each page to the same img.png
	    # and read back only the last one, so earlier pages were ignored.
	    for page in convert_from_path(pdf_path):
	        gray_page = np.array(page.convert("L"))
	        # Work on each table crop. The original looped over the crops but
	        # re-processed the full page instead of using them.
	        for table in _find_table_images(gray_page):
	            for cell_row in _extract_cell_image_rows(table):
	                table_rows.append([_ocr_cell(cell) for cell in cell_row])

	    _LOGGER.debug("Extracted table rows: %s", table_rows)

	    with open(_CSV_FILENAME, mode="w", newline="", encoding="utf-8") as file:
	        csv.writer(file).writerows(table_rows)
	    return _CSV_FILENAME