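"""Extract a ruled table from a PDF and write its contents to a CSV file.

Pipeline: render the PDF to an image (pdf2image/Poppler), binarize it and
isolate long horizontal and vertical lines with OpenCV morphology to locate
the table and its cells, OCR each cell with Tesseract (pytesseract), and
save the resulting rows with the csv module.
"""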
from pdf2image import convert_from_path
import cv2
import numpy as np
import pytesseract
import math
import csv
def extract_table_from_pdf(pdf_path):
    # Convert the PDF pages to images and save as PNG. Each page overwrites
    # "img.png", so only the last page is processed below.
    images = convert_from_path(pdf_path)
    for page in images:
        page.save("img.png", "PNG")

    # Load the saved page image in grayscale.
    image = cv2.imread("img.png", cv2.IMREAD_GRAYSCALE)
    # Binarize the page and isolate long horizontal and vertical lines; their
    # union forms a mask of the table borders.
    BLUR_KERNEL_SIZE = (17, 17)
    STD_DEV_X_DIRECTION = 0
    STD_DEV_Y_DIRECTION = 0
    blurred = cv2.GaussianBlur(image, BLUR_KERNEL_SIZE, STD_DEV_X_DIRECTION, sigmaY=STD_DEV_Y_DIRECTION)
    MAX_COLOR_VAL = 255
    BLOCK_SIZE = 15
    SUBTRACT_FROM_MEAN = -2
    img_bin = cv2.adaptiveThreshold(
        ~blurred,
        MAX_COLOR_VAL,
        cv2.ADAPTIVE_THRESH_MEAN_C,
        cv2.THRESH_BINARY,
        BLOCK_SIZE,
        SUBTRACT_FROM_MEAN,
    )
    SCALE = 5
    image_height, image_width = img_bin.shape  # numpy shape is (rows, cols)
    horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (int(image_width / SCALE), 1))
    horizontally_opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, horizontal_kernel)
    vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, int(image_height / SCALE)))
    vertically_opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, vertical_kernel)
    horizontally_dilated = cv2.dilate(horizontally_opened, cv2.getStructuringElement(cv2.MORPH_RECT, (40, 1)))
    vertically_dilated = cv2.dilate(vertically_opened, cv2.getStructuringElement(cv2.MORPH_RECT, (1, 60)))
    mask = horizontally_dilated + vertically_dilated

    # Keep only large contours; these are assumed to be tables.
    contours, hierarchy = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    MIN_TABLE_AREA = 1e5
    contours = [c for c in contours if cv2.contourArea(c) > MIN_TABLE_AREA]
    perimeter_lengths = [cv2.arcLength(c, True) for c in contours]
    epsilons = [0.1 * p for p in perimeter_lengths]
    approx_polys = [cv2.approxPolyDP(c, e, True) for c, e in zip(contours, epsilons)]
    bounding_rects = [cv2.boundingRect(a) for a in approx_polys]
    table_images = [image[y:y + h, x:x + w] for x, y, w, h in bounding_rects]
    # Repeat the binarization and line detection on each detected table to
    # find the borders of its individual cells.
    for table in table_images:
        BLUR_KERNEL_SIZE = (17, 17)
        STD_DEV_X_DIRECTION = 0
        STD_DEV_Y_DIRECTION = 0
        blurred = cv2.GaussianBlur(table, BLUR_KERNEL_SIZE, STD_DEV_X_DIRECTION, sigmaY=STD_DEV_Y_DIRECTION)
        MAX_COLOR_VAL = 255
        BLOCK_SIZE = 15
        SUBTRACT_FROM_MEAN = -2
        img_bin = cv2.adaptiveThreshold(
            ~blurred,
            MAX_COLOR_VAL,
            cv2.ADAPTIVE_THRESH_MEAN_C,
            cv2.THRESH_BINARY,
            BLOCK_SIZE,
            SUBTRACT_FROM_MEAN,
        )
        SCALE = 5
        table_height, table_width = img_bin.shape  # numpy shape is (rows, cols)
        horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (int(table_width / SCALE), 1))
        horizontally_opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, horizontal_kernel)
        vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, int(table_height / SCALE)))
        vertically_opened = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, vertical_kernel)
        horizontally_dilated = cv2.dilate(horizontally_opened, cv2.getStructuringElement(cv2.MORPH_RECT, (40, 1)))
        vertically_dilated = cv2.dilate(vertically_opened, cv2.getStructuringElement(cv2.MORPH_RECT, (1, 60)))
        mask = horizontally_dilated + vertically_dilated
        contours, hierarchy = cv2.findContours(
            mask, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE,
        )
        perimeter_lengths = [cv2.arcLength(c, True) for c in contours]
        epsilons = [0.05 * p for p in perimeter_lengths]
        approx_polys = [cv2.approxPolyDP(c, e, True) for c, e in zip(contours, epsilons)]
        # Filter out contours that aren't rectangular; those are probably noise.
        approx_rects = [p for p in approx_polys if len(p) == 4]
        bounding_rects = [cv2.boundingRect(a) for a in approx_rects]
        # Filter out rectangles that are too narrow or too short.
        MIN_RECT_WIDTH = 40
        MIN_RECT_HEIGHT = 10
        bounding_rects = [
            r for r in bounding_rects if MIN_RECT_WIDTH < r[2] and MIN_RECT_HEIGHT < r[3]
        ]
        # The largest bounding rectangle is assumed to be the entire table.
        # Remove it so we don't accidentally OCR the whole table as one cell.
        largest_rect = max(bounding_rects, key=lambda r: r[2] * r[3])
        cells = [b for b in bounding_rects if b is not largest_rect]
        def cell_in_same_row(c1, c2):
            # c1 is in the same row as c2 if c1's vertical center falls
            # between c2's top and bottom edges.
            c1_center = c1[1] + c1[3] / 2
            c2_top = c2[1]
            c2_bottom = c2[1] + c2[3]
            return c2_top < c1_center < c2_bottom

        # Group the cells into rows, sorted left to right.
        rows = []
        while cells:
            first = cells[0]
            rest = cells[1:]
            cells_in_same_row = sorted(
                [c for c in rest if cell_in_same_row(c, first)],
                key=lambda c: c[0],
            )
            row_cells = sorted([first] + cells_in_same_row, key=lambda c: c[0])
            rows.append(row_cells)
            cells = [c for c in rest if not cell_in_same_row(c, first)]

        # Sort rows by the average vertical center of their cells.
        def avg_height_of_center(row):
            centers = [y + h / 2 for x, y, w, h in row]
            return sum(centers) / len(centers)

        rows.sort(key=avg_height_of_center)
        # Crop each cell out of the table image, keeping row order.
        cell_images_rows = []
        for row in rows:
            cell_images_row = []
            for x, y, w, h in row:
                cell_images_row.append(table[y:y + h, x:x + w])
            cell_images_rows.append(cell_images_row)

        n_rows = len(cell_images_rows)
        n_cols = len(cell_images_rows[0]) if n_rows > 0 else 0
        def crop_to_text(cell_image):
            # Binarize the cell, remove any remaining border lines, and crop
            # tightly around the character-sized blobs that are left.
            MAX_COLOR_VAL = 255
            BLOCK_SIZE = 15
            SUBTRACT_FROM_MEAN = -2
            img_bin = cv2.adaptiveThreshold(
                ~cell_image,
                MAX_COLOR_VAL,
                cv2.ADAPTIVE_THRESH_MEAN_C,
                cv2.THRESH_BINARY,
                BLOCK_SIZE,
                SUBTRACT_FROM_MEAN,
            )
            img_h, img_w = cell_image.shape
            horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (int(img_w * 0.5), 1))
            vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, int(img_h * 0.7)))
            horizontal_lines = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, horizontal_kernel)
            vertical_lines = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, vertical_kernel)
            both = horizontal_lines + vertical_lines
            cleaned = img_bin - both
            # Get rid of little noise.
            kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (3, 3))
            opened = cv2.morphologyEx(cleaned, cv2.MORPH_OPEN, kernel)
            opened = cv2.dilate(opened, kernel)
            contours, hierarchy = cv2.findContours(opened, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
            bounding_rects = [cv2.boundingRect(c) for c in contours]
            NUM_PX_COMMA = 6  # extra padding below the text so descenders (e.g. commas) survive
            MIN_CHAR_AREA = 5 * 9
            char_sized_bounding_rects = [
                (x, y, w, h) for x, y, w, h in bounding_rects if w * h > MIN_CHAR_AREA
            ]
            if char_sized_bounding_rects:
                # Take the union of all character-sized boxes.
                minx, miny, maxx, maxy = math.inf, math.inf, 0, 0
                for x, y, w, h in char_sized_bounding_rects:
                    minx = min(minx, x)
                    miny = min(miny, y)
                    maxx = max(maxx, x + w)
                    maxy = max(maxy, y + h)
                x, y, w, h = minx, miny, maxx - minx, maxy - miny
                cropped = cell_image[y:min(img_h, y + h + NUM_PX_COMMA), x:min(img_w, x + w)]
            else:
                # If we morphed out all of the text, assume an empty cell.
                cropped = MAX_COLOR_VAL * np.ones(shape=(20, 100), dtype=np.uint8)
            bordered = cv2.copyMakeBorder(cropped, 5, 5, 5, 5, cv2.BORDER_CONSTANT, value=255)
            return bordered
        def extract_text_from_cells(cell_images_rows):
            # OCR each cell in row-major order, then regroup the flat list of
            # strings into rows of n_cols values.
            table_data = []
            i = 0
            j = 0
            for _ in range(n_rows * n_cols):
                try:
                    cropped = crop_to_text(cell_images_rows[i][j])
                    text = pytesseract.image_to_string(cropped, config=r'--oem 3 --psm 6').replace("\n", "")
                except Exception:
                    # Skip cells that are missing or fail to crop/OCR.
                    j += 1
                    if j == n_cols:
                        j = 0
                        i += 1
                    continue
                table_data.append(text)
                j += 1
                if j == n_cols:
                    j = 0
                    i += 1
            if not table_data:
                return []
            merged_rows = []
            for k in range(0, len(table_data), n_cols):
                merged_rows.append(table_data[k:k + n_cols])
            return merged_rows
        # OCR the first detected table, write its contents to a CSV file, and
        # return the path to that file.
        table_data = extract_text_from_cells(cell_images_rows)
        csv_filename = "table_output.csv"
        with open(csv_filename, mode="w", newline="", encoding="utf-8") as file:
            writer = csv.writer(file)
            writer.writerows(table_data)
        return csv_filename
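

# A minimal usage sketch (not part of the original script): it assumes Poppler
# and the Tesseract binary are installed, and "sample.pdf" is a placeholder
# path to a PDF that contains a ruled table.
if __name__ == "__main__":
    output_csv = extract_table_from_pdf("sample.pdf")
    print(f"Table written to {output_csv}")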