Abijith committed on
Commit
38744b1
1 Parent(s): fc047c9

Upload 5 files

Browse files
codes/data_extraction.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import pytesseract
4
+ from pytesseract import Output
5
+ from datatypes.datatypes import Row, Cell
6
+ from codes.image_processing import ImageProcessor
7
+ from datatypes.config import Config
8
+
9
class TextDataExtraction():
    """OCR the cells of detected tables and attach the cleaned text to them.

    Cells are derived by intersecting every recognized row with every
    recognized column of a table; each cell image is passed to Tesseract
    and the resulting text is stored in ``Cell`` objects grouped per ``Row``.
    """

    def __init__(self):
        pass

    def clean_ocr_data(self, value):
        """Return *value* reduced to alphanumerics, spaces and dots, trimmed.

        Args:
            value: Raw OCR text for a single cell.

        Returns:
            The filtered text with leading/trailing whitespace removed.
        """
        kept = ''.join(
            ch for ch in value if ch == ' ' or ch == '.' or ch.isalnum()
        )
        # str.strip() returns a new string; its result was previously
        # discarded, so surrounding spaces leaked into the cell values.
        return kept.strip()

    def pytess(self, cell_pil_img):
        """Run Tesseract on one cell image and return its text as one string.

        Args:
            cell_pil_img: Image of a single cell (passed straight to
                ``pytesseract.image_to_data``).

        Returns:
            The entries of the ``'text'`` field of the OCR result joined
            with single spaces and stripped.
        """
        # NOTE(review): the trailing ``preserve_interword_spaces`` token is
        # given as a bare word, not as ``-c preserve_interword_spaces=1``;
        # Tesseract may be reading it as a config-file name. Left unchanged
        # on purpose -- confirm the intent before touching the OCR config.
        ocr_result = pytesseract.image_to_data(
            cell_pil_img,
            output_type=Output.DICT,
            config='-c tessedit_char_blacklist=œ˜â€œï¬â™Ã©œ¢!|”?«“¥ --psm 6 preserve_interword_spaces',
        )
        return ' '.join(ocr_result['text']).strip()

    def cell_data_extraction(self, image, table_data):
        """Fill ``extracted_rows`` of every table in *table_data* with OCR text.

        For each table the detection box is cropped from *image* and padded
        with ``Config['table_padd']``. Every recognized row is cropped from
        that padded table image and split along the recognized columns; each
        resulting cell is OCR'd and cleaned.

        Args:
            image: Page image exposing ``crop``; the detection boxes refer
                to its coordinates.
            table_data: Object whose ``tables`` each provide
                ``detection_box``, ``ordered_recognitiondata`` and
                ``extracted_rows``.

        Returns:
            The same *table_data* object, updated in place.
        """
        for table in table_data.tables:
            padding = Config['table_padd']
            tableimg_processor = ImageProcessor()
            table_image = image.crop(table.detection_box)
            table_image = tableimg_processor.image_padding(table_image, padd=padding)

            ordered = table.ordered_recognitiondata[0]
            columns = ordered.recognized_column
            last_col_idx = len(columns) - 1

            for row_idx, table_row in enumerate(ordered.recognized_row):
                row_obj = Row([])
                xmin_row, ymin_row, xmax_row, ymax_row, _, _ = table_row

                row_image = table_image.crop((xmin_row, ymin_row, xmax_row, ymax_row))
                row_width, row_height = row_image.size
                row_obj.rowindex = row_idx

                for indx, table_column in enumerate(columns):
                    cell_obj = Cell()
                    xmin_col, _, xmax_col, _, _, _ = table_column

                    # Column x-coordinates are shifted by the table padding.
                    xa = xmin_col - padding
                    xb = xmax_col - padding
                    # The outermost cells are stretched to the row edges so
                    # that no text at the table border is cut off.
                    if indx == 0:
                        xa = 0
                    if indx == last_col_idx:
                        xb = row_width

                    # A cell always spans the full height of its row.
                    cell_img = row_image.crop((xa, 0, xb, row_height))

                    cell_value = self.pytess(cell_img)

                    cell_obj.cellindex = indx
                    cell_obj.value = self.clean_ocr_data(cell_value)

                    row_obj.extracted_cells.append(cell_obj)
                table.extracted_rows.append(row_obj)

        return table_data
codes/image_processing.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import cv2
3
+ import numpy as np
4
+ from PIL import Image
5
+
6
+ # Some image-processing techniques to improve the images.
7
class ImageProcessor():
    """Helpers that convert, pad, sharpen and binarize images."""

    def __init__(self):
        pass

    def PIL_to_cv2(self, pil_img):
        """Convert a PIL image to an OpenCV array (RGB -> BGR channel order).

        NOTE(review): the RGB2BGR conversion presumably requires a
        3-channel input -- confirm callers never pass grayscale images.
        """
        return cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2BGR)

    def cv2_to_PIL(self, cv_img):
        """Convert an OpenCV array to a PIL image (BGR -> RGB channel order)."""
        return Image.fromarray(cv2.cvtColor(cv_img, cv2.COLOR_BGR2RGB))

    def image_padding(self, image, padd):
        """Return a copy of *image* surrounded by a white border.

        The border avoids losing table content that touches the image edge.

        Args:
            image: PIL image to pad.
            padd: Border width in pixels, added on all four sides.

        Returns:
            A new image of the same mode, enlarged by ``2 * padd`` in both
            dimensions, with *image* pasted at ``(padd, padd)``.
        """
        # Imported locally so this method does not rely on a module-level
        # import of ImageColor.
        from PIL import ImageColor

        width, height = image.size
        new_size = (width + (2 * padd), height + (2 * padd))
        # Derive the fill from the image mode: a fixed (255, 255, 255) tuple
        # is rejected by Pillow for single-band modes such as 'L' or '1'.
        fill = ImageColor.getcolor('white', image.mode)
        result = Image.new(image.mode, new_size, fill)
        result.paste(image, (padd, padd))
        return result

    def sharpen_image(self, pil_img):
        """Sharpen *pil_img* with a 3x3 sharpening kernel and return a PIL image."""
        img = self.PIL_to_cv2(pil_img)
        sharpen_kernel = np.array([[-1, -1, -1],
                                   [-1, 9, -1],
                                   [-1, -1, -1]])

        # ddepth=-1 keeps the output depth equal to the input depth.
        sharpened = cv2.filter2D(img, -1, sharpen_kernel)
        return self.cv2_to_PIL(sharpened)

    def binarizeBlur_image(self, pil_img):
        """Threshold *pil_img*, blur the result lightly and invert it back.

        NOTE(review): the threshold value is 0 with THRESH_BINARY_INV and no
        automatic (e.g. Otsu) flag, so only exactly-zero values survive the
        inverse threshold -- confirm this is the intended binarization.
        """
        image = self.PIL_to_cv2(pil_img)
        thresh = cv2.threshold(image, 0, 255, cv2.THRESH_BINARY_INV)[1]

        blurred = cv2.GaussianBlur(thresh, (3, 3), 0)
        # Undo the inversion introduced by THRESH_BINARY_INV.
        restored = 255 - blurred
        return self.cv2_to_PIL(restored)

    def whole_image_processing(self, pil_img):
        """Apply sharpening followed by binarize-and-blur to *pil_img*."""
        sharpen_img = self.sharpen_image(pil_img)
        return self.binarizeBlur_image(sharpen_img)
codes/table_detection.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ from datatypes.datatypes import ImageData
4
+ from datatypes.datatypes import TableDetectionData
5
+
6
class TableDetection():
    """Detect tables on a page image with an object-detection model."""

    # Score cut-off used when no threshold is configured (previous fixed value).
    _DEFAULT_THRESHOLD = 0.3

    def __init__(self, feature_extractor, detection_model, threshold):
        """Store the model components and the detection score threshold.

        Args:
            feature_extractor: Callable that encodes an image for the model
                and provides ``post_process_object_detection``.
            detection_model: Model invoked with the encoded image.
            threshold: Minimum score for a detection to be kept; ``None``
                falls back to 0.3.
        """
        self.feature_extractor = feature_extractor
        self.detection_model = detection_model
        self.threshold = threshold

    def table_detection_from_image(self, detection_image):
        """Run table detection on *detection_image*.

        Args:
            detection_image: Image exposing ``size`` as ``(width, height)``.

        Returns:
            An ``ImageData`` whose ``tables`` list holds one
            ``TableDetectionData`` (score, label, box) per detection.
        """
        table_data_extraction = ImageData([])
        image_width, image_height = detection_image.size

        detection_encoding = self.feature_extractor(detection_image, return_tensors='pt')
        detection_output = self.detection_model(**detection_encoding)

        # Honour the threshold passed to the constructor; it used to be
        # stored but silently replaced by a hard-coded 0.3 here.
        threshold = self._DEFAULT_THRESHOLD if self.threshold is None else self.threshold
        detection_results = self.feature_extractor.post_process_object_detection(
            detection_output,
            threshold=threshold,
            # target_sizes is given as (height, width).
            target_sizes=[(image_height, image_width)],
        )[0]

        # Copy the detections into plain-Python containers.
        detections = zip(
            detection_results['scores'].tolist(),
            detection_results['labels'].tolist(),
            detection_results['boxes'].tolist(),
        )
        for score, label, bbox in detections:
            detection_table_results = TableDetectionData()
            detection_table_results.detection_score = score
            detection_table_results.detection_label = label
            detection_table_results.detection_box = bbox
            table_data_extraction.tables.append(detection_table_results)
        return table_data_extraction
28
+
codes/table_preprocessing.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from datatypes.datatypes import DetectionLabels, TableRecognitionOrdered
3
+
4
class TablePreprocessor():
    """Order the recognized rows and columns of each table."""

    def __init__(self):
        pass

    def table_structure_sorting(self, table_data):
        """Split recognition results into sorted rows and columns per table.

        For every table, the first entry of ``recognitiondata`` is read and
        each box labelled as a table row or table column is collected as
        ``[*box, label, score]``. Rows are sorted by the second box value
        and columns by the first; the result is appended to
        ``table.ordered_recognitiondata``.

        Args:
            table_data: Object whose ``tables`` each provide
                ``recognitiondata`` and ``ordered_recognitiondata``.

        Returns:
            The same *table_data* object, updated in place.
        """
        for table in table_data.tables:
            recognition = table.recognitiondata[0]
            row_label = DetectionLabels.table_row.value
            column_label = DetectionLabels.table_column.value

            recognized_row = []
            recognized_column = []
            recognized_ord_obj = TableRecognitionOrdered([])

            for score, label, box in zip(recognition.scores, recognition.labels, recognition.boxes):
                # Build a new list instead of appending to ``box`` itself:
                # aliasing the box mutated the stored recognition data, so a
                # second pass would have produced over-long entries.
                if label == row_label:
                    recognized_row.append([*box, label, score])
                elif label == column_label:
                    recognized_column.append([*box, label, score])

            recognized_row.sort(key=lambda entry: entry[1])
            recognized_column.sort(key=lambda entry: entry[0])

            recognized_ord_obj.recognized_row = recognized_row
            recognized_ord_obj.recognized_column = recognized_column
            table.ordered_recognitiondata.append(recognized_ord_obj)

        return table_data
codes/table_recognition.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ from datatypes.datatypes import TableRecognitionData, TableDetectionData
4
+ from codes.image_processing import ImageProcessor
5
+ from datatypes.config import Config
6
+
7
class TableRecognition:
    """Recognize the structure of already-detected tables."""

    # Score cut-off used when no threshold is configured (previous fixed value).
    _DEFAULT_THRESHOLD = 0.7

    def __init__(self, feature_extractor, recognition_model, threshold):
        """Store the model components and the recognition score threshold.

        Args:
            feature_extractor: Callable that encodes an image for the model
                and provides ``post_process_object_detection``.
            recognition_model: Model invoked with the encoded table image.
            threshold: Minimum score for a recognized element to be kept;
                ``None`` falls back to 0.7.
        """
        self.feature_extractor = feature_extractor
        self.recognition_model = recognition_model
        self.threshold = threshold

    def table_recognition_from_detection(self, recognition_image, detection_results):
        """Run structure recognition on every detected table.

        Each table's detection box is cropped from *recognition_image*,
        padded with ``Config['table_padd']`` and passed through the model.
        The scores, labels and boxes are stored in a ``TableRecognitionData``
        that is appended to ``table.recognitiondata``.

        Args:
            recognition_image: Page image exposing ``crop``.
            detection_results: Object whose ``tables`` each provide
                ``detection_box`` and ``recognitiondata``.

        Returns:
            The same *detection_results* object, updated in place.
        """
        # Honour the threshold passed to the constructor; it used to be
        # stored but silently replaced by a hard-coded 0.7 below.
        threshold = self._DEFAULT_THRESHOLD if self.threshold is None else self.threshold

        for table in detection_results.tables:
            recognised_table_results = TableRecognitionData()
            detected_tbl = recognition_image.crop(table.detection_box)
            img_processor = ImageProcessor()
            padded_table = img_processor.image_padding(image=detected_tbl, padd=Config['table_padd'])
            width, height = padded_table.size

            recognition_encoding = self.feature_extractor(padded_table, return_tensors='pt')
            recognition_output = self.recognition_model(**recognition_encoding)
            recognition_results = self.feature_extractor.post_process_object_detection(
                recognition_output,
                threshold=threshold,
                # target_sizes is given as (height, width) of the padded crop.
                target_sizes=[(height, width)],
            )[0]

            recognised_table_results.scores = recognition_results['scores'].tolist()
            recognised_table_results.labels = recognition_results['labels'].tolist()
            recognised_table_results.boxes = recognition_results['boxes'].tolist()

            table.recognitiondata.append(recognised_table_results)
        return detection_results