Abijith committed on
Commit
6bd2a33
1 Parent(s): 714cc46

Upload 3 files

Browse files
Files changed (3) hide show
  1. app.py +116 -0
  2. packages.txt +1 -0
  3. requirements.txt +8 -0
app.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import platform
3
+ import time
4
+ import logging
5
+ from fastapi import FastAPI, UploadFile, File
6
+ import uvicorn
7
+ import pytesseract
8
+ import streamlit as st
9
+ import pandas as pd
10
+ from PIL import Image
11
+ from typing import List
12
+
13
+ from transformers import TableTransformerForObjectDetection, DetrFeatureExtractor
14
+ from codes.table_recognition import TableRecognition
15
+ from codes.table_detection import TableDetection
16
+ from codes.table_preprocessing import TablePreprocessor
17
+ from codes.data_extraction import TextDataExtraction
18
+ from datatypes.config import Config, tesseract_config, model_config
19
+
20
# On Windows, point pytesseract at the tesseract executable configured in
# ``tesseract_config``; on other platforms pytesseract's default is left as is.
if platform.system() == 'Windows':
    pytesseract.pytesseract.tesseract_cmd = tesseract_config['tesseractpath']
22
+
23
+ # Table detection-recognition model loading function
24
@st.experimental_singleton
def load_models():
    """Load the table-detection and table-structure-recognition models.

    Both checkpoints are requested by their Hugging Face Hub identifiers.
    The function is decorated with ``st.experimental_singleton``
    (presumably so the models are built once and shared across Streamlit
    reruns -- confirm against the installed Streamlit version).

    Returns:
        tuple: ``(detection_model, recognition_model)``, both created with
        ``TableTransformerForObjectDetection.from_pretrained``.

    Raises:
        Exception: any error raised while loading either model is logged
            with its traceback and re-raised. The module-level caller
            unpacks the result into two names, so returning ``None`` on
            failure would only surface later as an unrelated ``TypeError``.
    """
    # To load from local checkpoints instead, use the paths in ``model_config``:
    # detection_model = TableTransformerForObjectDetection.from_pretrained(model_config['detection_model_path'])
    # recognition_model = TableTransformerForObjectDetection.from_pretrained(model_config['recognition_model_path'])
    try:
        # Models loaded from the Hugging Face Hub.
        detection_model = TableTransformerForObjectDetection.from_pretrained(
            "microsoft/table-transformer-detection"
        )
        recognition_model = TableTransformerForObjectDetection.from_pretrained(
            "microsoft/table-transformer-structure-recognition"
        )
    except Exception:
        # Keep the original message, but record the traceback and propagate
        # the real cause instead of silently returning None.
        logging.exception('Table detection or recognition model loading is failed!!')
        raise
    return detection_model, recognition_model
38
+
39
# Load both Table Transformer models.
detection_model, recognition_model = load_models()

# Feature extractors: one per model, each with its own resize target.
detection_feature_extractor = DetrFeatureExtractor(
    do_resize=True,
    size=800,
    max_size=800,
)
recognition_feature_extractor = DetrFeatureExtractor(
    do_resize=True,
    size=1000,
    max_size=1000,
)

# Detection and recognition wrappers; thresholds are read from ``Config``.
detection_obj = TableDetection(
    detection_feature_extractor,
    detection_model,
    threshold=Config['table_detection_threshold'],
)
recognition_obj = TableRecognition(
    recognition_feature_extractor,
    recognition_model,
    threshold=Config['table_recognition_threshold'],
)

# Remaining pipeline stages used by the extraction function below.
table_preprocessor = TablePreprocessor()
textdataextractor = TextDataExtraction()
56
+
57
+ # # FastAPI service, kept in case this needs to be deployed as a microservice
58
+ # app = FastAPI()
59
+
60
+ # @app.get("/health")
61
+ # def healthcheck():
62
+ # return "200"
63
+
64
+ # @app.post('/table-data-extraction')
65
+ # def table_data_extraction_from_image(file: UploadFile = File(...)):
66
+ # if not (file.filename.split('.')[-1]).lower() in ("jpg", "jpeg", "png"):
67
+ # return {'Image must be jpg or png format!'}
68
+ # print(f'#---------- Table extractor started {time.strftime("%Y-%m-%d %H:%M:%S")} -----------#')
69
+ # image = Image.open(file.file).convert('RGB')
70
+ # detection_result = detection_obj.table_detection_from_image(image)
71
+ # recognition_result = recognition_obj.table_recognition_from_detection(image, detection_result)
72
+ # preprocessed_tables = table_preprocessor.table_structure_sorting(recognition_result)
73
+ # exracted_table_data = textdataextractor.cell_data_extraction(image, preprocessed_tables)
74
+ # print(f'#---------- Table extractor ended {time.strftime("%Y-%m-%d %H:%M:%S")} -----------#\n')
75
+ # return exracted_table_data
76
+
77
def convert_to_df(extracted_object):
    """Show every extracted table as a DataFrame in the page container.

    Args:
        extracted_object: object exposing ``tables``; each table exposes
            ``extracted_rows``, each row exposes ``extracted_cells``, and
            each cell exposes ``value``.

    Writes to the module-level ``container`` (created in the ``__main__``
    section). For each table it writes a numbered caption (numbering starts
    at 1), the table as a DataFrame, and a newline. If there are no tables
    it writes a single notice instead. Returns ``None``.
    """
    logging.info(f'#---------- Table visualization started {time.strftime("%Y-%m-%d %H:%M:%S")} -----------#')

    tables = extracted_object.tables
    if len(tables) == 0:
        container.write('No tables are predicted!!!!')
        return

    for position, table in enumerate(tables, start=1):
        # One inner list per row, holding that row's cell values in order.
        grid = [
            [cell.value for cell in row.extracted_cells]
            for row in table.extracted_rows
        ]
        frame = pd.DataFrame(grid)
        container.write(f'Extracted tabel: {position}')
        container.dataframe(frame)
        container.write('\n')
97
+
98
def table_data_extraction_from_image1(file):
    """Run the table-extraction pipeline on one image and display the result.

    Args:
        file: anything ``PIL.Image.open`` accepts; in this app it is the
            object returned by the sidebar file uploader.

    Steps, in order: open the image and convert it to RGB, detect tables,
    recognise their structure, sort the structure, extract the cell data,
    then pass the extracted data to ``convert_to_df`` for display.

    Returns:
        The object returned by ``textdataextractor.cell_data_extraction``.
    """
    logging.info(f'#---------- Table extractor started {time.strftime("%Y-%m-%d %H:%M:%S")} -----------#')

    rgb_image = Image.open(file).convert('RGB')
    detected = detection_obj.table_detection_from_image(rgb_image)
    structure = recognition_obj.table_recognition_from_detection(rgb_image, detected)
    ordered_tables = table_preprocessor.table_structure_sorting(structure)
    extracted = textdataextractor.cell_data_extraction(rgb_image, ordered_tables)

    convert_to_df(extracted)

    logging.info(f'#---------- Table extractor ended {time.strftime("%Y-%m-%d %H:%M:%S")} -----------#\n')
    return extracted
109
if __name__ == '__main__':
    # Page header.
    st.title('Table detection and recognition')
    st.write('Table data extraction application with help of microsoft detr models.')

    # Sidebar: image upload, plus a Predict button once a file is present.
    image = st.sidebar.file_uploader(
        label='Upload image file for data extraction',
        type=['png', 'jpg', 'jpeg', 'tif'],
    )
    if image:
        # The extraction runs as the button's on_click callback and receives
        # the uploaded file as its only argument.
        result = st.sidebar.button(
            label='Predict',
            on_click=table_data_extraction_from_image1,
            args=(image,),
        )

    # NOTE(review): the original indentation was lost in extraction; the
    # container is assumed to be created unconditionally (not inside
    # ``if image:``) -- confirm against the real file.
    # ``convert_to_df`` writes its output into this module-level container.
    container = st.container()
    container.subheader('Extracted tables :snowflake:')
packages.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ tesseract-ocr
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ transformers
2
+ timm
3
+ fastapi
4
+ uvicorn
5
+ python-multipart
6
+ opencv-python
7
+ pytesseract
8
+ streamlit