Spaces:
Runtime error
Upload 3 files
Browse files
- app.py +116 -0
- packages.txt +1 -0
- requirements.txt +8 -0
app.py
ADDED
@@ -0,0 +1,116 @@
import os
import platform
import time
import logging
from fastapi import FastAPI, UploadFile, File
import uvicorn
import pytesseract
import streamlit as st
import pandas as pd
from PIL import Image
from typing import List

from transformers import TableTransformerForObjectDetection, DetrFeatureExtractor
from codes.table_recognition import TableRecognition
from codes.table_detection import TableDetection
from codes.table_preprocessing import TablePreprocessor
from codes.data_extraction import TextDataExtraction
from datatypes.config import Config, tesseract_config, model_config

# On Windows, pytesseract needs an explicit path to the Tesseract binary.
if platform.system() == 'Windows':
    pytesseract.pytesseract.tesseract_cmd = tesseract_config['tesseractpath']

# Table detection/recognition model loading function, cached so the models
# are only downloaded and instantiated once per session.
@st.experimental_singleton
def load_models():
    try:
        # Models loading from local checkpoints:
        # detection_model = TableTransformerForObjectDetection.from_pretrained(model_config['detection_model_path'])
        # recognition_model = TableTransformerForObjectDetection.from_pretrained(model_config['recognition_model_path'])

        # Models loading from the Hugging Face Hub
        detection_model = TableTransformerForObjectDetection.from_pretrained("microsoft/table-transformer-detection")
        recognition_model = TableTransformerForObjectDetection.from_pretrained("microsoft/table-transformer-structure-recognition")

        return detection_model, recognition_model
    except Exception:
        print('Table detection or recognition model loading failed!')
        raise

# Models loading
detection_model, recognition_model = load_models()

# Detection feature extractor
detection_feature_extractor = DetrFeatureExtractor(do_resize=True, size=800, max_size=800)
# Recognition feature extractor
recognition_feature_extractor = DetrFeatureExtractor(do_resize=True, size=1000, max_size=1000)

# Config values for detection and recognition thresholds come from the shared Config.
# Detection object
detection_obj = TableDetection(detection_feature_extractor, detection_model, threshold=Config['table_detection_threshold'])

# Recognition object
recognition_obj = TableRecognition(recognition_feature_extractor, recognition_model, threshold=Config['table_recognition_threshold'])

table_preprocessor = TablePreprocessor()
textdataextractor = TextDataExtraction()

# FastAPI service, kept here in case this needs to run as a microservice
# instead of a Streamlit app.
# app = FastAPI()

# @app.get("/health")
# def healthcheck():
#     return "200"

# @app.post('/table-data-extraction')
# def table_data_extraction_from_image(file: UploadFile = File(...)):
#     if not (file.filename.split('.')[-1]).lower() in ("jpg", "jpeg", "png"):
#         return {'Image must be jpg or png format!'}
#     print(f'#---------- Table extractor started {time.strftime("%Y-%m-%d %H:%M:%S")} -----------#')
#     image = Image.open(file.file).convert('RGB')
#     detection_result = detection_obj.table_detection_from_image(image)
#     recognition_result = recognition_obj.table_recognition_from_detection(image, detection_result)
#     preprocessed_tables = table_preprocessor.table_structure_sorting(recognition_result)
#     extracted_table_data = textdataextractor.cell_data_extraction(image, preprocessed_tables)
#     print(f'#---------- Table extractor ended {time.strftime("%Y-%m-%d %H:%M:%S")} -----------#\n')
#     return extracted_table_data

def convert_to_df(extracted_object):
    """Render every extracted table as a pandas DataFrame in the Streamlit container."""
    logging.info(f'#---------- Table visualization started {time.strftime("%Y-%m-%d %H:%M:%S")} -----------#')

    def _show_outputdf(table_list: List[List], table_number: int):
        op_df = pd.DataFrame(table_list)
        container.write(f'Extracted table: {table_number}')
        container.dataframe(op_df)
        container.write('\n')

    if len(extracted_object.tables) != 0:
        table_no = 1
        for table in extracted_object.tables:
            table_list = []
            for row in table.extracted_rows:
                row_list = []
                for cell in row.extracted_cells:
                    row_list.append(cell.value)
                table_list.append(row_list)
            _show_outputdf(table_list=table_list, table_number=table_no)
            table_no += 1
    else:
        container.write('No tables were predicted!')

def table_data_extraction_from_image1(file):
    """Full pipeline: detect tables, recognize structure, OCR cell contents, render results."""
    logging.info(f'#---------- Table extractor started {time.strftime("%Y-%m-%d %H:%M:%S")} -----------#')
    image = Image.open(file).convert('RGB')
    detection_result = detection_obj.table_detection_from_image(image)
    recognition_result = recognition_obj.table_recognition_from_detection(image, detection_result)
    preprocessed_tables = table_preprocessor.table_structure_sorting(recognition_result)
    extracted_table_data = textdataextractor.cell_data_extraction(image, preprocessed_tables)
    convert_to_df(extracted_table_data)

    logging.info(f'#---------- Table extractor ended {time.strftime("%Y-%m-%d %H:%M:%S")} -----------#\n')
    return extracted_table_data

if __name__ == '__main__':
    st.title('Table detection and recognition')
    st.write("Table data extraction application built with Microsoft's DETR-based Table Transformer models.")
    image = st.sidebar.file_uploader(label='Upload image file for data extraction', type=['png', 'jpg', 'jpeg', 'tif'])
    if image:
        result = st.sidebar.button(label='Predict', on_click=table_data_extraction_from_image1, args=(image,))
    container = st.container()
    container.subheader('Extracted tables :snowflake:')
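The `TableDetection` and `TableRecognition` wrappers imported from `codes/` (and the `Config` objects from `datatypes/`) are not part of this commit, so their internals are not visible here. For orientation only, a table-detection call with these models typically looks something like the sketch below; the function name `detect_tables` and its return format are illustrative assumptions rather than the project's actual API, and PyTorch is assumed to be available (it is pulled in transitively via `timm`).

```python
# Illustrative sketch only -- not the project's codes/table_detection.py.
import torch
from PIL import Image
from transformers import DetrFeatureExtractor, TableTransformerForObjectDetection

def detect_tables(image: Image.Image, threshold: float = 0.7):
    """Return score/label/box dicts for tables found in a PIL image."""
    feature_extractor = DetrFeatureExtractor(do_resize=True, size=800, max_size=800)
    model = TableTransformerForObjectDetection.from_pretrained(
        "microsoft/table-transformer-detection"
    )
    inputs = feature_extractor(images=image, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    # Map raw logits/boxes back to absolute image coordinates and drop low scores.
    target_sizes = torch.tensor([image.size[::-1]])  # (height, width)
    results = feature_extractor.post_process_object_detection(
        outputs, threshold=threshold, target_sizes=target_sizes
    )[0]
    return [
        {"score": score.item(), "label": model.config.id2label[label.item()], "box": box.tolist()}
        for score, label, box in zip(results["scores"], results["labels"], results["boxes"])
    ]
```

The structure-recognition step would work the same way with the `microsoft/table-transformer-structure-recognition` checkpoint, except its labels describe rows, columns, and header cells rather than whole tables.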
packages.txt
ADDED
@@ -0,0 +1 @@
tesseract-ocr
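On Hugging Face Spaces, `packages.txt` lists Debian packages installed with apt at build time, so this one-line file is what provides the Tesseract binary that `pytesseract` shells out to. A quick, optional sanity check (not part of the app) might look like this:

```python
# Optional check that the tesseract-ocr binary from packages.txt is on PATH.
import pytesseract

try:
    print("Tesseract version:", pytesseract.get_tesseract_version())
except pytesseract.TesseractNotFoundError:
    print("tesseract-ocr is not installed or not on PATH")
```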
requirements.txt
ADDED
@@ -0,0 +1,8 @@
transformers
timm
fastapi
uvicorn
python-multipart
opencv-python
pytesseract
streamlit
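For reference: `transformers` and `timm` are required for the Table Transformer models (timm also pulls in PyTorch); `fastapi`, `uvicorn`, and `python-multipart` support the commented-out microservice variant in app.py; `opencv-python` and `pytesseract` are presumably used by the preprocessing and OCR code under `codes/`; and `streamlit` serves the UI. Hugging Face Spaces installs these with pip when the Space is built.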