import os
import platform
import time
import logging
from fastapi import FastAPI, UploadFile, File
import uvicorn
import pytesseract
import streamlit as st
import pandas as pd
from PIL import Image
from typing import List

from transformers import TableTransformerForObjectDetection, DetrFeatureExtractor
from codes.table_recognition import TableRecognition
from codes.table_detection import TableDetection
from codes.table_preprocessing import TablePreprocessor
from codes.data_extraction import TextDataExtraction
from datatypes.config import Config, tesseract_config, model_config

if platform.system() == 'Windows':
    pytesseract.pytesseract.tesseract_cmd = tesseract_config['tesseractpath']
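
# The root logger suppresses INFO messages by default; enable them so the
# pipeline timing logs below are visible.
logging.basicConfig(level=logging.INFO)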

# Table detection/recognition model loading, cached across Streamlit reruns
@st.experimental_singleton
def load_models():
    try:
        # Load models from local checkpoints
        # detection_model = TableTransformerForObjectDetection.from_pretrained(model_config['detection_model_path'])
        # recognition_model = TableTransformerForObjectDetection.from_pretrained(model_config['recognition_model_path'])

        # Load models from the Hugging Face Hub
        detection_model = TableTransformerForObjectDetection.from_pretrained("microsoft/table-transformer-detection")
        recognition_model = TableTransformerForObjectDetection.from_pretrained("microsoft/table-transformer-structure-recognition")

        return detection_model, recognition_model
    except Exception as exc:
        logging.error('Table detection or recognition model loading failed: %s', exc)
        raise

# Models loading
detection_model, recognition_model = load_models()

# Detection feature extractor
detection_feature_extractor = DetrFeatureExtractor(do_resize=True, size=800, max_size=800)
# Recognition feature extractor 
recognition_feature_extractor = DetrFeatureExtractor(do_resize=True, size=1000, max_size=1000)

# Detection object configured with the table-detection threshold from Config
detection_obj = TableDetection(detection_feature_extractor, detection_model, threshold=Config['table_detection_threshold'])

# Recognition object configured with the table-recognition threshold from Config
recognition_obj = TableRecognition(recognition_feature_extractor, recognition_model, threshold=Config['table_recognition_threshold'])

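# Table preprocessing (row/column sorting) and OCR-based cell text extraction helpers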
table_preprocessor = TablePreprocessor()
textdataextractor = TextDataExtraction()

# # FastAPI service, in case this needs to be deployed as a microservice
# app = FastAPI()

# @app.get("/health")
# def healthcheck():
#     return "200"

# @app.post('/table-data-extraction')
# def table_data_extraction_from_image(file: UploadFile = File(...)):
#     if not (file.filename.split('.')[-1]).lower() in ("jpg", "jpeg", "png"):
#         return {'Image must be jpg or png format!'}
#     print(f'#---------- Table extractor started {time.strftime("%Y-%m-%d %H:%M:%S")} -----------#')
#     image = Image.open(file.file).convert('RGB')
#     detection_result = detection_obj.table_detection_from_image(image)
#     recognition_result = recognition_obj.table_recognition_from_detection(image, detection_result)
#     preprocessed_tables = table_preprocessor.table_structure_sorting(recognition_result)
#     extracted_table_data = textdataextractor.cell_data_extraction(image, preprocessed_tables)
#     print(f'#---------- Table extractor ended {time.strftime("%Y-%m-%d %H:%M:%S")} -----------#\n')
#     return extracted_table_data

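# Render each extracted table as a pandas DataFrame inside the Streamlit results container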
def convert_to_df(extracted_object):
    logging.info(f'#---------- Table visualization started {time.strftime("%Y-%m-%d %H:%M:%S")} -----------#')
    def _show_outputdf(table_list:List[List], table_number:int):
        op_df = pd.DataFrame(table_list)
        container.write(f'Extracted table: {table_number}')
        container.dataframe(op_df)
        container.write('\n')
    if extracted_object.tables:
        for table_no, table in enumerate(extracted_object.tables, start=1):
            # Flatten each table into a list of rows, each row being a list of cell values
            table_list = [[cell.value for cell in row.extracted_cells] for row in table.extracted_rows]
            _show_outputdf(table_list=table_list, table_number=table_no)
    else:
        container.write('No tables were detected!')

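# End-to-end pipeline: detect tables, recognize structure, sort cells, then OCR the cell text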
def table_data_extraction_from_image1(file):
    logging.info(f'#---------- Table extractor started {time.strftime("%Y-%m-%d %H:%M:%S")} -----------#')
    image = Image.open(file).convert('RGB')
    detection_result = detection_obj.table_detection_from_image(image)
    recognition_result = recognition_obj.table_recognition_from_detection(image, detection_result)
    preprocessed_tables = table_preprocessor.table_structure_sorting(recognition_result)
    extracted_table_data = textdataextractor.cell_data_extraction(image, preprocessed_tables)
    convert_to_df(extracted_table_data)

    logging.info(f'#---------- Table extractor ended {time.strftime("%Y-%m-%d %H:%M:%S")} -----------#\n')
    return extracted_table_data
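
# Streamlit UI entry point (launch with: streamlit run <path to this file>)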
if __name__ == '__main__':
    st.title('Table detection and recognition')
    st.write('Table data extraction application built with Microsoft DETR-based Table Transformer models.')
    image = st.sidebar.file_uploader(label='Upload image file for data extraction', type=['png','jpg','jpeg','tif'])
    if image:
        st.sidebar.button(label='Predict', on_click=table_data_extraction_from_image1, args=(image,))
        container = st.container()
        container.subheader('Extracted tables :snowflake:')