|
import os |
|
import platform |
|
import time |
|
import logging |
|
from fastapi import FastAPI, UploadFile, File |
|
import uvicorn |
|
import pytesseract |
|
import streamlit as st |
|
import pandas as pd |
|
from PIL import Image |
|
from typing import List |
|
|
|
from transformers import TableTransformerForObjectDetection, DetrFeatureExtractor |
|
from codes.table_recognition import TableRecognition |
|
from codes.table_detection import TableDetection |
|
from codes.table_preprocessing import TablePreprocessor |
|
from codes.data_extraction import TextDataExtraction |
|
from datatypes.config import Config, tesseract_config, model_config |
|
|
|
# On Windows, point pytesseract at the executable path given in the project
# configuration (`tesseract_config['tesseractpath']`); other platforms keep
# pytesseract's own default.
if platform.system() == 'Windows':
    pytesseract.pytesseract.tesseract_cmd = tesseract_config['tesseractpath']
|
|
|
|
|
@st.cache_resource
def load_models():
    """Load the pretrained table detection and structure recognition models.

    Both checkpoints are loaded with
    ``TableTransformerForObjectDetection.from_pretrained``. The function is
    decorated with ``st.cache_resource`` so Streamlit can reuse the loaded
    models instead of loading them again.

    Returns:
        tuple: ``(detection_model, recognition_model)``.

    Raises:
        Exception: Any error raised while loading either checkpoint is logged
            with its traceback and then re-raised unchanged.
    """
    try:
        detection_model = TableTransformerForObjectDetection.from_pretrained(
            "microsoft/table-transformer-detection"
        )
        recognition_model = TableTransformerForObjectDetection.from_pretrained(
            "microsoft/table-structure-recognition-v1.1-all"
        )
    except Exception:
        # A bare `except:` that only printed would make this function return
        # None, and the caller's tuple unpacking would then fail with an
        # unrelated TypeError. Log the real cause and propagate it instead.
        logging.exception('Table detection or recognition model loading is failed!!')
        raise

    return detection_model, recognition_model
|
|
|
|
|
# Module-level pipeline wiring: everything below is built once at import time
# and used by the extraction functions further down.
detection_model, recognition_model = load_models()

# Each stage gets its own feature extractor: 800 for detection and 1000 for
# structure recognition (both `size` and `max_size`).
detection_feature_extractor = DetrFeatureExtractor(
    do_resize=True,
    size=800,
    max_size=800,
)
recognition_feature_extractor = DetrFeatureExtractor(
    do_resize=True,
    size=1000,
    max_size=1000,
)

# Thresholds come from the project `Config` mapping.
detection_obj = TableDetection(
    detection_feature_extractor,
    detection_model,
    threshold=Config['table_detection_threshold'],
)
recognition_obj = TableRecognition(
    recognition_feature_extractor,
    recognition_model,
    threshold=Config['table_recognition_threshold'],
)

table_preprocessor = TablePreprocessor()
textdataextractor = TextDataExtraction()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def convert_to_df(extracted_object):
    """Show every extracted table as a DataFrame in the results container.

    Each table in ``extracted_object.tables`` is flattened into a list of rows,
    where a row is the list of ``cell.value`` for the cells in
    ``row.extracted_cells``. The rows are wrapped in a ``pandas.DataFrame`` and
    written to the module-level ``container``. If there are no tables, a notice
    is written instead.

    Args:
        extracted_object: Object exposing ``tables``; each table exposes
            ``extracted_rows``, each row exposes ``extracted_cells``, and each
            cell exposes ``value``.

    Returns:
        None. All output goes to ``container``.
    """
    logging.info(f'#---------- Table visualization started {time.strftime("%Y-%m-%d %H:%M:%S")} -----------#')

    def _show_outputdf(table_list: List[List], table_number: int):
        """Write one table, preceded by its 1-based number, to ``container``."""
        container.write(f'Extracted table: {table_number}')
        container.dataframe(pd.DataFrame(table_list))
        container.write('\n')

    tables = extracted_object.tables
    if not tables:
        # Covers an empty collection as well as a missing (None) one.
        container.write('No tables are predicted!!!!')
        return

    for table_no, table in enumerate(tables, start=1):
        table_list = [
            [cell.value for cell in row.extracted_cells]
            for row in table.extracted_rows
        ]
        _show_outputdf(table_list=table_list, table_number=table_no)
|
|
|
def table_data_extraction_from_image1(file):
    """Run the extraction pipeline on one image and display the result.

    The image is opened with PIL and converted to RGB, then passed in order
    through ``detection_obj.table_detection_from_image``,
    ``recognition_obj.table_recognition_from_detection``,
    ``table_preprocessor.table_structure_sorting`` and
    ``textdataextractor.cell_data_extraction``. The final result is handed to
    ``convert_to_df`` for display.

    Args:
        file: Image source accepted by ``PIL.Image.open``.

    Returns:
        The value returned by ``textdataextractor.cell_data_extraction``.
    """
    logging.info(f'#---------- Table extractor started {time.strftime("%Y-%m-%d %H:%M:%S")} -----------#')

    rgb_image = Image.open(file).convert('RGB')

    detected = detection_obj.table_detection_from_image(rgb_image)
    recognised = recognition_obj.table_recognition_from_detection(rgb_image, detected)
    sorted_tables = table_preprocessor.table_structure_sorting(recognised)
    extracted = textdataextractor.cell_data_extraction(rgb_image, sorted_tables)

    convert_to_df(extracted)

    logging.info(f'#---------- Table extractor ended {time.strftime("%Y-%m-%d %H:%M:%S")} -----------#\n')
    return extracted
|
if __name__ == '__main__':
    # Page header.
    st.title('Table detection and recognition')
    st.write('Table data extraction application with help of microsoft detr models.')

    # Sidebar: image upload, then a Predict button once a file is present.
    image = st.sidebar.file_uploader(
        label='Upload image file for data extraction',
        type=['png', 'jpg', 'jpeg', 'tif'],
    )
    if image:
        result = st.sidebar.button(
            label='Predict',
            on_click=table_data_extraction_from_image1,
            args=(image,),
        )

    # Results area that `convert_to_df` writes into.
    # NOTE(review): assumed to be created regardless of whether a file has
    # been uploaded (i.e. outside the `if image:` guard) - confirm.
    container = st.container()
    container.subheader('Extracted tables :snowflake:')