# Page banner captured together with this file: "Spaces: Build error".
import argparse
import logging
import os
import time
import traceback
from typing import List, Tuple, Set, Dict

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from pdf2image import convert_from_path
from PIL import Image

from utils import cropImages
from utils import draw_only_box,draw_box_with_text,getlogger,Annotation
from ocr_component1 import OCRComponent1
from detectionAndOcrTable1 import DetectionAndOcrTable1
from detectionAndOcrTable2 import DetectionAndOcrTable2
from detectionAndOcrTable3 import DetectionAndOcrTable3
from detectionAndOcrTable4 import DetectionAndOcrTable4
from ocrTable1 import OcrTable1
from ocrTable2 import OcrTable2
def convertHTMLToCSV(html: str, output_path: str) -> str:
    """Write the first ``<table>`` found in *html* to a CSV file.

    The first ``<tr>`` of the table supplies the column names and every
    following ``<tr>`` becomes one data row. Only the ``<th>``/``<td>``
    children of a row are read, so whitespace text nodes between cells can
    never turn into extra columns.

    Args:
        html: HTML markup containing at least one table.
        output_path: Destination path of the CSV file.

    Returns:
        ``output_path``, matching the declared ``str`` return type.

    Raises:
        ValueError: If *html* contains no ``<table>`` or the table has no
            ``<tr>`` rows.
    """
    soup = BeautifulSoup(html, 'html.parser')
    table = soup.find("table")
    if table is None:
        raise ValueError("no <table> element found in the given HTML")

    rows = table.find_all("tr")
    if not rows:
        raise ValueError("the first <table> contains no <tr> rows")

    def cell_texts(row):
        # recursive=False keeps this to the row's own cells, mirroring the
        # original iteration over the row's direct children.
        return [cell.get_text() for cell in row.find_all(["th", "td"], recursive=False)]

    list_header = cell_texts(rows[0])
    data = [cell_texts(row) for row in rows[1:]]

    # Store the data in a DataFrame and let pandas serialise it to CSV.
    dataFrame = pd.DataFrame(data=data, columns=list_header)
    dataFrame.to_csv(output_path)
    return output_path
def saveResults(image_list, results, labels, output_dir='output/', threshold=0.5):
    """Draw boxes on each image and save it as ``<index>.jpg`` in *output_dir*.

    Args:
        image_list: Images to annotate; each one is passed to ``draw_only_box``.
        results: Indexable container; ``results[i]`` is passed along with the
            i-th image.
        labels: Forwarded unchanged to ``draw_only_box``.
        output_dir: Folder that receives the files; created when missing.
        threshold: Forwarded to ``draw_only_box`` as ``threshold``.
    """
    folder_missing = not os.path.exists(output_dir)
    if folder_missing:
        os.makedirs(output_dir)

    for position, picture in enumerate(image_list):
        annotated = draw_only_box(picture, results[position], labels, threshold=threshold)
        target = os.path.join(output_dir, f"{position}.jpg")
        annotated.save(target, quality=95)
        print("save result to: " + target)
def InputToImages(input_path:str,resolution=300)-> List[Image.Image]:
    """Open the image file at *input_path* and return it wrapped in a list.

    An image in ``RGBA`` mode is converted to ``RGB`` first. If opening or
    converting fails, the traceback is printed and an empty list is returned.
    ``resolution`` is accepted but not used by this function.

    Returns:
        A list holding the single Pillow image, or ``[]`` on failure.
    """
    try:
        picture = Image.open(input_path)
        loaded = [picture.convert('RGB') if picture.mode == 'RGBA' else picture]
    except Exception:
        traceback.print_exc()
        return []
    return loaded
def drawTextDetRes(bxs :List[List[float]],img:Image.Image,output_path:str):
    """Draw layout-analysis boxes on *img* and save the result to *output_path*.

    Each entry of *bxs* is indexed as a sequence of points: the x/y of the
    first point, the x of the second point and the y of the last point form an
    ``[xmin, ymin, xmax, ymax]`` rectangle. Entries whose min exceeds the
    matching max are skipped.
    """
    rectangles = []
    for points in bxs:
        first, second, last = points[0], points[1], points[-1]
        if first[0] <= second[0] and first[1] <= last[1]:
            rectangles.append([first[0], first[1], second[0], last[1]])

    annotated = draw_only_box(img, rectangles)
    annotated.save(output_path, quality=95)
def test_ocr_component1(test_file="TestingFiles/OCRTest1German.pdf", debug_folder = './res/table1/',englishFlag = False):
    """Run ``OCRComponent1`` on every page of a PDF and save line-box debug images.

    For each page the text of every detected line is printed, and an image
    with the line boxes drawn on it is saved as
    ``<debug_folder>/<file stem>_<page>_bbox_detection.png``.

    Args:
        test_file: Path of the PDF handed to ``convert_from_path``.
        debug_folder: Folder for the debug images; created when missing.
        englishFlag: Forwarded to the ``OCRComponent1`` constructor.

    Returns:
        Dict mapping the page index to whatever ``ocr.predict`` returned for
        that page.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(debug_folder, exist_ok=True)
    images = convert_from_path(test_file)
    ocr = OCRComponent1(englishFlag)
    # splitext/basename handle any extension length and the OS path separator,
    # unlike slicing off the last four characters.
    imgname = os.path.splitext(os.path.basename(test_file))[0]

    ocr_results = {}
    for page_number, img in enumerate(images):
        line_annotations = ocr.predict(img=np.array(img))
        ocr_results[page_number] = line_annotations

        line_boxes_to_draw = []
        for _, ann in line_annotations.items():
            line_boxes_to_draw.append(ann.box)
            # Every word is followed by a space, so the printed line keeps its
            # trailing space exactly as before.
            print("".join(wordann.text + " " for wordann in ann.words))

        img_to_save1 = draw_only_box(img, line_boxes_to_draw)
        # os.path.join keeps the file inside debug_folder even when the folder
        # was given without a trailing slash.
        out_name = imgname + "_" + str(page_number) + "_bbox_detection.png"
        img_to_save1.save(os.path.join(debug_folder, out_name), quality=95)
    return ocr_results
def test_tableOcrOnly1(test_file = './cropped_table_0.png' , debug_folder = './res/table1/',denoise = False,englishFlag = False):
    """Run ``OcrTable1`` on one table image and write the result to a text file.

    The original author's notes describe this predictor as a hybrid of
    UniTable and DocTR that works well on tables containing a lot of text.

    Args:
        test_file: Path of the table image.
        debug_folder: Folder for debug output; created when missing.
        denoise: Forwarded to ``table.predict`` as ``denoise``.
        englishFlag: Forwarded to the ``OcrTable1`` constructor.

    Returns:
        The value returned by ``table.predict``, which is also written to
        ``<debug_folder>/<file stem>_output.txt``.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(debug_folder, exist_ok=True)
    table = OcrTable1(englishFlag)

    # convert() returns a new image, so the source file can be closed at once.
    with Image.open(test_file) as source:
        image = source.convert("RGB")

    # splitext/basename handle any extension length; os.path.join keeps the
    # prefix inside debug_folder even without a trailing slash.
    filename = os.path.splitext(os.path.basename(test_file))[0]
    debugfolder_filename_page_name = os.path.join(debug_folder, filename + "_")

    table_code = table.predict([image], debugfolder_filename_page_name, denoise=denoise)
    # Explicit UTF-8 so non-ASCII OCR text does not depend on the platform's
    # default encoding.
    with open(debugfolder_filename_page_name + 'output.txt', 'w', encoding='utf-8') as file:
        file.write(table_code)
    return table_code
def test_tableOcrOnly2(test_file = './cropped_table_1.png' , debug_folder = './res/table2/'):
    """Run ``OcrTable2`` on one table image.

    The original author's notes describe this predictor as the full UniTable
    model that works well on tables without much text.

    Args:
        test_file: Path of the table image; it is opened and converted to RGB.
        debug_folder: Folder passed to ``predict``; created when missing.
    """
    folder_missing = not os.path.exists(debug_folder)
    if folder_missing:
        os.makedirs(debug_folder)

    predictor = OcrTable2()
    rgb_image = Image.open(test_file).convert("RGB")
    predictor.predict([rgb_image], debug_folder)
def test_table_component1(test_file = 'TestingFiles/TableOCRTestEnglish.pdf', debug_folder ='./res/table_debug2/',denoise = False,englishFlag = True):
    """Run ``DetectionAndOcrTable1`` on every page of a PDF and save each table.

    Every table code returned for a page is written to
    ``<debug_folder>/<file stem>_<page>_<table index>output.xls``.

    Args:
        test_file: Path of the PDF handed to ``convert_from_path``.
        debug_folder: Folder for the output files; created when missing.
        denoise: Forwarded to ``table_predictor.predict`` as ``denoise``.
        englishFlag: Forwarded to the ``DetectionAndOcrTable1`` constructor.
    """
    table_predictor = DetectionAndOcrTable1(englishFlag)
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(debug_folder, exist_ok=True)
    images = convert_from_path(test_file)
    # The stem is the same for every page, so compute it once; splitext and
    # basename handle any extension length and the OS path separator.
    filename = os.path.splitext(os.path.basename(test_file))[0]

    for page_number, img in enumerate(images):
        print("Looking at page:")
        print(page_number)
        # os.path.join keeps the prefix inside debug_folder even when the
        # folder was given without a trailing slash.
        debugfolder_filename_page_name = os.path.join(
            debug_folder, filename + "_" + str(page_number) + '_'
        )
        table_codes = table_predictor.predict(
            img,
            debugfolder_filename_page_name=debugfolder_filename_page_name,
            denoise=denoise,
        )
        for index, table_code in enumerate(table_codes):
            # UTF-8 is set explicitly, consistent with the other table
            # components in this file.
            with open(debugfolder_filename_page_name + str(index) + 'output.xls', 'w', encoding='utf-8') as file:
                file.write(table_code)
def test_table_component2(test_file = 'TestingFiles/TableOCRTestEnglish.pdf', debug_folder ='./res/table_debug2/'):
    """Run ``DetectionAndOcrTable2`` on every page of a PDF and save each table.

    Per the original author's notes, this component takes a whole PDF page,
    scans it for tables and returns them in HTML format using the full
    UniTable model. Each returned table code is written to
    ``<debug_folder><file name minus last 4 chars>_<page>_<table index>output.xls``.

    Args:
        test_file: Path of the PDF handed to ``convert_from_path``.
        debug_folder: Output prefix folder; created when missing.
    """
    if not os.path.exists(debug_folder):
        os.makedirs(debug_folder)

    table_predictor = DetectionAndOcrTable2()
    pages = convert_from_path(test_file)
    stem = test_file.rsplit("/", 1)[-1][:-4]

    for page_index, page in enumerate(pages):
        print("Looking at page:")
        print(page_index)
        prefix = "".join((debug_folder, stem, "_", str(page_index), "_"))
        codes = table_predictor.predict(page, debugfolder_filename_page_name=prefix)
        for table_index, code in enumerate(codes):
            target = prefix + str(table_index) + 'output.xls'
            with open(target, 'w', encoding='utf-8') as handle:
                handle.write(code)
def test_table_component3(test_file = 'TestingFiles/TableOCRTestEnglish.pdf',debug_folder ='./res/table_debug3/',denoise = False,englishFlag = True):
    """Run ``DetectionAndOcrTable3`` on every page of a PDF and save each table.

    Each table code returned for a page is written to
    ``<debug_folder><file name minus last 4 chars>_<page>_<table index>output.xls``.

    Args:
        test_file: Path of the PDF handed to ``convert_from_path``.
        debug_folder: Output prefix folder; created when missing.
        denoise: Accepted but not forwarded to the predictor by this function.
        englishFlag: Forwarded to the ``DetectionAndOcrTable3`` constructor.
    """
    if not os.path.exists(debug_folder):
        os.makedirs(debug_folder)

    predictor = DetectionAndOcrTable3(englishFlag)
    base_name = test_file.split("/")[-1][:-4]

    def dump(path, content):
        # One file per detected table, always UTF-8 encoded.
        with open(path, 'w', encoding='utf-8') as out:
            out.write(content)

    for page_no, page_image in enumerate(convert_from_path(test_file)):
        print("Looking at page:")
        print(page_no)
        page_prefix = "".join((debug_folder, base_name, "_", str(page_no), '_'))
        results = predictor.predict(page_image, debugfolder_filename_page_name=page_prefix)
        for table_no, html_code in enumerate(results):
            dump(page_prefix + str(table_no) + 'output.xls', html_code)
def test_table_component4(test_file = 'TestingFiles/TableOCRTestEnglish.pdf',debug_folder ='./res/table_debug3/'):
    """Run ``DetectionAndOcrTable4`` on every page of a PDF and save each table.

    Each table code returned for a page is written to
    ``<debug_folder><file name minus last 4 chars>_<page>_<table index>output.xls``.

    Args:
        test_file: Path of the PDF handed to ``convert_from_path``.
        debug_folder: Output prefix folder; created when missing.
    """
    needs_folder = not os.path.exists(debug_folder)
    if needs_folder:
        os.makedirs(debug_folder)

    detector = DetectionAndOcrTable4()
    pdf_pages = convert_from_path(test_file)
    short_name = test_file.split("/")[-1][:-4]

    for number, page in enumerate(pdf_pages):
        print("Looking at page:")
        print(number)
        name_prefix = "".join([debug_folder, short_name, "_", str(number), '_'])
        tables = detector.predict(page, debugfolder_filename_page_name=name_prefix)
        for position, markup in enumerate(tables):
            destination = "".join([name_prefix, str(position), 'output.xls'])
            with open(destination, 'w', encoding='utf-8') as sink:
                sink.write(markup)
# Options each component accepts, keyed by the component id given on the
# command line. The option names equal the keyword names of the functions.
COMPONENT_OPTIONS = {
    "ocr1": ("test_file", "debug_folder", "englishFlag"),
    "table1": ("test_file", "debug_folder", "englishFlag", "denoise"),
    "table2": ("test_file", "debug_folder"),
    "pdftable1": ("test_file", "debug_folder", "englishFlag", "denoise"),
    "pdftable2": ("test_file", "debug_folder"),
    "pdftable3": ("test_file", "debug_folder", "englishFlag", "denoise"),
    "pdftable4": ("test_file", "debug_folder"),
}


def parse_bool_flag(value):
    """Turn a command-line string such as ``"False"`` or ``"yes"`` into a bool.

    ``type=bool`` cannot be used with argparse because ``bool("False")`` is
    ``True``: every non-empty string is truthy.

    Raises:
        argparse.ArgumentTypeError: If *value* is not a recognised spelling.
    """
    if isinstance(value, bool):
        return value
    lowered = value.strip().lower()
    if lowered in ("1", "true", "t", "yes", "y", "on"):
        return True
    # The empty string stays False, as it was under ``type=bool``.
    if lowered in ("", "0", "false", "f", "no", "n", "off"):
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


def build_arg_parser():
    """Create the command-line parser for the component test driver."""
    parser = argparse.ArgumentParser(description='Process some strings.')
    # choices turns an unknown id into a usage error instead of a silent no-op.
    parser.add_argument('ocr', type=str, choices=sorted(COMPONENT_OPTIONS),
                        help='type in id of the component to test')
    parser.add_argument('--test_file', type=str, help='path to the testing file')
    parser.add_argument('--debug_folder', type=str,
                        help='path to the folder you want to save your results in')
    parser.add_argument('--englishFlag', type=parse_bool_flag,
                        help='Whether your pdf is in english => could lead to better results ')
    parser.add_argument('--denoise', type=parse_bool_flag,
                        help='preprocessing for not clean scans ')
    return parser


def main(argv=None):
    """Parse the command line, run the selected component and print the timing.

    Options are passed by keyword, so ``englishFlag`` and ``denoise`` cannot be
    swapped. Options that were not given are left out entirely, so each
    component falls back to its own defaults instead of receiving ``None``.
    """
    args = build_arg_parser().parse_args(argv)
    # Resolved here rather than at import time so the mapping only needs the
    # component functions when the script actually runs.
    runners = {
        "ocr1": test_ocr_component1,
        "table1": test_tableOcrOnly1,
        "table2": test_tableOcrOnly2,
        "pdftable1": test_table_component1,
        "pdftable2": test_table_component2,
        "pdftable3": test_table_component3,
        "pdftable4": test_table_component4,
    }
    kwargs = {
        name: getattr(args, name)
        for name in COMPONENT_OPTIONS[args.ocr]
        if getattr(args, name) is not None
    }

    start = time.time()
    runners[args.ocr](**kwargs)
    end = time.time()
    print("The entire pipeline took " , end-start)


# Example runs
# python main.py pdftable2 --test_file TestingFiles/TableOCRTestEnglish.pdf --debug_folder ./res/table_debug2/
# python main.py ocr1 --test_file TestingFiles/OCRTest1German.pdf --debug_folder ./res/ocrdebug1/
# python main.py table1 --test_file ./cropped_table_0.png --debug_folder ./res/table1/
# python main.py table2 --test_file ./cropped_table_1.png --debug_folder ./res/table2/
# python main.py pdftable1 --test_file TestingFiles/TableOCRTestEnglish.pdf --debug_folder ./res/table_debug2/
if __name__ == '__main__':
    main()