# from alessandro
import functools
import re

import cv2
import gradio as gr
import matplotlib.pyplot as plt
import numpy as np
import onnxruntime
import pandas as pd
from paddleocr import PaddleOCR
from PIL import Image

# OCR engine shared by every request (built once at import time).
ocr = PaddleOCR(lang='sl')

# Maps the class index predicted by the deskew model to the skew angle in
# degrees. The order is the lexicographic order of the angle labels.
_SKEW_CLASS_ANGLES = {
    0: -1, 1: -10, 2: -11, 3: -12, 4: -13, 5: -14, 6: -15,
    7: -2, 8: -3, 9: -4, 10: -5, 11: -6, 12: -7, 13: -8, 14: -9,
    15: 0, 16: 1, 17: 10, 18: 11, 19: 12, 20: 13, 21: 14, 22: 15,
    23: 180, 24: 2, 25: 270, 26: 3, 27: 4, 28: 5, 29: 6, 30: 7,
    31: 8, 32: 9, 33: 90,
}

# Characters stripped (one at most) from the start/end of a name.
_EDGE_CHARS_TO_STRIP = ('!', "'", '[', ']', '*', '|', '.', ':', '\\', '/')

# Pattern of the characters that are not expected inside a date.
_DATE_NOISE = re.compile(r'[a-zA-Z!\[\|]')

# Replaces the letters carrying a "hat" (caron) with the plain ones.
_CARON_TABLE = str.maketrans(
    {'Č': 'C', 'č': 'c', 'Š': 'S', 'š': 's', 'Ž': 'Z', 'ž': 'z'})


@functools.lru_cache(maxsize=None)
def _load_session(model_path):
    '''
    Function: load an ONNX model, reusing the session on later calls
    Input: path of the .onnx file
    Output: onnxruntime.InferenceSession
    '''
    return onnxruntime.InferenceSession(model_path)


# NOTE: convert_to_image is commented out, but pdf_deskew_gr, pdf_clean_gr and
# pdf_resnet_gr below still call it; as the file stands those three functions
# raise NameError. Only pdf_extract_gr is wired to the UI.
# def convert_to_image(document):
#     '''
#     Function: converts the pdf to image
#     Input: pdf document
#     Output: image
#     '''
#     # reads PDFs
#     # reads only first page of PDF documents
#     # os.path.join(document.name, 'sample.pdf')
#     pdf_document = load_from_file(document)
#     page_1 = pdf_document.create_page(0)
#     images = renderer.render_page(page_1)
#     image_data = image.data
#     # convert the image to numpy array
#     image = np.array(images)
#     # handles non-PDF formats (e.g., .tif)
#     # else:
#     #     images = Image.open(document)
#     #     # convert the image to RGB
#     #     image = images.convert('RGB')
#     #     # convert the image to numpy array
#     #     image = np.array(image)
#     # # TODO: change to dynamic scaling
#     # # downscale the image
#     # scale = 1.494
#     # width = int(image.shape[1] / scale)
#     # height = int(image.shape[0] / scale)
#     # dim = (width, height)
#     # image = cv2.resize(image, dim, interpolation = cv2.INTER_AREA)
#     # fig, ax = plt.subplots(figsize=(15, 10))
#     # ax.imshow(image, cmap = 'gray')
#     return image


def deskew(image, model):
    '''
    Function: deskew an image
    Input: image as an array (it is converted with COLOR_BGR2GRAY, so it
           must have colour channels) and the ONNX session of the skew model
    Output: deskewed image, predicted angle in degrees, score of the
            predicted class multiplied by 100
    '''
    image_d = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # shrink to 20% first, then to the fixed 200x200 input of the model
    width = int(image_d.shape[1] * 0.2)
    height = int(image_d.shape[0] * 0.2)
    res = cv2.resize(image_d, (width, height), interpolation=cv2.INTER_AREA)
    resized = cv2.resize(res, (200, 200))

    # add two dimensions to feed to the model
    resized = resized.astype('float32').reshape(1, 200, 200, 1)
    # normalize
    resized = resized / 255

    predictions = model.run(None, {'conv2d_input': resized})
    # best prediction
    pred = predictions[0].argmax()
    # angle of skew
    angle = _SKEW_CLASS_ANGLES[int(pred)]
    skew_confidence = predictions[0][0][pred] * 100

    # quarter turns are lossless rotations, so they skip the affine warp
    if angle == 90:
        deskewed_image = cv2.rotate(image, cv2.ROTATE_90_COUNTERCLOCKWISE)
        return deskewed_image, angle, skew_confidence
    if angle == 270:
        deskewed_image = cv2.rotate(image, cv2.ROTATE_90_CLOCKWISE)
        return deskewed_image, angle, skew_confidence

    (h, w) = image.shape[:2]
    center = (w // 2, h // 2)
    M = cv2.getRotationMatrix2D(center, -angle, 1.0)
    deskewed_image = cv2.warpAffine(image, M, (w, h),
                                    flags=cv2.INTER_CUBIC,
                                    borderMode=cv2.BORDER_REPLICATE)
    return deskewed_image, angle, skew_confidence


def prepare_image_to_autoencoder(image):
    '''
    Function: prepare the image to be passed to the autoencoder.
    Input: deskewed image (colour, it is converted with COLOR_BGR2GRAY)
    Output: cropped 600x600x1 image with values scaled to [0, 1]
    '''
    height, width = image.shape[:2]
    target_height = 600
    target_width = 600

    # keep only a fixed window of the page, expressed as fractions of its size
    image = image[int(height / 3.6): int(height / 1.87),
                  int(width / 3.67): int(width / 1.575)]
    # reshape image to fixed size
    image = cv2.resize(image, (target_width, target_height))
    image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # normalize images
    image = image / 255.0
    # reshape to pass image to autoencoder
    return image.reshape(target_height, target_width, 1)


def autoencode_ONNX(image, model):
    '''
    Function: remove noise from image
    Input: image (600x600x1, values in [0, 1]) and autoencoder ONNX session
    Output: image as uint8 array scaled back to 0-255
    '''
    batch = image.astype(np.float32).reshape(1, 600, 600, 1)
    output = model.run(None, {'input_2': batch})[0]
    output = output.squeeze() * 255
    # fig, ax = plt.subplots(figsize=(8, 5))
    # ax.imshow(image, cmap = 'gray')
    return output.astype('uint8')


def detect_entries_ONNX(denoised, model):
    '''
    Function: detect boxes Priimek, Ime and Datum boxes
              Priimek: lastname
              Ime: firstname
              Datum smrti: date of death
    Input: denoised grayscale image and object detection ONNX session
    Output: PIL image of the plot with the detected boxes drawn on it, and
            the list of confidence scores of the boxes (sorted by ymin)
    '''
    # the object detection model requires a tensor(1, h, w, 3)
    autoencoded_RGB = cv2.cvtColor(denoised, cv2.COLOR_GRAY2RGB)
    # adds the 1 to the tensor
    autoencoded_expanded = np.expand_dims(autoencoded_RGB, axis=0)

    detections = model.run(None, {'input_tensor': autoencoded_expanded})
    boxes = detections[1]
    confidence = detections[4]

    # returns a ndarray in a list of list
    boxes = np.array(boxes[0])
    # the code below is written for exactly 5 detections
    confidence = np.array(confidence).reshape(5, 1)
    boxes_and_confidence = np.append(boxes, confidence, axis=1)
    # reshapes the boxes to be sorted
    boxes_and_confidence = boxes_and_confidence.reshape(5, 5)
    # sorts by the first coordinate (ymin)
    boxes_and_confidence = \
        boxes_and_confidence[boxes_and_confidence[:, 0].argsort()]

    # boxes (expressed in image %)
    boxes = boxes_and_confidence[:, :-1]
    # boxes (expressed in actual pixels: ymin, xmin, ymax, xmax)
    boxes = boxes * 600
    # confidence boxes
    confidence_boxes = boxes_and_confidence[:, -1].tolist()

    for box in boxes:
        ymin, xmin, ymax, xmax = box.astype(int)
        cv2.rectangle(autoencoded_RGB, (xmin, ymin), (xmax, ymax),
                      (0, 255, 0), 2)

    fig = plt.figure()
    plt.imshow(cv2.cvtColor(autoencoded_RGB, cv2.COLOR_BGR2RGB))
    plt.title("Detected Boxes")
    plt.savefig("test.jpg")
    # pyplot keeps every figure alive until it is closed; close it so that
    # repeated calls do not accumulate figures in memory
    plt.close(fig)

    img = cv2.imread("test.jpg")
    return Image.fromarray(img), confidence_boxes


def extract_detected_entries_pdl(image):
    '''
    Function: run the OCR on the image and collect the recognised lines
    Input: image
    Output: DataFrame with the columns "Text", "Score" and "Boundary Box",
            one row per recognised line (no rows when nothing is recognised)
    '''
    result = ocr.ocr(image, cls=False)
    # boxes = [line[0] for line in result]
    # txts = [line[1][0] for line in result]
    # scores = [line[1][1] for line in result]
    # im_show = draw_ocr(image, boxes, txts, scores, font_path ='/usr/share/fonts/truetype/liberation/LiberationMono-Regular.ttf')

    # NOTE(review): PaddleOCR appears to report "no text" as [None] in some
    # versions; treat that (and an empty result) as zero lines.
    lines = result[0] if result and result[0] is not None else []

    txt = [cleanString_basic(r[-1][0]) for r in lines]
    scores = [r[-1][1] for r in lines]
    boxes = [r[0] for r in lines]

    # Built column by column: stacking the three lists in one numpy array is
    # ragged (each box is a nested list) and fails on recent numpy versions.
    return pd.DataFrame({"Text": txt, "Score": scores, "Boundary Box": boxes})


def cleanString_basic(word):
    '''
    Function: replaces the "$" with "s"
    Input: string
    Output: cleaned string
    '''
    return word.replace("$", "s")


def clean_string_start(string: 'str'):
    '''
    Function: removes one unwanted char from the start of the string
    Input: string
    Output: cleaned string and the char removed ("√" when nothing is removed)
    '''
    names_flags = "√"
    if string.startswith(_EDGE_CHARS_TO_STRIP):
        names_flags = string[0]
        string = string[1:]
    return string, names_flags


def clean_string_end(string: 'str'):
    '''
    Function: removes one unwanted char from the end of the string
    Input: string
    Output: cleaned string and the char removed ("√" when nothing is removed)
    '''
    names_flags = "√"
    if string.endswith(_EDGE_CHARS_TO_STRIP):
        names_flags = string[-1]
        string = string[:-1]
    return string, names_flags


def clean_dates(date: 'str'):
    '''
    Function: cleans the fields "datum smrti" and returns the chars removed.
    Input: date (string format)
    Output: cleaned string and the list of chars removed ("Y" when nothing
            is removed)
    '''
    date_flags = "Y"
    # finds special characters in the string
    special_char = _DATE_NOISE.findall(date)
    if special_char:
        date_flags = special_char
    # remove special characters in the string
    string = _DATE_NOISE.sub('', date)
    return string, date_flags


def regex_string(string):
    '''
    Function: swaps the characters with the "hat" with the regular ones
    Input: string
    Output: cleaned string
    '''
    return string.translate(_CARON_TABLE)


def pdf_deskew_gr(document):
    '''
    Function: convert the document to image and deskew it
    Input: document
    Output: deskewed image, angle, skew confidence
    NOTE: depends on convert_to_image, which is currently commented out.
    '''
    img = convert_to_image(document)
    model = _load_session("./models/CNN_deskew_v0.0.2.onnx")
    deskewed_image, angle, skew_confidence = deskew(img, model)
    return deskewed_image, angle, skew_confidence


def pdf_clean_gr(document):
    '''
    Function: convert the document to image, deskew it and denoise it
    Input: document
    Output: denoised image
    NOTE: depends on convert_to_image, which is currently commented out.
    '''
    img = convert_to_image(document)
    model = _load_session("./models/CNN_deskew_v0.0.2.onnx")
    deskewed_image, angle, skew_confidence = deskew(img, model)
    # the autoencoder gets the deskewed image, as in pdf_extract_gr
    img = prepare_image_to_autoencoder(deskewed_image)
    model = _load_session("./models/autoencoder_denoise_v0.0.2.onnx")
    img = autoencode_ONNX(img, model)
    return img


def pdf_resnet_gr(document):
    '''
    Function: convert the document to image, deskew it, denoise it and
              detect the boxes of the entries
    Input: document
    Output: image with the detected boxes and confidence of the boxes
    NOTE: depends on convert_to_image, which is currently commented out.
    NOTE(review): the models are read from an absolute Drive path here while
    the other functions use ./models - confirm which one is intended.
    '''
    img = convert_to_image(document)
    model = _load_session("/content/drive/MyDrive/cpo/Alessandro/ai_models/Latest/CNN_deskew_v0.0.2.onnx")
    deskewed_image, angle, skew_confidence = deskew(img, model)
    # the autoencoder gets the deskewed image, as in pdf_extract_gr
    img = prepare_image_to_autoencoder(deskewed_image)
    model = _load_session("/content/drive/MyDrive/cpo/Alessandro/ai_models/Latest/autoencoder_denoise_v0.0.2.onnx")
    img = autoencode_ONNX(img, model)
    model = _load_session("/content/drive/MyDrive/cpo/Alessandro/ai_models/Latest/ResNet_od_v0.0.2.onnx")
    boxes, confidence_boxes = detect_entries_ONNX(img, model)
    return boxes, confidence_boxes


def _entry_and_confidence(df, position):
    '''
    Function: read one OCR line from the results
    Input: DataFrame of extract_detected_entries_pdl and the row position
    Output: text and confidence formatted as "NN.NNN%"; two empty strings
            when the OCR returned fewer lines than requested
    '''
    if position >= len(df):
        return "", ""
    row = df.iloc[position]
    confidence = round(float(row["Score"]) * 100, 3)
    return row["Text"], f"{confidence}%"


def pdf_extract_gr(extractimg):
    '''
    Function: deskew, denoise and OCR the uploaded image
    Input: image (converted to a numpy array)
    Output: OCR DataFrame, deskewed image, angle, skew confidence, denoised
            image, then text and confidence of the first three OCR lines
    '''
    # extractimg = convert_to_image(document)
    extractimg = np.array(extractimg)

    model = _load_session("./models/CNN_deskew_v0.0.2.onnx")
    deskewed_image, angle, skew_confidence = deskew(extractimg, model)

    cleanimg = prepare_image_to_autoencoder(deskewed_image)
    model = _load_session("./models/autoencoder_denoise_v0.0.2.onnx")
    img = autoencode_ONNX(cleanimg, model)

    # model = onnxruntime.InferenceSession("./models/ResNet_od_v0.0.2.onnx")
    # boxes, confidence_boxes = detect_entries_ONNX(img, model)
    # confidence_entries, lastname, firstname, death_date = extract_detected_entries_pdl(img, boxes)
    df = extract_detected_entries_pdl(img)

    # assumes the first three OCR lines are first name, surname and date of
    # death, in this order - TODO confirm
    firstname, firstnameconfidence = _entry_and_confidence(df, 0)
    surname, surnameconfidence = _entry_and_confidence(df, 1)
    dodname, dodconfidence = _entry_and_confidence(df, 2)

    return (df, deskewed_image, angle, skew_confidence, img,
            firstname, firstnameconfidence,
            surname, surnameconfidence,
            dodname, dodconfidence)


css = """
.run_container {
  display: flex;
  flex-direction: column;
  align-items: center;
  gap: 10px;
}

.run_btn {
  margin: auto;
  width: 50%;
  display: flex;
}
.upload_cell {
  margin: auto;
  display: flex;
}

.results_container {
  display: flex;
  justify-content: space-evenly;
}

.results_cell {

}
"""

# NOTE(review): the nesting of the layout below was reconstructed from a
# flattened copy of the file - confirm it matches the intended UI.
with gr.Blocks(css = css) as demo:
    gr.Markdown("""
    # Death Certificate Extraction
    """, elem_classes = "h1")
    gr.Markdown("Upload a PDF, extract data")
    with gr.Box(elem_classes = "run_container"):
        # ExtractInput = gr.File(label = "Death Certificate", elem_classes="upload_cell")
        # NOTE(review): the text shown on a Button is usually set with
        # `value`; confirm that `label` renders as intended.
        ExtractButton = gr.Button(label = "Extract", elem_classes="run_btn")
    with gr.Row(elem_id = "hide"):
        with gr.Column():
            ExtractInput = gr.Image()
        with gr.Column():
            # ExtractResult = gr.Image(label = "result")
            with gr.Row(elem_classes = "results_container"):
                FirstNameBox = gr.Textbox(label = "First Name", elem_classes = "results_cell")
                FirstNameConfidenceBox = gr.Textbox(label = "First Name Confidence", elem_classes = "results_cell")
            with gr.Row(elem_classes = "results_container"):
                SurnameNameBox = gr.Textbox(label = "Surname", elem_classes = "results_cell")
                SurnameNameConfidenceBox = gr.Textbox(label = "Surname Confidence", elem_classes = "results_cell")
            with gr.Row(elem_classes = "results_container"):
                DODBox = gr.Textbox(label = "Date of Death", elem_classes = "results_cell")
                DODConfidenceBox = gr.Textbox(label = "Date of Death Confidence", elem_classes = "results_cell")
    with gr.Accordion("Full Results", open = False):
        ExtractDF = gr.Dataframe(label = "Results")
    with gr.Accordion("Clean Image", open = False):
        CleanOutput = gr.Image()
    with gr.Accordion("Deskew", open = False):
        DeskewOutput = gr.Image()
        with gr.Column():
            DeskewAngle = gr.Number(label = "Angle")
        with gr.Column():
            DeskewConfidence = gr.Number(label = "Confidence")
    # the order of the outputs matches the tuple returned by pdf_extract_gr
    ExtractButton.click(fn=pdf_extract_gr,
                        inputs = ExtractInput,
                        outputs = [ExtractDF, DeskewOutput, DeskewAngle,
                                   DeskewConfidence, CleanOutput,
                                   FirstNameBox, FirstNameConfidenceBox,
                                   SurnameNameBox, SurnameNameConfidenceBox,
                                   DODBox, DODConfidenceBox])

demo.launch(show_api=True, share=False, debug=True)