deathCertReader / app.py
Alealejandrooo's picture
Duplicate from LumeraDS/deathCertReader
1d47317
raw
history blame
13.4 kB
# from alessandro
import re
import cv2
import numpy as np
from paddleocr import PaddleOCR
from PIL import Image
import matplotlib.pyplot as plt
import pandas as pd
import matplotlib.pyplot as plt
# Shared PaddleOCR engine, built once at import time and reused by
# extract_detected_entries_pdl. lang='sl' selects the recognition language
# (presumably Slovenian, matching the "Priimek"/"Ime" form fields — confirm).
ocr = PaddleOCR(lang='sl')
# def convert_to_image(document):
# '''
# Function: converts the pdf to image
# Input: pdf document
# Output: image
# '''
# # reads PDFs
# # reads only first page of PDF documents
# # os.path.join(document.name, 'sample.pdf')
# pdf_document = load_from_file(document)
# page_1 = pdf_document.create_page(0)
# images = renderer.render_page(page_1)
# image_data = image.data
# # convert the image to numpy array
# image = np.array(images)
# # handles non-PDF formats (e.g., .tif)
# # else:
# # images = Image.open(document)
# # # convert the image to RGB
# # image = images.convert('RGB')
# # # convert the image to numpy array
# # image = np.array(image)
# # # TODO: change to dynamic scaling
# # # downscale the image
# # scale = 1.494
# # width = int(image.shape[1] / scale)
# # height = int(image.shape[0] / scale)
# # dim = (width, height)
# # image = cv2.resize(image, dim, interpolation = cv2.INTER_AREA)
# # fig, ax = plt.subplots(figsize=(15, 10))
# # ax.imshow(image, cmap = 'gray')
# return image
# Class index produced by the skew classifier -> skew angle in degrees.
# (The ordering is the original table's; it looks like a lexicographic sort of
# the angle labels.)
_SKEW_CLASS_TO_ANGLE = {
    0: -1, 1: -10, 2: -11, 3: -12, 4: -13,
    5: -14, 6: -15, 7: -2, 8: -3, 9: -4,
    10: -5, 11: -6, 12: -7, 13: -8, 14: -9,
    15: 0, 16: 1, 17: 10, 18: 11, 19: 12,
    20: 13, 21: 14, 22: 15, 23: 180, 24: 2,
    25: 270, 26: 3, 27: 4, 28: 5, 29: 6,
    30: 7, 31: 8, 32: 9, 33: 90,
}


def deskew(image, model):
    '''
    Function: deskew an image
    Input: image as a 3-channel array (converted with COLOR_BGR2GRAY) and an
           inference session exposing run(); the session is fed a
           (1, 200, 200, 1) float32 tensor under the name 'conv2d_input'
    Output: (deskewed image, predicted skew angle in degrees as int,
             confidence of that prediction scaled by 100)
    '''
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Shrink to 20% with area interpolation first, then to the fixed 200x200
    # network input. Clamp to 1px so very small inputs do not request a
    # zero-sized resize.
    small_w = max(1, int(gray.shape[1] * 0.2))
    small_h = max(1, int(gray.shape[0] * 0.2))
    shrunk = cv2.resize(gray, (small_w, small_h), interpolation=cv2.INTER_AREA)
    net_input = cv2.resize(shrunk, (200, 200))

    # add batch and channel dimensions, then normalize to [0, 1]
    net_input = net_input.astype('float32').reshape(1, 200, 200, 1) / 255

    predictions = model.run(None, {'conv2d_input': net_input})
    scores = predictions[0]
    best_class = int(scores.argmax())
    angle = _SKEW_CLASS_TO_ANGLE[best_class]
    skew_confidence = scores[0][best_class] * 100

    # Quarter turns are done losslessly with cv2.rotate.
    # NOTE(review): these branches turn counter-clockwise for 90 and clockwise
    # for 270, whereas the generic branch below rotates by -angle; confirm this
    # matches the label convention the classifier was trained with.
    if angle == 90:
        return cv2.rotate(image, cv2.ROTATE_90_COUNTERCLOCKWISE), angle, skew_confidence
    if angle == 270:
        return cv2.rotate(image, cv2.ROTATE_90_CLOCKWISE), angle, skew_confidence

    # Any other angle: rotate about the image centre, keeping the canvas size
    # and replicating border pixels into the uncovered corners.
    (h, w) = image.shape[:2]
    center = (w // 2, h // 2)
    rotation = cv2.getRotationMatrix2D(center, -angle, 1.0)
    deskewed_image = cv2.warpAffine(image, rotation, (w, h),
                                    flags=cv2.INTER_CUBIC,
                                    borderMode=cv2.BORDER_REPLICATE)
    return deskewed_image, angle, skew_confidence
def prepare_image_to_autoencoder(image):
    '''
    Function: crop the region of interest out of the page and bring it to the
              fixed size the autoencoder is fed with.
    Input: image: 3-channel page image (converted with COLOR_BGR2GRAY)
    Output: array of shape (600, 600, 1) with values scaled by 1/255
    '''
    side = 600
    rows, cols = image.shape[:2]

    # crop window expressed as fixed fractions of the page size
    top, bottom = int(rows / 3.6), int(rows / 1.87)
    left, right = int(cols / 3.67), int(cols / 1.575)
    region = image[top:bottom, left:right]

    # fixed size, single channel, normalized
    region = cv2.resize(region, (side, side))
    gray = cv2.cvtColor(region, cv2.COLOR_BGR2GRAY)
    return (gray / 255.0).reshape(side, side, 1)
def autoencode_ONNX(image, model):
    '''
    Function: remove noise from image
    Input: image holding 600*600 values (reshaped to (1, 600, 600, 1) float32)
           and an inference session exposing run(); the tensor is fed under
           the name 'input_2'
    Output: denoised image as a uint8 array with singleton axes removed
    '''
    batch = image.astype(np.float32).reshape(1, 600, 600, 1)
    outputs = model.run(None, {'input_2': batch})
    denoised = np.asarray(outputs[0]).squeeze() * 255
    # Clip before the cast: a float outside [0, 255] would otherwise wrap
    # around (or be undefined) when converted to uint8.
    denoised = np.clip(denoised, 0, 255)
    return denoised.astype('uint8')
def detect_entries_ONNX(denoised, model):
    '''
    Function: detect boxes Priimek, Ime and Datum boxes
    Priimek: lastname
    Ime: firstname
    Datum smrti: date of death
    Input: single-channel image and an inference session exposing run(); the
           session is fed a (1, h, w, 3) tensor under the name 'input_tensor'
    Output: (PIL image of the rendered "Detected Boxes" figure,
             list of confidence scores ordered like the boxes, i.e. by ymin)
    Side effect: writes the rendered figure to "test.jpg" in the working
                 directory and reads it back.
    '''
    # the object detection model requires a tensor(1, h, w, 3)
    canvas = cv2.cvtColor(denoised, cv2.COLOR_GRAY2RGB)
    batch = np.expand_dims(canvas, axis=0)
    detections = model.run(None, {'input_tensor': batch})

    # output 1 holds the boxes, output 4 the scores (both with a batch axis)
    boxes = np.array(detections[1][0])
    # -1 instead of a hard-coded 5 so any number of detections is accepted
    confidence = np.array(detections[4]).reshape(-1, 1)
    boxes_and_confidence = np.append(boxes, confidence, axis=1)

    # sort the detections top-to-bottom (first column is ymin)
    order = boxes_and_confidence[:, 0].argsort()
    boxes_and_confidence = boxes_and_confidence[order]

    # boxes come as fractions of the image (ymin, xmin, ymax, xmax); scale
    # them to pixels using the actual image size (600x600 in this pipeline)
    height, width = denoised.shape[:2]
    pixel_boxes = boxes_and_confidence[:, :-1] * np.array([height, width, height, width])
    confidence_boxes = boxes_and_confidence[:, -1].tolist()

    for ymin, xmin, ymax, xmax in pixel_boxes.astype(int):
        cv2.rectangle(canvas, (int(xmin), int(ymin)), (int(xmax), int(ymax)),
                      (0, 255, 0), 2)

    # Render through matplotlib and round-trip via disk, as before; the figure
    # is now closed so repeated calls do not accumulate open figures.
    fig = plt.figure()
    try:
        plt.imshow(cv2.cvtColor(canvas, cv2.COLOR_BGR2RGB))
        plt.title("Detected Boxes")
        plt.savefig("test.jpg")
    finally:
        plt.close(fig)

    img = cv2.imread("test.jpg")
    return Image.fromarray(img), confidence_boxes
def extract_detected_entries_pdl(image):
    '''
    Function: run the module-level PaddleOCR engine on the image and tabulate
              what it recognised.
    Input: image to run OCR on
    Output: DataFrame with one row per recognised line and the columns
            "Text" (after cleanString_basic), "Score" and "Boundary Box";
            empty (but with the same columns) when nothing is recognised
    '''
    result = ocr.ocr(image, cls=False)
    # The first element holds the lines of the (single) input image. Guard
    # against an empty/None entry, which would otherwise fail in the loop
    # (some PaddleOCR versions appear to return [None] for blank images).
    lines = result[0] if result and result[0] else []

    txt = [cleanString_basic(line[-1][0]) for line in lines]
    scores = [line[-1][1] for line in lines]
    boxes = [line[0] for line in lines]

    # Build the frame column by column: stacking text, floats and nested
    # point lists into one NumPy array is a ragged construction, which recent
    # NumPy releases reject with a ValueError.
    return pd.DataFrame({"Text": txt, "Score": scores, "Boundary Box": boxes},
                        columns=["Text", "Score", "Boundary Box"])
def cleanString_basic(word):
    '''
    Function: replace every "$" in the word with "s"
    Input: word (string)
    Output: the word with the substitution applied
    '''
    return word.replace("$", "s")
def clean_string_start(string: 'str'):
    '''
    Function: drop a single unwanted character from the start of the string
    Input: string
    Output: (string without that character, the character removed);
            when nothing is removed the flag is "√"
    '''
    unwanted = ('!', "'", '[', ']', '*', '|', '.', ':', '\\', '/')
    if not string.startswith(unwanted):
        return string, "√"
    return string[1:], string[0]
def clean_string_end(string: 'str'):
    '''
    Function: drop a single unwanted character from the end of the string
    Input: string
    Output: (string without that character, the character removed);
            when nothing is removed the flag is "√"
    '''
    unwanted = ('!', "'", '[', ']', '*', '|', '.', ':', '\\', '/')
    if not string.endswith(unwanted):
        return string, "√"
    return string[:-1], string[-1]
def clean_dates(date: 'str'):
    '''
    Function: strip letters and the characters "!", "[" and "|" from a
              "datum smrti" (date of death) field.
    Input: date (string format)
    Output: (cleaned string, flags) where flags is the list of characters
            that were removed, or "Y" when nothing had to be removed
    '''
    unwanted = re.compile(r'[a-zA-Z!\[\|]')
    removed = unwanted.findall(date)
    cleaned = unwanted.sub('', date)
    return cleaned, (removed if removed else "Y")
def regex_string(string):
    '''
    Function: swaps the characters carrying a caron ("hat") for their plain
              counterparts (Č/č -> C/c, Š/š -> S/s, Ž/ž -> Z/z)
    Input: string
    Output: cleaned string
    '''
    # One translate pass replaces the per-character replace loop and avoids
    # naming a local "map", which shadowed the builtin.
    return string.translate(str.maketrans('ČčŠšŽž', 'CcSsZz'))
import onnxruntime
def pdf_deskew_gr(document):
    '''
    Function: convert a document to an image and deskew it
    Input: document handed to convert_to_image
    Output: (deskewed image, angle, skew confidence) as returned by deskew
    '''
    # NOTE(review): convert_to_image only appears as commented-out code in the
    # visible part of this file; unless it is defined elsewhere this call
    # raises NameError.
    page = convert_to_image(document)
    session = onnxruntime.InferenceSession("./models/CNN_deskew_v0.0.2.onnx")
    result = deskew(page, session)
    deskewed_image, angle, skew_confidence = result
    return deskewed_image, angle, skew_confidence
def pdf_clean_gr(document):
    '''
    Function: convert a document to an image, deskew it and denoise the
              cropped region with the autoencoder
    Input: document handed to convert_to_image
    Output: denoised image as returned by autoencode_ONNX
    '''
    # NOTE(review): convert_to_image only appears as commented-out code in the
    # visible part of this file; unless it is defined elsewhere this call
    # raises NameError.
    img = convert_to_image(document)

    model = onnxruntime.InferenceSession("./models/CNN_deskew_v0.0.2.onnx")
    deskewed_image, _angle, _skew_confidence = deskew(img, model)

    # Feed the deskewed image onwards (as pdf_extract_gr does); previously the
    # deskew result was computed and then ignored.
    prepared = prepare_image_to_autoencoder(deskewed_image)

    model = onnxruntime.InferenceSession("./models/autoencoder_denoise_v0.0.2.onnx")
    return autoencode_ONNX(prepared, model)
def pdf_resnet_gr(document):
    '''
    Function: convert a document to an image, deskew it, denoise the cropped
              region and run the entry detector on it
    Input: document handed to convert_to_image
    Output: (image with the detected boxes, confidence scores) as returned by
            detect_entries_ONNX
    '''
    # NOTE(review): convert_to_image only appears as commented-out code in the
    # visible part of this file; unless it is defined elsewhere this call
    # raises NameError.
    img = convert_to_image(document)

    # Models are loaded from ./models like in the sibling pdf_*_gr functions;
    # the previous absolute Google Drive paths only resolve inside a Colab
    # session with that drive mounted.
    model = onnxruntime.InferenceSession("./models/CNN_deskew_v0.0.2.onnx")
    deskewed_image, _angle, _skew_confidence = deskew(img, model)

    # Feed the deskewed image onwards (as pdf_extract_gr does); previously the
    # deskew result was computed and then ignored.
    prepared = prepare_image_to_autoencoder(deskewed_image)

    model = onnxruntime.InferenceSession("./models/autoencoder_denoise_v0.0.2.onnx")
    denoised = autoencode_ONNX(prepared, model)

    model = onnxruntime.InferenceSession("./models/ResNet_od_v0.0.2.onnx")
    boxes, confidence_boxes = detect_entries_ONNX(denoised, model)
    return boxes, confidence_boxes
def _ocr_entry_with_confidence(df, position):
    '''Return (text, "NN.NNN%") for the OCR row at position, or ("", "") if absent.'''
    if position >= len(df):
        return "", ""
    row = df.iloc[position]
    confidence = round(float(row["Score"]) * 100, 3)
    return row["Text"], f"{confidence}%"


def pdf_extract_gr(extractimg):
    '''
    Function: full extraction pipeline behind the "Extract" button: deskew,
              crop + denoise, OCR, then pick the first three OCR lines.
    Input: extractimg: the uploaded image (anything np.array can turn into a
           3-channel image array)
    Output: tuple of
            (OCR DataFrame, deskewed image, skew angle, skew confidence,
             denoised image,
             first name, first name confidence,
             surname, surname confidence,
             date of death, date of death confidence)
            Text/confidence pairs are empty strings when OCR produced fewer
            than three lines.
    Raises: ValueError when no image is supplied.
    '''
    if extractimg is None:
        raise ValueError("No image was provided for extraction.")
    extractimg = np.array(extractimg)

    model = onnxruntime.InferenceSession("./models/CNN_deskew_v0.0.2.onnx")
    deskewed_image, angle, skew_confidence = deskew(extractimg, model)

    cleanimg = prepare_image_to_autoencoder(deskewed_image)
    model = onnxruntime.InferenceSession("./models/autoencoder_denoise_v0.0.2.onnx")
    img = autoencode_ONNX(cleanimg, model)

    df = extract_detected_entries_pdl(img)

    # NOTE(review): rows 0, 1 and 2 are taken to be first name, surname and
    # date of death purely by position in the OCR output — confirm that this
    # order matches the form layout.
    firstname, firstnameconfidence = _ocr_entry_with_confidence(df, 0)
    surname, surnameconfidence = _ocr_entry_with_confidence(df, 1)
    dodname, dodconfidence = _ocr_entry_with_confidence(df, 2)

    return (df, deskewed_image, angle, skew_confidence, img,
            firstname, firstnameconfidence,
            surname, surnameconfidence,
            dodname, dodconfidence)
# Stylesheet passed to gr.Blocks(css=...). The class names below are the ones
# attached to components through elem_classes in the UI definition
# (.results_cell is declared but currently carries no rules).
css = """
.run_container {
display: flex;
flex-direction: column;
align-items: center;
gap: 10px;
}
.run_btn {
margin: auto;
width: 50%;
display: flex;
}
.upload_cell {
margin: auto;
display: flex;
}
.results_container {
display: flex;
justify-content: space-evenly;
}
.results_cell {
}
"""
import gradio as gr
# UI definition: one image input, an "Extract" button, three text/confidence
# pairs and collapsible panels for the intermediate results.
# NOTE(review): the nesting of the layout containers below was reconstructed
# from statement order — confirm it matches the intended layout.
with gr.Blocks(css = css) as demo:
    gr.Markdown("""
# Death Certificate Extraction
""", elem_classes = "h1")
    gr.Markdown("Upload a PDF, extract data")
    with gr.Box(elem_classes = "run_container"):
        # ExtractInput = gr.File(label = "Death Certificate", elem_classes="upload_cell")
        # The caption of a Button is its value; passing it as label left the
        # button showing the default caption instead of "Extract".
        ExtractButton = gr.Button("Extract", elem_classes="run_btn")
    with gr.Row(elem_id = "hide"):
        with gr.Column():
            ExtractInput = gr.Image()
        with gr.Column():
            # ExtractResult = gr.Image(label = "result")
            with gr.Row(elem_classes = "results_container"):
                FirstNameBox = gr.Textbox(label = "First Name", elem_classes = "results_cell")
                FirstNameConfidenceBox = gr.Textbox(label = "First Name Confidence", elem_classes = "results_cell")
            with gr.Row(elem_classes = "results_container"):
                SurnameNameBox = gr.Textbox(label = "Surname", elem_classes = "results_cell")
                SurnameNameConfidenceBox = gr.Textbox(label = "Surname Confidence", elem_classes = "results_cell")
            with gr.Row(elem_classes = "results_container"):
                DODBox = gr.Textbox(label = "Date of Death", elem_classes = "results_cell")
                DODConfidenceBox = gr.Textbox(label = "Date of Death Confidence", elem_classes = "results_cell")
    with gr.Accordion("Full Results", open = False):
        ExtractDF = gr.Dataframe(label = "Results")
    with gr.Accordion("Clean Image", open = False):
        CleanOutput = gr.Image()
    with gr.Accordion("Deskew", open = False):
        DeskewOutput = gr.Image()
        with gr.Column():
            DeskewAngle = gr.Number(label = "Angle")
        with gr.Column():
            DeskewConfidence = gr.Number(label = "Confidence")
    # The output order must match the tuple returned by pdf_extract_gr.
    ExtractButton.click(fn=pdf_extract_gr,
                        inputs = ExtractInput,
                        outputs = [ExtractDF, DeskewOutput, DeskewAngle,
                                   DeskewConfidence, CleanOutput, FirstNameBox,
                                   FirstNameConfidenceBox, SurnameNameBox,
                                   SurnameNameConfidenceBox, DODBox, DODConfidenceBox])
demo.launch(show_api=True, share=False, debug=True)