# BrucePoD's picture
# Duplicate from esraa-abdelmaksoud/Indonesian-ID-Data-Extraction
# 4315965
from paddleocr import PaddleOCR
import cv2
import pytesseract
import pandas as pd
import re
from thefuzz import fuzz
from thefuzz import process
def extract_data(img):
    """Run the whole ID-card pipeline on one image and return the text report.

    Steps, in order: ``resize_image`` -> ``rotate_image`` -> ``run_ocr`` ->
    ``split_items`` -> ``correct_labels`` -> ``correct_data`` ->
    ``print_output``.

    Args:
        img: Image array; ``resize_image`` reads ``img.shape`` and
            ``rotate_image`` converts it with ``cv2.COLOR_BGR2XYZ``.

    Returns:
        The multi-line string produced by ``print_output``.
    """
    # Local import so this function does not depend on the module's import
    # block being edited.
    import logging

    # Field labels used for fuzzy label matching. "kabupaten" is left out of
    # this list on purpose (the original code removed it explicitly).
    labels = [
        "provinsi",
        "nik",
        "nama",
        "tempat/tgl lahir",
        "jenis kelamin",
        "gol darah",
        "alamat",
        "rt/rw",
        "kel/desa",
        "kecamatan",
        "agama",
        "status perkawinan",
        "pekerjaan",
        "kewarganegaraan",
        "berlaku hingga",
    ]
    df = pd.read_csv('data.csv')

    image = resize_image(img)
    image_xyz = rotate_image(image)
    all_data = run_ocr(image_xyz)
    new_data = split_items(all_data)

    # The correction passes are best-effort: if one of them fails, the rows
    # as they stand at that point are still reported. Only ordinary errors
    # are swallowed (not KeyboardInterrupt/SystemExit), and the failure is
    # logged instead of disappearing silently.
    try:
        new_data = correct_labels(new_data, labels)
        new_data = correct_data(new_data, df)
    except Exception:
        logging.getLogger(__name__).warning(
            "Post-OCR correction failed; returning partially corrected rows",
            exc_info=True,
        )

    return print_output(new_data)
def run_ocr(image):
    """Detect text boxes with PaddleOCR and re-read each box with Tesseract.

    Args:
        image: Image array that supports 2-D slicing (``image[y1:y2, x1:x2]``).

    Returns:
        A list with one ``[x1, y1, x2, y2, paddle_text, tess_text]`` entry per
        detected box, where the coordinates are the axis-aligned bounds of the
        box and both texts have been passed through ``clean_text``. An empty
        list is returned when nothing was detected.
    """
    ocr = PaddleOCR(
        use_angle_cls=True,
        lang="id",
        det_max_side_len=1500,
        det_limit_type="min",
        det_db_unclip_ratio=1.7,
        show_log=False,
    )
    result = ocr.ocr(image, cls=True)

    # NOTE(review): PaddleOCR is expected to report "no text found" as an
    # empty or None per-image entry -- confirm for the installed version.
    # Iterating over it would raise, so bail out early.
    if not result or not result[0]:
        return []

    all_data = []
    for idx, res in enumerate(result[0]):
        corners = res[0][:4]
        paddle_text = res[1][0]
        xs = [point[0] for point in corners]
        ys = [point[1] for point in corners]

        # Clamp to 0: a negative start index would make the slice below wrap
        # around to the other side of the image.
        x1, y1 = max(int(min(xs)), 0), max(int(min(ys)), 0)
        x2, y2 = max(int(max(xs)), 0), max(int(max(ys)), 0)

        snip = image[y1:y2, x1:x2]
        if getattr(snip, "size", 1) == 0:
            # Degenerate box: nothing to hand to Tesseract.
            tess_text = ""
        else:
            tess_text = pytesseract.image_to_string(
                snip, lang="ind+eng", config="--psm 6"
            )

        # The third argument is the box index (clean_text does not use it).
        tess_text, paddle_text = clean_text(tess_text, paddle_text, idx)
        all_data.append([x1, y1, x2, y2, paddle_text, tess_text])
    return all_data
def clean_text(tess_text, paddle_text, i):
    """Normalise the two OCR readings of a single text box.

    Args:
        tess_text: Reading of the box (Tesseract side).
        paddle_text: Reading of the box (PaddleOCR side).
        i: Not used by this function.

    Returns:
        ``(tess_text, paddle_text)`` after cleaning -- note the order.
    """
    # Drop line breaks and form feeds from the Tesseract reading only.
    tess_text = tess_text.replace("\n", "").replace("\x0c", "")

    # Remove space before or after colon and hyphen
    around_punct = re.compile(r"\s*([-:])\s*")
    tess_text = around_punct.sub(r"\1", tess_text)
    paddle_text = around_punct.sub(r"\1", paddle_text)

    # Clean blood group: if either reading mentions "Darah", every zero in
    # both readings is turned into the letter O.
    if any("Darah" in reading for reading in (tess_text, paddle_text)):
        tess_text = tess_text.replace("0", "O")
        paddle_text = paddle_text.replace("0", "O")

    # Clean symbols: delete each of these characters wherever it occurs.
    symbols = ("'", '"', "!", "‘", "“", ":")
    strip_table = str.maketrans(dict.fromkeys(symbols))
    return tess_text.translate(strip_table), paddle_text.translate(strip_table)
def resize_image(img):
    """Enlarge small images by a whole-number factor.

    Args:
        img: Image array; ``img.shape[0]`` is read as the height and
            ``img.shape[1]`` as the width.

    Returns:
        ``img`` itself when either side is at least 1500 pixels; otherwise the
        result of ``cv2.resize`` with both sides multiplied by
        ``1500 // longer_side``.
    """
    limit = 1500
    height, width = (int(side) for side in img.shape[:2])

    if width >= limit or height >= limit:
        return img

    # Floor division: the factor is an integer, so it is 1 (same dimensions)
    # whenever the longer side is above 750 pixels.
    factor = limit // max(width, height)
    target = (width * factor, height * factor)
    return cv2.resize(img, target, interpolation=cv2.INTER_AREA)
def rotate_image(image):
    """Convert the image to XYZ and turn it until card keywords are readable.

    The converted image is OCR'd with Tesseract; if the text contains
    "PROVINSI", "Darah" or "NIK" it is returned as is, otherwise it is rotated
    90 degrees clockwise and tried again, for at most four attempts.

    Args:
        image: Image array passed to ``cv2.cvtColor`` with
            ``cv2.COLOR_BGR2XYZ``.

    Returns:
        The converted image in the first orientation whose OCR text contains a
        keyword, or the image after four quarter turns if none did.
    """
    keywords = ("PROVINSI", "Darah", "NIK")
    candidate = cv2.cvtColor(image, cv2.COLOR_BGR2XYZ)

    for _ in range(4):
        ocr_text = pytesseract.image_to_string(
            candidate, lang="ind+eng", config="--psm 6"
        )
        if any(word in ocr_text for word in keywords):
            return candidate
        candidate = cv2.rotate(candidate, cv2.ROTATE_90_CLOCKWISE)

    # No orientation matched; four clockwise quarter turns have been applied.
    return candidate
def correct_labels(new_data, labels):
    """Snap rows that look like field labels onto their canonical label.

    Each row holds the PaddleOCR reading at index 0 and the Tesseract reading
    at index 1. When a row is recognised as a label, both readings are
    overwritten with the canonical label and the marker ``"label"`` is
    appended to the row. Rows are modified in place.

    Args:
        new_data: List of rows (lists) as described above.
        labels: Candidate label strings for the fuzzy match.

    Returns:
        The same ``new_data`` list.
    """
    min_score = 80
    for row in new_data:
        # Both lookups run up front; each result is used as (choice, score).
        best_paddle = process.extractOne(row[0], labels, scorer=fuzz.ratio)
        best_tess = process.extractOne(row[1], labels, scorer=fuzz.ratio)

        if best_paddle[1] >= min_score:
            label = best_paddle[0]
        elif best_tess[1] >= min_score:
            label = best_tess[0]
        elif (len(row[0]) == 3 or len(row[1]) == 3) and (
            "IK" == row[0] or "IK" in row[1]
        ):
            # Special case for a misread "NIK".
            # NOTE(review): the PaddleOCR side is compared with == while the
            # Tesseract side uses a substring test -- confirm this is intended.
            label = "nik"
        else:
            continue

        row[0] = label
        row[1] = label
        row.append("label")
    return new_data
def find_uppercase_index(text):
    """Return where the first run of three or more A-Z capitals starts.

    Args:
        text: String to search.

    Returns:
        The start index of the first such run, or ``-1`` if there is none.
    """
    found = re.search(r"(?<![A-Z])[A-Z]{3,}", text)
    return -1 if found is None else found.start()
def correct_data(new_data, df):
    """Fuzzy-correct OCR values against reference lists and normalise dates.

    Rows are modified in place. Index 0 of a row is the PaddleOCR reading and
    index 1 the Tesseract reading.

    Args:
        new_data: List of rows (lists).
        df: DataFrame with a ``"provinsi"`` column; it is also indexed by the
            matched province name (``df[<province>]``) to obtain the list used
            for row 1.

    Returns:
        The same ``new_data`` list.
    """
    provinsi = [f"PROVINSI {item}" for item in df["provinsi"].dropna().tolist()]
    other_vals = [
        "LAKI-LAKI",
        "PEREMPUAN",
        "A",
        "B",
        "AB",
        "O",
        "ISLAM",
        "KRISTEN",
        "KATOLIK",
        "HINDU",
        "BUDDHA",
        "KONGHUCU",
        "BELUM KAWIN",
        "KAWIN",
        "CERAI HIDUP",
        "CERAI MATI",
        "WNI",
        "WNA",
        "SEUMUR HIDUP",
    ]
    # dd?mm?yyyy with at most one separator character between the parts.
    date_pattern = re.compile(r"(\d{2})\W{0,1}(\d{2})\W{0,1}((19|20)\d{2})")

    kabupaten = []
    for i, row in enumerate(new_data):
        if i == 0 or ("PROVINSI" in row[0] or "PROVINSI" in row[1]):
            row[0], row[1] = replace_data(new_data, i, provinsi)
            province = row[0].replace("PROVINSI ", "")
            # An unrecognised province has no column in df. Looking it up
            # blindly raised KeyError and aborted every remaining correction;
            # fall back to "no candidates" instead.
            kabupaten = df[province].dropna().tolist() if province in df.columns else []
        elif i == 1:
            if kabupaten:
                try:
                    row[0], row[1] = replace_data(new_data, i, kabupaten)
                except Exception:
                    # Best-effort: leave row 1 as read if matching fails.
                    pass
        elif len(row) == 2:
            row[0], row[1] = replace_data(new_data, i, other_vals)
        elif i == 3 or new_data[i - 1][0] == "NIK":
            # NOTE(review): only rows whose length is not 2 reach this branch,
            # and correct_labels writes lower-case labels, so the "NIK"
            # comparison may never match -- confirm the intended ordering.
            row[1] = row[0]

        # Fix dates
        if i > 4:
            row[0] = date_pattern.sub(r"\1-\2-\3", row[0])
            row[1] = date_pattern.sub(r"\1-\2-\3", row[1])
    return new_data
def replace_data(new_data, i, options_list):
    """Replace both readings of row ``i`` with the best fuzzy match, if any.

    The PaddleOCR reading (index 0) is tried first, then the Tesseract reading
    (index 1). A match is accepted only when its score is above 80; the row is
    then updated in place with the matched option in both positions.

    Args:
        new_data: List of rows (lists).
        i: Index of the row to correct.
        options_list: Candidate strings. When empty, the row is left as is.

    Returns:
        ``(new_data[i][0], new_data[i][1])`` after the (possible) replacement.
    """
    row = new_data[i]

    # With no candidates extractOne has nothing to return, and subscripting
    # its result would raise; treat this as "no match".
    if len(options_list) == 0:
        return row[0], row[1]

    paddle_fuzz = process.extractOne(row[0], options_list, scorer=fuzz.ratio)
    tess_fuzz = process.extractOne(row[1], options_list, scorer=fuzz.ratio)

    for best in (paddle_fuzz, tess_fuzz):
        if best is not None and best[1] > 80:
            row[0] = best[0]
            row[1] = best[0]
            break
    return row[0], row[1]
def split_items(all_data):
    """Turn OCR boxes into ``[paddle_text, tess_text]`` rows, splitting
    "label + value" boxes into two rows.

    Args:
        all_data: List of entries whose index 4 is the PaddleOCR text and
            index 5 the Tesseract text (the layout produced by ``run_ocr``).

    Returns:
        A new list of two-element rows.
    """
    blood_pattern = re.compile(r"(Darah)\W*((A|AB|B|O))")
    blood_endings = ("A", "B", "O")

    new_data = []
    for entry in all_data:
        paddle_text, tess_text = entry[4], entry[5]
        paddle_idx = find_uppercase_index(paddle_text)
        tess_idx = find_uppercase_index(tess_text)

        if paddle_idx not in (0, -1) and tess_idx not in (0, -1):
            # Both readings have a capitalised run that starts mid-string:
            # everything before it is the label, the rest is the value.
            new_data.append(
                [paddle_text[:paddle_idx].strip(), tess_text[:tess_idx].strip()]
            )
            new_data.append(
                [paddle_text[paddle_idx:].strip(), tess_text[tess_idx:].strip()]
            )
        elif "Darah" in paddle_text or "Darah" in tess_text:
            # Blood-group box: force a space between "Darah" and the group,
            # then split on the last space. The first reading that ends in a
            # blood-group letter supplies both columns of both rows.
            # endswith (rather than [-1]) keeps an empty reading from raising
            # IndexError.
            spaced_readings = (
                blood_pattern.sub(r"\1 \2", paddle_text),
                blood_pattern.sub(r"\1 \2", tess_text),
            )
            for spaced in spaced_readings:
                if spaced.endswith(blood_endings):
                    cut = spaced.rfind(" ")
                    label = spaced[:cut].strip()
                    value = spaced[cut + 1 :].strip()
                    new_data.append([label, label])
                    new_data.append([value, value])
                    break
            # If neither reading ends in A/B/O, no row is added for this box
            # (unchanged from the original behaviour).
        else:
            new_data.append([paddle_text.strip(), tess_text.strip()])
    return new_data
def print_output(new_data):
    """Render the rows as text, one row per line.

    Args:
        new_data: List of rows; index 0 and 1 are the two readings, and a row
            of length 3 is one that carries an extra marker element.

    Returns:
        A string with one ``"\\n"``-terminated line per row:
        the upper-cased reading when both readings agree and the row has
        length 3; the reading as is when they agree otherwise; and
        ``"<first> \\ <second>"`` when the readings differ.
    """
    lines = []
    for row in new_data:
        first, second = row[0], row[1]
        if first != second:
            # Escaped backslash: same output as before, without the invalid
            # "\ " escape sequence in the source.
            lines.append(f"{first} \\ {second}\n")
        elif len(row) == 3:
            lines.append(f"{first.upper()}\n")
        else:
            lines.append(f"{first}\n")
    return "".join(lines)