Indonesian-ID-Data-Extraction

Runtime error

App Files Files Community

Indonesian-ID-Data-Extraction / main.py

BrucePoD

Duplicate from esraa-abdelmaksoud/Indonesian-ID-Data-Extraction

4315965 about 2 years ago

raw

history blame contribute delete

8.57 kB

	from paddleocr import PaddleOCR
	import cv2
	import pytesseract
	import pandas as pd
	import re
	from thefuzz import fuzz
	from thefuzz import process

	def extract_data(img):
	    """Run the whole ID-card pipeline on an image and return the extracted text.

	    The image is resized, rotated until it reads upright, OCR'd, split into
	    rows and then fuzzy-corrected against the known field labels and the
	    values stored in ``data.csv``.

	    Args:
	        img: Image array; it is handed to ``resize_image`` and from there to
	            OpenCV, so it must expose ``.shape``.

	    Returns:
	        The string built by ``print_output`` (one row per line).
	    """
	    import logging

	    # Field labels used for fuzzy label matching. "kabupaten" is left out,
	    # exactly as the original list did (presumably because that line has no
	    # printed label on the card -- confirm against real samples).
	    labels = [
	        "provinsi",
	        "nik",
	        "nama",
	        "tempat/tgl lahir",
	        "jenis kelamin",
	        "gol darah",
	        "alamat",
	        "rt/rw",
	        "kel/desa",
	        "kecamatan",
	        "agama",
	        "status perkawinan",
	        "pekerjaan",
	        "kewarganegaraan",
	        "berlaku hingga",
	    ]

	    df = pd.read_csv('data.csv')
	    image = resize_image(img)
	    image_xyz = rotate_image(image)
	    all_data = run_ocr(image_xyz)
	    new_data = split_items(all_data)
	    try:
	        new_data = correct_labels(new_data, labels)
	        new_data = correct_data(new_data, df)
	    except Exception:
	        # Correction is best-effort: fall back to the rows as they are (both
	        # helpers mutate in place, so partial corrections are kept), but
	        # record the failure instead of hiding it.
	        logging.getLogger(__name__).exception(
	            "Label/value correction failed; returning partially corrected OCR rows"
	        )
	    return print_output(new_data)


	def run_ocr(image):
	    """Detect text boxes with PaddleOCR and re-read each box with Tesseract.

	    Each detected quadrilateral is collapsed to its axis-aligned bounding
	    box, that region is cropped from ``image`` and passed to Tesseract, and
	    both readings are normalised with ``clean_text``.

	    Args:
	        image: Image array accepted by PaddleOCR and sliceable as
	            ``image[y1:y2, x1:x2]``.

	    Returns:
	        A list of ``[x1, y1, x2, y2, paddle_text, tess_text]`` entries, one
	        per detected box; an empty list when nothing was detected.
	    """
	    ocr = PaddleOCR(
	        use_angle_cls=True,
	        lang="id",
	        det_max_side_len=1500,
	        det_limit_type="min",
	        det_db_unclip_ratio=1.7,
	        show_log=False,
	    )
	    result = ocr.ocr(image, cls=True)
	    # PaddleOCR can report "no text found" as an empty result or as [None];
	    # iterating that would raise, so return no rows instead.
	    if not result or result[0] is None:
	        return []

	    all_data = []
	    for idx, res in enumerate(result[0]):
	        corners = res[0][:4]
	        paddle_text = res[1][0]
	        xs = [point[0] for point in corners]
	        ys = [point[1] for point in corners]
	        x1, y1, x2, y2 = int(min(xs)), int(min(ys)), int(max(xs)), int(max(ys))
	        snip = image[y1:y2, x1:x2]
	        tess_text = pytesseract.image_to_string(snip, lang="ind+eng", config="--psm 6")
	        # Pass the detection index; the original passed the shadowed
	        # corner-loop variable, which was always 3.
	        tess_text, paddle_text = clean_text(tess_text, paddle_text, idx)
	        all_data.append([x1, y1, x2, y2, paddle_text, tess_text])
	    return all_data


	def clean_text(tess_text, paddle_text, i):
	    """Normalise the Tesseract and PaddleOCR readings of one text box.

	    Steps, in order: drop line-feed / form-feed characters from the
	    Tesseract text, collapse ``" - "`` and ``" : "`` to the bare symbol,
	    turn digit zero into the letter O when either text mentions "Darah",
	    and finally delete quote, exclamation and colon characters.

	    Args:
	        tess_text: Text read by Tesseract.
	        paddle_text: Text read by PaddleOCR.
	        i: Index of the box; not used by this function.

	    Returns:
	        A ``(tess_text, paddle_text)`` tuple of the cleaned strings.
	    """
	    tess_text = tess_text.replace("\n", "").replace("\x0c", "")

	    # A hyphen or colon with whitespace on both sides loses that whitespace.
	    spaced_symbol = re.compile(r"\s([-:])\s")
	    paddle_text = spaced_symbol.sub(r"\1", paddle_text)
	    tess_text = spaced_symbol.sub(r"\1", tess_text)

	    # On the blood-group line a "0" is treated as a misread "O".
	    if any("Darah" in reading for reading in (tess_text, paddle_text)):
	        tess_text = tess_text.replace("0", "O")
	        paddle_text = paddle_text.replace("0", "O")

	    # Delete stray symbols in a single pass per string.
	    unwanted = dict.fromkeys(ord(symbol) for symbol in ["'", '"', "!", "‘", "“", ":"])
	    return tess_text.translate(unwanted), paddle_text.translate(unwanted)


	def resize_image(img):
	    """Enlarge small images by a whole-number factor.

	    When both sides are below 1500 pixels the image is scaled by
	    ``1500 // longer_side`` (integer division, so the factor may be 1) using
	    ``cv2.INTER_AREA``; otherwise the input is returned untouched.

	    Args:
	        img: Image array whose ``shape`` starts with ``(height, width)``.

	    Returns:
	        The resized image, or ``img`` itself when no resize applies.
	    """
	    height, width = int(img.shape[0]), int(img.shape[1])
	    thresh = 1500

	    # Guard clause: anything with a side at or above the threshold is left alone.
	    if width >= thresh or height >= thresh:
	        return img

	    factor = thresh // max(width, height)
	    target_size = (width * factor, height * factor)
	    return cv2.resize(img, target_size, interpolation=cv2.INTER_AREA)


	def rotate_image(image):
	    """Convert the image to XYZ colour space and rotate it until it reads upright.

	    Tesseract is run on up to four orientations (each a further 90 degrees
	    clockwise). The first orientation whose text contains "PROVINSI",
	    "Darah" or "NIK" is returned. If none does, the image has been rotated
	    four times in total and is returned in that state.

	    Args:
	        image: BGR image array accepted by ``cv2.cvtColor``.

	    Returns:
	        The XYZ-converted image in the chosen orientation.
	    """
	    keywords = ("PROVINSI", "Darah", "NIK")
	    candidate = cv2.cvtColor(image, cv2.COLOR_BGR2XYZ)
	    for _ in range(4):
	        ocr_text = pytesseract.image_to_string(candidate, lang="ind+eng", config="--psm 6")
	        if any(keyword in ocr_text for keyword in keywords):
	            break
	        candidate = cv2.rotate(candidate, cv2.ROTATE_90_CLOCKWISE)
	    return candidate


	def correct_labels(new_data, labels):
	    """Replace OCR'd field labels with their canonical spelling, in place.

	    For every row the PaddleOCR text (index 0) and the Tesseract text
	    (index 1) are fuzzy-matched against ``labels``. The PaddleOCR match wins
	    when it scores at least 80, otherwise the Tesseract match is used when
	    it scores at least 80. As a last resort a three-character reading where
	    the PaddleOCR text equals "IK" or the Tesseract text contains "IK" is
	    taken to be "nik". Matched rows get both texts overwritten with the
	    label and a third element ``"label"`` appended.

	    Args:
	        new_data: List of ``[paddle_text, tess_text]`` rows; mutated in place.
	        labels: Canonical label strings to match against.

	    Returns:
	        The same ``new_data`` list.
	    """
	    min_score = 80
	    for row in new_data:
	        # Both matches are computed up front, before any branch is chosen.
	        paddle_match = process.extractOne(row[0], labels, scorer=fuzz.ratio)
	        tess_match = process.extractOne(row[1], labels, scorer=fuzz.ratio)

	        if paddle_match[1] >= min_score:
	            canonical = paddle_match[0]
	        elif tess_match[1] >= min_score:
	            canonical = tess_match[0]
	        elif 3 in (len(row[0]), len(row[1])) and ("IK" == row[0] or "IK" in row[1]):
	            canonical = "nik"
	        else:
	            continue

	        row[0] = canonical
	        row[1] = canonical
	        row.append("label")
	    return new_data


	def find_uppercase_index(text):
	    """Return the start index of the first run of three or more capitals.

	    The run must not be preceded by another A-Z capital, so the index always
	    points at the first letter of the run.

	    Args:
	        text: String to search.

	    Returns:
	        The zero-based index of the run, or -1 when there is none.
	    """
	    found = re.search(r"(?<![A-Z])[A-Z]{3,}", text)
	    return -1 if found is None else found.start()


	def correct_data(new_data, df):
	    """Snap OCR'd values to known vocabularies and normalise dates, in place.

	    Rows are ``[paddle_text, tess_text]`` lists; rows that were recognised as
	    labels carry a third element, so ``len(row) == 2`` identifies values.

	    * Row 0, and any row containing "PROVINSI", is matched against the
	      province names from ``df["provinsi"]``. The matched province selects
	      the column of ``df`` used as the candidate list for row 1.
	    * Row 1 is matched against that candidate list when one was found.
	    * The NIK number (row 3, or the row following a "nik" label) keeps the
	      PaddleOCR reading for both texts.
	    * Every other value row is matched against the fixed vocabulary of
	      gender, blood group, religion, marital status, citizenship and
	      validity values.
	    * From row 5 onwards, ``DD?MM?YYYY`` dates (years 19xx/20xx) are
	      rewritten as ``DD-MM-YYYY``.

	    Args:
	        new_data: List of rows as described above; mutated in place.
	        df: DataFrame with a ``"provinsi"`` column and, per province name, a
	            column whose non-null values are the candidates for row 1.

	    Returns:
	        The same ``new_data`` list.
	    """
	    provinsi = [f"PROVINSI {item}" for item in df["provinsi"].dropna().tolist()]
	    other_vals = [
	        "LAKI-LAKI",
	        "PEREMPUAN",
	        "A",
	        "B",
	        "AB",
	        "O",
	        "ISLAM",
	        "KRISTEN",
	        "KATOLIK",
	        "HINDU",
	        "BUDDHA",
	        "KONGHUCU",
	        "BELUM KAWIN",
	        "KAWIN",
	        "CERAI HIDUP",
	        "CERAI MATI",
	        "WNI",
	        "WNA",
	        "SEUMUR HIDUP",
	    ]
	    # "(19|20)" must be a real alternation; an escaped pipe would only match
	    # a literal "|" and the dates would never be normalised.
	    date_pattern = re.compile(r"(\d{2})\W{0,1}(\d{2})\W{0,1}((19|20)\d{2})")

	    # Stays empty until a province with a matching column has been seen.
	    kabupaten = []

	    for i, row in enumerate(new_data):
	        if i == 0 or "PROVINSI" in row[0] or "PROVINSI" in row[1]:
	            row[0], row[1] = replace_data(new_data, i, provinsi)
	            province = row[0].replace("PROVINSI ", "")
	            # An unrecognised province has no column; skip it rather than
	            # raising KeyError and aborting every later correction.
	            if province in df.columns:
	                kabupaten = df[province].dropna().tolist()
	        elif i == 1:
	            if kabupaten:
	                row[0], row[1] = replace_data(new_data, i, kabupaten)
	        elif i == 3 or new_data[i - 1][0].upper() == "NIK":
	            # Checked before the generic value branch and case-insensitively:
	            # labels are stored lower-case, so comparing against "NIK" after
	            # the value branch could never select the NIK number row.
	            row[1] = row[0]
	        elif len(row) == 2:
	            row[0], row[1] = replace_data(new_data, i, other_vals)

	        # Fix dates
	        if i > 4:
	            row[0] = date_pattern.sub(r"\1-\2-\3", row[0])
	            row[1] = date_pattern.sub(r"\1-\2-\3", row[1])
	    return new_data


	def replace_data(new_data, i, options_list):
	    """Replace row ``i`` with its best fuzzy match from ``options_list``.

	    The PaddleOCR text (index 0) is tried first, then the Tesseract text
	    (index 1); the first one whose best match scores above 80 overwrites
	    both texts of the row. With no sufficiently close match, or with an
	    empty ``options_list``, the row is left unchanged.

	    Args:
	        new_data: List of ``[paddle_text, tess_text, ...]`` rows; row ``i``
	            is mutated in place.
	        i: Index of the row to correct.
	        options_list: Candidate strings to match against.

	    Returns:
	        A ``(paddle_text, tess_text)`` tuple with the row's resulting texts.
	    """
	    row = new_data[i]
	    # extractOne yields None when there is nothing to choose from; indexing
	    # that would raise TypeError, so bail out early.
	    if len(options_list) == 0:
	        return row[0], row[1]

	    paddle_fuzz = process.extractOne(row[0], options_list, scorer=fuzz.ratio)
	    tess_fuzz = process.extractOne(row[1], options_list, scorer=fuzz.ratio)
	    for match in (paddle_fuzz, tess_fuzz):
	        if match is not None and match[1] > 80:
	            row[0] = match[0]
	            row[1] = match[0]
	            break
	    return row[0], row[1]


	def split_items(all_data):
	    """Split OCR entries that hold a label and its value into separate rows.

	    * When both readings contain a run of three or more capitals that does
	      not start the string, each reading is cut at that run: the part before
	      it becomes one row and the rest the next row.
	    * Otherwise, when either reading mentions "Darah", a space is forced
	      between "Darah" and the blood letter. The first reading (PaddleOCR,
	      then Tesseract) that ends in A, B or O is cut at its last space and
	      used for both texts of the two resulting rows. If neither reading
	      ends in a blood letter the entry is kept as a single row.
	    * Any other entry is kept as a single row.

	    Args:
	        all_data: List of ``[x1, y1, x2, y2, paddle_text, tess_text]``
	            entries; only indices 4 and 5 are read.

	    Returns:
	        A new list of ``[paddle_text, tess_text]`` rows with surrounding
	        whitespace stripped.
	    """
	    # "(A|AB|B|O)" must be a real alternation; with escaped pipes the pattern
	    # only matched the literal text "A|AB|B|O" and never fired.
	    blood_pattern = re.compile(r"(Darah)\W*((A|AB|B|O))")
	    blood_letters = ("A", "B", "O")

	    new_data = []
	    for entry in all_data:
	        paddle_text, tess_text = entry[4], entry[5]
	        paddle_idx = find_uppercase_index(paddle_text)
	        tess_idx = find_uppercase_index(tess_text)

	        if paddle_idx not in (0, -1) and tess_idx not in (0, -1):
	            new_data.append(
	                [paddle_text[:paddle_idx].strip(), tess_text[:tess_idx].strip()]
	            )
	            new_data.append(
	                [paddle_text[paddle_idx:].strip(), tess_text[tess_idx:].strip()]
	            )
	        elif "Darah" in paddle_text or "Darah" in tess_text:
	            candidates = [
	                blood_pattern.sub(r"\1 \2", reading)
	                for reading in (paddle_text, tess_text)
	            ]
	            # endswith() is safe on an empty reading, unlike indexing [-1].
	            chosen = next(
	                (text for text in candidates if text.endswith(blood_letters)),
	                None,
	            )
	            if chosen is None:
	                # No blood letter (e.g. "-"): keep the row instead of
	                # silently dropping it from the output.
	                new_data.append([paddle_text.strip(), tess_text.strip()])
	            else:
	                cut = chosen.rfind(" ")
	                label = chosen[:cut].strip()
	                value = chosen[cut + 1 :].strip()
	                new_data.append([label, label])
	                new_data.append([value, value])
	        else:
	            new_data.append([paddle_text.strip(), tess_text.strip()])

	    return new_data


	def print_output(new_data):
	    """Render the rows as text, one row per line.

	    * Rows whose two readings agree and that have exactly three elements
	      are printed upper-cased.
	    * Other rows whose readings agree are printed as they are.
	    * Rows whose readings differ are printed as ``paddle \\ tesseract`` so
	      both candidates stay visible.

	    Args:
	        new_data: List of ``[paddle_text, tess_text, ...]`` rows.

	    Returns:
	        The concatenated lines, each terminated by a newline; an empty
	        string for no rows.
	    """
	    lines = []
	    for row in new_data:
	        paddle_text, tess_text = row[0], row[1]
	        if paddle_text != tess_text:
	            # The backslash is escaped explicitly; a bare "\ " is an invalid
	            # escape sequence even though it produces the same character.
	            lines.append(f"{paddle_text} \\ {tess_text}\n")
	        elif len(row) == 3:
	            lines.append(f"{paddle_text.upper()}\n")
	        else:
	            lines.append(f"{paddle_text}\n")
	    return "".join(lines)