# Spaces: Runtime error
# (The line above appears to be status text captured from the hosting page; it is not program code.)
import logging
import re

import cv2
import pandas as pd
import pytesseract
from paddleocr import PaddleOCR
from thefuzz import fuzz
from thefuzz import process
def extract_data(img):
    """Run the whole OCR pipeline on one image and return the result as text.

    The image is resized, rotated until it is readable, OCR'd, split into
    label/value rows, corrected against known labels and reference values,
    and finally rendered to a multi-line string.

    Args:
        img: Image array exposing ``.shape`` (it is handed to ``cv2.resize``
            and ``cv2.cvtColor``; presumably a BGR array, see rotate_image).

    Returns:
        The text produced by ``print_output`` for the recognised rows.

    Raises:
        Whatever ``pd.read_csv``, the image helpers, ``run_ocr`` or
        ``split_items`` raise; only the correction step is best-effort.
    """
    data_dict = {
        "provinsi": "",
        "kabupaten": "",
        "nik": "",
        "nama": "",
        "tempat/tgl lahir": "",
        "jenis kelamin": "",
        "gol darah": "",
        "alamat": "",
        "rt/rw": "",
        "kel/desa": "",
        "kecamatan": "",
        "agama": "",
        "status perkawinan": "",
        "pekerjaan": "",
        "kewarganegaraan": "",
        "berlaku hingga": "",
    }
    # "kabupaten" is left out of the label candidates; correct_data handles
    # that row by its position instead of by a label match.
    labels = [key for key in data_dict if key != "kabupaten"]

    df = pd.read_csv("data.csv")
    image = resize_image(img)
    image_xyz = rotate_image(image)
    all_data = run_ocr(image_xyz)
    new_data = split_items(all_data)

    try:
        new_data = correct_labels(new_data, labels)
        new_data = correct_data(new_data, df)
    except Exception:
        # Correction stays best-effort (the uncorrected or partially corrected
        # rows are still printed), but the failure is now logged instead of
        # being swallowed, and KeyboardInterrupt/SystemExit propagate.
        logging.getLogger(__name__).exception(
            "Label/data correction failed; returning partially corrected OCR output"
        )

    return print_output(new_data)
def run_ocr(image):
    """Detect text regions with PaddleOCR and re-read each one with Tesseract.

    Args:
        image: Image array that supports 2-D slicing (``image[y1:y2, x1:x2]``).

    Returns:
        A list with one entry per detected region, each of the form
        ``[x1, y1, x2, y2, paddle_text, tess_text]`` where the coordinates are
        the axis-aligned bounding box of the detected quadrilateral and both
        texts have been passed through ``clean_text``. Returns an empty list
        when PaddleOCR reports no text.
    """
    ocr = PaddleOCR(
        use_angle_cls=True,
        lang="id",
        det_max_side_len=1500,
        det_limit_type="min",
        det_db_unclip_ratio=1.7,
        show_log=False,
    )
    result = ocr.ocr(image, cls=True)

    # When nothing is detected the per-image entry can be None (or the result
    # empty); iterating it would raise TypeError/IndexError.
    if not result or result[0] is None:
        return []

    all_data = []
    for index, res in enumerate(result[0]):
        box = res[0]
        paddle_text = res[1][0]

        # Collapse the four corner points into an axis-aligned rectangle.
        # (The original inner loop reused the outer loop variable here.)
        xs = [point[0] for point in box]
        ys = [point[1] for point in box]
        x1, y1, x2, y2 = int(min(xs)), int(min(ys)), int(max(xs)), int(max(ys))

        snip = image[y1:y2, x1:x2]
        tess_text = pytesseract.image_to_string(snip, lang="ind+eng", config="--psm 6")
        tess_text, paddle_text = clean_text(tess_text, paddle_text, index)
        all_data.append([x1, y1, x2, y2, paddle_text, tess_text])
    return all_data
def clean_text(tess_text, paddle_text, i):
    """Normalise the two OCR readings of one text region.

    Args:
        tess_text: Text read by Tesseract.
        paddle_text: Text read by PaddleOCR.
        i: Index of the region; accepted for call compatibility, not used.

    Returns:
        ``(tess_text, paddle_text)`` after cleaning.
    """
    # Tesseract output carries line breaks and a form-feed terminator.
    tess_text = tess_text.replace("\n", "").replace("\x0c", "")

    # Drop whitespace on either side of a colon or hyphen.
    separator_spacing = r"\s*([-:])\s*"
    paddle_text, tess_text = (
        re.sub(separator_spacing, r"\1", text) for text in (paddle_text, tess_text)
    )

    # On the blood-group line a zero is treated as a misread letter "O".
    if any("Darah" in text for text in (tess_text, paddle_text)):
        tess_text = tess_text.replace("0", "O")
        paddle_text = paddle_text.replace("0", "O")

    # Strip quote marks, exclamation marks and colons in a single pass.
    unwanted = str.maketrans("", "", "'\"!‘“:")
    return tess_text.translate(unwanted), paddle_text.translate(unwanted)
def resize_image(img):
    """Enlarge a small image by a whole-number factor.

    Images whose width or height is already 1500 px or more are returned
    untouched. Otherwise both sides are multiplied by
    ``1500 // longer_side`` and the image is resized with ``cv2.INTER_AREA``.

    Args:
        img: Image array whose ``shape`` starts with ``(height, width)``.

    Returns:
        The original image, or the resized one.
    """
    thresh = 1500
    height, width = int(img.shape[0]), int(img.shape[1])

    # Guard clause: nothing to do once either side reaches the threshold.
    if width >= thresh or height >= thresh:
        return img

    # Integer scale factor derived from the longer side.
    percent = thresh // max(width, height)
    target_size = (width * percent, height * percent)
    return cv2.resize(img, target_size, interpolation=cv2.INTER_AREA)
def rotate_image(image):
    """Convert the image to XYZ colour space and turn it until it is readable.

    The image is OCR'd with Tesseract in up to four orientations, rotating
    90 degrees clockwise after each failed attempt. An orientation counts as
    readable when the text contains "PROVINSI", "Darah" or "NIK".

    Args:
        image: Image array accepted by ``cv2.cvtColor`` with
            ``cv2.COLOR_BGR2XYZ``.

    Returns:
        The XYZ image in the first readable orientation. If none is readable
        the image has been rotated four quarter-turns, i.e. it is back in its
        starting orientation.
    """
    keywords = ("PROVINSI", "Darah", "NIK")
    image_xyz = cv2.cvtColor(image, cv2.COLOR_BGR2XYZ)

    for _ in range(4):
        text = pytesseract.image_to_string(image_xyz, lang="ind+eng", config="--psm 6")
        if any(keyword in text for keyword in keywords):
            return image_xyz
        image_xyz = cv2.rotate(image_xyz, cv2.ROTATE_90_CLOCKWISE)

    return image_xyz
def correct_labels(new_data, labels):
    """Replace rows that look like a field label with the canonical label.

    Each row holds the PaddleOCR reading at index 0 and the Tesseract reading
    at index 1. A row is treated as a label when either reading fuzzy-matches
    one of ``labels`` with a score of at least 80 (PaddleOCR is checked
    first), or when it passes the special-case test for a misread "NIK".
    Matching rows get both readings overwritten and the marker ``"label"``
    appended.

    Args:
        new_data: List of ``[paddle_text, tess_text]`` rows; modified in place.
        labels: Canonical label strings to match against.

    Returns:
        The same ``new_data`` list.
    """
    thresh = 80
    for row in new_data:
        paddle_fuzz = process.extractOne(row[0], labels, scorer=fuzz.ratio)
        tess_fuzz = process.extractOne(row[1], labels, scorer=fuzz.ratio)

        if paddle_fuzz[1] >= thresh:
            canonical = paddle_fuzz[0]
        elif tess_fuzz[1] >= thresh:
            canonical = tess_fuzz[0]
        elif (len(row[0]) == 3 or len(row[1]) == 3) and (
            "IK" == row[0] or "IK" in row[1]
        ):
            # Three-character reading containing "IK": taken to be "NIK".
            canonical = "nik"
        else:
            continue

        row[0] = canonical
        row[1] = canonical
        row.append("label")
    return new_data
def find_uppercase_index(text):
    """Return the start of the first run of three or more capital letters.

    Args:
        text: String to search.

    Returns:
        Index of the first character of the first ``[A-Z]{3,}`` run that is
        not preceded by another capital letter, or ``-1`` when there is none.
    """
    found = re.search(r"(?<![A-Z])[A-Z]{3,}", text)
    return found.start() if found else -1
def correct_data(new_data, df):
    """Snap OCR'd values to known reference values and normalise dates.

    Rows hold the PaddleOCR reading at index 0 and the Tesseract reading at
    index 1; rows tagged as labels by ``correct_labels`` have a third element.

    Row handling, in order of precedence:
      * row 0, or any row containing "PROVINSI": matched against the province
        list; the matching province's kabupaten list is then loaded from ``df``.
      * row 1: matched against that kabupaten list (skipped when it is empty).
      * row 3, or the row right after a "NIK" label: the PaddleOCR reading is
        copied over the Tesseract reading.
      * remaining value rows (exactly two elements): matched against a fixed
        list of gender, blood-group, religion, marital-status, citizenship and
        validity values.
    Rows after index 4 additionally get ``dd?mm?yyyy`` dates rewritten as
    ``dd-mm-yyyy``.

    Args:
        new_data: List of rows; modified in place.
        df: DataFrame with a ``"provinsi"`` column and one column per province
            name holding that province's kabupaten values.

    Returns:
        The same ``new_data`` list.
    """
    provinsi = [f"PROVINSI {item}" for item in df["provinsi"].dropna().tolist()]
    other_vals = [
        "LAKI-LAKI",
        "PEREMPUAN",
        "A",
        "B",
        "AB",
        "O",
        "ISLAM",
        "KRISTEN",
        "KATOLIK",
        "HINDU",
        "BUDDHA",
        "KONGHUCU",
        "BELUM KAWIN",
        "KAWIN",
        "CERAI HIDUP",
        "CERAI MATI",
        "WNI",
        "WNA",
        "SEUMUR HIDUP",
    ]
    # Compiled once instead of on every row.
    date_pattern = re.compile(r"(\d{2})\W{0,1}(\d{2})\W{0,1}((19|20)\d{2})")
    kabupaten = []

    for i, row in enumerate(new_data):
        if i == 0 or "PROVINSI" in row[0] or "PROVINSI" in row[1]:
            row[0], row[1] = replace_data(new_data, i, provinsi)
            province = row[0].replace("PROVINSI ", "")
            # An unrecognised province used to raise KeyError here and abort
            # every later correction; fall back to "no kabupaten known".
            kabupaten = df[province].dropna().tolist() if province in df.columns else []
        elif i == 1:
            # Explicit guard instead of a bare except around the lookup.
            if kabupaten:
                row[0], row[1] = replace_data(new_data, i, kabupaten)
        elif i == 3 or new_data[i - 1][0].upper() == "NIK":
            # Must be tested before the generic value branch, otherwise the
            # NIK value row (two elements) never reaches it. Labels are stored
            # lowercase by correct_labels, hence the case-insensitive compare.
            row[1] = row[0]
        elif len(row) == 2:
            row[0], row[1] = replace_data(new_data, i, other_vals)

        # Fix dates
        if i > 4:
            row[0] = date_pattern.sub(r"\1-\2-\3", row[0])
            row[1] = date_pattern.sub(r"\1-\2-\3", row[1])
    return new_data
def replace_data(new_data, i, options_list):
    """Overwrite row ``i`` with its best fuzzy match from ``options_list``.

    The PaddleOCR reading (index 0) is tried first, then the Tesseract reading
    (index 1). The first one whose best match scores above 80 wins and
    replaces both readings; if neither does, the row is left unchanged.

    Args:
        new_data: List of rows; row ``i`` is modified in place.
        i: Index of the row to correct.
        options_list: Candidate strings to match against.

    Returns:
        ``(new_data[i][0], new_data[i][1])`` after the possible replacement.
    """
    row = new_data[i]
    paddle_fuzz = process.extractOne(row[0], options_list, scorer=fuzz.ratio)
    tess_fuzz = process.extractOne(row[1], options_list, scorer=fuzz.ratio)

    for best in (paddle_fuzz, tess_fuzz):
        if best[1] > 80:
            row[0] = row[1] = best[0]
            break

    return row[0], row[1]
def split_items(all_data):
    """Split OCR regions that contain both a label and a value into two rows.

    Args:
        all_data: Entries as produced by ``run_ocr``; index 4 is the PaddleOCR
            text and index 5 the Tesseract text.

    Returns:
        A new list of ``[paddle_text, tess_text]`` rows:
          * if both readings contain a run of capitals that does not start the
            string, each reading is cut at that run (label row, value row);
          * otherwise, if either reading contains "Darah" and one of them ends
            in A, B or O, that reading is cut at its last space and used for
            both columns (label row, blood-group row);
          * otherwise the region is kept as a single stripped row.
    """
    blood_pattern = re.compile(r"(Darah)\W*((A|AB|B|O))")
    new_data = []

    for item in all_data:
        paddle_text, tess_text = item[4], item[5]
        paddle_idx = find_uppercase_index(paddle_text)
        tess_idx = find_uppercase_index(tess_text)

        if paddle_idx not in (0, -1) and tess_idx not in (0, -1):
            new_data.append(
                [paddle_text[:paddle_idx].strip(), tess_text[:tess_idx].strip()]
            )
            new_data.append(
                [paddle_text[paddle_idx:].strip(), tess_text[tess_idx:].strip()]
            )
        elif "Darah" in paddle_text or "Darah" in tess_text:
            # Make sure the blood group is separated from "Darah" by a space.
            candidates = (
                blood_pattern.sub(r"\1 \2", paddle_text),
                blood_pattern.sub(r"\1 \2", tess_text),
            )
            for candidate in candidates:
                # endswith() is safe on an empty reading, unlike text[-1].
                if candidate.endswith(("A", "B", "O")):
                    cut = candidate.rfind(" ")
                    label = candidate[:cut].strip()
                    value = candidate[cut + 1 :].strip()
                    new_data.append([label, label])
                    new_data.append([value, value])
                    break
            else:
                # No blood group was read: keep the row instead of dropping it.
                new_data.append([paddle_text.strip(), tess_text.strip()])
        else:
            new_data.append([paddle_text.strip(), tess_text.strip()])
    return new_data
def print_output(new_data):
    """Render the rows as text, one line per row.

    Args:
        new_data: List of rows with the PaddleOCR reading at index 0 and the
            Tesseract reading at index 1; label rows have a third element.

    Returns:
        A string with a trailing newline after every row:
          * readings agree and the row has three elements: the text upper-cased;
          * readings agree otherwise: the text as is;
          * readings differ: ``"<paddle> \\ <tesseract>"``.
        An empty list yields an empty string.
    """
    lines = []
    for row in new_data:
        paddle_text, tess_text = row[0], row[1]
        if paddle_text != tess_text:
            # The backslash is escaped explicitly; a bare "\ " is an invalid
            # escape sequence and warns on current Python versions.
            lines.append(f"{paddle_text} \\ {tess_text}\n")
        elif len(row) == 3:
            lines.append(f"{paddle_text.upper()}\n")
        else:
            lines.append(f"{paddle_text}\n")
    return "".join(lines)