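"""
KTP (Indonesian ID card) OCR pipeline.

PaddleOCR detects and recognizes text on the card, Tesseract re-reads each
detected region, and the two outputs are cleaned, split into labels and
values, and fuzzy-corrected against the reference data in data.csv.
The entry point is process_image(path), which returns the extracted text
as a JSON string.
"""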
from paddleocr import PaddleOCR
import os
import cv2
import pytesseract
import pandas as pd
import re
from thefuzz import fuzz
from thefuzz import process
import logging
import json

logging.getLogger().setLevel(logging.ERROR)

def process_image(path):
    """
    The main function: performs optical character recognition (OCR) on an ID card image and processes the extracted data.

    Args:
        path (str): The path to the image file.

    Returns:
        str: A JSON string containing the processed text output.
    """
    csv_path = 'data.csv'
    data_dict = {
        "provinsi": "",
        "kabupaten": "",
        "nik": "",
        "nama": "",
        "tempat/tgl lahir": "",
        "jenis kelamin": "",
        "gol. darah": "",
        "alamat": "",
        "rt/rw": "",
        "kel/desa": "",
        "kecamatan": "",
        "agama": "",
        "status perkawinan": "",
        "pekerjaan": "",
        "kewarganegaraan": "",
        "berlaku hingga": "",
    }
    # Create the list of labels used for spelling correction
    labels = list(data_dict.keys())
    labels.remove("kabupaten")
    try:
        # Read the CSV reference data file
        df = pd.read_csv(csv_path)
    except Exception:
        raise ValueError("Cannot find the csv data file.")
    try:
        # Resize the image
        image = resize_image(path)
        # Run Tesseract to find the right rotation and convert the color space
        image_xyz = rotate_image(image)
    except Exception:
        raise ValueError("Invalid image input.")
    # Run PaddleOCR on the whole image and Tesseract on the areas detected by PaddleOCR
    all_data = run_ocr(image_xyz)
    # Check that the 16-digit ID number exists
    all_data = check_numbers(all_data)
    # Split labels and data
    new_data = split_items(all_data)
    try:
        # Correct the text of the labels
        new_data, found_labels = correct_labels(new_data, labels)
        # Correct the data
        new_data = correct_data(new_data, df)
    except Exception:
        pass
    try:
        # Add labels if any are missing
        new_data = add_missing_labels(new_data, labels, found_labels)
    except Exception:
        pass
    # Build the clean output
    text = print_output(new_data)
    # Convert to JSON
    text_obj = json.dumps({"text": text})
    return text_obj

def get_scores(result):
    """
    Get confidence scores from the OCR result.

    Args:
        result (list): The OCR result list.

    Returns:
        tuple: A tuple containing the overall score, the sorted confidence scores, and all scores.
    """
    scores = [round(line[1][1], 4) for line in result[0]]
    overall_score = round(sum(scores) / len(scores), 4)
    sorted_scores = sorted(scores)
    # Raise an error if the 3rd-lowest confidence score is below 90%
    if sorted_scores[2] < 0.9:
        raise ValueError("Poor image quality. Please avoid shadows, flashlights, and patterned backgrounds.")
    return overall_score, sorted_scores, scores

def add_missing_labels(new_data, labels, found_labels):
    """
    Insert any labels that the OCR missed, based on the positions of the detected labels.

    Args:
        new_data (list): The extracted data list.
        labels (list): The list of valid labels.
        found_labels (list): The list of detected labels and their indexes.

    Returns:
        list: The extracted data list with the missing labels inserted.
    """
    # Only fill in labels when one or two of the expected labels are missing
    if 12 < len(found_labels) < 15:
        added = 0
        for i in range(len(labels)):
            if labels[i] != found_labels[i][0]:
                # Use the next label's index - 2 + the number of shifted items;
                # if that fails, use the previous label's index + 2 + the number of shifted items
                try:
                    if labels[i] == "gol. darah":
                        idx = found_labels[i][1] + added
                    elif labels[i] == "alamat":
                        # Get the "gol. darah" index and check if the next item is longer than two characters
                        gol_idx = new_data.index("gol. darah")
                        if len(new_data[gol_idx + 1]) > 2:
                            idx = gol_idx + 1
                        else:
                            idx = gol_idx + 2
                    else:
                        idx = found_labels[i + 1][1] - 2 + added
                except Exception:
                    idx = found_labels[i - 1][1] + 2 + added
                if idx < len(new_data) - 1:
                    new_data.insert(idx, [labels[i], labels[i], 'label'])
                    found_labels.insert(i, [labels[i], idx])
                else:
                    new_data.insert(len(new_data) - 2, [labels[i], labels[i], 'label'])
                    found_labels.insert(i, [labels[i], len(new_data) - 2])
                added += 1
    else:
        raise ValueError("Some labels cannot be detected. Please recapture a photo of the ID.")
    return new_data

def check_numbers(all_data):
    """
    Check if there is a 16-digit number in the OCR text.

    Args:
        all_data (list): The structured OCR result list.

    Returns:
        list: A list containing the structured OCR output.
    """
    ktp_num = ""
    for i in range(len(all_data)):
        id_output = re.findall(r"\d{16}", all_data[i][4])
        rt_output = re.findall(r"\d{3}/\d{3}", all_data[i][4])
        if len(id_output) > 0:
            # Keep the PaddleOCR output for both engines
            ktp_num, all_data[i][4], all_data[i][5] = id_output[0], id_output[0], id_output[0]
        if len(rt_output) > 0:
            all_data[i][4], all_data[i][5] = rt_output[0], rt_output[0]
    if ktp_num == "":
        raise ValueError("KTP number cannot be detected. Please recapture a photo of the ID.")
    return all_data

def run_ocr(image):
    """
    Perform optical character recognition (OCR) on the given image.

    Args:
        image (ndarray): The image array on which OCR will be performed.

    Returns:
        list: A list containing information about the recognized text regions, including coordinates, recognized text,
        and corresponding OCR outputs from different OCR engines.
    """
    ocr = PaddleOCR(
        use_angle_cls=True,
        lang="id",
        det_max_side_len=1500,
        det_limit_type="min",
        det_db_unclip_ratio=1.7,
        drop_score=0.75,
        show_log=False,
    )
    result = ocr.ocr(image, cls=True)
    all_data = []
    # Check if the confidence scores are higher than the threshold
    get_scores(result)
    # Create a list of values in the form x1, y1, x2, y2, Paddle output, Tesseract output
    for res in result[0]:
        x, y = [], []
        paddle_text = res[1][0]
        for point in res[0]:
            x.append(point[0])
            y.append(point[1])
        x1, y1, x2, y2 = int(min(x)), int(min(y)), int(max(x)), int(max(y))
        # Crop the area of text detected by Paddle
        snip = image[y1:y2, x1:x2]
        # Run Tesseract on the cropped area
        tess_text = pytesseract.image_to_string(snip, lang="ind+eng", config="--psm 6")
        # Clean the output of Tesseract and Paddle
        tess_text, paddle_text = clean_text(tess_text, paddle_text)
        all_data.append([x1, y1, x2, y2, paddle_text, tess_text])
    return all_data

def clean_text(tess_text, paddle_text):
    """
    Clean and preprocess the recognized text from Tesseract and PaddleOCR.

    Args:
        tess_text (str): Text recognized by Tesseract OCR.
        paddle_text (str): Text recognized by PaddleOCR.

    Returns:
        tuple: A tuple containing the cleaned and preprocessed text from Tesseract and PaddleOCR, respectively.
    """
    # Remove newline and form-feed characters
    if "\n" in tess_text or "\x0c" in tess_text:
        tess_text = tess_text.replace("\n", "")
        tess_text = tess_text.replace("\x0c", "")
    # Remove spaces before or after a hyphen, colon, or asterisk
    pattern = r"\s*([-:*])\s*"
    paddle_text = re.sub(pattern, r"\1", paddle_text)
    tess_text = re.sub(pattern, r"\1", tess_text)
    # Replace any 1O with 10
    paddle_text = paddle_text.replace("1O", "10")
    tess_text = tess_text.replace("1O", "10")
    # Fix dots in the ID number
    pattern = r"[0-9\.]{10}"
    res = re.findall(pattern, paddle_text)
    if len(res) != 0:
        paddle_text = paddle_text.replace(".", "")
    # Add a space after a dot that follows an uppercase letter
    paddle_text = re.sub(r"([A-Z]\.)([A-Za-z])", r"\1 \2", paddle_text)
    # Fix commas recognized as dots and add a space after them
    if "NO" not in paddle_text:
        pattern = r"([A-Za-z][\.,]\s{0,1})(\d{2})"
        paddle_text = re.sub(pattern, r", \2", paddle_text)
        tess_text = re.sub(pattern, r", \2", tess_text)
    else:
        pattern = r"([A-Za-z][\.]\s{0,1})(\d{1})"
        paddle_text = re.sub(pattern, r". \2", paddle_text)
        tess_text = re.sub(pattern, r". \2", tess_text)
    # Clean the blood group
    if "Darah" in tess_text or "Darah" in paddle_text:
        tess_text = tess_text.replace("0", "O")
        paddle_text = paddle_text.replace("0", "O")
    # Remove stray symbols
    for item in ["'", '"', "!", "‘", "“", ":", "*", "=", "+"]:
        paddle_text = paddle_text.replace(item, "")
        tess_text = tess_text.replace(item, "")
    # Remove a hyphen, dot, or comma at the beginning of the text
    if len(tess_text) > 0 and tess_text[0] in ['-', '.', ',']:
        tess_text = tess_text[1:]
    if len(paddle_text) > 0 and paddle_text[0] in ['-', '.', ',']:
        paddle_text = paddle_text[1:]
    # If the Paddle text equals the Tesseract text without spaces, use the Tesseract text
    temp = tess_text.replace(" ", "")
    if paddle_text == temp:
        paddle_text = tess_text
    # If JL is at the beginning of the text, add the dot
    if paddle_text[:2] == "JL" or tess_text[:2] == "JL":
        paddle_text = re.sub(r"(JL)(\.{0,1})([A-Z])", r"JL. \3", paddle_text)
        tess_text = re.sub(r"(JL)(\.{0,1})([A-Z])", r"JL. \3", tess_text)
    # Add missing spaces to the Paddle output, based on the spaces in the Tesseract output
    idxs = []
    for i, char in enumerate(tess_text):
        if char.isspace():
            idxs.append(i)
    for idx in idxs:
        try:
            p1 = tess_text[idx - 2:idx]
            p2 = tess_text[idx + 1:idx + 3]
            if p1.isalpha() and p2.isalpha():
                to_replace = p1 + p2
                new = p1 + " " + p2
                paddle_text = paddle_text.replace(to_replace, new)
        except Exception:
            pass
    return tess_text, paddle_text

def resize_image(path):
    """
    Resize the image if its dimensions are smaller than the specified threshold.

    Args:
        path (str): The path to the image file.

    Returns:
        ndarray: The resized image array.
    """
    img = cv2.imread(path)
    width = int(img.shape[1])
    height = int(img.shape[0])
    thresh = 1500
    # Resize the image so its larger side matches the threshold
    if width < thresh and height < thresh:
        if width > height:
            percent = thresh / width
        else:
            percent = thresh / height
        dim = (int(width * percent), int(height * percent))
        img = cv2.resize(img, dim, interpolation=cv2.INTER_AREA)
    return img

def rotate_image(image):
    """
    Rotate the image to the correct orientation by checking for specific text patterns in different rotations.

    Args:
        image (ndarray): The image array to be rotated.

    Returns:
        ndarray: The rotated image array if specific text patterns are found, otherwise the original image array.
    """
    # Convert the color space to XYZ
    image_xyz = cv2.cvtColor(image, cv2.COLOR_BGR2XYZ)
    # Rotate the image by 90 degrees up to 4 times until some expected text is recognized
    for i in range(4):
        text = pytesseract.image_to_string(image_xyz, lang="ind+eng", config="--psm 6")
        if "PROVINSI" in text or "Darah" in text or "NIK" in text:
            return image_xyz
        image_xyz = cv2.rotate(image_xyz, cv2.ROTATE_90_CLOCKWISE)
    # If no text is found after the last rotation, return the image in its original rotation
    return image_xyz

def correct_labels(new_data, labels):
    """
    Correct the labels of the extracted data by matching them with a list of valid labels.

    Args:
        new_data (list): The extracted data list to be corrected.
        labels (list): The list of valid labels.

    Returns:
        tuple: The corrected extracted data list with updated labels, and the list of found labels
        with their corresponding indexes.
    """
    thresh = 75
    found_labels = [["provinsi", 0]]
    for i in range(len(new_data)):
        paddle_fuzz = process.extractOne(new_data[i][0], labels, scorer=fuzz.ratio)
        tess_fuzz = process.extractOne(new_data[i][1], labels, scorer=fuzz.ratio)
        # Skip "provinsi" because it is already added at index 0
        if paddle_fuzz[0] != 'provinsi' and tess_fuzz[0] != 'provinsi':
            # Correct the text using whichever match meets the threshold
            if paddle_fuzz[1] >= thresh:
                new_data[i][0] = paddle_fuzz[0]
                new_data[i][1] = paddle_fuzz[0]
                new_data[i].append("label")
                found_labels.append([paddle_fuzz[0], i])
            elif tess_fuzz[1] >= thresh:
                new_data[i][0] = tess_fuzz[0]
                new_data[i][1] = tess_fuzz[0]
                new_data[i].append("label")
                found_labels.append([tess_fuzz[0], i])
            # Correct "NIK"
            elif (len(new_data[i][0]) == 3 or len(new_data[i][1]) == 3) and (
                "IK" == new_data[i][0] or "IK" in new_data[i][1]
            ):
                new_data[i][0] = "nik"
                new_data[i][1] = "nik"
                new_data[i].append("label")
                found_labels.append(["nik", i])
    return new_data, found_labels

def find_uppercase_index(text):
    """
    Find the index of the first uppercase word in the given text.

    Args:
        text (str): The input text.

    Returns:
        int: The index of the first uppercase word, or -1 if no uppercase word is found.
    """
    # Match a run of 3 or more uppercase letters that is not preceded by an uppercase letter
    pattern = r"(?<![A-Z])[A-Z]{3,}"
    match = re.search(pattern, text)
    if match:
        return match.start()
    return -1

def correct_data(new_data, df):
    """
    Correct the extracted data based on reference data from a DataFrame.

    Args:
        new_data (list): The extracted data list to be corrected.
        df (DataFrame): The reference DataFrame containing the data for correction.

    Returns:
        list: The corrected extracted data list.
    """
    # Build the lists used for text correction
    provinsi_df = df["provinsi"].dropna().tolist()
    provinsi = [f"PROVINSI {item}" for item in provinsi_df]
    other_vals = [
        "LAKI-LAKI",
        "PEREMPUAN",
        "A",
        "B",
        "AB",
        "O",
        "ISLAM",
        "KRISTEN",
        "KATOLIK",
        "HINDU",
        "BUDHA",
        "KONGHUCU",
        "BELUM KAWIN",
        "KAWIN",
        "CERAI HIDUP",
        "CERAI MATI",
        "WNI",
        "WNA",
        "SEUMUR HIDUP",
    ]
    paddle_except_city = []
    for i in range(len(new_data)):
        # Fix Provinsi
        if i == 0 or ("PROVINSI" in new_data[i][0] or "PROVINSI" in new_data[i][1]):
            new_data[i][0], new_data[i][1] = replace_data(new_data, i, provinsi)
            kabupaten = df[new_data[i][0].replace("PROVINSI ", "")].dropna().tolist()
        # Fix Kabupaten
        elif i == 1:
            try:
                new_data[i][0], new_data[i][1] = replace_data(new_data, i, kabupaten)
            except Exception:
                pass
        # Fix other values such as religion
        elif len(new_data[i]) == 2:
            new_data[i][0], new_data[i][1] = replace_data(new_data, i, other_vals)
        # Fix NIK
        elif i == 3 or new_data[i - 1][0].upper() == "NIK":
            new_data[i][1] = new_data[i][0]
        # Fix dates
        if i > 4:
            pattern = r"(\d{2})\W{0,1}(\d{2})\W{0,1}((19|20)\d{2})"
            new_data[i][0] = re.sub(pattern, r"\1-\2-\3", new_data[i][0])
            new_data[i][1] = re.sub(pattern, r"\1-\2-\3", new_data[i][1])
        if i != 1:
            paddle_except_city.append(new_data[i][0])
    # Add WNI if neither WNI nor WNA is present
    paddle_temp = [data[0] for data in new_data]
    tess_temp = [data[1] for data in new_data]
    if not {"WNI", "WNA"}.intersection(set(paddle_temp)) and not {"WNI", "WNA"}.intersection(set(tess_temp)):
        try:
            kew_idx = paddle_temp.index("kewarganegaraan")
            new_data.insert(kew_idx + 1, ["WNI", "WNI"])
        except Exception:
            pass
    # Fix the issuer's province name if it is similar to the province name on line 2
    issuer_fuzz = process.extractOne(new_data[1][0], paddle_except_city, scorer=fuzz.ratio)
    if issuer_fuzz[1] >= 85:
        for i in range(len(new_data)):
            if new_data[i][0] == issuer_fuzz[0]:
                new_data[i][0], new_data[i][1] = new_data[1][0], new_data[1][0]
    return new_data

def replace_data(new_data, i, options_list):
    """
    Replace the data in the extracted list with the closest matching option from the given list.

    Args:
        new_data (list): The extracted data list.
        i (int): The index of the item to be replaced.
        options_list (list): The list of options for replacement.

    Returns:
        tuple: A tuple containing the replaced values for the item at index i.
    """
    paddle_fuzz = process.extractOne(new_data[i][0], options_list, scorer=fuzz.ratio)
    tess_fuzz = process.extractOne(new_data[i][1], options_list, scorer=fuzz.ratio)
    # Replace the values if the fuzzy matching score exceeds the threshold
    if len(new_data[i][0]) < 4:
        thresh = 65
    else:
        thresh = 75
    if paddle_fuzz[1] > thresh:
        new_data[i][0] = paddle_fuzz[0]
        new_data[i][1] = paddle_fuzz[0]
    elif tess_fuzz[1] > thresh:
        new_data[i][0] = tess_fuzz[0]
        new_data[i][1] = tess_fuzz[0]
    return new_data[i][0], new_data[i][1]

def split_items(all_data):
    """
    Split the data items in the given list into separate items based on certain conditions.

    Args:
        all_data (list): The list of data items to be split.

    Returns:
        list: The new list of split data items.
    """
    new_data = []
    for i in range(len(all_data)):
        paddle_idx = find_uppercase_index(all_data[i][4])
        tess_idx = find_uppercase_index(all_data[i][5])
        if paddle_idx not in [0, -1] and tess_idx not in [0, -1]:
            p1 = [all_data[i][4][:paddle_idx].strip(), all_data[i][5][:tess_idx].strip()]
            p2 = [all_data[i][4][paddle_idx:].strip(), all_data[i][5][tess_idx:].strip()]
            if p1 != ["", ""]:
                new_data.append(p1)
            if p2 != ["", ""]:
                new_data.append(p2)
        # Fix the text related to the blood type
        elif "Darah" in all_data[i][4] or "Darah" in all_data[i][5]:
            # Add a space between the blood type and its label
            darah_match_1 = re.sub(r"(Darah)\W*((A|AB|B|O))", r"\1 \2", all_data[i][4])
            darah_match_2 = re.sub(r"(Darah)\W*((A|AB|B|O))", r"\1 \2", all_data[i][5])
            # Locate the space
            space_1 = darah_match_1.rfind(" ")
            space_2 = darah_match_2.rfind(" ")
            # Write the label and values into two separate lists
            try:
                if darah_match_1[-1] in ["A", "B", "O"]:
                    new_data.append(
                        [darah_match_1[:space_1].strip(), darah_match_1[:space_1].strip()]
                    )
                    new_data.append(
                        [
                            darah_match_1[space_1 + 1:].strip(),
                            darah_match_1[space_1 + 1:].strip(),
                        ]
                    )
                elif darah_match_2[-1] in ["A", "B", "O"]:
                    new_data.append(
                        [darah_match_2[:space_2].strip(), darah_match_2[:space_2].strip()]
                    )
                    new_data.append(
                        [
                            darah_match_2[space_2 + 1:].strip(),
                            darah_match_2[space_2 + 1:].strip(),
                        ]
                    )
            except Exception:
                pass
        else:
            new_data.append([all_data[i][4].strip(), all_data[i][5].strip()])
    return new_data

def print_output(new_data):
    """
    Create a formatted string output based on the given data.

    Args:
        new_data (list): The list of data items.

    Returns:
        str: The formatted string output.
    """
    text = ""
    for i in range(len(new_data)):
        # Change labels to uppercase
        if new_data[i][0] == new_data[i][1] and len(new_data[i]) == 3:
            text += f"{new_data[i][0].upper()}\n"
        elif len(new_data[i][0]) > 0:
            text += f"{new_data[i][0]}\n"
    return text

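# A minimal usage sketch, not part of the pipeline itself. It assumes a local
# image file named "ktp_sample.jpg" (a hypothetical path) and that "data.csv"
# with the province/kabupaten reference data sits next to this script.
if __name__ == "__main__":
    result = process_image("ktp_sample.jpg")
    print(json.loads(result)["text"])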