Spaces:
Sleeping
Sleeping
update KTP scanning
Browse files- Dockerfile +12 -1
- apt.txt +3 -0
- backend/file_uploads.py +2 -2
- backend/file_utils.py +4 -4
- backend/text_recog.py +41 -10
Dockerfile
CHANGED
@@ -1,6 +1,17 @@
|
|
1 |
FROM python:3.10-slim
|
2 |
|
3 |
-
RUN apt-get update && apt-get install -y
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4 |
|
5 |
COPY requirements.txt .
|
6 |
RUN pip install -r requirements.txt
|
|
|
1 |
FROM python:3.10-slim
|
2 |
|
3 |
+
RUN apt-get update && apt-get install -y \
|
4 |
+
tesseract-ocr \
|
5 |
+
tesseract-ocr-ind \
|
6 |
+
libtesseract-dev \
|
7 |
+
libleptonica-dev \
|
8 |
+
poppler-utils \
|
9 |
+
libsm6 \
|
10 |
+
libxext6 \
|
11 |
+
libxrender-dev \
|
12 |
+
build-essential \
|
13 |
+
&& apt-get clean \
|
14 |
+
&& rm -rf /var/lib/apt/lists/*
|
15 |
|
16 |
COPY requirements.txt .
|
17 |
RUN pip install -r requirements.txt
|
apt.txt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
tesseract-ocr
|
2 |
+
libtesseract-dev
|
3 |
+
tesseract-ocr-ind
|
backend/file_uploads.py
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
from flask import Flask, request, jsonify
|
2 |
from werkzeug.utils import secure_filename
|
3 |
from .parser import parse_promotion_pdf, parse_promotion_excel
|
4 |
-
from .text_recog import
|
5 |
import os
|
6 |
|
7 |
app = Flask(__name__)
|
@@ -34,7 +34,7 @@ def upload_file():
|
|
34 |
if filename.endswith('.pdf'):
|
35 |
parsed_file = parse_promotion_pdf(filepath)
|
36 |
elif filename.endswith(('.png', '.jpg', '.jpeg', '.bmp', '.tiff')):
|
37 |
-
parsed_file =
|
38 |
elif filename.endswith('.xlsx' or '.csv'):
|
39 |
parsed_file = parse_promotion_excel(filepath, filename)
|
40 |
else:
|
|
|
1 |
from flask import Flask, request, jsonify
|
2 |
from werkzeug.utils import secure_filename
|
3 |
from .parser import parse_promotion_pdf, parse_promotion_excel
|
4 |
+
from .text_recog import extract_ktp_info
|
5 |
import os
|
6 |
|
7 |
app = Flask(__name__)
|
|
|
34 |
if filename.endswith('.pdf'):
|
35 |
parsed_file = parse_promotion_pdf(filepath)
|
36 |
elif filename.endswith(('.png', '.jpg', '.jpeg', '.bmp', '.tiff')):
|
37 |
+
parsed_file = extract_ktp_info(filepath, filename)
|
38 |
elif filename.endswith('.xlsx' or '.csv'):
|
39 |
parsed_file = parse_promotion_excel(filepath, filename)
|
40 |
else:
|
backend/file_utils.py
CHANGED
@@ -15,11 +15,11 @@ def convert_image_to_word(text, filename):
|
|
15 |
doc.add_paragraph(text)
|
16 |
doc.add_paragraph("\n" + "-"*50 + "\n")
|
17 |
|
18 |
-
if not os.path.exists(os.path.join('
|
19 |
-
filepath = os.path.join('
|
20 |
else:
|
21 |
copy_num+=1
|
22 |
-
filepath = os.path.join('
|
23 |
|
24 |
doc.save(filepath)
|
25 |
|
@@ -33,7 +33,7 @@ def wait_for_file_release(file_path, timeout=5):
|
|
33 |
time.sleep(0.5)
|
34 |
return False
|
35 |
|
36 |
-
def delete_temp_folder(temp_path="
|
37 |
time.sleep(0.5)
|
38 |
for filename in os.listdir(temp_path):
|
39 |
file_path = os.path.join(temp_path, filename)
|
|
|
15 |
doc.add_paragraph(text)
|
16 |
doc.add_paragraph("\n" + "-"*50 + "\n")
|
17 |
|
18 |
+
if not os.path.exists(os.path.join('/tmp', filename)):
|
19 |
+
filepath = os.path.join('/tmp', filename)
|
20 |
else:
|
21 |
copy_num+=1
|
22 |
+
filepath = os.path.join('/tmp', f'{filename}({copy_num})')
|
23 |
|
24 |
doc.save(filepath)
|
25 |
|
|
|
33 |
time.sleep(0.5)
|
34 |
return False
|
35 |
|
36 |
+
def delete_temp_folder(temp_path="/tmp"):
|
37 |
time.sleep(0.5)
|
38 |
for filename in os.listdir(temp_path):
|
39 |
file_path = os.path.join(temp_path, filename)
|
backend/text_recog.py
CHANGED
@@ -1,16 +1,47 @@
|
|
1 |
-
import cv2
|
2 |
import pytesseract
|
|
|
|
|
|
|
3 |
from .file_utils import convert_image_to_word
|
4 |
|
5 |
-
def
|
6 |
-
|
7 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
|
9 |
-
|
10 |
-
|
11 |
|
12 |
-
|
13 |
-
data = pytesseract.image_to_string(thresh, config=custom_config)
|
14 |
-
convert_image_to_word(data, filename)
|
15 |
|
16 |
-
return
|
|
|
|
|
1 |
import pytesseract
|
2 |
+
import cv2
|
3 |
+
import re
|
4 |
+
import platform
|
5 |
from .file_utils import convert_image_to_word
|
6 |
|
7 |
+
def configure_tesseract():
    """Point pytesseract at the Tesseract executable for this machine.

    The executable is resolved in this order:

    1. the ``TESSERACT_CMD`` environment variable, when it is set and
       non-empty;
    2. a ``tesseract`` executable found on ``PATH``;
    3. the per-platform default path that was previously hard-coded
       (a fixed Windows install location, ``/usr/bin/tesseract`` elsewhere).

    Returns:
        None. The chosen path is stored on
        ``pytesseract.pytesseract.tesseract_cmd``.
    """
    # Local imports keep this block self-contained; the module header does
    # not import these names.
    import os
    import shutil

    # An explicit override wins, so a deployment can relocate Tesseract
    # without a code change.
    command = os.environ.get("TESSERACT_CMD") or shutil.which("tesseract")

    if not command:
        # Nothing discoverable: fall back to the original fixed locations so
        # existing installations keep working unchanged.
        if platform.system() == "Windows":
            command = r"C:\Users\hp\AppData\Local\Programs\Tesseract-OCR\tesseract.exe"
        else:
            command = "/usr/bin/tesseract"

    pytesseract.pytesseract.tesseract_cmd = command
|
13 |
+
|
14 |
+
def _parse_ktp_text(text):
    """Extract KTP fields from raw OCR text.

    Fields are taken positionally from the values that follow a colon.
    A field whose position is missing from the OCR output is set to ``None``
    instead of raising, because OCR frequently drops or merges lines.

    Args:
        text: OCR output as a single string.

    Returns:
        dict mapping field name to the extracted string, or ``None`` when the
        field could not be found. ``"Kewarganegaraan"`` is only present when
        the text contains ``"WNI"``.
    """
    # Collapse every run of newlines to one. The province pattern below needs
    # the province line and the following line to be directly adjacent.
    text = re.sub(r'\n{2,}', '\n', text)

    matches = re.findall(r':\s*(.+)', text)
    city = re.search(r'PROVINSI\s+(.+?)\n(.+?)\n', text)

    def field(index):
        # Positional lookup that tolerates OCR output with too few fields.
        return matches[index] if index < len(matches) else None

    # NOTE(review): the indexes skip positions 0 and 11; presumably these were
    # tuned against real card scans. Confirm before changing them.
    result = {
        "nik": field(1),
        "nama": field(2),
        "tempat_tgl_lahir": field(3),
        "jenis_kelamin": field(4),
        "alamat": field(5),
        "rt_rw": field(6),
        "kel/desa": field(7),
        "kecamatan": field(8),
        "provinsi": city.group(1).strip() if city else None,
        "kab/kota": city.group(2).strip() if city else None,
        "agama": field(9),
        "pekerjaan": field(10),
        "gol_dar": field(12),
    }

    if "WNI" in text:
        result["Kewarganegaraan"] = "WNI"

    return result


def extract_ktp_info(image_path, filename):
    """Run OCR on a KTP image and return the recognised fields.

    The image is converted to grayscale, read with Tesseract using the
    Indonesian language data, and parsed into a dict. A text rendering of
    the result is also handed to ``convert_image_to_word``.

    Args:
        image_path: Path of the image file to read.
        filename: Name used to build the output document name
            (``'KTP <filename>'``).

    Returns:
        dict of extracted fields; fields that could not be located are
        ``None``.

    Raises:
        ValueError: If the image cannot be read from ``image_path``.
    """
    configure_tesseract()

    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread reports an unreadable or missing file by returning None;
        # fail here with a clear message instead of inside cvtColor.
        raise ValueError(f"Could not read image: {image_path}")

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    text = pytesseract.image_to_string(gray, lang='ind')

    result = _parse_ktp_text(text)

    # convert_image_to_word takes a `text` argument, so pass a string
    # rendering ("key: value" per line) rather than the dict itself.
    # NOTE(review): confirm this is the desired layout for the document.
    summary = "\n".join(
        f"{key}: {'' if value is None else value}"
        for key, value in result.items()
    )
    convert_image_to_word(summary, f'KTP {filename}')

    return result
|