blaxx14 committed on
Commit
605c260
·
1 Parent(s): fa6a5e5

update KTP scanning

Browse files
Dockerfile CHANGED
@@ -1,6 +1,17 @@
1
  FROM python:3.10-slim
2
 
3
- RUN apt-get update && apt-get install -y tesseract-ocr libtesseract-dev
 
 
 
 
 
 
 
 
 
 
 
4
 
5
  COPY requirements.txt .
6
  RUN pip install -r requirements.txt
 
1
  FROM python:3.10-slim
2
 
3
+ RUN apt-get update && apt-get install -y \
4
+ tesseract-ocr \
5
+ tesseract-ocr-ind \
6
+ libtesseract-dev \
7
+ libleptonica-dev \
8
+ poppler-utils \
9
+ libsm6 \
10
+ libxext6 \
11
+ libxrender-dev \
12
+ build-essential \
13
+ && apt-get clean \
14
+ && rm -rf /var/lib/apt/lists/*
15
 
16
  COPY requirements.txt .
17
  RUN pip install -r requirements.txt
apt.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ tesseract-ocr
2
+ libtesseract-dev
3
+ tesseract-ocr-ind
backend/file_uploads.py CHANGED
@@ -1,7 +1,7 @@
1
  from flask import Flask, request, jsonify
2
  from werkzeug.utils import secure_filename
3
  from .parser import parse_promotion_pdf, parse_promotion_excel
4
- from .text_recog import parsing_image
5
  import os
6
 
7
  app = Flask(__name__)
@@ -34,7 +34,7 @@ def upload_file():
34
  if filename.endswith('.pdf'):
35
  parsed_file = parse_promotion_pdf(filepath)
36
  elif filename.endswith(('.png', '.jpg', '.jpeg', '.bmp', '.tiff')):
37
- parsed_file = parsing_image(filepath, filename)
38
  elif filename.endswith('.xlsx' or '.csv'):
39
  parsed_file = parse_promotion_excel(filepath, filename)
40
  else:
 
1
  from flask import Flask, request, jsonify
2
  from werkzeug.utils import secure_filename
3
  from .parser import parse_promotion_pdf, parse_promotion_excel
4
+ from .text_recog import extract_ktp_info
5
  import os
6
 
7
  app = Flask(__name__)
 
34
  if filename.endswith('.pdf'):
35
  parsed_file = parse_promotion_pdf(filepath)
36
  elif filename.endswith(('.png', '.jpg', '.jpeg', '.bmp', '.tiff')):
37
+ parsed_file = extract_ktp_info(filepath, filename)
38
  elif filename.endswith('.xlsx' or '.csv'):
39
  parsed_file = parse_promotion_excel(filepath, filename)
40
  else:
backend/file_utils.py CHANGED
@@ -15,11 +15,11 @@ def convert_image_to_word(text, filename):
15
  doc.add_paragraph(text)
16
  doc.add_paragraph("\n" + "-"*50 + "\n")
17
 
18
- if not os.path.exists(os.path.join('temp', filename)):
19
- filepath = os.path.join('temp', filename)
20
  else:
21
  copy_num+=1
22
- filepath = os.path.join('temp', f'{filename}({copy_num})')
23
 
24
  doc.save(filepath)
25
 
@@ -33,7 +33,7 @@ def wait_for_file_release(file_path, timeout=5):
33
  time.sleep(0.5)
34
  return False
35
 
36
- def delete_temp_folder(temp_path="./temp"):
37
  time.sleep(0.5)
38
  for filename in os.listdir(temp_path):
39
  file_path = os.path.join(temp_path, filename)
 
15
  doc.add_paragraph(text)
16
  doc.add_paragraph("\n" + "-"*50 + "\n")
17
 
18
+ if not os.path.exists(os.path.join('/tmp', filename)):
19
+ filepath = os.path.join('/tmp', filename)
20
  else:
21
  copy_num+=1
22
+ filepath = os.path.join('/tmp', f'{filename}({copy_num})')
23
 
24
  doc.save(filepath)
25
 
 
33
  time.sleep(0.5)
34
  return False
35
 
36
+ def delete_temp_folder(temp_path="/tmp"):
37
  time.sleep(0.5)
38
  for filename in os.listdir(temp_path):
39
  file_path = os.path.join(temp_path, filename)
backend/text_recog.py CHANGED
@@ -1,16 +1,47 @@
1
- import cv2
2
  import pytesseract
 
 
 
3
  from .file_utils import convert_image_to_word
4
 
5
- def parsing_image(image, filename):
6
- pytesseract.pytesseract.tesseract_cmd = r'C:\Users\hp\AppData\Local\Programs\Tesseract-OCR\tesseract.exe'
7
- image = cv2.imread(image)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
- gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
10
- _, thresh = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY_INV)
11
 
12
- custom_config = r'--oem 3 --psm 6'
13
- data = pytesseract.image_to_string(thresh, config=custom_config)
14
- convert_image_to_word(data, filename)
15
 
16
- return {}
 
 
1
  import pytesseract
2
+ import cv2
3
+ import re
4
+ import platform
5
  from .file_utils import convert_image_to_word
6
 
7
def configure_tesseract():
    """Point pytesseract at the Tesseract executable for the current host.

    Resolution order:

    1. The ``TESSERACT_CMD`` environment variable, when set and non-empty.
    2. The platform default path (the Windows per-user install location, or
       ``/usr/bin/tesseract`` elsewhere), but only when that file exists.
    3. Otherwise pytesseract's current setting is left untouched, so its own
       lookup of ``tesseract`` on ``PATH`` still has a chance to succeed
       instead of failing on a hard-coded path that is not present.
    """
    # Local import: the visible import block of this module does not bring
    # in ``os``, and this keeps the function self-contained.
    import os

    override = os.environ.get("TESSERACT_CMD")
    if override:
        pytesseract.pytesseract.tesseract_cmd = override
        return

    if platform.system() == "Windows":
        # NOTE(review): this is a per-user install path from one developer
        # machine; prefer setting TESSERACT_CMD on other Windows hosts.
        default_cmd = r"C:\Users\hp\AppData\Local\Programs\Tesseract-OCR\tesseract.exe"
    else:
        default_cmd = "/usr/bin/tesseract"

    # Only override pytesseract's setting with a path that actually exists.
    if os.path.isfile(default_cmd):
        pytesseract.pytesseract.tesseract_cmd = default_cmd
13
+
14
def _parse_ktp_text(text):
    """Turn raw OCR text from a KTP card into a field dictionary.

    Fields are taken positionally from the list of ``": value"`` captures,
    using the same indices as before (captures 0 and 11 are not mapped to
    any key). A field that OCR failed to produce is set to ``None`` rather
    than raising ``IndexError``/``AttributeError``.
    """
    # Collapse every run of blank lines so the two header lines matched by
    # the PROVINSI pattern below are adjacent.
    text = re.sub(r'\n{2,}', '\n', text)

    matches = re.findall(r':\s*(.+)', text)
    city = re.search(r'PROVINSI\s+(.+?)\n(.+?)\n', text)

    def value_at(index):
        # OCR output is unreliable; tolerate fewer captures than expected.
        return matches[index] if index < len(matches) else None

    result = {
        "nik": value_at(1),
        "nama": value_at(2),
        "tempat_tgl_lahir": value_at(3),
        "jenis_kelamin": value_at(4),
        "alamat": value_at(5),
        "rt_rw": value_at(6),
        "kel/desa": value_at(7),
        "kecamatan": value_at(8),
        "provinsi": city.group(1).strip() if city else None,
        "kab/kota": city.group(2).strip() if city else None,
        "agama": value_at(9),
        "pekerjaan": value_at(10),
        "gol_dar": value_at(12),
    }

    if "WNI" in text:
        result["Kewarganegaraan"] = "WNI"

    return result


def extract_ktp_info(image_path, filename):
    """Run OCR on a KTP image and return the recognised fields.

    Args:
        image_path: Path of the image file to read.
        filename: Original file name; used to name the generated document
            (``KTP <filename>``).

    Returns:
        A dict with the keys ``nik``, ``nama``, ``tempat_tgl_lahir``,
        ``jenis_kelamin``, ``alamat``, ``rt_rw``, ``kel/desa``,
        ``kecamatan``, ``provinsi``, ``kab/kota``, ``agama``, ``pekerjaan``
        and ``gol_dar``; plus ``Kewarganegaraan`` when the text contains
        ``WNI``. Fields that could not be recognised are ``None``.

    Raises:
        ValueError: If the image cannot be read.
    """
    configure_tesseract()

    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread signals failure by returning None instead of raising;
        # fail here with a clear message rather than inside cvtColor.
        raise ValueError(f"Cannot read image: {image_path}")

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    text = pytesseract.image_to_string(gray, lang='ind')

    result = _parse_ktp_text(text)

    # convert_image_to_word hands its first argument to doc.add_paragraph,
    # so give it readable "key: value" lines instead of the dict object.
    summary = "\n".join(
        f"{key}: {'' if value is None else value}"
        for key, value in result.items()
    )
    convert_image_to_word(summary, f'KTP {filename}')

    return result