Spaces:
Runtime error
Runtime error
esraa-abdelmaksoud
committed on
Commit
•
db656f0
1
Parent(s):
7f1fc7d
Upload 7 files
Browse files- Dockerfile.txt +33 -0
- app.py +22 -0
- data.csv +39 -0
- id.jpg +0 -0
- ktp_reader.py +632 -0
- packages.txt +1 -0
- requirements.txt +9 -0
Dockerfile.txt
ADDED
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Use the official Python 3.9 image
FROM python:3.9

# Set the working directory to /code
WORKDIR /code

# Copy only requirements.txt into the container at /code
COPY ./requirements.txt /code/requirements.txt

# Install the Python dependencies listed in requirements.txt
RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt

# Set up a new user named "user" with user ID 1000
RUN useradd -m -u 1000 user

# Install the system packages needed to avoid OpenCV and Tesseract errors,
# then remove the apt package lists so they are not kept in the image layer
RUN apt-get update \
    && apt-get install -y ffmpeg libsm6 libxext6 tesseract-ocr-all \
    && rm -rf /var/lib/apt/lists/*

# Switch to the "user" user
USER user

# Set home to the user's home directory
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH

# Set the working directory to the user's home directory
WORKDIR $HOME/app

# Copy the current directory contents into the container at $HOME/app setting the owner to the user
COPY --chown=user . $HOME/app

# Start the FastAPI app on port 7860, the default port expected by Spaces
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
|
app.py
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from fastapi import FastAPI
|
2 |
+
from pydantic import BaseModel
|
3 |
+
from ktp_reader import process_image
|
4 |
+
|
5 |
+
|
6 |
+
# FastAPI application instance; the route below is registered on it.
app = FastAPI()
|
7 |
+
|
8 |
+
class ImageData(BaseModel):
    """Request body for the image-processing endpoint."""

    # Path of the image file to read; it is handed to process_image as-is.
    image_path: str
|
10 |
+
|
11 |
+
@app.post("/process_image/")
def read_image(image_data: ImageData):
    """
    Run the OCR pipeline on the image referenced by the request.

    Args:
        image_data (ImageData): Request body carrying the image path.

    Returns:
        The value returned by process_image on success, otherwise a dict of the
        form {"error": <message>} describing the failure.
    """
    try:
        # NOTE(review): image_path comes from the client and is passed to the
        # reader unchanged, so any file path readable by the server process can
        # be submitted. Consider restricting it to an upload directory.
        return process_image(image_data.image_path)
    except Exception as e:
        # Request boundary: report the failure to the caller instead of crashing
        return {"error": str(e)}
|
data.csv
ADDED
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
provinsi,ACEH,BALI,BANGKA BELITUNG,BANTEN,BENGKULU,GORONTALO,DKI JAKARTA,JAMBI,JAWA BARAT,JAWA TENGAH,JAWA TIMUR,KALIMANTAN BARAT,KALIMANTAN SELATAN,KALIMANTAN TENGAH,KALIMANTAN TIMUR,KALIMANTAN UTARA,KEPULAUAN RIAU,LAMPUNG,MALUKU,MALUKU UTARA,NUSA TENGGARA BARAT,NUSA TENGGARA TIMUR,PAPUA,PAPUA BARAT,RIAU,SULAWESI BARAT,SULAWESI SELATAN,SULAWESI TENGAH,SULAWESI TENGGARA,SULAWESI UTARA,SUMATERA BARAT,SUMATERA SELATAN,SUMATERA UTARA
|
2 |
+
ACEH,KABUPATEN ACEH BARAT,KABUPATEN BADUNG,KABUPATEN BANGKA,KABUPATEN LEBAK,KABUPATEN BENGKULU SELATAN,KABUPATEN BOALEMO,JAKARTA BARAT,KABUPATEN BATANGHARI,KABUPATEN BANDUNG,KABUPATEN BANJARNEGARA,KABUPATEN BANGKALAN,KABUPATEN BENGKAYANG,KABUPATEN BALANGAN,KABUPATEN BARITO SELATAN,KABUPATEN BERAU,KABUPATEN BULUNGAN,KABUPATEN BINTAN,KABUPATEN LAMPUNG TENGAH,KABUPATEN BURU,KABUPATEN HALMAHERA BARAT,KABUPATEN BIMA,KABUPATEN ALOR,KABUPATEN ASMAT,KABUPATEN FAKFAK,KABUPATEN BENGKALIS,KABUPATEN MAJENE,KABUPATEN BANTAENG,KABUPATEN BANGGAI,KABUPATEN BOMBANA,KABUPATEN BOLAANG MONGONDOW,KABUPATEN AGAM,KABUPATEN BANYUASIN,KABUPATEN ASAHAN
|
3 |
+
BALI,KABUPATEN ACEH BARAT DAYA,KABUPATEN BANGLI,KABUPATEN BANGKA BARAT,KABUPATEN PANDEGLANG,KABUPATEN BENGKULU TENGAH,KABUPATEN BONE BOLANGO,JAKARTA PUSAT,KABUPATEN BUNGO,KABUPATEN BANDUNG BARAT,KABUPATEN BANYUMAS,KABUPATEN BANYUWANGI,KABUPATEN KAPUAS HULU,KABUPATEN BANJAR,KABUPATEN BARITO TIMUR,KABUPATEN KUTAI BARAT,KABUPATEN MALINAU,KABUPATEN KARIMUN,KABUPATEN LAMPUNG UTARA,KABUPATEN BURU SELATAN,KABUPATEN HALMAHERA TENGAH,KABUPATEN DOMPU,KABUPATEN BELU,KABUPATEN BIAK NUMFOR,KABUPATEN KAIMANA,KABUPATEN INDRAGIRI HILIR,KABUPATEN MAMASA,KABUPATEN BARRU,KABUPATEN BANGGAI KEPULAUAN,KABUPATEN BUTON,KABUPATEN BOLAANG MONGONDOW SELATAN,KABUPATEN DHARMASRAYA,KABUPATEN EMPAT LAWANG,KABUPATEN BATUBARA
|
4 |
+
BANGKA BELITUNG,KABUPATEN ACEH BESAR,KABUPATEN BULELENG,KABUPATEN BANGKA SELATAN,KABUPATEN SERANG,KABUPATEN BENGKULU UTARA,KABUPATEN GORONTALO,JAKARTA SELATAN,KABUPATEN KERINCI,KABUPATEN BEKASI,KABUPATEN BATANG,KABUPATEN BLITAR,KABUPATEN KAYONG UTARA,KABUPATEN BARITO KUALA,KABUPATEN BARITO UTARA,KABUPATEN KUTAI KARTANEGARA,KABUPATEN NUNUKAN,KABUPATEN KEPULAUAN ANAMBAS,KABUPATEN LAMPUNG SELATAN,KABUPATEN KEPULAUAN ARU,KABUPATEN HALMAHERA UTARA,KABUPATEN LOMBOK BARAT,KABUPATEN ENDE,KABUPATEN BOVEN DIGOEL,KABUPATEN MANOKWARI,KABUPATEN INDRAGIRI HULU,KABUPATEN MAMUJU,KABUPATEN BONE,KABUPATEN BANGGAI LAUT,KABUPATEN BUTON SELATAN,KABUPATEN BOLAANG MONGONDOW TIMUR,KABUPATEN KEPULAUAN MENTAWAI,KABUPATEN LAHAT,KABUPATEN DAIRI
|
5 |
+
BANTEN,KABUPATEN ACEH JAYA,KABUPATEN GIANYAR,KABUPATEN BANGKA TENGAH,KABUPATEN TANGERANG,KABUPATEN KAUR,KABUPATEN GORONTALO UTARA,JAKARTA TIMUR,KABUPATEN MERANGIN,KABUPATEN BOGOR,KABUPATEN BLORA,KABUPATEN BOJONEGORO,KABUPATEN KETAPANG,KABUPATEN HULU SUNGAI SELATAN,KABUPATEN GUNUNG MAS,KABUPATEN KUTAI TIMUR,KABUPATEN TANA TIDUNG,KABUPATEN LINGGA,KABUPATEN LAMPUNG BARAT,KABUPATEN MALUKU BARAT DAYA,KABUPATEN HALMAHERA SELATAN,KABUPATEN LOMBOK TENGAH,KABUPATEN FLORES TIMUR,KABUPATEN DEIYAI,KABUPATEN MANOKWARI SELATAN,KABUPATEN KAMPAR,KABUPATEN MAMUJU TENGAH,KABUPATEN BULUKUMBA,KABUPATEN BUOL,KABUPATEN BUTON TENGAH,KABUPATEN BOLAANG MONGONDOW UTARA,KABUPATEN LIMA PULUH KOTA,KABUPATEN MUARA ENIM,KABUPATEN DELI SERDANG
|
6 |
+
BENGKULU,KABUPATEN ACEH SELATAN,KABUPATEN JEMBRANA,KABUPATEN BELITUNG,KOTA CILEGON,KABUPATEN KEPAHIANG,KABUPATEN POHUWATO,JAKARTA UTARA,KABUPATEN MUARO JAMBI,KABUPATEN CIAMIS,KABUPATEN BOYOLALI,KABUPATEN BONDOWOSO,KABUPATEN KUBU RAYA,KABUPATEN HULU SUNGAI TENGAH,KABUPATEN KAPUAS,KABUPATEN MAHAKAM ULU,KOTA TARAKAN,KABUPATEN NATUNA,KABUPATEN LAMPUNG TIMUR,KABUPATEN MALUKU TENGAH,KABUPATEN KEPULAUAN SULA,KABUPATEN LOMBOK TIMUR,KABUPATEN KUPANG,KABUPATEN DOGIYAI,KABUPATEN MAYBRAT,KABUPATEN KEPULAUAN MERANTI,KABUPATEN MAMUJU UTARA,KABUPATEN ENREKANG,KABUPATEN DONGGALA,KABUPATEN BUTON UTARA,KABUPATEN KEPULAUAN SANGIHE,KABUPATEN PADANG PARIAMAN,KABUPATEN MUSI BANYUASIN,KABUPATEN HUMBANG HASUNDUTAN
|
7 |
+
GORONTALO,KABUPATEN ACEH SINGKIL,KABUPATEN KARANGASEM,KABUPATEN BELITUNG TIMUR,KOTA SERANG,KABUPATEN LEBONG,KOTA GORONTALO,KEPULAUAN SERIBU,KABUPATEN SAROLANGUN,KABUPATEN CIANJUR,KABUPATEN BREBES,KABUPATEN GRESIK,KABUPATEN LANDAK,KABUPATEN HULU SUNGAI UTARA,KABUPATEN KATINGAN,KABUPATEN PASER,,KOTA BATAM,KABUPATEN MESUJI,KABUPATEN MALUKU TENGGARA,KABUPATEN HALMAHERA TIMUR,KABUPATEN LOMBOK UTARA,KABUPATEN LEMBATA,KABUPATEN INTAN JAYA,KABUPATEN PEGUNUNGAN ARFAK,KABUPATEN KUANTAN SINGINGI,KABUPATEN POLEWALI MANDAR,KABUPATEN GOWA,KABUPATEN MOROWALI,KABUPATEN KOLAKA,KABUPATEN KEPULAUAN SIAU TAGULANDANG BIARO,KABUPATEN PASAMAN,KABUPATEN MUSI RAWAS,KABUPATEN KARO
|
8 |
+
DKI JAKARTA,KABUPATEN ACEH TAMIANG,KABUPATEN KLUNGKUNG,KOTA PANGKAL PINANG,KOTA TANGERANG,KABUPATEN MUKOMUKO,,,KABUPATEN TANJUNG JABUNG BARAT,KABUPATEN CIREBON,KABUPATEN CILACAP,KABUPATEN JEMBER,KABUPATEN MELAWI,KABUPATEN KOTABARU,KABUPATEN KOTAWARINGIN BARAT,KABUPATEN PENAJAM PASER UTARA,,KOTA TANJUNG PINANG,KABUPATEN PESAWARAN,KABUPATEN MALUKU TENGGARA BARAT,KABUPATEN PULAU MOROTAI,KABUPATEN SUMBAWA,KABUPATEN MALAKA,KABUPATEN JAYAPURA,KABUPATEN RAJA AMPAT,KABUPATEN PELALAWAN,KOTA MAMUJU,KABUPATEN JENEPONTO,KABUPATEN MOROWALI UTARA,KABUPATEN KOLAKA TIMUR,KABUPATEN KEPULAUAN TALAUD,KABUPATEN PASAMAN BARAT,KABUPATEN MUSI RAWAS UTARA,KABUPATEN LABUHANBATU
|
9 |
+
JAMBI,KABUPATEN ACEH TENGAH,KABUPATEN TABANAN,,KOTA TANGERANG SELATAN,KABUPATEN REJANG LEBONG,,,KABUPATEN TANJUNG JABUNG TIMUR,KABUPATEN GARUT,KABUPATEN DEMAK,KABUPATEN JOMBANG,KABUPATEN MEMPAWAH,KABUPATEN TABALONG,KABUPATEN KOTAWARINGIN TIMUR,KOTA BALIKPAPAN,,,KABUPATEN PESISIR BARAT,KABUPATEN SERAM BAGIAN BARAT,KABUPATEN PULAU TALIABU,KABUPATEN SUMBAWA BARAT,KABUPATEN MANGGARAI,KABUPATEN JAYAWIJAYA,KABUPATEN SORONG,KABUPATEN ROKAN HILIR,,KABUPATEN KEPULAUAN SELAYAR,KABUPATEN PARIGI MOUTONG,KABUPATEN KOLAKA UTARA,KABUPATEN MINAHASA,KABUPATEN PESISIR SELATAN,KABUPATEN OGAN ILIR,KABUPATEN LABUHANBATU SELATAN
|
10 |
+
JAWA BARAT,KABUPATEN ACEH TENGGARA,KOTA DENPASAR,,,KABUPATEN SELUMA,,,KABUPATEN TEBO,KABUPATEN INDRAMAYU,KABUPATEN GROBOGAN,KABUPATEN KEDIRI,KABUPATEN SAMBAS,KABUPATEN TANAH BUMBU,KABUPATEN LAMANDAU,KOTA BONTANG,,,KABUPATEN PRINGSEWU,KABUPATEN SERAM BAGIAN TIMUR,KOTA TERNATE,KOTA BIMA,KABUPATEN MANGGARAI BARAT,KABUPATEN KEEROM,KABUPATEN SORONG SELATAN,KABUPATEN ROKAN HULU,,KABUPATEN LUWU,KABUPATEN POSO,KABUPATEN KONAWE,KABUPATEN MINAHASA SELATAN,KABUPATEN SIJUNJUNG,KABUPATEN OGAN KOMERING ILIR,KABUPATEN LABUHANBATU UTARA
|
11 |
+
JAWA TENGAH,KABUPATEN ACEH TIMUR,,,,KOTA BENGKULU,,,KOTA JAMBI,KABUPATEN KARAWANG,KABUPATEN JEPARA,KABUPATEN LAMONGAN,KABUPATEN SANGGAU,KABUPATEN TANAH LAUT,KABUPATEN MURUNG RAYA,KOTA SAMARINDA,,,KABUPATEN TULANG BAWANG,KOTA AMBON,KOTA TIDORE KEPULAUAN,KOTA MATARAM,KABUPATEN MANGGARAI TIMUR,KABUPATEN KEPULAUAN YAPEN,KABUPATEN TAMBRAUW,KABUPATEN SIAK,,KABUPATEN LUWU TIMUR,KABUPATEN SIGI,KABUPATEN KONAWE KEPULAUAN,KABUPATEN MINAHASA TENGGARA,KABUPATEN SOLOK,KABUPATEN OGAN KOMERING ULU,KABUPATEN LANGKAT
|
12 |
+
JAWA TIMUR,KABUPATEN ACEH UTARA,,,,,,,KOTA SUNGAI PENUH,KABUPATEN KUNINGAN,KABUPATEN KARANGANYAR,KABUPATEN LUMAJANG,KABUPATEN SEKADAU,KABUPATEN TAPIN,KABUPATEN PULANG PISAU,,,,KABUPATEN TULANG BAWANG BARAT,KOTA TUAL,,,KABUPATEN NGADA,KABUPATEN LANNY JAYA,KABUPATEN TELUK BINTUNI,KOTA DUMAI,,KABUPATEN LUWU UTARA,KABUPATEN TOJO UNA-UNA,KABUPATEN KONAWE SELATAN,KABUPATEN MINAHASA UTARA,KABUPATEN SOLOK SELATAN,KABUPATEN OGAN KOMERING ULU SELATAN,KABUPATEN MANDAILING NATAL
|
13 |
+
KALIMANTAN BARAT,KABUPATEN BENER MERIAH,,,,,,,,KABUPATEN MAJALENGKA,KABUPATEN KEBUMEN,KABUPATEN MADIUN,KABUPATEN SINTANG,KOTA BANJARBARU,KABUPATEN SUKAMARA,,,,KABUPATEN TANGGAMUS,,,,KABUPATEN NAGEKEO,KABUPATEN MAMBERAMO RAYA,KABUPATEN TELUK WONDAMA,KOTA PEKANBARU,,KABUPATEN MAROS,KABUPATEN TOLI-TOLI,KABUPATEN KONAWE UTARA,KOTA BITUNG,KABUPATEN TANAH DATAR,KABUPATEN OGAN KOMERING ULU TIMUR,KABUPATEN NIAS
|
14 |
+
KALIMANTAN SELATAN,KABUPATEN BIREUEN,,,,,,,,KABUPATEN PANGANDARAN,KABUPATEN KENDAL,KABUPATEN MAGETAN,KOTA PONTIANAK,KOTA BANJARMASIN,KABUPATEN SERUYAN,,,,KABUPATEN WAY KANAN,,,,KABUPATEN ROTE NDAO,KABUPATEN MAMBERAMO TENGAH,,,,KABUPATEN PANGKAJENE DAN KEPULAUAN,KOTA PALU,KABUPATEN MUNA,KOTA KOTAMOBAGU,KOTA BUKITTINGGI,KABUPATEN PENUKAL ABAB LEMATANG ILIR,KABUPATEN NIAS BARAT
|
15 |
+
KALIMANTAN TENGAH,KABUPATEN GAYO LUES,,,,,,,,KABUPATEN PURWAKARTA,KABUPATEN KLATEN,KABUPATEN MALANG,KOTA SINGKAWANG,,KOTA PALANGKA RAYA,,,,KOTA BANDAR LAMPUNG,,,,KABUPATEN SABU RAIJUA,KABUPATEN MAPPI,,,,KABUPATEN PINRANG,,KABUPATEN MUNA BARAT,KOTA MANADO,KOTA PADANG,KOTA LUBUKLINGGAU,KABUPATEN NIAS SELATAN
|
16 |
+
KALIMANTAN TIMUR,KABUPATEN NAGAN RAYA,,,,,,,,KABUPATEN SUBANG,KABUPATEN KUDUS,KABUPATEN MOJOKERTO,,,,,,,KOTA METRO,,,,KABUPATEN SIKKA,KABUPATEN MERAUKE,,,,KABUPATEN SIDENRENG RAPPANG,,KABUPATEN WAKATOBI,KOTA TOMOHON,KOTA PADANGPANJANG,KOTA PAGAR ALAM,KABUPATEN NIAS UTARA
|
17 |
+
KALIMANTAN UTARA,KABUPATEN PIDIE,,,,,,,,KABUPATEN SUKABUMI,KABUPATEN MAGELANG,KABUPATEN NGANJUK,,,,,,,,,,,KABUPATEN SUMBA BARAT,KABUPATEN MIMIKA,,,,KABUPATEN SINJAI,,KOTA BAU-BAU,,KOTA PARIAMAN,KOTA PALEMBANG,KABUPATEN PADANG LAWAS
|
18 |
+
KEPULAUAN RIAU,KABUPATEN PIDIE JAYA,,,,,,,,KABUPATEN SUMEDANG,KABUPATEN PATI,KABUPATEN NGAWI,,,,,,,,,,,KABUPATEN SUMBA BARAT DAYA,KABUPATEN NABIRE,,,,KABUPATEN SOPPENG,,KOTA KENDARI,,KOTA PAYAKUMBUH,KOTA PRABUMULIH,KABUPATEN PADANG LAWAS UTARA
|
19 |
+
LAMPUNG,KABUPATEN SIMEULUE,,,,,,,,KABUPATEN TASIKMALAYA,KABUPATEN PEKALONGAN,KABUPATEN PACITAN,,,,,,,,,,,KABUPATEN SUMBA TENGAH,KABUPATEN NDUGA,,,,KABUPATEN TAKALAR,,,,KOTA SAWAHLUNTO,,KABUPATEN PAKPAK BHARAT
|
20 |
+
MALUKU,KOTA BANDA ACEH,,,,,,,,KOTA BANDUNG,KABUPATEN PEMALANG,KABUPATEN PAMEKASAN,,,,,,,,,,,KABUPATEN SUMBA TIMUR,KABUPATEN PANIAI,,,,KABUPATEN TANA TORAJA,,,,KOTA SOLOK,,KABUPATEN SAMOSIR
|
21 |
+
MALUKU UTARA,KOTA LANGSA,,,,,,,,KOTA BANJAR,KABUPATEN PURBALINGGA,KABUPATEN PASURUAN,,,,,,,,,,,KABUPATEN TIMOR TENGAH SELATAN,KABUPATEN PEGUNUNGAN BINTANG,,,,KABUPATEN TORAJA UTARA,,,,,,KABUPATEN SERDANG BEDAGAI
|
22 |
+
NUSA TENGGARA BARAT,KOTA LHOKSEUMAWE,,,,,,,,KOTA BEKASI,KABUPATEN PURWOREJO,KABUPATEN PONOROGO,,,,,,,,,,,KABUPATEN TIMOR TENGAH UTARA,KABUPATEN PUNCAK,,,,KABUPATEN WAJO,,,,,,KABUPATEN SIMALUNGUN
|
23 |
+
NUSA TENGGARA TIMUR,KOTA SABANG,,,,,,,,KOTA BOGOR,KABUPATEN REMBANG,KABUPATEN PROBOLINGGO,,,,,,,,,,,KOTA KUPANG,KABUPATEN PUNCAK JAYA,,,,KOTA MAKASSAR,,,,,,KABUPATEN TAPANULI SELATAN
|
24 |
+
PAPUA,KOTA SUBULUSSALAM,,,,,,,,KOTA CIMAHI,KABUPATEN SEMARANG,KABUPATEN SAMPANG,,,,,,,,,,,,KABUPATEN SARMI,,,,KOTA PALOPO,,,,,,KABUPATEN TAPANULI TENGAH
|
25 |
+
PAPUA BARAT,,,,,,,,,KOTA CIREBON,KABUPATEN SRAGEN,KABUPATEN SIDOARJO,,,,,,,,,,,,KABUPATEN SUPIORI,,,,KOTA PAREPARE,,,,,,KABUPATEN TAPANULI UTARA
|
26 |
+
RIAU,,,,,,,,,KOTA DEPOK,KABUPATEN SUKOHARJO,KABUPATEN SITUBONDO,,,,,,,,,,,,KABUPATEN TOLIKARA,,,,,,,,,,KABUPATEN TOBA SAMOSIR
|
27 |
+
SULAWESI BARAT,,,,,,,,,KOTA SUKABUMI,KABUPATEN TEGAL,KABUPATEN SUMENEP,,,,,,,,,,,,KABUPATEN WAROPEN,,,,,,,,,,KOTA BINJAI
|
28 |
+
SULAWESI SELATAN,,,,,,,,,KOTA TASIKMALAYA,KABUPATEN TEMANGGUNG,KABUPATEN TRENGGALEK,,,,,,,,,,,,KABUPATEN YAHUKIMO,,,,,,,,,,KOTA GUNUNGSITOLI
|
29 |
+
SULAWESI TENGAH,,,,,,,,,,KABUPATEN WONOGIRI,KABUPATEN TUBAN,,,,,,,,,,,,KABUPATEN YALIMO,,,,,,,,,,KOTA MEDAN
|
30 |
+
SULAWESI TENGGARA,,,,,,,,,,KABUPATEN WONOSOBO,KABUPATEN TULUNGAGUNG,,,,,,,,,,,,KOTA JAYAPURA,,,,,,,,,,KOTA PADANGSIDEMPUAN
|
31 |
+
SULAWESI UTARA,,,,,,,,,,KOTA MAGELANG,KOTA BATU,,,,,,,,,,,,,,,,,,,,,,KOTA PEMATANGSIANTAR
|
32 |
+
SUMATERA BARAT,,,,,,,,,,KOTA PEKALONGAN,KOTA BLITAR,,,,,,,,,,,,,,,,,,,,,,KOTA SIBOLGA
|
33 |
+
SUMATERA SELATAN,,,,,,,,,,KOTA SALATIGA,KOTA KEDIRI,,,,,,,,,,,,,,,,,,,,,,KOTA TANJUNGBALAI
|
34 |
+
SUMATERA UTARA,,,,,,,,,,KOTA SEMARANG,KOTA MADIUN,,,,,,,,,,,,,,,,,,,,,,KOTA TEBING TINGGI
|
35 |
+
,,,,,,,,,,KOTA SURAKARTA,KOTA MALANG,,,,,,,,,,,,,,,,,,,,,,
|
36 |
+
,,,,,,,,,,KOTA TEGAL,KOTA MOJOKERTO,,,,,,,,,,,,,,,,,,,,,,
|
37 |
+
,,,,,,,,,,,KOTA PASURUAN,,,,,,,,,,,,,,,,,,,,,,
|
38 |
+
,,,,,,,,,,,KOTA PROBOLINGGO,,,,,,,,,,,,,,,,,,,,,,
|
39 |
+
,,,,,,,,,,,KOTA SURABAYA,,,,,,,,,,,,,,,,,,,,,,
|
id.jpg
ADDED
ktp_reader.py
ADDED
@@ -0,0 +1,632 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from paddleocr import PaddleOCR
|
2 |
+
import os
|
3 |
+
import cv2
|
4 |
+
import pytesseract
|
5 |
+
import pandas as pd
|
6 |
+
import re
|
7 |
+
from thefuzz import fuzz
|
8 |
+
from thefuzz import process
|
9 |
+
import logging
|
10 |
+
import json
|
11 |
+
|
12 |
+
# Raise the root logger threshold to ERROR so lower-severity records are dropped.
logging.getLogger().setLevel(logging.ERROR)
|
13 |
+
|
14 |
+
|
15 |
+
def process_image(path):
    """
    The main function that performs optical character recognition (OCR) on an
    ID card (KTP) image and processes the extracted data.

    Label correction, data correction and missing-label insertion are
    best-effort steps: if one of them fails, the failure is logged and the
    pipeline continues with the data it has so far.

    Args:
        path (str): The path to the image file.

    Returns:
        str: A JSON string of the form {"text": <cleaned output>}.

    Raises:
        ValueError: If the CSV reference file cannot be read or the image
            cannot be loaded and rotated. Errors raised by run_ocr,
            check_numbers, split_items and print_output propagate unchanged.
    """
    logger = logging.getLogger(__name__)
    csv_path = 'data.csv'
    data_dict = {
        "provinsi": "",
        "kabupaten": "",
        "nik": "",
        "nama": "",
        "tempat/tgl lahir": "",
        "jenis kelamin": "",
        "gol. darah": "",
        "alamat": "",
        "rt/rw": "",
        "kel/desa": "",
        "kecamatan": "",
        "agama": "",
        "status perkawinan": "",
        "pekerjaan": "",
        "kewarganegaraan": "",
        "berlaku hingga": "",
    }
    # Create list for labels spelling correction ("kabupaten" is excluded)
    labels = list(data_dict.keys())
    labels.remove("kabupaten")

    try:
        # Read csv data file
        df = pd.read_csv(csv_path)
    except Exception as err:
        # Keep the original message and type, but chain the real cause
        raise ValueError("Cannot find the csv data file.") from err

    try:
        # Resize image
        image = resize_image(path)

        # Run Tesseract to get the right rotation and color conversion
        image_xyz = rotate_image(image)
    except Exception as err:
        raise ValueError("Invalid image input.") from err

    # Run PaddleOCR on the whole image and Tesseract on detected areas by PaddleOCR
    all_data = run_ocr(image_xyz)

    # Check if the 16-digit ID number exists
    all_data = check_numbers(all_data)

    # Split labels and data
    new_data = split_items(all_data)

    # Defined up front so the missing-label step below always has a value,
    # even when label correction fails before producing one
    found_labels = []
    try:
        # Correct the text of labels
        new_data, found_labels = correct_labels(new_data, labels)

        # Correct the data
        new_data = correct_data(new_data, df)
    except Exception:
        logger.warning("Label/data correction failed; continuing.", exc_info=True)

    try:
        # Add labels if missing
        new_data = add_missing_labels(new_data, labels, found_labels)
    except Exception:
        logger.warning("Could not add missing labels; continuing.", exc_info=True)

    # Print the clean output
    text = print_output(new_data)

    # Convert to JSON
    return json.dumps({"text": text})
|
92 |
+
|
93 |
+
|
94 |
+
def get_scores(result):
    """
    Get confidence scores from the OCR result and reject low-quality images.

    Args:
        result (list): The OCR result list. result[0] holds one entry per
            detected text line, and entry[1][1] is that line's confidence.

    Returns:
        tuple: (overall_score, sorted_scores, scores), i.e. the mean score,
        the scores in ascending order, and the scores in detection order.
        Every value is rounded to 4 decimals.

    Raises:
        ValueError: If nothing was detected, fewer than three lines were
            detected, or the third-lowest confidence score is below 90%.
    """
    quality_error = "Poor image quality. Please avoid shadows, flashlights, and patterned backgrounds."

    # An empty result or a missing/empty first page means no text was found
    lines = result[0] if result else None
    if not lines:
        raise ValueError(quality_error)

    scores = [round(line[1][1], 4) for line in lines]
    overall_score = round(sum(scores) / len(scores), 4)
    sorted_scores = sorted(scores)

    # Raise error if the 3rd lowest confidence score is less than 90%;
    # with fewer than three detections there is no such score to check
    if len(sorted_scores) < 3 or sorted_scores[2] < 0.9:
        raise ValueError(quality_error)
    return overall_score, sorted_scores, scores
|
115 |
+
|
116 |
+
|
117 |
+
|
118 |
+
def add_missing_labels(new_data, labels, found_labels):
    """
    Insert label rows for labels that were not detected.

    Args:
        new_data (list): The extracted rows. Label rows have the form
            [text, text, 'label']. Modified in place.
        labels (list): The expected labels, in card order.
        found_labels (list): [label, index in new_data] pairs for the labels
            that were detected, in order. Modified in place.

    Returns:
        list: new_data with the missing label rows inserted, or unchanged
        when no label is missing.

    Raises:
        ValueError: If more than two labels are missing.
    """
    # Every label was found: there is nothing to add
    if len(found_labels) >= len(labels):
        return new_data

    # Only repair the data when at most two labels are missing
    if len(found_labels) <= len(labels) - 3:
        raise ValueError("Some labels cannot be detected. Please recapture a photo of the ID.")

    added = 0
    for i in range(len(labels)):
        # A position past the end of found_labels means a trailing label is missing
        if i < len(found_labels) and labels[i] == found_labels[i][0]:
            continue

        # Use next label index - 2 + the number of shifted items
        # Else, use previous label index + 2 + the number of shifted items
        try:
            if labels[i] == "gol. darah":
                idx = found_labels[i][1] + added
            elif labels[i] == "alamat":
                # Get Gol. Darah index and check if the length of next item is greater than two
                # NOTE(review): new_data holds lists, so looking up a bare string
                # raises ValueError and the fallback below is used. Confirm whether
                # a lookup on the row text was intended.
                gol_idx = new_data.index("gol. darah")
                if len(new_data[gol_idx+1]) > 2:
                    idx = gol_idx + 1
                else:
                    idx = gol_idx + 2
            else:
                idx = found_labels[i+1][1] - 2 + added
        except (IndexError, ValueError):
            # No usable next label: position relative to the previous label
            idx = found_labels[i-1][1] + 2 + added

        if idx < len(new_data)-1:
            new_data.insert(idx, [labels[i], labels[i], 'label'])
            found_labels.insert(i, [labels[i], idx])
        else:
            new_data.insert(len(new_data)-2, [labels[i], labels[i], 'label'])
            found_labels.insert(i, [labels[i], len(new_data)-2])
        added += 1
    return new_data
|
151 |
+
|
152 |
+
|
153 |
+
def check_numbers(all_data):
    """
    Check if there is a 16-digit number in OCR text and normalise number rows.

    For a row whose PaddleOCR text (index 4) contains a 16-digit number, both
    OCR texts (indexes 4 and 5) are replaced by that number. For a row whose
    PaddleOCR text contains a ddd/ddd pattern, both texts are replaced by that
    pattern.

    Args:
        all_data (list): The structured OCR result list. Modified in place.

    Returns:
        list: A list containing the structured OCR output.

    Raises:
        ValueError: If no row contains a 16-digit number.
    """
    ktp_num = ""
    for row in all_data:
        # Both patterns are searched in the text as it was before any replacement
        id_output = re.findall(r"\d{16}", row[4])
        rt_output = re.findall(r"\d{3}/\d{3}", row[4])
        if id_output:
            # Keep PaddleOCR output for both
            ktp_num = row[4] = row[5] = id_output[0]
        if rt_output:
            row[4] = row[5] = rt_output[0]
    if ktp_num == "":
        raise ValueError("KTP number cannot be detected. Please recapture a photo of the ID.")

    return all_data
|
176 |
+
|
177 |
+
|
178 |
+
def run_ocr(image):
    """
    Perform optical character recognition (OCR) on the given image.

    PaddleOCR runs on the whole image. Each text region it detects is then
    cropped and read again with Tesseract, and both texts are cleaned.

    Args:
        image (ndarray): The image array on which OCR will be performed.

    Returns:
        list: One row per recognized text region, in the form
        [x1, y1, x2, y2, paddle_text, tess_text].

    Raises:
        ValueError: If no text is detected, or if get_scores rejects the
            confidence scores.
    """
    # NOTE(review): the engine is rebuilt on every call; consider reusing one instance.
    ocr = PaddleOCR(
        use_angle_cls=True,
        lang="id",
        det_max_side_len=1500,
        det_limit_type="min",
        det_db_unclip_ratio=1.7,
        drop_score=0.75,
        show_log=False,
    )
    result = ocr.ocr(image, cls=True)

    # Nothing detected: fail with a clear message instead of an indexing error
    if not result or not result[0]:
        raise ValueError("Poor image quality. Please avoid shadows, flashlights, and patterned backgrounds.")

    # Check if the confidence score is higher than the threshold
    get_scores(result)

    # Create a list of values in form of x1, y1, x2, y2, Paddle output, Tesseract output
    all_data = []
    for res in result[0]:
        paddle_text = res[1][0]

        # Bounding rectangle of the first four corner points of the detected box
        corners = res[0][:4]
        xs = [point[0] for point in corners]
        ys = [point[1] for point in corners]
        x1, y1, x2, y2 = int(min(xs)), int(min(ys)), int(max(xs)), int(max(ys))

        # Crop the area of text detected by Paddle; a negative start would be
        # read as an offset from the far edge, so clamp it for the crop only
        snip = image[max(y1, 0):y2, max(x1, 0):x2]

        # Run Tesseract on the cropped area
        tess_text = pytesseract.image_to_string(snip, lang="ind+eng", config="--psm 6")

        # Clean the output of Tesseract and Paddle
        tess_text, paddle_text = clean_text(tess_text, paddle_text)

        all_data.append([x1, y1, x2, y2, paddle_text, tess_text])

    return all_data
|
225 |
+
|
226 |
+
|
227 |
+
def clean_text(tess_text, paddle_text):
    """
    Clean and preprocess the recognized text from Tesseract and PaddleOCR.

    Args:
        tess_text (str): Text recognized by Tesseract OCR.
        paddle_text (str): Text recognized by PaddleOCR.

    Returns:
        tuple: A tuple containing the cleaned and preprocessed text from
        Tesseract and PaddleOCR, respectively.
    """
    # Remove the line breaks and form feed from the Tesseract output
    tess_text = tess_text.replace("\n", "").replace("\x0c", "")

    # Remove space before or after colon, hyphen and asterisk
    separator = r"\s*([-:*])\s*"
    paddle_text = re.sub(separator, r"\1", paddle_text)
    tess_text = re.sub(separator, r"\1", tess_text)

    # Replace any 1O with 10
    paddle_text = paddle_text.replace("1O", "10")
    tess_text = tess_text.replace("1O", "10")

    # Fix dots in ID number: a run of ten digits/dots means the text is numeric
    if re.search(r"[0-9\.]{10}", paddle_text):
        paddle_text = paddle_text.replace(".", "")

    # Add space after a dot that directly follows an uppercase letter
    paddle_text = re.sub(r"([A-Z]\.)([A-Za-z])", r"\1 \2", paddle_text)

    # Fix commas recognized as dots and add space after it.
    # The letter before the punctuation is captured and written back so it is
    # not lost in the substitution.
    if "NO" not in paddle_text:
        pattern = r"([A-Za-z])[\.,]\s{0,1}(\d{2})"
        paddle_text = re.sub(pattern, r"\1, \2", paddle_text)
        tess_text = re.sub(pattern, r"\1, \2", tess_text)
    else:
        pattern = r"([A-Za-z])[\.]\s{0,1}(\d{1})"
        paddle_text = re.sub(pattern, r"\1. \2", paddle_text)
        tess_text = re.sub(pattern, r"\1. \2", tess_text)

    # Clean blood group: a zero on that line is the letter O
    if "Darah" in tess_text or "Darah" in paddle_text:
        tess_text = tess_text.replace("0", "O")
        paddle_text = paddle_text.replace("0", "O")

    # Clean symbols
    for item in ["'", '"', "!", "‘", "“", ":", "*", "=", "+"]:
        paddle_text = paddle_text.replace(item, "")
        tess_text = tess_text.replace(item, "")

    # Remove hyphen, dot, or comma if in the beginning of the text
    if tess_text and tess_text[0] in ['-', '.', ',']:
        tess_text = tess_text[1:]
    if paddle_text and paddle_text[0] in ['-', '.', ',']:
        paddle_text = paddle_text[1:]

    # If paddle text is similar to tesseract text without spaces, replace paddle text with tesseract text
    if paddle_text == tess_text.replace(" ", ""):
        paddle_text = tess_text

    # If JL in the beginning of text, add the dot
    if paddle_text[:2] == "JL" or tess_text[:2] == "JL":
        paddle_text = re.sub(r"(JL)(\.{0,1})([A-Z])", r"JL. \3", paddle_text)
        tess_text = re.sub(r"(JL)(\.{0,1})([A-Z])", r"JL. \3", tess_text)

    # Add missing spaces to Paddle output: wherever Tesseract has a space
    # between two alphabetic fragments, insert it into the same fragment pair
    # in the Paddle text
    for idx, char in enumerate(tess_text):
        if not char.isspace():
            continue
        # Near the start of the string this slice is empty, and an empty
        # string is not alphabetic, so such positions are skipped
        before = tess_text[idx-2:idx]
        after = tess_text[idx+1:idx+3]
        if before.isalpha() and after.isalpha():
            paddle_text = paddle_text.replace(before + after, before + " " + after)

    return tess_text, paddle_text
|
316 |
+
|
317 |
+
|
318 |
+
def resize_image(path):
    """
    Load an image and enlarge it if both dimensions are below the threshold.

    The enlargement uses a whole-number factor (threshold // longer side), so
    an image whose longer side is more than half the threshold keeps its size.

    Args:
        path (str): The path to the image file.

    Returns:
        ndarray: The loaded (and possibly resized) image array.

    Raises:
        ValueError: If the file cannot be read as an image.
    """
    img = cv2.imread(path)
    # imread returns None instead of raising when the file cannot be read
    if img is None:
        raise ValueError("Invalid image input.")

    height, width = int(img.shape[0]), int(img.shape[1])
    thresh = 1500

    # Resize image to match the threshold
    if width < thresh and height < thresh:
        factor = thresh // max(width, height)
        dim = (width * factor, height * factor)
        img = cv2.resize(img, dim, interpolation=cv2.INTER_AREA)
    return img
|
342 |
+
|
343 |
+
|
344 |
+
def rotate_image(image):
    """
    Rotate the image to the correct orientation by checking for specific text patterns in different rotations.

    Args:
        image (ndarray): The image array to be rotated.

    Returns:
        ndarray: The XYZ-converted image in the first rotation where the
        expected text is recognized. If no rotation matches, the XYZ-converted
        image is returned in its original rotation.
    """
    # Convert color to XYZ
    image_xyz = cv2.cvtColor(image, cv2.COLOR_BGR2XYZ)

    # Rotate the image by 90 degrees for 4 times until recognizing some correct text
    for _ in range(4):
        text = pytesseract.image_to_string(image_xyz, lang="ind+eng", config="--psm 6")
        if "PROVINSI" in text or "Darah" in text or "NIK" in text:
            return image_xyz
        image_xyz = cv2.rotate(image_xyz, cv2.ROTATE_90_CLOCKWISE)

    # Four quarter turns bring the image back to its original rotation
    return image_xyz
|
368 |
+
|
369 |
+
|
370 |
+
def correct_labels(new_data, labels):
    """
    Correct the labels of the extracted data by matching them with a list of valid labels.

    A row recognized as a label has both of its texts replaced by the matched
    label and gets "label" appended to it.

    Args:
        new_data (list): The extracted data list to be corrected. Modified in place.
        labels (list): The list of valid labels.

    Returns:
        tuple: The corrected extracted data list with updated labels and list of labels
        and corresponding indexes.
    """
    thresh = 75
    found_labels = [["provinsi", 0]]
    for i, row in enumerate(new_data):
        paddle_fuzz = process.extractOne(row[0], labels, scorer=fuzz.ratio)
        tess_fuzz = process.extractOne(row[1], labels, scorer=fuzz.ratio)

        # Skip adding provinsi because it's already added at index 0
        if paddle_fuzz[0] == 'provinsi' or tess_fuzz[0] == 'provinsi':
            continue

        # Correct text using the match that is more than the threshold,
        # preferring the PaddleOCR match over the Tesseract one
        if paddle_fuzz[1] >= thresh:
            label = paddle_fuzz[0]
        elif tess_fuzz[1] >= thresh:
            label = tess_fuzz[0]
        # Correct "NIK"
        elif (len(row[0]) == 3 or len(row[1]) == 3) and (
            "IK" == row[0] or "IK" in row[1]
        ):
            label = "nik"
        else:
            continue

        row[0] = label
        row[1] = label
        row.append("label")
        found_labels.append([label, i])

    return new_data, found_labels
|
411 |
+
|
412 |
+
|
413 |
+
def find_uppercase_index(text):
    """
    Locate the first run of three or more consecutive ASCII capital letters.

    Args:
        text (str): The input text.

    Returns:
        int: The start index of the first such run, or -1 if there is none.
    """
    # At least three capitals in a row, not preceded by another capital.
    found = re.search(r"(?<![A-Z])[A-Z]{3,}", text)
    return -1 if found is None else found.start()
|
428 |
+
else:
|
429 |
+
return -1
|
430 |
+
|
431 |
+
|
432 |
+
def correct_data(new_data, df):
    """
    Correct the extracted data based on reference data from a DataFrame.

    Each item of ``new_data`` holds the PaddleOCR text at index 0 and the
    Tesseract text at index 1; label items carry a third element. The list and
    its items are modified in place and the same list is returned.

    Args:
        new_data (list): The extracted data list to be corrected.
        df (DataFrame): The reference DataFrame. Its "provinsi" column supplies
            the province names, and the column named after the recognised
            province supplies the kabupaten options.

    Returns:
        list: The corrected extracted data list.
    """
    # Nothing to correct; also avoids indexing into an empty list below.
    if not new_data:
        return new_data

    # Make lists to be used in text correction
    provinsi_df = df["provinsi"].dropna().tolist()
    provinsi = [f"PROVINSI {item}" for item in provinsi_df]
    other_vals = [
        "LAKI-LAKI",
        "PEREMPUAN",
        "A",
        "B",
        "AB",
        "O",
        "ISLAM",
        "KRISTEN",
        "KATOLIK",
        "HINDU",
        "BUDHA",
        "KONGHUCU",
        "BELUM KAWIN",
        "KAWIN",
        "CERAI HIDUP",
        "CERAI MATI",
        "WNI",
        "WNA",
        "SEUMUR HIDUP",
    ]
    # dd?mm?yyyy with at most one non-word separator between the parts.
    date_pattern = r"(\d{2})\W{0,1}(\d{2})\W{0,1}((19|20)\d{2})"

    kabupaten = []
    paddle_except_city = []
    for i, item in enumerate(new_data):

        # Fix Provinsi
        if i == 0 or ("PROVINSI" in item[0] or "PROVINSI" in item[1]):
            item[0], item[1] = replace_data(new_data, i, provinsi)
            try:
                kabupaten = df[item[0].replace("PROVINSI ", "")].dropna().tolist()
            except KeyError:
                # The province was not recognised, so there is no matching
                # column; keep whatever kabupaten options we already have.
                pass

        # Fix Kabupaten (only possible when a province column was found)
        elif i == 1:
            if kabupaten:
                item[0], item[1] = replace_data(new_data, i, kabupaten)

        # Fix NIK: trust the PaddleOCR reading for the number. This check must
        # come before the generic two-element branch below, otherwise the NIK
        # value (an unlabelled two-element item) never reaches it.
        elif i == 3 or new_data[i - 1][0].upper() == "NIK":
            item[1] = item[0]

        # Fix other values such as religion (items without the label marker)
        elif len(item) == 2:
            item[0], item[1] = replace_data(new_data, i, other_vals)

        # Fix dates
        if i > 4:
            item[0] = re.sub(date_pattern, r"\1-\2-\3", item[0])
            item[1] = re.sub(date_pattern, r"\1-\2-\3", item[1])

        if i != 1:
            paddle_except_city.append(item[0])

    # Add WNI if no WNI or WNA
    paddle_temp = [data[0] for data in new_data]
    tess_temp = [data[1] for data in new_data]
    citizenship = {"WNI", "WNA"}
    if citizenship.isdisjoint(paddle_temp) and citizenship.isdisjoint(tess_temp):
        try:
            kew_idx = paddle_temp.index("kewarganegaraan")
        except ValueError:
            # No citizenship label was found, so there is nowhere to insert.
            pass
        else:
            new_data.insert(kew_idx + 1, ["WNI", "WNI"])

    # Align any other line that closely resembles the second line's text
    # (e.g. the issuing place) with that second line.
    if len(new_data) > 1:
        city = new_data[1][0]
        issuer_fuzz = process.extractOne(city, paddle_except_city, scorer=fuzz.ratio)
        if issuer_fuzz is not None and issuer_fuzz[1] >= 85:
            for item in new_data:
                if item[0] == issuer_fuzz[0]:
                    item[0], item[1] = city, city

    return new_data
|
519 |
+
|
520 |
+
|
521 |
+
def replace_data(new_data, i, options_list):
    """
    Replace the data in the extracted list with the closest matching option from the given list.

    The PaddleOCR text (index 0) is tried first, then the Tesseract text
    (index 1). When a match scores above the threshold, both texts of item
    ``i`` are overwritten in place with the matched option; otherwise the item
    is left untouched.

    Args:
        new_data (list): The extracted data list.
        i (int): The index of the item to be replaced.
        options_list (list): The list of options for replacement. May be empty,
            in which case nothing is replaced.

    Returns:
        tuple: A tuple containing the (possibly replaced) values for the item at index i.
    """
    item = new_data[i]
    paddle_fuzz = process.extractOne(item[0], options_list, scorer=fuzz.ratio)
    tess_fuzz = process.extractOne(item[1], options_list, scorer=fuzz.ratio)

    # A lower threshold is used when the PaddleOCR text is shorter than 4 characters.
    thresh = 65 if len(item[0]) < 4 else 75

    # extractOne yields None when there is nothing to choose from, so guard
    # before subscripting the result.
    if paddle_fuzz is not None and paddle_fuzz[1] > thresh:
        item[0] = paddle_fuzz[0]
        item[1] = paddle_fuzz[0]
    elif tess_fuzz is not None and tess_fuzz[1] > thresh:
        item[0] = tess_fuzz[0]
        item[1] = tess_fuzz[0]
    return item[0], item[1]
|
548 |
+
|
549 |
+
|
550 |
+
def split_items(all_data):
    """
    Split the data items in the given list into separate items based on certain conditions.

    For every row, the texts at index 4 and index 5 are read (named "paddle"
    and "tess" below, following the variable naming used in this function) and
    turned into one or two ``[text_4, text_5]`` pairs:

    * If both texts contain a run of capitals that does not start the string,
      the row is split at that run into a leading part and a trailing part.
    * Otherwise, if either text mentions "Darah", the row is split into a label
      pair and a blood-type pair, taken from the first text that ends with a
      blood-type letter. If neither text ends with one, the row is dropped.
    * Otherwise the row is kept as a single stripped pair.

    Args:
        all_data (list): The list of data items to be split.

    Returns:
        list: The new list of split data items.
    """
    blood_type_endings = ("A", "B", "O")
    new_data = []
    for row in all_data:
        paddle_text, tess_text = row[4], row[5]
        paddle_idx = find_uppercase_index(paddle_text)
        tess_idx = find_uppercase_index(tess_text)

        if paddle_idx not in (0, -1) and tess_idx not in (0, -1):
            p1 = [paddle_text[:paddle_idx].strip(), tess_text[:tess_idx].strip()]
            p2 = [paddle_text[paddle_idx:].strip(), tess_text[tess_idx:].strip()]
            if p1 != ["", ""]:
                new_data.append(p1)
            if p2 != ["", ""]:
                new_data.append(p2)

        # Fix the text related to blood type
        elif "Darah" in paddle_text or "Darah" in tess_text:

            # Add space between blood type and label
            darah_match_1 = re.sub(r"(Darah)\W*((A|AB|B|O))", r"\1 \2", paddle_text)
            darah_match_2 = re.sub(r"(Darah)\W*((A|AB|B|O))", r"\1 \2", tess_text)

            # Write the label and value in two separate lists, preferring the
            # first text. ``endswith`` is safe on an empty string, so an empty
            # first reading no longer prevents the second one from being used.
            for candidate in (darah_match_1, darah_match_2):
                if candidate.endswith(blood_type_endings):
                    # Locate the space separating label and blood type
                    space = candidate.rfind(" ")
                    label = candidate[:space].strip()
                    value = candidate[space + 1 :].strip()
                    new_data.append([label, label])
                    new_data.append([value, value])
                    break
        else:
            new_data.append([paddle_text.strip(), tess_text.strip()])

    return new_data
|
611 |
+
|
612 |
+
|
613 |
+
def print_output(new_data):
    """
    Create a formatted string output based on the given data.

    Each item contributes at most one line, taken from its first element:
    label items (three elements whose first two are equal) are upper-cased,
    other items are emitted unchanged, and non-label items whose first element
    is empty are skipped.

    Args:
        new_data (list): The list of data items.

    Returns:
        str: The formatted string output, one newline-terminated line per emitted item.
    """
    lines = []
    for item in new_data:
        # Change labels to Uppercase
        if item[0] == item[1] and len(item) == 3:
            lines.append(item[0].upper())
        elif len(item[0]) > 0:
            lines.append(item[0])
    # Build the result once instead of growing a string inside the loop.
    return "".join(f"{line}\n" for line in lines)
|
packages.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
tesseract-ocr-all
|
requirements.txt
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
fastapi==0.100.0
|
2 |
+
opencv-python==4.6.0.66
|
3 |
+
paddleocr==2.6.1.3
|
4 |
+
paddlepaddle==2.4.1
|
5 |
+
pydantic==1.10.5
|
6 |
+
pytesseract==0.3.10
|
7 |
+
python-Levenshtein==0.20.9
|
8 |
+
thefuzz==0.19.0
|
9 |
+
uvicorn==0.23.1
|