esraa-abdelmaksoud commited on
Commit
db656f0
1 Parent(s): 7f1fc7d

Upload 7 files

Browse files
Files changed (7) hide show
  1. Dockerfile.txt +33 -0
  2. app.py +22 -0
  3. data.csv +39 -0
  4. id.jpg +0 -0
  5. ktp_reader.py +632 -0
  6. packages.txt +1 -0
  7. requirements.txt +9 -0
Dockerfile.txt ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Use the official Python 3.9 image
FROM python:3.9

# Set the working directory to /code
WORKDIR /code

# Copy only requirements.txt first so the dependency layer below is cached
# independently of changes to the application code
COPY ./requirements.txt /code/requirements.txt

# Install the Python dependencies listed in requirements.txt
RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt

# Set up a new user named "user" with user ID 1000
RUN useradd -m -u 1000 user

# Install the system packages needed to avoid OpenCV and Tesseract errors.
# The apt package lists are removed in the same layer so they do not end up
# in the final image.
RUN apt-get update \
    && apt-get install -y ffmpeg libsm6 libxext6 tesseract-ocr-all \
    && rm -rf /var/lib/apt/lists/*

# Switch to the "user" user
USER user

# Set home to the user's home directory
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH

# Set the working directory to the user's home directory
WORKDIR $HOME/app

# Copy the current directory contents into the container at $HOME/app setting the owner to the user
COPY --chown=user . $HOME/app

# Start the FastAPI app on port 7860, the default port expected by Spaces
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI
2
+ from pydantic import BaseModel
3
+ from ktp_reader import process_image
4
+
5
+
6
# ASGI application object; the Dockerfile's uvicorn command refers to it as "app:app".
app = FastAPI()
7
+
8
class ImageData(BaseModel):
    """Request body of the /process_image/ endpoint."""

    # Path of the image to read; it is handed unchanged to process_image.
    image_path: str
10
+
11
@app.post("/process_image/")
def read_image(image_data: ImageData):
    """
    Run the OCR pipeline on the image referenced by the request body.

    Args:
        image_data (ImageData): Request body carrying the image path.

    Returns:
        The value returned by process_image, or a dict of the form
        {"error": "<message>"} when any exception is raised.
    """
    try:
        # Hand the provided path straight to the OCR pipeline
        return process_image(image_data.image_path)
    except Exception as e:
        # Failures are reported in the response body instead of being raised
        return {"error": str(e)}
data.csv ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ provinsi,ACEH,BALI,BANGKA BELITUNG,BANTEN,BENGKULU,GORONTALO,DKI JAKARTA,JAMBI,JAWA BARAT,JAWA TENGAH,JAWA TIMUR,KALIMANTAN BARAT,KALIMANTAN SELATAN,KALIMANTAN TENGAH,KALIMANTAN TIMUR,KALIMANTAN UTARA,KEPULAUAN RIAU,LAMPUNG,MALUKU,MALUKU UTARA,NUSA TENGGARA BARAT,NUSA TENGGARA TIMUR,PAPUA,PAPUA BARAT,RIAU,SULAWESI BARAT,SULAWESI SELATAN,SULAWESI TENGAH,SULAWESI TENGGARA,SULAWESI UTARA,SUMATERA BARAT,SUMATERA SELATAN,SUMATERA UTARA
2
+ ACEH,KABUPATEN ACEH BARAT,KABUPATEN BADUNG,KABUPATEN BANGKA,KABUPATEN LEBAK,KABUPATEN BENGKULU SELATAN,KABUPATEN BOALEMO,JAKARTA BARAT,KABUPATEN BATANGHARI,KABUPATEN BANDUNG,KABUPATEN BANJARNEGARA,KABUPATEN BANGKALAN,KABUPATEN BENGKAYANG,KABUPATEN BALANGAN,KABUPATEN BARITO SELATAN,KABUPATEN BERAU,KABUPATEN BULUNGAN,KABUPATEN BINTAN,KABUPATEN LAMPUNG TENGAH,KABUPATEN BURU,KABUPATEN HALMAHERA BARAT,KABUPATEN BIMA,KABUPATEN ALOR,KABUPATEN ASMAT,KABUPATEN FAKFAK,KABUPATEN BENGKALIS,KABUPATEN MAJENE,KABUPATEN BANTAENG,KABUPATEN BANGGAI,KABUPATEN BOMBANA,KABUPATEN BOLAANG MONGONDOW,KABUPATEN AGAM,KABUPATEN BANYUASIN,KABUPATEN ASAHAN
3
+ BALI,KABUPATEN ACEH BARAT DAYA,KABUPATEN BANGLI,KABUPATEN BANGKA BARAT,KABUPATEN PANDEGLANG,KABUPATEN BENGKULU TENGAH,KABUPATEN BONE BOLANGO,JAKARTA PUSAT,KABUPATEN BUNGO,KABUPATEN BANDUNG BARAT,KABUPATEN BANYUMAS,KABUPATEN BANYUWANGI,KABUPATEN KAPUAS HULU,KABUPATEN BANJAR,KABUPATEN BARITO TIMUR,KABUPATEN KUTAI BARAT,KABUPATEN MALINAU,KABUPATEN KARIMUN,KABUPATEN LAMPUNG UTARA,KABUPATEN BURU SELATAN,KABUPATEN HALMAHERA TENGAH,KABUPATEN DOMPU,KABUPATEN BELU,KABUPATEN BIAK NUMFOR,KABUPATEN KAIMANA,KABUPATEN INDRAGIRI HILIR,KABUPATEN MAMASA,KABUPATEN BARRU,KABUPATEN BANGGAI KEPULAUAN,KABUPATEN BUTON,KABUPATEN BOLAANG MONGONDOW SELATAN,KABUPATEN DHARMASRAYA,KABUPATEN EMPAT LAWANG,KABUPATEN BATUBARA
4
+ BANGKA BELITUNG,KABUPATEN ACEH BESAR,KABUPATEN BULELENG,KABUPATEN BANGKA SELATAN,KABUPATEN SERANG,KABUPATEN BENGKULU UTARA,KABUPATEN GORONTALO,JAKARTA SELATAN,KABUPATEN KERINCI,KABUPATEN BEKASI,KABUPATEN BATANG,KABUPATEN BLITAR,KABUPATEN KAYONG UTARA,KABUPATEN BARITO KUALA,KABUPATEN BARITO UTARA,KABUPATEN KUTAI KARTANEGARA,KABUPATEN NUNUKAN,KABUPATEN KEPULAUAN ANAMBAS,KABUPATEN LAMPUNG SELATAN,KABUPATEN KEPULAUAN ARU,KABUPATEN HALMAHERA UTARA,KABUPATEN LOMBOK BARAT,KABUPATEN ENDE,KABUPATEN BOVEN DIGOEL,KABUPATEN MANOKWARI,KABUPATEN INDRAGIRI HULU,KABUPATEN MAMUJU,KABUPATEN BONE,KABUPATEN BANGGAI LAUT,KABUPATEN BUTON SELATAN,KABUPATEN BOLAANG MONGONDOW TIMUR,KABUPATEN KEPULAUAN MENTAWAI,KABUPATEN LAHAT,KABUPATEN DAIRI
5
+ BANTEN,KABUPATEN ACEH JAYA,KABUPATEN GIANYAR,KABUPATEN BANGKA TENGAH,KABUPATEN TANGERANG,KABUPATEN KAUR,KABUPATEN GORONTALO UTARA,JAKARTA TIMUR,KABUPATEN MERANGIN,KABUPATEN BOGOR,KABUPATEN BLORA,KABUPATEN BOJONEGORO,KABUPATEN KETAPANG,KABUPATEN HULU SUNGAI SELATAN,KABUPATEN GUNUNG MAS,KABUPATEN KUTAI TIMUR,KABUPATEN TANA TIDUNG,KABUPATEN LINGGA,KABUPATEN LAMPUNG BARAT,KABUPATEN MALUKU BARAT DAYA,KABUPATEN HALMAHERA SELATAN,KABUPATEN LOMBOK TENGAH,KABUPATEN FLORES TIMUR,KABUPATEN DEIYAI,KABUPATEN MANOKWARI SELATAN,KABUPATEN KAMPAR,KABUPATEN MAMUJU TENGAH,KABUPATEN BULUKUMBA,KABUPATEN BUOL,KABUPATEN BUTON TENGAH,KABUPATEN BOLAANG MONGONDOW UTARA,KABUPATEN LIMA PULUH KOTA,KABUPATEN MUARA ENIM,KABUPATEN DELI SERDANG
6
+ BENGKULU,KABUPATEN ACEH SELATAN,KABUPATEN JEMBRANA,KABUPATEN BELITUNG,KOTA CILEGON,KABUPATEN KEPAHIANG,KABUPATEN POHUWATO,JAKARTA UTARA,KABUPATEN MUARO JAMBI,KABUPATEN CIAMIS,KABUPATEN BOYOLALI,KABUPATEN BONDOWOSO,KABUPATEN KUBU RAYA,KABUPATEN HULU SUNGAI TENGAH,KABUPATEN KAPUAS,KABUPATEN MAHAKAM ULU,KOTA TARAKAN,KABUPATEN NATUNA,KABUPATEN LAMPUNG TIMUR,KABUPATEN MALUKU TENGAH,KABUPATEN KEPULAUAN SULA,KABUPATEN LOMBOK TIMUR,KABUPATEN KUPANG,KABUPATEN DOGIYAI,KABUPATEN MAYBRAT,KABUPATEN KEPULAUAN MERANTI,KABUPATEN MAMUJU UTARA,KABUPATEN ENREKANG,KABUPATEN DONGGALA,KABUPATEN BUTON UTARA,KABUPATEN KEPULAUAN SANGIHE,KABUPATEN PADANG PARIAMAN,KABUPATEN MUSI BANYUASIN,KABUPATEN HUMBANG HASUNDUTAN
7
+ GORONTALO,KABUPATEN ACEH SINGKIL,KABUPATEN KARANGASEM,KABUPATEN BELITUNG TIMUR,KOTA SERANG,KABUPATEN LEBONG,KOTA GORONTALO,KEPULAUAN SERIBU,KABUPATEN SAROLANGUN,KABUPATEN CIANJUR,KABUPATEN BREBES,KABUPATEN GRESIK,KABUPATEN LANDAK,KABUPATEN HULU SUNGAI UTARA,KABUPATEN KATINGAN,KABUPATEN PASER,,KOTA BATAM,KABUPATEN MESUJI,KABUPATEN MALUKU TENGGARA,KABUPATEN HALMAHERA TIMUR,KABUPATEN LOMBOK UTARA,KABUPATEN LEMBATA,KABUPATEN INTAN JAYA,KABUPATEN PEGUNUNGAN ARFAK,KABUPATEN KUANTAN SINGINGI,KABUPATEN POLEWALI MANDAR,KABUPATEN GOWA,KABUPATEN MOROWALI,KABUPATEN KOLAKA,KABUPATEN KEPULAUAN SIAU TAGULANDANG BIARO,KABUPATEN PASAMAN,KABUPATEN MUSI RAWAS,KABUPATEN KARO
8
+ DKI JAKARTA,KABUPATEN ACEH TAMIANG,KABUPATEN KLUNGKUNG,KOTA PANGKAL PINANG,KOTA TANGERANG,KABUPATEN MUKOMUKO,,,KABUPATEN TANJUNG JABUNG BARAT,KABUPATEN CIREBON,KABUPATEN CILACAP,KABUPATEN JEMBER,KABUPATEN MELAWI,KABUPATEN KOTABARU,KABUPATEN KOTAWARINGIN BARAT,KABUPATEN PENAJAM PASER UTARA,,KOTA TANJUNG PINANG,KABUPATEN PESAWARAN,KABUPATEN MALUKU TENGGARA BARAT,KABUPATEN PULAU MOROTAI,KABUPATEN SUMBAWA,KABUPATEN MALAKA,KABUPATEN JAYAPURA,KABUPATEN RAJA AMPAT,KABUPATEN PELALAWAN,KOTA MAMUJU,KABUPATEN JENEPONTO,KABUPATEN MOROWALI UTARA,KABUPATEN KOLAKA TIMUR,KABUPATEN KEPULAUAN TALAUD,KABUPATEN PASAMAN BARAT,KABUPATEN MUSI RAWAS UTARA,KABUPATEN LABUHANBATU
9
+ JAMBI,KABUPATEN ACEH TENGAH,KABUPATEN TABANAN,,KOTA TANGERANG SELATAN,KABUPATEN REJANG LEBONG,,,KABUPATEN TANJUNG JABUNG TIMUR,KABUPATEN GARUT,KABUPATEN DEMAK,KABUPATEN JOMBANG,KABUPATEN MEMPAWAH,KABUPATEN TABALONG,KABUPATEN KOTAWARINGIN TIMUR,KOTA BALIKPAPAN,,,KABUPATEN PESISIR BARAT,KABUPATEN SERAM BAGIAN BARAT,KABUPATEN PULAU TALIABU,KABUPATEN SUMBAWA BARAT,KABUPATEN MANGGARAI,KABUPATEN JAYAWIJAYA,KABUPATEN SORONG,KABUPATEN ROKAN HILIR,,KABUPATEN KEPULAUAN SELAYAR,KABUPATEN PARIGI MOUTONG,KABUPATEN KOLAKA UTARA,KABUPATEN MINAHASA,KABUPATEN PESISIR SELATAN,KABUPATEN OGAN ILIR,KABUPATEN LABUHANBATU SELATAN
10
+ JAWA BARAT,KABUPATEN ACEH TENGGARA,KOTA DENPASAR,,,KABUPATEN SELUMA,,,KABUPATEN TEBO,KABUPATEN INDRAMAYU,KABUPATEN GROBOGAN,KABUPATEN KEDIRI,KABUPATEN SAMBAS,KABUPATEN TANAH BUMBU,KABUPATEN LAMANDAU,KOTA BONTANG,,,KABUPATEN PRINGSEWU,KABUPATEN SERAM BAGIAN TIMUR,KOTA TERNATE,KOTA BIMA,KABUPATEN MANGGARAI BARAT,KABUPATEN KEEROM,KABUPATEN SORONG SELATAN,KABUPATEN ROKAN HULU,,KABUPATEN LUWU,KABUPATEN POSO,KABUPATEN KONAWE,KABUPATEN MINAHASA SELATAN,KABUPATEN SIJUNJUNG,KABUPATEN OGAN KOMERING ILIR,KABUPATEN LABUHANBATU UTARA
11
+ JAWA TENGAH,KABUPATEN ACEH TIMUR,,,,KOTA BENGKULU,,,KOTA JAMBI,KABUPATEN KARAWANG,KABUPATEN JEPARA,KABUPATEN LAMONGAN,KABUPATEN SANGGAU,KABUPATEN TANAH LAUT,KABUPATEN MURUNG RAYA,KOTA SAMARINDA,,,KABUPATEN TULANG BAWANG,KOTA AMBON,KOTA TIDORE KEPULAUAN,KOTA MATARAM,KABUPATEN MANGGARAI TIMUR,KABUPATEN KEPULAUAN YAPEN,KABUPATEN TAMBRAUW,KABUPATEN SIAK,,KABUPATEN LUWU TIMUR,KABUPATEN SIGI,KABUPATEN KONAWE KEPULAUAN,KABUPATEN MINAHASA TENGGARA,KABUPATEN SOLOK,KABUPATEN OGAN KOMERING ULU,KABUPATEN LANGKAT
12
+ JAWA TIMUR,KABUPATEN ACEH UTARA,,,,,,,KOTA SUNGAI PENUH,KABUPATEN KUNINGAN,KABUPATEN KARANGANYAR,KABUPATEN LUMAJANG,KABUPATEN SEKADAU,KABUPATEN TAPIN,KABUPATEN PULANG PISAU,,,,KABUPATEN TULANG BAWANG BARAT,KOTA TUAL,,,KABUPATEN NGADA,KABUPATEN LANNY JAYA,KABUPATEN TELUK BINTUNI,KOTA DUMAI,,KABUPATEN LUWU UTARA,KABUPATEN TOJO UNA-UNA,KABUPATEN KONAWE SELATAN,KABUPATEN MINAHASA UTARA,KABUPATEN SOLOK SELATAN,KABUPATEN OGAN KOMERING ULU SELATAN,KABUPATEN MANDAILING NATAL
13
+ KALIMANTAN BARAT,KABUPATEN BENER MERIAH,,,,,,,,KABUPATEN MAJALENGKA,KABUPATEN KEBUMEN,KABUPATEN MADIUN,KABUPATEN SINTANG,KOTA BANJARBARU,KABUPATEN SUKAMARA,,,,KABUPATEN TANGGAMUS,,,,KABUPATEN NAGEKEO,KABUPATEN MAMBERAMO RAYA,KABUPATEN TELUK WONDAMA,KOTA PEKANBARU,,KABUPATEN MAROS,KABUPATEN TOLI-TOLI,KABUPATEN KONAWE UTARA,KOTA BITUNG,KABUPATEN TANAH DATAR,KABUPATEN OGAN KOMERING ULU TIMUR,KABUPATEN NIAS
14
+ KALIMANTAN SELATAN,KABUPATEN BIREUEN,,,,,,,,KABUPATEN PANGANDARAN,KABUPATEN KENDAL,KABUPATEN MAGETAN,KOTA PONTIANAK,KOTA BANJARMASIN,KABUPATEN SERUYAN,,,,KABUPATEN WAY KANAN,,,,KABUPATEN ROTE NDAO,KABUPATEN MAMBERAMO TENGAH,,,,KABUPATEN PANGKAJENE DAN KEPULAUAN,KOTA PALU,KABUPATEN MUNA,KOTA KOTAMOBAGU,KOTA BUKITTINGGI,KABUPATEN PENUKAL ABAB LEMATANG ILIR,KABUPATEN NIAS BARAT
15
+ KALIMANTAN TENGAH,KABUPATEN GAYO LUES,,,,,,,,KABUPATEN PURWAKARTA,KABUPATEN KLATEN,KABUPATEN MALANG,KOTA SINGKAWANG,,KOTA PALANGKA RAYA,,,,KOTA BANDAR LAMPUNG,,,,KABUPATEN SABU RAIJUA,KABUPATEN MAPPI,,,,KABUPATEN PINRANG,,KABUPATEN MUNA BARAT,KOTA MANADO,KOTA PADANG,KOTA LUBUKLINGGAU,KABUPATEN NIAS SELATAN
16
+ KALIMANTAN TIMUR,KABUPATEN NAGAN RAYA,,,,,,,,KABUPATEN SUBANG,KABUPATEN KUDUS,KABUPATEN MOJOKERTO,,,,,,,KOTA METRO,,,,KABUPATEN SIKKA,KABUPATEN MERAUKE,,,,KABUPATEN SIDENRENG RAPPANG,,KABUPATEN WAKATOBI,KOTA TOMOHON,KOTA PADANGPANJANG,KOTA PAGAR ALAM,KABUPATEN NIAS UTARA
17
+ KALIMANTAN UTARA,KABUPATEN PIDIE,,,,,,,,KABUPATEN SUKABUMI,KABUPATEN MAGELANG,KABUPATEN NGANJUK,,,,,,,,,,,KABUPATEN SUMBA BARAT,KABUPATEN MIMIKA,,,,KABUPATEN SINJAI,,KOTA BAU-BAU,,KOTA PARIAMAN,KOTA PALEMBANG,KABUPATEN PADANG LAWAS
18
+ KEPULAUAN RIAU,KABUPATEN PIDIE JAYA,,,,,,,,KABUPATEN SUMEDANG,KABUPATEN PATI,KABUPATEN NGAWI,,,,,,,,,,,KABUPATEN SUMBA BARAT DAYA,KABUPATEN NABIRE,,,,KABUPATEN SOPPENG,,KOTA KENDARI,,KOTA PAYAKUMBUH,KOTA PRABUMULIH,KABUPATEN PADANG LAWAS UTARA
19
+ LAMPUNG,KABUPATEN SIMEULUE,,,,,,,,KABUPATEN TASIKMALAYA,KABUPATEN PEKALONGAN,KABUPATEN PACITAN,,,,,,,,,,,KABUPATEN SUMBA TENGAH,KABUPATEN NDUGA,,,,KABUPATEN TAKALAR,,,,KOTA SAWAHLUNTO,,KABUPATEN PAKPAK BHARAT
20
+ MALUKU,KOTA BANDA ACEH,,,,,,,,KOTA BANDUNG,KABUPATEN PEMALANG,KABUPATEN PAMEKASAN,,,,,,,,,,,KABUPATEN SUMBA TIMUR,KABUPATEN PANIAI,,,,KABUPATEN TANA TORAJA,,,,KOTA SOLOK,,KABUPATEN SAMOSIR
21
+ MALUKU UTARA,KOTA LANGSA,,,,,,,,KOTA BANJAR,KABUPATEN PURBALINGGA,KABUPATEN PASURUAN,,,,,,,,,,,KABUPATEN TIMOR TENGAH SELATAN,KABUPATEN PEGUNUNGAN BINTANG,,,,KABUPATEN TORAJA UTARA,,,,,,KABUPATEN SERDANG BEDAGAI
22
+ NUSA TENGGARA BARAT,KOTA LHOKSEUMAWE,,,,,,,,KOTA BEKASI,KABUPATEN PURWOREJO,KABUPATEN PONOROGO,,,,,,,,,,,KABUPATEN TIMOR TENGAH UTARA,KABUPATEN PUNCAK,,,,KABUPATEN WAJO,,,,,,KABUPATEN SIMALUNGUN
23
+ NUSA TENGGARA TIMUR,KOTA SABANG,,,,,,,,KOTA BOGOR,KABUPATEN REMBANG,KABUPATEN PROBOLINGGO,,,,,,,,,,,KOTA KUPANG,KABUPATEN PUNCAK JAYA,,,,KOTA MAKASSAR,,,,,,KABUPATEN TAPANULI SELATAN
24
+ PAPUA,KOTA SUBULUSSALAM,,,,,,,,KOTA CIMAHI,KABUPATEN SEMARANG,KABUPATEN SAMPANG,,,,,,,,,,,,KABUPATEN SARMI,,,,KOTA PALOPO,,,,,,KABUPATEN TAPANULI TENGAH
25
+ PAPUA BARAT,,,,,,,,,KOTA CIREBON,KABUPATEN SRAGEN,KABUPATEN SIDOARJO,,,,,,,,,,,,KABUPATEN SUPIORI,,,,KOTA PAREPARE,,,,,,KABUPATEN TAPANULI UTARA
26
+ RIAU,,,,,,,,,KOTA DEPOK,KABUPATEN SUKOHARJO,KABUPATEN SITUBONDO,,,,,,,,,,,,KABUPATEN TOLIKARA,,,,,,,,,,KABUPATEN TOBA SAMOSIR
27
+ SULAWESI BARAT,,,,,,,,,KOTA SUKABUMI,KABUPATEN TEGAL,KABUPATEN SUMENEP,,,,,,,,,,,,KABUPATEN WAROPEN,,,,,,,,,,KOTA BINJAI
28
+ SULAWESI SELATAN,,,,,,,,,KOTA TASIKMALAYA,KABUPATEN TEMANGGUNG,KABUPATEN TRENGGALEK,,,,,,,,,,,,KABUPATEN YAHUKIMO,,,,,,,,,,KOTA GUNUNGSITOLI
29
+ SULAWESI TENGAH,,,,,,,,,,KABUPATEN WONOGIRI,KABUPATEN TUBAN,,,,,,,,,,,,KABUPATEN YALIMO,,,,,,,,,,KOTA MEDAN
30
+ SULAWESI TENGGARA,,,,,,,,,,KABUPATEN WONOSOBO,KABUPATEN TULUNGAGUNG,,,,,,,,,,,,KOTA JAYAPURA,,,,,,,,,,KOTA PADANGSIDEMPUAN
31
+ SULAWESI UTARA,,,,,,,,,,KOTA MAGELANG,KOTA BATU,,,,,,,,,,,,,,,,,,,,,,KOTA PEMATANGSIANTAR
32
+ SUMATERA BARAT,,,,,,,,,,KOTA PEKALONGAN,KOTA BLITAR,,,,,,,,,,,,,,,,,,,,,,KOTA SIBOLGA
33
+ SUMATERA SELATAN,,,,,,,,,,KOTA SALATIGA,KOTA KEDIRI,,,,,,,,,,,,,,,,,,,,,,KOTA TANJUNGBALAI
34
+ SUMATERA UTARA,,,,,,,,,,KOTA SEMARANG,KOTA MADIUN,,,,,,,,,,,,,,,,,,,,,,KOTA TEBING TINGGI
35
+ ,,,,,,,,,,KOTA SURAKARTA,KOTA MALANG,,,,,,,,,,,,,,,,,,,,,,
36
+ ,,,,,,,,,,KOTA TEGAL,KOTA MOJOKERTO,,,,,,,,,,,,,,,,,,,,,,
37
+ ,,,,,,,,,,,KOTA PASURUAN,,,,,,,,,,,,,,,,,,,,,,
38
+ ,,,,,,,,,,,KOTA PROBOLINGGO,,,,,,,,,,,,,,,,,,,,,,
39
+ ,,,,,,,,,,,KOTA SURABAYA,,,,,,,,,,,,,,,,,,,,,,
id.jpg ADDED
ktp_reader.py ADDED
@@ -0,0 +1,632 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from paddleocr import PaddleOCR
2
+ import os
3
+ import cv2
4
+ import pytesseract
5
+ import pandas as pd
6
+ import re
7
+ from thefuzz import fuzz
8
+ from thefuzz import process
9
+ import logging
10
+ import json
11
+
12
+ logging.getLogger().setLevel(logging.ERROR)
13
+
14
+
15
def process_image(path):
    """
    The main function that performs optical character recognition (OCR) on an
    image of a KTP ID card and processes the extracted data.

    The steps are: load the reference csv, resize and rotate the image, run
    OCR, verify that a 16-digit ID number was read, split labels from values,
    then (best effort) correct labels and values and re-insert missing labels.

    Args:
        path (str): The path to the image file.

    Returns:
        str: A JSON string of the form {"text": "<one item per line>"}.

    Raises:
        ValueError: If the csv data file cannot be read or the image cannot
            be loaded and rotated. Errors raised by run_ocr and check_numbers
            propagate unchanged.
    """
    csv_path = 'data.csv'
    data_dict = {
        "provinsi": "",
        "kabupaten": "",
        "nik": "",
        "nama": "",
        "tempat/tgl lahir": "",
        "jenis kelamin": "",
        "gol. darah": "",
        "alamat": "",
        "rt/rw": "",
        "kel/desa": "",
        "kecamatan": "",
        "agama": "",
        "status perkawinan": "",
        "pekerjaan": "",
        "kewarganegaraan": "",
        "berlaku hingga": "",
    }
    # Create list for labels spelling correction
    labels = list(data_dict.keys())
    labels.remove("kabupaten")

    try:
        # Read csv data file
        df = pd.read_csv(csv_path)
    except Exception as err:
        # Keep the original message but chain the real cause for debugging
        raise ValueError("Cannot find the csv data file.") from err

    try:
        # Resize image
        image = resize_image(path)

        # Run Tesseract to get the right rotation and color conversion
        image_xyz = rotate_image(image)
    except Exception as err:
        raise ValueError("Invalid image input.") from err

    # Run PaddleOCR on the whole image and Tesseract on detected areas by PaddleOCR
    all_data = run_ocr(image_xyz)

    # Check if the 16-digit ID number exists
    all_data = check_numbers(all_data)

    # Split labels and data
    new_data = split_items(all_data)

    logger = logging.getLogger(__name__)

    # The correction steps are best effort: a failure keeps whatever was
    # produced so far instead of aborting the request.
    found_labels = None
    try:
        # Correct the text of labels
        new_data, found_labels = correct_labels(new_data, labels)

        # Correct the data
        new_data = correct_data(new_data, df)
    except Exception:
        logger.warning("Label/data correction failed; continuing.", exc_info=True)

    # Without the label positions there is nothing to re-insert
    if found_labels is not None:
        try:
            # Add labels if missing
            new_data = add_missing_labels(new_data, labels, found_labels)
        except Exception:
            logger.warning("Could not add missing labels; continuing.", exc_info=True)

    # Print the clean output
    text = print_output(new_data)

    # Convert to JSON
    text_obj = json.dumps({"text": text})

    return text_obj
92
+
93
+
94
def get_scores(result):
    """
    Get confidence scores from the OCR result and reject low-quality images.

    Args:
        result (list): The OCR result list. Its first element holds the
            detected lines, where each line's confidence is line[1][1].

    Returns:
        tuple: (overall_score, sorted_scores, scores): the mean confidence
        rounded to 4 decimals, the scores sorted ascending, and the scores in
        detection order.

    Raises:
        ValueError: If fewer than three lines were detected, or if the third
            lowest confidence score is below 0.9.
    """
    poor_quality = "Poor image quality. Please avoid shadows, flashlights, and patterned backgrounds."

    # The first element may be missing or None when nothing was detected
    lines = result[0] if result else None
    if lines is None:
        lines = []
    scores = [round(line[1][1], 4) for line in lines]

    # The quality check below needs at least three scores; fewer lines than
    # that cannot be a readable card, so report it as a quality problem
    # instead of failing with ZeroDivisionError/IndexError.
    if len(scores) < 3:
        raise ValueError(poor_quality)

    overall_score = 0
    for score in scores:
        overall_score += score
    overall_score = round(overall_score / len(scores), 4)
    sorted_scores = sorted(scores)

    # Raise error if the 3rd lowest confidence score is less than 90%
    if sorted_scores[2] < 0.9:
        raise ValueError(poor_quality)
    return overall_score, sorted_scores, scores
115
+
116
+
117
+
118
def add_missing_labels(new_data, labels, found_labels):
    """
    Insert label rows that OCR did not detect into the extracted data.

    Both new_data and found_labels are modified in place.

    Args:
        new_data (list): The extracted data rows.
        labels (list): The expected labels, in card order.
        found_labels (list): [label, index] pairs for the labels that were found.

    Returns:
        list: new_data with the missing label rows inserted.

    Raises:
        ValueError: If len(found_labels) is not 13 or 14.
    """

    # Add labels if a maximum of 3 labels is missing
    # NOTE(review): the condition only admits 13 or 14 found labels, so a
    # complete set of 15 also takes the error branch below - confirm intent.
    if len(found_labels) < 15 and len(found_labels) > 12:
        # Number of rows inserted so far; later indexes shift by this amount
        added = 0
        for i in range(len(labels)):
            if labels[i] != found_labels[i][0]:
                # Use next label index - 2 + the number of shifted items
                # Else, use previous label index + 2 + the number of shifted items
                try:
                    if labels[i] == "gol. darah":
                        idx = found_labels[i][1] + added
                    elif labels[i] == "alamat":
                        # Get Gol. Darah index and check if the length of next item is greater than two
                        # NOTE(review): rows built elsewhere in this file are lists, so
                        # looking up a bare string presumably raises and falls through
                        # to the except branch - verify.
                        gol_idx = new_data.index("gol. darah")
                        if len(new_data[gol_idx+1]) > 2:
                            idx = gol_idx + 1
                        else:
                            idx = gol_idx + 2
                    else:
                        idx = found_labels[i+1][1] - 2 + added
                except:
                    idx = found_labels[i-1][1] + 2 + added
                if idx < len(new_data)-1:
                    new_data.insert(idx, [labels[i], labels[i], 'label'])
                    found_labels.insert(i, [labels[i], idx])
                else:
                    # Computed position is at or past the last row: insert
                    # before the last two rows instead
                    new_data.insert(len(new_data)-2, [labels[i], labels[i], 'label'])
                    found_labels.insert(i, [labels[i], len(new_data)-2])
                added += 1
    else:
        raise ValueError("Some labels cannot be detected. Please recapture a photo of the ID.")
    return new_data
151
+
152
+
153
def check_numbers(all_data):
    """
    Check if there is a 16-digit number in OCR text and normalise number rows.

    A row whose PaddleOCR text contains 16 consecutive digits has both of its
    text fields replaced by those digits. A row whose PaddleOCR text contains
    a ddd/ddd pattern has both text fields replaced by that pattern.
    The rows are modified in place.

    Args:
        all_data (list): The structured OCR result list; each row holds the
            PaddleOCR text at index 4 and the Tesseract text at index 5.

    Returns:
        list: A list containing the structured OCR output

    Raises:
        ValueError: If no row contains a 16-digit number.
    """
    # Raw strings: "\d" in a normal literal is an invalid escape sequence
    id_pattern = re.compile(r"\d{16}")
    rt_rw_pattern = re.compile(r"\d{3}/\d{3}")

    ktp_num = ""
    for row in all_data:
        # Both searches use the text as it was before any replacement
        id_match = id_pattern.search(row[4])
        rt_rw_match = rt_rw_pattern.search(row[4])
        if id_match:
            # Keep PaddleOCR output for both
            ktp_num = row[4] = row[5] = id_match.group()
        if rt_rw_match:
            row[4] = row[5] = rt_rw_match.group()
    if ktp_num == "":
        raise ValueError("KTP number cannot be detected. Please recapture a photo of the ID.")

    return all_data
176
+
177
+
178
def run_ocr(image):
    """
    Detect text regions with PaddleOCR and re-read each region with Tesseract.

    Args:
        image (ndarray): The image array on which OCR will be performed.

    Returns:
        list: One row per detected region, in the form
        [x1, y1, x2, y2, paddle_text, tess_text], where the coordinates are
        the bounding box of the region and both texts have been cleaned.
    """
    engine = PaddleOCR(
        use_angle_cls=True,
        lang="id",
        det_max_side_len=1500,
        det_limit_type="min",
        det_db_unclip_ratio=1.7,
        drop_score = 0.75,
        show_log=False,
    )
    result = engine.ocr(image, cls=True)
    rows = []

    # Raises when the confidence scores are below the accepted threshold
    get_scores(result)

    for detection in result[0]:
        paddle_text = detection[1][0]

        # Bounding box of the four corner points reported by PaddleOCR
        corners = detection[0]
        xs = [corners[k][0] for k in range(4)]
        ys = [corners[k][1] for k in range(4)]
        x1, y1 = int(min(xs)), int(min(ys))
        x2, y2 = int(max(xs)), int(max(ys))

        # Run Tesseract on the area cropped around the detected text
        tess_text = pytesseract.image_to_string(
            image[y1:y2, x1:x2], lang="ind+eng", config="--psm 6"
        )

        # Clean the output of both engines before storing it
        tess_text, paddle_text = clean_text(tess_text, paddle_text)
        rows.append([x1, y1, x2, y2, paddle_text, tess_text])

    return rows
225
+
226
+
227
def clean_text(tess_text, paddle_text):
    """
    Clean and preprocess the recognized text from Tesseract and PaddleOCR.

    Args:
        tess_text (str): Text recognized by Tesseract OCR.
        paddle_text (str): Text recognized by PaddleOCR.

    Returns:
        tuple: A tuple containing the cleaned and preprocessed text from Tesseract and PaddleOCR, respectively.
    """
    # Remove line feeds and form feeds from the Tesseract output
    tess_text = tess_text.replace("\n", "").replace("\x0c", "")

    # Remove space before or after colon, hyphen and asterisk
    pattern = r"\s*([-:*])\s*"
    paddle_text = re.sub(pattern, r"\1", paddle_text)
    tess_text = re.sub(pattern, r"\1", tess_text)

    # Replace any 1O with 10
    paddle_text = paddle_text.replace("1O", "10")
    tess_text = tess_text.replace("1O", "10")

    # Fix dots in ID number: a run of 10 digits/dots means the dots are noise
    if re.search(r"[0-9\.]{10}", paddle_text):
        paddle_text = paddle_text.replace(".", "")

    # Add space after an uppercase letter + dot that is directly followed by a
    # letter. The class is [A-Za-z]; the former [A-z] also matched "[\]^_`".
    paddle_text = re.sub(r"([A-Z]\.)([A-Za-z])", r"\1 \2", paddle_text)

    # Fix commas recognized as dots and add space after it.
    # The letter before the punctuation is captured on its own and written
    # back; previously it was part of the discarded group, which turned
    # "JAKARTA.12" into "JAKART, 12" and "NO.5" into "N. 5".
    if "NO" not in paddle_text:
        pattern = r"([A-Za-z])[\.,]\s{0,1}(\d{2})"
        replacement = r"\1, \2"
    else:
        pattern = r"([A-Za-z])[\.]\s{0,1}(\d{1})"
        replacement = r"\1. \2"
    paddle_text = re.sub(pattern, replacement, paddle_text)
    tess_text = re.sub(pattern, replacement, tess_text)

    # Clean blood group: a zero next to "Darah" is the letter O
    if "Darah" in tess_text or "Darah" in paddle_text:
        tess_text = tess_text.replace("0", "O")
        paddle_text = paddle_text.replace("0", "O")

    # Clean symbols
    for item in ["'", '"', "!", "‘", "“", ":", "*", "=", "+"]:
        paddle_text = paddle_text.replace(item, "")
        tess_text = tess_text.replace(item, "")

    # Remove hyphen, dot, or comma if in the beginning of the text
    if tess_text[:1] in ['-', '.', ',']:
        tess_text = tess_text[1:]
    if paddle_text[:1] in ['-', '.', ',']:
        paddle_text = paddle_text[1:]

    # If paddle text is similar to tesseract text without spaces, replace paddle text with tesseract text
    if paddle_text == tess_text.replace(" ", ""):
        paddle_text = tess_text

    # If JL in the beginning of text, add the dot
    if paddle_text[:2] == "JL" or tess_text[:2] == "JL":
        paddle_text = re.sub(r"(JL)(\.{0,1})([A-Z])", r"JL. \3", paddle_text)
        tess_text = re.sub(r"(JL)(\.{0,1})([A-Z])", r"JL. \3", tess_text)

    # Add missing spaces to the Paddle output: wherever Tesseract has a space
    # between two letter pairs, split the same four letters in the Paddle text.
    # String slicing never raises, so no exception handling is needed here.
    for idx, char in enumerate(tess_text):
        if not char.isspace():
            continue
        p1 = tess_text[idx-2:idx]
        p2 = tess_text[idx+1:idx+3]
        if p1.isalpha() and p2.isalpha():
            paddle_text = paddle_text.replace(p1 + p2, p1 + " " + p2)

    return tess_text, paddle_text
316
+
317
+
318
def resize_image(path):
    """
    Load an image and enlarge it if both dimensions are smaller than the threshold.

    Args:
        path (str): The path to the image file.

    Returns:
        ndarray: The loaded (and possibly enlarged) image array.

    Raises:
        ValueError: If the file cannot be read as an image.
    """
    img = cv2.imread(path)

    # cv2.imread does not raise on a missing or unreadable file, it returns
    # None; fail here with a clear message instead of an AttributeError below.
    if img is None:
        raise ValueError(f"Cannot read image file: {path}")

    width = int(img.shape[1])
    height = int(img.shape[0])
    thresh = 1500

    # Resize image to match the threshold
    if width < thresh and height < thresh:
        # Whole-number factor derived from the longer side.
        # NOTE(review): integer division means images whose longer side is
        # above thresh / 2 get a factor of 1 and keep their size - confirm
        # this is intended.
        percent = thresh // max(width, height)
        dim = (width * percent, height * percent)
        img = cv2.resize(img, dim, interpolation=cv2.INTER_AREA)
    return img
342
+
343
+
344
def rotate_image(image):
    """
    Find the upright orientation of the card by looking for known words.

    The image is converted from BGR to XYZ and read with Tesseract; if none
    of the expected words is found it is turned 90 degrees clockwise and read
    again, for up to four attempts.

    Args:
        image (ndarray): The image array to be rotated.

    Returns:
        ndarray: The XYZ image in the first orientation where one of the
        words is recognised, or in its original orientation (after four
        quarter turns) when none is.
    """
    markers = ("PROVINSI", "Darah", "NIK")
    candidate = cv2.cvtColor(image, cv2.COLOR_BGR2XYZ)

    for _ in range(4):
        text = pytesseract.image_to_string(candidate, lang="ind+eng", config="--psm 6")
        if any(marker in text for marker in markers):
            return candidate
        candidate = cv2.rotate(candidate, cv2.ROTATE_90_CLOCKWISE)

    # Four quarter turns bring the image back to where it started
    return candidate
368
+
369
+
370
def correct_labels(new_data, labels):
    """
    Replace misread label text with the closest valid label.

    Rows recognised as labels are rewritten in place: both text fields are
    set to the matched label and the marker "label" is appended to the row.

    Args:
        new_data (list): The extracted data list to be corrected.
        labels (list): The list of valid labels.

    Returns:
        tuple: The corrected data list and a list of [label, row index]
        pairs, which always starts with ["provinsi", 0].
    """
    min_score = 75
    found_labels = [["provinsi", 0]]

    def mark_as_label(position, label):
        # Overwrite both OCR texts with the label and tag the row
        row = new_data[position]
        row[0] = label
        row[1] = label
        row.append("label")
        found_labels.append([label, position])

    for position, row in enumerate(new_data):
        paddle_best = process.extractOne(row[0], labels, scorer=fuzz.ratio)
        tess_best = process.extractOne(row[1], labels, scorer=fuzz.ratio)

        # Skip provinsi matches because it's already recorded at index 0
        if paddle_best[0] == 'provinsi' or tess_best[0] == 'provinsi':
            continue

        # Prefer the PaddleOCR match, then the Tesseract match
        if paddle_best[1] >= min_score:
            mark_as_label(position, paddle_best[0])
        elif tess_best[1] >= min_score:
            mark_as_label(position, tess_best[0])
        # Correct "NIK" when it was read with a missing or wrong first letter
        elif (len(row[0]) == 3 or len(row[1]) == 3) and (
            "IK" == row[0] or "IK" in row[1]
        ):
            mark_as_label(position, "nik")

    return new_data, found_labels
411
+
412
+
413
def find_uppercase_index(text):
    """
    Locate the first run of three or more uppercase letters in the text.

    Args:
        text (str): The input text.

    Returns:
        int: The start index of the first such run, or -1 if there is none.
    """
    # The lookbehind makes the match start at the beginning of the run
    found = re.search(r"(?<![A-Z])[A-Z]{3,}", text)
    return found.start() if found else -1
430
+
431
+
432
def correct_data(new_data, df):
    """
    Correct the extracted data based on reference data from a DataFrame.

    The rows of new_data are modified in place, and a ["WNI", "WNI"] row may
    be inserted after the "kewarganegaraan" row.

    Args:
        new_data (list): The extracted data list to be corrected.
        df (DataFrame): The reference DataFrame containing the data for correction.
            It is read through a "provinsi" column and through one column per
            province name.

    Returns:
        list: The corrected extracted data list.
    """
    # Make lists to be used in text correction
    provinsi_df = df["provinsi"].dropna().tolist()
    provinsi = [f"PROVINSI {item}" for item in provinsi_df]
    other_vals = [
        "LAKI-LAKI",
        "PEREMPUAN",
        "A",
        "B",
        "AB",
        "O",
        "ISLAM",
        "KRISTEN",
        "KATOLIK",
        "HINDU",
        "BUDHA",
        "KONGHUCU",
        "BELUM KAWIN",
        "KAWIN",
        "CERAI HIDUP",
        "CERAI MATI",
        "WNI",
        "WNA",
        "SEUMUR HIDUP",
    ]

    # PaddleOCR text of every row except row 1, used for the issuer fix below
    paddle_except_city = []
    for i in range(len(new_data)):

        # Fix Provinsi
        if i == 0 or ("PROVINSI" in new_data[i][0] or "PROVINSI" in new_data[i][1]):
            new_data[i][0], new_data[i][1] = replace_data(new_data, i, provinsi)
            # NOTE(review): if the province text was not corrected to a known
            # name, this column lookup presumably raises KeyError and aborts
            # the whole function - verify against caller.
            kabupaten = df[new_data[i][0].replace("PROVINSI ", "")].dropna().tolist()

        # Fix Kabupaten
        elif i == 1:
            try:
                new_data[i][0], new_data[i][1] = replace_data(new_data, i, kabupaten)
            except:
                pass

        # Fix other values such as religion
        # Rows without the third "label" element are treated as values
        elif len(new_data[i]) == 2:
            new_data[i][0], new_data[i][1] = replace_data(new_data, i, other_vals)

        # Fix NIK
        # NOTE(review): two-element rows are consumed by the branch above, so
        # this one is only reached for rows of another length - confirm.
        elif i == 3 or new_data[i - 1][0].upper() == "NIK":
            new_data[i][1] = new_data[i][0]

        # Fix dates
        # Rewrites dd?mm?yyyy (years 19xx/20xx) with hyphen separators
        if i > 4:
            pattern = r"(\d{2})\W{0,1}(\d{2})\W{0,1}((19|20)\d{2})"
            new_data[i][0] = re.sub(pattern, r"\1-\2-\3", new_data[i][0])
            new_data[i][1] = re.sub(pattern, r"\1-\2-\3", new_data[i][1])

        if i != 1:
            paddle_except_city.append(new_data[i][0])

    # Add WNI if no WNI or WNA
    paddle_temp = [data[0] for data in new_data]
    tess_temp = [data[1] for data in new_data]
    if not {"WNI", "WNA"}.intersection(set(paddle_temp)) and not {"WNI", "WNA"}.intersection(set(tess_temp)):
        try:
            kew_idx = paddle_temp.index("kewarganegaraan")
            new_data.insert(kew_idx+1, ["WNI", "WNI"])
        except:
            pass

    # Fix issuer province name if similar to province name in line 2
    # Any row whose text is the best match (score >= 85) for row 1 is
    # overwritten with row 1's text
    issuer_fuzz = process.extractOne(new_data[1][0], paddle_except_city, scorer=fuzz.ratio)
    if issuer_fuzz[1] >= 85:
        for i in range(len(new_data)):
            if new_data[i][0] == issuer_fuzz[0]:
                new_data[i][0], new_data[i][1] = new_data[1][0], new_data[1][0]

    return new_data
519
+
520
+
521
def replace_data(new_data, i, options_list):
    """
    Snap the row at index i to its closest option when the match is good enough.

    The PaddleOCR match is preferred over the Tesseract match. When one of
    them scores above the threshold, both text fields of the row are
    overwritten in place with the matched option.

    Args:
        new_data (list): The extracted data list.
        i (int): The index of the item to be replaced.
        options_list (list): The list of options for replacement.

    Returns:
        tuple: The (possibly replaced) two text values of the row at index i.
    """
    row = new_data[i]
    paddle_best = process.extractOne(row[0], options_list, scorer=fuzz.ratio)
    tess_best = process.extractOne(row[1], options_list, scorer=fuzz.ratio)

    # Texts shorter than four characters use a lower threshold
    cutoff = 65 if len(row[0]) < 4 else 75

    for best in (paddle_best, tess_best):
        if best[1] > cutoff:
            row[0] = row[1] = best[0]
            break
    return row[0], row[1]
548
+
549
+
550
def split_items(all_data):
    """
    Break OCR rows that contain a label and a value into separate items.

    Each output item is a [paddle_text, tess_text] pair. A row is split at
    the first uppercase word when both engines find one past position 0; a
    blood type row is split into its label and its type; every other row is
    passed through with surrounding whitespace removed.

    Args:
        all_data (list): The list of data items to be split.

    Returns:
        list: The new list of split data items.
    """
    blood_pattern = r"(Darah)\W*((A|AB|B|O))"
    blood_endings = ["A", "B", "O"]
    new_data = []

    for row in all_data:
        paddle_idx = find_uppercase_index(row[4])
        tess_idx = find_uppercase_index(row[5])
        paddle_text, tess_text = row[4], row[5]

        # Label followed by an uppercase value in both OCR outputs
        if paddle_idx not in [0, -1] and tess_idx not in [0, -1]:
            head = [paddle_text[:paddle_idx].strip(), tess_text[:tess_idx].strip()]
            tail = [paddle_text[paddle_idx:].strip(), tess_text[tess_idx:].strip()]
            new_data.extend(part for part in (head, tail) if part != ["", ""])
            continue

        # Ordinary row: keep it as a single item
        if "Darah" not in paddle_text and "Darah" not in tess_text:
            new_data.append([paddle_text.strip(), tess_text.strip()])
            continue

        # Blood type row: put one space between the label and the type, then
        # split at the last space of the first text that ends in a blood type
        candidates = [
            re.sub(blood_pattern, r"\1 \2", paddle_text),
            re.sub(blood_pattern, r"\1 \2", tess_text),
        ]
        try:
            for candidate in candidates:
                if candidate[-1] in blood_endings:
                    cut = candidate.rfind(" ")
                    label = candidate[:cut].strip()
                    blood_type = candidate[cut + 1 :].strip()
                    new_data.append([label, label])
                    new_data.append([blood_type, blood_type])
                    break
        except:
            # An empty text has no last character; the row is then dropped
            pass

    return new_data
611
+
612
+
613
def print_output(new_data):
    """
    Build the final text, one item per line, from the corrected data.

    Args:
        new_data (list): The list of data items.

    Returns:
        str: The formatted string output, with a newline after every item.
    """
    lines = []
    for row in new_data:
        paddle_text = row[0]
        if paddle_text == row[1] and len(row) == 3:
            # Three-element rows with identical texts are labels: uppercase them
            lines.append(paddle_text.upper())
        elif len(paddle_text) > 0:
            # Values are emitted as they are; empty ones are skipped
            lines.append(paddle_text)
    return "".join(f"{line}\n" for line in lines)
packages.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ tesseract-ocr-all
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ fastapi==0.100.0
2
+ opencv-python==4.6.0.66
3
+ paddleocr==2.6.1.3
4
+ paddlepaddle==2.4.1
5
+ pydantic==1.10.5
6
+ pytesseract==0.3.10
7
+ python-Levenshtein==0.20.9
8
+ thefuzz==0.19.0
9
+ uvicorn==0.23.1